[gcc.git] gcc/config/i386/i386-expand.c (blob at commit "x86: Update memcpy/memset inline strategies for Ice Lake")
1 /* Copyright (C) 1988-2021 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
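/* For illustration (hypothetical register number): splitting a TImode
   value held in (reg:TI 100) yields
     lo_half[i] = (subreg:DI (reg:TI 100) 0)
     hi_half[i] = (subreg:DI (reg:TI 100) 8)
   while an offsettable TImode MEM is split into the same MEM narrowed
   to DImode at byte offsets 0 and 8.  */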
101
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105 {
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuses to split volatile memory addresses,
136 but we still have to handle them. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157 hi_half[num] = simplify_gen_subreg (half_mode, op,
158 GET_MODE (op) == VOIDmode
159 ? mode : GET_MODE (op), byte);
160 }
161 }
162 }
163
164 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
165 for the target. */
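/* Rough sketch of the two shapes this emits (illustrative assembly,
   assuming a 32-bit GPR destination): when !TARGET_USE_MOV0, or when
   optimizing for size, the SET is wrapped in a PARALLEL with a flags
   clobber and assembles to "xorl %eax, %eax"; otherwise a plain
   "movl $0, %eax" SET is emitted.  */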
166
167 void
168 ix86_expand_clear (rtx dest)
169 {
170 rtx tmp;
171
172 /* We play register width games, which are only valid after reload. */
173 gcc_assert (reload_completed);
174
175 /* Avoid HImode and its attendant prefix byte. */
176 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
177 dest = gen_rtx_REG (SImode, REGNO (dest));
178 tmp = gen_rtx_SET (dest, const0_rtx);
179
180 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
181 {
182 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
183 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
184 }
185
186 emit_insn (tmp);
187 }
188
189 void
190 ix86_expand_move (machine_mode mode, rtx operands[])
191 {
192 rtx op0, op1;
193 rtx tmp, addend = NULL_RTX;
194 enum tls_model model;
195
196 op0 = operands[0];
197 op1 = operands[1];
198
199 /* Avoid complex sets of likely spilled hard registers before reload. */
200 if (!ix86_hardreg_mov_ok (op0, op1))
201 {
202 tmp = gen_reg_rtx (mode);
203 operands[0] = tmp;
204 ix86_expand_move (mode, operands);
205 operands[0] = op0;
206 operands[1] = tmp;
207 op1 = tmp;
208 }
209
210 switch (GET_CODE (op1))
211 {
212 case CONST:
213 tmp = XEXP (op1, 0);
214
215 if (GET_CODE (tmp) != PLUS
216 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
217 break;
218
219 op1 = XEXP (tmp, 0);
220 addend = XEXP (tmp, 1);
221 /* FALLTHRU */
222
223 case SYMBOL_REF:
224 model = SYMBOL_REF_TLS_MODEL (op1);
225
226 if (model)
227 op1 = legitimize_tls_address (op1, model, true);
228 else if (ix86_force_load_from_GOT_p (op1))
229 {
230 /* Load the external function address via GOT slot to avoid PLT. */
231 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
232 (TARGET_64BIT
233 ? UNSPEC_GOTPCREL
234 : UNSPEC_GOT));
235 op1 = gen_rtx_CONST (Pmode, op1);
236 op1 = gen_const_mem (Pmode, op1);
237 set_mem_alias_set (op1, ix86_GOT_alias_set ());
238 }
239 else
240 {
241 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
242 if (tmp)
243 {
244 op1 = tmp;
245 if (!addend)
246 break;
247 }
248 else
249 {
250 op1 = operands[1];
251 break;
252 }
253 }
254
255 if (addend)
256 {
257 op1 = force_operand (op1, NULL_RTX);
258 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
259 op0, 1, OPTAB_DIRECT);
260 }
261 else
262 op1 = force_operand (op1, op0);
263
264 if (op1 == op0)
265 return;
266
267 op1 = convert_to_mode (mode, op1, 1);
268
269 default:
270 break;
271 }
272
273 if ((flag_pic || MACHOPIC_INDIRECT)
274 && symbolic_operand (op1, mode))
275 {
276 if (TARGET_MACHO && !TARGET_64BIT)
277 {
278 #if TARGET_MACHO
279 /* dynamic-no-pic */
280 if (MACHOPIC_INDIRECT)
281 {
282 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
283 ? op0 : gen_reg_rtx (Pmode);
284 op1 = machopic_indirect_data_reference (op1, temp);
285 if (MACHOPIC_PURE)
286 op1 = machopic_legitimize_pic_address (op1, mode,
287 temp == op1 ? 0 : temp);
288 }
289 if (op0 != op1 && GET_CODE (op0) != MEM)
290 {
291 rtx insn = gen_rtx_SET (op0, op1);
292 emit_insn (insn);
293 return;
294 }
295 if (GET_CODE (op0) == MEM)
296 op1 = force_reg (Pmode, op1);
297 else
298 {
299 rtx temp = op0;
300 if (GET_CODE (temp) != REG)
301 temp = gen_reg_rtx (Pmode);
302 temp = legitimize_pic_address (op1, temp);
303 if (temp == op0)
304 return;
305 op1 = temp;
306 }
307 /* dynamic-no-pic */
308 #endif
309 }
310 else
311 {
312 if (MEM_P (op0))
313 op1 = force_reg (mode, op1);
314 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
315 {
316 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
317 op1 = legitimize_pic_address (op1, reg);
318 if (op0 == op1)
319 return;
320 op1 = convert_to_mode (mode, op1, 1);
321 }
322 }
323 }
324 else
325 {
326 if (MEM_P (op0)
327 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
328 || !push_operand (op0, mode))
329 && MEM_P (op1))
330 op1 = force_reg (mode, op1);
331
332 if (push_operand (op0, mode)
333 && ! general_no_elim_operand (op1, mode))
334 op1 = copy_to_mode_reg (mode, op1);
335
336 /* Force large constants in 64-bit compilation into a register
337 to get them CSEed. */
338 if (can_create_pseudo_p ()
339 && (mode == DImode) && TARGET_64BIT
340 && immediate_operand (op1, mode)
341 && !x86_64_zext_immediate_operand (op1, VOIDmode)
342 && !register_operand (op0, mode)
343 && optimize)
344 op1 = copy_to_mode_reg (mode, op1);
345
346 if (can_create_pseudo_p ()
347 && CONST_DOUBLE_P (op1))
348 {
349 /* If we are loading a floating point constant to a register,
350 force the value to memory now, since we'll get better code
351 out of the back end. */
352
353 op1 = validize_mem (force_const_mem (mode, op1));
354 if (!register_operand (op0, mode))
355 {
356 rtx temp = gen_reg_rtx (mode);
357 emit_insn (gen_rtx_SET (temp, op1));
358 emit_move_insn (op0, temp);
359 return;
360 }
361 }
362 }
363
364 emit_insn (gen_rtx_SET (op0, op1));
365 }
366
367 void
368 ix86_expand_vector_move (machine_mode mode, rtx operands[])
369 {
370 rtx op0 = operands[0], op1 = operands[1];
371 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
372 psABI, since the biggest alignment is 4 bytes for the IA MCU psABI. */
373 unsigned int align = (TARGET_IAMCU
374 ? GET_MODE_BITSIZE (mode)
375 : GET_MODE_ALIGNMENT (mode));
376
377 if (push_operand (op0, VOIDmode))
378 op0 = emit_move_resolve_push (mode, op0);
379
380 /* Force constants other than zero into memory. We do not know how
381 the instructions used to build constants modify the upper 64 bits
382 of the register; once we have that information we may be able
383 to handle some of them more efficiently. */
384 if (can_create_pseudo_p ()
385 && (CONSTANT_P (op1)
386 || (SUBREG_P (op1)
387 && CONSTANT_P (SUBREG_REG (op1))))
388 && ((register_operand (op0, mode)
389 && !standard_sse_constant_p (op1, mode))
390 /* ix86_expand_vector_move_misalign() does not like constants. */
391 || (SSE_REG_MODE_P (mode)
392 && MEM_P (op0)
393 && MEM_ALIGN (op0) < align)))
394 {
395 if (SUBREG_P (op1))
396 {
397 machine_mode imode = GET_MODE (SUBREG_REG (op1));
398 rtx r = force_const_mem (imode, SUBREG_REG (op1));
399 if (r)
400 r = validize_mem (r);
401 else
402 r = force_reg (imode, SUBREG_REG (op1));
403 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
404 }
405 else
406 op1 = validize_mem (force_const_mem (mode, op1));
407 }
408
409 /* We need to check memory alignment for SSE mode since attributes
410 can make operands unaligned. */
411 if (can_create_pseudo_p ()
412 && SSE_REG_MODE_P (mode)
413 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
414 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
415 {
416 rtx tmp[2];
417
418 /* ix86_expand_vector_move_misalign() does not like both
419 arguments in memory. */
420 if (!register_operand (op0, mode)
421 && !register_operand (op1, mode))
422 op1 = force_reg (mode, op1);
423
424 tmp[0] = op0; tmp[1] = op1;
425 ix86_expand_vector_move_misalign (mode, tmp);
426 return;
427 }
428
429 /* Make operand1 a register if it isn't already. */
430 if (can_create_pseudo_p ()
431 && !register_operand (op0, mode)
432 && !register_operand (op1, mode))
433 {
434 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
435 return;
436 }
437
438 emit_insn (gen_rtx_SET (op0, op1));
439 }
440
441 /* Split 32-byte AVX unaligned load and store if needed. */
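/* Illustrative outcome when splitting is enabled (mnemonics approximate):
   a 32-byte unaligned load turns into a 16-byte load of the low half
   followed by a VEC_CONCAT with the high half, i.e. roughly
       vmovups     (mem), %xmm0
       vinsertf128 $1, 16(mem), %ymm0, %ymm0
   and a 32-byte unaligned store becomes two vextractf128 stores of the
   16-byte halves.  */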
442
443 static void
444 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
445 {
446 rtx m;
447 rtx (*extract) (rtx, rtx, rtx);
448 machine_mode mode;
449
450 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
451 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
452 {
453 emit_insn (gen_rtx_SET (op0, op1));
454 return;
455 }
456
457 rtx orig_op0 = NULL_RTX;
458 mode = GET_MODE (op0);
459 switch (GET_MODE_CLASS (mode))
460 {
461 case MODE_VECTOR_INT:
462 case MODE_INT:
463 if (mode != V32QImode)
464 {
465 if (!MEM_P (op0))
466 {
467 orig_op0 = op0;
468 op0 = gen_reg_rtx (V32QImode);
469 }
470 else
471 op0 = gen_lowpart (V32QImode, op0);
472 op1 = gen_lowpart (V32QImode, op1);
473 mode = V32QImode;
474 }
475 break;
476 case MODE_VECTOR_FLOAT:
477 break;
478 default:
479 gcc_unreachable ();
480 }
481
482 switch (mode)
483 {
484 default:
485 gcc_unreachable ();
486 case E_V32QImode:
487 extract = gen_avx_vextractf128v32qi;
488 mode = V16QImode;
489 break;
490 case E_V8SFmode:
491 extract = gen_avx_vextractf128v8sf;
492 mode = V4SFmode;
493 break;
494 case E_V4DFmode:
495 extract = gen_avx_vextractf128v4df;
496 mode = V2DFmode;
497 break;
498 }
499
500 if (MEM_P (op1))
501 {
502 rtx r = gen_reg_rtx (mode);
503 m = adjust_address (op1, mode, 0);
504 emit_move_insn (r, m);
505 m = adjust_address (op1, mode, 16);
506 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
507 emit_move_insn (op0, r);
508 }
509 else if (MEM_P (op0))
510 {
511 m = adjust_address (op0, mode, 0);
512 emit_insn (extract (m, op1, const0_rtx));
513 m = adjust_address (op0, mode, 16);
514 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
515 }
516 else
517 gcc_unreachable ();
518
519 if (orig_op0)
520 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
521 }
522
523 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
524 straight to ix86_expand_vector_move. */
525 /* Code generation for scalar reg-reg moves of single and double precision data:
526 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
527 movaps reg, reg
528 else
529 movss reg, reg
530 if (x86_sse_partial_reg_dependency == true)
531 movapd reg, reg
532 else
533 movsd reg, reg
534
535 Code generation for scalar loads of double precision data:
536 if (x86_sse_split_regs == true)
537 movlpd mem, reg (gas syntax)
538 else
539 movsd mem, reg
540
541 Code generation for unaligned packed loads of single precision data
542 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
543 if (x86_sse_unaligned_move_optimal)
544 movups mem, reg
545
546 if (x86_sse_partial_reg_dependency == true)
547 {
548 xorps reg, reg
549 movlps mem, reg
550 movhps mem+8, reg
551 }
552 else
553 {
554 movlps mem, reg
555 movhps mem+8, reg
556 }
557
558 Code generation for unaligned packed loads of double precision data
559 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
560 if (x86_sse_unaligned_move_optimal)
561 movupd mem, reg
562
563 if (x86_sse_split_regs == true)
564 {
565 movlpd mem, reg
566 movhpd mem+8, reg
567 }
568 else
569 {
570 movsd mem, reg
571 movhpd mem+8, reg
572 }
573 */
574
575 void
576 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
577 {
578 rtx op0, op1, m;
579
580 op0 = operands[0];
581 op1 = operands[1];
582
583 /* Use unaligned load/store for AVX512 or when optimizing for size. */
584 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
585 {
586 emit_insn (gen_rtx_SET (op0, op1));
587 return;
588 }
589
590 if (TARGET_AVX)
591 {
592 if (GET_MODE_SIZE (mode) == 32)
593 ix86_avx256_split_vector_move_misalign (op0, op1);
594 else
595 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
596 emit_insn (gen_rtx_SET (op0, op1));
597 return;
598 }
599
600 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
601 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
602 {
603 emit_insn (gen_rtx_SET (op0, op1));
604 return;
605 }
606
607 /* ??? If we have typed data, then it would appear that using
608 movdqu is the only way to get unaligned data loaded with
609 integer type. */
610 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
611 {
612 emit_insn (gen_rtx_SET (op0, op1));
613 return;
614 }
615
616 if (MEM_P (op1))
617 {
618 if (TARGET_SSE2 && mode == V2DFmode)
619 {
620 rtx zero;
621
622 /* When SSE registers are split into halves, we can avoid
623 writing to the top half twice. */
624 if (TARGET_SSE_SPLIT_REGS)
625 {
626 emit_clobber (op0);
627 zero = op0;
628 }
629 else
630 {
631 /* ??? Not sure about the best option for the Intel chips.
632 The following would seem to satisfy; the register is
633 entirely cleared, breaking the dependency chain. We
634 then store to the upper half, with a dependency depth
635 of one. A rumor has it that Intel recommends two movsd
636 followed by an unpacklpd, but this is unconfirmed. And
637 given that the dependency depth of the unpacklpd would
638 still be one, I'm not sure why this would be better. */
639 zero = CONST0_RTX (V2DFmode);
640 }
641
642 m = adjust_address (op1, DFmode, 0);
643 emit_insn (gen_sse2_loadlpd (op0, zero, m));
644 m = adjust_address (op1, DFmode, 8);
645 emit_insn (gen_sse2_loadhpd (op0, op0, m));
646 }
647 else
648 {
649 rtx t;
650
651 if (mode != V4SFmode)
652 t = gen_reg_rtx (V4SFmode);
653 else
654 t = op0;
655
656 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
657 emit_move_insn (t, CONST0_RTX (V4SFmode));
658 else
659 emit_clobber (t);
660
661 m = adjust_address (op1, V2SFmode, 0);
662 emit_insn (gen_sse_loadlps (t, t, m));
663 m = adjust_address (op1, V2SFmode, 8);
664 emit_insn (gen_sse_loadhps (t, t, m));
665 if (mode != V4SFmode)
666 emit_move_insn (op0, gen_lowpart (mode, t));
667 }
668 }
669 else if (MEM_P (op0))
670 {
671 if (TARGET_SSE2 && mode == V2DFmode)
672 {
673 m = adjust_address (op0, DFmode, 0);
674 emit_insn (gen_sse2_storelpd (m, op1));
675 m = adjust_address (op0, DFmode, 8);
676 emit_insn (gen_sse2_storehpd (m, op1));
677 }
678 else
679 {
680 if (mode != V4SFmode)
681 op1 = gen_lowpart (V4SFmode, op1);
682
683 m = adjust_address (op0, V2SFmode, 0);
684 emit_insn (gen_sse_storelps (m, op1));
685 m = adjust_address (op0, V2SFmode, 8);
686 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
687 }
688 }
689 else
690 gcc_unreachable ();
691 }
692
693 /* Move bits 64:95 to bits 32:63. */
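/* In SSE terms this is (illustratively) a pshufd with selector
   {0, 2, 0, 0}: element 2 of the V4SImode view (bits 64:95) is copied
   into element 1 (bits 32:63), element 0 is left in place, and the
   upper two elements are don't-cares for the MMX-sized result.  */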
694
695 void
696 ix86_move_vector_high_sse_to_mmx (rtx op)
697 {
698 rtx mask = gen_rtx_PARALLEL (VOIDmode,
699 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
700 GEN_INT (0), GEN_INT (0)));
701 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
702 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
703 rtx insn = gen_rtx_SET (dest, op);
704 emit_insn (insn);
705 }
706
707 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
708
709 void
710 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
711 {
712 rtx op0 = operands[0];
713 rtx op1 = operands[1];
714 rtx op2 = operands[2];
715
716 machine_mode dmode = GET_MODE (op0);
717 machine_mode smode = GET_MODE (op1);
718 machine_mode inner_dmode = GET_MODE_INNER (dmode);
719 machine_mode inner_smode = GET_MODE_INNER (smode);
720
721 /* Get the corresponding SSE mode for destination. */
722 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
723 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
724 nunits).require ();
725 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
726 nunits / 2).require ();
727
728 /* Get the corresponding SSE mode for source. */
729 nunits = 16 / GET_MODE_SIZE (inner_smode);
730 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
731 nunits).require ();
732
733 /* Generate SSE pack with signed/unsigned saturation. */
734 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
735 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
736 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
737
738 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
739 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
740 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
741 op1, op2));
742 emit_insn (insn);
743
744 ix86_move_vector_high_sse_to_mmx (op0);
745 }
746
747 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
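/* Illustration for the V8QImode (punpcklbw) case: the two 64-bit inputs
   a0..a7 and b0..b7 are treated as the low halves of 128-bit registers,
   concatenated, and the selection mask 0,16,1,17,...,7,23 picks the
   interleaved bytes a0 b0 a1 b1 ... a7 b7 into the low 64 bits of the
   result; for the "high" variants the high 64 bits of that result are
   then moved down to bits 0:63.  */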
748
749 void
750 ix86_split_mmx_punpck (rtx operands[], bool high_p)
751 {
752 rtx op0 = operands[0];
753 rtx op1 = operands[1];
754 rtx op2 = operands[2];
755 machine_mode mode = GET_MODE (op0);
756 rtx mask;
757 /* The corresponding SSE mode. */
758 machine_mode sse_mode, double_sse_mode;
759
760 switch (mode)
761 {
762 case E_V8QImode:
763 sse_mode = V16QImode;
764 double_sse_mode = V32QImode;
765 mask = gen_rtx_PARALLEL (VOIDmode,
766 gen_rtvec (16,
767 GEN_INT (0), GEN_INT (16),
768 GEN_INT (1), GEN_INT (17),
769 GEN_INT (2), GEN_INT (18),
770 GEN_INT (3), GEN_INT (19),
771 GEN_INT (4), GEN_INT (20),
772 GEN_INT (5), GEN_INT (21),
773 GEN_INT (6), GEN_INT (22),
774 GEN_INT (7), GEN_INT (23)));
775 break;
776
777 case E_V4HImode:
778 sse_mode = V8HImode;
779 double_sse_mode = V16HImode;
780 mask = gen_rtx_PARALLEL (VOIDmode,
781 gen_rtvec (8,
782 GEN_INT (0), GEN_INT (8),
783 GEN_INT (1), GEN_INT (9),
784 GEN_INT (2), GEN_INT (10),
785 GEN_INT (3), GEN_INT (11)));
786 break;
787
788 case E_V2SImode:
789 sse_mode = V4SImode;
790 double_sse_mode = V8SImode;
791 mask = gen_rtx_PARALLEL (VOIDmode,
792 gen_rtvec (4,
793 GEN_INT (0), GEN_INT (4),
794 GEN_INT (1), GEN_INT (5)));
795 break;
796
797 default:
798 gcc_unreachable ();
799 }
800
801 /* Generate SSE punpcklXX. */
802 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
803 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
804 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
805
806 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
807 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
808 rtx insn = gen_rtx_SET (dest, op2);
809 emit_insn (insn);
810
811 if (high_p)
812 {
813 /* Move bits 64:127 to bits 0:63. */
814 mask = gen_rtx_PARALLEL (VOIDmode,
815 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
816 GEN_INT (0), GEN_INT (0)));
817 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
818 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
819 insn = gen_rtx_SET (dest, op1);
820 emit_insn (insn);
821 }
822 }
823
824 /* Helper function of ix86_fixup_binary_operands to canonicalize
825 operand order. Returns true if the operands should be swapped. */
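/* For example (hypothetical operands): with a commutative PLUS where
   dst and src2 are the same register, swapping gives dst = dst + src1,
   which fits the two-address "op src, dst" form without an extra copy.  */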
826
827 static bool
828 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
829 rtx operands[])
830 {
831 rtx dst = operands[0];
832 rtx src1 = operands[1];
833 rtx src2 = operands[2];
834
835 /* If the operation is not commutative, we can't do anything. */
836 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
837 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
838 return false;
839
840 /* Highest priority is that src1 should match dst. */
841 if (rtx_equal_p (dst, src1))
842 return false;
843 if (rtx_equal_p (dst, src2))
844 return true;
845
846 /* Next highest priority is that immediate constants come second. */
847 if (immediate_operand (src2, mode))
848 return false;
849 if (immediate_operand (src1, mode))
850 return true;
851
852 /* Lowest priority is that memory references should come second. */
853 if (MEM_P (src2))
854 return false;
855 if (MEM_P (src1))
856 return true;
857
858 return false;
859 }
860
861
862 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
863 destination to use for the operation. If different from the true
864 destination in operands[0], a copy operation will be required. */
865
866 rtx
867 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
868 rtx operands[])
869 {
870 rtx dst = operands[0];
871 rtx src1 = operands[1];
872 rtx src2 = operands[2];
873
874 /* Canonicalize operand order. */
875 if (ix86_swap_binary_operands_p (code, mode, operands))
876 {
877 /* It is invalid to swap operands of different modes. */
878 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
879
880 std::swap (src1, src2);
881 }
882
883 /* The source operands cannot both be in memory. */
884 if (MEM_P (src1) && MEM_P (src2))
885 {
886 /* Optimization: Only read from memory once. */
887 if (rtx_equal_p (src1, src2))
888 {
889 src2 = force_reg (mode, src2);
890 src1 = src2;
891 }
892 else if (rtx_equal_p (dst, src1))
893 src2 = force_reg (mode, src2);
894 else
895 src1 = force_reg (mode, src1);
896 }
897
898 /* If the destination is memory, and we do not have matching source
899 operands, do things in registers. */
900 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
901 dst = gen_reg_rtx (mode);
902
903 /* Source 1 cannot be a constant. */
904 if (CONSTANT_P (src1))
905 src1 = force_reg (mode, src1);
906
907 /* Source 1 cannot be a non-matching memory. */
908 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
909 src1 = force_reg (mode, src1);
910
911 /* Improve address combine. */
912 if (code == PLUS
913 && GET_MODE_CLASS (mode) == MODE_INT
914 && MEM_P (src2))
915 src2 = force_reg (mode, src2);
916
917 operands[1] = src1;
918 operands[2] = src2;
919 return dst;
920 }
921
922 /* Similarly, but assume that the destination has already been
923 set up properly. */
924
925 void
926 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
927 machine_mode mode, rtx operands[])
928 {
929 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
930 gcc_assert (dst == operands[0]);
931 }
932
933 /* Attempt to expand a binary operator. Make the expansion closer to the
934 actual machine than just general_operand, which would allow 3 separate
935 memory references (one output, two inputs) in a single insn. */
936
937 void
938 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
939 rtx operands[])
940 {
941 rtx src1, src2, dst, op, clob;
942
943 dst = ix86_fixup_binary_operands (code, mode, operands);
944 src1 = operands[1];
945 src2 = operands[2];
946
947 /* Emit the instruction. */
948
949 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
950
951 if (reload_completed
952 && code == PLUS
953 && !rtx_equal_p (dst, src1))
954 {
955 /* This is going to be an LEA; avoid splitting it later. */
956 emit_insn (op);
957 }
958 else
959 {
960 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
961 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
962 }
963
964 /* Fix up the destination if needed. */
965 if (dst != operands[0])
966 emit_move_insn (operands[0], dst);
967 }
968
969 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
970 the given OPERANDS. */
971
972 void
973 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
974 rtx operands[])
975 {
976 rtx op1 = NULL_RTX, op2 = NULL_RTX;
977 if (SUBREG_P (operands[1]))
978 {
979 op1 = operands[1];
980 op2 = operands[2];
981 }
982 else if (SUBREG_P (operands[2]))
983 {
984 op1 = operands[2];
985 op2 = operands[1];
986 }
987 /* Optimize (__m128i) d | (__m128i) e and similar code
988 when d and e are float vectors into float vector logical
989 insn. In C/C++ without using intrinsics there is no other way
990 to express vector logical operation on float vectors than
991 to cast them temporarily to integer vectors. */
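/* Illustrative C-level example of the pattern being matched:
       __m128 a, b;
       __m128i r = (__m128i) a | (__m128i) b;
   With this optimization the IOR is emitted in the float vector mode
   (orps) instead of moving the values to the integer domain for a por.  */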
992 if (op1
993 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
994 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
995 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
996 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
997 && SUBREG_BYTE (op1) == 0
998 && (GET_CODE (op2) == CONST_VECTOR
999 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1000 && SUBREG_BYTE (op2) == 0))
1001 && can_create_pseudo_p ())
1002 {
1003 rtx dst;
1004 switch (GET_MODE (SUBREG_REG (op1)))
1005 {
1006 case E_V4SFmode:
1007 case E_V8SFmode:
1008 case E_V16SFmode:
1009 case E_V2DFmode:
1010 case E_V4DFmode:
1011 case E_V8DFmode:
1012 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1013 if (GET_CODE (op2) == CONST_VECTOR)
1014 {
1015 op2 = gen_lowpart (GET_MODE (dst), op2);
1016 op2 = force_reg (GET_MODE (dst), op2);
1017 }
1018 else
1019 {
1020 op1 = operands[1];
1021 op2 = SUBREG_REG (operands[2]);
1022 if (!vector_operand (op2, GET_MODE (dst)))
1023 op2 = force_reg (GET_MODE (dst), op2);
1024 }
1025 op1 = SUBREG_REG (op1);
1026 if (!vector_operand (op1, GET_MODE (dst)))
1027 op1 = force_reg (GET_MODE (dst), op1);
1028 emit_insn (gen_rtx_SET (dst,
1029 gen_rtx_fmt_ee (code, GET_MODE (dst),
1030 op1, op2)));
1031 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1032 return;
1033 default:
1034 break;
1035 }
1036 }
1037 if (!vector_operand (operands[1], mode))
1038 operands[1] = force_reg (mode, operands[1]);
1039 if (!vector_operand (operands[2], mode))
1040 operands[2] = force_reg (mode, operands[2]);
1041 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1042 emit_insn (gen_rtx_SET (operands[0],
1043 gen_rtx_fmt_ee (code, mode, operands[1],
1044 operands[2])));
1045 }
1046
1047 /* Return TRUE or FALSE depending on whether the binary operator meets the
1048 appropriate constraints. */
1049
1050 bool
1051 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1052 rtx operands[3])
1053 {
1054 rtx dst = operands[0];
1055 rtx src1 = operands[1];
1056 rtx src2 = operands[2];
1057
1058 /* The source operands cannot both be in memory. */
1059 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1060 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1061 return false;
1062
1063 /* Canonicalize operand order for commutative operators. */
1064 if (ix86_swap_binary_operands_p (code, mode, operands))
1065 std::swap (src1, src2);
1066
1067 /* If the destination is memory, we must have a matching source operand. */
1068 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1069 return false;
1070
1071 /* Source 1 cannot be a constant. */
1072 if (CONSTANT_P (src1))
1073 return false;
1074
1075 /* Source 1 cannot be a non-matching memory. */
1076 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1077 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1078 return (code == AND
1079 && (mode == HImode
1080 || mode == SImode
1081 || (TARGET_64BIT && mode == DImode))
1082 && satisfies_constraint_L (src2));
1083
1084 return true;
1085 }
1086
1087 /* Attempt to expand a unary operator. Make the expansion closer to the
1088 actual machine than just general_operand, which would allow 2 separate
1089 memory references (one output, one input) in a single insn. */
1090
1091 void
1092 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1093 rtx operands[])
1094 {
1095 bool matching_memory = false;
1096 rtx src, dst, op, clob;
1097
1098 dst = operands[0];
1099 src = operands[1];
1100
1101 /* If the destination is memory, and we do not have matching source
1102 operands, do things in registers. */
1103 if (MEM_P (dst))
1104 {
1105 if (rtx_equal_p (dst, src))
1106 matching_memory = true;
1107 else
1108 dst = gen_reg_rtx (mode);
1109 }
1110
1111 /* When source operand is memory, destination must match. */
1112 if (MEM_P (src) && !matching_memory)
1113 src = force_reg (mode, src);
1114
1115 /* Emit the instruction. */
1116
1117 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1118
1119 if (code == NOT)
1120 emit_insn (op);
1121 else
1122 {
1123 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1124 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1125 }
1126
1127 /* Fix up the destination if needed. */
1128 if (dst != operands[0])
1129 emit_move_insn (operands[0], dst);
1130 }
1131
1132 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1133
1134 static void
1135 predict_jump (int prob)
1136 {
1137 rtx_insn *insn = get_last_insn ();
1138 gcc_assert (JUMP_P (insn));
1139 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1140 }
1141
1142 /* Split 32bit/64bit divmod with 8bit unsigned divmod if the dividend and
1143 divisor are both within the range [0-255]. */
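/* Approximate shape of the emitted code (register names illustrative):
       mov    dividend, scratch
       or     divisor, scratch
       test   $-0x100, scratch        ; any bit above bit 7 set in either?
       je     .Lqimode
       <full-width div/idiv>          ; original signed/unsigned divmod
       jmp    .Lend
   .Lqimode:
       divb                           ; 8-bit unsigned divide
       <zero-extend AL into the quotient, extract AH as the remainder>
   .Lend:  */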
1144
1145 void
1146 ix86_split_idivmod (machine_mode mode, rtx operands[],
1147 bool unsigned_p)
1148 {
1149 rtx_code_label *end_label, *qimode_label;
1150 rtx div, mod;
1151 rtx_insn *insn;
1152 rtx scratch, tmp0, tmp1, tmp2;
1153 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1154
1155 switch (mode)
1156 {
1157 case E_SImode:
1158 if (GET_MODE (operands[0]) == SImode)
1159 {
1160 if (GET_MODE (operands[1]) == SImode)
1161 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1162 else
1163 gen_divmod4_1
1164 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1165 }
1166 else
1167 gen_divmod4_1
1168 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1169 break;
1170
1171 case E_DImode:
1172 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1173 break;
1174
1175 default:
1176 gcc_unreachable ();
1177 }
1178
1179 end_label = gen_label_rtx ();
1180 qimode_label = gen_label_rtx ();
1181
1182 scratch = gen_reg_rtx (mode);
1183
1184 /* Use 8bit unsigned divmod if the dividend and divisor are within
1185 the range [0-255]. */
1186 emit_move_insn (scratch, operands[2]);
1187 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1188 scratch, 1, OPTAB_DIRECT);
1189 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1190 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1191 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1192 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1193 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1194 pc_rtx);
1195 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1196 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1197 JUMP_LABEL (insn) = qimode_label;
1198
1199 /* Generate the original signed/unsigned divmod. */
1200 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1201 operands[2], operands[3]));
1202
1203 /* Branch to the end. */
1204 emit_jump_insn (gen_jump (end_label));
1205 emit_barrier ();
1206
1207 /* Generate 8bit unsigned divide. */
1208 emit_label (qimode_label);
1209 /* Don't use operands[0] for result of 8bit divide since not all
1210 registers support QImode ZERO_EXTRACT. */
1211 tmp0 = lowpart_subreg (HImode, scratch, mode);
1212 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1213 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1214 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1215
1216 if (unsigned_p)
1217 {
1218 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1219 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1220 }
1221 else
1222 {
1223 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1224 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1225 }
1226 if (mode == SImode)
1227 {
1228 if (GET_MODE (operands[0]) != SImode)
1229 div = gen_rtx_ZERO_EXTEND (DImode, div);
1230 if (GET_MODE (operands[1]) != SImode)
1231 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1232 }
1233
1234 /* Extract remainder from AH. */
1235 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1236 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1237 GEN_INT (8), GEN_INT (8));
1238 insn = emit_move_insn (operands[1], tmp1);
1239 set_unique_reg_note (insn, REG_EQUAL, mod);
1240
1241 /* Zero extend quotient from AL. */
1242 tmp1 = gen_lowpart (QImode, tmp0);
1243 insn = emit_insn (gen_extend_insn
1244 (operands[0], tmp1,
1245 GET_MODE (operands[0]), QImode, 1));
1246 set_unique_reg_note (insn, REG_EQUAL, div);
1247
1248 emit_label (end_label);
1249 }
1250
1251 /* Emit x86 binary operator CODE in mode MODE, where the first operand
1252 matches the destination. The RTX includes a clobber of FLAGS_REG. */
1253
1254 void
1255 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1256 rtx dst, rtx src)
1257 {
1258 rtx op, clob;
1259
1260 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1261 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1262
1263 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1264 }
1265
1266 /* Return true if regno1 def is nearest to the insn. */
1267
1268 static bool
1269 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1270 {
1271 rtx_insn *prev = insn;
1272 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1273
1274 if (insn == start)
1275 return false;
1276 while (prev && prev != start)
1277 {
1278 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1279 {
1280 prev = PREV_INSN (prev);
1281 continue;
1282 }
1283 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1284 return true;
1285 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1286 return false;
1287 prev = PREV_INSN (prev);
1288 }
1289
1290 /* None of the regs is defined in the bb. */
1291 return false;
1292 }
1293
1294 /* Split lea instructions into a sequence of instructions
1295 which are executed on the ALU to avoid AGU stalls.
1296 It is assumed that the flags register may be clobbered
1297 at the lea position. */
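/* Rough example of the split (registers and scale chosen for
   illustration): an address like 4(%ebx,%ecx,2) computed into %eax can
   become
       movl %ecx, %eax     ; copy the index
       <scale by 2>        ; emitted as a MULT, later matched as shift/add
       addl %ebx, %eax     ; add the base
       addl $4, %eax       ; add the displacement
   with the exact order and copies depending on which registers alias
   the destination.  */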
1298
1299 void
1300 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1301 {
1302 unsigned int regno0, regno1, regno2;
1303 struct ix86_address parts;
1304 rtx target, tmp;
1305 int ok, adds;
1306
1307 ok = ix86_decompose_address (operands[1], &parts);
1308 gcc_assert (ok);
1309
1310 target = gen_lowpart (mode, operands[0]);
1311
1312 regno0 = true_regnum (target);
1313 regno1 = INVALID_REGNUM;
1314 regno2 = INVALID_REGNUM;
1315
1316 if (parts.base)
1317 {
1318 parts.base = gen_lowpart (mode, parts.base);
1319 regno1 = true_regnum (parts.base);
1320 }
1321
1322 if (parts.index)
1323 {
1324 parts.index = gen_lowpart (mode, parts.index);
1325 regno2 = true_regnum (parts.index);
1326 }
1327
1328 if (parts.disp)
1329 parts.disp = gen_lowpart (mode, parts.disp);
1330
1331 if (parts.scale > 1)
1332 {
1333 /* Case r1 = r1 + ... */
1334 if (regno1 == regno0)
1335 {
1336 /* If we have a case r1 = r1 + C * r2 then we
1337 would have to use multiplication, which is very
1338 expensive. Assume the cost model is wrong if we
1339 have such a case here. */
1340 gcc_assert (regno2 != regno0);
1341
1342 for (adds = parts.scale; adds > 0; adds--)
1343 ix86_emit_binop (PLUS, mode, target, parts.index);
1344 }
1345 else
1346 {
1347 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1348 if (regno0 != regno2)
1349 emit_insn (gen_rtx_SET (target, parts.index));
1350
1351 /* Use shift for scaling, but emit it as MULT instead
1352 to avoid it being immediately peephole2 optimized back
1353 into lea. */
1354 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1355
1356 if (parts.base)
1357 ix86_emit_binop (PLUS, mode, target, parts.base);
1358
1359 if (parts.disp && parts.disp != const0_rtx)
1360 ix86_emit_binop (PLUS, mode, target, parts.disp);
1361 }
1362 }
1363 else if (!parts.base && !parts.index)
1364 {
1365 gcc_assert(parts.disp);
1366 emit_insn (gen_rtx_SET (target, parts.disp));
1367 }
1368 else
1369 {
1370 if (!parts.base)
1371 {
1372 if (regno0 != regno2)
1373 emit_insn (gen_rtx_SET (target, parts.index));
1374 }
1375 else if (!parts.index)
1376 {
1377 if (regno0 != regno1)
1378 emit_insn (gen_rtx_SET (target, parts.base));
1379 }
1380 else
1381 {
1382 if (regno0 == regno1)
1383 tmp = parts.index;
1384 else if (regno0 == regno2)
1385 tmp = parts.base;
1386 else
1387 {
1388 rtx tmp1;
1389
1390 /* Find better operand for SET instruction, depending
1391 on which definition is farther from the insn. */
1392 if (find_nearest_reg_def (insn, regno1, regno2))
1393 tmp = parts.index, tmp1 = parts.base;
1394 else
1395 tmp = parts.base, tmp1 = parts.index;
1396
1397 emit_insn (gen_rtx_SET (target, tmp));
1398
1399 if (parts.disp && parts.disp != const0_rtx)
1400 ix86_emit_binop (PLUS, mode, target, parts.disp);
1401
1402 ix86_emit_binop (PLUS, mode, target, tmp1);
1403 return;
1404 }
1405
1406 ix86_emit_binop (PLUS, mode, target, tmp);
1407 }
1408
1409 if (parts.disp && parts.disp != const0_rtx)
1410 ix86_emit_binop (PLUS, mode, target, parts.disp);
1411 }
1412 }
1413
1414 /* Post-reload splitter for converting an SF or DFmode value in an
1415 SSE register into an unsigned SImode. */
1416
1417 void
1418 ix86_split_convert_uns_si_sse (rtx operands[])
1419 {
1420 machine_mode vecmode;
1421 rtx value, large, zero_or_two31, input, two31, x;
1422
1423 large = operands[1];
1424 zero_or_two31 = operands[2];
1425 input = operands[3];
1426 two31 = operands[4];
1427 vecmode = GET_MODE (large);
1428 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1429
1430 /* Load up the value into the low element. We must ensure that the other
1431 elements are valid floats -- zero is the easiest such value. */
1432 if (MEM_P (input))
1433 {
1434 if (vecmode == V4SFmode)
1435 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1436 else
1437 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1438 }
1439 else
1440 {
1441 input = gen_rtx_REG (vecmode, REGNO (input));
1442 emit_move_insn (value, CONST0_RTX (vecmode));
1443 if (vecmode == V4SFmode)
1444 emit_insn (gen_sse_movss (value, value, input));
1445 else
1446 emit_insn (gen_sse2_movsd (value, value, input));
1447 }
1448
1449 emit_move_insn (large, two31);
1450 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1451
1452 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1453 emit_insn (gen_rtx_SET (large, x));
1454
1455 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1456 emit_insn (gen_rtx_SET (zero_or_two31, x));
1457
1458 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1459 emit_insn (gen_rtx_SET (value, x));
1460
1461 large = gen_rtx_REG (V4SImode, REGNO (large));
1462 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1463
1464 x = gen_rtx_REG (V4SImode, REGNO (value));
1465 if (vecmode == V4SFmode)
1466 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1467 else
1468 emit_insn (gen_sse2_cvttpd2dq (x, value));
1469 value = x;
1470
1471 emit_insn (gen_xorv4si3 (value, value, large));
1472 }
1473
1474 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1475 machine_mode mode, rtx target,
1476 rtx var, int one_var);
1477
1478 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1479 Expects the 64-bit DImode to be supplied in a pair of integral
1480 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1481 -mfpmath=sse, !optimize_size only. */
1482
1483 void
1484 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1485 {
1486 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1487 rtx int_xmm, fp_xmm;
1488 rtx biases, exponents;
1489 rtx x;
1490
1491 int_xmm = gen_reg_rtx (V4SImode);
1492 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1493 emit_insn (gen_movdi_to_sse (int_xmm, input));
1494 else if (TARGET_SSE_SPLIT_REGS)
1495 {
1496 emit_clobber (int_xmm);
1497 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1498 }
1499 else
1500 {
1501 x = gen_reg_rtx (V2DImode);
1502 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1503 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1504 }
1505
1506 x = gen_rtx_CONST_VECTOR (V4SImode,
1507 gen_rtvec (4, GEN_INT (0x43300000UL),
1508 GEN_INT (0x45300000UL),
1509 const0_rtx, const0_rtx));
1510 exponents = validize_mem (force_const_mem (V4SImode, x));
1511
1512 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1513 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1514
1515 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1516 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1517 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1518 (0x1.0p84 + double(fp_value_hi_xmm)).
1519 Note these exponents differ by 32. */
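/* Worked example (illustrative input): for the DImode value
   0x0000000300000002 the low word forms 0x1.0p52 + 2.0 and the high
   word forms 0x1.0p84 + 3 * 2^32; after the bias subtraction and the
   final add the result is 3 * 2^32 + 2 = 12884901890.0.  */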
1520
1521 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1522
1523 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1524 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1525 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1526 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1527 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1528 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1529 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1530 biases = validize_mem (force_const_mem (V2DFmode, biases));
1531 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1532
1533 /* Add the upper and lower DFmode values together. */
1534 if (TARGET_SSE3)
1535 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1536 else
1537 {
1538 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1539 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1540 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1541 }
1542
1543 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1544 }
1545
1546 /* Not used, but eases macroization of patterns. */
1547 void
1548 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1549 {
1550 gcc_unreachable ();
1551 }
1552
1553 /* Convert an unsigned SImode value into a DFmode. Only currently used
1554 for SSE, but applicable anywhere. */
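/* The PLUS of -2147483648 below reinterprets the unsigned input as a
   signed value biased by -2^31; converting that to DFmode and adding
   2^31.0 back restores the exact unsigned value.  Illustrative input:
   0xffffffff -> signed 2147483647 -> 2147483647.0 + 2147483648.0
   = 4294967295.0.  */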
1555
1556 void
1557 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1558 {
1559 REAL_VALUE_TYPE TWO31r;
1560 rtx x, fp;
1561
1562 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1563 NULL, 1, OPTAB_DIRECT);
1564
1565 fp = gen_reg_rtx (DFmode);
1566 emit_insn (gen_floatsidf2 (fp, x));
1567
1568 real_ldexp (&TWO31r, &dconst1, 31);
1569 x = const_double_from_real_value (TWO31r, DFmode);
1570
1571 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1572 if (x != target)
1573 emit_move_insn (target, x);
1574 }
1575
1576 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1577 32-bit mode; otherwise we have a direct convert instruction. */
1578
1579 void
1580 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1581 {
1582 REAL_VALUE_TYPE TWO32r;
1583 rtx fp_lo, fp_hi, x;
1584
1585 fp_lo = gen_reg_rtx (DFmode);
1586 fp_hi = gen_reg_rtx (DFmode);
1587
1588 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1589
1590 real_ldexp (&TWO32r, &dconst1, 32);
1591 x = const_double_from_real_value (TWO32r, DFmode);
1592 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1593
1594 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1595
1596 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1597 0, OPTAB_DIRECT);
1598 if (x != target)
1599 emit_move_insn (target, x);
1600 }
1601
1602 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1603 For x86_32, -mfpmath=sse, !optimize_size only. */
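/* Both 16-bit halves are exactly representable in SFmode, so the value
   is rebuilt as (float) (input >> 16) * 0x1.0p16 + (float) (input & 0xffff),
   with only the final addition rounding.  Illustrative input 0x12345678:
   4660 * 65536.0 + 22136 = 305419896.0 (then rounded to float).  */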
1604 void
1605 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1606 {
1607 REAL_VALUE_TYPE ONE16r;
1608 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1609
1610 real_ldexp (&ONE16r, &dconst1, 16);
1611 x = const_double_from_real_value (ONE16r, SFmode);
1612 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1613 NULL, 0, OPTAB_DIRECT);
1614 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1615 NULL, 0, OPTAB_DIRECT);
1616 fp_hi = gen_reg_rtx (SFmode);
1617 fp_lo = gen_reg_rtx (SFmode);
1618 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1619 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1620 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1621 0, OPTAB_DIRECT);
1622 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1623 0, OPTAB_DIRECT);
1624 if (!rtx_equal_p (target, fp_hi))
1625 emit_move_insn (target, fp_hi);
1626 }
1627
1628 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1629 a vector of unsigned ints VAL to vector of floats TARGET. */
1630
1631 void
1632 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1633 {
1634 rtx tmp[8];
1635 REAL_VALUE_TYPE TWO16r;
1636 machine_mode intmode = GET_MODE (val);
1637 machine_mode fltmode = GET_MODE (target);
1638 rtx (*cvt) (rtx, rtx);
1639
1640 if (intmode == V4SImode)
1641 cvt = gen_floatv4siv4sf2;
1642 else
1643 cvt = gen_floatv8siv8sf2;
1644 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1645 tmp[0] = force_reg (intmode, tmp[0]);
1646 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1647 OPTAB_DIRECT);
1648 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1649 NULL_RTX, 1, OPTAB_DIRECT);
1650 tmp[3] = gen_reg_rtx (fltmode);
1651 emit_insn (cvt (tmp[3], tmp[1]));
1652 tmp[4] = gen_reg_rtx (fltmode);
1653 emit_insn (cvt (tmp[4], tmp[2]));
1654 real_ldexp (&TWO16r, &dconst1, 16);
1655 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1656 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1657 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1658 OPTAB_DIRECT);
1659 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1660 OPTAB_DIRECT);
1661 if (tmp[7] != target)
1662 emit_move_insn (target, tmp[7]);
1663 }
1664
1665 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1666 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1667 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1668 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1669
1670 rtx
1671 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1672 {
1673 REAL_VALUE_TYPE TWO31r;
1674 rtx two31r, tmp[4];
1675 machine_mode mode = GET_MODE (val);
1676 machine_mode scalarmode = GET_MODE_INNER (mode);
1677 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1678 rtx (*cmp) (rtx, rtx, rtx, rtx);
1679 int i;
1680
1681 for (i = 0; i < 3; i++)
1682 tmp[i] = gen_reg_rtx (mode);
1683 real_ldexp (&TWO31r, &dconst1, 31);
1684 two31r = const_double_from_real_value (TWO31r, scalarmode);
1685 two31r = ix86_build_const_vector (mode, 1, two31r);
1686 two31r = force_reg (mode, two31r);
1687 switch (mode)
1688 {
1689 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1690 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1691 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1692 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1693 default: gcc_unreachable ();
1694 }
1695 tmp[3] = gen_rtx_LE (mode, two31r, val);
1696 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1697 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1698 0, OPTAB_DIRECT);
1699 if (intmode == V4SImode || TARGET_AVX2)
1700 *xorp = expand_simple_binop (intmode, ASHIFT,
1701 gen_lowpart (intmode, tmp[0]),
1702 GEN_INT (31), NULL_RTX, 0,
1703 OPTAB_DIRECT);
1704 else
1705 {
1706 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
1707 two31 = ix86_build_const_vector (intmode, 1, two31);
1708 *xorp = expand_simple_binop (intmode, AND,
1709 gen_lowpart (intmode, tmp[0]),
1710 two31, NULL_RTX, 0,
1711 OPTAB_DIRECT);
1712 }
1713 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1714 0, OPTAB_DIRECT);
1715 }
1716
1717 /* Generate code for floating point ABS or NEG. */
1718
1719 void
1720 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1721 rtx operands[])
1722 {
1723 rtx set, dst, src;
1724 bool use_sse = false;
1725 bool vector_mode = VECTOR_MODE_P (mode);
1726 machine_mode vmode = mode;
1727 rtvec par;
1728
1729 if (vector_mode || mode == TFmode)
1730 use_sse = true;
1731 else if (TARGET_SSE_MATH)
1732 {
1733 use_sse = SSE_FLOAT_MODE_P (mode);
1734 if (mode == SFmode)
1735 vmode = V4SFmode;
1736 else if (mode == DFmode)
1737 vmode = V2DFmode;
1738 }
1739
1740 dst = operands[0];
1741 src = operands[1];
1742
1743 set = gen_rtx_fmt_e (code, mode, src);
1744 set = gen_rtx_SET (dst, set);
1745
1746 if (use_sse)
1747 {
1748 rtx mask, use, clob;
1749
1750 /* NEG and ABS performed with SSE use bitwise mask operations.
1751 Create the appropriate mask now. */
1752 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1753 use = gen_rtx_USE (VOIDmode, mask);
1754 if (vector_mode || mode == TFmode)
1755 par = gen_rtvec (2, set, use);
1756 else
1757 {
1758 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1759 par = gen_rtvec (3, set, use, clob);
1760 }
1761 }
1762 else
1763 {
1764 rtx clob;
1765
1766 /* Changing the sign of FP values can also be done using the integer unit. */
1767 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1768 par = gen_rtvec (2, set, clob);
1769 }
1770
1771 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1772 }
1773
1774 /* Deconstruct a floating point ABS or NEG operation
1775 with integer registers into integer operations. */
1776
1777 void
1778 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1779 rtx operands[])
1780 {
1781 enum rtx_code absneg_op;
1782 rtx dst, set;
1783
1784 gcc_assert (operands_match_p (operands[0], operands[1]));
1785
1786 switch (mode)
1787 {
1788 case E_SFmode:
1789 dst = gen_lowpart (SImode, operands[0]);
1790
1791 if (code == ABS)
1792 {
1793 set = gen_int_mode (0x7fffffff, SImode);
1794 absneg_op = AND;
1795 }
1796 else
1797 {
1798 set = gen_int_mode (0x80000000, SImode);
1799 absneg_op = XOR;
1800 }
1801 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1802 break;
1803
1804 case E_DFmode:
1805 if (TARGET_64BIT)
1806 {
1807 dst = gen_lowpart (DImode, operands[0]);
1808 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1809
1810 if (code == ABS)
1811 set = const0_rtx;
1812 else
1813 set = gen_rtx_NOT (DImode, dst);
1814 }
1815 else
1816 {
1817 dst = gen_highpart (SImode, operands[0]);
1818
1819 if (code == ABS)
1820 {
1821 set = gen_int_mode (0x7fffffff, SImode);
1822 absneg_op = AND;
1823 }
1824 else
1825 {
1826 set = gen_int_mode (0x80000000, SImode);
1827 absneg_op = XOR;
1828 }
1829 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1830 }
1831 break;
1832
1833 case E_XFmode:
1834 dst = gen_rtx_REG (SImode,
1835 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1836 if (code == ABS)
1837 {
1838 set = GEN_INT (0x7fff);
1839 absneg_op = AND;
1840 }
1841 else
1842 {
1843 set = GEN_INT (0x8000);
1844 absneg_op = XOR;
1845 }
1846 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1847 break;
1848
1849 default:
1850 gcc_unreachable ();
1851 }
1852
1853 set = gen_rtx_SET (dst, set);
1854
1855 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1856 rtvec par = gen_rtvec (2, set, clob);
1857
1858 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1859 }
1860
1861 /* Expand a copysign operation. Special case operand 0 being a constant. */
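/* The operation is done with bit masks: conceptually
     copysign (x, y) = (x & ~SIGNMASK) | (y & SIGNMASK)
   where SIGNMASK has only the sign bit of each element set (for DFmode,
   0x8000000000000000).  ix86_build_signbit_mask supplies the vector
   constant.  */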
1862
1863 void
1864 ix86_expand_copysign (rtx operands[])
1865 {
1866 machine_mode mode, vmode;
1867 rtx dest, op0, op1, mask;
1868
1869 dest = operands[0];
1870 op0 = operands[1];
1871 op1 = operands[2];
1872
1873 mode = GET_MODE (dest);
1874
1875 if (mode == SFmode)
1876 vmode = V4SFmode;
1877 else if (mode == DFmode)
1878 vmode = V2DFmode;
1879 else if (mode == TFmode)
1880 vmode = mode;
1881 else
1882 gcc_unreachable ();
1883
1884 mask = ix86_build_signbit_mask (vmode, 0, 0);
1885
1886 if (CONST_DOUBLE_P (op0))
1887 {
1888 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1889 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1890
1891 if (mode == SFmode || mode == DFmode)
1892 {
1893 if (op0 == CONST0_RTX (mode))
1894 op0 = CONST0_RTX (vmode);
1895 else
1896 {
1897 rtx v = ix86_build_const_vector (vmode, false, op0);
1898
1899 op0 = force_reg (vmode, v);
1900 }
1901 }
1902 else if (op0 != CONST0_RTX (mode))
1903 op0 = force_reg (mode, op0);
1904
1905 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1906 }
1907 else
1908 {
1909 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1910
1911 emit_insn (gen_copysign3_var
1912 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1913 }
1914 }
1915
1916 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1917 be a constant, and so has already been expanded into a vector constant. */
1918
1919 void
1920 ix86_split_copysign_const (rtx operands[])
1921 {
1922 machine_mode mode, vmode;
1923 rtx dest, op0, mask, x;
1924
1925 dest = operands[0];
1926 op0 = operands[1];
1927 mask = operands[3];
1928
1929 mode = GET_MODE (dest);
1930 vmode = GET_MODE (mask);
1931
1932 dest = lowpart_subreg (vmode, dest, mode);
1933 x = gen_rtx_AND (vmode, dest, mask);
1934 emit_insn (gen_rtx_SET (dest, x));
1935
1936 if (op0 != CONST0_RTX (vmode))
1937 {
1938 x = gen_rtx_IOR (vmode, dest, op0);
1939 emit_insn (gen_rtx_SET (dest, x));
1940 }
1941 }
1942
1943 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1944 so we have to do two masks. */
1945
1946 void
1947 ix86_split_copysign_var (rtx operands[])
1948 {
1949 machine_mode mode, vmode;
1950 rtx dest, scratch, op0, op1, mask, nmask, x;
1951
1952 dest = operands[0];
1953 scratch = operands[1];
1954 op0 = operands[2];
1955 op1 = operands[3];
1956 nmask = operands[4];
1957 mask = operands[5];
1958
1959 mode = GET_MODE (dest);
1960 vmode = GET_MODE (mask);
1961
1962 if (rtx_equal_p (op0, op1))
1963 {
1964 /* Shouldn't happen often (it's useless, obviously), but when it does
1965 we'd generate incorrect code if we continue below. */
1966 emit_move_insn (dest, op0);
1967 return;
1968 }
1969
1970 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1971 {
1972 gcc_assert (REGNO (op1) == REGNO (scratch));
1973
1974 x = gen_rtx_AND (vmode, scratch, mask);
1975 emit_insn (gen_rtx_SET (scratch, x));
1976
1977 dest = mask;
1978 op0 = lowpart_subreg (vmode, op0, mode);
1979 x = gen_rtx_NOT (vmode, dest);
1980 x = gen_rtx_AND (vmode, x, op0);
1981 emit_insn (gen_rtx_SET (dest, x));
1982 }
1983 else
1984 {
1985 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1986 {
1987 x = gen_rtx_AND (vmode, scratch, mask);
1988 }
1989 else /* alternative 2,4 */
1990 {
1991 gcc_assert (REGNO (mask) == REGNO (scratch));
1992 op1 = lowpart_subreg (vmode, op1, mode);
1993 x = gen_rtx_AND (vmode, scratch, op1);
1994 }
1995 emit_insn (gen_rtx_SET (scratch, x));
1996
1997 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1998 {
1999 dest = lowpart_subreg (vmode, op0, mode);
2000 x = gen_rtx_AND (vmode, dest, nmask);
2001 }
2002 else /* alternative 3,4 */
2003 {
2004 gcc_assert (REGNO (nmask) == REGNO (dest));
2005 dest = nmask;
2006 op0 = lowpart_subreg (vmode, op0, mode);
2007 x = gen_rtx_AND (vmode, dest, op0);
2008 }
2009 emit_insn (gen_rtx_SET (dest, x));
2010 }
2011
2012 x = gen_rtx_IOR (vmode, dest, scratch);
2013 emit_insn (gen_rtx_SET (dest, x));
2014 }
2015
2016 /* Expand an xorsign operation. */
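/* Illustrative identity behind the expansion below (a sketch): with
   MASK holding only the sign bit,

       xorsign (op0, op1) = op0 ^ (op1 & MASK)

   i.e. op0 has its sign flipped exactly when op1 is negative, which
   corresponds to op0 * copysign (1.0, op1) without a multiply.  */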
2017
2018 void
2019 ix86_expand_xorsign (rtx operands[])
2020 {
2021 machine_mode mode, vmode;
2022 rtx dest, op0, op1, mask;
2023
2024 dest = operands[0];
2025 op0 = operands[1];
2026 op1 = operands[2];
2027
2028 mode = GET_MODE (dest);
2029
2030 if (mode == SFmode)
2031 vmode = V4SFmode;
2032 else if (mode == DFmode)
2033 vmode = V2DFmode;
2034 else
2035 gcc_unreachable ();
2036
2037 mask = ix86_build_signbit_mask (vmode, 0, 0);
2038
2039 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2040 }
2041
2042 /* Deconstruct an xorsign operation into bit masks. */
2043
2044 void
2045 ix86_split_xorsign (rtx operands[])
2046 {
2047 machine_mode mode, vmode;
2048 rtx dest, op0, mask, x;
2049
2050 dest = operands[0];
2051 op0 = operands[1];
2052 mask = operands[3];
2053
2054 mode = GET_MODE (dest);
2055 vmode = GET_MODE (mask);
2056
2057 dest = lowpart_subreg (vmode, dest, mode);
2058 x = gen_rtx_AND (vmode, dest, mask);
2059 emit_insn (gen_rtx_SET (dest, x));
2060
2061 op0 = lowpart_subreg (vmode, op0, mode);
2062 x = gen_rtx_XOR (vmode, dest, op0);
2063 emit_insn (gen_rtx_SET (dest, x));
2064 }
2065
2066 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2067
2068 void
2069 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2070 {
2071 machine_mode mode = GET_MODE (op0);
2072 rtx tmp;
2073
2074 /* Handle the special case of a vector comparison with a boolean result;
2075 transform it using the ptest instruction. */
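/* Rough shape of the emitted sequence (assuming ptest semantics, where
   ZF is set iff the AND of the two operands is all zeros):

       tmp = op0 ^ op1
       ptest tmp, tmp        ; ZF set  <=>  tmp == 0  <=>  op0 == op1
       je/jne label                                                    */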
2076 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2077 {
2078 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2079 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2080
2081 gcc_assert (code == EQ || code == NE);
2082 /* Generate XOR since we can't check that one operand is a zero vector. */
2083 tmp = gen_reg_rtx (mode);
2084 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2085 tmp = gen_lowpart (p_mode, tmp);
2086 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2087 gen_rtx_UNSPEC (CCmode,
2088 gen_rtvec (2, tmp, tmp),
2089 UNSPEC_PTEST)));
2090 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2091 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2092 gen_rtx_LABEL_REF (VOIDmode, label),
2093 pc_rtx);
2094 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2095 return;
2096 }
2097
2098 switch (mode)
2099 {
2100 case E_SFmode:
2101 case E_DFmode:
2102 case E_XFmode:
2103 case E_QImode:
2104 case E_HImode:
2105 case E_SImode:
2106 simple:
2107 tmp = ix86_expand_compare (code, op0, op1);
2108 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2109 gen_rtx_LABEL_REF (VOIDmode, label),
2110 pc_rtx);
2111 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2112 return;
2113
2114 case E_DImode:
2115 if (TARGET_64BIT)
2116 goto simple;
2117 /* On a 32-bit target a DImode comparison may be performed in
2118 SSE registers. To allow this we avoid splitting into
2119 SImode, which is achieved by doing the xor in DImode
2120 and then comparing against zero (which the STV pass
2121 recognizes). We don't compare using xor when optimizing
2122 for size. */
2123 if (!optimize_insn_for_size_p ()
2124 && TARGET_STV
2125 && (code == EQ || code == NE))
2126 {
2127 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2128 op1 = const0_rtx;
2129 }
2130 /* FALLTHRU */
2131 case E_TImode:
2132 /* Expand a double-word (DImode or TImode) branch into multiple compare+branch. */
2133 {
2134 rtx lo[2], hi[2];
2135 rtx_code_label *label2;
2136 enum rtx_code code1, code2, code3;
2137 machine_mode submode;
2138
2139 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2140 {
2141 std::swap (op0, op1);
2142 code = swap_condition (code);
2143 }
2144
2145 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2146 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2147
2148 submode = mode == DImode ? SImode : DImode;
2149
2150 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2151 avoid two branches. This costs one extra insn, so disable when
2152 optimizing for size. */
2153
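/* Illustrative shape of the result for a DImode "a == b" on a 32-bit
   target (register names are only for exposition):

       xorl  lo(b), lo(a)
       xorl  hi(b), hi(a)
       orl   hi(a), lo(a)
       je/jne label                                                    */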
2154 if ((code == EQ || code == NE)
2155 && (!optimize_insn_for_size_p ()
2156 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2157 {
2158 rtx xor0, xor1;
2159
2160 xor1 = hi[0];
2161 if (hi[1] != const0_rtx)
2162 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2163 NULL_RTX, 0, OPTAB_WIDEN);
2164
2165 xor0 = lo[0];
2166 if (lo[1] != const0_rtx)
2167 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2168 NULL_RTX, 0, OPTAB_WIDEN);
2169
2170 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2171 NULL_RTX, 0, OPTAB_WIDEN);
2172
2173 ix86_expand_branch (code, tmp, const0_rtx, label);
2174 return;
2175 }
2176
2177 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
2178 op1 is a constant and its low word is zero, then we can just
2179 examine the high word. Similarly for a low word of -1 with
2180 less-or-equal or greater-than. */
2181
2182 if (CONST_INT_P (hi[1]))
2183 switch (code)
2184 {
2185 case LT: case LTU: case GE: case GEU:
2186 if (lo[1] == const0_rtx)
2187 {
2188 ix86_expand_branch (code, hi[0], hi[1], label);
2189 return;
2190 }
2191 break;
2192 case LE: case LEU: case GT: case GTU:
2193 if (lo[1] == constm1_rtx)
2194 {
2195 ix86_expand_branch (code, hi[0], hi[1], label);
2196 return;
2197 }
2198 break;
2199 default:
2200 break;
2201 }
2202
2203 /* Emulate comparisons that do not depend on the Zero flag with
2204 double-word subtraction. Note that only the Overflow, Sign
2205 and Carry flags are valid, so swap the arguments and condition
2206 of comparisons that would otherwise test the Zero flag. */
2207
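/* For illustration, an unsigned DImode "a < b" on a 32-bit target
   roughly becomes:

       cmpl  lo(b), lo(a)     ; sets CF from lo(a) - lo(b)
       sbbl  hi(b), hi(a)     ; hi(a) - hi(b) - CF, only flags are used
       jb    label            ; branch on the carry flag               */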
2208 switch (code)
2209 {
2210 case LE: case LEU: case GT: case GTU:
2211 std::swap (lo[0], lo[1]);
2212 std::swap (hi[0], hi[1]);
2213 code = swap_condition (code);
2214 /* FALLTHRU */
2215
2216 case LT: case LTU: case GE: case GEU:
2217 {
2218 bool uns = (code == LTU || code == GEU);
2219 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2220 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2221
2222 if (!nonimmediate_operand (lo[0], submode))
2223 lo[0] = force_reg (submode, lo[0]);
2224 if (!x86_64_general_operand (lo[1], submode))
2225 lo[1] = force_reg (submode, lo[1]);
2226
2227 if (!register_operand (hi[0], submode))
2228 hi[0] = force_reg (submode, hi[0]);
2229 if ((uns && !nonimmediate_operand (hi[1], submode))
2230 || (!uns && !x86_64_general_operand (hi[1], submode)))
2231 hi[1] = force_reg (submode, hi[1]);
2232
2233 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2234
2235 tmp = gen_rtx_SCRATCH (submode);
2236 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2237
2238 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2239 ix86_expand_branch (code, tmp, const0_rtx, label);
2240 return;
2241 }
2242
2243 default:
2244 break;
2245 }
2246
2247 /* Otherwise, we need two or three jumps. */
2248
2249 label2 = gen_label_rtx ();
2250
2251 code1 = code;
2252 code2 = swap_condition (code);
2253 code3 = unsigned_condition (code);
2254
2255 switch (code)
2256 {
2257 case LT: case GT: case LTU: case GTU:
2258 break;
2259
2260 case LE: code1 = LT; code2 = GT; break;
2261 case GE: code1 = GT; code2 = LT; break;
2262 case LEU: code1 = LTU; code2 = GTU; break;
2263 case GEU: code1 = GTU; code2 = LTU; break;
2264
2265 case EQ: code1 = UNKNOWN; code2 = NE; break;
2266 case NE: code2 = UNKNOWN; break;
2267
2268 default:
2269 gcc_unreachable ();
2270 }
2271
2272 /*
2273 * a < b =>
2274 * if (hi(a) < hi(b)) goto true;
2275 * if (hi(a) > hi(b)) goto false;
2276 * if (lo(a) < lo(b)) goto true;
2277 * false:
2278 */
2279
2280 if (code1 != UNKNOWN)
2281 ix86_expand_branch (code1, hi[0], hi[1], label);
2282 if (code2 != UNKNOWN)
2283 ix86_expand_branch (code2, hi[0], hi[1], label2);
2284
2285 ix86_expand_branch (code3, lo[0], lo[1], label);
2286
2287 if (code2 != UNKNOWN)
2288 emit_label (label2);
2289 return;
2290 }
2291
2292 default:
2293 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2294 goto simple;
2295 }
2296 }
2297
2298 /* Figure out whether to use unordered fp comparisons. */
2299
2300 static bool
2301 ix86_unordered_fp_compare (enum rtx_code code)
2302 {
2303 if (!TARGET_IEEE_FP)
2304 return false;
2305
2306 switch (code)
2307 {
2308 case LT:
2309 case LE:
2310 case GT:
2311 case GE:
2312 case LTGT:
2313 return false;
2314
2315 case EQ:
2316 case NE:
2317
2318 case UNORDERED:
2319 case ORDERED:
2320 case UNLT:
2321 case UNLE:
2322 case UNGT:
2323 case UNGE:
2324 case UNEQ:
2325 return true;
2326
2327 default:
2328 gcc_unreachable ();
2329 }
2330 }
2331
2332 /* Return a comparison we can do that is equivalent to
2333 swap_condition (code), except possibly for orderedness.
2334 Never change orderedness if TARGET_IEEE_FP, returning
2335 UNKNOWN in that case if necessary. */
2336
2337 static enum rtx_code
2338 ix86_fp_swap_condition (enum rtx_code code)
2339 {
2340 switch (code)
2341 {
2342 case GT: /* GTU - CF=0 & ZF=0 */
2343 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2344 case GE: /* GEU - CF=0 */
2345 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2346 case UNLT: /* LTU - CF=1 */
2347 return TARGET_IEEE_FP ? UNKNOWN : GT;
2348 case UNLE: /* LEU - CF=1 | ZF=1 */
2349 return TARGET_IEEE_FP ? UNKNOWN : GE;
2350 default:
2351 return swap_condition (code);
2352 }
2353 }
2354
2355 /* Return the cost of comparison CODE using the best strategy for performance.
2356 All of the following functions use the number of instructions as the cost metric.
2357 In the future this should be tweaked to compute bytes for optimize_size and to
2358 take into account the performance of various instructions on various CPUs. */
2359
2360 static int
2361 ix86_fp_comparison_cost (enum rtx_code code)
2362 {
2363 int arith_cost;
2364
2365 /* The cost of code using bit-twiddling on %ah. */
2366 switch (code)
2367 {
2368 case UNLE:
2369 case UNLT:
2370 case LTGT:
2371 case GT:
2372 case GE:
2373 case UNORDERED:
2374 case ORDERED:
2375 case UNEQ:
2376 arith_cost = 4;
2377 break;
2378 case LT:
2379 case NE:
2380 case EQ:
2381 case UNGE:
2382 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2383 break;
2384 case LE:
2385 case UNGT:
2386 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2387 break;
2388 default:
2389 gcc_unreachable ();
2390 }
2391
2392 switch (ix86_fp_comparison_strategy (code))
2393 {
2394 case IX86_FPCMP_COMI:
2395 return arith_cost > 4 ? 3 : 2;
2396 case IX86_FPCMP_SAHF:
2397 return arith_cost > 4 ? 4 : 3;
2398 default:
2399 return arith_cost;
2400 }
2401 }
2402
2403 /* Swap, force into registers, or otherwise massage the two operands
2404 to a fp comparison. The operands are updated in place; the new
2405 comparison code is returned. */
2406
2407 static enum rtx_code
2408 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2409 {
2410 bool unordered_compare = ix86_unordered_fp_compare (code);
2411 rtx op0 = *pop0, op1 = *pop1;
2412 machine_mode op_mode = GET_MODE (op0);
2413 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2414
2415 /* All of the unordered compare instructions only work on registers.
2416 The same is true of the fcomi compare instructions. The XFmode
2417 compare instructions require registers except when comparing
2418 against zero or when converting operand 1 from fixed point to
2419 floating point. */
2420
2421 if (!is_sse
2422 && (unordered_compare
2423 || (op_mode == XFmode
2424 && ! (standard_80387_constant_p (op0) == 1
2425 || standard_80387_constant_p (op1) == 1)
2426 && GET_CODE (op1) != FLOAT)
2427 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2428 {
2429 op0 = force_reg (op_mode, op0);
2430 op1 = force_reg (op_mode, op1);
2431 }
2432 else
2433 {
2434 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2435 things around if they appear profitable, otherwise force op0
2436 into a register. */
2437
2438 if (standard_80387_constant_p (op0) == 0
2439 || (MEM_P (op0)
2440 && ! (standard_80387_constant_p (op1) == 0
2441 || MEM_P (op1))))
2442 {
2443 enum rtx_code new_code = ix86_fp_swap_condition (code);
2444 if (new_code != UNKNOWN)
2445 {
2446 std::swap (op0, op1);
2447 code = new_code;
2448 }
2449 }
2450
2451 if (!REG_P (op0))
2452 op0 = force_reg (op_mode, op0);
2453
2454 if (CONSTANT_P (op1))
2455 {
2456 int tmp = standard_80387_constant_p (op1);
2457 if (tmp == 0)
2458 op1 = validize_mem (force_const_mem (op_mode, op1));
2459 else if (tmp == 1)
2460 {
2461 if (TARGET_CMOVE)
2462 op1 = force_reg (op_mode, op1);
2463 }
2464 else
2465 op1 = force_reg (op_mode, op1);
2466 }
2467 }
2468
2469 /* Try to rearrange the comparison to make it cheaper. */
2470 if (ix86_fp_comparison_cost (code)
2471 > ix86_fp_comparison_cost (swap_condition (code))
2472 && (REG_P (op1) || can_create_pseudo_p ()))
2473 {
2474 std::swap (op0, op1);
2475 code = swap_condition (code);
2476 if (!REG_P (op0))
2477 op0 = force_reg (op_mode, op0);
2478 }
2479
2480 *pop0 = op0;
2481 *pop1 = op1;
2482 return code;
2483 }
2484
2485 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2486
2487 static rtx
2488 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2489 {
2490 bool unordered_compare = ix86_unordered_fp_compare (code);
2491 machine_mode cmp_mode;
2492 rtx tmp, scratch;
2493
2494 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2495
2496 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2497 if (unordered_compare)
2498 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2499
2500 /* Do fcomi/sahf based test when profitable. */
2501 switch (ix86_fp_comparison_strategy (code))
2502 {
2503 case IX86_FPCMP_COMI:
2504 cmp_mode = CCFPmode;
2505 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2506 break;
2507
2508 case IX86_FPCMP_SAHF:
2509 cmp_mode = CCFPmode;
2510 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2511 scratch = gen_reg_rtx (HImode);
2512 emit_insn (gen_rtx_SET (scratch, tmp));
2513 emit_insn (gen_x86_sahf_1 (scratch));
2514 break;
2515
2516 case IX86_FPCMP_ARITH:
2517 cmp_mode = CCNOmode;
2518 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2519 scratch = gen_reg_rtx (HImode);
2520 emit_insn (gen_rtx_SET (scratch, tmp));
2521
2522 /* In the unordered case, we have to check C2 for NaNs, which
2523 doesn't happen to work out to anything nice combination-wise.
2524 So do some bit twiddling on the value we've got in AH to come
2525 up with an appropriate set of condition codes. */
2526
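/* Key to the magic constants below: after fnstsw the x87 condition
   bits sit in %ah with C0 at bit 0 (0x01), C2 at bit 2 (0x04) and C3
   at bit 6 (0x40), so 0x45 tests C0|C2|C3. After fcom, C3/C2/C0 play
   the roles of ZF/PF/CF, and all three are set for an unordered
   (NaN) result.  */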
2527 switch (code)
2528 {
2529 case GT:
2530 case UNGT:
2531 if (code == GT || !TARGET_IEEE_FP)
2532 {
2533 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2534 code = EQ;
2535 }
2536 else
2537 {
2538 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2539 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2540 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2541 cmp_mode = CCmode;
2542 code = GEU;
2543 }
2544 break;
2545 case LT:
2546 case UNLT:
2547 if (code == LT && TARGET_IEEE_FP)
2548 {
2549 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2550 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2551 cmp_mode = CCmode;
2552 code = EQ;
2553 }
2554 else
2555 {
2556 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2557 code = NE;
2558 }
2559 break;
2560 case GE:
2561 case UNGE:
2562 if (code == GE || !TARGET_IEEE_FP)
2563 {
2564 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2565 code = EQ;
2566 }
2567 else
2568 {
2569 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2570 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2571 code = NE;
2572 }
2573 break;
2574 case LE:
2575 case UNLE:
2576 if (code == LE && TARGET_IEEE_FP)
2577 {
2578 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2579 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2580 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2581 cmp_mode = CCmode;
2582 code = LTU;
2583 }
2584 else
2585 {
2586 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2587 code = NE;
2588 }
2589 break;
2590 case EQ:
2591 case UNEQ:
2592 if (code == EQ && TARGET_IEEE_FP)
2593 {
2594 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2595 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2596 cmp_mode = CCmode;
2597 code = EQ;
2598 }
2599 else
2600 {
2601 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2602 code = NE;
2603 }
2604 break;
2605 case NE:
2606 case LTGT:
2607 if (code == NE && TARGET_IEEE_FP)
2608 {
2609 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2610 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2611 GEN_INT (0x40)));
2612 code = NE;
2613 }
2614 else
2615 {
2616 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2617 code = EQ;
2618 }
2619 break;
2620
2621 case UNORDERED:
2622 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2623 code = NE;
2624 break;
2625 case ORDERED:
2626 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2627 code = EQ;
2628 break;
2629
2630 default:
2631 gcc_unreachable ();
2632 }
2633 break;
2634
2635 default:
2636 gcc_unreachable();
2637 }
2638
2639 /* Return the test that should be put into the flags user, i.e.
2640 the bcc, scc, or cmov instruction. */
2641 return gen_rtx_fmt_ee (code, VOIDmode,
2642 gen_rtx_REG (cmp_mode, FLAGS_REG),
2643 const0_rtx);
2644 }
2645
2646 /* Generate insn patterns to do an integer compare of OPERANDS. */
2647
2648 static rtx
2649 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2650 {
2651 machine_mode cmpmode;
2652 rtx tmp, flags;
2653
2654 cmpmode = SELECT_CC_MODE (code, op0, op1);
2655 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2656
2657 /* This is very simple, but making the interface the same as in the
2658 FP case makes the rest of the code easier. */
2659 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2660 emit_insn (gen_rtx_SET (flags, tmp));
2661
2662 /* Return the test that should be put into the flags user, i.e.
2663 the bcc, scc, or cmov instruction. */
2664 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2665 }
2666
2667 static rtx
2668 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2669 {
2670 rtx ret;
2671
2672 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2673 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2674
2675 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2676 {
2677 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2678 ret = ix86_expand_fp_compare (code, op0, op1);
2679 }
2680 else
2681 ret = ix86_expand_int_compare (code, op0, op1);
2682
2683 return ret;
2684 }
2685
2686 void
2687 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2688 {
2689 rtx ret;
2690
2691 gcc_assert (GET_MODE (dest) == QImode);
2692
2693 ret = ix86_expand_compare (code, op0, op1);
2694 PUT_MODE (ret, QImode);
2695 emit_insn (gen_rtx_SET (dest, ret));
2696 }
2697
2698 /* Expand a comparison that sets or clears the carry flag. Return true when
2699 successful, and set *POP to the resulting comparison. */
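/* For illustration: the integer cases below rewrite the condition into
   LTU or GEU so that the carry flag alone decides the result, e.g.

       a == 0   becomes   (unsigned) a < 1
       a >= 0   becomes   (unsigned) a < 0x80000000   (for SImode)

   which lets callers fold the flag into sbb/adc instead of setcc.  */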
2700 static bool
2701 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2702 {
2703 machine_mode mode
2704 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2705
2706 /* Do not handle double-word compares, which go through a special path. */
2707 if (mode == (TARGET_64BIT ? TImode : DImode))
2708 return false;
2709
2710 if (SCALAR_FLOAT_MODE_P (mode))
2711 {
2712 rtx compare_op;
2713 rtx_insn *compare_seq;
2714
2715 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2716
2717 /* Shortcut: the following common codes never translate
2718 into carry-flag compares. */
2719 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2720 || code == ORDERED || code == UNORDERED)
2721 return false;
2722
2723 /* These comparisons require the zero flag; swap the operands so they won't. */
2724 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2725 && !TARGET_IEEE_FP)
2726 {
2727 std::swap (op0, op1);
2728 code = swap_condition (code);
2729 }
2730
2731 /* Try to expand the comparison and verify that we end up with
2732 a carry-flag-based comparison. This fails only when we decide
2733 to expand the comparison using arithmetic, which is not a
2734 common scenario. */
2735 start_sequence ();
2736 compare_op = ix86_expand_fp_compare (code, op0, op1);
2737 compare_seq = get_insns ();
2738 end_sequence ();
2739
2740 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2741 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2742 else
2743 code = GET_CODE (compare_op);
2744
2745 if (code != LTU && code != GEU)
2746 return false;
2747
2748 emit_insn (compare_seq);
2749 *pop = compare_op;
2750 return true;
2751 }
2752
2753 if (!INTEGRAL_MODE_P (mode))
2754 return false;
2755
2756 switch (code)
2757 {
2758 case LTU:
2759 case GEU:
2760 break;
2761
2762 /* Convert a==0 into (unsigned)a<1. */
2763 case EQ:
2764 case NE:
2765 if (op1 != const0_rtx)
2766 return false;
2767 op1 = const1_rtx;
2768 code = (code == EQ ? LTU : GEU);
2769 break;
2770
2771 /* Convert a>b into b<a or a>=b+1. */
2772 case GTU:
2773 case LEU:
2774 if (CONST_INT_P (op1))
2775 {
2776 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2777 /* Bail out on overflow. We still can swap operands but that
2778 would force loading of the constant into register. */
2779 if (op1 == const0_rtx
2780 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2781 return false;
2782 code = (code == GTU ? GEU : LTU);
2783 }
2784 else
2785 {
2786 std::swap (op0, op1);
2787 code = (code == GTU ? LTU : GEU);
2788 }
2789 break;
2790
2791 /* Convert a>=0 into (unsigned)a<0x80000000. */
2792 case LT:
2793 case GE:
2794 if (mode == DImode || op1 != const0_rtx)
2795 return false;
2796 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2797 code = (code == LT ? GEU : LTU);
2798 break;
2799 case LE:
2800 case GT:
2801 if (mode == DImode || op1 != constm1_rtx)
2802 return false;
2803 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2804 code = (code == LE ? GEU : LTU);
2805 break;
2806
2807 default:
2808 return false;
2809 }
2810 /* Swapping operands may cause a constant to appear as the first operand. */
2811 if (!nonimmediate_operand (op0, VOIDmode))
2812 {
2813 if (!can_create_pseudo_p ())
2814 return false;
2815 op0 = force_reg (mode, op0);
2816 }
2817 *pop = ix86_expand_compare (code, op0, op1);
2818 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2819 return true;
2820 }
2821
2822 /* Expand a conditional increment or decrement using adc/sbb instructions.
2823 The default case using setcc followed by the conditional move can be
2824 done by generic code. */
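/* A rough example of the intent (not a literal emitted sequence): with
   unsigned A and B, "x += (A < B)" can come out as

       cmpl  B, A
       adcl  $0, x

   i.e. the carry produced by the compare is folded into the add.  */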
2825 bool
2826 ix86_expand_int_addcc (rtx operands[])
2827 {
2828 enum rtx_code code = GET_CODE (operands[1]);
2829 rtx flags;
2830 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2831 rtx compare_op;
2832 rtx val = const0_rtx;
2833 bool fpcmp = false;
2834 machine_mode mode;
2835 rtx op0 = XEXP (operands[1], 0);
2836 rtx op1 = XEXP (operands[1], 1);
2837
2838 if (operands[3] != const1_rtx
2839 && operands[3] != constm1_rtx)
2840 return false;
2841 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2842 return false;
2843 code = GET_CODE (compare_op);
2844
2845 flags = XEXP (compare_op, 0);
2846
2847 if (GET_MODE (flags) == CCFPmode)
2848 {
2849 fpcmp = true;
2850 code = ix86_fp_compare_code_to_integer (code);
2851 }
2852
2853 if (code != LTU)
2854 {
2855 val = constm1_rtx;
2856 if (fpcmp)
2857 PUT_CODE (compare_op,
2858 reverse_condition_maybe_unordered
2859 (GET_CODE (compare_op)));
2860 else
2861 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2862 }
2863
2864 mode = GET_MODE (operands[0]);
2865
2866 /* Construct either adc or sbb insn. */
2867 if ((code == LTU) == (operands[3] == constm1_rtx))
2868 insn = gen_sub3_carry;
2869 else
2870 insn = gen_add3_carry;
2871
2872 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2873
2874 return true;
2875 }
2876
2877 bool
2878 ix86_expand_int_movcc (rtx operands[])
2879 {
2880 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2881 rtx_insn *compare_seq;
2882 rtx compare_op;
2883 machine_mode mode = GET_MODE (operands[0]);
2884 bool sign_bit_compare_p = false;
2885 rtx op0 = XEXP (operands[1], 0);
2886 rtx op1 = XEXP (operands[1], 1);
2887
2888 if (GET_MODE (op0) == TImode
2889 || (GET_MODE (op0) == DImode
2890 && !TARGET_64BIT))
2891 return false;
2892
2893 start_sequence ();
2894 compare_op = ix86_expand_compare (code, op0, op1);
2895 compare_seq = get_insns ();
2896 end_sequence ();
2897
2898 compare_code = GET_CODE (compare_op);
2899
2900 if ((op1 == const0_rtx && (code == GE || code == LT))
2901 || (op1 == constm1_rtx && (code == GT || code == LE)))
2902 sign_bit_compare_p = true;
2903
2904 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2905 HImode insns, we'd be swallowed in word prefix ops. */
2906
2907 if ((mode != HImode || TARGET_FAST_PREFIX)
2908 && (mode != (TARGET_64BIT ? TImode : DImode))
2909 && CONST_INT_P (operands[2])
2910 && CONST_INT_P (operands[3]))
2911 {
2912 rtx out = operands[0];
2913 HOST_WIDE_INT ct = INTVAL (operands[2]);
2914 HOST_WIDE_INT cf = INTVAL (operands[3]);
2915 HOST_WIDE_INT diff;
2916
2917 diff = ct - cf;
2918 /* Sign-bit compares are better done using shifts than by using
2919 sbb. */
2920 if (sign_bit_compare_p
2921 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2922 {
2923 /* Detect overlap between destination and compare sources. */
2924 rtx tmp = out;
2925
2926 if (!sign_bit_compare_p)
2927 {
2928 rtx flags;
2929 bool fpcmp = false;
2930
2931 compare_code = GET_CODE (compare_op);
2932
2933 flags = XEXP (compare_op, 0);
2934
2935 if (GET_MODE (flags) == CCFPmode)
2936 {
2937 fpcmp = true;
2938 compare_code
2939 = ix86_fp_compare_code_to_integer (compare_code);
2940 }
2941
2942 /* To simplify rest of code, restrict to the GEU case. */
2943 if (compare_code == LTU)
2944 {
2945 std::swap (ct, cf);
2946 compare_code = reverse_condition (compare_code);
2947 code = reverse_condition (code);
2948 }
2949 else
2950 {
2951 if (fpcmp)
2952 PUT_CODE (compare_op,
2953 reverse_condition_maybe_unordered
2954 (GET_CODE (compare_op)));
2955 else
2956 PUT_CODE (compare_op,
2957 reverse_condition (GET_CODE (compare_op)));
2958 }
2959 diff = ct - cf;
2960
2961 if (reg_overlap_mentioned_p (out, op0)
2962 || reg_overlap_mentioned_p (out, op1))
2963 tmp = gen_reg_rtx (mode);
2964
2965 if (mode == DImode)
2966 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2967 else
2968 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2969 flags, compare_op));
2970 }
2971 else
2972 {
2973 if (code == GT || code == GE)
2974 code = reverse_condition (code);
2975 else
2976 {
2977 std::swap (ct, cf);
2978 diff = ct - cf;
2979 }
2980 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2981 }
2982
2983 if (diff == 1)
2984 {
2985 /*
2986 * cmpl op0,op1
2987 * sbbl dest,dest
2988 * [addl dest, ct]
2989 *
2990 * Size 5 - 8.
2991 */
2992 if (ct)
2993 tmp = expand_simple_binop (mode, PLUS,
2994 tmp, GEN_INT (ct),
2995 copy_rtx (tmp), 1, OPTAB_DIRECT);
2996 }
2997 else if (cf == -1)
2998 {
2999 /*
3000 * cmpl op0,op1
3001 * sbbl dest,dest
3002 * orl $ct, dest
3003 *
3004 * Size 8.
3005 */
3006 tmp = expand_simple_binop (mode, IOR,
3007 tmp, GEN_INT (ct),
3008 copy_rtx (tmp), 1, OPTAB_DIRECT);
3009 }
3010 else if (diff == -1 && ct)
3011 {
3012 /*
3013 * cmpl op0,op1
3014 * sbbl dest,dest
3015 * notl dest
3016 * [addl dest, cf]
3017 *
3018 * Size 8 - 11.
3019 */
3020 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3021 if (cf)
3022 tmp = expand_simple_binop (mode, PLUS,
3023 copy_rtx (tmp), GEN_INT (cf),
3024 copy_rtx (tmp), 1, OPTAB_DIRECT);
3025 }
3026 else
3027 {
3028 /*
3029 * cmpl op0,op1
3030 * sbbl dest,dest
3031 * [notl dest]
3032 * andl cf - ct, dest
3033 * [addl dest, ct]
3034 *
3035 * Size 8 - 11.
3036 */
3037
3038 if (cf == 0)
3039 {
3040 cf = ct;
3041 ct = 0;
3042 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3043 }
3044
3045 tmp = expand_simple_binop (mode, AND,
3046 copy_rtx (tmp),
3047 gen_int_mode (cf - ct, mode),
3048 copy_rtx (tmp), 1, OPTAB_DIRECT);
3049 if (ct)
3050 tmp = expand_simple_binop (mode, PLUS,
3051 copy_rtx (tmp), GEN_INT (ct),
3052 copy_rtx (tmp), 1, OPTAB_DIRECT);
3053 }
3054
3055 if (!rtx_equal_p (tmp, out))
3056 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3057
3058 return true;
3059 }
3060
3061 if (diff < 0)
3062 {
3063 machine_mode cmp_mode = GET_MODE (op0);
3064 enum rtx_code new_code;
3065
3066 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3067 {
3068 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3069
3070 /* We may be reversing a non-trapping
3071 comparison to a trapping comparison. */
3072 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3073 && code != EQ && code != NE
3074 && code != ORDERED && code != UNORDERED)
3075 new_code = UNKNOWN;
3076 else
3077 new_code = reverse_condition_maybe_unordered (code);
3078 }
3079 else
3080 new_code = ix86_reverse_condition (code, cmp_mode);
3081 if (new_code != UNKNOWN)
3082 {
3083 std::swap (ct, cf);
3084 diff = -diff;
3085 code = new_code;
3086 }
3087 }
3088
3089 compare_code = UNKNOWN;
3090 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3091 && CONST_INT_P (op1))
3092 {
3093 if (op1 == const0_rtx
3094 && (code == LT || code == GE))
3095 compare_code = code;
3096 else if (op1 == constm1_rtx)
3097 {
3098 if (code == LE)
3099 compare_code = LT;
3100 else if (code == GT)
3101 compare_code = GE;
3102 }
3103 }
3104
3105 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3106 if (compare_code != UNKNOWN
3107 && GET_MODE (op0) == GET_MODE (out)
3108 && (cf == -1 || ct == -1))
3109 {
3110 /* If lea code below could be used, only optimize
3111 if it results in a 2 insn sequence. */
3112
3113 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3114 || diff == 3 || diff == 5 || diff == 9)
3115 || (compare_code == LT && ct == -1)
3116 || (compare_code == GE && cf == -1))
3117 {
3118 /*
3119 * notl op1 (if necessary)
3120 * sarl $31, op1
3121 * orl cf, op1
3122 */
3123 if (ct != -1)
3124 {
3125 cf = ct;
3126 ct = -1;
3127 code = reverse_condition (code);
3128 }
3129
3130 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3131
3132 out = expand_simple_binop (mode, IOR,
3133 out, GEN_INT (cf),
3134 out, 1, OPTAB_DIRECT);
3135 if (out != operands[0])
3136 emit_move_insn (operands[0], out);
3137
3138 return true;
3139 }
3140 }
3141
3142
3143 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3144 || diff == 3 || diff == 5 || diff == 9)
3145 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3146 && (mode != DImode
3147 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3148 {
3149 /*
3150 * xorl dest,dest
3151 * cmpl op1,op2
3152 * setcc dest
3153 * lea cf(dest*(ct-cf)),dest
3154 *
3155 * Size 14.
3156 *
3157 * This also catches the degenerate setcc-only case.
3158 */
3159
3160 rtx tmp;
3161 int nops;
3162
3163 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3164
3165 nops = 0;
3166 /* On x86_64 the lea instruction operates on Pmode, so we need
3167 to get the arithmetic done in the proper mode to match. */
3168 if (diff == 1)
3169 tmp = copy_rtx (out);
3170 else
3171 {
3172 rtx out1;
3173 out1 = copy_rtx (out);
3174 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3175 nops++;
3176 if (diff & 1)
3177 {
3178 tmp = gen_rtx_PLUS (mode, tmp, out1);
3179 nops++;
3180 }
3181 }
3182 if (cf != 0)
3183 {
3184 tmp = plus_constant (mode, tmp, cf);
3185 nops++;
3186 }
3187 if (!rtx_equal_p (tmp, out))
3188 {
3189 if (nops == 1)
3190 out = force_operand (tmp, copy_rtx (out));
3191 else
3192 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3193 }
3194 if (!rtx_equal_p (out, operands[0]))
3195 emit_move_insn (operands[0], copy_rtx (out));
3196
3197 return true;
3198 }
3199
3200 /*
3201 * General case: Jumpful:
3202 * xorl dest,dest cmpl op1, op2
3203 * cmpl op1, op2 movl ct, dest
3204 * setcc dest jcc 1f
3205 * decl dest movl cf, dest
3206 * andl (cf-ct),dest 1:
3207 * addl ct,dest
3208 *
3209 * Size 20. Size 14.
3210 *
3211 * This is reasonably steep, but branch mispredict costs are
3212 * high on modern cpus, so consider failing only if optimizing
3213 * for space.
3214 */
3215
3216 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3217 && BRANCH_COST (optimize_insn_for_speed_p (),
3218 false) >= 2)
3219 {
3220 if (cf == 0)
3221 {
3222 machine_mode cmp_mode = GET_MODE (op0);
3223 enum rtx_code new_code;
3224
3225 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3226 {
3227 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3228
3229 /* We may be reversing a non-trapping
3230 comparison to a trapping comparison. */
3231 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3232 && code != EQ && code != NE
3233 && code != ORDERED && code != UNORDERED)
3234 new_code = UNKNOWN;
3235 else
3236 new_code = reverse_condition_maybe_unordered (code);
3237
3238 }
3239 else
3240 {
3241 new_code = ix86_reverse_condition (code, cmp_mode);
3242 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3243 compare_code = reverse_condition (compare_code);
3244 }
3245
3246 if (new_code != UNKNOWN)
3247 {
3248 cf = ct;
3249 ct = 0;
3250 code = new_code;
3251 }
3252 }
3253
3254 if (compare_code != UNKNOWN)
3255 {
3256 /* notl op1 (if needed)
3257 sarl $31, op1
3258 andl (cf-ct), op1
3259 addl ct, op1
3260
3261 For x < 0 (resp. x <= -1) there will be no notl,
3262 so if possible swap the constants to get rid of the
3263 complement.
3264 True/false will be -1/0 while code below (store flag
3265 followed by decrement) is 0/-1, so the constants need
3266 to be exchanged once more. */
3267
3268 if (compare_code == GE || !cf)
3269 {
3270 code = reverse_condition (code);
3271 compare_code = LT;
3272 }
3273 else
3274 std::swap (ct, cf);
3275
3276 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3277 }
3278 else
3279 {
3280 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3281
3282 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3283 constm1_rtx,
3284 copy_rtx (out), 1, OPTAB_DIRECT);
3285 }
3286
3287 out = expand_simple_binop (mode, AND, copy_rtx (out),
3288 gen_int_mode (cf - ct, mode),
3289 copy_rtx (out), 1, OPTAB_DIRECT);
3290 if (ct)
3291 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3292 copy_rtx (out), 1, OPTAB_DIRECT);
3293 if (!rtx_equal_p (out, operands[0]))
3294 emit_move_insn (operands[0], copy_rtx (out));
3295
3296 return true;
3297 }
3298 }
3299
3300 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3301 {
3302 /* Try a few more things with specific constants and a variable. */
3303
3304 optab op;
3305 rtx var, orig_out, out, tmp;
3306
3307 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3308 return false;
3309
3310 /* If one of the two operands is an interesting constant (0 or -1), use
3311 the code above to load a 0/-1 mask and mask the variable in with a logical operation. */
3312
3313 if (CONST_INT_P (operands[2]))
3314 {
3315 var = operands[3];
3316 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3317 operands[3] = constm1_rtx, op = and_optab;
3318 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3319 operands[3] = const0_rtx, op = ior_optab;
3320 else
3321 return false;
3322 }
3323 else if (CONST_INT_P (operands[3]))
3324 {
3325 var = operands[2];
3326 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3327 {
3328 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3329 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3330 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3331 operands[1] = simplify_gen_relational (LT, VOIDmode,
3332 GET_MODE (op0),
3333 op0, const0_rtx);
3334
3335 operands[2] = constm1_rtx;
3336 op = and_optab;
3337 }
3338 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3339 operands[2] = const0_rtx, op = ior_optab;
3340 else
3341 return false;
3342 }
3343 else
3344 return false;
3345
3346 orig_out = operands[0];
3347 tmp = gen_reg_rtx (mode);
3348 operands[0] = tmp;
3349
3350 /* Recurse to get the constant loaded. */
3351 if (!ix86_expand_int_movcc (operands))
3352 return false;
3353
3354 /* Mask in the interesting variable. */
3355 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3356 OPTAB_WIDEN);
3357 if (!rtx_equal_p (out, orig_out))
3358 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3359
3360 return true;
3361 }
3362
3363 /*
3364 * For comparison with above,
3365 *
3366 * movl cf,dest
3367 * movl ct,tmp
3368 * cmpl op1,op2
3369 * cmovcc tmp,dest
3370 *
3371 * Size 15.
3372 */
3373
3374 if (! nonimmediate_operand (operands[2], mode))
3375 operands[2] = force_reg (mode, operands[2]);
3376 if (! nonimmediate_operand (operands[3], mode))
3377 operands[3] = force_reg (mode, operands[3]);
3378
3379 if (! register_operand (operands[2], VOIDmode)
3380 && (mode == QImode
3381 || ! register_operand (operands[3], VOIDmode)))
3382 operands[2] = force_reg (mode, operands[2]);
3383
3384 if (mode == QImode
3385 && ! register_operand (operands[3], VOIDmode))
3386 operands[3] = force_reg (mode, operands[3]);
3387
3388 emit_insn (compare_seq);
3389 emit_insn (gen_rtx_SET (operands[0],
3390 gen_rtx_IF_THEN_ELSE (mode,
3391 compare_op, operands[2],
3392 operands[3])));
3393 return true;
3394 }
3395
3396 /* Detect conditional moves that exactly match min/max operational
3397 semantics. Note that this is IEEE safe, as long as we don't
3398 interchange the operands.
3399
3400 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3401 and TRUE if the operation is successful and instructions are emitted. */
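/* Background for the operand-order caveat above (a sketch, not a full
   description of minss/minsd): the SSE scalar min/max instructions
   return the second operand when the comparison is unordered or both
   operands are zero, so "x < y ? x : y" maps onto min (x, y) only as
   long as the operand order is preserved.  */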
3402
3403 static bool
3404 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3405 rtx cmp_op1, rtx if_true, rtx if_false)
3406 {
3407 machine_mode mode;
3408 bool is_min;
3409 rtx tmp;
3410
3411 if (code == LT)
3412 ;
3413 else if (code == UNGE)
3414 std::swap (if_true, if_false);
3415 else
3416 return false;
3417
3418 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3419 is_min = true;
3420 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3421 is_min = false;
3422 else
3423 return false;
3424
3425 mode = GET_MODE (dest);
3426
3427 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3428 but MODE may be a vector mode and thus not appropriate. */
3429 if (!flag_finite_math_only || flag_signed_zeros)
3430 {
3431 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3432 rtvec v;
3433
3434 if_true = force_reg (mode, if_true);
3435 v = gen_rtvec (2, if_true, if_false);
3436 tmp = gen_rtx_UNSPEC (mode, v, u);
3437 }
3438 else
3439 {
3440 code = is_min ? SMIN : SMAX;
3441 if (MEM_P (if_true) && MEM_P (if_false))
3442 if_true = force_reg (mode, if_true);
3443 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3444 }
3445
3446 emit_insn (gen_rtx_SET (dest, tmp));
3447 return true;
3448 }
3449
3450 /* Return true if MODE is valid for a vector compare into a mask register;
3451 the same holds for a conditional vector move with a mask register. */
3452 static bool
3453 ix86_valid_mask_cmp_mode (machine_mode mode)
3454 {
3455 /* XOP has its own vector conditional move. */
3456 if (TARGET_XOP && !TARGET_AVX512F)
3457 return false;
3458
3459 /* AVX512F is needed for mask operations. */
3460 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3461 return false;
3462
3463 /* AVX512BW is needed for vector QI/HImode;
3464 AVX512VL is needed for 128/256-bit vectors. */
3465 machine_mode inner_mode = GET_MODE_INNER (mode);
3466 int vector_size = GET_MODE_SIZE (mode);
3467 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3468 return false;
3469
3470 return vector_size == 64 || TARGET_AVX512VL;
3471 }
3472
3473 /* Return true if integer mask comparison should be used. */
3474 static bool
3475 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3476 rtx op_true, rtx op_false)
3477 {
3478 if (GET_MODE_SIZE (mode) == 64)
3479 return true;
3480
3481 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3482 gcc_assert (!op_true == !op_false);
3483
3484 /* When op_true/op_false is NULL or cmp_mode is not a valid mask cmp mode,
3485 a vector dest is required. */
3486 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3487 return false;
3488
3489 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3490 if (op_false == CONST0_RTX (mode)
3491 || op_true == CONST0_RTX (mode)
3492 || (INTEGRAL_MODE_P (mode)
3493 && (op_true == CONSTM1_RTX (mode)
3494 || op_false == CONSTM1_RTX (mode))))
3495 return false;
3496
3497 return true;
3498 }
3499
3500 /* Expand an SSE comparison. Return the register with the result. */
3501
3502 static rtx
3503 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3504 rtx op_true, rtx op_false)
3505 {
3506 machine_mode mode = GET_MODE (dest);
3507 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3508
3509 /* In the general case the mode of the comparison result can differ from the mode of the operands. */
3510 machine_mode cmp_mode;
3511
3512 /* In AVX512F the result of comparison is an integer mask. */
3513 bool maskcmp = false;
3514 rtx x;
3515
3516 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
3517 {
3518 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3519 maskcmp = true;
3520 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3521 }
3522 else
3523 cmp_mode = cmp_ops_mode;
3524
3525 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3526
3527 int (*op1_predicate)(rtx, machine_mode)
3528 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3529
3530 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3531 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3532
3533 if (optimize
3534 || (maskcmp && cmp_mode != mode)
3535 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3536 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3537 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3538
3539 if (maskcmp)
3540 {
3541 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3542 gcc_assert (ok);
3543 return dest;
3544 }
3545
3546 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3547
3548 if (cmp_mode != mode)
3549 {
3550 x = force_reg (cmp_ops_mode, x);
3551 convert_move (dest, x, false);
3552 }
3553 else
3554 emit_insn (gen_rtx_SET (dest, x));
3555
3556 return dest;
3557 }
3558
3559 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3560 operations. This is used for both scalar and vector conditional moves. */
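/* When no blend or mask-move instruction applies, the fallback at the
   bottom of this function uses the classic mask trick; schematically,
   assuming CMP is a per-element all-ones/all-zeros mask:

       dest = (cmp & op_true) | (~cmp & op_false)                      */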
3561
3562 void
3563 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3564 {
3565 machine_mode mode = GET_MODE (dest);
3566 machine_mode cmpmode = GET_MODE (cmp);
3567
3568 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3569 if (rtx_equal_p (op_true, op_false))
3570 {
3571 emit_move_insn (dest, op_true);
3572 return;
3573 }
3574
3575 rtx t2, t3, x;
3576
3577 /* If we have an integer mask and an FP value then we need
3578 to cast the mask to the FP mode. */
3579 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3580 {
3581 cmp = force_reg (cmpmode, cmp);
3582 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3583 }
3584
3585 /* In AVX512F the result of comparison is an integer mask. */
3586 if (mode != cmpmode
3587 && GET_MODE_CLASS (cmpmode) == MODE_INT)
3588 {
3589 gcc_assert (ix86_valid_mask_cmp_mode (mode));
3590 /* Using vector move with mask register. */
3591 cmp = force_reg (cmpmode, cmp);
3592 /* Optimize for mask zero. */
3593 op_true = (op_true != CONST0_RTX (mode)
3594 ? force_reg (mode, op_true) : op_true);
3595 op_false = (op_false != CONST0_RTX (mode)
3596 ? force_reg (mode, op_false) : op_false);
3597 if (op_true == CONST0_RTX (mode))
3598 {
3599 rtx n = gen_reg_rtx (cmpmode);
3600 if (cmpmode == E_DImode && !TARGET_64BIT)
3601 emit_insn (gen_knotdi (n, cmp));
3602 else
3603 emit_insn (gen_rtx_SET (n, gen_rtx_fmt_e (NOT, cmpmode, cmp)));
3604 cmp = n;
3605 /* Swap op_true and op_false. */
3606 std::swap (op_true, op_false);
3607 }
3608
3609 rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
3610 emit_insn (gen_rtx_SET (dest, vec_merge));
3611 return;
3612 }
3613 else if (vector_all_ones_operand (op_true, mode)
3614 && op_false == CONST0_RTX (mode))
3615 {
3616 emit_insn (gen_rtx_SET (dest, cmp));
3617 return;
3618 }
3619 else if (op_false == CONST0_RTX (mode))
3620 {
3621 op_true = force_reg (mode, op_true);
3622 x = gen_rtx_AND (mode, cmp, op_true);
3623 emit_insn (gen_rtx_SET (dest, x));
3624 return;
3625 }
3626 else if (op_true == CONST0_RTX (mode))
3627 {
3628 op_false = force_reg (mode, op_false);
3629 x = gen_rtx_NOT (mode, cmp);
3630 x = gen_rtx_AND (mode, x, op_false);
3631 emit_insn (gen_rtx_SET (dest, x));
3632 return;
3633 }
3634 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3635 {
3636 op_false = force_reg (mode, op_false);
3637 x = gen_rtx_IOR (mode, cmp, op_false);
3638 emit_insn (gen_rtx_SET (dest, x));
3639 return;
3640 }
3641 else if (TARGET_XOP)
3642 {
3643 op_true = force_reg (mode, op_true);
3644
3645 if (!nonimmediate_operand (op_false, mode))
3646 op_false = force_reg (mode, op_false);
3647
3648 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3649 op_true,
3650 op_false)));
3651 return;
3652 }
3653
3654 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3655 rtx d = dest;
3656
3657 if (!vector_operand (op_true, mode))
3658 op_true = force_reg (mode, op_true);
3659
3660 op_false = force_reg (mode, op_false);
3661
3662 switch (mode)
3663 {
3664 case E_V4SFmode:
3665 if (TARGET_SSE4_1)
3666 gen = gen_sse4_1_blendvps;
3667 break;
3668 case E_V2DFmode:
3669 if (TARGET_SSE4_1)
3670 gen = gen_sse4_1_blendvpd;
3671 break;
3672 case E_SFmode:
3673 if (TARGET_SSE4_1)
3674 {
3675 gen = gen_sse4_1_blendvss;
3676 op_true = force_reg (mode, op_true);
3677 }
3678 break;
3679 case E_DFmode:
3680 if (TARGET_SSE4_1)
3681 {
3682 gen = gen_sse4_1_blendvsd;
3683 op_true = force_reg (mode, op_true);
3684 }
3685 break;
3686 case E_V16QImode:
3687 case E_V8HImode:
3688 case E_V4SImode:
3689 case E_V2DImode:
3690 if (TARGET_SSE4_1)
3691 {
3692 gen = gen_sse4_1_pblendvb;
3693 if (mode != V16QImode)
3694 d = gen_reg_rtx (V16QImode);
3695 op_false = gen_lowpart (V16QImode, op_false);
3696 op_true = gen_lowpart (V16QImode, op_true);
3697 cmp = gen_lowpart (V16QImode, cmp);
3698 }
3699 break;
3700 case E_V8SFmode:
3701 if (TARGET_AVX)
3702 gen = gen_avx_blendvps256;
3703 break;
3704 case E_V4DFmode:
3705 if (TARGET_AVX)
3706 gen = gen_avx_blendvpd256;
3707 break;
3708 case E_V32QImode:
3709 case E_V16HImode:
3710 case E_V8SImode:
3711 case E_V4DImode:
3712 if (TARGET_AVX2)
3713 {
3714 gen = gen_avx2_pblendvb;
3715 if (mode != V32QImode)
3716 d = gen_reg_rtx (V32QImode);
3717 op_false = gen_lowpart (V32QImode, op_false);
3718 op_true = gen_lowpart (V32QImode, op_true);
3719 cmp = gen_lowpart (V32QImode, cmp);
3720 }
3721 break;
3722
3723 case E_V64QImode:
3724 gen = gen_avx512bw_blendmv64qi;
3725 break;
3726 case E_V32HImode:
3727 gen = gen_avx512bw_blendmv32hi;
3728 break;
3729 case E_V16SImode:
3730 gen = gen_avx512f_blendmv16si;
3731 break;
3732 case E_V8DImode:
3733 gen = gen_avx512f_blendmv8di;
3734 break;
3735 case E_V8DFmode:
3736 gen = gen_avx512f_blendmv8df;
3737 break;
3738 case E_V16SFmode:
3739 gen = gen_avx512f_blendmv16sf;
3740 break;
3741
3742 default:
3743 break;
3744 }
3745
3746 if (gen != NULL)
3747 {
3748 emit_insn (gen (d, op_false, op_true, cmp));
3749 if (d != dest)
3750 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3751 }
3752 else
3753 {
3754 op_true = force_reg (mode, op_true);
3755
3756 t2 = gen_reg_rtx (mode);
3757 if (optimize)
3758 t3 = gen_reg_rtx (mode);
3759 else
3760 t3 = dest;
3761
3762 x = gen_rtx_AND (mode, op_true, cmp);
3763 emit_insn (gen_rtx_SET (t2, x));
3764
3765 x = gen_rtx_NOT (mode, cmp);
3766 x = gen_rtx_AND (mode, x, op_false);
3767 emit_insn (gen_rtx_SET (t3, x));
3768
3769 x = gen_rtx_IOR (mode, t3, t2);
3770 emit_insn (gen_rtx_SET (dest, x));
3771 }
3772 }
3773
3774 /* Swap, force into registers, or otherwise massage the two operands
3775 to an sse comparison with a mask result. Thus we differ a bit from
3776 ix86_prepare_fp_compare_args which expects to produce a flags result.
3777
3778 The DEST operand exists to help determine whether to commute commutative
3779 operators. The POP0/POP1 operands are updated in place. The new
3780 comparison code is returned, or UNKNOWN if not implementable. */
3781
3782 static enum rtx_code
3783 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3784 rtx *pop0, rtx *pop1)
3785 {
3786 switch (code)
3787 {
3788 case LTGT:
3789 case UNEQ:
3790 /* AVX supports all the needed comparisons. */
3791 if (TARGET_AVX)
3792 break;
3793 /* We have no LTGT as an operator. We could implement it with
3794 NE & ORDERED, but this requires an extra temporary. It's
3795 not clear that it's worth it. */
3796 return UNKNOWN;
3797
3798 case LT:
3799 case LE:
3800 case UNGT:
3801 case UNGE:
3802 /* These are supported directly. */
3803 break;
3804
3805 case EQ:
3806 case NE:
3807 case UNORDERED:
3808 case ORDERED:
3809 /* AVX has 3 operand comparisons, no need to swap anything. */
3810 if (TARGET_AVX)
3811 break;
3812 /* For commutative operators, try to canonicalize the destination
3813 operand to be first in the comparison - this helps reload to
3814 avoid extra moves. */
3815 if (!dest || !rtx_equal_p (dest, *pop1))
3816 break;
3817 /* FALLTHRU */
3818
3819 case GE:
3820 case GT:
3821 case UNLE:
3822 case UNLT:
3823 /* These are not supported directly before AVX, and furthermore
3824 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3825 comparison operands to transform into something that is
3826 supported. */
3827 std::swap (*pop0, *pop1);
3828 code = swap_condition (code);
3829 break;
3830
3831 default:
3832 gcc_unreachable ();
3833 }
3834
3835 return code;
3836 }
3837
3838 /* Expand a floating-point conditional move. Return true if successful. */
3839
3840 bool
3841 ix86_expand_fp_movcc (rtx operands[])
3842 {
3843 machine_mode mode = GET_MODE (operands[0]);
3844 enum rtx_code code = GET_CODE (operands[1]);
3845 rtx tmp, compare_op;
3846 rtx op0 = XEXP (operands[1], 0);
3847 rtx op1 = XEXP (operands[1], 1);
3848
3849 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3850 {
3851 machine_mode cmode;
3852
3853 /* Since we've no cmove for sse registers, don't force bad register
3854 allocation just to gain access to it. Deny movcc when the
3855 comparison mode doesn't match the move mode. */
3856 cmode = GET_MODE (op0);
3857 if (cmode == VOIDmode)
3858 cmode = GET_MODE (op1);
3859 if (cmode != mode)
3860 return false;
3861
3862 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3863 if (code == UNKNOWN)
3864 return false;
3865
3866 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3867 operands[2], operands[3]))
3868 return true;
3869
3870 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3871 operands[2], operands[3]);
3872 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3873 return true;
3874 }
3875
3876 if (GET_MODE (op0) == TImode
3877 || (GET_MODE (op0) == DImode
3878 && !TARGET_64BIT))
3879 return false;
3880
3881 /* The floating point conditional move instructions don't directly
3882 support conditions resulting from a signed integer comparison. */
3883
3884 compare_op = ix86_expand_compare (code, op0, op1);
3885 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3886 {
3887 tmp = gen_reg_rtx (QImode);
3888 ix86_expand_setcc (tmp, code, op0, op1);
3889
3890 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3891 }
3892
3893 emit_insn (gen_rtx_SET (operands[0],
3894 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3895 operands[2], operands[3])));
3896
3897 return true;
3898 }
3899
3900 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3901
3902 static int
3903 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3904 {
3905 switch (code)
3906 {
3907 case EQ:
3908 return 0;
3909 case LT:
3910 case LTU:
3911 return 1;
3912 case LE:
3913 case LEU:
3914 return 2;
3915 case NE:
3916 return 4;
3917 case GE:
3918 case GEU:
3919 return 5;
3920 case GT:
3921 case GTU:
3922 return 6;
3923 default:
3924 gcc_unreachable ();
3925 }
3926 }
3927
3928 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
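/* The values returned below are the vcmpps/vcmppd predicate immediates
   (e.g. 0x00 = EQ_OQ, 0x01 = LT_OS, 0x03 = UNORD_Q, 0x0c = NEQ_OQ);
   see the instruction set reference for the full table.  */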
3929
3930 static int
3931 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3932 {
3933 switch (code)
3934 {
3935 case EQ:
3936 return 0x00;
3937 case NE:
3938 return 0x04;
3939 case GT:
3940 return 0x0e;
3941 case LE:
3942 return 0x02;
3943 case GE:
3944 return 0x0d;
3945 case LT:
3946 return 0x01;
3947 case UNLE:
3948 return 0x0a;
3949 case UNLT:
3950 return 0x09;
3951 case UNGE:
3952 return 0x05;
3953 case UNGT:
3954 return 0x06;
3955 case UNEQ:
3956 return 0x18;
3957 case LTGT:
3958 return 0x0c;
3959 case ORDERED:
3960 return 0x07;
3961 case UNORDERED:
3962 return 0x03;
3963 default:
3964 gcc_unreachable ();
3965 }
3966 }
3967
3968 /* Return immediate value to be used in UNSPEC_PCMP
3969 for comparison CODE in MODE. */
3970
3971 static int
3972 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3973 {
3974 if (FLOAT_MODE_P (mode))
3975 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3976 return ix86_int_cmp_code_to_pcmp_immediate (code);
3977 }
3978
3979 /* Expand AVX-512 vector comparison. */
3980
3981 bool
3982 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
3983 {
3984 machine_mode mask_mode = GET_MODE (dest);
3985 machine_mode cmp_mode = GET_MODE (cmp_op0);
3986 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3987 int unspec_code;
3988 rtx unspec;
3989
3990 switch (code)
3991 {
3992 case LEU:
3993 case GTU:
3994 case GEU:
3995 case LTU:
3996 unspec_code = UNSPEC_UNSIGNED_PCMP;
3997 break;
3998
3999 default:
4000 unspec_code = UNSPEC_PCMP;
4001 }
4002
4003 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4004 unspec_code);
4005 emit_insn (gen_rtx_SET (dest, unspec));
4006
4007 return true;
4008 }
4009
4010 /* Expand fp vector comparison. */
4011
4012 bool
4013 ix86_expand_fp_vec_cmp (rtx operands[])
4014 {
4015 enum rtx_code code = GET_CODE (operands[1]);
4016 rtx cmp;
4017
4018 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4019 &operands[2], &operands[3]);
4020 if (code == UNKNOWN)
4021 {
4022 rtx temp;
4023 switch (GET_CODE (operands[1]))
4024 {
4025 case LTGT:
4026 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4027 operands[3], NULL, NULL);
4028 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4029 operands[3], NULL, NULL);
4030 code = AND;
4031 break;
4032 case UNEQ:
4033 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4034 operands[3], NULL, NULL);
4035 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4036 operands[3], NULL, NULL);
4037 code = IOR;
4038 break;
4039 default:
4040 gcc_unreachable ();
4041 }
4042 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4043 OPTAB_DIRECT);
4044 }
4045 else
4046 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4047 NULL, NULL);
4048
4049 if (operands[0] != cmp)
4050 emit_move_insn (operands[0], cmp);
4051
4052 return true;
4053 }
4054
4055 static rtx
4056 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4057 rtx op_true, rtx op_false, bool *negate)
4058 {
4059 machine_mode data_mode = GET_MODE (dest);
4060 machine_mode mode = GET_MODE (cop0);
4061 rtx x;
4062
4063 *negate = false;
4064
4065 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4066 if (TARGET_XOP
4067 && (mode == V16QImode || mode == V8HImode
4068 || mode == V4SImode || mode == V2DImode))
4069 ;
4070 /* AVX512F supports all of the comparisons
4071 on all 128/256/512-bit vector int types. */
4072 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4073 ;
4074 else
4075 {
4076 /* Canonicalize the comparison to EQ, GT, GTU. */
4077 switch (code)
4078 {
4079 case EQ:
4080 case GT:
4081 case GTU:
4082 break;
4083
4084 case NE:
4085 case LE:
4086 case LEU:
4087 code = reverse_condition (code);
4088 *negate = true;
4089 break;
4090
4091 case GE:
4092 case GEU:
4093 code = reverse_condition (code);
4094 *negate = true;
4095 /* FALLTHRU */
4096
4097 case LT:
4098 case LTU:
4099 std::swap (cop0, cop1);
4100 code = swap_condition (code);
4101 break;
4102
4103 default:
4104 gcc_unreachable ();
4105 }
4106
4107 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4108 if (mode == V2DImode)
4109 {
4110 switch (code)
4111 {
4112 case EQ:
4113 /* SSE4.1 supports EQ. */
4114 if (!TARGET_SSE4_1)
4115 return NULL;
4116 break;
4117
4118 case GT:
4119 case GTU:
4120 /* SSE4.2 supports GT/GTU. */
4121 if (!TARGET_SSE4_2)
4122 return NULL;
4123 break;
4124
4125 default:
4126 gcc_unreachable ();
4127 }
4128 }
4129
4130 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4131 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4132 if (*negate)
4133 std::swap (optrue, opfalse);
4134
4135 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4136 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4137 min (x, y) == x). While we add one instruction (the minimum),
4138 we remove the need for the two instructions of the negation, as the
4139 comparison already produces the result in the desired form.
4140 When using masks, do it for SI/DImode element types, as it is shorter
4141 than the two subtractions. */
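/* I.e. x <= y iff min (x, y) == x (umin for unsigned, smin for signed),
   and x > y iff that equality does not hold; the min + EQ sequence below
   computes exactly this, with *negate recording the extra inversion. */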
4142 if ((code != EQ
4143 && GET_MODE_SIZE (mode) != 64
4144 && vector_all_ones_operand (opfalse, data_mode)
4145 && optrue == CONST0_RTX (data_mode))
4146 || (code == GTU
4147 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4148 /* Don't do it, though, if we are not using integer masks and would
4149 already end up with the right values in the registers. */
4150 && (GET_MODE_SIZE (mode) == 64
4151 || !vector_all_ones_operand (optrue, data_mode)
4152 || opfalse != CONST0_RTX (data_mode))))
4153 {
4154 rtx (*gen) (rtx, rtx, rtx) = NULL;
4155
4156 switch (mode)
4157 {
4158 case E_V16SImode:
4159 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4160 break;
4161 case E_V8DImode:
4162 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4163 cop0 = force_reg (mode, cop0);
4164 cop1 = force_reg (mode, cop1);
4165 break;
4166 case E_V32QImode:
4167 if (TARGET_AVX2)
4168 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4169 break;
4170 case E_V16HImode:
4171 if (TARGET_AVX2)
4172 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4173 break;
4174 case E_V8SImode:
4175 if (TARGET_AVX2)
4176 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4177 break;
4178 case E_V4DImode:
4179 if (TARGET_AVX512VL)
4180 {
4181 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4182 cop0 = force_reg (mode, cop0);
4183 cop1 = force_reg (mode, cop1);
4184 }
4185 break;
4186 case E_V16QImode:
4187 if (code == GTU && TARGET_SSE2)
4188 gen = gen_uminv16qi3;
4189 else if (code == GT && TARGET_SSE4_1)
4190 gen = gen_sminv16qi3;
4191 break;
4192 case E_V8HImode:
4193 if (code == GTU && TARGET_SSE4_1)
4194 gen = gen_uminv8hi3;
4195 else if (code == GT && TARGET_SSE2)
4196 gen = gen_sminv8hi3;
4197 break;
4198 case E_V4SImode:
4199 if (TARGET_SSE4_1)
4200 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4201 break;
4202 case E_V2DImode:
4203 if (TARGET_AVX512VL)
4204 {
4205 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4206 cop0 = force_reg (mode, cop0);
4207 cop1 = force_reg (mode, cop1);
4208 }
4209 break;
4210 default:
4211 break;
4212 }
4213
4214 if (gen)
4215 {
4216 rtx tem = gen_reg_rtx (mode);
4217 if (!vector_operand (cop0, mode))
4218 cop0 = force_reg (mode, cop0);
4219 if (!vector_operand (cop1, mode))
4220 cop1 = force_reg (mode, cop1);
4221 *negate = !*negate;
4222 emit_insn (gen (tem, cop0, cop1));
4223 cop1 = tem;
4224 code = EQ;
4225 }
4226 }
4227
4228 /* Unsigned parallel compare is not supported by the hardware.
4229 Play some tricks to turn this into a signed comparison
4230 against 0. */
4231 if (code == GTU)
4232 {
4233 cop0 = force_reg (mode, cop0);
4234
4235 switch (mode)
4236 {
4237 case E_V16SImode:
4238 case E_V8DImode:
4239 case E_V8SImode:
4240 case E_V4DImode:
4241 case E_V4SImode:
4242 case E_V2DImode:
4243 {
4244 rtx t1, t2, mask;
4245
4246 /* Subtract (-(INT MAX) - 1) from both operands to make
4247 them signed. */
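/* E.g. for 32-bit elements x >u y becomes (x - 0x80000000) >s (y - 0x80000000);
   subtracting the sign bit is the same as flipping it, which maps the
   unsigned order onto the signed order. */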
4248 mask = ix86_build_signbit_mask (mode, true, false);
4249 t1 = gen_reg_rtx (mode);
4250 emit_insn (gen_sub3_insn (t1, cop0, mask));
4251
4252 t2 = gen_reg_rtx (mode);
4253 emit_insn (gen_sub3_insn (t2, cop1, mask));
4254
4255 cop0 = t1;
4256 cop1 = t2;
4257 code = GT;
4258 }
4259 break;
4260
4261 case E_V64QImode:
4262 case E_V32HImode:
4263 case E_V32QImode:
4264 case E_V16HImode:
4265 case E_V16QImode:
4266 case E_V8HImode:
4267 /* Perform a parallel unsigned saturating subtraction. */
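/* x >u y iff the saturating difference x -us y is nonzero, so the code
   below compares it against zero with EQ and negates the result. */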
4268 x = gen_reg_rtx (mode);
4269 emit_insn (gen_rtx_SET
4270 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4271 cop0 = x;
4272 cop1 = CONST0_RTX (mode);
4273 code = EQ;
4274 *negate = !*negate;
4275 break;
4276
4277 default:
4278 gcc_unreachable ();
4279 }
4280 }
4281 }
4282
4283 if (*negate)
4284 std::swap (op_true, op_false);
4285
4286 /* Allow the comparison to be done in one mode, but the movcc to
4287 happen in another mode. */
4288 if (data_mode == mode)
4289 {
4290 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4291 op_true, op_false);
4292 }
4293 else
4294 {
4295 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4296 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4297 op_true, op_false);
4298 if (GET_MODE (x) == mode)
4299 x = gen_lowpart (data_mode, x);
4300 }
4301
4302 return x;
4303 }
4304
4305 /* Expand integer vector comparison. */
4306
4307 bool
4308 ix86_expand_int_vec_cmp (rtx operands[])
4309 {
4310 rtx_code code = GET_CODE (operands[1]);
4311 bool negate = false;
4312 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4313 operands[3], NULL, NULL, &negate);
4314
4315 if (!cmp)
4316 return false;
4317
4318 if (negate)
4319 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4320 CONST0_RTX (GET_MODE (cmp)),
4321 NULL, NULL, &negate);
4322
4323 gcc_assert (!negate);
4324
4325 if (operands[0] != cmp)
4326 emit_move_insn (operands[0], cmp);
4327
4328 return true;
4329 }
4330
4331 /* Expand a floating-point vector conditional move; a vcond operation
4332 rather than a movcc operation. */
4333
4334 bool
4335 ix86_expand_fp_vcond (rtx operands[])
4336 {
4337 enum rtx_code code = GET_CODE (operands[3]);
4338 rtx cmp;
4339
4340 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4341 &operands[4], &operands[5]);
4342 if (code == UNKNOWN)
4343 {
4344 rtx temp;
4345 switch (GET_CODE (operands[3]))
4346 {
4347 case LTGT:
4348 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4349 operands[5], operands[0], operands[0]);
4350 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4351 operands[5], operands[1], operands[2]);
4352 code = AND;
4353 break;
4354 case UNEQ:
4355 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4356 operands[5], operands[0], operands[0]);
4357 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4358 operands[5], operands[1], operands[2]);
4359 code = IOR;
4360 break;
4361 default:
4362 gcc_unreachable ();
4363 }
4364 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4365 OPTAB_DIRECT);
4366 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4367 return true;
4368 }
4369
4370 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4371 operands[5], operands[1], operands[2]))
4372 return true;
4373
4374 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4375 operands[1], operands[2]);
4376 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4377 return true;
4378 }
4379
4380 /* Expand a signed/unsigned integral vector conditional move. */
4381
4382 bool
4383 ix86_expand_int_vcond (rtx operands[])
4384 {
4385 machine_mode data_mode = GET_MODE (operands[0]);
4386 machine_mode mode = GET_MODE (operands[4]);
4387 enum rtx_code code = GET_CODE (operands[3]);
4388 bool negate = false;
4389 rtx x, cop0, cop1;
4390
4391 cop0 = operands[4];
4392 cop1 = operands[5];
4393
4394 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4395 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
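/* E.g. for 32-bit elements, x = -5 (0xfffffffb) gives 0xffffffff (-1)
   after an arithmetic shift by 31 and 1 after a logical shift, while any
   non-negative x gives 0 either way. */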
4396 if ((code == LT || code == GE)
4397 && data_mode == mode
4398 && cop1 == CONST0_RTX (mode)
4399 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4400 && GET_MODE_UNIT_SIZE (data_mode) > 1
4401 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4402 && (GET_MODE_SIZE (data_mode) == 16
4403 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4404 {
4405 rtx negop = operands[2 - (code == LT)];
4406 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4407 if (negop == CONST1_RTX (data_mode))
4408 {
4409 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4410 operands[0], 1, OPTAB_DIRECT);
4411 if (res != operands[0])
4412 emit_move_insn (operands[0], res);
4413 return true;
4414 }
4415 else if (GET_MODE_INNER (data_mode) != DImode
4416 && vector_all_ones_operand (negop, data_mode))
4417 {
4418 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4419 operands[0], 0, OPTAB_DIRECT);
4420 if (res != operands[0])
4421 emit_move_insn (operands[0], res);
4422 return true;
4423 }
4424 }
4425
4426 if (!nonimmediate_operand (cop1, mode))
4427 cop1 = force_reg (mode, cop1);
4428 if (!general_operand (operands[1], data_mode))
4429 operands[1] = force_reg (data_mode, operands[1]);
4430 if (!general_operand (operands[2], data_mode))
4431 operands[2] = force_reg (data_mode, operands[2]);
4432
4433 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4434 operands[1], operands[2], &negate);
4435
4436 if (!x)
4437 return false;
4438
4439 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4440 operands[2-negate]);
4441 return true;
4442 }
4443
4444 static bool
4445 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4446 struct expand_vec_perm_d *d)
4447 {
4448 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4449 expander, so args are either in d, or in op0, op1 etc. */
4450 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4451 machine_mode maskmode = mode;
4452 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4453
4454 switch (mode)
4455 {
4456 case E_V8HImode:
4457 if (TARGET_AVX512VL && TARGET_AVX512BW)
4458 gen = gen_avx512vl_vpermt2varv8hi3;
4459 break;
4460 case E_V16HImode:
4461 if (TARGET_AVX512VL && TARGET_AVX512BW)
4462 gen = gen_avx512vl_vpermt2varv16hi3;
4463 break;
4464 case E_V64QImode:
4465 if (TARGET_AVX512VBMI)
4466 gen = gen_avx512bw_vpermt2varv64qi3;
4467 break;
4468 case E_V32HImode:
4469 if (TARGET_AVX512BW)
4470 gen = gen_avx512bw_vpermt2varv32hi3;
4471 break;
4472 case E_V4SImode:
4473 if (TARGET_AVX512VL)
4474 gen = gen_avx512vl_vpermt2varv4si3;
4475 break;
4476 case E_V8SImode:
4477 if (TARGET_AVX512VL)
4478 gen = gen_avx512vl_vpermt2varv8si3;
4479 break;
4480 case E_V16SImode:
4481 if (TARGET_AVX512F)
4482 gen = gen_avx512f_vpermt2varv16si3;
4483 break;
4484 case E_V4SFmode:
4485 if (TARGET_AVX512VL)
4486 {
4487 gen = gen_avx512vl_vpermt2varv4sf3;
4488 maskmode = V4SImode;
4489 }
4490 break;
4491 case E_V8SFmode:
4492 if (TARGET_AVX512VL)
4493 {
4494 gen = gen_avx512vl_vpermt2varv8sf3;
4495 maskmode = V8SImode;
4496 }
4497 break;
4498 case E_V16SFmode:
4499 if (TARGET_AVX512F)
4500 {
4501 gen = gen_avx512f_vpermt2varv16sf3;
4502 maskmode = V16SImode;
4503 }
4504 break;
4505 case E_V2DImode:
4506 if (TARGET_AVX512VL)
4507 gen = gen_avx512vl_vpermt2varv2di3;
4508 break;
4509 case E_V4DImode:
4510 if (TARGET_AVX512VL)
4511 gen = gen_avx512vl_vpermt2varv4di3;
4512 break;
4513 case E_V8DImode:
4514 if (TARGET_AVX512F)
4515 gen = gen_avx512f_vpermt2varv8di3;
4516 break;
4517 case E_V2DFmode:
4518 if (TARGET_AVX512VL)
4519 {
4520 gen = gen_avx512vl_vpermt2varv2df3;
4521 maskmode = V2DImode;
4522 }
4523 break;
4524 case E_V4DFmode:
4525 if (TARGET_AVX512VL)
4526 {
4527 gen = gen_avx512vl_vpermt2varv4df3;
4528 maskmode = V4DImode;
4529 }
4530 break;
4531 case E_V8DFmode:
4532 if (TARGET_AVX512F)
4533 {
4534 gen = gen_avx512f_vpermt2varv8df3;
4535 maskmode = V8DImode;
4536 }
4537 break;
4538 default:
4539 break;
4540 }
4541
4542 if (gen == NULL)
4543 return false;
4544
4545 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4546 expander, so args are either in d, or in op0, op1 etc. */
4547 if (d)
4548 {
4549 rtx vec[64];
4550 target = d->target;
4551 op0 = d->op0;
4552 op1 = d->op1;
4553 for (int i = 0; i < d->nelt; ++i)
4554 vec[i] = GEN_INT (d->perm[i]);
4555 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4556 }
4557
4558 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4559 return true;
4560 }
4561
4562 /* Expand a variable vector permutation. */
4563
4564 void
4565 ix86_expand_vec_perm (rtx operands[])
4566 {
4567 rtx target = operands[0];
4568 rtx op0 = operands[1];
4569 rtx op1 = operands[2];
4570 rtx mask = operands[3];
4571 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4572 machine_mode mode = GET_MODE (op0);
4573 machine_mode maskmode = GET_MODE (mask);
4574 int w, e, i;
4575 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4576
4577 /* Number of elements in the vector. */
4578 w = GET_MODE_NUNITS (mode);
4579 e = GET_MODE_UNIT_SIZE (mode);
4580 gcc_assert (w <= 64);
4581
4582 if (TARGET_AVX512F && one_operand_shuffle)
4583 {
4584 rtx (*gen) (rtx, rtx, rtx) = NULL;
4585 switch (mode)
4586 {
4587 case E_V16SImode:
4588 gen = gen_avx512f_permvarv16si;
4589 break;
4590 case E_V16SFmode:
4591 gen = gen_avx512f_permvarv16sf;
4592 break;
4593 case E_V8DImode:
4594 gen = gen_avx512f_permvarv8di;
4595 break;
4596 case E_V8DFmode:
4597 gen = gen_avx512f_permvarv8df;
4598 break;
4599 default:
4600 break;
4601 }
4602 if (gen != NULL)
4603 {
4604 emit_insn (gen (target, op0, mask));
4605 return;
4606 }
4607 }
4608
4609 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4610 return;
4611
4612 if (TARGET_AVX2)
4613 {
4614 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4615 {
4616 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4617 a constant shuffle operand. With a tiny bit of effort we can
4618 use VPERMD instead. A re-interpretation stall for V4DFmode is
4619 unfortunate but there's no avoiding it.
4620 Similarly for V16HImode we don't have instructions for variable
4621 shuffling, while for V32QImode we can, after preparing suitable
4622 masks, use vpshufb; vpshufb; vpermq; vpor. */
4623
4624 if (mode == V16HImode)
4625 {
4626 maskmode = mode = V32QImode;
4627 w = 32;
4628 e = 1;
4629 }
4630 else
4631 {
4632 maskmode = mode = V8SImode;
4633 w = 8;
4634 e = 4;
4635 }
4636 t1 = gen_reg_rtx (maskmode);
4637
4638 /* Replicate the low bits of the V4DImode mask into V8SImode:
4639 mask = { A B C D }
4640 t1 = { A A B B C C D D }. */
4641 for (i = 0; i < w / 2; ++i)
4642 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4643 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4644 vt = force_reg (maskmode, vt);
4645 mask = gen_lowpart (maskmode, mask);
4646 if (maskmode == V8SImode)
4647 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4648 else
4649 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4650
4651 /* Multiply the shuffle indices by two. */
4652 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4653 OPTAB_DIRECT);
4654
4655 /* Add one to the odd shuffle indices:
4656 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4657 for (i = 0; i < w / 2; ++i)
4658 {
4659 vec[i * 2] = const0_rtx;
4660 vec[i * 2 + 1] = const1_rtx;
4661 }
4662 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4663 vt = validize_mem (force_const_mem (maskmode, vt));
4664 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4665 OPTAB_DIRECT);
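/* E.g. a V4DImode mask of { 1 3 0 2 } has now become the V8SImode
   dword selector { 2 3 6 7 0 1 4 5 }. */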
4666
4667 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4668 operands[3] = mask = t1;
4669 target = gen_reg_rtx (mode);
4670 op0 = gen_lowpart (mode, op0);
4671 op1 = gen_lowpart (mode, op1);
4672 }
4673
4674 switch (mode)
4675 {
4676 case E_V8SImode:
4677 /* The VPERMD and VPERMPS instructions already properly ignore
4678 the high bits of the shuffle elements. No need for us to
4679 perform an AND ourselves. */
4680 if (one_operand_shuffle)
4681 {
4682 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4683 if (target != operands[0])
4684 emit_move_insn (operands[0],
4685 gen_lowpart (GET_MODE (operands[0]), target));
4686 }
4687 else
4688 {
4689 t1 = gen_reg_rtx (V8SImode);
4690 t2 = gen_reg_rtx (V8SImode);
4691 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4692 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4693 goto merge_two;
4694 }
4695 return;
4696
4697 case E_V8SFmode:
4698 mask = gen_lowpart (V8SImode, mask);
4699 if (one_operand_shuffle)
4700 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4701 else
4702 {
4703 t1 = gen_reg_rtx (V8SFmode);
4704 t2 = gen_reg_rtx (V8SFmode);
4705 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4706 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4707 goto merge_two;
4708 }
4709 return;
4710
4711 case E_V4SImode:
4712 /* By combining the two 128-bit input vectors into one 256-bit
4713 input vector, we can use VPERMD and VPERMPS for the full
4714 two-operand shuffle. */
4715 t1 = gen_reg_rtx (V8SImode);
4716 t2 = gen_reg_rtx (V8SImode);
4717 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4718 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4719 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4720 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4721 return;
4722
4723 case E_V4SFmode:
4724 t1 = gen_reg_rtx (V8SFmode);
4725 t2 = gen_reg_rtx (V8SImode);
4726 mask = gen_lowpart (V4SImode, mask);
4727 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4728 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4729 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4730 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4731 return;
4732
4733 case E_V32QImode:
4734 t1 = gen_reg_rtx (V32QImode);
4735 t2 = gen_reg_rtx (V32QImode);
4736 t3 = gen_reg_rtx (V32QImode);
4737 vt2 = GEN_INT (-128);
4738 vt = gen_const_vec_duplicate (V32QImode, vt2);
4739 vt = force_reg (V32QImode, vt);
4740 for (i = 0; i < 32; i++)
4741 vec[i] = i < 16 ? vt2 : const0_rtx;
4742 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4743 vt2 = force_reg (V32QImode, vt2);
4744 /* From mask create two adjusted masks, which contain the same
4745 bits as mask in the low 7 bits of each vector element.
4746 The first mask will have the most significant bit clear
4747 if it requests element from the same 128-bit lane
4748 and MSB set if it requests element from the other 128-bit lane.
4749 The second mask will have the opposite values of the MSB,
4750 and additionally will have its 128-bit lanes swapped.
4751 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4752 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4753 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4754 stands for other 12 bytes. */
4755 /* The bit that says whether an element comes from the same lane or from
4756 the other lane is bit 4, so shift it up by 3 to the MSB position. */
4757 t5 = gen_reg_rtx (V4DImode);
4758 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4759 GEN_INT (3)));
4760 /* Clear MSB bits from the mask just in case it had them set. */
4761 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4762 /* After this t1 will have MSB set for elements from other lane. */
4763 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4764 /* Clear bits other than MSB. */
4765 emit_insn (gen_andv32qi3 (t1, t1, vt));
4766 /* Or in the lower bits from mask into t3. */
4767 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4768 /* And invert MSB bits in t1, so MSB is set for elements from the same
4769 lane. */
4770 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4771 /* Swap 128-bit lanes in t3. */
4772 t6 = gen_reg_rtx (V4DImode);
4773 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4774 const2_rtx, GEN_INT (3),
4775 const0_rtx, const1_rtx));
4776 /* And or in the lower bits from mask into t1. */
4777 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4778 if (one_operand_shuffle)
4779 {
4780 /* Each of these shuffles will put 0s in places where
4781 an element from the other 128-bit lane is needed; otherwise it
4782 will shuffle in the requested value. */
4783 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4784 gen_lowpart (V32QImode, t6)));
4785 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4786 /* For t3 the 128-bit lanes are swapped again. */
4787 t7 = gen_reg_rtx (V4DImode);
4788 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4789 const2_rtx, GEN_INT (3),
4790 const0_rtx, const1_rtx));
4791 /* And oring both together leads to the result. */
4792 emit_insn (gen_iorv32qi3 (target, t1,
4793 gen_lowpart (V32QImode, t7)));
4794 if (target != operands[0])
4795 emit_move_insn (operands[0],
4796 gen_lowpart (GET_MODE (operands[0]), target));
4797 return;
4798 }
4799
4800 t4 = gen_reg_rtx (V32QImode);
4801 /* Similarly to the above one_operand_shuffle code,
4802 just repeated twice, once for each operand. The code at the
4803 merge_two: label will merge the two results together. */
4804 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4805 gen_lowpart (V32QImode, t6)));
4806 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4807 gen_lowpart (V32QImode, t6)));
4808 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4809 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4810 t7 = gen_reg_rtx (V4DImode);
4811 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4812 const2_rtx, GEN_INT (3),
4813 const0_rtx, const1_rtx));
4814 t8 = gen_reg_rtx (V4DImode);
4815 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4816 const2_rtx, GEN_INT (3),
4817 const0_rtx, const1_rtx));
4818 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4819 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4820 t1 = t4;
4821 t2 = t3;
4822 goto merge_two;
4823
4824 default:
4825 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4826 break;
4827 }
4828 }
4829
4830 if (TARGET_XOP)
4831 {
4832 /* The XOP VPPERM insn supports three inputs. By ignoring the
4833 one_operand_shuffle special case, we avoid creating another
4834 set of constant vectors in memory. */
4835 one_operand_shuffle = false;
4836
4837 /* mask = mask & {2*w-1, ...} */
4838 vt = GEN_INT (2*w - 1);
4839 }
4840 else
4841 {
4842 /* mask = mask & {w-1, ...} */
4843 vt = GEN_INT (w - 1);
4844 }
4845
4846 vt = gen_const_vec_duplicate (maskmode, vt);
4847 mask = expand_simple_binop (maskmode, AND, mask, vt,
4848 NULL_RTX, 0, OPTAB_DIRECT);
4849
4850 /* For non-QImode operations, convert the word permutation control
4851 into a byte permutation control. */
4852 if (mode != V16QImode)
4853 {
4854 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4855 GEN_INT (exact_log2 (e)),
4856 NULL_RTX, 0, OPTAB_DIRECT);
4857
4858 /* Convert mask to vector of chars. */
4859 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4860
4861 /* Replicate each of the input bytes into byte positions:
4862 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4863 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4864 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4865 for (i = 0; i < 16; ++i)
4866 vec[i] = GEN_INT (i/e * e);
4867 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4868 vt = validize_mem (force_const_mem (V16QImode, vt));
4869 if (TARGET_XOP)
4870 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4871 else
4872 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4873
4874 /* Convert it into the byte positions by doing
4875 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4876 for (i = 0; i < 16; ++i)
4877 vec[i] = GEN_INT (i % e);
4878 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4879 vt = validize_mem (force_const_mem (V16QImode, vt));
4880 emit_insn (gen_addv16qi3 (mask, mask, vt));
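/* E.g. for V4SImode (e == 4) a word index of 3 becomes 12 after the
   shift, is replicated across its four-byte lane by the pshufb/pperm and
   then becomes the byte selector { 12 13 14 15 } after the addition. */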
4881 }
4882
4883 /* The actual shuffle operations all operate on V16QImode. */
4884 op0 = gen_lowpart (V16QImode, op0);
4885 op1 = gen_lowpart (V16QImode, op1);
4886
4887 if (TARGET_XOP)
4888 {
4889 if (GET_MODE (target) != V16QImode)
4890 target = gen_reg_rtx (V16QImode);
4891 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4892 if (target != operands[0])
4893 emit_move_insn (operands[0],
4894 gen_lowpart (GET_MODE (operands[0]), target));
4895 }
4896 else if (one_operand_shuffle)
4897 {
4898 if (GET_MODE (target) != V16QImode)
4899 target = gen_reg_rtx (V16QImode);
4900 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4901 if (target != operands[0])
4902 emit_move_insn (operands[0],
4903 gen_lowpart (GET_MODE (operands[0]), target));
4904 }
4905 else
4906 {
4907 rtx xops[6];
4908 bool ok;
4909
4910 /* Shuffle the two input vectors independently. */
4911 t1 = gen_reg_rtx (V16QImode);
4912 t2 = gen_reg_rtx (V16QImode);
4913 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4914 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4915
4916 merge_two:
4917 /* Then merge them together. The key is whether any given control
4918 element contained a bit set that indicates the second word. */
4919 mask = operands[3];
4920 vt = GEN_INT (w);
4921 if (maskmode == V2DImode && !TARGET_SSE4_1)
4922 {
4923 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4924 more shuffle to convert the V2DI input mask into a V4SI
4925 input mask. At which point the masking that expand_int_vcond
4926 will work as desired. */
4927 rtx t3 = gen_reg_rtx (V4SImode);
4928 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4929 const0_rtx, const0_rtx,
4930 const2_rtx, const2_rtx));
4931 mask = t3;
4932 maskmode = V4SImode;
4933 e = w = 4;
4934 }
4935
4936 vt = gen_const_vec_duplicate (maskmode, vt);
4937 vt = force_reg (maskmode, vt);
4938 mask = expand_simple_binop (maskmode, AND, mask, vt,
4939 NULL_RTX, 0, OPTAB_DIRECT);
4940
4941 if (GET_MODE (target) != mode)
4942 target = gen_reg_rtx (mode);
4943 xops[0] = target;
4944 xops[1] = gen_lowpart (mode, t2);
4945 xops[2] = gen_lowpart (mode, t1);
4946 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4947 xops[4] = mask;
4948 xops[5] = vt;
4949 ok = ix86_expand_int_vcond (xops);
4950 gcc_assert (ok);
4951 if (target != operands[0])
4952 emit_move_insn (operands[0],
4953 gen_lowpart (GET_MODE (operands[0]), target));
4954 }
4955 }
4956
4957 /* Unpack SRC into DEST, the next wider integer vector type. UNSIGNED_P is
4958 true if we should do zero extension, else sign extension. HIGH_P is
4959 true if we want the N/2 high elements, else the low elements. */
4960
4961 void
4962 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4963 {
4964 machine_mode imode = GET_MODE (src);
4965 rtx tmp;
4966
4967 if (TARGET_SSE4_1)
4968 {
4969 rtx (*unpack)(rtx, rtx);
4970 rtx (*extract)(rtx, rtx) = NULL;
4971 machine_mode halfmode = BLKmode;
4972
4973 switch (imode)
4974 {
4975 case E_V64QImode:
4976 if (unsigned_p)
4977 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4978 else
4979 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4980 halfmode = V32QImode;
4981 extract
4982 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4983 break;
4984 case E_V32QImode:
4985 if (unsigned_p)
4986 unpack = gen_avx2_zero_extendv16qiv16hi2;
4987 else
4988 unpack = gen_avx2_sign_extendv16qiv16hi2;
4989 halfmode = V16QImode;
4990 extract
4991 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4992 break;
4993 case E_V32HImode:
4994 if (unsigned_p)
4995 unpack = gen_avx512f_zero_extendv16hiv16si2;
4996 else
4997 unpack = gen_avx512f_sign_extendv16hiv16si2;
4998 halfmode = V16HImode;
4999 extract
5000 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5001 break;
5002 case E_V16HImode:
5003 if (unsigned_p)
5004 unpack = gen_avx2_zero_extendv8hiv8si2;
5005 else
5006 unpack = gen_avx2_sign_extendv8hiv8si2;
5007 halfmode = V8HImode;
5008 extract
5009 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5010 break;
5011 case E_V16SImode:
5012 if (unsigned_p)
5013 unpack = gen_avx512f_zero_extendv8siv8di2;
5014 else
5015 unpack = gen_avx512f_sign_extendv8siv8di2;
5016 halfmode = V8SImode;
5017 extract
5018 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5019 break;
5020 case E_V8SImode:
5021 if (unsigned_p)
5022 unpack = gen_avx2_zero_extendv4siv4di2;
5023 else
5024 unpack = gen_avx2_sign_extendv4siv4di2;
5025 halfmode = V4SImode;
5026 extract
5027 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5028 break;
5029 case E_V16QImode:
5030 if (unsigned_p)
5031 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5032 else
5033 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5034 break;
5035 case E_V8HImode:
5036 if (unsigned_p)
5037 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5038 else
5039 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5040 break;
5041 case E_V4SImode:
5042 if (unsigned_p)
5043 unpack = gen_sse4_1_zero_extendv2siv2di2;
5044 else
5045 unpack = gen_sse4_1_sign_extendv2siv2di2;
5046 break;
5047 default:
5048 gcc_unreachable ();
5049 }
5050
5051 if (GET_MODE_SIZE (imode) >= 32)
5052 {
5053 tmp = gen_reg_rtx (halfmode);
5054 emit_insn (extract (tmp, src));
5055 }
5056 else if (high_p)
5057 {
5058 /* Shift higher 8 bytes to lower 8 bytes. */
5059 tmp = gen_reg_rtx (V1TImode);
5060 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5061 GEN_INT (64)));
5062 tmp = gen_lowpart (imode, tmp);
5063 }
5064 else
5065 tmp = src;
5066
5067 emit_insn (unpack (dest, tmp));
5068 }
5069 else
5070 {
5071 rtx (*unpack)(rtx, rtx, rtx);
5072
5073 switch (imode)
5074 {
5075 case E_V16QImode:
5076 if (high_p)
5077 unpack = gen_vec_interleave_highv16qi;
5078 else
5079 unpack = gen_vec_interleave_lowv16qi;
5080 break;
5081 case E_V8HImode:
5082 if (high_p)
5083 unpack = gen_vec_interleave_highv8hi;
5084 else
5085 unpack = gen_vec_interleave_lowv8hi;
5086 break;
5087 case E_V4SImode:
5088 if (high_p)
5089 unpack = gen_vec_interleave_highv4si;
5090 else
5091 unpack = gen_vec_interleave_lowv4si;
5092 break;
5093 default:
5094 gcc_unreachable ();
5095 }
5096
5097 if (unsigned_p)
5098 tmp = force_reg (imode, CONST0_RTX (imode));
5099 else
5100 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5101 src, pc_rtx, pc_rtx);
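/* Interleaving each element with zeros yields the zero extension, while
   interleaving with the 0 > src mask (all ones exactly for negative
   elements) yields the sign extension. */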
5102
5103 rtx tmp2 = gen_reg_rtx (imode);
5104 emit_insn (unpack (tmp2, src, tmp));
5105 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5106 }
5107 }
5108
5109 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5110 but works for floating point parameters and non-offsettable memories.
5111 For pushes, it returns just stack offsets; the values will be saved
5112 in the right order. At most four parts are generated. */
5113
5114 static int
5115 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5116 {
5117 int size;
5118
5119 if (!TARGET_64BIT)
5120 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5121 else
5122 size = (GET_MODE_SIZE (mode) + 4) / 8;
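/* E.g. DFmode splits into 2 SImode parts on a 32-bit target, XFmode
   into 3 and TFmode into 4; in 64-bit mode XFmode and TFmode both
   split into 2 DImode-sized parts. */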
5123
5124 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5125 gcc_assert (size >= 2 && size <= 4);
5126
5127 /* Optimize constant pool reference to immediates. This is used by fp
5128 moves, that force all constants to memory to allow combining. */
5129 if (MEM_P (operand) && MEM_READONLY_P (operand))
5130 operand = avoid_constant_pool_reference (operand);
5131
5132 if (MEM_P (operand) && !offsettable_memref_p (operand))
5133 {
5134 /* The only non-offsettable memories we handle are pushes. */
5135 int ok = push_operand (operand, VOIDmode);
5136
5137 gcc_assert (ok);
5138
5139 operand = copy_rtx (operand);
5140 PUT_MODE (operand, word_mode);
5141 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5142 return size;
5143 }
5144
5145 if (GET_CODE (operand) == CONST_VECTOR)
5146 {
5147 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5148 /* Caution: if we looked through a constant pool memory above,
5149 the operand may actually have a different mode now. That's
5150 ok, since we want to pun this all the way back to an integer. */
5151 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5152 gcc_assert (operand != NULL);
5153 mode = imode;
5154 }
5155
5156 if (!TARGET_64BIT)
5157 {
5158 if (mode == DImode)
5159 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5160 else
5161 {
5162 int i;
5163
5164 if (REG_P (operand))
5165 {
5166 gcc_assert (reload_completed);
5167 for (i = 0; i < size; i++)
5168 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5169 }
5170 else if (offsettable_memref_p (operand))
5171 {
5172 operand = adjust_address (operand, SImode, 0);
5173 parts[0] = operand;
5174 for (i = 1; i < size; i++)
5175 parts[i] = adjust_address (operand, SImode, 4 * i);
5176 }
5177 else if (CONST_DOUBLE_P (operand))
5178 {
5179 const REAL_VALUE_TYPE *r;
5180 long l[4];
5181
5182 r = CONST_DOUBLE_REAL_VALUE (operand);
5183 switch (mode)
5184 {
5185 case E_TFmode:
5186 real_to_target (l, r, mode);
5187 parts[3] = gen_int_mode (l[3], SImode);
5188 parts[2] = gen_int_mode (l[2], SImode);
5189 break;
5190 case E_XFmode:
5191 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5192 long double may not be 80-bit. */
5193 real_to_target (l, r, mode);
5194 parts[2] = gen_int_mode (l[2], SImode);
5195 break;
5196 case E_DFmode:
5197 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5198 break;
5199 default:
5200 gcc_unreachable ();
5201 }
5202 parts[1] = gen_int_mode (l[1], SImode);
5203 parts[0] = gen_int_mode (l[0], SImode);
5204 }
5205 else
5206 gcc_unreachable ();
5207 }
5208 }
5209 else
5210 {
5211 if (mode == TImode)
5212 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5213 if (mode == XFmode || mode == TFmode)
5214 {
5215 machine_mode upper_mode = mode == XFmode ? SImode : DImode;
5216 if (REG_P (operand))
5217 {
5218 gcc_assert (reload_completed);
5219 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5220 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5221 }
5222 else if (offsettable_memref_p (operand))
5223 {
5224 operand = adjust_address (operand, DImode, 0);
5225 parts[0] = operand;
5226 parts[1] = adjust_address (operand, upper_mode, 8);
5227 }
5228 else if (CONST_DOUBLE_P (operand))
5229 {
5230 long l[4];
5231
5232 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5233
5234 /* real_to_target puts 32-bit pieces in each long. */
5235 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5236 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5237 << 32), DImode);
5238
5239 if (upper_mode == SImode)
5240 parts[1] = gen_int_mode (l[2], SImode);
5241 else
5242 parts[1]
5243 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5244 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5245 << 32), DImode);
5246 }
5247 else
5248 gcc_unreachable ();
5249 }
5250 }
5251
5252 return size;
5253 }
5254
5255 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5256 Return false when normal moves are needed; true when all required
5257 insns have been emitted. Operands 2-4 contain the input values
5258 in the correct order; operands 5-7 contain the output values. */
5259
5260 void
5261 ix86_split_long_move (rtx operands[])
5262 {
5263 rtx part[2][4];
5264 int nparts, i, j;
5265 int push = 0;
5266 int collisions = 0;
5267 machine_mode mode = GET_MODE (operands[0]);
5268 bool collisionparts[4];
5269
5270 /* The DFmode expanders may ask us to move double.
5271 For a 64-bit target this is a single move. By hiding the fact
5272 here we simplify i386.md splitters. */
5273 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5274 {
5275 /* Optimize constant pool reference to immediates. This is used by
5276 fp moves, that force all constants to memory to allow combining. */
5277
5278 if (MEM_P (operands[1])
5279 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5280 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5281 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5282 if (push_operand (operands[0], VOIDmode))
5283 {
5284 operands[0] = copy_rtx (operands[0]);
5285 PUT_MODE (operands[0], word_mode);
5286 }
5287 else
5288 operands[0] = gen_lowpart (DImode, operands[0]);
5289 operands[1] = gen_lowpart (DImode, operands[1]);
5290 emit_move_insn (operands[0], operands[1]);
5291 return;
5292 }
5293
5294 /* The only non-offsettable memory we handle is push. */
5295 if (push_operand (operands[0], VOIDmode))
5296 push = 1;
5297 else
5298 gcc_assert (!MEM_P (operands[0])
5299 || offsettable_memref_p (operands[0]));
5300
5301 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5302 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5303
5304 /* When emitting push, take care for source operands on the stack. */
5305 if (push && MEM_P (operands[1])
5306 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5307 {
5308 rtx src_base = XEXP (part[1][nparts - 1], 0);
5309
5310 /* Compensate for the stack decrement by 4. */
5311 if (!TARGET_64BIT && nparts == 3
5312 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5313 src_base = plus_constant (Pmode, src_base, 4);
5314
5315 /* src_base refers to the stack pointer and is
5316 automatically decreased by emitted push. */
5317 for (i = 0; i < nparts; i++)
5318 part[1][i] = change_address (part[1][i],
5319 GET_MODE (part[1][i]), src_base);
5320 }
5321
5322 /* We need to do the copy in the right order in case an address register
5323 of the source overlaps the destination. */
5324 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5325 {
5326 rtx tmp;
5327
5328 for (i = 0; i < nparts; i++)
5329 {
5330 collisionparts[i]
5331 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5332 if (collisionparts[i])
5333 collisions++;
5334 }
5335
5336 /* Collision in the middle part can be handled by reordering. */
5337 if (collisions == 1 && nparts == 3 && collisionparts [1])
5338 {
5339 std::swap (part[0][1], part[0][2]);
5340 std::swap (part[1][1], part[1][2]);
5341 }
5342 else if (collisions == 1
5343 && nparts == 4
5344 && (collisionparts [1] || collisionparts [2]))
5345 {
5346 if (collisionparts [1])
5347 {
5348 std::swap (part[0][1], part[0][2]);
5349 std::swap (part[1][1], part[1][2]);
5350 }
5351 else
5352 {
5353 std::swap (part[0][2], part[0][3]);
5354 std::swap (part[1][2], part[1][3]);
5355 }
5356 }
5357
5358 /* If there are more collisions, we can't handle it by reordering.
5359 Do an lea to the last part and use only one colliding move. */
5360 else if (collisions > 1)
5361 {
5362 rtx base, addr;
5363
5364 collisions = 1;
5365
5366 base = part[0][nparts - 1];
5367
5368 /* Handle the case when the last part isn't valid for lea.
5369 Happens in 64-bit mode storing the 12-byte XFmode. */
5370 if (GET_MODE (base) != Pmode)
5371 base = gen_rtx_REG (Pmode, REGNO (base));
5372
5373 addr = XEXP (part[1][0], 0);
5374 if (TARGET_TLS_DIRECT_SEG_REFS)
5375 {
5376 struct ix86_address parts;
5377 int ok = ix86_decompose_address (addr, &parts);
5378 gcc_assert (ok);
5379 /* It is not valid to use %gs: or %fs: in lea. */
5380 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5381 }
5382 emit_insn (gen_rtx_SET (base, addr));
5383 part[1][0] = replace_equiv_address (part[1][0], base);
5384 for (i = 1; i < nparts; i++)
5385 {
5386 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5387 part[1][i] = replace_equiv_address (part[1][i], tmp);
5388 }
5389 }
5390 }
5391
5392 if (push)
5393 {
5394 if (!TARGET_64BIT)
5395 {
5396 if (nparts == 3)
5397 {
5398 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5399 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5400 emit_move_insn (part[0][2], part[1][2]);
5401 }
5402 else if (nparts == 4)
5403 {
5404 emit_move_insn (part[0][3], part[1][3]);
5405 emit_move_insn (part[0][2], part[1][2]);
5406 }
5407 }
5408 else
5409 {
5410 /* In 64bit mode we don't have a 32bit push available. In case this is a
5411 register, it is OK - we will just use the larger counterpart. We also
5412 retype memory - these come from an attempt to avoid a REX prefix on
5413 moving the second half of a TFmode value. */
5414 if (GET_MODE (part[1][1]) == SImode)
5415 {
5416 switch (GET_CODE (part[1][1]))
5417 {
5418 case MEM:
5419 part[1][1] = adjust_address (part[1][1], DImode, 0);
5420 break;
5421
5422 case REG:
5423 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5424 break;
5425
5426 default:
5427 gcc_unreachable ();
5428 }
5429
5430 if (GET_MODE (part[1][0]) == SImode)
5431 part[1][0] = part[1][1];
5432 }
5433 }
5434 emit_move_insn (part[0][1], part[1][1]);
5435 emit_move_insn (part[0][0], part[1][0]);
5436 return;
5437 }
5438
5439 /* Choose correct order to not overwrite the source before it is copied. */
5440 if ((REG_P (part[0][0])
5441 && REG_P (part[1][1])
5442 && (REGNO (part[0][0]) == REGNO (part[1][1])
5443 || (nparts == 3
5444 && REGNO (part[0][0]) == REGNO (part[1][2]))
5445 || (nparts == 4
5446 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5447 || (collisions > 0
5448 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5449 {
5450 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5451 {
5452 operands[2 + i] = part[0][j];
5453 operands[6 + i] = part[1][j];
5454 }
5455 }
5456 else
5457 {
5458 for (i = 0; i < nparts; i++)
5459 {
5460 operands[2 + i] = part[0][i];
5461 operands[6 + i] = part[1][i];
5462 }
5463 }
5464
5465 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5466 if (optimize_insn_for_size_p ())
5467 {
5468 for (j = 0; j < nparts - 1; j++)
5469 if (CONST_INT_P (operands[6 + j])
5470 && operands[6 + j] != const0_rtx
5471 && REG_P (operands[2 + j]))
5472 for (i = j; i < nparts - 1; i++)
5473 if (CONST_INT_P (operands[7 + i])
5474 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5475 operands[7 + i] = operands[2 + j];
5476 }
5477
5478 for (i = 0; i < nparts; i++)
5479 emit_move_insn (operands[2 + i], operands[6 + i]);
5480
5481 return;
5482 }
5483
5484 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5485 left shift by a constant, either using a single shift or
5486 a sequence of add instructions. */
5487
5488 static void
5489 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5490 {
5491 if (count == 1
5492 || (count * ix86_cost->add <= ix86_cost->shift_const
5493 && !optimize_insn_for_size_p ()))
5494 {
5495 while (count-- > 0)
5496 emit_insn (gen_add2_insn (operand, operand));
5497 }
5498 else
5499 {
5500 rtx (*insn)(rtx, rtx, rtx);
5501
5502 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5503 emit_insn (insn (operand, operand, GEN_INT (count)));
5504 }
5505 }
5506
5507 void
5508 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5509 {
5510 rtx (*gen_ashl3)(rtx, rtx, rtx);
5511 rtx (*gen_shld)(rtx, rtx, rtx);
5512 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5513 machine_mode half_mode;
5514
5515 rtx low[2], high[2];
5516 int count;
5517
5518 if (CONST_INT_P (operands[2]))
5519 {
5520 split_double_mode (mode, operands, 2, low, high);
5521 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5522
5523 if (count >= half_width)
5524 {
5525 emit_move_insn (high[0], low[1]);
5526 emit_move_insn (low[0], const0_rtx);
5527
5528 if (count > half_width)
5529 ix86_expand_ashl_const (high[0], count - half_width, mode);
5530 }
5531 else
5532 {
5533 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5534
5535 if (!rtx_equal_p (operands[0], operands[1]))
5536 emit_move_insn (operands[0], operands[1]);
5537
5538 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5539 ix86_expand_ashl_const (low[0], count, mode);
5540 }
5541 return;
5542 }
5543
5544 split_double_mode (mode, operands, 1, low, high);
5545 half_mode = mode == DImode ? SImode : DImode;
5546
5547 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5548
5549 if (operands[1] == const1_rtx)
5550 {
5551 /* Assuming we've chosen QImode-capable registers, 1 << N
5552 can be done with two 32/64-bit shifts, no branches, no cmoves. */
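/* The test of the count's half-width bit below picks which half receives
   the 1; the final half-word shifts (the hardware truncates their count
   modulo half_width) then move that 1 into position. */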
5553 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5554 {
5555 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5556
5557 ix86_expand_clear (low[0]);
5558 ix86_expand_clear (high[0]);
5559 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5560
5561 d = gen_lowpart (QImode, low[0]);
5562 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5563 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5564 emit_insn (gen_rtx_SET (d, s));
5565
5566 d = gen_lowpart (QImode, high[0]);
5567 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5568 s = gen_rtx_NE (QImode, flags, const0_rtx);
5569 emit_insn (gen_rtx_SET (d, s));
5570 }
5571
5572 /* Otherwise, we can get the same results by manually performing
5573 a bit extract operation on bit 5/6, and then performing the two
5574 shifts. The two methods of getting 0/1 into low/high are exactly
5575 the same size. Avoiding the shift in the bit extract case helps
5576 pentium4 a bit; no one else seems to care much either way. */
5577 else
5578 {
5579 rtx (*gen_lshr3)(rtx, rtx, rtx);
5580 rtx (*gen_and3)(rtx, rtx, rtx);
5581 rtx (*gen_xor3)(rtx, rtx, rtx);
5582 HOST_WIDE_INT bits;
5583 rtx x;
5584
5585 if (mode == DImode)
5586 {
5587 gen_lshr3 = gen_lshrsi3;
5588 gen_and3 = gen_andsi3;
5589 gen_xor3 = gen_xorsi3;
5590 bits = 5;
5591 }
5592 else
5593 {
5594 gen_lshr3 = gen_lshrdi3;
5595 gen_and3 = gen_anddi3;
5596 gen_xor3 = gen_xordi3;
5597 bits = 6;
5598 }
5599
5600 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5601 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5602 else
5603 x = gen_lowpart (half_mode, operands[2]);
5604 emit_insn (gen_rtx_SET (high[0], x));
5605
5606 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5607 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5608 emit_move_insn (low[0], high[0]);
5609 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5610 }
5611
5612 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5613 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5614 return;
5615 }
5616
5617 if (operands[1] == constm1_rtx)
5618 {
5619 /* For -1 << N, we can avoid the shld instruction, because we
5620 know that we're shifting 0...31/63 ones into a -1. */
5621 emit_move_insn (low[0], constm1_rtx);
5622 if (optimize_insn_for_size_p ())
5623 emit_move_insn (high[0], low[0]);
5624 else
5625 emit_move_insn (high[0], constm1_rtx);
5626 }
5627 else
5628 {
5629 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5630
5631 if (!rtx_equal_p (operands[0], operands[1]))
5632 emit_move_insn (operands[0], operands[1]);
5633
5634 split_double_mode (mode, operands, 1, low, high);
5635 emit_insn (gen_shld (high[0], low[0], operands[2]));
5636 }
5637
5638 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5639
5640 if (TARGET_CMOVE && scratch)
5641 {
5642 ix86_expand_clear (scratch);
5643 emit_insn (gen_x86_shift_adj_1
5644 (half_mode, high[0], low[0], operands[2], scratch));
5645 }
5646 else
5647 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5648 }
5649
5650 void
5651 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5652 {
5653 rtx (*gen_ashr3)(rtx, rtx, rtx)
5654 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5655 rtx (*gen_shrd)(rtx, rtx, rtx);
5656 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5657
5658 rtx low[2], high[2];
5659 int count;
5660
5661 if (CONST_INT_P (operands[2]))
5662 {
5663 split_double_mode (mode, operands, 2, low, high);
5664 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5665
5666 if (count == GET_MODE_BITSIZE (mode) - 1)
5667 {
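/* An arithmetic shift by the full width minus one leaves only copies of
   the sign bit, so both result halves are simply the sign mask of the
   high input half. */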
5668 emit_move_insn (high[0], high[1]);
5669 emit_insn (gen_ashr3 (high[0], high[0],
5670 GEN_INT (half_width - 1)));
5671 emit_move_insn (low[0], high[0]);
5672
5673 }
5674 else if (count >= half_width)
5675 {
5676 emit_move_insn (low[0], high[1]);
5677 emit_move_insn (high[0], low[0]);
5678 emit_insn (gen_ashr3 (high[0], high[0],
5679 GEN_INT (half_width - 1)));
5680
5681 if (count > half_width)
5682 emit_insn (gen_ashr3 (low[0], low[0],
5683 GEN_INT (count - half_width)));
5684 }
5685 else
5686 {
5687 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5688
5689 if (!rtx_equal_p (operands[0], operands[1]))
5690 emit_move_insn (operands[0], operands[1]);
5691
5692 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5693 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5694 }
5695 }
5696 else
5697 {
5698 machine_mode half_mode;
5699
5700 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5701
5702 if (!rtx_equal_p (operands[0], operands[1]))
5703 emit_move_insn (operands[0], operands[1]);
5704
5705 split_double_mode (mode, operands, 1, low, high);
5706 half_mode = mode == DImode ? SImode : DImode;
5707
5708 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5709 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5710
5711 if (TARGET_CMOVE && scratch)
5712 {
5713 emit_move_insn (scratch, high[0]);
5714 emit_insn (gen_ashr3 (scratch, scratch,
5715 GEN_INT (half_width - 1)));
5716 emit_insn (gen_x86_shift_adj_1
5717 (half_mode, low[0], high[0], operands[2], scratch));
5718 }
5719 else
5720 emit_insn (gen_x86_shift_adj_3
5721 (half_mode, low[0], high[0], operands[2]));
5722 }
5723 }
5724
5725 void
5726 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5727 {
5728 rtx (*gen_lshr3)(rtx, rtx, rtx)
5729 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5730 rtx (*gen_shrd)(rtx, rtx, rtx);
5731 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5732
5733 rtx low[2], high[2];
5734 int count;
5735
5736 if (CONST_INT_P (operands[2]))
5737 {
5738 split_double_mode (mode, operands, 2, low, high);
5739 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5740
5741 if (count >= half_width)
5742 {
5743 emit_move_insn (low[0], high[1]);
5744 ix86_expand_clear (high[0]);
5745
5746 if (count > half_width)
5747 emit_insn (gen_lshr3 (low[0], low[0],
5748 GEN_INT (count - half_width)));
5749 }
5750 else
5751 {
5752 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5753
5754 if (!rtx_equal_p (operands[0], operands[1]))
5755 emit_move_insn (operands[0], operands[1]);
5756
5757 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5758 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5759 }
5760 }
5761 else
5762 {
5763 machine_mode half_mode;
5764
5765 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5766
5767 if (!rtx_equal_p (operands[0], operands[1]))
5768 emit_move_insn (operands[0], operands[1]);
5769
5770 split_double_mode (mode, operands, 1, low, high);
5771 half_mode = mode == DImode ? SImode : DImode;
5772
5773 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5774 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5775
5776 if (TARGET_CMOVE && scratch)
5777 {
5778 ix86_expand_clear (scratch);
5779 emit_insn (gen_x86_shift_adj_1
5780 (half_mode, low[0], high[0], operands[2], scratch));
5781 }
5782 else
5783 emit_insn (gen_x86_shift_adj_2
5784 (half_mode, low[0], high[0], operands[2]));
5785 }
5786 }
5787
5788 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5789 DImode for constant loop counts. */
5790
5791 static machine_mode
5792 counter_mode (rtx count_exp)
5793 {
5794 if (GET_MODE (count_exp) != VOIDmode)
5795 return GET_MODE (count_exp);
5796 if (!CONST_INT_P (count_exp))
5797 return Pmode;
5798 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5799 return DImode;
5800 return SImode;
5801 }
5802
5803 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
5804 by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times; the overall
5805 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
5806 equivalent loop to set memory with VALUE (supposed to be in MODE).
5807
5808 The size is rounded down to a whole number of chunks moved at once.
5809 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
5810
5811
5812 static void
5813 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5814 rtx destptr, rtx srcptr, rtx value,
5815 rtx count, machine_mode mode, int unroll,
5816 int expected_size, bool issetmem)
5817 {
5818 rtx_code_label *out_label, *top_label;
5819 rtx iter, tmp;
5820 machine_mode iter_mode = counter_mode (count);
5821 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5822 rtx piece_size = GEN_INT (piece_size_n);
5823 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
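/* E.g. for DImode chunks unrolled 4 times the piece size is 32 bytes and
   the mask is ~31, so SIZE below becomes COUNT rounded down to a multiple
   of 32. */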
5824 rtx size;
5825 int i;
5826
5827 top_label = gen_label_rtx ();
5828 out_label = gen_label_rtx ();
5829 iter = gen_reg_rtx (iter_mode);
5830
5831 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5832 NULL, 1, OPTAB_DIRECT);
5833 /* Those two should combine. */
5834 if (piece_size == const1_rtx)
5835 {
5836 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5837 true, out_label);
5838 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5839 }
5840 emit_move_insn (iter, const0_rtx);
5841
5842 emit_label (top_label);
5843
5844 tmp = convert_modes (Pmode, iter_mode, iter, true);
5845
5846 /* This assert could be relaxed - in that case we'd need to compute the
5847 largest power of two that divides PIECE_SIZE_N and pass it to
5848 offset_address. */
5849 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5850 destmem = offset_address (destmem, tmp, piece_size_n);
5851 destmem = adjust_address (destmem, mode, 0);
5852
5853 if (!issetmem)
5854 {
5855 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5856 srcmem = adjust_address (srcmem, mode, 0);
5857
5858 /* When unrolling for chips that reorder memory reads and writes,
5859 we can save registers by using a single temporary.
5860 Also, using 4 temporaries is overkill in 32-bit mode. */
5861 if (!TARGET_64BIT && 0)
5862 {
5863 for (i = 0; i < unroll; i++)
5864 {
5865 if (i)
5866 {
5867 destmem = adjust_address (copy_rtx (destmem), mode,
5868 GET_MODE_SIZE (mode));
5869 srcmem = adjust_address (copy_rtx (srcmem), mode,
5870 GET_MODE_SIZE (mode));
5871 }
5872 emit_move_insn (destmem, srcmem);
5873 }
5874 }
5875 else
5876 {
5877 rtx tmpreg[4];
5878 gcc_assert (unroll <= 4);
5879 for (i = 0; i < unroll; i++)
5880 {
5881 tmpreg[i] = gen_reg_rtx (mode);
5882 if (i)
5883 srcmem = adjust_address (copy_rtx (srcmem), mode,
5884 GET_MODE_SIZE (mode));
5885 emit_move_insn (tmpreg[i], srcmem);
5886 }
5887 for (i = 0; i < unroll; i++)
5888 {
5889 if (i)
5890 destmem = adjust_address (copy_rtx (destmem), mode,
5891 GET_MODE_SIZE (mode));
5892 emit_move_insn (destmem, tmpreg[i]);
5893 }
5894 }
5895 }
5896 else
5897 for (i = 0; i < unroll; i++)
5898 {
5899 if (i)
5900 destmem = adjust_address (copy_rtx (destmem), mode,
5901 GET_MODE_SIZE (mode));
5902 emit_move_insn (destmem, value);
5903 }
5904
5905 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5906 true, OPTAB_LIB_WIDEN);
5907 if (tmp != iter)
5908 emit_move_insn (iter, tmp);
5909
5910 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5911 true, top_label);
5912 if (expected_size != -1)
5913 {
5914 expected_size /= GET_MODE_SIZE (mode) * unroll;
5915 if (expected_size == 0)
5916 predict_jump (0);
5917 else if (expected_size > REG_BR_PROB_BASE)
5918 predict_jump (REG_BR_PROB_BASE - 1);
5919 else
5920 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5921 / expected_size);
5922 }
5923 else
5924 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5925 iter = ix86_zero_extend_to_Pmode (iter);
5926 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5927 true, OPTAB_LIB_WIDEN);
5928 if (tmp != destptr)
5929 emit_move_insn (destptr, tmp);
5930 if (!issetmem)
5931 {
5932 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5933 true, OPTAB_LIB_WIDEN);
5934 if (tmp != srcptr)
5935 emit_move_insn (srcptr, tmp);
5936 }
5937 emit_label (out_label);
5938 }
5939
5940 /* Divide COUNTREG by SCALE. */
5941 static rtx
5942 scale_counter (rtx countreg, int scale)
5943 {
5944 rtx sc;
5945
5946 if (scale == 1)
5947 return countreg;
5948 if (CONST_INT_P (countreg))
5949 return GEN_INT (INTVAL (countreg) / scale);
5950 gcc_assert (REG_P (countreg));
5951
5952 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5953 GEN_INT (exact_log2 (scale)),
5954 NULL, 1, OPTAB_DIRECT);
5955 return sc;
5956 }
5957
5958 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5959 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5960 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5961 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5962 ORIG_VALUE is the original value passed to memset to fill the memory with.
5963 Other arguments have same meaning as for previous function. */
5964
5965 static void
5966 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5967 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5968 rtx count,
5969 machine_mode mode, bool issetmem)
5970 {
5971 rtx destexp;
5972 rtx srcexp;
5973 rtx countreg;
5974 HOST_WIDE_INT rounded_count;
5975
5976 /* If possible, it is shorter to use rep movs.
5977 TODO: Maybe it is better to move this logic to decide_alg. */
5978 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5979 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
5980 && (!issetmem || orig_value == const0_rtx))
5981 mode = SImode;
5982
5983 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5984 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5985
5986 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5987 GET_MODE_SIZE (mode)));
5988 if (mode != QImode)
5989 {
5990 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5991 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5992 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5993 }
5994 else
5995 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5996 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5997 {
5998 rounded_count
5999 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6000 destmem = shallow_copy_rtx (destmem);
6001 set_mem_size (destmem, rounded_count);
6002 }
6003 else if (MEM_SIZE_KNOWN_P (destmem))
6004 clear_mem_size (destmem);
6005
6006 if (issetmem)
6007 {
6008 value = force_reg (mode, gen_lowpart (mode, value));
6009 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
6010 }
6011 else
6012 {
6013 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
6014 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
6015 if (mode != QImode)
6016 {
6017 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
6018 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
6019 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
6020 }
6021 else
6022 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
6023 if (CONST_INT_P (count))
6024 {
6025 rounded_count
6026 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6027 srcmem = shallow_copy_rtx (srcmem);
6028 set_mem_size (srcmem, rounded_count);
6029 }
6030 else
6031 {
6032 if (MEM_SIZE_KNOWN_P (srcmem))
6033 clear_mem_size (srcmem);
6034 }
6035 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6036 destexp, srcexp));
6037 }
6038 }
6039
6040 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6041 DESTMEM.
6042 SRCMEM is passed by pointer so it can be updated on return.
6043 The return value is the updated DESTMEM. */
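/* For instance, with SIZE_TO_MOVE == 8 on a 64-bit target this emits a
   single DImode load into a fresh temporary followed by a DImode store,
   and both pointers are advanced by 8 bytes.  */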
6044 static rtx
6045 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6046 HOST_WIDE_INT size_to_move)
6047 {
6048 rtx dst = destmem, src = *srcmem, tempreg;
6049 enum insn_code code;
6050 machine_mode move_mode;
6051 int piece_size, i;
6052
6053 /* Find the widest mode in which we could perform moves.
6054 Start with the biggest power of 2 no larger than SIZE_TO_MOVE and
6055 halve it until a move of that size is supported. */
6056 piece_size = 1 << floor_log2 (size_to_move);
6057 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6058 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6059 {
6060 gcc_assert (piece_size > 1);
6061 piece_size >>= 1;
6062 }
6063
6064 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6065 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6066 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6067 {
6068 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6069 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6070 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6071 {
6072 move_mode = word_mode;
6073 piece_size = GET_MODE_SIZE (move_mode);
6074 code = optab_handler (mov_optab, move_mode);
6075 }
6076 }
6077 gcc_assert (code != CODE_FOR_nothing);
6078
6079 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6080 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6081
6082 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6083 gcc_assert (size_to_move % piece_size == 0);
6084
6085 for (i = 0; i < size_to_move; i += piece_size)
6086 {
6087 /* We move from memory to memory, so we'll need to do it via
6088 a temporary register. */
6089 tempreg = gen_reg_rtx (move_mode);
6090 emit_insn (GEN_FCN (code) (tempreg, src));
6091 emit_insn (GEN_FCN (code) (dst, tempreg));
6092
6093 emit_move_insn (destptr,
6094 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6095 emit_move_insn (srcptr,
6096 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
6097
6098 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6099 piece_size);
6100 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6101 piece_size);
6102 }
6103
6104 /* Update DST and SRC rtx. */
6105 *srcmem = src;
6106 return dst;
6107 }
6108
6109 /* Helper function for the string operations below. Test whether
6110 VARIABLE & VALUE is zero and, if so, jump to the returned label. */
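/* Typical use: LABEL = ix86_expand_aligntest (count, 4, true);
   <emit a 4-byte move>; emit_label (LABEL);  -- the move is then
   executed only when bit 2 of COUNT is set.  */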
6111
6112 static rtx_code_label *
6113 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6114 {
6115 rtx_code_label *label = gen_label_rtx ();
6116 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6117 if (GET_MODE (variable) == DImode)
6118 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6119 else
6120 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6121 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6122 1, label);
6123 if (epilogue)
6124 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6125 else
6126 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6127 return label;
6128 }
6129
6130
6131 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6132
6133 static void
6134 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6135 rtx destptr, rtx srcptr, rtx count, int max_size)
6136 {
6137 rtx src, dest;
6138 if (CONST_INT_P (count))
6139 {
6140 HOST_WIDE_INT countval = INTVAL (count);
6141 HOST_WIDE_INT epilogue_size = countval % max_size;
6142 int i;
6143
6144 /* For now MAX_SIZE should be a power of 2. This assert could be
6145 relaxed, but that would require a somewhat more complicated epilogue
6146 expansion. */
6147 gcc_assert ((max_size & (max_size - 1)) == 0);
6148 for (i = max_size; i >= 1; i >>= 1)
6149 {
6150 if (epilogue_size & i)
6151 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6152 }
6153 return;
6154 }
6155 if (max_size > 8)
6156 {
6157 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6158 count, 1, OPTAB_DIRECT);
6159 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6160 count, QImode, 1, 4, false);
6161 return;
6162 }
6163
6164 /* When single string operations are available, we can cheaply advance
6165 the dest and src pointers. Otherwise we save code size by maintaining
6166 an offset register (zero is readily available from the preceding rep
6167 operation) and using x86 addressing modes. */
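/* In the latter case the moves address memory as (srcptr + offset) and
   (destptr + offset), and only the single OFFSET register is advanced
   after each piece.  */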
6168 if (TARGET_SINGLE_STRINGOP)
6169 {
6170 if (max_size > 4)
6171 {
6172 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6173 src = change_address (srcmem, SImode, srcptr);
6174 dest = change_address (destmem, SImode, destptr);
6175 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6176 emit_label (label);
6177 LABEL_NUSES (label) = 1;
6178 }
6179 if (max_size > 2)
6180 {
6181 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6182 src = change_address (srcmem, HImode, srcptr);
6183 dest = change_address (destmem, HImode, destptr);
6184 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6185 emit_label (label);
6186 LABEL_NUSES (label) = 1;
6187 }
6188 if (max_size > 1)
6189 {
6190 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6191 src = change_address (srcmem, QImode, srcptr);
6192 dest = change_address (destmem, QImode, destptr);
6193 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6194 emit_label (label);
6195 LABEL_NUSES (label) = 1;
6196 }
6197 }
6198 else
6199 {
6200 rtx offset = force_reg (Pmode, const0_rtx);
6201 rtx tmp;
6202
6203 if (max_size > 4)
6204 {
6205 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6206 src = change_address (srcmem, SImode, srcptr);
6207 dest = change_address (destmem, SImode, destptr);
6208 emit_move_insn (dest, src);
6209 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6210 true, OPTAB_LIB_WIDEN);
6211 if (tmp != offset)
6212 emit_move_insn (offset, tmp);
6213 emit_label (label);
6214 LABEL_NUSES (label) = 1;
6215 }
6216 if (max_size > 2)
6217 {
6218 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6219 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6220 src = change_address (srcmem, HImode, tmp);
6221 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6222 dest = change_address (destmem, HImode, tmp);
6223 emit_move_insn (dest, src);
6224 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6225 true, OPTAB_LIB_WIDEN);
6226 if (tmp != offset)
6227 emit_move_insn (offset, tmp);
6228 emit_label (label);
6229 LABEL_NUSES (label) = 1;
6230 }
6231 if (max_size > 1)
6232 {
6233 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6234 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6235 src = change_address (srcmem, QImode, tmp);
6236 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6237 dest = change_address (destmem, QImode, tmp);
6238 emit_move_insn (dest, src);
6239 emit_label (label);
6240 LABEL_NUSES (label) = 1;
6241 }
6242 }
6243 }
6244
6245 /* This function emits stores to fill SIZE_TO_MOVE bytes starting at DESTMEM
6246 with value PROMOTED_VAL.
6247 DESTPTR is advanced as the stores are emitted.
6248 The return value is the updated DESTMEM. */
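/* For example, with a DImode PROMOTED_VAL and SIZE_TO_MOVE == 4, the
   mode is narrowed to SImode and a single 4-byte store of the low part
   of the value is emitted.  */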
6249 static rtx
6250 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6251 HOST_WIDE_INT size_to_move)
6252 {
6253 rtx dst = destmem;
6254 enum insn_code code;
6255 machine_mode move_mode;
6256 int piece_size, i;
6257
6258 /* Find the widest mode in which we can perform the stores.
6259 Start with the mode of PROMOTED_VAL and narrow it if SIZE_TO_MOVE
6260 is smaller than that mode's size. */
6261 move_mode = GET_MODE (promoted_val);
6262 if (move_mode == VOIDmode)
6263 move_mode = QImode;
6264 if (size_to_move < GET_MODE_SIZE (move_mode))
6265 {
6266 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6267 move_mode = int_mode_for_size (move_bits, 0).require ();
6268 promoted_val = gen_lowpart (move_mode, promoted_val);
6269 }
6270 piece_size = GET_MODE_SIZE (move_mode);
6271 code = optab_handler (mov_optab, move_mode);
6272 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6273
6274 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6275
6276 /* Emit stores. We'll need SIZE_TO_MOVE/PIECE_SIZE stores. */
6277 gcc_assert (size_to_move % piece_size == 0);
6278
6279 for (i = 0; i < size_to_move; i += piece_size)
6280 {
6281 if (piece_size <= GET_MODE_SIZE (word_mode))
6282 {
6283 emit_insn (gen_strset (destptr, dst, promoted_val));
6284 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6285 piece_size);
6286 continue;
6287 }
6288
6289 emit_insn (GEN_FCN (code) (dst, promoted_val));
6290
6291 emit_move_insn (destptr,
6292 plus_constant (Pmode, copy_rtx (destptr), piece_size));
6293
6294 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6295 piece_size);
6296 }
6297
6298 /* Update DST rtx. */
6299 return dst;
6300 }
6301 /* Output code to set at most count & (max_size - 1) bytes starting at DESTMEM. */
6302 static void
6303 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6304 rtx count, int max_size)
6305 {
6306 count = expand_simple_binop (counter_mode (count), AND, count,
6307 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6308 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6309 gen_lowpart (QImode, value), count, QImode,
6310 1, max_size / 2, true);
6311 }
6312
6313 /* Output code to set at most count & (max_size - 1) bytes starting at DESTMEM. */
6314 static void
6315 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6316 rtx count, int max_size)
6317 {
6318 rtx dest;
6319
6320 if (CONST_INT_P (count))
6321 {
6322 HOST_WIDE_INT countval = INTVAL (count);
6323 HOST_WIDE_INT epilogue_size = countval % max_size;
6324 int i;
6325
6326 /* For now MAX_SIZE should be a power of 2. This assert could be
6327 relaxed, but that would require a somewhat more complicated epilogue
6328 expansion. */
6329 gcc_assert ((max_size & (max_size - 1)) == 0);
6330 for (i = max_size; i >= 1; i >>= 1)
6331 {
6332 if (epilogue_size & i)
6333 {
6334 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6335 destmem = emit_memset (destmem, destptr, vec_value, i);
6336 else
6337 destmem = emit_memset (destmem, destptr, value, i);
6338 }
6339 }
6340 return;
6341 }
6342 if (max_size > 32)
6343 {
6344 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6345 return;
6346 }
6347 if (max_size > 16)
6348 {
6349 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6350 if (TARGET_64BIT)
6351 {
6352 dest = change_address (destmem, DImode, destptr);
6353 emit_insn (gen_strset (destptr, dest, value));
6354 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6355 emit_insn (gen_strset (destptr, dest, value));
6356 }
6357 else
6358 {
6359 dest = change_address (destmem, SImode, destptr);
6360 emit_insn (gen_strset (destptr, dest, value));
6361 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6362 emit_insn (gen_strset (destptr, dest, value));
6363 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6364 emit_insn (gen_strset (destptr, dest, value));
6365 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6366 emit_insn (gen_strset (destptr, dest, value));
6367 }
6368 emit_label (label);
6369 LABEL_NUSES (label) = 1;
6370 }
6371 if (max_size > 8)
6372 {
6373 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6374 if (TARGET_64BIT)
6375 {
6376 dest = change_address (destmem, DImode, destptr);
6377 emit_insn (gen_strset (destptr, dest, value));
6378 }
6379 else
6380 {
6381 dest = change_address (destmem, SImode, destptr);
6382 emit_insn (gen_strset (destptr, dest, value));
6383 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6384 emit_insn (gen_strset (destptr, dest, value));
6385 }
6386 emit_label (label);
6387 LABEL_NUSES (label) = 1;
6388 }
6389 if (max_size > 4)
6390 {
6391 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6392 dest = change_address (destmem, SImode, destptr);
6393 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6394 emit_label (label);
6395 LABEL_NUSES (label) = 1;
6396 }
6397 if (max_size > 2)
6398 {
6399 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6400 dest = change_address (destmem, HImode, destptr);
6401 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6402 emit_label (label);
6403 LABEL_NUSES (label) = 1;
6404 }
6405 if (max_size > 1)
6406 {
6407 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6408 dest = change_address (destmem, QImode, destptr);
6409 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6410 emit_label (label);
6411 LABEL_NUSES (label) = 1;
6412 }
6413 }
6414
6415 /* Decrease COUNTREG by VALUE. */
6416 static void
6417 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6418 {
6419 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6420 }
6421
6422 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6423 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6424 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6425 ignored.
6426 Return value is updated DESTMEM. */
6427
6428 static rtx
6429 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6430 rtx destptr, rtx srcptr, rtx value,
6431 rtx vec_value, rtx count, int align,
6432 int desired_alignment, bool issetmem)
6433 {
6434 int i;
6435 for (i = 1; i < desired_alignment; i <<= 1)
6436 {
6437 if (align <= i)
6438 {
6439 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6440 if (issetmem)
6441 {
6442 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6443 destmem = emit_memset (destmem, destptr, vec_value, i);
6444 else
6445 destmem = emit_memset (destmem, destptr, value, i);
6446 }
6447 else
6448 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6449 ix86_adjust_counter (count, i);
6450 emit_label (label);
6451 LABEL_NUSES (label) = 1;
6452 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6453 }
6454 }
6455 return destmem;
6456 }
6457
6458 /* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
6459 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6460 and jump to DONE_LABEL. */
6461 static void
6462 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6463 rtx destptr, rtx srcptr,
6464 rtx value, rtx vec_value,
6465 rtx count, int size,
6466 rtx done_label, bool issetmem)
6467 {
6468 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6469 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6470 rtx modesize;
6471 int n;
6472
6473 /* If we do not have a vector value to store, we must reduce the size. */
6474 if (issetmem)
6475 {
6476 if (!vec_value)
6477 {
6478 if (GET_MODE (value) == VOIDmode && size > 8)
6479 mode = Pmode;
6480 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6481 mode = GET_MODE (value);
6482 }
6483 else
6484 mode = GET_MODE (vec_value), value = vec_value;
6485 }
6486 else
6487 {
6488 /* Choose appropriate vector mode. */
6489 if (size >= 32)
6490 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6491 else if (size >= 16)
6492 mode = TARGET_SSE ? V16QImode : DImode;
6493 srcmem = change_address (srcmem, mode, srcptr);
6494 }
6495 destmem = change_address (destmem, mode, destptr);
6496 modesize = GEN_INT (GET_MODE_SIZE (mode));
6497 gcc_assert (GET_MODE_SIZE (mode) <= size);
6498 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6499 {
6500 if (issetmem)
6501 emit_move_insn (destmem, gen_lowpart (mode, value));
6502 else
6503 {
6504 emit_move_insn (destmem, srcmem);
6505 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6506 }
6507 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6508 }
6509
6510 destmem = offset_address (destmem, count, 1);
6511 destmem = offset_address (destmem, GEN_INT (-2 * size),
6512 GET_MODE_SIZE (mode));
6513 if (!issetmem)
6514 {
6515 srcmem = offset_address (srcmem, count, 1);
6516 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6517 GET_MODE_SIZE (mode));
6518 }
6519 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6520 {
6521 if (issetmem)
6522 emit_move_insn (destmem, gen_lowpart (mode, value));
6523 else
6524 {
6525 emit_move_insn (destmem, srcmem);
6526 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6527 }
6528 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6529 }
6530 emit_jump_insn (gen_jump (done_label));
6531 emit_barrier ();
6532
6533 emit_label (label);
6534 LABEL_NUSES (label) = 1;
6535 }
6536
6537 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
6538 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6539 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
6540 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
6541 DONE_LABEL is a label after the whole copying sequence. The label is created
6542 on demand if *DONE_LABEL is NULL.
6543 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
6544 bounds after the initial copies.
6545
6546 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6547 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
6548 we will dispatch to a library call for large blocks.
6549
6550 In pseudocode we do:
6551
6552 if (COUNT < SIZE)
6553 {
6554 Assume that SIZE is 4. Bigger sizes are handled analogously
6555 if (COUNT & 4)
6556 {
6557 copy 4 bytes from SRCPTR to DESTPTR
6558 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6559 goto done_label
6560 }
6561 if (!COUNT)
6562 goto done_label;
6563 copy 1 byte from SRCPTR to DESTPTR
6564 if (COUNT & 2)
6565 {
6566 copy 2 bytes from SRCPTR to DESTPTR
6567 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6568 }
6569 }
6570 else
6571 {
6572 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6573 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6574
6575 OLD_DESTPTR = DESTPTR;
6576 Align DESTPTR up to DESIRED_ALIGN
6577 SRCPTR += DESTPTR - OLD_DESTPTR
6578 COUNT -= DESTPTR - OLD_DESTPTR
6579 if (DYNAMIC_CHECK)
6580 Round COUNT down to multiple of SIZE
6581 << optional caller supplied zero size guard is here >>
6582 << optional caller supplied dynamic check is here >>
6583 << caller supplied main copy loop is here >>
6584 }
6585 done_label:
6586 */
6587 static void
6588 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6589 rtx *destptr, rtx *srcptr,
6590 machine_mode mode,
6591 rtx value, rtx vec_value,
6592 rtx *count,
6593 rtx_code_label **done_label,
6594 int size,
6595 int desired_align,
6596 int align,
6597 unsigned HOST_WIDE_INT *min_size,
6598 bool dynamic_check,
6599 bool issetmem)
6600 {
6601 rtx_code_label *loop_label = NULL, *label;
6602 int n;
6603 rtx modesize;
6604 int prolog_size = 0;
6605 rtx mode_value;
6606
6607 /* Choose the proper value to copy. */
6608 if (issetmem && VECTOR_MODE_P (mode))
6609 mode_value = vec_value;
6610 else
6611 mode_value = value;
6612 gcc_assert (GET_MODE_SIZE (mode) <= size);
6613
6614 /* See if block is big or small, handle small blocks. */
6615 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6616 {
6617 int size2 = size;
6618 loop_label = gen_label_rtx ();
6619
6620 if (!*done_label)
6621 *done_label = gen_label_rtx ();
6622
6623 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6624 1, loop_label);
6625 size2 >>= 1;
6626
6627 /* Handle sizes > 3. */
6628 for (;size2 > 2; size2 >>= 1)
6629 expand_small_cpymem_or_setmem (destmem, srcmem,
6630 *destptr, *srcptr,
6631 value, vec_value,
6632 *count,
6633 size2, *done_label, issetmem);
6634 /* Nothing to copy? Jump to DONE_LABEL if so. */
6635 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6636 1, *done_label);
6637
6638 /* Do a byte copy. */
6639 destmem = change_address (destmem, QImode, *destptr);
6640 if (issetmem)
6641 emit_move_insn (destmem, gen_lowpart (QImode, value));
6642 else
6643 {
6644 srcmem = change_address (srcmem, QImode, *srcptr);
6645 emit_move_insn (destmem, srcmem);
6646 }
6647
6648 /* Handle sizes 2 and 3. */
6649 label = ix86_expand_aligntest (*count, 2, false);
6650 destmem = change_address (destmem, HImode, *destptr);
6651 destmem = offset_address (destmem, *count, 1);
6652 destmem = offset_address (destmem, GEN_INT (-2), 2);
6653 if (issetmem)
6654 emit_move_insn (destmem, gen_lowpart (HImode, value));
6655 else
6656 {
6657 srcmem = change_address (srcmem, HImode, *srcptr);
6658 srcmem = offset_address (srcmem, *count, 1);
6659 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6660 emit_move_insn (destmem, srcmem);
6661 }
6662
6663 emit_label (label);
6664 LABEL_NUSES (label) = 1;
6665 emit_jump_insn (gen_jump (*done_label));
6666 emit_barrier ();
6667 }
6668 else
6669 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6670 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6671
6672 /* Start memcpy for COUNT >= SIZE. */
6673 if (loop_label)
6674 {
6675 emit_label (loop_label);
6676 LABEL_NUSES (loop_label) = 1;
6677 }
6678
6679 /* Copy first desired_align bytes. */
6680 if (!issetmem)
6681 srcmem = change_address (srcmem, mode, *srcptr);
6682 destmem = change_address (destmem, mode, *destptr);
6683 modesize = GEN_INT (GET_MODE_SIZE (mode));
6684 for (n = 0; prolog_size < desired_align - align; n++)
6685 {
6686 if (issetmem)
6687 emit_move_insn (destmem, mode_value);
6688 else
6689 {
6690 emit_move_insn (destmem, srcmem);
6691 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6692 }
6693 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6694 prolog_size += GET_MODE_SIZE (mode);
6695 }
6696
6697
6698 /* Copy last SIZE bytes. */
6699 destmem = offset_address (destmem, *count, 1);
6700 destmem = offset_address (destmem,
6701 GEN_INT (-size - prolog_size),
6702 1);
6703 if (issetmem)
6704 emit_move_insn (destmem, mode_value);
6705 else
6706 {
6707 srcmem = offset_address (srcmem, *count, 1);
6708 srcmem = offset_address (srcmem,
6709 GEN_INT (-size - prolog_size),
6710 1);
6711 emit_move_insn (destmem, srcmem);
6712 }
6713 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6714 {
6715 destmem = offset_address (destmem, modesize, 1);
6716 if (issetmem)
6717 emit_move_insn (destmem, mode_value);
6718 else
6719 {
6720 srcmem = offset_address (srcmem, modesize, 1);
6721 emit_move_insn (destmem, srcmem);
6722 }
6723 }
6724
6725 /* Align destination. */
6726 if (desired_align > 1 && desired_align > align)
6727 {
6728 rtx saveddest = *destptr;
6729
6730 gcc_assert (desired_align <= size);
6731 /* Align destptr up, placing it in a new register. */
6732 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6733 GEN_INT (prolog_size),
6734 NULL_RTX, 1, OPTAB_DIRECT);
6735 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6736 REG_POINTER (*destptr) = 1;
6737 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6738 GEN_INT (-desired_align),
6739 *destptr, 1, OPTAB_DIRECT);
6740 /* See how many bytes we skipped. */
6741 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6742 *destptr,
6743 saveddest, 1, OPTAB_DIRECT);
6744 /* Adjust srcptr and count. */
6745 if (!issetmem)
6746 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6747 saveddest, *srcptr, 1, OPTAB_DIRECT);
6748 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6749 saveddest, *count, 1, OPTAB_DIRECT);
6750 /* We copied at most size + prolog_size. */
6751 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6752 *min_size
6753 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6754 else
6755 *min_size = 0;
6756
6757 /* Our loops always round down the block size, but for dispatch to a
6758 library call we need the precise value. */
6759 if (dynamic_check)
6760 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6761 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6762 }
6763 else
6764 {
6765 gcc_assert (prolog_size == 0);
6766 /* Decrease count, so we won't end up copying last word twice. */
6767 if (!CONST_INT_P (*count))
6768 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6769 constm1_rtx, *count, 1, OPTAB_DIRECT);
6770 else
6771 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6772 (unsigned HOST_WIDE_INT)size));
6773 if (*min_size)
6774 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6775 }
6776 }
6777
6778
6779 /* This function is like the previous one, except here we know how many bytes
6780 need to be copied. That allows us to update alignment not only of DST, which
6781 is returned, but also of SRC, which is passed as a pointer for that
6782 reason. */
6783 static rtx
6784 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6785 rtx srcreg, rtx value, rtx vec_value,
6786 int desired_align, int align_bytes,
6787 bool issetmem)
6788 {
6789 rtx src = NULL;
6790 rtx orig_dst = dst;
6791 rtx orig_src = NULL;
6792 int piece_size = 1;
6793 int copied_bytes = 0;
6794
6795 if (!issetmem)
6796 {
6797 gcc_assert (srcp != NULL);
6798 src = *srcp;
6799 orig_src = src;
6800 }
6801
6802 for (piece_size = 1;
6803 piece_size <= desired_align && copied_bytes < align_bytes;
6804 piece_size <<= 1)
6805 {
6806 if (align_bytes & piece_size)
6807 {
6808 if (issetmem)
6809 {
6810 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6811 dst = emit_memset (dst, destreg, vec_value, piece_size);
6812 else
6813 dst = emit_memset (dst, destreg, value, piece_size);
6814 }
6815 else
6816 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6817 copied_bytes += piece_size;
6818 }
6819 }
6820 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6821 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6822 if (MEM_SIZE_KNOWN_P (orig_dst))
6823 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6824
6825 if (!issetmem)
6826 {
6827 int src_align_bytes = get_mem_align_offset (src, desired_align
6828 * BITS_PER_UNIT);
6829 if (src_align_bytes >= 0)
6830 src_align_bytes = desired_align - src_align_bytes;
6831 if (src_align_bytes >= 0)
6832 {
6833 unsigned int src_align;
6834 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6835 {
6836 if ((src_align_bytes & (src_align - 1))
6837 == (align_bytes & (src_align - 1)))
6838 break;
6839 }
6840 if (src_align > (unsigned int) desired_align)
6841 src_align = desired_align;
6842 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6843 set_mem_align (src, src_align * BITS_PER_UNIT);
6844 }
6845 if (MEM_SIZE_KNOWN_P (orig_src))
6846 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6847 *srcp = src;
6848 }
6849
6850 return dst;
6851 }
6852
6853 /* Return true if ALG can be used in current context.
6854 Assume we expand memset if MEMSET is true. */
6855 static bool
6856 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6857 {
6858 if (alg == no_stringop)
6859 return false;
6860 if (alg == vector_loop)
6861 return TARGET_SSE || TARGET_AVX;
6862 /* Algorithms using the rep prefix want at least edi and ecx;
6863 additionally, memset wants eax and memcpy wants esi. Don't
6864 consider such algorithms if the user has appropriated those
6865 registers for their own purposes, or if we have a non-default
6866 address space, since some string insns cannot override the segment. */
6867 if (alg == rep_prefix_1_byte
6868 || alg == rep_prefix_4_byte
6869 || alg == rep_prefix_8_byte)
6870 {
6871 if (have_as)
6872 return false;
6873 if (fixed_regs[CX_REG]
6874 || fixed_regs[DI_REG]
6875 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6876 return false;
6877 }
6878 return true;
6879 }
6880
6881 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6882 static enum stringop_alg
6883 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6884 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6885 bool memset, bool zero_memset, bool have_as,
6886 int *dynamic_check, bool *noalign, bool recur)
6887 {
6888 const struct stringop_algs *algs;
6889 bool optimize_for_speed;
6890 int max = 0;
6891 const struct processor_costs *cost;
6892 int i;
6893 bool any_alg_usable_p = false;
6894
6895 *noalign = false;
6896 *dynamic_check = -1;
6897
6898 /* Even if the string operation call is cold, we still might spend a lot
6899 of time processing large blocks. */
6900 if (optimize_function_for_size_p (cfun)
6901 || (optimize_insn_for_size_p ()
6902 && (max_size < 256
6903 || (expected_size != -1 && expected_size < 256))))
6904 optimize_for_speed = false;
6905 else
6906 optimize_for_speed = true;
6907
6908 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6909 if (memset)
6910 algs = &cost->memset[TARGET_64BIT != 0];
6911 else
6912 algs = &cost->memcpy[TARGET_64BIT != 0];
6913
6914 /* Find the maximal size handled by a usable non-libcall algorithm. */
6915 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6916 {
6917 enum stringop_alg candidate = algs->size[i].alg;
6918 bool usable = alg_usable_p (candidate, memset, have_as);
6919 any_alg_usable_p |= usable;
6920
6921 if (candidate != libcall && candidate && usable)
6922 max = algs->size[i].max;
6923 }
6924
6925 /* If the expected size is not known but the maximum size is small enough
6926 that the inline version is a win, set the expected size to the middle
6927 of the range. */
6928 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6929 && expected_size == -1)
6930 expected_size = min_size / 2 + max_size / 2;
6931
6932 /* If user specified the algorithm, honor it if possible. */
6933 if (ix86_stringop_alg != no_stringop
6934 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6935 return ix86_stringop_alg;
6936 /* rep; movq or rep; movl is the smallest variant. */
6937 else if (!optimize_for_speed)
6938 {
6939 *noalign = true;
6940 if (!count || (count & 3) || (memset && !zero_memset))
6941 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6942 ? rep_prefix_1_byte : loop_1_byte;
6943 else
6944 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6945 ? rep_prefix_4_byte : loop;
6946 }
6947 /* Very tiny blocks are best handled via the loop; REP is expensive to
6948 set up. */
6949 else if (expected_size != -1 && expected_size < 4)
6950 return loop_1_byte;
6951 else if (expected_size != -1)
6952 {
6953 enum stringop_alg alg = libcall;
6954 bool alg_noalign = false;
6955 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6956 {
6957 /* We get here if the algorithms that were not libcall-based
6958 were rep-prefix based and we are unable to use rep prefixes
6959 based on global register usage. Break out of the loop and
6960 use the heuristic below. */
6961 if (algs->size[i].max == 0)
6962 break;
6963 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6964 {
6965 enum stringop_alg candidate = algs->size[i].alg;
6966
6967 if (candidate != libcall
6968 && alg_usable_p (candidate, memset, have_as))
6969 {
6970 alg = candidate;
6971 alg_noalign = algs->size[i].noalign;
6972 }
6973 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6974 last non-libcall inline algorithm. */
6975 if (TARGET_INLINE_ALL_STRINGOPS)
6976 {
6977 /* When the current size is best to be copied by a libcall,
6978 but we are still forced to inline, run the heuristic below
6979 that will pick code for medium sized blocks. */
6980 if (alg != libcall)
6981 {
6982 *noalign = alg_noalign;
6983 return alg;
6984 }
6985 else if (!any_alg_usable_p)
6986 break;
6987 }
6988 else if (alg_usable_p (candidate, memset, have_as)
6989 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
6990 && candidate == rep_prefix_1_byte
6991 /* NB: If min_size != max_size, size is
6992 unknown. */
6993 && min_size != max_size))
6994 {
6995 *noalign = algs->size[i].noalign;
6996 return candidate;
6997 }
6998 }
6999 }
7000 }
7001 /* When asked to inline the call anyway, try to pick a meaningful choice.
7002 We look for the maximal size of block that is faster to copy by hand and
7003 take blocks of at most that size, guessing that the average size will
7004 be roughly half of the maximum.
7005
7006 If this turns out to be bad, we might simply specify the preferred
7007 choice in ix86_costs. */
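/* For example, if the cost table caps the last usable inline algorithm
   at 8192 bytes, the recursive call below re-decides with an expected
   size of 4096.  */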
7008 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7009 && (algs->unknown_size == libcall
7010 || !alg_usable_p (algs->unknown_size, memset, have_as)))
7011 {
7012 enum stringop_alg alg;
7013 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
7014
7015 /* If there aren't any usable algorithms or if recursing already,
7016 then recursing on smaller sizes or same size isn't going to
7017 find anything. Just return the simple byte-at-a-time copy loop. */
7018 if (!any_alg_usable_p || recur)
7019 {
7020 /* Pick something reasonable. */
7021 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
7022 *dynamic_check = 128;
7023 return loop_1_byte;
7024 }
7025 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
7026 zero_memset, have_as, dynamic_check, noalign, true);
7027 gcc_assert (*dynamic_check == -1);
7028 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7029 *dynamic_check = max;
7030 else
7031 gcc_assert (alg != libcall);
7032 return alg;
7033 }
7034 return (alg_usable_p (algs->unknown_size, memset, have_as)
7035 ? algs->unknown_size : libcall);
7036 }
7037
7038 /* Decide on alignment. We know that the operand is already aligned to ALIGN
7039 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7040 static int
7041 decide_alignment (int align,
7042 enum stringop_alg alg,
7043 int expected_size,
7044 machine_mode move_mode)
7045 {
7046 int desired_align = 0;
7047
7048 gcc_assert (alg != no_stringop);
7049
7050 if (alg == libcall)
7051 return 0;
7052 if (move_mode == VOIDmode)
7053 return 0;
7054
7055 desired_align = GET_MODE_SIZE (move_mode);
7056 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
7057 copying a whole cache line at once. */
7058 if (TARGET_PENTIUMPRO
7059 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7060 desired_align = 8;
7061
7062 if (optimize_size)
7063 desired_align = 1;
7064 if (desired_align < align)
7065 desired_align = align;
7066 if (expected_size != -1 && expected_size < 4)
7067 desired_align = align;
7068
7069 return desired_align;
7070 }
7071
7072
7073 /* Helper function for memset expansion. For a QImode value 0xXY produce
7074 0xXYXYXYXY of the width specified by MODE. This is essentially
7075 a * 0x10101010, but we can do slightly better than
7076 synth_mult by unwinding the sequence by hand on CPUs with
7077 slow multiply. */
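/* For instance, the byte 0x5A is replicated to 0x5A5A5A5A in SImode or
   0x5A5A5A5A5A5A5A5A in DImode; for non-constant values this is done
   either by multiplying with the promoted constant 1 (0x01010101...)
   or by the shift/or sequence below, whichever the cost model prefers.  */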
7078 static rtx
7079 promote_duplicated_reg (machine_mode mode, rtx val)
7080 {
7081 machine_mode valmode = GET_MODE (val);
7082 rtx tmp;
7083 int nops = mode == DImode ? 3 : 2;
7084
7085 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7086 if (val == const0_rtx)
7087 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7088 if (CONST_INT_P (val))
7089 {
7090 HOST_WIDE_INT v = INTVAL (val) & 255;
7091
7092 v |= v << 8;
7093 v |= v << 16;
7094 if (mode == DImode)
7095 v |= (v << 16) << 16;
7096 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7097 }
7098
7099 if (valmode == VOIDmode)
7100 valmode = QImode;
7101 if (valmode != QImode)
7102 val = gen_lowpart (QImode, val);
7103 if (mode == QImode)
7104 return val;
7105 if (!TARGET_PARTIAL_REG_STALL)
7106 nops--;
7107 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7108 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7109 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7110 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7111 {
7112 rtx reg = convert_modes (mode, QImode, val, true);
7113 tmp = promote_duplicated_reg (mode, const1_rtx);
7114 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7115 OPTAB_DIRECT);
7116 }
7117 else
7118 {
7119 rtx reg = convert_modes (mode, QImode, val, true);
7120
7121 if (!TARGET_PARTIAL_REG_STALL)
7122 emit_insn (gen_insv_1 (mode, reg, reg));
7123 else
7124 {
7125 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7126 NULL, 1, OPTAB_DIRECT);
7127 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7128 OPTAB_DIRECT);
7129 }
7130 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7131 NULL, 1, OPTAB_DIRECT);
7132 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7133 if (mode == SImode)
7134 return reg;
7135 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7136 NULL, 1, OPTAB_DIRECT);
7137 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7138 return reg;
7139 }
7140 }
7141
7142 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
7143 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
7144 getting the alignment from ALIGN to DESIRED_ALIGN. */
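/* E.g. on a 64-bit target SIZE_NEEDED == 8 forces a DImode replication,
   while SIZE_NEEDED == 4 with no extra alignment work only needs SImode.  */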
7145 static rtx
7146 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7147 int align)
7148 {
7149 rtx promoted_val;
7150
7151 if (TARGET_64BIT
7152 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7153 promoted_val = promote_duplicated_reg (DImode, val);
7154 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7155 promoted_val = promote_duplicated_reg (SImode, val);
7156 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7157 promoted_val = promote_duplicated_reg (HImode, val);
7158 else
7159 promoted_val = val;
7160
7161 return promoted_val;
7162 }
7163
7164 /* Copy the address to a Pmode register. This is used for x32 to
7165 truncate DImode TLS address to a SImode register. */
7166
7167 static rtx
7168 ix86_copy_addr_to_reg (rtx addr)
7169 {
7170 rtx reg;
7171 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7172 {
7173 reg = copy_addr_to_reg (addr);
7174 REG_POINTER (reg) = 1;
7175 return reg;
7176 }
7177 else
7178 {
7179 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7180 reg = copy_to_mode_reg (DImode, addr);
7181 REG_POINTER (reg) = 1;
7182 return gen_rtx_SUBREG (SImode, reg, 0);
7183 }
7184 }
7185
7186 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
7187 operations when profitable. The code depends upon architecture, block size
7188 and alignment, but always has one of the following overall structures:
7189
7190 Aligned move sequence:
7191
7192 1) Prologue guard: Conditional that jumps up to epilogues for small
7193 blocks that can be handled by epilogue alone. This is faster
7194 but also needed for correctness, since the prologue assumes the block
7195 is larger than the desired alignment.
7196
7197 Optional dynamic check for size and libcall for large
7198 blocks is emitted here too, with -minline-stringops-dynamically.
7199
7200 2) Prologue: copy first few bytes in order to get destination
7201 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7202 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7203 copied. We emit either a jump tree on power of two sized
7204 blocks, or a byte loop.
7205
7206 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7207 with specified algorithm.
7208
7209 4) Epilogue: code copying tail of the block that is too small to be
7210 handled by main body (or up to size guarded by prologue guard).
7211
7212 Misaligned move sequence
7213
7214 1) misaligned move prologue/epilogue containing:
7215 a) Prologue handling small memory blocks and jumping to done_label
7216 (skipped if blocks are known to be large enough)
7217 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
7218 needed by single possibly misaligned move
7219 (skipped if alignment is not needed)
7220 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7221
7222 2) Zero size guard dispatching to done_label, if needed
7223
7224 3) Dispatch to a library call, if needed.
7225
7226 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7227 with specified algorithm. */
7228 bool
7229 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7230 rtx align_exp, rtx expected_align_exp,
7231 rtx expected_size_exp, rtx min_size_exp,
7232 rtx max_size_exp, rtx probable_max_size_exp,
7233 bool issetmem)
7234 {
7235 rtx destreg;
7236 rtx srcreg = NULL;
7237 rtx_code_label *label = NULL;
7238 rtx tmp;
7239 rtx_code_label *jump_around_label = NULL;
7240 HOST_WIDE_INT align = 1;
7241 unsigned HOST_WIDE_INT count = 0;
7242 HOST_WIDE_INT expected_size = -1;
7243 int size_needed = 0, epilogue_size_needed;
7244 int desired_align = 0, align_bytes = 0;
7245 enum stringop_alg alg;
7246 rtx promoted_val = NULL;
7247 rtx vec_promoted_val = NULL;
7248 bool force_loopy_epilogue = false;
7249 int dynamic_check;
7250 bool need_zero_guard = false;
7251 bool noalign;
7252 machine_mode move_mode = VOIDmode;
7253 machine_mode wider_mode;
7254 int unroll_factor = 1;
7255 /* TODO: Once value ranges are available, fill in proper data. */
7256 unsigned HOST_WIDE_INT min_size = 0;
7257 unsigned HOST_WIDE_INT max_size = -1;
7258 unsigned HOST_WIDE_INT probable_max_size = -1;
7259 bool misaligned_prologue_used = false;
7260 bool have_as;
7261
7262 if (CONST_INT_P (align_exp))
7263 align = INTVAL (align_exp);
7264 /* i386 can do misaligned access at a reasonably increased cost. */
7265 if (CONST_INT_P (expected_align_exp)
7266 && INTVAL (expected_align_exp) > align)
7267 align = INTVAL (expected_align_exp);
7268 /* ALIGN is the minimum of destination and source alignment, but we care here
7269 just about destination alignment. */
7270 else if (!issetmem
7271 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7272 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7273
7274 if (CONST_INT_P (count_exp))
7275 {
7276 min_size = max_size = probable_max_size = count = expected_size
7277 = INTVAL (count_exp);
7278 /* When COUNT is 0, there is nothing to do. */
7279 if (!count)
7280 return true;
7281 }
7282 else
7283 {
7284 if (min_size_exp)
7285 min_size = INTVAL (min_size_exp);
7286 if (max_size_exp)
7287 max_size = INTVAL (max_size_exp);
7288 if (probable_max_size_exp)
7289 probable_max_size = INTVAL (probable_max_size_exp);
7290 if (CONST_INT_P (expected_size_exp))
7291 expected_size = INTVAL (expected_size_exp);
7292 }
7293
7294 /* Make sure we don't need to care about overflow later on. */
7295 if (count > (HOST_WIDE_INT_1U << 30))
7296 return false;
7297
7298 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7299 if (!issetmem)
7300 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7301
7302 /* Step 0: Decide on preferred algorithm, desired alignment and
7303 size of chunks to be copied by main loop. */
7304 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7305 issetmem,
7306 issetmem && val_exp == const0_rtx, have_as,
7307 &dynamic_check, &noalign, false);
7308
7309 if (dump_file)
7310 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7311 stringop_alg_names[alg]);
7312
7313 if (alg == libcall)
7314 return false;
7315 gcc_assert (alg != no_stringop);
7316
7317 /* For now the vector version of memset is generated only for memory zeroing, as
7318 creating the promoted vector value is very cheap in this case. */
7319 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7320 alg = unrolled_loop;
7321
7322 if (!count)
7323 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7324 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7325 if (!issetmem)
7326 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7327
7328 unroll_factor = 1;
7329 move_mode = word_mode;
7330 switch (alg)
7331 {
7332 case libcall:
7333 case no_stringop:
7334 case last_alg:
7335 gcc_unreachable ();
7336 case loop_1_byte:
7337 need_zero_guard = true;
7338 move_mode = QImode;
7339 break;
7340 case loop:
7341 need_zero_guard = true;
7342 break;
7343 case unrolled_loop:
7344 need_zero_guard = true;
7345 unroll_factor = (TARGET_64BIT ? 4 : 2);
7346 break;
7347 case vector_loop:
7348 need_zero_guard = true;
7349 unroll_factor = 4;
7350 /* Find the widest supported mode. */
7351 move_mode = word_mode;
7352 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7353 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7354 move_mode = wider_mode;
7355
7356 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
7357 move_mode = TImode;
7358
7359 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7360 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7361 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7362 {
7363 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7364 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7365 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7366 move_mode = word_mode;
7367 }
7368 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7369 break;
7370 case rep_prefix_8_byte:
7371 move_mode = DImode;
7372 break;
7373 case rep_prefix_4_byte:
7374 move_mode = SImode;
7375 break;
7376 case rep_prefix_1_byte:
7377 move_mode = QImode;
7378 break;
7379 }
7380 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7381 epilogue_size_needed = size_needed;
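/* E.g. the unrolled_loop algorithm on a 64-bit target uses DImode with
   an unroll factor of 4, giving 32-byte chunks; vector_loop with AVX
   enabled can reach 32-byte vector moves unrolled 4 times, i.e.
   128-byte chunks.  */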
7382
7383 /* If we are going to call any library calls conditionally, make sure any
7384 pending stack adjustment happen before the first conditional branch,
7385 otherwise they will be emitted before the library call only and won't
7386 happen from the other branches. */
7387 if (dynamic_check != -1)
7388 do_pending_stack_adjust ();
7389
7390 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7391 if (!TARGET_ALIGN_STRINGOPS || noalign)
7392 align = desired_align;
7393
7394 /* Step 1: Prologue guard. */
7395
7396 /* Alignment code needs count to be in register. */
7397 if (CONST_INT_P (count_exp) && desired_align > align)
7398 {
7399 if (INTVAL (count_exp) > desired_align
7400 && INTVAL (count_exp) > size_needed)
7401 {
7402 align_bytes
7403 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7404 if (align_bytes <= 0)
7405 align_bytes = 0;
7406 else
7407 align_bytes = desired_align - align_bytes;
7408 }
7409 if (align_bytes == 0)
7410 count_exp = force_reg (counter_mode (count_exp), count_exp);
7411 }
7412 gcc_assert (desired_align >= 1 && align >= 1);
7413
7414 /* Misaligned move sequences handle both prologue and epilogue at once.
7415 Default code generation results in smaller code for large alignments
7416 and also avoids redundant work when sizes are known precisely. */
7417 misaligned_prologue_used
7418 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7419 && MAX (desired_align, epilogue_size_needed) <= 32
7420 && desired_align <= epilogue_size_needed
7421 && ((desired_align > align && !align_bytes)
7422 || (!count && epilogue_size_needed > 1)));
7423
7424 /* Do the cheap promotion to allow better CSE across the
7425 main loop and epilogue (i.e. one load of the big constant in
7426 front of all the code).
7427 For now the misaligned move sequences do not have a fast path
7428 without broadcasting. */
7429 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7430 {
7431 if (alg == vector_loop)
7432 {
7433 gcc_assert (val_exp == const0_rtx);
7434 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7435 promoted_val = promote_duplicated_reg_to_size (val_exp,
7436 GET_MODE_SIZE (word_mode),
7437 desired_align, align);
7438 }
7439 else
7440 {
7441 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7442 desired_align, align);
7443 }
7444 }
7445 /* Misaligned move sequences handle both prologues and epilogues at once.
7446 Default code generation results in smaller code for large alignments and
7447 also avoids redundant work when sizes are known precisely. */
7448 if (misaligned_prologue_used)
7449 {
7450 /* The misaligned move prologue handles small blocks by itself. */
7451 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7452 (dst, src, &destreg, &srcreg,
7453 move_mode, promoted_val, vec_promoted_val,
7454 &count_exp,
7455 &jump_around_label,
7456 desired_align < align
7457 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7458 desired_align, align, &min_size, dynamic_check, issetmem);
7459 if (!issetmem)
7460 src = change_address (src, BLKmode, srcreg);
7461 dst = change_address (dst, BLKmode, destreg);
7462 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7463 epilogue_size_needed = 0;
7464 if (need_zero_guard
7465 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7466 {
7467 /* It is possible that we copied enough so the main loop will not
7468 execute. */
7469 gcc_assert (size_needed > 1);
7470 if (jump_around_label == NULL_RTX)
7471 jump_around_label = gen_label_rtx ();
7472 emit_cmp_and_jump_insns (count_exp,
7473 GEN_INT (size_needed),
7474 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7475 if (expected_size == -1
7476 || expected_size < (desired_align - align) / 2 + size_needed)
7477 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7478 else
7479 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7480 }
7481 }
7482 /* Ensure that alignment prologue won't copy past end of block. */
7483 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7484 {
7485 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7486 /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
7487 Make sure EPILOGUE_SIZE_NEEDED is a power of 2. */
7488 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
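/* For instance, SIZE_NEEDED == 32 with matching alignment gives 31
   above, which is rounded up here to 32.  */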
7489
7490 /* To improve performance of small blocks, we jump around the VAL
7491 promoting code. This means that if the promoted VAL is not constant,
7492 we might not use it in the epilogue and have to use the byte
7493 loop variant. */
7494 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7495 force_loopy_epilogue = true;
7496 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7497 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7498 {
7499 /* If main algorithm works on QImode, no epilogue is needed.
7500 For small sizes just don't align anything. */
7501 if (size_needed == 1)
7502 desired_align = align;
7503 else
7504 goto epilogue;
7505 }
7506 else if (!count
7507 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7508 {
7509 label = gen_label_rtx ();
7510 emit_cmp_and_jump_insns (count_exp,
7511 GEN_INT (epilogue_size_needed),
7512 LTU, 0, counter_mode (count_exp), 1, label);
7513 if (expected_size == -1 || expected_size < epilogue_size_needed)
7514 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7515 else
7516 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7517 }
7518 }
7519
7520 /* Emit code to decide at runtime whether a library call or inline code
7521 should be used. */
7522 if (dynamic_check != -1)
7523 {
7524 if (!issetmem && CONST_INT_P (count_exp))
7525 {
7526 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7527 {
7528 emit_block_copy_via_libcall (dst, src, count_exp);
7529 count_exp = const0_rtx;
7530 goto epilogue;
7531 }
7532 }
7533 else
7534 {
7535 rtx_code_label *hot_label = gen_label_rtx ();
7536 if (jump_around_label == NULL_RTX)
7537 jump_around_label = gen_label_rtx ();
7538 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7539 LEU, 0, counter_mode (count_exp),
7540 1, hot_label);
7541 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7542 if (issetmem)
7543 set_storage_via_libcall (dst, count_exp, val_exp);
7544 else
7545 emit_block_copy_via_libcall (dst, src, count_exp);
7546 emit_jump (jump_around_label);
7547 emit_label (hot_label);
7548 }
7549 }
7550
7551 /* Step 2: Alignment prologue. */
7552 /* Do the expensive promotion once we branched off the small blocks. */
7553 if (issetmem && !promoted_val)
7554 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7555 desired_align, align);
7556
7557 if (desired_align > align && !misaligned_prologue_used)
7558 {
7559 if (align_bytes == 0)
7560 {
7561 /* Except for the first move in the prologue, we no longer know
7562 the constant offset in the aliasing info. It doesn't seem worth
7563 the pain to maintain it for the first move, so throw away
7564 the info early. */
7565 dst = change_address (dst, BLKmode, destreg);
7566 if (!issetmem)
7567 src = change_address (src, BLKmode, srcreg);
7568 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7569 promoted_val, vec_promoted_val,
7570 count_exp, align, desired_align,
7571 issetmem);
7572 /* At most desired_align - align bytes are copied. */
7573 if (min_size < (unsigned)(desired_align - align))
7574 min_size = 0;
7575 else
7576 min_size -= desired_align - align;
7577 }
7578 else
7579 {
7580 /* If we know how many bytes need to be stored before dst is
7581 sufficiently aligned, maintain aliasing info accurately. */
7582 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7583 srcreg,
7584 promoted_val,
7585 vec_promoted_val,
7586 desired_align,
7587 align_bytes,
7588 issetmem);
7589
7590 count_exp = plus_constant (counter_mode (count_exp),
7591 count_exp, -align_bytes);
7592 count -= align_bytes;
7593 min_size -= align_bytes;
7594 max_size -= align_bytes;
7595 }
7596 if (need_zero_guard
7597 && min_size < (unsigned HOST_WIDE_INT) size_needed
7598 && (count < (unsigned HOST_WIDE_INT) size_needed
7599 || (align_bytes == 0
7600 && count < ((unsigned HOST_WIDE_INT) size_needed
7601 + desired_align - align))))
7602 {
7603 /* It is possible that we copied enough so the main loop will not
7604 execute. */
7605 gcc_assert (size_needed > 1);
7606 if (label == NULL_RTX)
7607 label = gen_label_rtx ();
7608 emit_cmp_and_jump_insns (count_exp,
7609 GEN_INT (size_needed),
7610 LTU, 0, counter_mode (count_exp), 1, label);
7611 if (expected_size == -1
7612 || expected_size < (desired_align - align) / 2 + size_needed)
7613 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7614 else
7615 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7616 }
7617 }
7618 if (label && size_needed == 1)
7619 {
7620 emit_label (label);
7621 LABEL_NUSES (label) = 1;
7622 label = NULL;
7623 epilogue_size_needed = 1;
7624 if (issetmem)
7625 promoted_val = val_exp;
7626 }
7627 else if (label == NULL_RTX && !misaligned_prologue_used)
7628 epilogue_size_needed = size_needed;
7629
7630 /* Step 3: Main loop. */
7631
7632 switch (alg)
7633 {
7634 case libcall:
7635 case no_stringop:
7636 case last_alg:
7637 gcc_unreachable ();
7638 case loop_1_byte:
7639 case loop:
7640 case unrolled_loop:
7641 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7642 count_exp, move_mode, unroll_factor,
7643 expected_size, issetmem);
7644 break;
7645 case vector_loop:
7646 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7647 vec_promoted_val, count_exp, move_mode,
7648 unroll_factor, expected_size, issetmem);
7649 break;
7650 case rep_prefix_8_byte:
7651 case rep_prefix_4_byte:
7652 case rep_prefix_1_byte:
7653 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7654 val_exp, count_exp, move_mode, issetmem);
7655 break;
7656 }
7657 /* Properly adjust the offset of the src and dest memory for aliasing. */
7658 if (CONST_INT_P (count_exp))
7659 {
7660 if (!issetmem)
7661 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7662 (count / size_needed) * size_needed);
7663 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7664 (count / size_needed) * size_needed);
7665 }
7666 else
7667 {
7668 if (!issetmem)
7669 src = change_address (src, BLKmode, srcreg);
7670 dst = change_address (dst, BLKmode, destreg);
7671 }
7672
7673 /* Step 4: Epilogue to copy the remaining bytes. */
7674 epilogue:
7675 if (label)
7676 {
7677 /* When the main loop is done, COUNT_EXP might hold the original count,
7678 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
7679 The epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
7680 bytes. Compensate if needed. */
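/* Worked example (illustrative numbers, not from the original sources):
   if size_needed is 32 and the run-time count turns out to be 100, the
   main loop handles 96 bytes, and when the masking below is needed it
   rewrites count_exp to 100 & 31 == 4, which is exactly what the
   epilogue still has to copy.  */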
7681
7682 if (size_needed < epilogue_size_needed)
7683 {
7684 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7685 GEN_INT (size_needed - 1), count_exp, 1,
7686 OPTAB_DIRECT);
7687 if (tmp != count_exp)
7688 emit_move_insn (count_exp, tmp);
7689 }
7690 emit_label (label);
7691 LABEL_NUSES (label) = 1;
7692 }
7693
7694 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7695 {
7696 if (force_loopy_epilogue)
7697 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7698 epilogue_size_needed);
7699 else
7700 {
7701 if (issetmem)
7702 expand_setmem_epilogue (dst, destreg, promoted_val,
7703 vec_promoted_val, count_exp,
7704 epilogue_size_needed);
7705 else
7706 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7707 epilogue_size_needed);
7708 }
7709 }
7710 if (jump_around_label)
7711 emit_label (jump_around_label);
7712 return true;
7713 }
7714
7715 /* Expand cmpstrn or memcmp. */
7716
7717 bool
7718 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
7719 rtx length, rtx align, bool is_cmpstrn)
7720 {
7721 /* Expand strncmp and memcmp only with -minline-all-stringops since
7722 "repz cmpsb" can be much slower than strncmp and memcmp functions
7723 implemented with vector instructions, see
7724
7725 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
7726 */
7727 if (!TARGET_INLINE_ALL_STRINGOPS)
7728 return false;
7729
7730 /* Can't use this if the user has appropriated ecx, esi or edi. */
7731 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
7732 return false;
7733
7734 if (is_cmpstrn)
7735 {
7736 /* For strncmp, length is the maximum length, which can be larger
7737 than actual string lengths. We can expand the cmpstrn pattern
7738 to "repz cmpsb" only if one of the strings is a constant so
7739 that expand_builtin_strncmp() can write the length argument to
7740 be the minimum of the const string length and the actual length
7741 argument. Otherwise, "repz cmpsb" may run past the terminating zero byte. */
7742 tree t1 = MEM_EXPR (src1);
7743 tree t2 = MEM_EXPR (src2);
7744 if (!((t1 && TREE_CODE (t1) == MEM_REF
7745 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
7746 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
7747 == STRING_CST))
7748 || (t2 && TREE_CODE (t2) == MEM_REF
7749 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
7750 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
7751 == STRING_CST))))
7752 return false;
7753 }
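/* Illustrative example (hedged, not from the original sources): for a
   call such as strncmp (p, "hello", 100), expand_builtin_strncmp () can
   shrink the length bound so that it does not exceed the constant
   string including its terminating NUL, and the emitted "repz cmpsb"
   therefore never scans past the end of either string.  */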
7754
7755 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
7756 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
7757 if (addr1 != XEXP (src1, 0))
7758 src1 = replace_equiv_address_nv (src1, addr1);
7759 if (addr2 != XEXP (src2, 0))
7760 src2 = replace_equiv_address_nv (src2, addr2);
7761
7762 /* NB: Make a copy of the data length so that the cmpstrnqi patterns
7763 do not change the original data length. */
7764 length = ix86_zero_extend_to_Pmode (length);
7765 rtx lengthreg = gen_reg_rtx (Pmode);
7766 emit_move_insn (lengthreg, length);
7767
7768 /* If we are testing strict equality, we can use known alignment to
7769 good advantage. This may be possible with combine, particularly
7770 once cc0 is dead. */
7771 if (CONST_INT_P (length))
7772 {
7773 if (length == const0_rtx)
7774 {
7775 emit_move_insn (result, const0_rtx);
7776 return true;
7777 }
7778 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
7779 src1, src2));
7780 }
7781 else
7782 {
7783 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
7784 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
7785 src1, src2));
7786 }
7787
7788 rtx out = gen_lowpart (QImode, result);
7789 emit_insn (gen_cmpintqi (out));
7790 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
7791
7792 return true;
7793 }
7794
7795 /* Expand the appropriate insns for doing strlen if not just doing
7796 repnz; scasb
7797
7798 out = result, initialized with the start address
7799 align_rtx = alignment of the address.
7800 scratch = scratch register, initialized with the start address when
7801 not aligned, otherwise undefined
7802
7803 This is just the body. It needs the initializations mentioned above and
7804 some address computation at the end. These things are done in i386.md. */
7805
7806 static void
7807 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7808 {
7809 int align;
7810 rtx tmp;
7811 rtx_code_label *align_2_label = NULL;
7812 rtx_code_label *align_3_label = NULL;
7813 rtx_code_label *align_4_label = gen_label_rtx ();
7814 rtx_code_label *end_0_label = gen_label_rtx ();
7815 rtx mem;
7816 rtx tmpreg = gen_reg_rtx (SImode);
7817 rtx scratch = gen_reg_rtx (SImode);
7818 rtx cmp;
7819
7820 align = 0;
7821 if (CONST_INT_P (align_rtx))
7822 align = INTVAL (align_rtx);
7823
7824 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7825
7826 /* Is there a known alignment and is it less than 4? */
7827 if (align < 4)
7828 {
7829 rtx scratch1 = gen_reg_rtx (Pmode);
7830 emit_move_insn (scratch1, out);
7831 /* Is there a known alignment and is it not 2? */
7832 if (align != 2)
7833 {
7834 align_3_label = gen_label_rtx (); /* Label when aligned to 3 bytes. */
7835 align_2_label = gen_label_rtx (); /* Label when aligned to 2 bytes. */
7836
7837 /* Leave just the 3 lower bits. */
7838 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7839 NULL_RTX, 0, OPTAB_WIDEN);
7840
7841 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7842 Pmode, 1, align_4_label);
7843 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7844 Pmode, 1, align_2_label);
7845 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7846 Pmode, 1, align_3_label);
7847 }
7848 else
7849 {
7850 /* Since the alignment is 2, we have to check 2 or 0 bytes;
7851 check whether it is aligned to 4 bytes. */
7852
7853 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7854 NULL_RTX, 0, OPTAB_WIDEN);
7855
7856 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7857 Pmode, 1, align_4_label);
7858 }
7859
7860 mem = change_address (src, QImode, out);
7861
7862 /* Now compare the bytes. */
7863
7864 /* Compare the first n unaligned bytes one byte at a time. */
7865 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7866 QImode, 1, end_0_label);
7867
7868 /* Increment the address. */
7869 emit_insn (gen_add2_insn (out, const1_rtx));
7870
7871 /* Not needed with an alignment of 2 */
7872 if (align != 2)
7873 {
7874 emit_label (align_2_label);
7875
7876 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7877 end_0_label);
7878
7879 emit_insn (gen_add2_insn (out, const1_rtx));
7880
7881 emit_label (align_3_label);
7882 }
7883
7884 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7885 end_0_label);
7886
7887 emit_insn (gen_add2_insn (out, const1_rtx));
7888 }
7889
7890 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
7891 align this loop: doing so only bloats the program and does not speed
7892 it up. */
7893 emit_label (align_4_label);
7894
7895 mem = change_address (src, SImode, out);
7896 emit_move_insn (scratch, mem);
7897 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7898
7899 /* This formula yields a nonzero result iff one of the bytes is zero.
7900 This saves three branches inside the loop and many cycles. */
7901
7902 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7903 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7904 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7905 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7906 gen_int_mode (0x80808080, SImode)));
7907 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7908 align_4_label);
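/* Worked example of the zero-byte test above (illustrative, not from the
   original sources).  For a 32-bit word x, the emitted sequence computes

       (x - 0x01010101) & ~x & 0x80808080

   For a byte value b, both (b - 1) and ~b have bit 7 set only when
   b == 0, so the 0x80 bit of a byte survives exactly when that byte is
   zero; a borrow chain can only start at a zero byte, so the lowest
   surviving 0x80 bit always marks the first zero byte.  E.g. for
   x == 0x44003322 the result is 0x00800000 and the loop above falls
   through; for x == 0x44332211 the result is 0 and the loop iterates.  */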
7909
7910 if (TARGET_CMOVE)
7911 {
7912 rtx reg = gen_reg_rtx (SImode);
7913 rtx reg2 = gen_reg_rtx (Pmode);
7914 emit_move_insn (reg, tmpreg);
7915 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7916
7917 /* If zero is not in the first two bytes, move two bytes forward. */
7918 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7919 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7920 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7921 emit_insn (gen_rtx_SET (tmpreg,
7922 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7923 reg,
7924 tmpreg)));
7925 /* Emit lea manually to avoid clobbering of flags. */
7926 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
7927
7928 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7929 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7930 emit_insn (gen_rtx_SET (out,
7931 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7932 reg2,
7933 out)));
7934 }
7935 else
7936 {
7937 rtx_code_label *end_2_label = gen_label_rtx ();
7938 /* Is zero in the first two bytes? */
7939
7940 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7941 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7942 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7943 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7944 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7945 pc_rtx);
7946 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7947 JUMP_LABEL (tmp) = end_2_label;
7948
7949 /* Not in the first two. Move two bytes forward. */
7950 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7951 emit_insn (gen_add2_insn (out, const2_rtx));
7952
7953 emit_label (end_2_label);
7954
7955 }
7956
7957 /* Avoid branch in fixing the byte. */
7958 tmpreg = gen_lowpart (QImode, tmpreg);
7959 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7960 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7961 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7962 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7963
7964 emit_label (end_0_label);
7965 }
7966
7967 /* Expand strlen. */
7968
7969 bool
7970 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7971 {
7972 if (TARGET_UNROLL_STRLEN
7973 && TARGET_INLINE_ALL_STRINGOPS
7974 && eoschar == const0_rtx
7975 && optimize > 1)
7976 {
7977 /* The generic case of the strlen expander is long. Avoid its
7978 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
7979 rtx addr = force_reg (Pmode, XEXP (src, 0));
7980 /* It seems that some optimizers do not combine a call like
7981 foo (strlen (bar), strlen (bar));
7982 when the move and the subtraction are done here, even though they do
7983 calculate the length just once when these instructions are emitted
7984 inside output_strlen_unroll (). But since &bar[strlen (bar)] is
7985 often used, and this way one fewer register is live for the lifetime
7986 of output_strlen_unroll (), this is better. */
7987
7988 emit_move_insn (out, addr);
7989
7990 ix86_expand_strlensi_unroll_1 (out, src, align);
7991
7992 /* strlensi_unroll_1 returns the address of the zero at the end of
7993 the string, like memchr(), so compute the length by subtracting
7994 the start address. */
7995 emit_insn (gen_sub2_insn (out, addr));
7996 return true;
7997 }
7998 else
7999 return false;
8000 }
8001
8002 /* For a given symbol (function), construct code to compute the address of
8003 its PLT entry in the large x86-64 PIC model. */
8004
8005 static rtx
8006 construct_plt_address (rtx symbol)
8007 {
8008 rtx tmp, unspec;
8009
8010 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
8011 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
8012 gcc_assert (Pmode == DImode);
8013
8014 tmp = gen_reg_rtx (Pmode);
8015 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
8016
8017 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
8018 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
8019 return tmp;
8020 }
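/* Illustrative note (hedged, not from the original sources): for a call
   to an external function foo in the large PIC model, the code built by
   construct_plt_address corresponds roughly to

       movabs $foo@PLTOFF, %tmp
       add    %pic_base, %tmp
       call   *%tmp

   where %tmp and %pic_base stand for whatever registers end up holding
   the temporary and the PIC base.  */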
8021
8022 /* Additional registers that are clobbered by SYSV calls; these are call-saved in the MS ABI. */
8023
8024 static int const x86_64_ms_sysv_extra_clobbered_registers
8025 [NUM_X86_64_MS_CLOBBERED_REGS] =
8026 {
8027 SI_REG, DI_REG,
8028 XMM6_REG, XMM7_REG,
8029 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
8030 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
8031 };
8032
8033 rtx_insn *
8034 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
8035 rtx callarg2,
8036 rtx pop, bool sibcall)
8037 {
8038 rtx vec[3];
8039 rtx use = NULL, call;
8040 unsigned int vec_len = 0;
8041 tree fndecl;
8042
8043 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8044 {
8045 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
8046 if (fndecl
8047 && (lookup_attribute ("interrupt",
8048 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
8049 error ("interrupt service routine cannot be called directly");
8050 }
8051 else
8052 fndecl = NULL_TREE;
8053
8054 if (pop == const0_rtx)
8055 pop = NULL;
8056 gcc_assert (!TARGET_64BIT || !pop);
8057
8058 if (TARGET_MACHO && !TARGET_64BIT)
8059 {
8060 #if TARGET_MACHO
8061 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
8062 fnaddr = machopic_indirect_call_target (fnaddr);
8063 #endif
8064 }
8065 else
8066 {
8067 /* Static functions and indirect calls don't need the pic register. Also,
8068 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
8069 it an indirect call. */
8070 rtx addr = XEXP (fnaddr, 0);
8071 if (flag_pic
8072 && GET_CODE (addr) == SYMBOL_REF
8073 && !SYMBOL_REF_LOCAL_P (addr))
8074 {
8075 if (flag_plt
8076 && (SYMBOL_REF_DECL (addr) == NULL_TREE
8077 || !lookup_attribute ("noplt",
8078 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
8079 {
8080 if (!TARGET_64BIT
8081 || (ix86_cmodel == CM_LARGE_PIC
8082 && DEFAULT_ABI != MS_ABI))
8083 {
8084 use_reg (&use, gen_rtx_REG (Pmode,
8085 REAL_PIC_OFFSET_TABLE_REGNUM));
8086 if (ix86_use_pseudo_pic_reg ())
8087 emit_move_insn (gen_rtx_REG (Pmode,
8088 REAL_PIC_OFFSET_TABLE_REGNUM),
8089 pic_offset_table_rtx);
8090 }
8091 }
8092 else if (!TARGET_PECOFF && !TARGET_MACHO)
8093 {
8094 if (TARGET_64BIT
8095 && ix86_cmodel == CM_LARGE_PIC
8096 && DEFAULT_ABI != MS_ABI)
8097 {
8098 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8099 UNSPEC_GOT);
8100 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8101 fnaddr = force_reg (Pmode, fnaddr);
8102 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
8103 }
8104 else if (TARGET_64BIT)
8105 {
8106 fnaddr = gen_rtx_UNSPEC (Pmode,
8107 gen_rtvec (1, addr),
8108 UNSPEC_GOTPCREL);
8109 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8110 }
8111 else
8112 {
8113 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8114 UNSPEC_GOT);
8115 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8116 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8117 fnaddr);
8118 }
8119 fnaddr = gen_const_mem (Pmode, fnaddr);
8120 /* Pmode may not be the same as word_mode for x32, which
8121 doesn't support indirect branch via 32-bit memory slot.
8122 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8123 indirect branch via x32 GOT slot is OK. */
8124 if (GET_MODE (fnaddr) != word_mode)
8125 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8126 fnaddr = gen_rtx_MEM (QImode, fnaddr);
8127 }
8128 }
8129 }
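/* Illustrative note (hedged, not from the original sources): on x86-64
   with -fno-plt or a "noplt" attribute, the branch above turns a call to
   an external function foo into an indirect call through its GOT entry,
   i.e. roughly

       call *foo@GOTPCREL(%rip)

   instead of the usual call through the PLT.  */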
8130
8131 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8132 parameters passed in vector registers. */
8133 if (TARGET_64BIT
8134 && (INTVAL (callarg2) > 0
8135 || (INTVAL (callarg2) == 0
8136 && (TARGET_SSE || !flag_skip_rax_setup))))
8137 {
8138 rtx al = gen_rtx_REG (QImode, AX_REG);
8139 emit_move_insn (al, callarg2);
8140 use_reg (&use, al);
8141 }
8142
8143 if (ix86_cmodel == CM_LARGE_PIC
8144 && !TARGET_PECOFF
8145 && MEM_P (fnaddr)
8146 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8147 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8148 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8149 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8150 branch via x32 GOT slot is OK. */
8151 else if (!(TARGET_X32
8152 && MEM_P (fnaddr)
8153 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8154 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8155 && (sibcall
8156 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8157 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8158 {
8159 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8160 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8161 }
8162
8163 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8164
8165 if (retval)
8166 call = gen_rtx_SET (retval, call);
8167 vec[vec_len++] = call;
8168
8169 if (pop)
8170 {
8171 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8172 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8173 vec[vec_len++] = pop;
8174 }
8175
8176 if (cfun->machine->no_caller_saved_registers
8177 && (!fndecl
8178 || (!TREE_THIS_VOLATILE (fndecl)
8179 && !lookup_attribute ("no_caller_saved_registers",
8180 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8181 {
8182 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8183 bool is_64bit_ms_abi = (TARGET_64BIT
8184 && ix86_function_abi (fndecl) == MS_ABI);
8185 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8186
8187 /* Since there are no caller-saved registers, add as clobbers all
8188 registers that are clobbered by a call which returns. */
8189 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8190 if (!fixed_regs[i]
8191 && (ix86_call_used_regs[i] == 1
8192 || (ix86_call_used_regs[i] & c_mask))
8193 && !STACK_REGNO_P (i)
8194 && !MMX_REGNO_P (i))
8195 clobber_reg (&use,
8196 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8197 }
8198 else if (TARGET_64BIT_MS_ABI
8199 && (!callarg2 || INTVAL (callarg2) != -2))
8200 {
8201 unsigned i;
8202
8203 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8204 {
8205 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8206 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8207
8208 clobber_reg (&use, gen_rtx_REG (mode, regno));
8209 }
8210
8211 /* Set here, but it may get cleared later. */
8212 if (TARGET_CALL_MS2SYSV_XLOGUES)
8213 {
8214 if (!TARGET_SSE)
8215 ;
8216
8217 /* Don't break hot-patched functions. */
8218 else if (ix86_function_ms_hook_prologue (current_function_decl))
8219 ;
8220
8221 /* TODO: Cases not yet examined. */
8222 else if (flag_split_stack)
8223 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8224
8225 else
8226 {
8227 gcc_assert (!reload_completed);
8228 cfun->machine->call_ms2sysv = true;
8229 }
8230 }
8231 }
8232
8233 if (vec_len > 1)
8234 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8235 rtx_insn *call_insn = emit_call_insn (call);
8236 if (use)
8237 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8238
8239 return call_insn;
8240 }
8241
8242 /* Split a simple return popping POPC bytes from the stack into an indirect
8243 branch with a stack adjustment. */
8244
8245 void
8246 ix86_split_simple_return_pop_internal (rtx popc)
8247 {
8248 struct machine_function *m = cfun->machine;
8249 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8250 rtx_insn *insn;
8251
8252 /* There is no "pascal" calling convention in any 64bit ABI. */
8253 gcc_assert (!TARGET_64BIT);
8254
8255 insn = emit_insn (gen_pop (ecx));
8256 m->fs.cfa_offset -= UNITS_PER_WORD;
8257 m->fs.sp_offset -= UNITS_PER_WORD;
8258
8259 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8260 x = gen_rtx_SET (stack_pointer_rtx, x);
8261 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8262 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8263 RTX_FRAME_RELATED_P (insn) = 1;
8264
8265 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8266 x = gen_rtx_SET (stack_pointer_rtx, x);
8267 insn = emit_insn (x);
8268 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8269 RTX_FRAME_RELATED_P (insn) = 1;
8270
8271 /* Now return address is in ECX. */
8272 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8273 }
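/* Illustrative note (hedged, not from the original sources): the splitter
   above replaces a 32-bit "ret $N" with a sequence along the lines of

       pop  %ecx          -- return address into ECX
       add  $N, %esp      -- drop the N bytes of arguments
       jmp  *%ecx         -- return via an indirect branch

   hence the assert on !TARGET_64BIT: callee-pop conventions only exist
   in the 32-bit ABIs.  */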
8274
8275 /* Errors in the source file can cause expand_expr to return const0_rtx
8276 where we expect a vector. To avoid crashing, use one of the vector
8277 clear instructions. */
8278
8279 static rtx
8280 safe_vector_operand (rtx x, machine_mode mode)
8281 {
8282 if (x == const0_rtx)
8283 x = CONST0_RTX (mode);
8284 return x;
8285 }
8286
8287 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8288
8289 static rtx
8290 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8291 {
8292 rtx pat;
8293 tree arg0 = CALL_EXPR_ARG (exp, 0);
8294 tree arg1 = CALL_EXPR_ARG (exp, 1);
8295 rtx op0 = expand_normal (arg0);
8296 rtx op1 = expand_normal (arg1);
8297 machine_mode tmode = insn_data[icode].operand[0].mode;
8298 machine_mode mode0 = insn_data[icode].operand[1].mode;
8299 machine_mode mode1 = insn_data[icode].operand[2].mode;
8300
8301 if (VECTOR_MODE_P (mode0))
8302 op0 = safe_vector_operand (op0, mode0);
8303 if (VECTOR_MODE_P (mode1))
8304 op1 = safe_vector_operand (op1, mode1);
8305
8306 if (optimize || !target
8307 || GET_MODE (target) != tmode
8308 || !insn_data[icode].operand[0].predicate (target, tmode))
8309 target = gen_reg_rtx (tmode);
8310
8311 if (GET_MODE (op1) == SImode && mode1 == TImode)
8312 {
8313 rtx x = gen_reg_rtx (V4SImode);
8314 emit_insn (gen_sse2_loadd (x, op1));
8315 op1 = gen_lowpart (TImode, x);
8316 }
8317
8318 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8319 op0 = copy_to_mode_reg (mode0, op0);
8320 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8321 op1 = copy_to_mode_reg (mode1, op1);
8322
8323 pat = GEN_FCN (icode) (target, op0, op1);
8324 if (! pat)
8325 return 0;
8326
8327 emit_insn (pat);
8328
8329 return target;
8330 }
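/* Usage sketch (hedged, not from the original sources): a two-operand
   vector builtin such as __builtin_ia32_paddd128 reaches this helper
   with the icode of the matching add pattern; the two call arguments
   are expanded, forced into registers whenever the pattern's predicates
   reject them, and a single insn of the form

       (set (reg:V4SI target) (plus:V4SI (reg:V4SI op0) (reg:V4SI op1)))

   is emitted.  The particular builtin and pattern named here are only
   an example of how the bdesc tables route into this routine.  */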
8331
8332 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8333
8334 static rtx
8335 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8336 enum ix86_builtin_func_type m_type,
8337 enum rtx_code sub_code)
8338 {
8339 rtx pat;
8340 unsigned int i, nargs;
8341 bool comparison_p = false;
8342 bool tf_p = false;
8343 bool last_arg_constant = false;
8344 int num_memory = 0;
8345 rtx xops[4];
8346
8347 machine_mode tmode = insn_data[icode].operand[0].mode;
8348
8349 switch (m_type)
8350 {
8351 case MULTI_ARG_4_DF2_DI_I:
8352 case MULTI_ARG_4_DF2_DI_I1:
8353 case MULTI_ARG_4_SF2_SI_I:
8354 case MULTI_ARG_4_SF2_SI_I1:
8355 nargs = 4;
8356 last_arg_constant = true;
8357 break;
8358
8359 case MULTI_ARG_3_SF:
8360 case MULTI_ARG_3_DF:
8361 case MULTI_ARG_3_SF2:
8362 case MULTI_ARG_3_DF2:
8363 case MULTI_ARG_3_DI:
8364 case MULTI_ARG_3_SI:
8365 case MULTI_ARG_3_SI_DI:
8366 case MULTI_ARG_3_HI:
8367 case MULTI_ARG_3_HI_SI:
8368 case MULTI_ARG_3_QI:
8369 case MULTI_ARG_3_DI2:
8370 case MULTI_ARG_3_SI2:
8371 case MULTI_ARG_3_HI2:
8372 case MULTI_ARG_3_QI2:
8373 nargs = 3;
8374 break;
8375
8376 case MULTI_ARG_2_SF:
8377 case MULTI_ARG_2_DF:
8378 case MULTI_ARG_2_DI:
8379 case MULTI_ARG_2_SI:
8380 case MULTI_ARG_2_HI:
8381 case MULTI_ARG_2_QI:
8382 nargs = 2;
8383 break;
8384
8385 case MULTI_ARG_2_DI_IMM:
8386 case MULTI_ARG_2_SI_IMM:
8387 case MULTI_ARG_2_HI_IMM:
8388 case MULTI_ARG_2_QI_IMM:
8389 nargs = 2;
8390 last_arg_constant = true;
8391 break;
8392
8393 case MULTI_ARG_1_SF:
8394 case MULTI_ARG_1_DF:
8395 case MULTI_ARG_1_SF2:
8396 case MULTI_ARG_1_DF2:
8397 case MULTI_ARG_1_DI:
8398 case MULTI_ARG_1_SI:
8399 case MULTI_ARG_1_HI:
8400 case MULTI_ARG_1_QI:
8401 case MULTI_ARG_1_SI_DI:
8402 case MULTI_ARG_1_HI_DI:
8403 case MULTI_ARG_1_HI_SI:
8404 case MULTI_ARG_1_QI_DI:
8405 case MULTI_ARG_1_QI_SI:
8406 case MULTI_ARG_1_QI_HI:
8407 nargs = 1;
8408 break;
8409
8410 case MULTI_ARG_2_DI_CMP:
8411 case MULTI_ARG_2_SI_CMP:
8412 case MULTI_ARG_2_HI_CMP:
8413 case MULTI_ARG_2_QI_CMP:
8414 nargs = 2;
8415 comparison_p = true;
8416 break;
8417
8418 case MULTI_ARG_2_SF_TF:
8419 case MULTI_ARG_2_DF_TF:
8420 case MULTI_ARG_2_DI_TF:
8421 case MULTI_ARG_2_SI_TF:
8422 case MULTI_ARG_2_HI_TF:
8423 case MULTI_ARG_2_QI_TF:
8424 nargs = 2;
8425 tf_p = true;
8426 break;
8427
8428 default:
8429 gcc_unreachable ();
8430 }
8431
8432 if (optimize || !target
8433 || GET_MODE (target) != tmode
8434 || !insn_data[icode].operand[0].predicate (target, tmode))
8435 target = gen_reg_rtx (tmode);
8436 else if (memory_operand (target, tmode))
8437 num_memory++;
8438
8439 gcc_assert (nargs <= ARRAY_SIZE (xops));
8440
8441 for (i = 0; i < nargs; i++)
8442 {
8443 tree arg = CALL_EXPR_ARG (exp, i);
8444 rtx op = expand_normal (arg);
8445 int adjust = (comparison_p) ? 1 : 0;
8446 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8447
8448 if (last_arg_constant && i == nargs - 1)
8449 {
8450 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8451 {
8452 enum insn_code new_icode = icode;
8453 switch (icode)
8454 {
8455 case CODE_FOR_xop_vpermil2v2df3:
8456 case CODE_FOR_xop_vpermil2v4sf3:
8457 case CODE_FOR_xop_vpermil2v4df3:
8458 case CODE_FOR_xop_vpermil2v8sf3:
8459 error ("the last argument must be a 2-bit immediate");
8460 return gen_reg_rtx (tmode);
8461 case CODE_FOR_xop_rotlv2di3:
8462 new_icode = CODE_FOR_rotlv2di3;
8463 goto xop_rotl;
8464 case CODE_FOR_xop_rotlv4si3:
8465 new_icode = CODE_FOR_rotlv4si3;
8466 goto xop_rotl;
8467 case CODE_FOR_xop_rotlv8hi3:
8468 new_icode = CODE_FOR_rotlv8hi3;
8469 goto xop_rotl;
8470 case CODE_FOR_xop_rotlv16qi3:
8471 new_icode = CODE_FOR_rotlv16qi3;
8472 xop_rotl:
8473 if (CONST_INT_P (op))
8474 {
8475 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8476 op = GEN_INT (INTVAL (op) & mask);
8477 gcc_checking_assert
8478 (insn_data[icode].operand[i + 1].predicate (op, mode));
8479 }
8480 else
8481 {
8482 gcc_checking_assert
8483 (nargs == 2
8484 && insn_data[new_icode].operand[0].mode == tmode
8485 && insn_data[new_icode].operand[1].mode == tmode
8486 && insn_data[new_icode].operand[2].mode == mode
8487 && insn_data[new_icode].operand[0].predicate
8488 == insn_data[icode].operand[0].predicate
8489 && insn_data[new_icode].operand[1].predicate
8490 == insn_data[icode].operand[1].predicate);
8491 icode = new_icode;
8492 goto non_constant;
8493 }
8494 break;
8495 default:
8496 gcc_unreachable ();
8497 }
8498 }
8499 }
8500 else
8501 {
8502 non_constant:
8503 if (VECTOR_MODE_P (mode))
8504 op = safe_vector_operand (op, mode);
8505
8506 /* If we aren't optimizing, only allow one memory operand to be
8507 generated. */
8508 if (memory_operand (op, mode))
8509 num_memory++;
8510
8511 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8512
8513 if (optimize
8514 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8515 || num_memory > 1)
8516 op = force_reg (mode, op);
8517 }
8518
8519 xops[i] = op;
8520 }
8521
8522 switch (nargs)
8523 {
8524 case 1:
8525 pat = GEN_FCN (icode) (target, xops[0]);
8526 break;
8527
8528 case 2:
8529 if (tf_p)
8530 pat = GEN_FCN (icode) (target, xops[0], xops[1],
8531 GEN_INT ((int)sub_code));
8532 else if (! comparison_p)
8533 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
8534 else
8535 {
8536 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8537 xops[0], xops[1]);
8538
8539 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
8540 }
8541 break;
8542
8543 case 3:
8544 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
8545 break;
8546
8547 case 4:
8548 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
8549 break;
8550
8551 default:
8552 gcc_unreachable ();
8553 }
8554
8555 if (! pat)
8556 return 0;
8557
8558 emit_insn (pat);
8559 return target;
8560 }
8561
8562 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8563 insns with vec_merge. */
8564
8565 static rtx
8566 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8567 rtx target)
8568 {
8569 rtx pat;
8570 tree arg0 = CALL_EXPR_ARG (exp, 0);
8571 rtx op1, op0 = expand_normal (arg0);
8572 machine_mode tmode = insn_data[icode].operand[0].mode;
8573 machine_mode mode0 = insn_data[icode].operand[1].mode;
8574
8575 if (optimize || !target
8576 || GET_MODE (target) != tmode
8577 || !insn_data[icode].operand[0].predicate (target, tmode))
8578 target = gen_reg_rtx (tmode);
8579
8580 if (VECTOR_MODE_P (mode0))
8581 op0 = safe_vector_operand (op0, mode0);
8582
8583 if ((optimize && !register_operand (op0, mode0))
8584 || !insn_data[icode].operand[1].predicate (op0, mode0))
8585 op0 = copy_to_mode_reg (mode0, op0);
8586
8587 op1 = op0;
8588 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8589 op1 = copy_to_mode_reg (mode0, op1);
8590
8591 pat = GEN_FCN (icode) (target, op0, op1);
8592 if (! pat)
8593 return 0;
8594 emit_insn (pat);
8595 return target;
8596 }
8597
8598 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8599
8600 static rtx
8601 ix86_expand_sse_compare (const struct builtin_description *d,
8602 tree exp, rtx target, bool swap)
8603 {
8604 rtx pat;
8605 tree arg0 = CALL_EXPR_ARG (exp, 0);
8606 tree arg1 = CALL_EXPR_ARG (exp, 1);
8607 rtx op0 = expand_normal (arg0);
8608 rtx op1 = expand_normal (arg1);
8609 rtx op2;
8610 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8611 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8612 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8613 enum rtx_code comparison = d->comparison;
8614
8615 if (VECTOR_MODE_P (mode0))
8616 op0 = safe_vector_operand (op0, mode0);
8617 if (VECTOR_MODE_P (mode1))
8618 op1 = safe_vector_operand (op1, mode1);
8619
8620 /* Swap operands if we have a comparison that isn't available in
8621 hardware. */
8622 if (swap)
8623 std::swap (op0, op1);
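/* For instance (illustrative, not from the original sources): SSE has a
   CMPLTPS encoding but no CMPGTPS, so a "greater than" builtin is
   handled by swapping the operands here and emitting the LT form.  */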
8624
8625 if (optimize || !target
8626 || GET_MODE (target) != tmode
8627 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8628 target = gen_reg_rtx (tmode);
8629
8630 if ((optimize && !register_operand (op0, mode0))
8631 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8632 op0 = copy_to_mode_reg (mode0, op0);
8633 if ((optimize && !register_operand (op1, mode1))
8634 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8635 op1 = copy_to_mode_reg (mode1, op1);
8636
8637 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8638 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8639 if (! pat)
8640 return 0;
8641 emit_insn (pat);
8642 return target;
8643 }
8644
8645 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8646
8647 static rtx
8648 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8649 rtx target)
8650 {
8651 rtx pat;
8652 tree arg0 = CALL_EXPR_ARG (exp, 0);
8653 tree arg1 = CALL_EXPR_ARG (exp, 1);
8654 rtx op0 = expand_normal (arg0);
8655 rtx op1 = expand_normal (arg1);
8656 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8657 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8658 enum rtx_code comparison = d->comparison;
8659
8660 if (VECTOR_MODE_P (mode0))
8661 op0 = safe_vector_operand (op0, mode0);
8662 if (VECTOR_MODE_P (mode1))
8663 op1 = safe_vector_operand (op1, mode1);
8664
8665 target = gen_reg_rtx (SImode);
8666 emit_move_insn (target, const0_rtx);
8667 target = gen_rtx_SUBREG (QImode, target, 0);
8668
8669 if ((optimize && !register_operand (op0, mode0))
8670 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8671 op0 = copy_to_mode_reg (mode0, op0);
8672 if ((optimize && !register_operand (op1, mode1))
8673 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8674 op1 = copy_to_mode_reg (mode1, op1);
8675
8676 pat = GEN_FCN (d->icode) (op0, op1);
8677 if (! pat)
8678 return 0;
8679 emit_insn (pat);
8680 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8681 gen_rtx_fmt_ee (comparison, QImode,
8682 SET_DEST (pat),
8683 const0_rtx)));
8684
8685 return SUBREG_REG (target);
8686 }
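/* Illustrative note (hedged, not from the original sources): for a
   builtin such as __builtin_ia32_comigt the expansion above boils down
   to something like

       xor    %eax, %eax
       comiss %xmm1, %xmm0
       seta   %al

   i.e. the SImode pseudo is cleared first so that the setcc into its
   QImode low part yields a clean zero/one result; the registers shown
   are placeholders.  */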
8687
8688 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8689
8690 static rtx
8691 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8692 rtx target)
8693 {
8694 rtx pat;
8695 tree arg0 = CALL_EXPR_ARG (exp, 0);
8696 rtx op1, op0 = expand_normal (arg0);
8697 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8698 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8699
8700 if (optimize || target == 0
8701 || GET_MODE (target) != tmode
8702 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8703 target = gen_reg_rtx (tmode);
8704
8705 if (VECTOR_MODE_P (mode0))
8706 op0 = safe_vector_operand (op0, mode0);
8707
8708 if ((optimize && !register_operand (op0, mode0))
8709 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8710 op0 = copy_to_mode_reg (mode0, op0);
8711
8712 op1 = GEN_INT (d->comparison);
8713
8714 pat = GEN_FCN (d->icode) (target, op0, op1);
8715 if (! pat)
8716 return 0;
8717 emit_insn (pat);
8718 return target;
8719 }
8720
8721 static rtx
8722 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8723 tree exp, rtx target)
8724 {
8725 rtx pat;
8726 tree arg0 = CALL_EXPR_ARG (exp, 0);
8727 tree arg1 = CALL_EXPR_ARG (exp, 1);
8728 rtx op0 = expand_normal (arg0);
8729 rtx op1 = expand_normal (arg1);
8730 rtx op2;
8731 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8732 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8733 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8734
8735 if (optimize || target == 0
8736 || GET_MODE (target) != tmode
8737 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8738 target = gen_reg_rtx (tmode);
8739
8740 op0 = safe_vector_operand (op0, mode0);
8741 op1 = safe_vector_operand (op1, mode1);
8742
8743 if ((optimize && !register_operand (op0, mode0))
8744 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8745 op0 = copy_to_mode_reg (mode0, op0);
8746 if ((optimize && !register_operand (op1, mode1))
8747 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8748 op1 = copy_to_mode_reg (mode1, op1);
8749
8750 op2 = GEN_INT (d->comparison);
8751
8752 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8753 if (! pat)
8754 return 0;
8755 emit_insn (pat);
8756 return target;
8757 }
8758
8759 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8760
8761 static rtx
8762 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8763 rtx target)
8764 {
8765 rtx pat;
8766 tree arg0 = CALL_EXPR_ARG (exp, 0);
8767 tree arg1 = CALL_EXPR_ARG (exp, 1);
8768 rtx op0 = expand_normal (arg0);
8769 rtx op1 = expand_normal (arg1);
8770 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8771 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8772 enum rtx_code comparison = d->comparison;
8773
8774 if (VECTOR_MODE_P (mode0))
8775 op0 = safe_vector_operand (op0, mode0);
8776 if (VECTOR_MODE_P (mode1))
8777 op1 = safe_vector_operand (op1, mode1);
8778
8779 target = gen_reg_rtx (SImode);
8780 emit_move_insn (target, const0_rtx);
8781 target = gen_rtx_SUBREG (QImode, target, 0);
8782
8783 if ((optimize && !register_operand (op0, mode0))
8784 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8785 op0 = copy_to_mode_reg (mode0, op0);
8786 if ((optimize && !register_operand (op1, mode1))
8787 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8788 op1 = copy_to_mode_reg (mode1, op1);
8789
8790 pat = GEN_FCN (d->icode) (op0, op1);
8791 if (! pat)
8792 return 0;
8793 emit_insn (pat);
8794 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8795 gen_rtx_fmt_ee (comparison, QImode,
8796 SET_DEST (pat),
8797 const0_rtx)));
8798
8799 return SUBREG_REG (target);
8800 }
8801
8802 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8803
8804 static rtx
8805 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8806 tree exp, rtx target)
8807 {
8808 rtx pat;
8809 tree arg0 = CALL_EXPR_ARG (exp, 0);
8810 tree arg1 = CALL_EXPR_ARG (exp, 1);
8811 tree arg2 = CALL_EXPR_ARG (exp, 2);
8812 tree arg3 = CALL_EXPR_ARG (exp, 3);
8813 tree arg4 = CALL_EXPR_ARG (exp, 4);
8814 rtx scratch0, scratch1;
8815 rtx op0 = expand_normal (arg0);
8816 rtx op1 = expand_normal (arg1);
8817 rtx op2 = expand_normal (arg2);
8818 rtx op3 = expand_normal (arg3);
8819 rtx op4 = expand_normal (arg4);
8820 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8821
8822 tmode0 = insn_data[d->icode].operand[0].mode;
8823 tmode1 = insn_data[d->icode].operand[1].mode;
8824 modev2 = insn_data[d->icode].operand[2].mode;
8825 modei3 = insn_data[d->icode].operand[3].mode;
8826 modev4 = insn_data[d->icode].operand[4].mode;
8827 modei5 = insn_data[d->icode].operand[5].mode;
8828 modeimm = insn_data[d->icode].operand[6].mode;
8829
8830 if (VECTOR_MODE_P (modev2))
8831 op0 = safe_vector_operand (op0, modev2);
8832 if (VECTOR_MODE_P (modev4))
8833 op2 = safe_vector_operand (op2, modev4);
8834
8835 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8836 op0 = copy_to_mode_reg (modev2, op0);
8837 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8838 op1 = copy_to_mode_reg (modei3, op1);
8839 if ((optimize && !register_operand (op2, modev4))
8840 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8841 op2 = copy_to_mode_reg (modev4, op2);
8842 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8843 op3 = copy_to_mode_reg (modei5, op3);
8844
8845 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8846 {
8847 error ("the fifth argument must be an 8-bit immediate");
8848 return const0_rtx;
8849 }
8850
8851 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8852 {
8853 if (optimize || !target
8854 || GET_MODE (target) != tmode0
8855 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8856 target = gen_reg_rtx (tmode0);
8857
8858 scratch1 = gen_reg_rtx (tmode1);
8859
8860 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8861 }
8862 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8863 {
8864 if (optimize || !target
8865 || GET_MODE (target) != tmode1
8866 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8867 target = gen_reg_rtx (tmode1);
8868
8869 scratch0 = gen_reg_rtx (tmode0);
8870
8871 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8872 }
8873 else
8874 {
8875 gcc_assert (d->flag);
8876
8877 scratch0 = gen_reg_rtx (tmode0);
8878 scratch1 = gen_reg_rtx (tmode1);
8879
8880 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8881 }
8882
8883 if (! pat)
8884 return 0;
8885
8886 emit_insn (pat);
8887
8888 if (d->flag)
8889 {
8890 target = gen_reg_rtx (SImode);
8891 emit_move_insn (target, const0_rtx);
8892 target = gen_rtx_SUBREG (QImode, target, 0);
8893
8894 emit_insn
8895 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8896 gen_rtx_fmt_ee (EQ, QImode,
8897 gen_rtx_REG ((machine_mode) d->flag,
8898 FLAGS_REG),
8899 const0_rtx)));
8900 return SUBREG_REG (target);
8901 }
8902 else
8903 return target;
8904 }
8905
8906
8907 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8908
8909 static rtx
8910 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8911 tree exp, rtx target)
8912 {
8913 rtx pat;
8914 tree arg0 = CALL_EXPR_ARG (exp, 0);
8915 tree arg1 = CALL_EXPR_ARG (exp, 1);
8916 tree arg2 = CALL_EXPR_ARG (exp, 2);
8917 rtx scratch0, scratch1;
8918 rtx op0 = expand_normal (arg0);
8919 rtx op1 = expand_normal (arg1);
8920 rtx op2 = expand_normal (arg2);
8921 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8922
8923 tmode0 = insn_data[d->icode].operand[0].mode;
8924 tmode1 = insn_data[d->icode].operand[1].mode;
8925 modev2 = insn_data[d->icode].operand[2].mode;
8926 modev3 = insn_data[d->icode].operand[3].mode;
8927 modeimm = insn_data[d->icode].operand[4].mode;
8928
8929 if (VECTOR_MODE_P (modev2))
8930 op0 = safe_vector_operand (op0, modev2);
8931 if (VECTOR_MODE_P (modev3))
8932 op1 = safe_vector_operand (op1, modev3);
8933
8934 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8935 op0 = copy_to_mode_reg (modev2, op0);
8936 if ((optimize && !register_operand (op1, modev3))
8937 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8938 op1 = copy_to_mode_reg (modev3, op1);
8939
8940 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8941 {
8942 error ("the third argument must be an 8-bit immediate");
8943 return const0_rtx;
8944 }
8945
8946 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8947 {
8948 if (optimize || !target
8949 || GET_MODE (target) != tmode0
8950 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8951 target = gen_reg_rtx (tmode0);
8952
8953 scratch1 = gen_reg_rtx (tmode1);
8954
8955 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8956 }
8957 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8958 {
8959 if (optimize || !target
8960 || GET_MODE (target) != tmode1
8961 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8962 target = gen_reg_rtx (tmode1);
8963
8964 scratch0 = gen_reg_rtx (tmode0);
8965
8966 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8967 }
8968 else
8969 {
8970 gcc_assert (d->flag);
8971
8972 scratch0 = gen_reg_rtx (tmode0);
8973 scratch1 = gen_reg_rtx (tmode1);
8974
8975 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8976 }
8977
8978 if (! pat)
8979 return 0;
8980
8981 emit_insn (pat);
8982
8983 if (d->flag)
8984 {
8985 target = gen_reg_rtx (SImode);
8986 emit_move_insn (target, const0_rtx);
8987 target = gen_rtx_SUBREG (QImode, target, 0);
8988
8989 emit_insn
8990 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8991 gen_rtx_fmt_ee (EQ, QImode,
8992 gen_rtx_REG ((machine_mode) d->flag,
8993 FLAGS_REG),
8994 const0_rtx)));
8995 return SUBREG_REG (target);
8996 }
8997 else
8998 return target;
8999 }
9000
9001 /* Fix up modeless constants to fit the required mode. */
9002
9003 static rtx
9004 fixup_modeless_constant (rtx x, machine_mode mode)
9005 {
9006 if (GET_MODE (x) == VOIDmode)
9007 x = convert_to_mode (mode, x, 1);
9008 return x;
9009 }
9010
9011 /* Subroutine of ix86_expand_builtin to take care of insns with
9012 variable number of operands. */
9013
9014 static rtx
9015 ix86_expand_args_builtin (const struct builtin_description *d,
9016 tree exp, rtx target)
9017 {
9018 rtx pat, real_target;
9019 unsigned int i, nargs;
9020 unsigned int nargs_constant = 0;
9021 unsigned int mask_pos = 0;
9022 int num_memory = 0;
9023 rtx xops[6];
9024 bool second_arg_count = false;
9025 enum insn_code icode = d->icode;
9026 const struct insn_data_d *insn_p = &insn_data[icode];
9027 machine_mode tmode = insn_p->operand[0].mode;
9028 machine_mode rmode = VOIDmode;
9029 bool swap = false;
9030 enum rtx_code comparison = d->comparison;
9031
9032 switch ((enum ix86_builtin_func_type) d->flag)
9033 {
9034 case V2DF_FTYPE_V2DF_ROUND:
9035 case V4DF_FTYPE_V4DF_ROUND:
9036 case V8DF_FTYPE_V8DF_ROUND:
9037 case V4SF_FTYPE_V4SF_ROUND:
9038 case V8SF_FTYPE_V8SF_ROUND:
9039 case V16SF_FTYPE_V16SF_ROUND:
9040 case V4SI_FTYPE_V4SF_ROUND:
9041 case V8SI_FTYPE_V8SF_ROUND:
9042 case V16SI_FTYPE_V16SF_ROUND:
9043 return ix86_expand_sse_round (d, exp, target);
9044 case V4SI_FTYPE_V2DF_V2DF_ROUND:
9045 case V8SI_FTYPE_V4DF_V4DF_ROUND:
9046 case V16SI_FTYPE_V8DF_V8DF_ROUND:
9047 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
9048 case INT_FTYPE_V8SF_V8SF_PTEST:
9049 case INT_FTYPE_V4DI_V4DI_PTEST:
9050 case INT_FTYPE_V4DF_V4DF_PTEST:
9051 case INT_FTYPE_V4SF_V4SF_PTEST:
9052 case INT_FTYPE_V2DI_V2DI_PTEST:
9053 case INT_FTYPE_V2DF_V2DF_PTEST:
9054 return ix86_expand_sse_ptest (d, exp, target);
9055 case FLOAT128_FTYPE_FLOAT128:
9056 case FLOAT_FTYPE_FLOAT:
9057 case INT_FTYPE_INT:
9058 case UINT_FTYPE_UINT:
9059 case UINT16_FTYPE_UINT16:
9060 case UINT64_FTYPE_INT:
9061 case UINT64_FTYPE_UINT64:
9062 case INT64_FTYPE_INT64:
9063 case INT64_FTYPE_V4SF:
9064 case INT64_FTYPE_V2DF:
9065 case INT_FTYPE_V16QI:
9066 case INT_FTYPE_V8QI:
9067 case INT_FTYPE_V8SF:
9068 case INT_FTYPE_V4DF:
9069 case INT_FTYPE_V4SF:
9070 case INT_FTYPE_V2DF:
9071 case INT_FTYPE_V32QI:
9072 case V16QI_FTYPE_V16QI:
9073 case V8SI_FTYPE_V8SF:
9074 case V8SI_FTYPE_V4SI:
9075 case V8HI_FTYPE_V8HI:
9076 case V8HI_FTYPE_V16QI:
9077 case V8QI_FTYPE_V8QI:
9078 case V8SF_FTYPE_V8SF:
9079 case V8SF_FTYPE_V8SI:
9080 case V8SF_FTYPE_V4SF:
9081 case V8SF_FTYPE_V8HI:
9082 case V4SI_FTYPE_V4SI:
9083 case V4SI_FTYPE_V16QI:
9084 case V4SI_FTYPE_V4SF:
9085 case V4SI_FTYPE_V8SI:
9086 case V4SI_FTYPE_V8HI:
9087 case V4SI_FTYPE_V4DF:
9088 case V4SI_FTYPE_V2DF:
9089 case V4HI_FTYPE_V4HI:
9090 case V4DF_FTYPE_V4DF:
9091 case V4DF_FTYPE_V4SI:
9092 case V4DF_FTYPE_V4SF:
9093 case V4DF_FTYPE_V2DF:
9094 case V4SF_FTYPE_V4SF:
9095 case V4SF_FTYPE_V4SI:
9096 case V4SF_FTYPE_V8SF:
9097 case V4SF_FTYPE_V4DF:
9098 case V4SF_FTYPE_V8HI:
9099 case V4SF_FTYPE_V2DF:
9100 case V2DI_FTYPE_V2DI:
9101 case V2DI_FTYPE_V16QI:
9102 case V2DI_FTYPE_V8HI:
9103 case V2DI_FTYPE_V4SI:
9104 case V2DF_FTYPE_V2DF:
9105 case V2DF_FTYPE_V4SI:
9106 case V2DF_FTYPE_V4DF:
9107 case V2DF_FTYPE_V4SF:
9108 case V2DF_FTYPE_V2SI:
9109 case V2SI_FTYPE_V2SI:
9110 case V2SI_FTYPE_V4SF:
9111 case V2SI_FTYPE_V2SF:
9112 case V2SI_FTYPE_V2DF:
9113 case V2SF_FTYPE_V2SF:
9114 case V2SF_FTYPE_V2SI:
9115 case V32QI_FTYPE_V32QI:
9116 case V32QI_FTYPE_V16QI:
9117 case V16HI_FTYPE_V16HI:
9118 case V16HI_FTYPE_V8HI:
9119 case V8SI_FTYPE_V8SI:
9120 case V16HI_FTYPE_V16QI:
9121 case V8SI_FTYPE_V16QI:
9122 case V4DI_FTYPE_V16QI:
9123 case V8SI_FTYPE_V8HI:
9124 case V4DI_FTYPE_V8HI:
9125 case V4DI_FTYPE_V4SI:
9126 case V4DI_FTYPE_V2DI:
9127 case UQI_FTYPE_UQI:
9128 case UHI_FTYPE_UHI:
9129 case USI_FTYPE_USI:
9130 case USI_FTYPE_UQI:
9131 case USI_FTYPE_UHI:
9132 case UDI_FTYPE_UDI:
9133 case UHI_FTYPE_V16QI:
9134 case USI_FTYPE_V32QI:
9135 case UDI_FTYPE_V64QI:
9136 case V16QI_FTYPE_UHI:
9137 case V32QI_FTYPE_USI:
9138 case V64QI_FTYPE_UDI:
9139 case V8HI_FTYPE_UQI:
9140 case V16HI_FTYPE_UHI:
9141 case V32HI_FTYPE_USI:
9142 case V4SI_FTYPE_UQI:
9143 case V8SI_FTYPE_UQI:
9144 case V4SI_FTYPE_UHI:
9145 case V8SI_FTYPE_UHI:
9146 case UQI_FTYPE_V8HI:
9147 case UHI_FTYPE_V16HI:
9148 case USI_FTYPE_V32HI:
9149 case UQI_FTYPE_V4SI:
9150 case UQI_FTYPE_V8SI:
9151 case UHI_FTYPE_V16SI:
9152 case UQI_FTYPE_V2DI:
9153 case UQI_FTYPE_V4DI:
9154 case UQI_FTYPE_V8DI:
9155 case V16SI_FTYPE_UHI:
9156 case V2DI_FTYPE_UQI:
9157 case V4DI_FTYPE_UQI:
9158 case V16SI_FTYPE_INT:
9159 case V16SF_FTYPE_V8SF:
9160 case V16SI_FTYPE_V8SI:
9161 case V16SF_FTYPE_V4SF:
9162 case V16SI_FTYPE_V4SI:
9163 case V16SI_FTYPE_V16SF:
9164 case V16SI_FTYPE_V16SI:
9165 case V64QI_FTYPE_V64QI:
9166 case V32HI_FTYPE_V32HI:
9167 case V16SF_FTYPE_V16SF:
9168 case V8DI_FTYPE_UQI:
9169 case V8DI_FTYPE_V8DI:
9170 case V8DF_FTYPE_V4DF:
9171 case V8DF_FTYPE_V2DF:
9172 case V8DF_FTYPE_V8DF:
9173 case V4DI_FTYPE_V4DI:
9174 case V16HI_FTYPE_V16SF:
9175 case V8HI_FTYPE_V8SF:
9176 case V8HI_FTYPE_V4SF:
9177 nargs = 1;
9178 break;
9179 case V4SF_FTYPE_V4SF_VEC_MERGE:
9180 case V2DF_FTYPE_V2DF_VEC_MERGE:
9181 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9182 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9183 case V16QI_FTYPE_V16QI_V16QI:
9184 case V16QI_FTYPE_V8HI_V8HI:
9185 case V16SF_FTYPE_V16SF_V16SF:
9186 case V8QI_FTYPE_V8QI_V8QI:
9187 case V8QI_FTYPE_V4HI_V4HI:
9188 case V8HI_FTYPE_V8HI_V8HI:
9189 case V8HI_FTYPE_V16QI_V16QI:
9190 case V8HI_FTYPE_V4SI_V4SI:
9191 case V8SF_FTYPE_V8SF_V8SF:
9192 case V8SF_FTYPE_V8SF_V8SI:
9193 case V8DF_FTYPE_V8DF_V8DF:
9194 case V4SI_FTYPE_V4SI_V4SI:
9195 case V4SI_FTYPE_V8HI_V8HI:
9196 case V4SI_FTYPE_V2DF_V2DF:
9197 case V4HI_FTYPE_V4HI_V4HI:
9198 case V4HI_FTYPE_V8QI_V8QI:
9199 case V4HI_FTYPE_V2SI_V2SI:
9200 case V4DF_FTYPE_V4DF_V4DF:
9201 case V4DF_FTYPE_V4DF_V4DI:
9202 case V4SF_FTYPE_V4SF_V4SF:
9203 case V4SF_FTYPE_V4SF_V4SI:
9204 case V4SF_FTYPE_V4SF_V2SI:
9205 case V4SF_FTYPE_V4SF_V2DF:
9206 case V4SF_FTYPE_V4SF_UINT:
9207 case V4SF_FTYPE_V4SF_DI:
9208 case V4SF_FTYPE_V4SF_SI:
9209 case V2DI_FTYPE_V2DI_V2DI:
9210 case V2DI_FTYPE_V16QI_V16QI:
9211 case V2DI_FTYPE_V4SI_V4SI:
9212 case V2DI_FTYPE_V2DI_V16QI:
9213 case V2SI_FTYPE_V2SI_V2SI:
9214 case V2SI_FTYPE_V4HI_V4HI:
9215 case V2SI_FTYPE_V2SF_V2SF:
9216 case V2DF_FTYPE_V2DF_V2DF:
9217 case V2DF_FTYPE_V2DF_V4SF:
9218 case V2DF_FTYPE_V2DF_V2DI:
9219 case V2DF_FTYPE_V2DF_DI:
9220 case V2DF_FTYPE_V2DF_SI:
9221 case V2DF_FTYPE_V2DF_UINT:
9222 case V2SF_FTYPE_V2SF_V2SF:
9223 case V1DI_FTYPE_V1DI_V1DI:
9224 case V1DI_FTYPE_V8QI_V8QI:
9225 case V1DI_FTYPE_V2SI_V2SI:
9226 case V32QI_FTYPE_V16HI_V16HI:
9227 case V16HI_FTYPE_V8SI_V8SI:
9228 case V64QI_FTYPE_V64QI_V64QI:
9229 case V32QI_FTYPE_V32QI_V32QI:
9230 case V16HI_FTYPE_V32QI_V32QI:
9231 case V16HI_FTYPE_V16HI_V16HI:
9232 case V8SI_FTYPE_V4DF_V4DF:
9233 case V8SI_FTYPE_V8SI_V8SI:
9234 case V8SI_FTYPE_V16HI_V16HI:
9235 case V4DI_FTYPE_V4DI_V4DI:
9236 case V4DI_FTYPE_V8SI_V8SI:
9237 case V8DI_FTYPE_V64QI_V64QI:
9238 if (comparison == UNKNOWN)
9239 return ix86_expand_binop_builtin (icode, exp, target);
9240 nargs = 2;
9241 break;
9242 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9243 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9244 gcc_assert (comparison != UNKNOWN);
9245 nargs = 2;
9246 swap = true;
9247 break;
9248 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9249 case V16HI_FTYPE_V16HI_SI_COUNT:
9250 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9251 case V8SI_FTYPE_V8SI_SI_COUNT:
9252 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9253 case V4DI_FTYPE_V4DI_INT_COUNT:
9254 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9255 case V8HI_FTYPE_V8HI_SI_COUNT:
9256 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9257 case V4SI_FTYPE_V4SI_SI_COUNT:
9258 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9259 case V4HI_FTYPE_V4HI_SI_COUNT:
9260 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9261 case V2DI_FTYPE_V2DI_SI_COUNT:
9262 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9263 case V2SI_FTYPE_V2SI_SI_COUNT:
9264 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9265 case V1DI_FTYPE_V1DI_SI_COUNT:
9266 nargs = 2;
9267 second_arg_count = true;
9268 break;
9269 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9270 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9271 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9272 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9273 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9274 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9275 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9276 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9277 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9278 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9279 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9280 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9281 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9282 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9283 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9284 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9285 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9286 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9287 nargs = 4;
9288 second_arg_count = true;
9289 break;
9290 case UINT64_FTYPE_UINT64_UINT64:
9291 case UINT_FTYPE_UINT_UINT:
9292 case UINT_FTYPE_UINT_USHORT:
9293 case UINT_FTYPE_UINT_UCHAR:
9294 case UINT16_FTYPE_UINT16_INT:
9295 case UINT8_FTYPE_UINT8_INT:
9296 case UQI_FTYPE_UQI_UQI:
9297 case UHI_FTYPE_UHI_UHI:
9298 case USI_FTYPE_USI_USI:
9299 case UDI_FTYPE_UDI_UDI:
9300 case V16SI_FTYPE_V8DF_V8DF:
9301 case V32HI_FTYPE_V16SF_V16SF:
9302 case V16HI_FTYPE_V8SF_V8SF:
9303 case V8HI_FTYPE_V4SF_V4SF:
9304 case V16HI_FTYPE_V16SF_UHI:
9305 case V8HI_FTYPE_V8SF_UQI:
9306 case V8HI_FTYPE_V4SF_UQI:
9307 nargs = 2;
9308 break;
9309 case V2DI_FTYPE_V2DI_INT_CONVERT:
9310 nargs = 2;
9311 rmode = V1TImode;
9312 nargs_constant = 1;
9313 break;
9314 case V4DI_FTYPE_V4DI_INT_CONVERT:
9315 nargs = 2;
9316 rmode = V2TImode;
9317 nargs_constant = 1;
9318 break;
9319 case V8DI_FTYPE_V8DI_INT_CONVERT:
9320 nargs = 2;
9321 rmode = V4TImode;
9322 nargs_constant = 1;
9323 break;
9324 case V8HI_FTYPE_V8HI_INT:
9325 case V8HI_FTYPE_V8SF_INT:
9326 case V16HI_FTYPE_V16SF_INT:
9327 case V8HI_FTYPE_V4SF_INT:
9328 case V8SF_FTYPE_V8SF_INT:
9329 case V4SF_FTYPE_V16SF_INT:
9330 case V16SF_FTYPE_V16SF_INT:
9331 case V4SI_FTYPE_V4SI_INT:
9332 case V4SI_FTYPE_V8SI_INT:
9333 case V4HI_FTYPE_V4HI_INT:
9334 case V4DF_FTYPE_V4DF_INT:
9335 case V4DF_FTYPE_V8DF_INT:
9336 case V4SF_FTYPE_V4SF_INT:
9337 case V4SF_FTYPE_V8SF_INT:
9338 case V2DI_FTYPE_V2DI_INT:
9339 case V2DF_FTYPE_V2DF_INT:
9340 case V2DF_FTYPE_V4DF_INT:
9341 case V16HI_FTYPE_V16HI_INT:
9342 case V8SI_FTYPE_V8SI_INT:
9343 case V16SI_FTYPE_V16SI_INT:
9344 case V4SI_FTYPE_V16SI_INT:
9345 case V4DI_FTYPE_V4DI_INT:
9346 case V2DI_FTYPE_V4DI_INT:
9347 case V4DI_FTYPE_V8DI_INT:
9348 case UQI_FTYPE_UQI_UQI_CONST:
9349 case UHI_FTYPE_UHI_UQI:
9350 case USI_FTYPE_USI_UQI:
9351 case UDI_FTYPE_UDI_UQI:
9352 nargs = 2;
9353 nargs_constant = 1;
9354 break;
9355 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9356 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9357 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9358 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9359 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9360 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9361 case UHI_FTYPE_V16SI_V16SI_UHI:
9362 case UQI_FTYPE_V8DI_V8DI_UQI:
9363 case V16HI_FTYPE_V16SI_V16HI_UHI:
9364 case V16QI_FTYPE_V16SI_V16QI_UHI:
9365 case V16QI_FTYPE_V8DI_V16QI_UQI:
9366 case V16SF_FTYPE_V16SF_V16SF_UHI:
9367 case V16SF_FTYPE_V4SF_V16SF_UHI:
9368 case V16SI_FTYPE_SI_V16SI_UHI:
9369 case V16SI_FTYPE_V16HI_V16SI_UHI:
9370 case V16SI_FTYPE_V16QI_V16SI_UHI:
9371 case V8SF_FTYPE_V4SF_V8SF_UQI:
9372 case V4DF_FTYPE_V2DF_V4DF_UQI:
9373 case V8SI_FTYPE_V4SI_V8SI_UQI:
9374 case V8SI_FTYPE_SI_V8SI_UQI:
9375 case V4SI_FTYPE_V4SI_V4SI_UQI:
9376 case V4SI_FTYPE_SI_V4SI_UQI:
9377 case V4DI_FTYPE_V2DI_V4DI_UQI:
9378 case V4DI_FTYPE_DI_V4DI_UQI:
9379 case V2DI_FTYPE_V2DI_V2DI_UQI:
9380 case V2DI_FTYPE_DI_V2DI_UQI:
9381 case V64QI_FTYPE_V64QI_V64QI_UDI:
9382 case V64QI_FTYPE_V16QI_V64QI_UDI:
9383 case V64QI_FTYPE_QI_V64QI_UDI:
9384 case V32QI_FTYPE_V32QI_V32QI_USI:
9385 case V32QI_FTYPE_V16QI_V32QI_USI:
9386 case V32QI_FTYPE_QI_V32QI_USI:
9387 case V16QI_FTYPE_V16QI_V16QI_UHI:
9388 case V16QI_FTYPE_QI_V16QI_UHI:
9389 case V32HI_FTYPE_V8HI_V32HI_USI:
9390 case V32HI_FTYPE_HI_V32HI_USI:
9391 case V16HI_FTYPE_V8HI_V16HI_UHI:
9392 case V16HI_FTYPE_HI_V16HI_UHI:
9393 case V8HI_FTYPE_V8HI_V8HI_UQI:
9394 case V8HI_FTYPE_HI_V8HI_UQI:
9395 case V8SF_FTYPE_V8HI_V8SF_UQI:
9396 case V4SF_FTYPE_V8HI_V4SF_UQI:
9397 case V8SI_FTYPE_V8SF_V8SI_UQI:
9398 case V4SI_FTYPE_V4SF_V4SI_UQI:
9399 case V4DI_FTYPE_V4SF_V4DI_UQI:
9400 case V2DI_FTYPE_V4SF_V2DI_UQI:
9401 case V4SF_FTYPE_V4DI_V4SF_UQI:
9402 case V4SF_FTYPE_V2DI_V4SF_UQI:
9403 case V4DF_FTYPE_V4DI_V4DF_UQI:
9404 case V2DF_FTYPE_V2DI_V2DF_UQI:
9405 case V16QI_FTYPE_V8HI_V16QI_UQI:
9406 case V16QI_FTYPE_V16HI_V16QI_UHI:
9407 case V16QI_FTYPE_V4SI_V16QI_UQI:
9408 case V16QI_FTYPE_V8SI_V16QI_UQI:
9409 case V8HI_FTYPE_V4SI_V8HI_UQI:
9410 case V8HI_FTYPE_V8SI_V8HI_UQI:
9411 case V16QI_FTYPE_V2DI_V16QI_UQI:
9412 case V16QI_FTYPE_V4DI_V16QI_UQI:
9413 case V8HI_FTYPE_V2DI_V8HI_UQI:
9414 case V8HI_FTYPE_V4DI_V8HI_UQI:
9415 case V4SI_FTYPE_V2DI_V4SI_UQI:
9416 case V4SI_FTYPE_V4DI_V4SI_UQI:
9417 case V32QI_FTYPE_V32HI_V32QI_USI:
9418 case UHI_FTYPE_V16QI_V16QI_UHI:
9419 case USI_FTYPE_V32QI_V32QI_USI:
9420 case UDI_FTYPE_V64QI_V64QI_UDI:
9421 case UQI_FTYPE_V8HI_V8HI_UQI:
9422 case UHI_FTYPE_V16HI_V16HI_UHI:
9423 case USI_FTYPE_V32HI_V32HI_USI:
9424 case UQI_FTYPE_V4SI_V4SI_UQI:
9425 case UQI_FTYPE_V8SI_V8SI_UQI:
9426 case UQI_FTYPE_V2DI_V2DI_UQI:
9427 case UQI_FTYPE_V4DI_V4DI_UQI:
9428 case V4SF_FTYPE_V2DF_V4SF_UQI:
9429 case V4SF_FTYPE_V4DF_V4SF_UQI:
9430 case V16SI_FTYPE_V16SI_V16SI_UHI:
9431 case V16SI_FTYPE_V4SI_V16SI_UHI:
9432 case V2DI_FTYPE_V4SI_V2DI_UQI:
9433 case V2DI_FTYPE_V8HI_V2DI_UQI:
9434 case V2DI_FTYPE_V16QI_V2DI_UQI:
9435 case V4DI_FTYPE_V4DI_V4DI_UQI:
9436 case V4DI_FTYPE_V4SI_V4DI_UQI:
9437 case V4DI_FTYPE_V8HI_V4DI_UQI:
9438 case V4DI_FTYPE_V16QI_V4DI_UQI:
9439 case V4DI_FTYPE_V4DF_V4DI_UQI:
9440 case V2DI_FTYPE_V2DF_V2DI_UQI:
9441 case V4SI_FTYPE_V4DF_V4SI_UQI:
9442 case V4SI_FTYPE_V2DF_V4SI_UQI:
9443 case V4SI_FTYPE_V8HI_V4SI_UQI:
9444 case V4SI_FTYPE_V16QI_V4SI_UQI:
9445 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9446 case V8DF_FTYPE_V2DF_V8DF_UQI:
9447 case V8DF_FTYPE_V4DF_V8DF_UQI:
9448 case V8DF_FTYPE_V8DF_V8DF_UQI:
9449 case V8SF_FTYPE_V8SF_V8SF_UQI:
9450 case V8SF_FTYPE_V8SI_V8SF_UQI:
9451 case V4DF_FTYPE_V4DF_V4DF_UQI:
9452 case V4SF_FTYPE_V4SF_V4SF_UQI:
9453 case V2DF_FTYPE_V2DF_V2DF_UQI:
9454 case V2DF_FTYPE_V4SF_V2DF_UQI:
9455 case V2DF_FTYPE_V4SI_V2DF_UQI:
9456 case V4SF_FTYPE_V4SI_V4SF_UQI:
9457 case V4DF_FTYPE_V4SF_V4DF_UQI:
9458 case V4DF_FTYPE_V4SI_V4DF_UQI:
9459 case V8SI_FTYPE_V8SI_V8SI_UQI:
9460 case V8SI_FTYPE_V8HI_V8SI_UQI:
9461 case V8SI_FTYPE_V16QI_V8SI_UQI:
9462 case V8DF_FTYPE_V8SI_V8DF_UQI:
9463 case V8DI_FTYPE_DI_V8DI_UQI:
9464 case V16SF_FTYPE_V8SF_V16SF_UHI:
9465 case V16SI_FTYPE_V8SI_V16SI_UHI:
9466 case V16HI_FTYPE_V16HI_V16HI_UHI:
9467 case V8HI_FTYPE_V16QI_V8HI_UQI:
9468 case V16HI_FTYPE_V16QI_V16HI_UHI:
9469 case V32HI_FTYPE_V32HI_V32HI_USI:
9470 case V32HI_FTYPE_V32QI_V32HI_USI:
9471 case V8DI_FTYPE_V16QI_V8DI_UQI:
9472 case V8DI_FTYPE_V2DI_V8DI_UQI:
9473 case V8DI_FTYPE_V4DI_V8DI_UQI:
9474 case V8DI_FTYPE_V8DI_V8DI_UQI:
9475 case V8DI_FTYPE_V8HI_V8DI_UQI:
9476 case V8DI_FTYPE_V8SI_V8DI_UQI:
9477 case V8HI_FTYPE_V8DI_V8HI_UQI:
9478 case V8SI_FTYPE_V8DI_V8SI_UQI:
9479 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9480 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9481 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9482 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9483 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9484 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9485 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9486 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9487 case V32HI_FTYPE_V16SF_V16SF_USI:
9488 case V16HI_FTYPE_V8SF_V8SF_UHI:
9489 case V8HI_FTYPE_V4SF_V4SF_UQI:
9490 case V16HI_FTYPE_V16SF_V16HI_UHI:
9491 case V8HI_FTYPE_V8SF_V8HI_UQI:
9492 case V8HI_FTYPE_V4SF_V8HI_UQI:
9493 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9494 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9495 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9496 nargs = 3;
9497 break;
9498 case V32QI_FTYPE_V32QI_V32QI_INT:
9499 case V16HI_FTYPE_V16HI_V16HI_INT:
9500 case V16QI_FTYPE_V16QI_V16QI_INT:
9501 case V4DI_FTYPE_V4DI_V4DI_INT:
9502 case V8HI_FTYPE_V8HI_V8HI_INT:
9503 case V8SI_FTYPE_V8SI_V8SI_INT:
9504 case V8SI_FTYPE_V8SI_V4SI_INT:
9505 case V8SF_FTYPE_V8SF_V8SF_INT:
9506 case V8SF_FTYPE_V8SF_V4SF_INT:
9507 case V4SI_FTYPE_V4SI_V4SI_INT:
9508 case V4DF_FTYPE_V4DF_V4DF_INT:
9509 case V16SF_FTYPE_V16SF_V16SF_INT:
9510 case V16SF_FTYPE_V16SF_V4SF_INT:
9511 case V16SI_FTYPE_V16SI_V4SI_INT:
9512 case V4DF_FTYPE_V4DF_V2DF_INT:
9513 case V4SF_FTYPE_V4SF_V4SF_INT:
9514 case V2DI_FTYPE_V2DI_V2DI_INT:
9515 case V4DI_FTYPE_V4DI_V2DI_INT:
9516 case V2DF_FTYPE_V2DF_V2DF_INT:
9517 case UQI_FTYPE_V8DI_V8UDI_INT:
9518 case UQI_FTYPE_V8DF_V8DF_INT:
9519 case UQI_FTYPE_V2DF_V2DF_INT:
9520 case UQI_FTYPE_V4SF_V4SF_INT:
9521 case UHI_FTYPE_V16SI_V16SI_INT:
9522 case UHI_FTYPE_V16SF_V16SF_INT:
9523 case V64QI_FTYPE_V64QI_V64QI_INT:
9524 case V32HI_FTYPE_V32HI_V32HI_INT:
9525 case V16SI_FTYPE_V16SI_V16SI_INT:
9526 case V8DI_FTYPE_V8DI_V8DI_INT:
9527 nargs = 3;
9528 nargs_constant = 1;
9529 break;
9530 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9531 nargs = 3;
9532 rmode = V4DImode;
9533 nargs_constant = 1;
9534 break;
9535 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9536 nargs = 3;
9537 rmode = V2DImode;
9538 nargs_constant = 1;
9539 break;
9540 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9541 nargs = 3;
9542 rmode = DImode;
9543 nargs_constant = 1;
9544 break;
9545 case V2DI_FTYPE_V2DI_UINT_UINT:
9546 nargs = 3;
9547 nargs_constant = 2;
9548 break;
9549 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9550 nargs = 3;
9551 rmode = V8DImode;
9552 nargs_constant = 1;
9553 break;
9554 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9555 nargs = 5;
9556 rmode = V8DImode;
9557 mask_pos = 2;
9558 nargs_constant = 1;
9559 break;
9560 case QI_FTYPE_V8DF_INT_UQI:
9561 case QI_FTYPE_V4DF_INT_UQI:
9562 case QI_FTYPE_V2DF_INT_UQI:
9563 case HI_FTYPE_V16SF_INT_UHI:
9564 case QI_FTYPE_V8SF_INT_UQI:
9565 case QI_FTYPE_V4SF_INT_UQI:
9566 case V4SI_FTYPE_V4SI_V4SI_UHI:
9567 case V8SI_FTYPE_V8SI_V8SI_UHI:
9568 nargs = 3;
9569 mask_pos = 1;
9570 nargs_constant = 1;
9571 break;
9572 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9573 nargs = 5;
9574 rmode = V4DImode;
9575 mask_pos = 2;
9576 nargs_constant = 1;
9577 break;
9578 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9579 nargs = 5;
9580 rmode = V2DImode;
9581 mask_pos = 2;
9582 nargs_constant = 1;
9583 break;
9584 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9585 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9586 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9587 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9588 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9589 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9590 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9591 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9592 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9593 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9594 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9595 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9596 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9597 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9598 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9599 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9600 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9601 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9602 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9603 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9604 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9605 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9606 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9607 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9608 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9609 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9610 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9611 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9612 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9613 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9614 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9615 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9616 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9617 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9618 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9619 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9620 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9621 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9622 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9623 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9624 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9625 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9626 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9627 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9628 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9629 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9630 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9631 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9632 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9633 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9634 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9635 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9636 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9637 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9638 nargs = 4;
9639 break;
9640 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9641 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9642 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9643 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9644 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9645 nargs = 4;
9646 nargs_constant = 1;
9647 break;
9648 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9649 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9650 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9651 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9652 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9653 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9654 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9655 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9656 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9657 case USI_FTYPE_V32QI_V32QI_INT_USI:
9658 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9659 case USI_FTYPE_V32HI_V32HI_INT_USI:
9660 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9661 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9662 nargs = 4;
9663 mask_pos = 1;
9664 nargs_constant = 1;
9665 break;
9666 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9667 nargs = 4;
9668 nargs_constant = 2;
9669 break;
9670 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9671 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9672 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9673 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9674 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9675 nargs = 4;
9676 break;
9677 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9678 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9679 mask_pos = 1;
9680 nargs = 4;
9681 nargs_constant = 1;
9682 break;
9683 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9684 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9685 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9686 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9687 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9688 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9689 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9690 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9691 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9692 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9693 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9694 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9695 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9696 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9697 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9698 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9699 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9700 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9701 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9702 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9703 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9704 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9705 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9706 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9707 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9708 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9709 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9710 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9711 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9712 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9713 nargs = 4;
9714 mask_pos = 2;
9715 nargs_constant = 1;
9716 break;
9717 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9718 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9719 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9720 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9721 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9722 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9723 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9724 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9725 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9726 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9727 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9728 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9729 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9730 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9731 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9732 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9733 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9734 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9735 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9736 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9737 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9738 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9739 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9740 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9741 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9742 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9743 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9744 nargs = 5;
9745 mask_pos = 2;
9746 nargs_constant = 1;
9747 break;
9748 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9749 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9750 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9751 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9752 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9753 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9754 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9755 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9756 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9757 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9758 nargs = 5;
9759 mask_pos = 1;
9760 nargs_constant = 1;
9761 break;
9762 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9763 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9764 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9765 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9766 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9767 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9768 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9769 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9770 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9771 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9772 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9773 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9774 nargs = 5;
9775 mask_pos = 1;
9776 nargs_constant = 2;
9777 break;
9778
9779 default:
9780 gcc_unreachable ();
9781 }
9782
9783 gcc_assert (nargs <= ARRAY_SIZE (xops));
9784
9785 if (comparison != UNKNOWN)
9786 {
9787 gcc_assert (nargs == 2);
9788 return ix86_expand_sse_compare (d, exp, target, swap);
9789 }
9790
9791 if (rmode == VOIDmode || rmode == tmode)
9792 {
9793 if (optimize
9794 || target == 0
9795 || GET_MODE (target) != tmode
9796 || !insn_p->operand[0].predicate (target, tmode))
9797 target = gen_reg_rtx (tmode);
9798 else if (memory_operand (target, tmode))
9799 num_memory++;
9800 real_target = target;
9801 }
9802 else
9803 {
9804 real_target = gen_reg_rtx (tmode);
9805 target = lowpart_subreg (rmode, real_target, tmode);
9806 }
9807
9808 for (i = 0; i < nargs; i++)
9809 {
9810 tree arg = CALL_EXPR_ARG (exp, i);
9811 rtx op = expand_normal (arg);
9812 machine_mode mode = insn_p->operand[i + 1].mode;
9813 bool match = insn_p->operand[i + 1].predicate (op, mode);
9814
9815 if (second_arg_count && i == 1)
9816 {
9817 /* SIMD shift insns take either an 8-bit immediate or a
9818 register as the count, but the builtin functions take an
9819 int.  If the count doesn't match, put it in a register.
9820 The instructions use a 64-bit count; if op is only
9821 32-bit, zero-extend it, since negative shift counts are
9822 undefined behavior and zero-extension is more
9823 efficient.  */
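/* Illustration only (not part of the original source): on a 32-bit
   count C the zero-extending conversion below behaves like the
   user-level expression

       (unsigned long long) (unsigned int) C

   A negative C is already undefined behavior for a shift, so assuming
   the upper 32 bits are zero loses nothing.  */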
9824 if (!match)
9825 {
9826 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9827 op = convert_modes (mode, GET_MODE (op), op, 1);
9828 else
9829 op = lowpart_subreg (mode, op, GET_MODE (op));
9830 if (!insn_p->operand[i + 1].predicate (op, mode))
9831 op = copy_to_reg (op);
9832 }
9833 }
9834 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9835 || (!mask_pos && (nargs - i) <= nargs_constant))
9836 {
9837 if (!match)
9838 switch (icode)
9839 {
9840 case CODE_FOR_avx_vinsertf128v4di:
9841 case CODE_FOR_avx_vextractf128v4di:
9842 error ("the last argument must be a 1-bit immediate");
9843 return const0_rtx;
9844
9845 case CODE_FOR_avx512f_cmpv8di3_mask:
9846 case CODE_FOR_avx512f_cmpv16si3_mask:
9847 case CODE_FOR_avx512f_ucmpv8di3_mask:
9848 case CODE_FOR_avx512f_ucmpv16si3_mask:
9849 case CODE_FOR_avx512vl_cmpv4di3_mask:
9850 case CODE_FOR_avx512vl_cmpv8si3_mask:
9851 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9852 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9853 case CODE_FOR_avx512vl_cmpv2di3_mask:
9854 case CODE_FOR_avx512vl_cmpv4si3_mask:
9855 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9856 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9857 error ("the last argument must be a 3-bit immediate");
9858 return const0_rtx;
9859
9860 case CODE_FOR_sse4_1_roundsd:
9861 case CODE_FOR_sse4_1_roundss:
9862
9863 case CODE_FOR_sse4_1_roundpd:
9864 case CODE_FOR_sse4_1_roundps:
9865 case CODE_FOR_avx_roundpd256:
9866 case CODE_FOR_avx_roundps256:
9867
9868 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9869 case CODE_FOR_sse4_1_roundps_sfix:
9870 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9871 case CODE_FOR_avx_roundps_sfix256:
9872
9873 case CODE_FOR_sse4_1_blendps:
9874 case CODE_FOR_avx_blendpd256:
9875 case CODE_FOR_avx_vpermilv4df:
9876 case CODE_FOR_avx_vpermilv4df_mask:
9877 case CODE_FOR_avx512f_getmantv8df_mask:
9878 case CODE_FOR_avx512f_getmantv16sf_mask:
9879 case CODE_FOR_avx512vl_getmantv8sf_mask:
9880 case CODE_FOR_avx512vl_getmantv4df_mask:
9881 case CODE_FOR_avx512vl_getmantv4sf_mask:
9882 case CODE_FOR_avx512vl_getmantv2df_mask:
9883 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9884 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9885 case CODE_FOR_avx512dq_rangepv4df_mask:
9886 case CODE_FOR_avx512dq_rangepv8sf_mask:
9887 case CODE_FOR_avx512dq_rangepv2df_mask:
9888 case CODE_FOR_avx512dq_rangepv4sf_mask:
9889 case CODE_FOR_avx_shufpd256_mask:
9890 error ("the last argument must be a 4-bit immediate");
9891 return const0_rtx;
9892
9893 case CODE_FOR_sha1rnds4:
9894 case CODE_FOR_sse4_1_blendpd:
9895 case CODE_FOR_avx_vpermilv2df:
9896 case CODE_FOR_avx_vpermilv2df_mask:
9897 case CODE_FOR_xop_vpermil2v2df3:
9898 case CODE_FOR_xop_vpermil2v4sf3:
9899 case CODE_FOR_xop_vpermil2v4df3:
9900 case CODE_FOR_xop_vpermil2v8sf3:
9901 case CODE_FOR_avx512f_vinsertf32x4_mask:
9902 case CODE_FOR_avx512f_vinserti32x4_mask:
9903 case CODE_FOR_avx512f_vextractf32x4_mask:
9904 case CODE_FOR_avx512f_vextracti32x4_mask:
9905 case CODE_FOR_sse2_shufpd:
9906 case CODE_FOR_sse2_shufpd_mask:
9907 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9908 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9909 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9910 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9911 error ("the last argument must be a 2-bit immediate");
9912 return const0_rtx;
9913
9914 case CODE_FOR_avx_vextractf128v4df:
9915 case CODE_FOR_avx_vextractf128v8sf:
9916 case CODE_FOR_avx_vextractf128v8si:
9917 case CODE_FOR_avx_vinsertf128v4df:
9918 case CODE_FOR_avx_vinsertf128v8sf:
9919 case CODE_FOR_avx_vinsertf128v8si:
9920 case CODE_FOR_avx512f_vinsertf64x4_mask:
9921 case CODE_FOR_avx512f_vinserti64x4_mask:
9922 case CODE_FOR_avx512f_vextractf64x4_mask:
9923 case CODE_FOR_avx512f_vextracti64x4_mask:
9924 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9925 case CODE_FOR_avx512dq_vinserti32x8_mask:
9926 case CODE_FOR_avx512vl_vinsertv4df:
9927 case CODE_FOR_avx512vl_vinsertv4di:
9928 case CODE_FOR_avx512vl_vinsertv8sf:
9929 case CODE_FOR_avx512vl_vinsertv8si:
9930 error ("the last argument must be a 1-bit immediate");
9931 return const0_rtx;
9932
9933 case CODE_FOR_avx_vmcmpv2df3:
9934 case CODE_FOR_avx_vmcmpv4sf3:
9935 case CODE_FOR_avx_cmpv2df3:
9936 case CODE_FOR_avx_cmpv4sf3:
9937 case CODE_FOR_avx_cmpv4df3:
9938 case CODE_FOR_avx_cmpv8sf3:
9939 case CODE_FOR_avx512f_cmpv8df3_mask:
9940 case CODE_FOR_avx512f_cmpv16sf3_mask:
9941 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9942 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9943 error ("the last argument must be a 5-bit immediate");
9944 return const0_rtx;
9945
9946 default:
9947 switch (nargs_constant)
9948 {
9949 case 2:
9950 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9951 || (!mask_pos && (nargs - i) == nargs_constant))
9952 {
9953 error ("the next to last argument must be an 8-bit immediate");
9954 break;
9955 }
9956 /* FALLTHRU */
9957 case 1:
9958 error ("the last argument must be an 8-bit immediate");
9959 break;
9960 default:
9961 gcc_unreachable ();
9962 }
9963 return const0_rtx;
9964 }
9965 }
9966 else
9967 {
9968 if (VECTOR_MODE_P (mode))
9969 op = safe_vector_operand (op, mode);
9970
9971 /* If we aren't optimizing, only allow one memory operand to
9972 be generated. */
9973 if (memory_operand (op, mode))
9974 num_memory++;
9975
9976 op = fixup_modeless_constant (op, mode);
9977
9978 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9979 {
9980 if (optimize || !match || num_memory > 1)
9981 op = copy_to_mode_reg (mode, op);
9982 }
9983 else
9984 {
9985 op = copy_to_reg (op);
9986 op = lowpart_subreg (mode, op, GET_MODE (op));
9987 }
9988 }
9989
9990 xops[i] = op;
9991 }
9992
9993 switch (nargs)
9994 {
9995 case 1:
9996 pat = GEN_FCN (icode) (real_target, xops[0]);
9997 break;
9998 case 2:
9999 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
10000 break;
10001 case 3:
10002 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
10003 break;
10004 case 4:
10005 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10006 xops[2], xops[3]);
10007 break;
10008 case 5:
10009 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10010 xops[2], xops[3], xops[4]);
10011 break;
10012 case 6:
10013 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
10014 xops[2], xops[3], xops[4], xops[5]);
10015 break;
10016 default:
10017 gcc_unreachable ();
10018 }
10019
10020 if (! pat)
10021 return 0;
10022
10023 emit_insn (pat);
10024 return target;
10025 }
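/* Usage sketch for the immediate checks above (illustrative, assuming the
   usual avxintrin.h mapping of _mm_cmp_sd to the avx_vmcmpv2df3 pattern):

       #include <immintrin.h>

       __m128d
       cmp_lt (__m128d a, __m128d b)
       {
         return _mm_cmp_sd (a, b, _CMP_LT_OS);  // predicate must be a
       }                                        // compile-time constant
                                                // in the 5-bit range [0, 31]

   A non-constant or out-of-range predicate reaches the
   "the last argument must be a 5-bit immediate" diagnostic above.  */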
10026
10027 /* Transform a pattern of the following layout:
10028 (set A
10029 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
10030 into:
10031 (set A B)
10032 i.e. drop the embedded rounding operand C.  */
10033
10034 static rtx
10035 ix86_erase_embedded_rounding (rtx pat)
10036 {
10037 if (GET_CODE (pat) == INSN)
10038 pat = PATTERN (pat);
10039
10040 gcc_assert (GET_CODE (pat) == SET);
10041 rtx src = SET_SRC (pat);
10042 gcc_assert (XVECLEN (src, 0) == 2);
10043 rtx p0 = XVECEXP (src, 0, 0);
10044 gcc_assert (GET_CODE (src) == UNSPEC
10045 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
10046 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
10047 return res;
10048 }
10049
10050 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
10051 with rounding. */
10052 static rtx
10053 ix86_expand_sse_comi_round (const struct builtin_description *d,
10054 tree exp, rtx target)
10055 {
10056 rtx pat, set_dst;
10057 tree arg0 = CALL_EXPR_ARG (exp, 0);
10058 tree arg1 = CALL_EXPR_ARG (exp, 1);
10059 tree arg2 = CALL_EXPR_ARG (exp, 2);
10060 tree arg3 = CALL_EXPR_ARG (exp, 3);
10061 rtx op0 = expand_normal (arg0);
10062 rtx op1 = expand_normal (arg1);
10063 rtx op2 = expand_normal (arg2);
10064 rtx op3 = expand_normal (arg3);
10065 enum insn_code icode = d->icode;
10066 const struct insn_data_d *insn_p = &insn_data[icode];
10067 machine_mode mode0 = insn_p->operand[0].mode;
10068 machine_mode mode1 = insn_p->operand[1].mode;
10069
10070 /* See avxintrin.h for values. */
10071 static const enum rtx_code comparisons[32] =
10072 {
10073 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10074 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
10075 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
10076 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
10077 };
10078 static const bool ordereds[32] =
10079 {
10080 true, true, true, false, false, false, false, true,
10081 false, false, false, true, true, true, true, false,
10082 true, true, true, false, false, false, false, true,
10083 false, false, false, true, true, true, true, false
10084 };
10085 static const bool non_signalings[32] =
10086 {
10087 true, false, false, true, true, false, false, true,
10088 true, false, false, true, true, false, false, true,
10089 false, true, true, false, false, true, true, false,
10090 false, true, true, false, false, true, true, false
10091 };
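/* Reference mapping for the tables above (illustrative, assuming the
   standard avxintrin.h predicate values):
     index 0  (_CMP_EQ_OQ)   -> EQ, ordered, quiet
     index 1  (_CMP_LT_OS)   -> LT, ordered, signaling
     index 3  (_CMP_UNORD_Q) -> UNORDERED, quiet
     index 4  (_CMP_NEQ_UQ)  -> NE, unordered, quiet
     index 7  (_CMP_ORD_Q)   -> ORDERED, quiet
   Indices 16-31 repeat the comparison and orderedness of 0-15 with the
   signaling sense inverted (e.g. _CMP_EQ_OS at 16 vs _CMP_EQ_OQ at 0).  */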
10092
10093 if (!CONST_INT_P (op2))
10094 {
10095 error ("the third argument must be a comparison constant");
10096 return const0_rtx;
10097 }
10098 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10099 {
10100 error ("incorrect comparison mode");
10101 return const0_rtx;
10102 }
10103
10104 if (!insn_p->operand[2].predicate (op3, SImode))
10105 {
10106 error ("incorrect rounding operand");
10107 return const0_rtx;
10108 }
10109
10110 if (VECTOR_MODE_P (mode0))
10111 op0 = safe_vector_operand (op0, mode0);
10112 if (VECTOR_MODE_P (mode1))
10113 op1 = safe_vector_operand (op1, mode1);
10114
10115 enum rtx_code comparison = comparisons[INTVAL (op2)];
10116 bool ordered = ordereds[INTVAL (op2)];
10117 bool non_signaling = non_signalings[INTVAL (op2)];
10118 rtx const_val = const0_rtx;
10119
10120 bool check_unordered = false;
10121 machine_mode mode = CCFPmode;
10122 switch (comparison)
10123 {
10124 case ORDERED:
10125 if (!ordered)
10126 {
10127 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10128 if (!non_signaling)
10129 ordered = true;
10130 mode = CCSmode;
10131 }
10132 else
10133 {
10134 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10135 if (non_signaling)
10136 ordered = false;
10137 mode = CCPmode;
10138 }
10139 comparison = NE;
10140 break;
10141 case UNORDERED:
10142 if (ordered)
10143 {
10144 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10145 if (non_signaling)
10146 ordered = false;
10147 mode = CCSmode;
10148 }
10149 else
10150 {
10151 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10152 if (!non_signaling)
10153 ordered = true;
10154 mode = CCPmode;
10155 }
10156 comparison = EQ;
10157 break;
10158
10159 case LE: /* -> GE */
10160 case LT: /* -> GT */
10161 case UNGE: /* -> UNLE */
10162 case UNGT: /* -> UNLT */
10163 std::swap (op0, op1);
10164 comparison = swap_condition (comparison);
10165 /* FALLTHRU */
10166 case GT:
10167 case GE:
10168 case UNEQ:
10169 case UNLT:
10170 case UNLE:
10171 case LTGT:
10172 /* These are supported by CCFPmode. NB: Use ordered/signaling
10173 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10174 with NAN operands. */
10175 if (ordered == non_signaling)
10176 ordered = !ordered;
10177 break;
10178 case EQ:
10179 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10180 _CMP_EQ_OQ/_CMP_EQ_OS. */
10181 check_unordered = true;
10182 mode = CCZmode;
10183 break;
10184 case NE:
10185 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10186 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10187 gcc_assert (!ordered);
10188 check_unordered = true;
10189 mode = CCZmode;
10190 const_val = const1_rtx;
10191 break;
10192 default:
10193 gcc_unreachable ();
10194 }
10195
10196 target = gen_reg_rtx (SImode);
10197 emit_move_insn (target, const_val);
10198 target = gen_rtx_SUBREG (QImode, target, 0);
10199
10200 if ((optimize && !register_operand (op0, mode0))
10201 || !insn_p->operand[0].predicate (op0, mode0))
10202 op0 = copy_to_mode_reg (mode0, op0);
10203 if ((optimize && !register_operand (op1, mode1))
10204 || !insn_p->operand[1].predicate (op1, mode1))
10205 op1 = copy_to_mode_reg (mode1, op1);
10206
10207 /*
10208 1. COMI: ordered and signaling.
10209 2. UCOMI: unordered and non-signaling.
10210 */
10211 if (non_signaling)
10212 icode = (icode == CODE_FOR_sse_comi_round
10213 ? CODE_FOR_sse_ucomi_round
10214 : CODE_FOR_sse2_ucomi_round);
10215
10216 pat = GEN_FCN (icode) (op0, op1, op3);
10217 if (! pat)
10218 return 0;
10219
10220 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10221 if (INTVAL (op3) == NO_ROUND)
10222 {
10223 pat = ix86_erase_embedded_rounding (pat);
10224 if (! pat)
10225 return 0;
10226
10227 set_dst = SET_DEST (pat);
10228 }
10229 else
10230 {
10231 gcc_assert (GET_CODE (pat) == SET);
10232 set_dst = SET_DEST (pat);
10233 }
10234
10235 emit_insn (pat);
10236
10237 rtx_code_label *label = NULL;
10238
10239 /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
10240 with NAN operands. */
10241 if (check_unordered)
10242 {
10243 gcc_assert (comparison == EQ || comparison == NE);
10244
10245 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10246 label = gen_label_rtx ();
10247 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10248 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10249 gen_rtx_LABEL_REF (VOIDmode, label),
10250 pc_rtx);
10251 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10252 }
10253
10254 /* NB: Set CCFPmode and check a different CCmode which is in subset
10255 of CCFPmode. */
10256 if (GET_MODE (set_dst) != mode)
10257 {
10258 gcc_assert (mode == CCAmode || mode == CCCmode
10259 || mode == CCOmode || mode == CCPmode
10260 || mode == CCSmode || mode == CCZmode);
10261 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10262 }
10263
10264 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10265 gen_rtx_fmt_ee (comparison, QImode,
10266 set_dst,
10267 const0_rtx)));
10268
10269 if (label)
10270 emit_label (label);
10271
10272 return SUBREG_REG (target);
10273 }
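/* Why the extra unordered check above matters, sketched in plain C
   (illustration only, not part of the original source).  The scalar
   semantics the builtin must reproduce are:

       int eq (double a, double b) { return a == b; }   // NaN operand -> 0
       int ne (double a, double b) { return a != b; }   // NaN operand -> 1

   COMI/UCOMI set ZF (along with PF and CF) when an operand is a NaN, so
   testing ZF alone would claim equality; the jump on the unordered flag
   keeps the preloaded 0 (for EQ) or 1 (for NE) instead.  */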
10274
10275 static rtx
10276 ix86_expand_round_builtin (const struct builtin_description *d,
10277 tree exp, rtx target)
10278 {
10279 rtx pat;
10280 unsigned int i, nargs;
10281 rtx xops[6];
10282 enum insn_code icode = d->icode;
10283 const struct insn_data_d *insn_p = &insn_data[icode];
10284 machine_mode tmode = insn_p->operand[0].mode;
10285 unsigned int nargs_constant = 0;
10286 unsigned int redundant_embed_rnd = 0;
10287
10288 switch ((enum ix86_builtin_func_type) d->flag)
10289 {
10290 case UINT64_FTYPE_V2DF_INT:
10291 case UINT64_FTYPE_V4SF_INT:
10292 case UINT_FTYPE_V2DF_INT:
10293 case UINT_FTYPE_V4SF_INT:
10294 case INT64_FTYPE_V2DF_INT:
10295 case INT64_FTYPE_V4SF_INT:
10296 case INT_FTYPE_V2DF_INT:
10297 case INT_FTYPE_V4SF_INT:
10298 nargs = 2;
10299 break;
10300 case V4SF_FTYPE_V4SF_UINT_INT:
10301 case V4SF_FTYPE_V4SF_UINT64_INT:
10302 case V2DF_FTYPE_V2DF_UINT64_INT:
10303 case V4SF_FTYPE_V4SF_INT_INT:
10304 case V4SF_FTYPE_V4SF_INT64_INT:
10305 case V2DF_FTYPE_V2DF_INT64_INT:
10306 case V4SF_FTYPE_V4SF_V4SF_INT:
10307 case V2DF_FTYPE_V2DF_V2DF_INT:
10308 case V4SF_FTYPE_V4SF_V2DF_INT:
10309 case V2DF_FTYPE_V2DF_V4SF_INT:
10310 nargs = 3;
10311 break;
10312 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10313 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10314 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10315 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10316 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10317 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10318 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10319 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10320 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10321 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10322 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10323 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10324 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10325 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10326 nargs = 4;
10327 break;
10328 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10329 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10330 nargs_constant = 2;
10331 nargs = 4;
10332 break;
10333 case INT_FTYPE_V4SF_V4SF_INT_INT:
10334 case INT_FTYPE_V2DF_V2DF_INT_INT:
10335 return ix86_expand_sse_comi_round (d, exp, target);
10336 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10337 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10338 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10339 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10340 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10341 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10342 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
10343 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10344 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10345 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
10346 nargs = 5;
10347 break;
10348 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10349 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10350 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
10351 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
10352 nargs_constant = 4;
10353 nargs = 5;
10354 break;
10355 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10356 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10357 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10358 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10359 nargs_constant = 3;
10360 nargs = 5;
10361 break;
10362 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10363 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10364 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10365 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10366 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10367 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10368 nargs = 6;
10369 nargs_constant = 4;
10370 break;
10371 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10372 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10373 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10374 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10375 nargs = 6;
10376 nargs_constant = 3;
10377 break;
10378 default:
10379 gcc_unreachable ();
10380 }
10381 gcc_assert (nargs <= ARRAY_SIZE (xops));
10382
10383 if (optimize
10384 || target == 0
10385 || GET_MODE (target) != tmode
10386 || !insn_p->operand[0].predicate (target, tmode))
10387 target = gen_reg_rtx (tmode);
10388
10389 for (i = 0; i < nargs; i++)
10390 {
10391 tree arg = CALL_EXPR_ARG (exp, i);
10392 rtx op = expand_normal (arg);
10393 machine_mode mode = insn_p->operand[i + 1].mode;
10394 bool match = insn_p->operand[i + 1].predicate (op, mode);
10395
10396 if (i == nargs - nargs_constant)
10397 {
10398 if (!match)
10399 {
10400 switch (icode)
10401 {
10402 case CODE_FOR_avx512f_getmantv8df_mask_round:
10403 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10404 case CODE_FOR_avx512f_vgetmantv2df_round:
10405 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10406 case CODE_FOR_avx512f_vgetmantv4sf_round:
10407 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10408 error ("the immediate argument must be a 4-bit immediate");
10409 return const0_rtx;
10410 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10411 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10412 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10413 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10414 error ("the immediate argument must be a 5-bit immediate");
10415 return const0_rtx;
10416 default:
10417 error ("the immediate argument must be an 8-bit immediate");
10418 return const0_rtx;
10419 }
10420 }
10421 }
10422 else if (i == nargs - 1)
10423 {
10424 if (!insn_p->operand[nargs].predicate (op, SImode))
10425 {
10426 error ("incorrect rounding operand");
10427 return const0_rtx;
10428 }
10429
10430 /* If there is no rounding, use the normal version of the pattern.  */
10431 if (INTVAL (op) == NO_ROUND)
10432 redundant_embed_rnd = 1;
10433 }
10434 else
10435 {
10436 if (VECTOR_MODE_P (mode))
10437 op = safe_vector_operand (op, mode);
10438
10439 op = fixup_modeless_constant (op, mode);
10440
10441 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10442 {
10443 if (optimize || !match)
10444 op = copy_to_mode_reg (mode, op);
10445 }
10446 else
10447 {
10448 op = copy_to_reg (op);
10449 op = lowpart_subreg (mode, op, GET_MODE (op));
10450 }
10451 }
10452
10453 xops[i] = op;
10454 }
10455
10456 switch (nargs)
10457 {
10458 case 1:
10459 pat = GEN_FCN (icode) (target, xops[0]);
10460 break;
10461 case 2:
10462 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10463 break;
10464 case 3:
10465 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10466 break;
10467 case 4:
10468 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10469 xops[2], xops[3]);
10470 break;
10471 case 5:
10472 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10473 xops[2], xops[3], xops[4]);
10474 break;
10475 case 6:
10476 pat = GEN_FCN (icode) (target, xops[0], xops[1],
10477 xops[2], xops[3], xops[4], xops[5]);
10478 break;
10479 default:
10480 gcc_unreachable ();
10481 }
10482
10483 if (!pat)
10484 return 0;
10485
10486 if (redundant_embed_rnd)
10487 pat = ix86_erase_embedded_rounding (pat);
10488
10489 emit_insn (pat);
10490 return target;
10491 }
10492
10493 /* Subroutine of ix86_expand_builtin to take care of special insns
10494 with variable number of operands. */
10495
10496 static rtx
10497 ix86_expand_special_args_builtin (const struct builtin_description *d,
10498 tree exp, rtx target)
10499 {
10500 tree arg;
10501 rtx pat, op;
10502 unsigned int i, nargs, arg_adjust, memory;
10503 bool aligned_mem = false;
10504 rtx xops[3];
10505 enum insn_code icode = d->icode;
10506 const struct insn_data_d *insn_p = &insn_data[icode];
10507 machine_mode tmode = insn_p->operand[0].mode;
10508 enum { load, store } klass;
10509
10510 switch ((enum ix86_builtin_func_type) d->flag)
10511 {
10512 case VOID_FTYPE_VOID:
10513 emit_insn (GEN_FCN (icode) (target));
10514 return 0;
10515 case VOID_FTYPE_UINT64:
10516 case VOID_FTYPE_UNSIGNED:
10517 nargs = 0;
10518 klass = store;
10519 memory = 0;
10520 break;
10521
10522 case INT_FTYPE_VOID:
10523 case USHORT_FTYPE_VOID:
10524 case UINT64_FTYPE_VOID:
10525 case UINT_FTYPE_VOID:
10526 case UINT8_FTYPE_VOID:
10527 case UNSIGNED_FTYPE_VOID:
10528 nargs = 0;
10529 klass = load;
10530 memory = 0;
10531 break;
10532 case UINT64_FTYPE_PUNSIGNED:
10533 case V2DI_FTYPE_PV2DI:
10534 case V4DI_FTYPE_PV4DI:
10535 case V32QI_FTYPE_PCCHAR:
10536 case V16QI_FTYPE_PCCHAR:
10537 case V8SF_FTYPE_PCV4SF:
10538 case V8SF_FTYPE_PCFLOAT:
10539 case V4SF_FTYPE_PCFLOAT:
10540 case V4DF_FTYPE_PCV2DF:
10541 case V4DF_FTYPE_PCDOUBLE:
10542 case V2DF_FTYPE_PCDOUBLE:
10543 case VOID_FTYPE_PVOID:
10544 case V8DI_FTYPE_PV8DI:
10545 nargs = 1;
10546 klass = load;
10547 memory = 0;
10548 switch (icode)
10549 {
10550 case CODE_FOR_sse4_1_movntdqa:
10551 case CODE_FOR_avx2_movntdqa:
10552 case CODE_FOR_avx512f_movntdqa:
10553 aligned_mem = true;
10554 break;
10555 default:
10556 break;
10557 }
10558 break;
10559 case VOID_FTYPE_PV2SF_V4SF:
10560 case VOID_FTYPE_PV8DI_V8DI:
10561 case VOID_FTYPE_PV4DI_V4DI:
10562 case VOID_FTYPE_PV2DI_V2DI:
10563 case VOID_FTYPE_PCHAR_V32QI:
10564 case VOID_FTYPE_PCHAR_V16QI:
10565 case VOID_FTYPE_PFLOAT_V16SF:
10566 case VOID_FTYPE_PFLOAT_V8SF:
10567 case VOID_FTYPE_PFLOAT_V4SF:
10568 case VOID_FTYPE_PDOUBLE_V8DF:
10569 case VOID_FTYPE_PDOUBLE_V4DF:
10570 case VOID_FTYPE_PDOUBLE_V2DF:
10571 case VOID_FTYPE_PLONGLONG_LONGLONG:
10572 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10573 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10574 case VOID_FTYPE_PINT_INT:
10575 nargs = 1;
10576 klass = store;
10577 /* Reserve memory operand for target. */
10578 memory = ARRAY_SIZE (xops);
10579 switch (icode)
10580 {
10581 /* These builtins and instructions require the memory
10582 to be properly aligned. */
10583 case CODE_FOR_avx_movntv4di:
10584 case CODE_FOR_sse2_movntv2di:
10585 case CODE_FOR_avx_movntv8sf:
10586 case CODE_FOR_sse_movntv4sf:
10587 case CODE_FOR_sse4a_vmmovntv4sf:
10588 case CODE_FOR_avx_movntv4df:
10589 case CODE_FOR_sse2_movntv2df:
10590 case CODE_FOR_sse4a_vmmovntv2df:
10591 case CODE_FOR_sse2_movntidi:
10592 case CODE_FOR_sse_movntq:
10593 case CODE_FOR_sse2_movntisi:
10594 case CODE_FOR_avx512f_movntv16sf:
10595 case CODE_FOR_avx512f_movntv8df:
10596 case CODE_FOR_avx512f_movntv8di:
10597 aligned_mem = true;
10598 break;
10599 default:
10600 break;
10601 }
10602 break;
10603 case VOID_FTYPE_PVOID_PCVOID:
10604 nargs = 1;
10605 klass = store;
10606 memory = 0;
10607
10608 break;
10609 case V4SF_FTYPE_V4SF_PCV2SF:
10610 case V2DF_FTYPE_V2DF_PCDOUBLE:
10611 nargs = 2;
10612 klass = load;
10613 memory = 1;
10614 break;
10615 case V8SF_FTYPE_PCV8SF_V8SI:
10616 case V4DF_FTYPE_PCV4DF_V4DI:
10617 case V4SF_FTYPE_PCV4SF_V4SI:
10618 case V2DF_FTYPE_PCV2DF_V2DI:
10619 case V8SI_FTYPE_PCV8SI_V8SI:
10620 case V4DI_FTYPE_PCV4DI_V4DI:
10621 case V4SI_FTYPE_PCV4SI_V4SI:
10622 case V2DI_FTYPE_PCV2DI_V2DI:
10623 case VOID_FTYPE_INT_INT64:
10624 nargs = 2;
10625 klass = load;
10626 memory = 0;
10627 break;
10628 case VOID_FTYPE_PV8DF_V8DF_UQI:
10629 case VOID_FTYPE_PV4DF_V4DF_UQI:
10630 case VOID_FTYPE_PV2DF_V2DF_UQI:
10631 case VOID_FTYPE_PV16SF_V16SF_UHI:
10632 case VOID_FTYPE_PV8SF_V8SF_UQI:
10633 case VOID_FTYPE_PV4SF_V4SF_UQI:
10634 case VOID_FTYPE_PV8DI_V8DI_UQI:
10635 case VOID_FTYPE_PV4DI_V4DI_UQI:
10636 case VOID_FTYPE_PV2DI_V2DI_UQI:
10637 case VOID_FTYPE_PV16SI_V16SI_UHI:
10638 case VOID_FTYPE_PV8SI_V8SI_UQI:
10639 case VOID_FTYPE_PV4SI_V4SI_UQI:
10640 case VOID_FTYPE_PV64QI_V64QI_UDI:
10641 case VOID_FTYPE_PV32HI_V32HI_USI:
10642 case VOID_FTYPE_PV32QI_V32QI_USI:
10643 case VOID_FTYPE_PV16QI_V16QI_UHI:
10644 case VOID_FTYPE_PV16HI_V16HI_UHI:
10645 case VOID_FTYPE_PV8HI_V8HI_UQI:
10646 switch (icode)
10647 {
10648 /* These builtins and instructions require the memory
10649 to be properly aligned. */
10650 case CODE_FOR_avx512f_storev16sf_mask:
10651 case CODE_FOR_avx512f_storev16si_mask:
10652 case CODE_FOR_avx512f_storev8df_mask:
10653 case CODE_FOR_avx512f_storev8di_mask:
10654 case CODE_FOR_avx512vl_storev8sf_mask:
10655 case CODE_FOR_avx512vl_storev8si_mask:
10656 case CODE_FOR_avx512vl_storev4df_mask:
10657 case CODE_FOR_avx512vl_storev4di_mask:
10658 case CODE_FOR_avx512vl_storev4sf_mask:
10659 case CODE_FOR_avx512vl_storev4si_mask:
10660 case CODE_FOR_avx512vl_storev2df_mask:
10661 case CODE_FOR_avx512vl_storev2di_mask:
10662 aligned_mem = true;
10663 break;
10664 default:
10665 break;
10666 }
10667 /* FALLTHRU */
10668 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10669 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10670 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10671 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10672 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10673 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10674 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10675 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10676 case VOID_FTYPE_PV8SI_V8DI_UQI:
10677 case VOID_FTYPE_PV8HI_V8DI_UQI:
10678 case VOID_FTYPE_PV16HI_V16SI_UHI:
10679 case VOID_FTYPE_PUDI_V8DI_UQI:
10680 case VOID_FTYPE_PV16QI_V16SI_UHI:
10681 case VOID_FTYPE_PV4SI_V4DI_UQI:
10682 case VOID_FTYPE_PUDI_V2DI_UQI:
10683 case VOID_FTYPE_PUDI_V4DI_UQI:
10684 case VOID_FTYPE_PUSI_V2DI_UQI:
10685 case VOID_FTYPE_PV8HI_V8SI_UQI:
10686 case VOID_FTYPE_PUDI_V4SI_UQI:
10687 case VOID_FTYPE_PUSI_V4DI_UQI:
10688 case VOID_FTYPE_PUHI_V2DI_UQI:
10689 case VOID_FTYPE_PUDI_V8SI_UQI:
10690 case VOID_FTYPE_PUSI_V4SI_UQI:
10691 case VOID_FTYPE_PCHAR_V64QI_UDI:
10692 case VOID_FTYPE_PCHAR_V32QI_USI:
10693 case VOID_FTYPE_PCHAR_V16QI_UHI:
10694 case VOID_FTYPE_PSHORT_V32HI_USI:
10695 case VOID_FTYPE_PSHORT_V16HI_UHI:
10696 case VOID_FTYPE_PSHORT_V8HI_UQI:
10697 case VOID_FTYPE_PINT_V16SI_UHI:
10698 case VOID_FTYPE_PINT_V8SI_UQI:
10699 case VOID_FTYPE_PINT_V4SI_UQI:
10700 case VOID_FTYPE_PINT64_V8DI_UQI:
10701 case VOID_FTYPE_PINT64_V4DI_UQI:
10702 case VOID_FTYPE_PINT64_V2DI_UQI:
10703 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10704 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10705 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10706 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10707 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10708 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10709 case VOID_FTYPE_PV32QI_V32HI_USI:
10710 case VOID_FTYPE_PV16QI_V16HI_UHI:
10711 case VOID_FTYPE_PUDI_V8HI_UQI:
10712 nargs = 2;
10713 klass = store;
10714 /* Reserve memory operand for target. */
10715 memory = ARRAY_SIZE (xops);
10716 break;
10717 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10718 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10719 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10720 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10721 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10722 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10723 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10724 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10725 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10726 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10727 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10728 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10729 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10730 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10731 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10732 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10733 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10734 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10735 switch (icode)
10736 {
10737 /* These builtins and instructions require the memory
10738 to be properly aligned. */
10739 case CODE_FOR_avx512f_loadv16sf_mask:
10740 case CODE_FOR_avx512f_loadv16si_mask:
10741 case CODE_FOR_avx512f_loadv8df_mask:
10742 case CODE_FOR_avx512f_loadv8di_mask:
10743 case CODE_FOR_avx512vl_loadv8sf_mask:
10744 case CODE_FOR_avx512vl_loadv8si_mask:
10745 case CODE_FOR_avx512vl_loadv4df_mask:
10746 case CODE_FOR_avx512vl_loadv4di_mask:
10747 case CODE_FOR_avx512vl_loadv4sf_mask:
10748 case CODE_FOR_avx512vl_loadv4si_mask:
10749 case CODE_FOR_avx512vl_loadv2df_mask:
10750 case CODE_FOR_avx512vl_loadv2di_mask:
10751 case CODE_FOR_avx512bw_loadv64qi_mask:
10752 case CODE_FOR_avx512vl_loadv32qi_mask:
10753 case CODE_FOR_avx512vl_loadv16qi_mask:
10754 case CODE_FOR_avx512bw_loadv32hi_mask:
10755 case CODE_FOR_avx512vl_loadv16hi_mask:
10756 case CODE_FOR_avx512vl_loadv8hi_mask:
10757 aligned_mem = true;
10758 break;
10759 default:
10760 break;
10761 }
10762 /* FALLTHRU */
10763 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10764 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10765 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10766 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10767 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10768 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10769 case V16SI_FTYPE_PCINT_V16SI_UHI:
10770 case V8SI_FTYPE_PCINT_V8SI_UQI:
10771 case V4SI_FTYPE_PCINT_V4SI_UQI:
10772 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10773 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10774 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10775 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10776 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10777 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10778 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10779 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10780 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10781 nargs = 3;
10782 klass = load;
10783 memory = 0;
10784 break;
10785 default:
10786 gcc_unreachable ();
10787 }
10788
10789 gcc_assert (nargs <= ARRAY_SIZE (xops));
10790
10791 if (klass == store)
10792 {
10793 arg = CALL_EXPR_ARG (exp, 0);
10794 op = expand_normal (arg);
10795 gcc_assert (target == 0);
10796 if (memory)
10797 {
10798 op = ix86_zero_extend_to_Pmode (op);
10799 target = gen_rtx_MEM (tmode, op);
10800 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10801 on it. Try to improve it using get_pointer_alignment,
10802 and if the special builtin is one that requires strict
10803 mode alignment, also from its GET_MODE_ALIGNMENT.
10804 Failure to do so could lead to ix86_legitimate_combined_insn
10805 rejecting all changes to such insns. */
10806 unsigned int align = get_pointer_alignment (arg);
10807 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10808 align = GET_MODE_ALIGNMENT (tmode);
10809 if (MEM_ALIGN (target) < align)
10810 set_mem_align (target, align);
10811 }
10812 else
10813 target = force_reg (tmode, op);
10814 arg_adjust = 1;
10815 }
10816 else
10817 {
10818 arg_adjust = 0;
10819 if (optimize
10820 || target == 0
10821 || !register_operand (target, tmode)
10822 || GET_MODE (target) != tmode)
10823 target = gen_reg_rtx (tmode);
10824 }
10825
10826 for (i = 0; i < nargs; i++)
10827 {
10828 machine_mode mode = insn_p->operand[i + 1].mode;
10829
10830 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10831 op = expand_normal (arg);
10832
10833 if (i == memory)
10834 {
10835 /* This must be the memory operand. */
10836 op = ix86_zero_extend_to_Pmode (op);
10837 op = gen_rtx_MEM (mode, op);
10838 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10839 on it. Try to improve it using get_pointer_alignment,
10840 and if the special builtin is one that requires strict
10841 mode alignment, also from its GET_MODE_ALIGNMENT.
10842 Failure to do so could lead to ix86_legitimate_combined_insn
10843 rejecting all changes to such insns. */
10844 unsigned int align = get_pointer_alignment (arg);
10845 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10846 align = GET_MODE_ALIGNMENT (mode);
10847 if (MEM_ALIGN (op) < align)
10848 set_mem_align (op, align);
10849 }
10850 else
10851 {
10852 /* This must be a register.  */
10853 if (VECTOR_MODE_P (mode))
10854 op = safe_vector_operand (op, mode);
10855
10856 op = fixup_modeless_constant (op, mode);
10857
10858 /* NB: A 3-operand load implies it is a mask load, and
10859 the mask operand should be the last one.
10860 Keep an all-ones mask; it will be simplified by the expander. */
10861 if (nargs == 3 && i == 2 && klass == load
10862 && constm1_operand (op, mode))
10863 ;
10864 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10865 op = copy_to_mode_reg (mode, op);
10866 else
10867 {
10868 op = copy_to_reg (op);
10869 op = lowpart_subreg (mode, op, GET_MODE (op));
10870 }
10871 }
10872
10873 xops[i] = op;
10874 }
10875
10876 switch (nargs)
10877 {
10878 case 0:
10879 pat = GEN_FCN (icode) (target);
10880 break;
10881 case 1:
10882 pat = GEN_FCN (icode) (target, xops[0]);
10883 break;
10884 case 2:
10885 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
10886 break;
10887 case 3:
10888 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
10889 break;
10890 default:
10891 gcc_unreachable ();
10892 }
10893
10894 if (! pat)
10895 return 0;
10896
10897 emit_insn (pat);
10898 return klass == store ? 0 : target;
10899 }
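/* Usage sketch for the aligned_mem handling above (illustrative, assuming
   the usual emmintrin.h mapping of _mm_stream_si128 to sse2_movntv2di):

       #include <emmintrin.h>

       void
       save (__m128i *p, __m128i v)   // p must be 16-byte aligned
       {
         _mm_stream_si128 (p, v);
       }

   For such non-temporal stores the expander raises MEM_ALIGN on the
   destination to GET_MODE_ALIGNMENT so that later passes do not reject
   the insn.  */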
10900
10901 /* Return the integer constant in ARG. Constrain it to be in the range
10902 of the subparts of VEC_TYPE; issue an error if not. */
10903
10904 static int
10905 get_element_number (tree vec_type, tree arg)
10906 {
10907 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10908
10909 if (!tree_fits_uhwi_p (arg)
10910 || (elt = tree_to_uhwi (arg), elt > max))
10911 {
10912 error ("selector must be an integer constant in the range "
10913 "[0, %wi]", max);
10914 return 0;
10915 }
10916
10917 return elt;
10918 }
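/* Example of the range check above (derived from the code, for
   illustration): a vector type with four subparts, e.g. V4SF, accepts
   selectors 0..3 only, so

       selector must be an integer constant in the range [0, 3]

   is reported for a selector of 4 or for one that is not a compile-time
   constant.  */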
10919
10920 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10921 ix86_expand_vector_init. We DO have language-level syntax for this, in
10922 the form of (type){ init-list }. Except that since we can't place emms
10923 instructions from inside the compiler, we can't allow the use of MMX
10924 registers unless the user explicitly asks for it. So we do *not* define
10925 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10926 we have builtins invoked by mmintrin.h that give us license to emit
10927 these sorts of instructions. */
10928
10929 static rtx
10930 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10931 {
10932 machine_mode tmode = TYPE_MODE (type);
10933 machine_mode inner_mode = GET_MODE_INNER (tmode);
10934 int i, n_elt = GET_MODE_NUNITS (tmode);
10935 rtvec v = rtvec_alloc (n_elt);
10936
10937 gcc_assert (VECTOR_MODE_P (tmode));
10938 gcc_assert (call_expr_nargs (exp) == n_elt);
10939
10940 for (i = 0; i < n_elt; ++i)
10941 {
10942 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10943 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10944 }
10945
10946 if (!target || !register_operand (target, tmode))
10947 target = gen_reg_rtx (tmode);
10948
10949 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10950 return target;
10951 }
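/* Usage sketch (illustrative, assuming the usual mmintrin.h mapping): an
   MMX initializer such as

       #include <mmintrin.h>
       __m64 make (int a, int b) { return _mm_set_pi32 (b, a); }

   reaches this expander through a vec_init builtin instead of the
   (type){ ... } syntax, for the reasons given in the comment above.  */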
10952
10953 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10954 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10955 had a language-level syntax for referencing vector elements. */
10956
10957 static rtx
10958 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10959 {
10960 machine_mode tmode, mode0;
10961 tree arg0, arg1;
10962 int elt;
10963 rtx op0;
10964
10965 arg0 = CALL_EXPR_ARG (exp, 0);
10966 arg1 = CALL_EXPR_ARG (exp, 1);
10967
10968 op0 = expand_normal (arg0);
10969 elt = get_element_number (TREE_TYPE (arg0), arg1);
10970
10971 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10972 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10973 gcc_assert (VECTOR_MODE_P (mode0));
10974
10975 op0 = force_reg (mode0, op0);
10976
10977 if (optimize || !target || !register_operand (target, tmode))
10978 target = gen_reg_rtx (tmode);
10979
10980 ix86_expand_vector_extract (true, target, op0, elt);
10981
10982 return target;
10983 }
10984
10985 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10986 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10987 a language-level syntax for referencing vector elements. */
10988
10989 static rtx
10990 ix86_expand_vec_set_builtin (tree exp)
10991 {
10992 machine_mode tmode, mode1;
10993 tree arg0, arg1, arg2;
10994 int elt;
10995 rtx op0, op1, target;
10996
10997 arg0 = CALL_EXPR_ARG (exp, 0);
10998 arg1 = CALL_EXPR_ARG (exp, 1);
10999 arg2 = CALL_EXPR_ARG (exp, 2);
11000
11001 tmode = TYPE_MODE (TREE_TYPE (arg0));
11002 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
11003 gcc_assert (VECTOR_MODE_P (tmode));
11004
11005 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
11006 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
11007 elt = get_element_number (TREE_TYPE (arg0), arg2);
11008
11009 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
11010 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
11011
11012 op0 = force_reg (tmode, op0);
11013 op1 = force_reg (mode1, op1);
11014
11015 /* OP0 is the source of these builtin functions and shouldn't be
11016 modified. Create a copy, use it and return it as target. */
11017 target = gen_reg_rtx (tmode);
11018 emit_move_insn (target, op0);
11019 ix86_expand_vector_set (true, target, op1, elt);
11020
11021 return target;
11022 }
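/* Usage sketch (illustrative, assuming the usual xmmintrin.h mapping):

       #include <xmmintrin.h>
       __m64 put (__m64 v) { return _mm_insert_pi16 (v, 99, 2); }

   expands through a vec_set builtin; V itself is left unmodified because
   the expander copies OP0 into a fresh register first.  */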
11023
11024 /* Expand an expression EXP that calls a built-in function,
11025 with result going to TARGET if that's convenient
11026 (and in mode MODE if that's convenient).
11027 SUBTARGET may be used as the target for computing one of EXP's operands.
11028 IGNORE is nonzero if the value is to be ignored. */
11029
11030 rtx
11031 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
11032 machine_mode mode, int ignore)
11033 {
11034 size_t i;
11035 enum insn_code icode, icode2;
11036 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
11037 tree arg0, arg1, arg2, arg3, arg4;
11038 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
11039 machine_mode mode0, mode1, mode2, mode3, mode4;
11040 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
11041
11042 /* For CPU builtins that can be folded, fold first and expand the fold. */
11043 switch (fcode)
11044 {
11045 case IX86_BUILTIN_CPU_INIT:
11046 {
11047 /* Make it call __cpu_indicator_init in libgcc. */
11048 tree call_expr, fndecl, type;
11049 type = build_function_type_list (integer_type_node, NULL_TREE);
11050 fndecl = build_fn_decl ("__cpu_indicator_init", type);
11051 call_expr = build_call_expr (fndecl, 0);
11052 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
11053 }
11054 case IX86_BUILTIN_CPU_IS:
11055 case IX86_BUILTIN_CPU_SUPPORTS:
11056 {
11057 tree arg0 = CALL_EXPR_ARG (exp, 0);
11058 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11059 gcc_assert (fold_expr != NULL_TREE);
11060 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11061 }
11062 }
11063
11064 HOST_WIDE_INT isa = ix86_isa_flags;
11065 HOST_WIDE_INT isa2 = ix86_isa_flags2;
11066 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11067 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11068 /* The general case is we require all the ISAs specified in bisa{,2}
11069 to be enabled.
11070 The exceptions are:
11071 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11072 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11073 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11074 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
11075 OPTION_MASK_ISA2_AVXVNNI
11076 where for each such pair it is enough that either of the two ISAs is
11077 enabled; any other options ORed into bisa must still all be enabled.
11078 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
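/* Concrete example of the pair rule (the FMA | FMA4 case is handled a few
   lines below): a builtin recorded with
   bisa = OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, compiled with only
   -mfma4, fails the plain (bisa & isa) == bisa test, but because
   (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0 the code
   below ORs both bits into isa, so the requirement is satisfied.  */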
11079 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11080 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11081 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11082 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
11083
11084 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11085 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11086 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11087 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
11088
11089 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11090 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11091 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11092 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
11093
11094 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11095 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11096 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
11097 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11098 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
11099 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
11100 {
11101 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
11102 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
11103 }
11104
11105 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
11106 /* __builtin_ia32_maskmovq requires MMX registers. */
11107 && fcode != IX86_BUILTIN_MASKMOVQ)
11108 {
11109 bisa &= ~OPTION_MASK_ISA_MMX;
11110 bisa |= OPTION_MASK_ISA_SSE2;
11111 }
11112
11113 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11114 {
11115 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11116 if (TARGET_ABI_X32)
11117 bisa |= OPTION_MASK_ABI_X32;
11118 else
11119 bisa |= OPTION_MASK_ABI_64;
11120 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11121 (enum fpmath_unit) 0,
11122 (enum prefer_vector_width) 0,
11123 false, add_abi_p);
11124 if (!opts)
11125 error ("%qE needs unknown isa option", fndecl);
11126 else
11127 {
11128 gcc_assert (opts != NULL);
11129 error ("%qE needs isa option %s", fndecl, opts);
11130 free (opts);
11131 }
11132 return expand_call (exp, target, ignore);
11133 }
11134
11135 switch (fcode)
11136 {
11137 case IX86_BUILTIN_MASKMOVQ:
11138 case IX86_BUILTIN_MASKMOVDQU:
11139 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11140 ? CODE_FOR_mmx_maskmovq
11141 : CODE_FOR_sse2_maskmovdqu);
11142 /* Note the arg order is different from the operand order. */
11143 arg1 = CALL_EXPR_ARG (exp, 0);
11144 arg2 = CALL_EXPR_ARG (exp, 1);
11145 arg0 = CALL_EXPR_ARG (exp, 2);
11146 op0 = expand_normal (arg0);
11147 op1 = expand_normal (arg1);
11148 op2 = expand_normal (arg2);
11149 mode0 = insn_data[icode].operand[0].mode;
11150 mode1 = insn_data[icode].operand[1].mode;
11151 mode2 = insn_data[icode].operand[2].mode;
11152
11153 op0 = ix86_zero_extend_to_Pmode (op0);
11154 op0 = gen_rtx_MEM (mode1, op0);
11155
11156 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11157 op0 = copy_to_mode_reg (mode0, op0);
11158 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11159 op1 = copy_to_mode_reg (mode1, op1);
11160 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11161 op2 = copy_to_mode_reg (mode2, op2);
11162 pat = GEN_FCN (icode) (op0, op1, op2);
11163 if (! pat)
11164 return 0;
11165 emit_insn (pat);
11166 return 0;
11167
11168 case IX86_BUILTIN_LDMXCSR:
11169 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11170 target = assign_386_stack_local (SImode, SLOT_TEMP);
11171 emit_move_insn (target, op0);
11172 emit_insn (gen_sse_ldmxcsr (target));
11173 return 0;
11174
11175 case IX86_BUILTIN_STMXCSR:
11176 target = assign_386_stack_local (SImode, SLOT_TEMP);
11177 emit_insn (gen_sse_stmxcsr (target));
11178 return copy_to_mode_reg (SImode, target);
11179
11180 case IX86_BUILTIN_CLFLUSH:
11181 arg0 = CALL_EXPR_ARG (exp, 0);
11182 op0 = expand_normal (arg0);
11183 icode = CODE_FOR_sse2_clflush;
11184 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11185 op0 = ix86_zero_extend_to_Pmode (op0);
11186
11187 emit_insn (gen_sse2_clflush (op0));
11188 return 0;
11189
11190 case IX86_BUILTIN_CLWB:
11191 arg0 = CALL_EXPR_ARG (exp, 0);
11192 op0 = expand_normal (arg0);
11193 icode = CODE_FOR_clwb;
11194 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11195 op0 = ix86_zero_extend_to_Pmode (op0);
11196
11197 emit_insn (gen_clwb (op0));
11198 return 0;
11199
11200 case IX86_BUILTIN_CLFLUSHOPT:
11201 arg0 = CALL_EXPR_ARG (exp, 0);
11202 op0 = expand_normal (arg0);
11203 icode = CODE_FOR_clflushopt;
11204 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11205 op0 = ix86_zero_extend_to_Pmode (op0);
11206
11207 emit_insn (gen_clflushopt (op0));
11208 return 0;
11209
11210 case IX86_BUILTIN_MONITOR:
11211 case IX86_BUILTIN_MONITORX:
11212 arg0 = CALL_EXPR_ARG (exp, 0);
11213 arg1 = CALL_EXPR_ARG (exp, 1);
11214 arg2 = CALL_EXPR_ARG (exp, 2);
11215 op0 = expand_normal (arg0);
11216 op1 = expand_normal (arg1);
11217 op2 = expand_normal (arg2);
11218 if (!REG_P (op0))
11219 op0 = ix86_zero_extend_to_Pmode (op0);
11220 if (!REG_P (op1))
11221 op1 = copy_to_mode_reg (SImode, op1);
11222 if (!REG_P (op2))
11223 op2 = copy_to_mode_reg (SImode, op2);
11224
11225 emit_insn (fcode == IX86_BUILTIN_MONITOR
11226 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11227 : gen_monitorx (Pmode, op0, op1, op2));
11228 return 0;
11229
11230 case IX86_BUILTIN_MWAIT:
11231 arg0 = CALL_EXPR_ARG (exp, 0);
11232 arg1 = CALL_EXPR_ARG (exp, 1);
11233 op0 = expand_normal (arg0);
11234 op1 = expand_normal (arg1);
11235 if (!REG_P (op0))
11236 op0 = copy_to_mode_reg (SImode, op0);
11237 if (!REG_P (op1))
11238 op1 = copy_to_mode_reg (SImode, op1);
11239 emit_insn (gen_sse3_mwait (op0, op1));
11240 return 0;
11241
11242 case IX86_BUILTIN_MWAITX:
11243 arg0 = CALL_EXPR_ARG (exp, 0);
11244 arg1 = CALL_EXPR_ARG (exp, 1);
11245 arg2 = CALL_EXPR_ARG (exp, 2);
11246 op0 = expand_normal (arg0);
11247 op1 = expand_normal (arg1);
11248 op2 = expand_normal (arg2);
11249 if (!REG_P (op0))
11250 op0 = copy_to_mode_reg (SImode, op0);
11251 if (!REG_P (op1))
11252 op1 = copy_to_mode_reg (SImode, op1);
11253 if (!REG_P (op2))
11254 op2 = copy_to_mode_reg (SImode, op2);
11255 emit_insn (gen_mwaitx (op0, op1, op2));
11256 return 0;
11257
11258 case IX86_BUILTIN_UMONITOR:
11259 arg0 = CALL_EXPR_ARG (exp, 0);
11260 op0 = expand_normal (arg0);
11261
11262 op0 = ix86_zero_extend_to_Pmode (op0);
11263 emit_insn (gen_umonitor (Pmode, op0));
11264 return 0;
11265
11266 case IX86_BUILTIN_UMWAIT:
11267 case IX86_BUILTIN_TPAUSE:
11268 arg0 = CALL_EXPR_ARG (exp, 0);
11269 arg1 = CALL_EXPR_ARG (exp, 1);
11270 op0 = expand_normal (arg0);
11271 op1 = expand_normal (arg1);
11272
11273 if (!REG_P (op0))
11274 op0 = copy_to_mode_reg (SImode, op0);
11275
11276 op1 = force_reg (DImode, op1);
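      /* The 64-bit deadline/control value is consumed as EDX:EAX; in 64-bit
	 mode it is split below into two SImode halves for the *_rex64
	 patterns, while the 32-bit patterns take the DImode value directly.  */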
11277
11278 if (TARGET_64BIT)
11279 {
11280 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11281 NULL, 1, OPTAB_DIRECT);
11282 switch (fcode)
11283 {
11284 case IX86_BUILTIN_UMWAIT:
11285 icode = CODE_FOR_umwait_rex64;
11286 break;
11287 case IX86_BUILTIN_TPAUSE:
11288 icode = CODE_FOR_tpause_rex64;
11289 break;
11290 default:
11291 gcc_unreachable ();
11292 }
11293
11294 op2 = gen_lowpart (SImode, op2);
11295 op1 = gen_lowpart (SImode, op1);
11296 pat = GEN_FCN (icode) (op0, op1, op2);
11297 }
11298 else
11299 {
11300 switch (fcode)
11301 {
11302 case IX86_BUILTIN_UMWAIT:
11303 icode = CODE_FOR_umwait;
11304 break;
11305 case IX86_BUILTIN_TPAUSE:
11306 icode = CODE_FOR_tpause;
11307 break;
11308 default:
11309 gcc_unreachable ();
11310 }
11311 pat = GEN_FCN (icode) (op0, op1);
11312 }
11313
11314 if (!pat)
11315 return 0;
11316
11317 emit_insn (pat);
11318
11319 if (target == 0
11320 || !register_operand (target, QImode))
11321 target = gen_reg_rtx (QImode);
11322
11323 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11324 const0_rtx);
11325 emit_insn (gen_rtx_SET (target, pat));
11326
11327 return target;
11328
11329 case IX86_BUILTIN_TESTUI:
11330 emit_insn (gen_testui ());
11331
11332 if (target == 0
11333 || !register_operand (target, QImode))
11334 target = gen_reg_rtx (QImode);
11335
11336 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11337 const0_rtx);
11338 emit_insn (gen_rtx_SET (target, pat));
11339
11340 return target;
11341
11342 case IX86_BUILTIN_CLZERO:
11343 arg0 = CALL_EXPR_ARG (exp, 0);
11344 op0 = expand_normal (arg0);
11345 if (!REG_P (op0))
11346 op0 = ix86_zero_extend_to_Pmode (op0);
11347 emit_insn (gen_clzero (Pmode, op0));
11348 return 0;
11349
11350 case IX86_BUILTIN_CLDEMOTE:
11351 arg0 = CALL_EXPR_ARG (exp, 0);
11352 op0 = expand_normal (arg0);
11353 icode = CODE_FOR_cldemote;
11354 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11355 op0 = ix86_zero_extend_to_Pmode (op0);
11356
11357 emit_insn (gen_cldemote (op0));
11358 return 0;
11359
11360 case IX86_BUILTIN_LOADIWKEY:
11361 {
11362 arg0 = CALL_EXPR_ARG (exp, 0);
11363 arg1 = CALL_EXPR_ARG (exp, 1);
11364 arg2 = CALL_EXPR_ARG (exp, 2);
11365 arg3 = CALL_EXPR_ARG (exp, 3);
11366
11367 op0 = expand_normal (arg0);
11368 op1 = expand_normal (arg1);
11369 op2 = expand_normal (arg2);
11370 op3 = expand_normal (arg3);
11371
11372 if (!REG_P (op0))
11373 op0 = copy_to_mode_reg (V2DImode, op0);
11374 if (!REG_P (op1))
11375 op1 = copy_to_mode_reg (V2DImode, op1);
11376 if (!REG_P (op2))
11377 op2 = copy_to_mode_reg (V2DImode, op2);
11378 if (!REG_P (op3))
11379 op3 = copy_to_mode_reg (SImode, op3);
11380
11381 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
11382
11383 return 0;
11384 }
11385
11386 case IX86_BUILTIN_AESDEC128KLU8:
11387 icode = CODE_FOR_aesdec128klu8;
11388 goto aesdecenc_expand;
11389
11390 case IX86_BUILTIN_AESDEC256KLU8:
11391 icode = CODE_FOR_aesdec256klu8;
11392 goto aesdecenc_expand;
11393
11394 case IX86_BUILTIN_AESENC128KLU8:
11395 icode = CODE_FOR_aesenc128klu8;
11396 goto aesdecenc_expand;
11397
11398 case IX86_BUILTIN_AESENC256KLU8:
11399 icode = CODE_FOR_aesenc256klu8;
11400
11401 aesdecenc_expand:
11402
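      /* Common Key Locker AESENC/AESDEC expansion: idata is loaded into a
	 register used as an in/out operand, the key handle is read from *p
	 through a BLKmode MEM, a QImode flag derived from ZF is returned,
	 and the transformed block is stored back to *odata.  */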
11403 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
11404 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
11405 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11406
11407 op0 = expand_normal (arg0);
11408 op1 = expand_normal (arg1);
11409 op2 = expand_normal (arg2);
11410
11411 if (!address_operand (op0, V2DImode))
11412 {
11413 op0 = convert_memory_address (Pmode, op0);
11414 op0 = copy_addr_to_reg (op0);
11415 }
11416 op0 = gen_rtx_MEM (V2DImode, op0);
11417
11418 if (!REG_P (op1))
11419 op1 = copy_to_mode_reg (V2DImode, op1);
11420
11421 if (!address_operand (op2, VOIDmode))
11422 {
11423 op2 = convert_memory_address (Pmode, op2);
11424 op2 = copy_addr_to_reg (op2);
11425 }
11426 op2 = gen_rtx_MEM (BLKmode, op2);
11427
11428 emit_insn (GEN_FCN (icode) (op1, op1, op2));
11429
11430 if (target == 0)
11431 target = gen_reg_rtx (QImode);
11432
11433 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
11434 const0_rtx);
11435 emit_insn (gen_rtx_SET (target, pat));
11436
11437 emit_insn (gen_rtx_SET (op0, op1));
11438
11439 return target;
11440
11441 case IX86_BUILTIN_AESDECWIDE128KLU8:
11442 icode = CODE_FOR_aesdecwide128klu8;
11443 goto wideaesdecenc_expand;
11444
11445 case IX86_BUILTIN_AESDECWIDE256KLU8:
11446 icode = CODE_FOR_aesdecwide256klu8;
11447 goto wideaesdecenc_expand;
11448
11449 case IX86_BUILTIN_AESENCWIDE128KLU8:
11450 icode = CODE_FOR_aesencwide128klu8;
11451 goto wideaesdecenc_expand;
11452
11453 case IX86_BUILTIN_AESENCWIDE256KLU8:
11454 icode = CODE_FOR_aesencwide256klu8;
11455
11456 wideaesdecenc_expand:
11457
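      /* The wide Key Locker forms process eight 128-bit blocks in the hard
	 registers xmm0..xmm7: load them from *idata, emit the insn (which
	 reads the handle from *p), copy xmm0..xmm7 back out to *odata, and
	 return a QImode flag derived from ZF.  */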
11458 rtx xmm_regs[8];
11459 rtx op;
11460
11461 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
11462 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
11463 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
11464
11465 op0 = expand_normal (arg0);
11466 op1 = expand_normal (arg1);
11467 op2 = expand_normal (arg2);
11468
11469 if (!address_operand (op2, VOIDmode))
11470 {
11471 op2 = convert_memory_address (Pmode, op2);
11472 op2 = copy_addr_to_reg (op2);
11473 }
11474 op2 = gen_rtx_MEM (BLKmode, op2);
11475
11476 for (i = 0; i < 8; i++)
11477 {
11478 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11479
11480 op = gen_rtx_MEM (V2DImode,
11481 plus_constant (Pmode, op1, (i * 16)));
11482
11483 emit_move_insn (xmm_regs[i], op);
11484 }
11485
11486 emit_insn (GEN_FCN (icode) (op2));
11487
11488 if (target == 0)
11489 target = gen_reg_rtx (QImode);
11490
11491 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
11492 const0_rtx);
11493 emit_insn (gen_rtx_SET (target, pat));
11494
11495 for (i = 0; i < 8; i++)
11496 {
11497 op = gen_rtx_MEM (V2DImode,
11498 plus_constant (Pmode, op0, (i * 16)));
11499 emit_move_insn (op, xmm_regs[i]);
11500 }
11501
11502 return target;
11503
11504 case IX86_BUILTIN_ENCODEKEY128U32:
11505 {
11506 rtx op, xmm_regs[7];
11507
11508 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11509 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
11510 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
11511
11512 op0 = expand_normal (arg0);
11513 op1 = expand_normal (arg1);
11514 op2 = expand_normal (arg2);
11515
11516 if (!REG_P (op0))
11517 op0 = copy_to_mode_reg (SImode, op0);
11518
11519 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11520 emit_move_insn (op, op1);
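	/* ENCODEKEY128 expects the input key in xmm0 (forced just above) and
	   leaves the handle in xmm0..xmm2, which is stored to *h below; the
	   SImode value produced by the insn is the builtin's return value.  */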
11521
11522 for (i = 0; i < 3; i++)
11523 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11524
11525 if (target == 0)
11526 target = gen_reg_rtx (SImode);
11527
11528 emit_insn (gen_encodekey128u32 (target, op0));
11529
11530 for (i = 0; i < 3; i++)
11531 {
11532 op = gen_rtx_MEM (V2DImode,
11533 plus_constant (Pmode, op2, (i * 16)));
11534 emit_move_insn (op, xmm_regs[i]);
11535 }
11536
11537 return target;
11538 }
11539 case IX86_BUILTIN_ENCODEKEY256U32:
11540 {
11541 rtx op, xmm_regs[7];
11542
11543 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
11544 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
11545 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
11546 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
11547
11548 op0 = expand_normal (arg0);
11549 op1 = expand_normal (arg1);
11550 op2 = expand_normal (arg2);
11551 op3 = expand_normal (arg3);
11552
11553 if (!REG_P (op0))
11554 op0 = copy_to_mode_reg (SImode, op0);
11555
11556 /* Force the use of xmm0 and xmm1 for keylow and keyhi.  */
11557 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
11558 emit_move_insn (op, op1);
11559 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
11560 emit_move_insn (op, op2);
11561
11562 for (i = 0; i < 4; i++)
11563 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
11564
11565 if (target == 0)
11566 target = gen_reg_rtx (SImode);
11567
11568 emit_insn (gen_encodekey256u32 (target, op0));
11569
11570 for (i = 0; i < 4; i++)
11571 {
11572 op = gen_rtx_MEM (V2DImode,
11573 plus_constant (Pmode, op3, (i * 16)));
11574 emit_move_insn (op, xmm_regs[i]);
11575 }
11576
11577 return target;
11578 }
11579
11580 case IX86_BUILTIN_VEC_INIT_V2SI:
11581 case IX86_BUILTIN_VEC_INIT_V4HI:
11582 case IX86_BUILTIN_VEC_INIT_V8QI:
11583 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11584
11585 case IX86_BUILTIN_VEC_EXT_V2DF:
11586 case IX86_BUILTIN_VEC_EXT_V2DI:
11587 case IX86_BUILTIN_VEC_EXT_V4SF:
11588 case IX86_BUILTIN_VEC_EXT_V4SI:
11589 case IX86_BUILTIN_VEC_EXT_V8HI:
11590 case IX86_BUILTIN_VEC_EXT_V2SI:
11591 case IX86_BUILTIN_VEC_EXT_V4HI:
11592 case IX86_BUILTIN_VEC_EXT_V16QI:
11593 return ix86_expand_vec_ext_builtin (exp, target);
11594
11595 case IX86_BUILTIN_VEC_SET_V2DI:
11596 case IX86_BUILTIN_VEC_SET_V4SF:
11597 case IX86_BUILTIN_VEC_SET_V4SI:
11598 case IX86_BUILTIN_VEC_SET_V8HI:
11599 case IX86_BUILTIN_VEC_SET_V4HI:
11600 case IX86_BUILTIN_VEC_SET_V16QI:
11601 return ix86_expand_vec_set_builtin (exp);
11602
11603 case IX86_BUILTIN_NANQ:
11604 case IX86_BUILTIN_NANSQ:
11605 return expand_call (exp, target, ignore);
11606
11607 case IX86_BUILTIN_RDPID:
11608
11609 op0 = gen_reg_rtx (word_mode);
11610
11611 if (TARGET_64BIT)
11612 {
11613 insn = gen_rdpid_rex64 (op0);
11614 op0 = convert_to_mode (SImode, op0, 1);
11615 }
11616 else
11617 insn = gen_rdpid (op0);
11618
11619 emit_insn (insn);
11620
11621 if (target == 0
11622 || !register_operand (target, SImode))
11623 target = gen_reg_rtx (SImode);
11624
11625 emit_move_insn (target, op0);
11626 return target;
11627
11628 case IX86_BUILTIN_2INTERSECTD512:
11629 case IX86_BUILTIN_2INTERSECTQ512:
11630 case IX86_BUILTIN_2INTERSECTD256:
11631 case IX86_BUILTIN_2INTERSECTQ256:
11632 case IX86_BUILTIN_2INTERSECTD128:
11633 case IX86_BUILTIN_2INTERSECTQ128:
11634 arg0 = CALL_EXPR_ARG (exp, 0);
11635 arg1 = CALL_EXPR_ARG (exp, 1);
11636 arg2 = CALL_EXPR_ARG (exp, 2);
11637 arg3 = CALL_EXPR_ARG (exp, 3);
11638 op0 = expand_normal (arg0);
11639 op1 = expand_normal (arg1);
11640 op2 = expand_normal (arg2);
11641 op3 = expand_normal (arg3);
11642
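      /* vp2intersect produces two mask registers at once; they are modeled
	 as one paired-mode value (P2HImode or P2QImode) in op4, whose low
	 and high parts are stored through the two mask pointers op0 and op1
	 below.  */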
11643 if (!address_operand (op0, VOIDmode))
11644 {
11645 op0 = convert_memory_address (Pmode, op0);
11646 op0 = copy_addr_to_reg (op0);
11647 }
11648 if (!address_operand (op1, VOIDmode))
11649 {
11650 op1 = convert_memory_address (Pmode, op1);
11651 op1 = copy_addr_to_reg (op1);
11652 }
11653
11654 switch (fcode)
11655 {
11656 case IX86_BUILTIN_2INTERSECTD512:
11657 mode4 = P2HImode;
11658 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11659 break;
11660 case IX86_BUILTIN_2INTERSECTQ512:
11661 mode4 = P2QImode;
11662 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11663 break;
11664 case IX86_BUILTIN_2INTERSECTD256:
11665 mode4 = P2QImode;
11666 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11667 break;
11668 case IX86_BUILTIN_2INTERSECTQ256:
11669 mode4 = P2QImode;
11670 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11671 break;
11672 case IX86_BUILTIN_2INTERSECTD128:
11673 mode4 = P2QImode;
11674 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11675 break;
11676 case IX86_BUILTIN_2INTERSECTQ128:
11677 mode4 = P2QImode;
11678 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11679 break;
11680 default:
11681 gcc_unreachable ();
11682 }
11683
11684 mode2 = insn_data[icode].operand[1].mode;
11685 mode3 = insn_data[icode].operand[2].mode;
11686 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11687 op2 = copy_to_mode_reg (mode2, op2);
11688 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11689 op3 = copy_to_mode_reg (mode3, op3);
11690
11691 op4 = gen_reg_rtx (mode4);
11692 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11693 mode0 = mode4 == P2HImode ? HImode : QImode;
11694 emit_move_insn (gen_rtx_MEM (mode0, op0),
11695 gen_lowpart (mode0, op4));
11696 emit_move_insn (gen_rtx_MEM (mode0, op1),
11697 gen_highpart (mode0, op4));
11698
11699 return 0;
11700
11701 case IX86_BUILTIN_RDPMC:
11702 case IX86_BUILTIN_RDTSC:
11703 case IX86_BUILTIN_RDTSCP:
11704 case IX86_BUILTIN_XGETBV:
11705
11706 op0 = gen_reg_rtx (DImode);
11707 op1 = gen_reg_rtx (DImode);
11708
11709 if (fcode == IX86_BUILTIN_RDPMC)
11710 {
11711 arg0 = CALL_EXPR_ARG (exp, 0);
11712 op2 = expand_normal (arg0);
11713 if (!register_operand (op2, SImode))
11714 op2 = copy_to_mode_reg (SImode, op2);
11715
11716 insn = (TARGET_64BIT
11717 ? gen_rdpmc_rex64 (op0, op1, op2)
11718 : gen_rdpmc (op0, op2));
11719 emit_insn (insn);
11720 }
11721 else if (fcode == IX86_BUILTIN_XGETBV)
11722 {
11723 arg0 = CALL_EXPR_ARG (exp, 0);
11724 op2 = expand_normal (arg0);
11725 if (!register_operand (op2, SImode))
11726 op2 = copy_to_mode_reg (SImode, op2);
11727
11728 insn = (TARGET_64BIT
11729 ? gen_xgetbv_rex64 (op0, op1, op2)
11730 : gen_xgetbv (op0, op2));
11731 emit_insn (insn);
11732 }
11733 else if (fcode == IX86_BUILTIN_RDTSC)
11734 {
11735 insn = (TARGET_64BIT
11736 ? gen_rdtsc_rex64 (op0, op1)
11737 : gen_rdtsc (op0));
11738 emit_insn (insn);
11739 }
11740 else
11741 {
11742 op2 = gen_reg_rtx (SImode);
11743
11744 insn = (TARGET_64BIT
11745 ? gen_rdtscp_rex64 (op0, op1, op2)
11746 : gen_rdtscp (op0, op2));
11747 emit_insn (insn);
11748
11749 arg0 = CALL_EXPR_ARG (exp, 0);
11750 op4 = expand_normal (arg0);
11751 if (!address_operand (op4, VOIDmode))
11752 {
11753 op4 = convert_memory_address (Pmode, op4);
11754 op4 = copy_addr_to_reg (op4);
11755 }
11756 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11757 }
11758
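      /* In 64-bit mode the *_rex64 expanders return the low half in op0 and
	 the high half in op1; combine them below into op0 = (op1 << 32) | op0.
	 In 32-bit mode op0 already holds the complete DImode result.  */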
11759 if (target == 0
11760 || !register_operand (target, DImode))
11761 target = gen_reg_rtx (DImode);
11762
11763 if (TARGET_64BIT)
11764 {
11765 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11766 op1, 1, OPTAB_DIRECT);
11767 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11768 op0, 1, OPTAB_DIRECT);
11769 }
11770
11771 emit_move_insn (target, op0);
11772 return target;
11773
11774 case IX86_BUILTIN_ENQCMD:
11775 case IX86_BUILTIN_ENQCMDS:
11776 case IX86_BUILTIN_MOVDIR64B:
11777
11778 arg0 = CALL_EXPR_ARG (exp, 0);
11779 arg1 = CALL_EXPR_ARG (exp, 1);
11780 op0 = expand_normal (arg0);
11781 op1 = expand_normal (arg1);
11782
11783 op0 = ix86_zero_extend_to_Pmode (op0);
11784 if (!address_operand (op1, VOIDmode))
11785 {
11786 op1 = convert_memory_address (Pmode, op1);
11787 op1 = copy_addr_to_reg (op1);
11788 }
11789 op1 = gen_rtx_MEM (XImode, op1);
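      /* MOVDIR64B and ENQCMD[S] transfer a 64-byte block, so the source is
	 modeled as a 512-bit XImode MEM.  */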
11790
11791 if (fcode == IX86_BUILTIN_MOVDIR64B)
11792 {
11793 emit_insn (gen_movdir64b (Pmode, op0, op1));
11794 return 0;
11795 }
11796 else
11797 {
11798 if (target == 0
11799 || !register_operand (target, SImode))
11800 target = gen_reg_rtx (SImode);
11801
11802 emit_move_insn (target, const0_rtx);
11803 target = gen_rtx_SUBREG (QImode, target, 0);
11804
11805 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
11806 ? UNSPECV_ENQCMD
11807 : UNSPECV_ENQCMDS);
11808 icode = code_for_enqcmd (unspecv, Pmode);
11809 emit_insn (GEN_FCN (icode) (op0, op1));
11810
11811 emit_insn
11812 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11813 gen_rtx_fmt_ee (EQ, QImode,
11814 gen_rtx_REG (CCZmode, FLAGS_REG),
11815 const0_rtx)));
11816 return SUBREG_REG (target);
11817 }
11818
11819 case IX86_BUILTIN_FXSAVE:
11820 case IX86_BUILTIN_FXRSTOR:
11821 case IX86_BUILTIN_FXSAVE64:
11822 case IX86_BUILTIN_FXRSTOR64:
11823 case IX86_BUILTIN_FNSTENV:
11824 case IX86_BUILTIN_FLDENV:
11825 mode0 = BLKmode;
11826 switch (fcode)
11827 {
11828 case IX86_BUILTIN_FXSAVE:
11829 icode = CODE_FOR_fxsave;
11830 break;
11831 case IX86_BUILTIN_FXRSTOR:
11832 icode = CODE_FOR_fxrstor;
11833 break;
11834 case IX86_BUILTIN_FXSAVE64:
11835 icode = CODE_FOR_fxsave64;
11836 break;
11837 case IX86_BUILTIN_FXRSTOR64:
11838 icode = CODE_FOR_fxrstor64;
11839 break;
11840 case IX86_BUILTIN_FNSTENV:
11841 icode = CODE_FOR_fnstenv;
11842 break;
11843 case IX86_BUILTIN_FLDENV:
11844 icode = CODE_FOR_fldenv;
11845 break;
11846 default:
11847 gcc_unreachable ();
11848 }
11849
11850 arg0 = CALL_EXPR_ARG (exp, 0);
11851 op0 = expand_normal (arg0);
11852
11853 if (!address_operand (op0, VOIDmode))
11854 {
11855 op0 = convert_memory_address (Pmode, op0);
11856 op0 = copy_addr_to_reg (op0);
11857 }
11858 op0 = gen_rtx_MEM (mode0, op0);
11859
11860 pat = GEN_FCN (icode) (op0);
11861 if (pat)
11862 emit_insn (pat);
11863 return 0;
11864
11865 case IX86_BUILTIN_XSETBV:
11866 arg0 = CALL_EXPR_ARG (exp, 0);
11867 arg1 = CALL_EXPR_ARG (exp, 1);
11868 op0 = expand_normal (arg0);
11869 op1 = expand_normal (arg1);
11870
11871 if (!REG_P (op0))
11872 op0 = copy_to_mode_reg (SImode, op0);
11873
11874 op1 = force_reg (DImode, op1);
11875
11876 if (TARGET_64BIT)
11877 {
11878 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11879 NULL, 1, OPTAB_DIRECT);
11880
11881 icode = CODE_FOR_xsetbv_rex64;
11882
11883 op2 = gen_lowpart (SImode, op2);
11884 op1 = gen_lowpart (SImode, op1);
11885 pat = GEN_FCN (icode) (op0, op1, op2);
11886 }
11887 else
11888 {
11889 icode = CODE_FOR_xsetbv;
11890
11891 pat = GEN_FCN (icode) (op0, op1);
11892 }
11893 if (pat)
11894 emit_insn (pat);
11895 return 0;
11896
11897 case IX86_BUILTIN_XSAVE:
11898 case IX86_BUILTIN_XRSTOR:
11899 case IX86_BUILTIN_XSAVE64:
11900 case IX86_BUILTIN_XRSTOR64:
11901 case IX86_BUILTIN_XSAVEOPT:
11902 case IX86_BUILTIN_XSAVEOPT64:
11903 case IX86_BUILTIN_XSAVES:
11904 case IX86_BUILTIN_XRSTORS:
11905 case IX86_BUILTIN_XSAVES64:
11906 case IX86_BUILTIN_XRSTORS64:
11907 case IX86_BUILTIN_XSAVEC:
11908 case IX86_BUILTIN_XSAVEC64:
11909 arg0 = CALL_EXPR_ARG (exp, 0);
11910 arg1 = CALL_EXPR_ARG (exp, 1);
11911 op0 = expand_normal (arg0);
11912 op1 = expand_normal (arg1);
11913
11914 if (!address_operand (op0, VOIDmode))
11915 {
11916 op0 = convert_memory_address (Pmode, op0);
11917 op0 = copy_addr_to_reg (op0);
11918 }
11919 op0 = gen_rtx_MEM (BLKmode, op0);
11920
11921 op1 = force_reg (DImode, op1);
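      /* The 64-bit feature mask is consumed as EDX:EAX; in 64-bit mode it is
	 split below into two SImode halves for the *_rex64 patterns, while
	 the 32-bit patterns take the DImode value directly.  */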
11922
11923 if (TARGET_64BIT)
11924 {
11925 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11926 NULL, 1, OPTAB_DIRECT);
11927 switch (fcode)
11928 {
11929 case IX86_BUILTIN_XSAVE:
11930 icode = CODE_FOR_xsave_rex64;
11931 break;
11932 case IX86_BUILTIN_XRSTOR:
11933 icode = CODE_FOR_xrstor_rex64;
11934 break;
11935 case IX86_BUILTIN_XSAVE64:
11936 icode = CODE_FOR_xsave64;
11937 break;
11938 case IX86_BUILTIN_XRSTOR64:
11939 icode = CODE_FOR_xrstor64;
11940 break;
11941 case IX86_BUILTIN_XSAVEOPT:
11942 icode = CODE_FOR_xsaveopt_rex64;
11943 break;
11944 case IX86_BUILTIN_XSAVEOPT64:
11945 icode = CODE_FOR_xsaveopt64;
11946 break;
11947 case IX86_BUILTIN_XSAVES:
11948 icode = CODE_FOR_xsaves_rex64;
11949 break;
11950 case IX86_BUILTIN_XRSTORS:
11951 icode = CODE_FOR_xrstors_rex64;
11952 break;
11953 case IX86_BUILTIN_XSAVES64:
11954 icode = CODE_FOR_xsaves64;
11955 break;
11956 case IX86_BUILTIN_XRSTORS64:
11957 icode = CODE_FOR_xrstors64;
11958 break;
11959 case IX86_BUILTIN_XSAVEC:
11960 icode = CODE_FOR_xsavec_rex64;
11961 break;
11962 case IX86_BUILTIN_XSAVEC64:
11963 icode = CODE_FOR_xsavec64;
11964 break;
11965 default:
11966 gcc_unreachable ();
11967 }
11968
11969 op2 = gen_lowpart (SImode, op2);
11970 op1 = gen_lowpart (SImode, op1);
11971 pat = GEN_FCN (icode) (op0, op1, op2);
11972 }
11973 else
11974 {
11975 switch (fcode)
11976 {
11977 case IX86_BUILTIN_XSAVE:
11978 icode = CODE_FOR_xsave;
11979 break;
11980 case IX86_BUILTIN_XRSTOR:
11981 icode = CODE_FOR_xrstor;
11982 break;
11983 case IX86_BUILTIN_XSAVEOPT:
11984 icode = CODE_FOR_xsaveopt;
11985 break;
11986 case IX86_BUILTIN_XSAVES:
11987 icode = CODE_FOR_xsaves;
11988 break;
11989 case IX86_BUILTIN_XRSTORS:
11990 icode = CODE_FOR_xrstors;
11991 break;
11992 case IX86_BUILTIN_XSAVEC:
11993 icode = CODE_FOR_xsavec;
11994 break;
11995 default:
11996 gcc_unreachable ();
11997 }
11998 pat = GEN_FCN (icode) (op0, op1);
11999 }
12000
12001 if (pat)
12002 emit_insn (pat);
12003 return 0;
12004
12005 case IX86_BUILTIN_LLWPCB:
12006 arg0 = CALL_EXPR_ARG (exp, 0);
12007 op0 = expand_normal (arg0);
12008
12009 if (!register_operand (op0, Pmode))
12010 op0 = ix86_zero_extend_to_Pmode (op0);
12011 emit_insn (gen_lwp_llwpcb (Pmode, op0));
12012 return 0;
12013
12014 case IX86_BUILTIN_SLWPCB:
12015 if (!target
12016 || !register_operand (target, Pmode))
12017 target = gen_reg_rtx (Pmode);
12018 emit_insn (gen_lwp_slwpcb (Pmode, target));
12019 return target;
12020
12021 case IX86_BUILTIN_LWPVAL32:
12022 case IX86_BUILTIN_LWPVAL64:
12023 case IX86_BUILTIN_LWPINS32:
12024 case IX86_BUILTIN_LWPINS64:
12025 mode = ((fcode == IX86_BUILTIN_LWPVAL32
12026 || fcode == IX86_BUILTIN_LWPINS32)
12027 ? SImode : DImode);
12028
12029 if (fcode == IX86_BUILTIN_LWPVAL32
12030 || fcode == IX86_BUILTIN_LWPVAL64)
12031 icode = code_for_lwp_lwpval (mode);
12032 else
12033 icode = code_for_lwp_lwpins (mode);
12034
12035 arg0 = CALL_EXPR_ARG (exp, 0);
12036 arg1 = CALL_EXPR_ARG (exp, 1);
12037 arg2 = CALL_EXPR_ARG (exp, 2);
12038 op0 = expand_normal (arg0);
12039 op1 = expand_normal (arg1);
12040 op2 = expand_normal (arg2);
12041 mode0 = insn_data[icode].operand[0].mode;
12042
12043 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12044 op0 = copy_to_mode_reg (mode0, op0);
12045 if (!insn_data[icode].operand[1].predicate (op1, SImode))
12046 op1 = copy_to_mode_reg (SImode, op1);
12047
12048 if (!CONST_INT_P (op2))
12049 {
12050 error ("the last argument must be a 32-bit immediate");
12051 return const0_rtx;
12052 }
12053
12054 emit_insn (GEN_FCN (icode) (op0, op1, op2));
12055
12056 if (fcode == IX86_BUILTIN_LWPINS32
12057 || fcode == IX86_BUILTIN_LWPINS64)
12058 {
12059 if (target == 0
12060 || !nonimmediate_operand (target, QImode))
12061 target = gen_reg_rtx (QImode);
12062
12063 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12064 const0_rtx);
12065 emit_insn (gen_rtx_SET (target, pat));
12066
12067 return target;
12068 }
12069 else
12070 return 0;
12071
12072 case IX86_BUILTIN_BEXTRI32:
12073 case IX86_BUILTIN_BEXTRI64:
12074 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
12075
12076 arg0 = CALL_EXPR_ARG (exp, 0);
12077 arg1 = CALL_EXPR_ARG (exp, 1);
12078 op0 = expand_normal (arg0);
12079 op1 = expand_normal (arg1);
12080
12081 if (!CONST_INT_P (op1))
12082 {
12083 error ("last argument must be an immediate");
12084 return const0_rtx;
12085 }
12086 else
12087 {
12088 unsigned char lsb_index = UINTVAL (op1);
12089 unsigned char length = UINTVAL (op1) >> 8;
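	  /* The BEXTRI control immediate packs the start bit in bits [7:0]
	     and the field length in bits [15:8].  Degenerate cases are
	     resolved at expand time below: a zero length or an out-of-range
	     start yields 0, and an over-long field is clamped to the operand
	     width.  */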
12090
12091 unsigned char bitsize = GET_MODE_BITSIZE (mode);
12092
12093 icode = code_for_tbm_bextri (mode);
12094
12095 mode1 = insn_data[icode].operand[1].mode;
12096 if (!insn_data[icode].operand[1].predicate (op0, mode1))
12097 op0 = copy_to_mode_reg (mode1, op0);
12098
12099 mode0 = insn_data[icode].operand[0].mode;
12100 if (target == 0
12101 || !register_operand (target, mode0))
12102 target = gen_reg_rtx (mode0);
12103
12104 if (length == 0 || lsb_index >= bitsize)
12105 {
12106 emit_move_insn (target, const0_rtx);
12107 return target;
12108 }
12109
12110 if (length + lsb_index > bitsize)
12111 length = bitsize - lsb_index;
12112
12113 op1 = GEN_INT (length);
12114 op2 = GEN_INT (lsb_index);
12115
12116 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
12117 return target;
12118 }
12119
12120 case IX86_BUILTIN_RDRAND16_STEP:
12121 mode = HImode;
12122 goto rdrand_step;
12123
12124 case IX86_BUILTIN_RDRAND32_STEP:
12125 mode = SImode;
12126 goto rdrand_step;
12127
12128 case IX86_BUILTIN_RDRAND64_STEP:
12129 mode = DImode;
12130
12131 rdrand_step:
12132 arg0 = CALL_EXPR_ARG (exp, 0);
12133 op1 = expand_normal (arg0);
12134 if (!address_operand (op1, VOIDmode))
12135 {
12136 op1 = convert_memory_address (Pmode, op1);
12137 op1 = copy_addr_to_reg (op1);
12138 }
12139
12140 op0 = gen_reg_rtx (mode);
12141 emit_insn (gen_rdrand (mode, op0));
12142
12143 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
12144
12145 op1 = force_reg (SImode, const1_rtx);
12146
12147 /* Emit SImode conditional move. */
12148 if (mode == HImode)
12149 {
12150 if (TARGET_ZERO_EXTEND_WITH_AND
12151 && optimize_function_for_speed_p (cfun))
12152 {
12153 op2 = force_reg (SImode, const0_rtx);
12154
12155 emit_insn (gen_movstricthi
12156 (gen_lowpart (HImode, op2), op0));
12157 }
12158 else
12159 {
12160 op2 = gen_reg_rtx (SImode);
12161
12162 emit_insn (gen_zero_extendhisi2 (op2, op0));
12163 }
12164 }
12165 else if (mode == SImode)
12166 op2 = op0;
12167 else
12168 op2 = gen_rtx_SUBREG (SImode, op0, 0);
12169
12170 if (target == 0
12171 || !register_operand (target, SImode))
12172 target = gen_reg_rtx (SImode);
12173
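      /* rdrand sets CF on success; on failure it clears CF and zeroes its
	 destination.  The conditional move below therefore selects op2 (the
	 zeroed value) when CF is clear and the constant 1 (op1) when CF is
	 set, giving the 0/1 return value of the *_step intrinsics.  */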
12174 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
12175 const0_rtx);
12176 emit_insn (gen_rtx_SET (target,
12177 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
12178 return target;
12179
12180 case IX86_BUILTIN_RDSEED16_STEP:
12181 mode = HImode;
12182 goto rdseed_step;
12183
12184 case IX86_BUILTIN_RDSEED32_STEP:
12185 mode = SImode;
12186 goto rdseed_step;
12187
12188 case IX86_BUILTIN_RDSEED64_STEP:
12189 mode = DImode;
12190
12191 rdseed_step:
12192 arg0 = CALL_EXPR_ARG (exp, 0);
12193 op1 = expand_normal (arg0);
12194 if (!address_operand (op1, VOIDmode))
12195 {
12196 op1 = convert_memory_address (Pmode, op1);
12197 op1 = copy_addr_to_reg (op1);
12198 }
12199
12200 op0 = gen_reg_rtx (mode);
12201 emit_insn (gen_rdseed (mode, op0));
12202
12203 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
12204
12205 op2 = gen_reg_rtx (QImode);
12206
12207 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12208 const0_rtx);
12209 emit_insn (gen_rtx_SET (op2, pat));
12210
12211 if (target == 0
12212 || !register_operand (target, SImode))
12213 target = gen_reg_rtx (SImode);
12214
12215 emit_insn (gen_zero_extendqisi2 (target, op2));
12216 return target;
12217
12218 case IX86_BUILTIN_SBB32:
12219 icode = CODE_FOR_subborrowsi;
12220 icode2 = CODE_FOR_subborrowsi_0;
12221 mode0 = SImode;
12222 mode1 = DImode;
12223 mode2 = CCmode;
12224 goto handlecarry;
12225
12226 case IX86_BUILTIN_SBB64:
12227 icode = CODE_FOR_subborrowdi;
12228 icode2 = CODE_FOR_subborrowdi_0;
12229 mode0 = DImode;
12230 mode1 = TImode;
12231 mode2 = CCmode;
12232 goto handlecarry;
12233
12234 case IX86_BUILTIN_ADDCARRYX32:
12235 icode = CODE_FOR_addcarrysi;
12236 icode2 = CODE_FOR_addcarrysi_0;
12237 mode0 = SImode;
12238 mode1 = DImode;
12239 mode2 = CCCmode;
12240 goto handlecarry;
12241
12242 case IX86_BUILTIN_ADDCARRYX64:
12243 icode = CODE_FOR_addcarrydi;
12244 icode2 = CODE_FOR_addcarrydi_0;
12245 mode0 = DImode;
12246 mode1 = TImode;
12247 mode2 = CCCmode;
12248
12249 handlecarry:
12250 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
12251 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
12252 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
12253 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
12254
12255 op1 = expand_normal (arg0);
12256 if (!integer_zerop (arg0))
12257 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
12258
12259 op2 = expand_normal (arg1);
12260 if (!register_operand (op2, mode0))
12261 op2 = copy_to_mode_reg (mode0, op2);
12262
12263 op3 = expand_normal (arg2);
12264 if (!register_operand (op3, mode0))
12265 op3 = copy_to_mode_reg (mode0, op3);
12266
12267 op4 = expand_normal (arg3);
12268 if (!address_operand (op4, VOIDmode))
12269 {
12270 op4 = convert_memory_address (Pmode, op4);
12271 op4 = copy_addr_to_reg (op4);
12272 }
12273
12274 op0 = gen_reg_rtx (mode0);
12275 if (integer_zerop (arg0))
12276 {
12277 /* If arg0 is 0, optimize right away into an add or sub
12278 instruction that sets CCCmode flags.  */
12279 op1 = gen_rtx_REG (mode2, FLAGS_REG);
12280 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
12281 }
12282 else
12283 {
12284 /* Generate CF from input operand. */
12285 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
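	  /* Adding 0xff (constm1 in QImode) to the carry-in overflows exactly
	     when the carry-in is nonzero, so CF now mirrors c_in.  */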
12286
12287 /* Generate instruction that consumes CF. */
12288 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
12289 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
12290 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
12291 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
12292 }
12293
12294 /* Return current CF value. */
12295 if (target == 0)
12296 target = gen_reg_rtx (QImode);
12297
12298 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
12299 emit_insn (gen_rtx_SET (target, pat));
12300
12301 /* Store the result. */
12302 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
12303
12304 return target;
12305
12306 case IX86_BUILTIN_READ_FLAGS:
12307 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
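      /* The flags are read by pushing the flags register and popping the
	 value into a general register below.  */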
12308
12309 if (optimize
12310 || target == NULL_RTX
12311 || !nonimmediate_operand (target, word_mode)
12312 || GET_MODE (target) != word_mode)
12313 target = gen_reg_rtx (word_mode);
12314
12315 emit_insn (gen_pop (target));
12316 return target;
12317
12318 case IX86_BUILTIN_WRITE_FLAGS:
12319
12320 arg0 = CALL_EXPR_ARG (exp, 0);
12321 op0 = expand_normal (arg0);
12322 if (!general_no_elim_operand (op0, word_mode))
12323 op0 = copy_to_mode_reg (word_mode, op0);
12324
12325 emit_insn (gen_push (op0));
12326 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
12327 return 0;
12328
12329 case IX86_BUILTIN_KTESTC8:
12330 icode = CODE_FOR_ktestqi;
12331 mode3 = CCCmode;
12332 goto kortest;
12333
12334 case IX86_BUILTIN_KTESTZ8:
12335 icode = CODE_FOR_ktestqi;
12336 mode3 = CCZmode;
12337 goto kortest;
12338
12339 case IX86_BUILTIN_KTESTC16:
12340 icode = CODE_FOR_ktesthi;
12341 mode3 = CCCmode;
12342 goto kortest;
12343
12344 case IX86_BUILTIN_KTESTZ16:
12345 icode = CODE_FOR_ktesthi;
12346 mode3 = CCZmode;
12347 goto kortest;
12348
12349 case IX86_BUILTIN_KTESTC32:
12350 icode = CODE_FOR_ktestsi;
12351 mode3 = CCCmode;
12352 goto kortest;
12353
12354 case IX86_BUILTIN_KTESTZ32:
12355 icode = CODE_FOR_ktestsi;
12356 mode3 = CCZmode;
12357 goto kortest;
12358
12359 case IX86_BUILTIN_KTESTC64:
12360 icode = CODE_FOR_ktestdi;
12361 mode3 = CCCmode;
12362 goto kortest;
12363
12364 case IX86_BUILTIN_KTESTZ64:
12365 icode = CODE_FOR_ktestdi;
12366 mode3 = CCZmode;
12367 goto kortest;
12368
12369 case IX86_BUILTIN_KORTESTC8:
12370 icode = CODE_FOR_kortestqi;
12371 mode3 = CCCmode;
12372 goto kortest;
12373
12374 case IX86_BUILTIN_KORTESTZ8:
12375 icode = CODE_FOR_kortestqi;
12376 mode3 = CCZmode;
12377 goto kortest;
12378
12379 case IX86_BUILTIN_KORTESTC16:
12380 icode = CODE_FOR_kortesthi;
12381 mode3 = CCCmode;
12382 goto kortest;
12383
12384 case IX86_BUILTIN_KORTESTZ16:
12385 icode = CODE_FOR_kortesthi;
12386 mode3 = CCZmode;
12387 goto kortest;
12388
12389 case IX86_BUILTIN_KORTESTC32:
12390 icode = CODE_FOR_kortestsi;
12391 mode3 = CCCmode;
12392 goto kortest;
12393
12394 case IX86_BUILTIN_KORTESTZ32:
12395 icode = CODE_FOR_kortestsi;
12396 mode3 = CCZmode;
12397 goto kortest;
12398
12399 case IX86_BUILTIN_KORTESTC64:
12400 icode = CODE_FOR_kortestdi;
12401 mode3 = CCCmode;
12402 goto kortest;
12403
12404 case IX86_BUILTIN_KORTESTZ64:
12405 icode = CODE_FOR_kortestdi;
12406 mode3 = CCZmode;
12407
12408 kortest:
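      /* Both the ktest* and kortest* builtins are expanded here: emit the
	 mask-register test and then use setcc on the carry flag (CCCmode,
	 the *C variants) or the zero flag (CCZmode, the *Z variants) to
	 produce the result.  */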
12409 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12410 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12411 op0 = expand_normal (arg0);
12412 op1 = expand_normal (arg1);
12413
12414 mode0 = insn_data[icode].operand[0].mode;
12415 mode1 = insn_data[icode].operand[1].mode;
12416
12417 if (GET_MODE (op0) != VOIDmode)
12418 op0 = force_reg (GET_MODE (op0), op0);
12419
12420 op0 = gen_lowpart (mode0, op0);
12421
12422 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12423 op0 = copy_to_mode_reg (mode0, op0);
12424
12425 if (GET_MODE (op1) != VOIDmode)
12426 op1 = force_reg (GET_MODE (op1), op1);
12427
12428 op1 = gen_lowpart (mode1, op1);
12429
12430 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12431 op1 = copy_to_mode_reg (mode1, op1);
12432
12433 target = gen_reg_rtx (QImode);
12434
12435 /* Emit kortest. */
12436 emit_insn (GEN_FCN (icode) (op0, op1));
12437 /* And use setcc to return the result from the flags.  */
12438 ix86_expand_setcc (target, EQ,
12439 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12440 return target;
12441
12442 case IX86_BUILTIN_GATHERSIV2DF:
12443 icode = CODE_FOR_avx2_gathersiv2df;
12444 goto gather_gen;
12445 case IX86_BUILTIN_GATHERSIV4DF:
12446 icode = CODE_FOR_avx2_gathersiv4df;
12447 goto gather_gen;
12448 case IX86_BUILTIN_GATHERDIV2DF:
12449 icode = CODE_FOR_avx2_gatherdiv2df;
12450 goto gather_gen;
12451 case IX86_BUILTIN_GATHERDIV4DF:
12452 icode = CODE_FOR_avx2_gatherdiv4df;
12453 goto gather_gen;
12454 case IX86_BUILTIN_GATHERSIV4SF:
12455 icode = CODE_FOR_avx2_gathersiv4sf;
12456 goto gather_gen;
12457 case IX86_BUILTIN_GATHERSIV8SF:
12458 icode = CODE_FOR_avx2_gathersiv8sf;
12459 goto gather_gen;
12460 case IX86_BUILTIN_GATHERDIV4SF:
12461 icode = CODE_FOR_avx2_gatherdiv4sf;
12462 goto gather_gen;
12463 case IX86_BUILTIN_GATHERDIV8SF:
12464 icode = CODE_FOR_avx2_gatherdiv8sf;
12465 goto gather_gen;
12466 case IX86_BUILTIN_GATHERSIV2DI:
12467 icode = CODE_FOR_avx2_gathersiv2di;
12468 goto gather_gen;
12469 case IX86_BUILTIN_GATHERSIV4DI:
12470 icode = CODE_FOR_avx2_gathersiv4di;
12471 goto gather_gen;
12472 case IX86_BUILTIN_GATHERDIV2DI:
12473 icode = CODE_FOR_avx2_gatherdiv2di;
12474 goto gather_gen;
12475 case IX86_BUILTIN_GATHERDIV4DI:
12476 icode = CODE_FOR_avx2_gatherdiv4di;
12477 goto gather_gen;
12478 case IX86_BUILTIN_GATHERSIV4SI:
12479 icode = CODE_FOR_avx2_gathersiv4si;
12480 goto gather_gen;
12481 case IX86_BUILTIN_GATHERSIV8SI:
12482 icode = CODE_FOR_avx2_gathersiv8si;
12483 goto gather_gen;
12484 case IX86_BUILTIN_GATHERDIV4SI:
12485 icode = CODE_FOR_avx2_gatherdiv4si;
12486 goto gather_gen;
12487 case IX86_BUILTIN_GATHERDIV8SI:
12488 icode = CODE_FOR_avx2_gatherdiv8si;
12489 goto gather_gen;
12490 case IX86_BUILTIN_GATHERALTSIV4DF:
12491 icode = CODE_FOR_avx2_gathersiv4df;
12492 goto gather_gen;
12493 case IX86_BUILTIN_GATHERALTDIV8SF:
12494 icode = CODE_FOR_avx2_gatherdiv8sf;
12495 goto gather_gen;
12496 case IX86_BUILTIN_GATHERALTSIV4DI:
12497 icode = CODE_FOR_avx2_gathersiv4di;
12498 goto gather_gen;
12499 case IX86_BUILTIN_GATHERALTDIV8SI:
12500 icode = CODE_FOR_avx2_gatherdiv8si;
12501 goto gather_gen;
12502 case IX86_BUILTIN_GATHER3SIV16SF:
12503 icode = CODE_FOR_avx512f_gathersiv16sf;
12504 goto gather_gen;
12505 case IX86_BUILTIN_GATHER3SIV8DF:
12506 icode = CODE_FOR_avx512f_gathersiv8df;
12507 goto gather_gen;
12508 case IX86_BUILTIN_GATHER3DIV16SF:
12509 icode = CODE_FOR_avx512f_gatherdiv16sf;
12510 goto gather_gen;
12511 case IX86_BUILTIN_GATHER3DIV8DF:
12512 icode = CODE_FOR_avx512f_gatherdiv8df;
12513 goto gather_gen;
12514 case IX86_BUILTIN_GATHER3SIV16SI:
12515 icode = CODE_FOR_avx512f_gathersiv16si;
12516 goto gather_gen;
12517 case IX86_BUILTIN_GATHER3SIV8DI:
12518 icode = CODE_FOR_avx512f_gathersiv8di;
12519 goto gather_gen;
12520 case IX86_BUILTIN_GATHER3DIV16SI:
12521 icode = CODE_FOR_avx512f_gatherdiv16si;
12522 goto gather_gen;
12523 case IX86_BUILTIN_GATHER3DIV8DI:
12524 icode = CODE_FOR_avx512f_gatherdiv8di;
12525 goto gather_gen;
12526 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12527 icode = CODE_FOR_avx512f_gathersiv8df;
12528 goto gather_gen;
12529 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12530 icode = CODE_FOR_avx512f_gatherdiv16sf;
12531 goto gather_gen;
12532 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12533 icode = CODE_FOR_avx512f_gathersiv8di;
12534 goto gather_gen;
12535 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12536 icode = CODE_FOR_avx512f_gatherdiv16si;
12537 goto gather_gen;
12538 case IX86_BUILTIN_GATHER3SIV2DF:
12539 icode = CODE_FOR_avx512vl_gathersiv2df;
12540 goto gather_gen;
12541 case IX86_BUILTIN_GATHER3SIV4DF:
12542 icode = CODE_FOR_avx512vl_gathersiv4df;
12543 goto gather_gen;
12544 case IX86_BUILTIN_GATHER3DIV2DF:
12545 icode = CODE_FOR_avx512vl_gatherdiv2df;
12546 goto gather_gen;
12547 case IX86_BUILTIN_GATHER3DIV4DF:
12548 icode = CODE_FOR_avx512vl_gatherdiv4df;
12549 goto gather_gen;
12550 case IX86_BUILTIN_GATHER3SIV4SF:
12551 icode = CODE_FOR_avx512vl_gathersiv4sf;
12552 goto gather_gen;
12553 case IX86_BUILTIN_GATHER3SIV8SF:
12554 icode = CODE_FOR_avx512vl_gathersiv8sf;
12555 goto gather_gen;
12556 case IX86_BUILTIN_GATHER3DIV4SF:
12557 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12558 goto gather_gen;
12559 case IX86_BUILTIN_GATHER3DIV8SF:
12560 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12561 goto gather_gen;
12562 case IX86_BUILTIN_GATHER3SIV2DI:
12563 icode = CODE_FOR_avx512vl_gathersiv2di;
12564 goto gather_gen;
12565 case IX86_BUILTIN_GATHER3SIV4DI:
12566 icode = CODE_FOR_avx512vl_gathersiv4di;
12567 goto gather_gen;
12568 case IX86_BUILTIN_GATHER3DIV2DI:
12569 icode = CODE_FOR_avx512vl_gatherdiv2di;
12570 goto gather_gen;
12571 case IX86_BUILTIN_GATHER3DIV4DI:
12572 icode = CODE_FOR_avx512vl_gatherdiv4di;
12573 goto gather_gen;
12574 case IX86_BUILTIN_GATHER3SIV4SI:
12575 icode = CODE_FOR_avx512vl_gathersiv4si;
12576 goto gather_gen;
12577 case IX86_BUILTIN_GATHER3SIV8SI:
12578 icode = CODE_FOR_avx512vl_gathersiv8si;
12579 goto gather_gen;
12580 case IX86_BUILTIN_GATHER3DIV4SI:
12581 icode = CODE_FOR_avx512vl_gatherdiv4si;
12582 goto gather_gen;
12583 case IX86_BUILTIN_GATHER3DIV8SI:
12584 icode = CODE_FOR_avx512vl_gatherdiv8si;
12585 goto gather_gen;
12586 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12587 icode = CODE_FOR_avx512vl_gathersiv4df;
12588 goto gather_gen;
12589 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12590 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12591 goto gather_gen;
12592 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12593 icode = CODE_FOR_avx512vl_gathersiv4di;
12594 goto gather_gen;
12595 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12596 icode = CODE_FOR_avx512vl_gatherdiv8si;
12597 goto gather_gen;
12598 case IX86_BUILTIN_SCATTERSIV16SF:
12599 icode = CODE_FOR_avx512f_scattersiv16sf;
12600 goto scatter_gen;
12601 case IX86_BUILTIN_SCATTERSIV8DF:
12602 icode = CODE_FOR_avx512f_scattersiv8df;
12603 goto scatter_gen;
12604 case IX86_BUILTIN_SCATTERDIV16SF:
12605 icode = CODE_FOR_avx512f_scatterdiv16sf;
12606 goto scatter_gen;
12607 case IX86_BUILTIN_SCATTERDIV8DF:
12608 icode = CODE_FOR_avx512f_scatterdiv8df;
12609 goto scatter_gen;
12610 case IX86_BUILTIN_SCATTERSIV16SI:
12611 icode = CODE_FOR_avx512f_scattersiv16si;
12612 goto scatter_gen;
12613 case IX86_BUILTIN_SCATTERSIV8DI:
12614 icode = CODE_FOR_avx512f_scattersiv8di;
12615 goto scatter_gen;
12616 case IX86_BUILTIN_SCATTERDIV16SI:
12617 icode = CODE_FOR_avx512f_scatterdiv16si;
12618 goto scatter_gen;
12619 case IX86_BUILTIN_SCATTERDIV8DI:
12620 icode = CODE_FOR_avx512f_scatterdiv8di;
12621 goto scatter_gen;
12622 case IX86_BUILTIN_SCATTERSIV8SF:
12623 icode = CODE_FOR_avx512vl_scattersiv8sf;
12624 goto scatter_gen;
12625 case IX86_BUILTIN_SCATTERSIV4SF:
12626 icode = CODE_FOR_avx512vl_scattersiv4sf;
12627 goto scatter_gen;
12628 case IX86_BUILTIN_SCATTERSIV4DF:
12629 icode = CODE_FOR_avx512vl_scattersiv4df;
12630 goto scatter_gen;
12631 case IX86_BUILTIN_SCATTERSIV2DF:
12632 icode = CODE_FOR_avx512vl_scattersiv2df;
12633 goto scatter_gen;
12634 case IX86_BUILTIN_SCATTERDIV8SF:
12635 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12636 goto scatter_gen;
12637 case IX86_BUILTIN_SCATTERDIV4SF:
12638 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12639 goto scatter_gen;
12640 case IX86_BUILTIN_SCATTERDIV4DF:
12641 icode = CODE_FOR_avx512vl_scatterdiv4df;
12642 goto scatter_gen;
12643 case IX86_BUILTIN_SCATTERDIV2DF:
12644 icode = CODE_FOR_avx512vl_scatterdiv2df;
12645 goto scatter_gen;
12646 case IX86_BUILTIN_SCATTERSIV8SI:
12647 icode = CODE_FOR_avx512vl_scattersiv8si;
12648 goto scatter_gen;
12649 case IX86_BUILTIN_SCATTERSIV4SI:
12650 icode = CODE_FOR_avx512vl_scattersiv4si;
12651 goto scatter_gen;
12652 case IX86_BUILTIN_SCATTERSIV4DI:
12653 icode = CODE_FOR_avx512vl_scattersiv4di;
12654 goto scatter_gen;
12655 case IX86_BUILTIN_SCATTERSIV2DI:
12656 icode = CODE_FOR_avx512vl_scattersiv2di;
12657 goto scatter_gen;
12658 case IX86_BUILTIN_SCATTERDIV8SI:
12659 icode = CODE_FOR_avx512vl_scatterdiv8si;
12660 goto scatter_gen;
12661 case IX86_BUILTIN_SCATTERDIV4SI:
12662 icode = CODE_FOR_avx512vl_scatterdiv4si;
12663 goto scatter_gen;
12664 case IX86_BUILTIN_SCATTERDIV4DI:
12665 icode = CODE_FOR_avx512vl_scatterdiv4di;
12666 goto scatter_gen;
12667 case IX86_BUILTIN_SCATTERDIV2DI:
12668 icode = CODE_FOR_avx512vl_scatterdiv2di;
12669 goto scatter_gen;
12670 case IX86_BUILTIN_GATHERPFDPD:
12671 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12672 goto vec_prefetch_gen;
12673 case IX86_BUILTIN_SCATTERALTSIV8DF:
12674 icode = CODE_FOR_avx512f_scattersiv8df;
12675 goto scatter_gen;
12676 case IX86_BUILTIN_SCATTERALTDIV16SF:
12677 icode = CODE_FOR_avx512f_scatterdiv16sf;
12678 goto scatter_gen;
12679 case IX86_BUILTIN_SCATTERALTSIV8DI:
12680 icode = CODE_FOR_avx512f_scattersiv8di;
12681 goto scatter_gen;
12682 case IX86_BUILTIN_SCATTERALTDIV16SI:
12683 icode = CODE_FOR_avx512f_scatterdiv16si;
12684 goto scatter_gen;
12685 case IX86_BUILTIN_SCATTERALTSIV4DF:
12686 icode = CODE_FOR_avx512vl_scattersiv4df;
12687 goto scatter_gen;
12688 case IX86_BUILTIN_SCATTERALTDIV8SF:
12689 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12690 goto scatter_gen;
12691 case IX86_BUILTIN_SCATTERALTSIV4DI:
12692 icode = CODE_FOR_avx512vl_scattersiv4di;
12693 goto scatter_gen;
12694 case IX86_BUILTIN_SCATTERALTDIV8SI:
12695 icode = CODE_FOR_avx512vl_scatterdiv8si;
12696 goto scatter_gen;
12697 case IX86_BUILTIN_SCATTERALTSIV2DF:
12698 icode = CODE_FOR_avx512vl_scattersiv2df;
12699 goto scatter_gen;
12700 case IX86_BUILTIN_SCATTERALTDIV4SF:
12701 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12702 goto scatter_gen;
12703 case IX86_BUILTIN_SCATTERALTSIV2DI:
12704 icode = CODE_FOR_avx512vl_scattersiv2di;
12705 goto scatter_gen;
12706 case IX86_BUILTIN_SCATTERALTDIV4SI:
12707 icode = CODE_FOR_avx512vl_scatterdiv4si;
12708 goto scatter_gen;
12709 case IX86_BUILTIN_GATHERPFDPS:
12710 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12711 goto vec_prefetch_gen;
12712 case IX86_BUILTIN_GATHERPFQPD:
12713 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12714 goto vec_prefetch_gen;
12715 case IX86_BUILTIN_GATHERPFQPS:
12716 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12717 goto vec_prefetch_gen;
12718 case IX86_BUILTIN_SCATTERPFDPD:
12719 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12720 goto vec_prefetch_gen;
12721 case IX86_BUILTIN_SCATTERPFDPS:
12722 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12723 goto vec_prefetch_gen;
12724 case IX86_BUILTIN_SCATTERPFQPD:
12725 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12726 goto vec_prefetch_gen;
12727 case IX86_BUILTIN_SCATTERPFQPS:
12728 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12729 goto vec_prefetch_gen;
12730
12731 gather_gen:
12732 rtx half;
12733 rtx (*gen) (rtx, rtx);
12734
12735 arg0 = CALL_EXPR_ARG (exp, 0);
12736 arg1 = CALL_EXPR_ARG (exp, 1);
12737 arg2 = CALL_EXPR_ARG (exp, 2);
12738 arg3 = CALL_EXPR_ARG (exp, 3);
12739 arg4 = CALL_EXPR_ARG (exp, 4);
12740 op0 = expand_normal (arg0);
12741 op1 = expand_normal (arg1);
12742 op2 = expand_normal (arg2);
12743 op3 = expand_normal (arg3);
12744 op4 = expand_normal (arg4);
12745 /* Note the arg order is different from the operand order. */
12746 mode0 = insn_data[icode].operand[1].mode;
12747 mode2 = insn_data[icode].operand[3].mode;
12748 mode3 = insn_data[icode].operand[4].mode;
12749 mode4 = insn_data[icode].operand[5].mode;
12750
12751 if (target == NULL_RTX
12752 || GET_MODE (target) != insn_data[icode].operand[0].mode
12753 || !insn_data[icode].operand[0].predicate (target,
12754 GET_MODE (target)))
12755 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12756 else
12757 subtarget = target;
12758
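      /* For the *ALT* gather variants the index vector and the data vector
	 have different element counts, so extract the low half of the wider
	 index (or mask) operand here to match the modes expected by the
	 underlying gather pattern.  */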
12759 switch (fcode)
12760 {
12761 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12762 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12763 half = gen_reg_rtx (V8SImode);
12764 if (!nonimmediate_operand (op2, V16SImode))
12765 op2 = copy_to_mode_reg (V16SImode, op2);
12766 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12767 op2 = half;
12768 break;
12769 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12770 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12771 case IX86_BUILTIN_GATHERALTSIV4DF:
12772 case IX86_BUILTIN_GATHERALTSIV4DI:
12773 half = gen_reg_rtx (V4SImode);
12774 if (!nonimmediate_operand (op2, V8SImode))
12775 op2 = copy_to_mode_reg (V8SImode, op2);
12776 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12777 op2 = half;
12778 break;
12779 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12780 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12781 half = gen_reg_rtx (mode0);
12782 if (mode0 == V8SFmode)
12783 gen = gen_vec_extract_lo_v16sf;
12784 else
12785 gen = gen_vec_extract_lo_v16si;
12786 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12787 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12788 emit_insn (gen (half, op0));
12789 op0 = half;
12790 op3 = lowpart_subreg (QImode, op3, HImode);
12791 break;
12792 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12793 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12794 case IX86_BUILTIN_GATHERALTDIV8SF:
12795 case IX86_BUILTIN_GATHERALTDIV8SI:
12796 half = gen_reg_rtx (mode0);
12797 if (mode0 == V4SFmode)
12798 gen = gen_vec_extract_lo_v8sf;
12799 else
12800 gen = gen_vec_extract_lo_v8si;
12801 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12802 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12803 emit_insn (gen (half, op0));
12804 op0 = half;
12805 if (VECTOR_MODE_P (GET_MODE (op3)))
12806 {
12807 half = gen_reg_rtx (mode0);
12808 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12809 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12810 emit_insn (gen (half, op3));
12811 op3 = half;
12812 }
12813 break;
12814 default:
12815 break;
12816 }
12817
12818 /* Force the memory operand to use only a base register here.  But we
12819 don't want to do this for memory operands of other builtin
12820 functions.  */
12821 op1 = ix86_zero_extend_to_Pmode (op1);
12822
12823 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12824 op0 = copy_to_mode_reg (mode0, op0);
12825 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12826 op1 = copy_to_mode_reg (Pmode, op1);
12827 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12828 op2 = copy_to_mode_reg (mode2, op2);
12829
12830 op3 = fixup_modeless_constant (op3, mode3);
12831
12832 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12833 {
12834 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12835 op3 = copy_to_mode_reg (mode3, op3);
12836 }
12837 else
12838 {
12839 op3 = copy_to_reg (op3);
12840 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12841 }
12842 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12843 {
12844 error ("the last argument must be scale 1, 2, 4, 8");
12845 return const0_rtx;
12846 }
12847
12848 /* Optimize. If mask is known to have all high bits set,
12849 replace op0 with pc_rtx to signal that the instruction
12850 overwrites the whole destination and doesn't use its
12851 previous contents. */
12852 if (optimize)
12853 {
12854 if (TREE_CODE (arg3) == INTEGER_CST)
12855 {
12856 if (integer_all_onesp (arg3))
12857 op0 = pc_rtx;
12858 }
12859 else if (TREE_CODE (arg3) == VECTOR_CST)
12860 {
12861 unsigned int negative = 0;
12862 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12863 {
12864 tree cst = VECTOR_CST_ELT (arg3, i);
12865 if (TREE_CODE (cst) == INTEGER_CST
12866 && tree_int_cst_sign_bit (cst))
12867 negative++;
12868 else if (TREE_CODE (cst) == REAL_CST
12869 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12870 negative++;
12871 }
12872 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12873 op0 = pc_rtx;
12874 }
12875 else if (TREE_CODE (arg3) == SSA_NAME
12876 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12877 {
12878 /* Recognize also when mask is like:
12879 __v2df src = _mm_setzero_pd ();
12880 __v2df mask = _mm_cmpeq_pd (src, src);
12881 or
12882 __v8sf src = _mm256_setzero_ps ();
12883 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12884 as that is a cheaper way to load all ones into
12885 a register than having to load a constant from
12886 memory. */
12887 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12888 if (is_gimple_call (def_stmt))
12889 {
12890 tree fndecl = gimple_call_fndecl (def_stmt);
12891 if (fndecl
12892 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12893 switch (DECL_MD_FUNCTION_CODE (fndecl))
12894 {
12895 case IX86_BUILTIN_CMPPD:
12896 case IX86_BUILTIN_CMPPS:
12897 case IX86_BUILTIN_CMPPD256:
12898 case IX86_BUILTIN_CMPPS256:
12899 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12900 break;
12901 /* FALLTHRU */
12902 case IX86_BUILTIN_CMPEQPD:
12903 case IX86_BUILTIN_CMPEQPS:
12904 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12905 && initializer_zerop (gimple_call_arg (def_stmt,
12906 1)))
12907 op0 = pc_rtx;
12908 break;
12909 default:
12910 break;
12911 }
12912 }
12913 }
12914 }
12915
12916 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12917 if (! pat)
12918 return const0_rtx;
12919 emit_insn (pat);
12920
12921 switch (fcode)
12922 {
12923 case IX86_BUILTIN_GATHER3DIV16SF:
12924 if (target == NULL_RTX)
12925 target = gen_reg_rtx (V8SFmode);
12926 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12927 break;
12928 case IX86_BUILTIN_GATHER3DIV16SI:
12929 if (target == NULL_RTX)
12930 target = gen_reg_rtx (V8SImode);
12931 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12932 break;
12933 case IX86_BUILTIN_GATHER3DIV8SF:
12934 case IX86_BUILTIN_GATHERDIV8SF:
12935 if (target == NULL_RTX)
12936 target = gen_reg_rtx (V4SFmode);
12937 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12938 break;
12939 case IX86_BUILTIN_GATHER3DIV8SI:
12940 case IX86_BUILTIN_GATHERDIV8SI:
12941 if (target == NULL_RTX)
12942 target = gen_reg_rtx (V4SImode);
12943 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12944 break;
12945 default:
12946 target = subtarget;
12947 break;
12948 }
12949 return target;
12950
12951 scatter_gen:
12952 arg0 = CALL_EXPR_ARG (exp, 0);
12953 arg1 = CALL_EXPR_ARG (exp, 1);
12954 arg2 = CALL_EXPR_ARG (exp, 2);
12955 arg3 = CALL_EXPR_ARG (exp, 3);
12956 arg4 = CALL_EXPR_ARG (exp, 4);
12957 op0 = expand_normal (arg0);
12958 op1 = expand_normal (arg1);
12959 op2 = expand_normal (arg2);
12960 op3 = expand_normal (arg3);
12961 op4 = expand_normal (arg4);
12962 mode1 = insn_data[icode].operand[1].mode;
12963 mode2 = insn_data[icode].operand[2].mode;
12964 mode3 = insn_data[icode].operand[3].mode;
12965 mode4 = insn_data[icode].operand[4].mode;
12966
12967 /* Scatter instruction stores operand op3 to memory with
12968 indices from op2 and scale from op4 under writemask op1.
12969 If index operand op2 has more elements than source operand
12970 op3, only its low half needs to be used, and vice versa.  */
12971 switch (fcode)
12972 {
12973 case IX86_BUILTIN_SCATTERALTSIV8DF:
12974 case IX86_BUILTIN_SCATTERALTSIV8DI:
12975 half = gen_reg_rtx (V8SImode);
12976 if (!nonimmediate_operand (op2, V16SImode))
12977 op2 = copy_to_mode_reg (V16SImode, op2);
12978 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12979 op2 = half;
12980 break;
12981 case IX86_BUILTIN_SCATTERALTDIV16SF:
12982 case IX86_BUILTIN_SCATTERALTDIV16SI:
12983 half = gen_reg_rtx (mode3);
12984 if (mode3 == V8SFmode)
12985 gen = gen_vec_extract_lo_v16sf;
12986 else
12987 gen = gen_vec_extract_lo_v16si;
12988 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12989 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12990 emit_insn (gen (half, op3));
12991 op3 = half;
12992 break;
12993 case IX86_BUILTIN_SCATTERALTSIV4DF:
12994 case IX86_BUILTIN_SCATTERALTSIV4DI:
12995 half = gen_reg_rtx (V4SImode);
12996 if (!nonimmediate_operand (op2, V8SImode))
12997 op2 = copy_to_mode_reg (V8SImode, op2);
12998 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12999 op2 = half;
13000 break;
13001 case IX86_BUILTIN_SCATTERALTDIV8SF:
13002 case IX86_BUILTIN_SCATTERALTDIV8SI:
13003 half = gen_reg_rtx (mode3);
13004 if (mode3 == V4SFmode)
13005 gen = gen_vec_extract_lo_v8sf;
13006 else
13007 gen = gen_vec_extract_lo_v8si;
13008 if (!nonimmediate_operand (op3, GET_MODE (op3)))
13009 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13010 emit_insn (gen (half, op3));
13011 op3 = half;
13012 break;
13013 case IX86_BUILTIN_SCATTERALTSIV2DF:
13014 case IX86_BUILTIN_SCATTERALTSIV2DI:
13015 if (!nonimmediate_operand (op2, V4SImode))
13016 op2 = copy_to_mode_reg (V4SImode, op2);
13017 break;
13018 case IX86_BUILTIN_SCATTERALTDIV4SF:
13019 case IX86_BUILTIN_SCATTERALTDIV4SI:
13020 if (!nonimmediate_operand (op3, GET_MODE (op3)))
13021 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
13022 break;
13023 default:
13024 break;
13025 }
13026
13027 /* Force the memory operand to use only a base register here.  But we
13028 don't want to do this for memory operands of other builtin
13029 functions.  */
13030 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
13031
13032 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
13033 op0 = copy_to_mode_reg (Pmode, op0);
13034
13035 op1 = fixup_modeless_constant (op1, mode1);
13036
13037 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
13038 {
13039 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13040 op1 = copy_to_mode_reg (mode1, op1);
13041 }
13042 else
13043 {
13044 op1 = copy_to_reg (op1);
13045 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
13046 }
13047
13048 if (!insn_data[icode].operand[2].predicate (op2, mode2))
13049 op2 = copy_to_mode_reg (mode2, op2);
13050
13051 if (!insn_data[icode].operand[3].predicate (op3, mode3))
13052 op3 = copy_to_mode_reg (mode3, op3);
13053
13054 if (!insn_data[icode].operand[4].predicate (op4, mode4))
13055 {
13056 error ("the last argument must be scale 1, 2, 4, 8");
13057 return const0_rtx;
13058 }
13059
13060 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13061 if (! pat)
13062 return const0_rtx;
13063
13064 emit_insn (pat);
13065 return 0;
13066
13067 vec_prefetch_gen:
13068 arg0 = CALL_EXPR_ARG (exp, 0);
13069 arg1 = CALL_EXPR_ARG (exp, 1);
13070 arg2 = CALL_EXPR_ARG (exp, 2);
13071 arg3 = CALL_EXPR_ARG (exp, 3);
13072 arg4 = CALL_EXPR_ARG (exp, 4);
13073 op0 = expand_normal (arg0);
13074 op1 = expand_normal (arg1);
13075 op2 = expand_normal (arg2);
13076 op3 = expand_normal (arg3);
13077 op4 = expand_normal (arg4);
13078 mode0 = insn_data[icode].operand[0].mode;
13079 mode1 = insn_data[icode].operand[1].mode;
13080 mode3 = insn_data[icode].operand[3].mode;
13081 mode4 = insn_data[icode].operand[4].mode;
13082
13083 op0 = fixup_modeless_constant (op0, mode0);
13084
13085 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
13086 {
13087 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13088 op0 = copy_to_mode_reg (mode0, op0);
13089 }
13090 else
13091 {
13092 op0 = copy_to_reg (op0);
13093 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
13094 }
13095
13096 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13097 op1 = copy_to_mode_reg (mode1, op1);
13098
13099 /* Force the memory operand to use only a base register here; we
13100 don't want to do this to the memory operands of other builtin
13101 functions. */
13102 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
13103
13104 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
13105 op2 = copy_to_mode_reg (Pmode, op2);
13106
13107 if (!insn_data[icode].operand[3].predicate (op3, mode3))
13108 {
13109 error ("the forth argument must be scale 1, 2, 4, 8");
13110 return const0_rtx;
13111 }
13112
13113 if (!insn_data[icode].operand[4].predicate (op4, mode4))
13114 {
13115 error ("incorrect hint operand");
13116 return const0_rtx;
13117 }
13118
13119 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
13120 if (! pat)
13121 return const0_rtx;
13122
13123 emit_insn (pat);
13124
13125 return 0;
13126
13127 case IX86_BUILTIN_XABORT:
13128 icode = CODE_FOR_xabort;
13129 arg0 = CALL_EXPR_ARG (exp, 0);
13130 op0 = expand_normal (arg0);
13131 mode0 = insn_data[icode].operand[0].mode;
13132 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13133 {
13134 error ("the argument to %<xabort%> intrinsic must "
13135 "be an 8-bit immediate");
13136 return const0_rtx;
13137 }
13138 emit_insn (gen_xabort (op0));
13139 return 0;
13140
13141 case IX86_BUILTIN_RDSSPD:
13142 case IX86_BUILTIN_RDSSPQ:
13143 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
13144
13145 if (target == 0
13146 || !register_operand (target, mode))
13147 target = gen_reg_rtx (mode);
13148
13149 op0 = force_reg (mode, const0_rtx);
13150
13151 emit_insn (gen_rdssp (mode, target, op0));
13152 return target;
13153
13154 case IX86_BUILTIN_INCSSPD:
13155 case IX86_BUILTIN_INCSSPQ:
13156 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
13157
13158 arg0 = CALL_EXPR_ARG (exp, 0);
13159 op0 = expand_normal (arg0);
13160
13161 op0 = force_reg (mode, op0);
13162
13163 emit_insn (gen_incssp (mode, op0));
13164 return 0;
13165
13166 case IX86_BUILTIN_HRESET:
13167 icode = CODE_FOR_hreset;
13168 arg0 = CALL_EXPR_ARG (exp, 0);
13169 op0 = expand_normal (arg0);
13170 op0 = force_reg (SImode, op0);
13171 emit_insn (gen_hreset (op0));
13172 return 0;
13173
13174 case IX86_BUILTIN_RSTORSSP:
13175 case IX86_BUILTIN_CLRSSBSY:
13176 arg0 = CALL_EXPR_ARG (exp, 0);
13177 op0 = expand_normal (arg0);
13178 icode = (fcode == IX86_BUILTIN_RSTORSSP
13179 ? CODE_FOR_rstorssp
13180 : CODE_FOR_clrssbsy);
13181
13182 if (!address_operand (op0, VOIDmode))
13183 {
13184 op0 = convert_memory_address (Pmode, op0);
13185 op0 = copy_addr_to_reg (op0);
13186 }
13187 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
13188 return 0;
13189
13190 case IX86_BUILTIN_WRSSD:
13191 case IX86_BUILTIN_WRSSQ:
13192 case IX86_BUILTIN_WRUSSD:
13193 case IX86_BUILTIN_WRUSSQ:
13194 mode = ((fcode == IX86_BUILTIN_WRSSD
13195 || fcode == IX86_BUILTIN_WRUSSD)
13196 ? SImode : DImode);
13197
13198 arg0 = CALL_EXPR_ARG (exp, 0);
13199 op0 = expand_normal (arg0);
13200 arg1 = CALL_EXPR_ARG (exp, 1);
13201 op1 = expand_normal (arg1);
13202
13203 op0 = force_reg (mode, op0);
13204
13205 if (!address_operand (op1, VOIDmode))
13206 {
13207 op1 = convert_memory_address (Pmode, op1);
13208 op1 = copy_addr_to_reg (op1);
13209 }
13210 op1 = gen_rtx_MEM (mode, op1);
13211
13212 icode = ((fcode == IX86_BUILTIN_WRSSD
13213 || fcode == IX86_BUILTIN_WRSSQ)
13214 ? code_for_wrss (mode)
13215 : code_for_wruss (mode));
13216 emit_insn (GEN_FCN (icode) (op0, op1));
13217
13218 return 0;
13219
13220 case IX86_BUILTIN_VZEROUPPER:
13221 cfun->machine->has_explicit_vzeroupper = true;
13222 break;
13223
13224 default:
13225 break;
13226 }
13227
13228 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
13229 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
13230 {
13231 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
13232 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
13233 target);
13234 }
13235
13236 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
13237 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
13238 {
13239 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
13240 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
13241 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
13242 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
13243 int masked = 1;
13244 machine_mode mode, wide_mode, nar_mode;
13245
13246 nar_mode = V4SFmode;
13247 mode = V16SFmode;
13248 wide_mode = V64SFmode;
13249 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
13250 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
13251
13252 switch (fcode)
13253 {
13254 case IX86_BUILTIN_4FMAPS:
13255 fcn = gen_avx5124fmaddps_4fmaddps;
13256 masked = 0;
13257 goto v4fma_expand;
13258
13259 case IX86_BUILTIN_4DPWSSD:
13260 nar_mode = V4SImode;
13261 mode = V16SImode;
13262 wide_mode = V64SImode;
13263 fcn = gen_avx5124vnniw_vp4dpwssd;
13264 masked = 0;
13265 goto v4fma_expand;
13266
13267 case IX86_BUILTIN_4DPWSSDS:
13268 nar_mode = V4SImode;
13269 mode = V16SImode;
13270 wide_mode = V64SImode;
13271 fcn = gen_avx5124vnniw_vp4dpwssds;
13272 masked = 0;
13273 goto v4fma_expand;
13274
13275 case IX86_BUILTIN_4FNMAPS:
13276 fcn = gen_avx5124fmaddps_4fnmaddps;
13277 masked = 0;
13278 goto v4fma_expand;
13279
13280 case IX86_BUILTIN_4FNMAPS_MASK:
13281 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
13282 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
13283 goto v4fma_expand;
13284
13285 case IX86_BUILTIN_4DPWSSD_MASK:
13286 nar_mode = V4SImode;
13287 mode = V16SImode;
13288 wide_mode = V64SImode;
13289 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
13290 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
13291 goto v4fma_expand;
13292
13293 case IX86_BUILTIN_4DPWSSDS_MASK:
13294 nar_mode = V4SImode;
13295 mode = V16SImode;
13296 wide_mode = V64SImode;
13297 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
13298 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
13299 goto v4fma_expand;
13300
13301 case IX86_BUILTIN_4FMAPS_MASK:
13302 {
13303 tree args[4];
13304 rtx ops[4];
13305 rtx wide_reg;
13306 rtx accum;
13307 rtx addr;
13308 rtx mem;
13309
13310 v4fma_expand:
13311 wide_reg = gen_reg_rtx (wide_mode);
13312 for (i = 0; i < 4; i++)
13313 {
13314 args[i] = CALL_EXPR_ARG (exp, i);
13315 ops[i] = expand_normal (args[i]);
13316
13317 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
13318 ops[i]);
13319 }
13320
13321 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13322 accum = force_reg (mode, accum);
13323
13324 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13325 addr = force_reg (Pmode, addr);
13326
13327 mem = gen_rtx_MEM (nar_mode, addr);
13328
13329 target = gen_reg_rtx (mode);
13330
13331 emit_move_insn (target, accum);
13332
13333 if (! masked)
13334 emit_insn (fcn (target, accum, wide_reg, mem));
13335 else
13336 {
13337 rtx merge, mask;
13338 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13339
13340 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13341
13342 if (CONST_INT_P (mask))
13343 mask = fixup_modeless_constant (mask, HImode);
13344
13345 mask = force_reg (HImode, mask);
13346
13347 if (GET_MODE (mask) != HImode)
13348 mask = gen_rtx_SUBREG (HImode, mask, 0);
13349
13350 /* If merge is 0 then we're about to emit the z-masked variant. */
13351 if (const0_operand (merge, mode))
13352 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13353 /* If merge is the same as accum then emit the merge-masked variant. */
13354 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13355 {
13356 merge = force_reg (mode, merge);
13357 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13358 }
13359 /* Merging with something unknown can happen if we z-mask with -O0. */
13360 else
13361 {
13362 target = gen_reg_rtx (mode);
13363 emit_move_insn (target, merge);
13364 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13365 }
13366 }
13367 return target;
13368 }
13369
13370 case IX86_BUILTIN_4FNMASS:
13371 fcn = gen_avx5124fmaddps_4fnmaddss;
13372 masked = 0;
13373 goto s4fma_expand;
13374
13375 case IX86_BUILTIN_4FMASS:
13376 fcn = gen_avx5124fmaddps_4fmaddss;
13377 masked = 0;
13378 goto s4fma_expand;
13379
13380 case IX86_BUILTIN_4FNMASS_MASK:
13381 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
13382 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
13383 goto s4fma_expand;
13384
13385 case IX86_BUILTIN_4FMASS_MASK:
13386 {
13387 tree args[4];
13388 rtx ops[4];
13389 rtx wide_reg;
13390 rtx accum;
13391 rtx addr;
13392 rtx mem;
13393
13394 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13395 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13396
13397 s4fma_expand:
13398 mode = V4SFmode;
13399 wide_reg = gen_reg_rtx (V64SFmode);
13400 for (i = 0; i < 4; i++)
13401 {
13402 rtx tmp;
13403 args[i] = CALL_EXPR_ARG (exp, i);
13404 ops[i] = expand_normal (args[i]);
13405
13406 tmp = gen_reg_rtx (SFmode);
13407 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13408
13409 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13410 gen_rtx_SUBREG (V16SFmode, tmp, 0));
13411 }
13412
13413 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13414 accum = force_reg (V4SFmode, accum);
13415
13416 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13417 addr = force_reg (Pmode, addr);
13418
13419 mem = gen_rtx_MEM (V4SFmode, addr);
13420
13421 target = gen_reg_rtx (V4SFmode);
13422
13423 emit_move_insn (target, accum);
13424
13425 if (! masked)
13426 emit_insn (fcn (target, accum, wide_reg, mem));
13427 else
13428 {
13429 rtx merge, mask;
13430 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13431
13432 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13433
13434 if (CONST_INT_P (mask))
13435 mask = fixup_modeless_constant (mask, QImode);
13436
13437 mask = force_reg (QImode, mask);
13438
13439 if (GET_MODE (mask) != QImode)
13440 mask = gen_rtx_SUBREG (QImode, mask, 0);
13441
13442 /* If merge is 0 then we're about to emit the z-masked variant. */
13443 if (const0_operand (merge, mode))
13444 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13445 /* If merge is the same as accum then emit the merge-masked
13446 variant. */
13447 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13448 {
13449 merge = force_reg (mode, merge);
13450 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13451 }
13452 /* Merging with something unknown can happen if we z-mask
13453 with -O0. */
13454 else
13455 {
13456 target = gen_reg_rtx (mode);
13457 emit_move_insn (target, merge);
13458 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13459 }
13460 }
13461 return target;
13462 }
13463 case IX86_BUILTIN_RDPID:
13464 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13465 target);
13466 case IX86_BUILTIN_FABSQ:
13467 case IX86_BUILTIN_COPYSIGNQ:
13468 if (!TARGET_SSE)
13469 /* Emit a normal call if SSE isn't available. */
13470 return expand_call (exp, target, ignore);
13471 /* FALLTHRU */
13472 default:
13473 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13474 }
13475 }
13476
13477 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13478 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13479 {
13480 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13481 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13482 }
13483
13484 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13485 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13486 {
13487 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13488 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13489 }
13490
13491 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13492 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13493 {
13494 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13495 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13496 }
13497
13498 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13499 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13500 {
13501 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13502 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13503 }
13504
13505 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13506 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13507 {
13508 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13509 const struct builtin_description *d = bdesc_multi_arg + i;
13510 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13511 (enum ix86_builtin_func_type)
13512 d->flag, d->comparison);
13513 }
13514
13515 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13516 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13517 {
13518 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13519 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13520 target);
13521 }
13522
13523 gcc_unreachable ();
13524 }
13525
13526 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13527 fill target with val via vec_duplicate. */
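/* Illustrative sketch (editorial addition): for MODE == V4SImode and an
   SImode VAL the code below first emits

     (set (reg:V4SI target) (vec_duplicate:V4SI (reg:SI val)))

   and, if that is not recognized as-is, forces VAL into a register of
   the inner mode and retries the same vec_duplicate pattern. */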
13528
13529 static bool
13530 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13531 {
13532 bool ok;
13533 rtx_insn *insn;
13534 rtx dup;
13535
13536 /* First attempt to recognize VAL as-is. */
13537 dup = gen_vec_duplicate (mode, val);
13538 insn = emit_insn (gen_rtx_SET (target, dup));
13539 if (recog_memoized (insn) < 0)
13540 {
13541 rtx_insn *seq;
13542 machine_mode innermode = GET_MODE_INNER (mode);
13543 rtx reg;
13544
13545 /* If that fails, force VAL into a register. */
13546
13547 start_sequence ();
13548 reg = force_reg (innermode, val);
13549 if (GET_MODE (reg) != innermode)
13550 reg = gen_lowpart (innermode, reg);
13551 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13552 seq = get_insns ();
13553 end_sequence ();
13554 if (seq)
13555 emit_insn_before (seq, insn);
13556
13557 ok = recog_memoized (insn) >= 0;
13558 gcc_assert (ok);
13559 }
13560 return true;
13561 }
13562
13563 /* Get a vector mode of the same size as the original but with elements
13564 twice as wide. This is only guaranteed to apply to integral vectors. */
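/* Illustrative example (editorial addition): V16QImode maps to V8HImode
   and V8HImode to V4SImode -- the total size stays the same while the
   element count halves, as the asserts below check. */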
13565
13566 static machine_mode
13567 get_mode_wider_vector (machine_mode o)
13568 {
13569 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13570 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13571 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13572 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13573 return n;
13574 }
13575
13576 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13577 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13578
13579 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13580 with all elements equal to VAR. Return true if successful. */
13581
13582 static bool
13583 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13584 rtx target, rtx val)
13585 {
13586 bool ok;
13587
13588 switch (mode)
13589 {
13590 case E_V2SImode:
13591 case E_V2SFmode:
13592 if (!mmx_ok)
13593 return false;
13594 /* FALLTHRU */
13595
13596 case E_V4DFmode:
13597 case E_V4DImode:
13598 case E_V8SFmode:
13599 case E_V8SImode:
13600 case E_V2DFmode:
13601 case E_V2DImode:
13602 case E_V4SFmode:
13603 case E_V4SImode:
13604 case E_V16SImode:
13605 case E_V8DImode:
13606 case E_V16SFmode:
13607 case E_V8DFmode:
13608 return ix86_vector_duplicate_value (mode, target, val);
13609
13610 case E_V4HImode:
13611 if (!mmx_ok)
13612 return false;
13613 if (TARGET_SSE || TARGET_3DNOW_A)
13614 {
13615 rtx x;
13616
13617 val = gen_lowpart (SImode, val);
13618 x = gen_rtx_TRUNCATE (HImode, val);
13619 x = gen_rtx_VEC_DUPLICATE (mode, x);
13620 emit_insn (gen_rtx_SET (target, x));
13621 return true;
13622 }
13623 goto widen;
13624
13625 case E_V8QImode:
13626 if (!mmx_ok)
13627 return false;
13628 goto widen;
13629
13630 case E_V8HImode:
13631 if (TARGET_AVX2)
13632 return ix86_vector_duplicate_value (mode, target, val);
13633
13634 if (TARGET_SSE2)
13635 {
13636 struct expand_vec_perm_d dperm;
13637 rtx tmp1, tmp2;
13638
13639 permute:
13640 memset (&dperm, 0, sizeof (dperm));
13641 dperm.target = target;
13642 dperm.vmode = mode;
13643 dperm.nelt = GET_MODE_NUNITS (mode);
13644 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13645 dperm.one_operand_p = true;
13646
13647 /* Extend to SImode using a paradoxical SUBREG. */
13648 tmp1 = gen_reg_rtx (SImode);
13649 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13650
13651 /* Insert the SImode value as low element of a V4SImode vector. */
13652 tmp2 = gen_reg_rtx (V4SImode);
13653 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13654 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13655
13656 ok = (expand_vec_perm_1 (&dperm)
13657 || expand_vec_perm_broadcast_1 (&dperm));
13658 gcc_assert (ok);
13659 return ok;
13660 }
13661 goto widen;
13662
13663 case E_V16QImode:
13664 if (TARGET_AVX2)
13665 return ix86_vector_duplicate_value (mode, target, val);
13666
13667 if (TARGET_SSE2)
13668 goto permute;
13669 goto widen;
13670
13671 widen:
13672 /* Replicate the value once into the next wider mode and recurse. */
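/* Illustrative sketch (editorial addition): for V8QImode with value v
   the wider vector mode is V4HImode; v is zero-extended to HImode,
   combined as (v << 8) | v, and that HImode value is broadcast by the
   recursive call. */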
13673 {
13674 machine_mode smode, wsmode, wvmode;
13675 rtx x;
13676
13677 smode = GET_MODE_INNER (mode);
13678 wvmode = get_mode_wider_vector (mode);
13679 wsmode = GET_MODE_INNER (wvmode);
13680
13681 val = convert_modes (wsmode, smode, val, true);
13682 x = expand_simple_binop (wsmode, ASHIFT, val,
13683 GEN_INT (GET_MODE_BITSIZE (smode)),
13684 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13685 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13686
13687 x = gen_reg_rtx (wvmode);
13688 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13689 gcc_assert (ok);
13690 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13691 return ok;
13692 }
13693
13694 case E_V16HImode:
13695 case E_V32QImode:
13696 if (TARGET_AVX2)
13697 return ix86_vector_duplicate_value (mode, target, val);
13698 else
13699 {
13700 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13701 rtx x = gen_reg_rtx (hvmode);
13702
13703 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13704 gcc_assert (ok);
13705
13706 x = gen_rtx_VEC_CONCAT (mode, x, x);
13707 emit_insn (gen_rtx_SET (target, x));
13708 }
13709 return true;
13710
13711 case E_V64QImode:
13712 case E_V32HImode:
13713 if (TARGET_AVX512BW)
13714 return ix86_vector_duplicate_value (mode, target, val);
13715 else
13716 {
13717 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13718 rtx x = gen_reg_rtx (hvmode);
13719
13720 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13721 gcc_assert (ok);
13722
13723 x = gen_rtx_VEC_CONCAT (mode, x, x);
13724 emit_insn (gen_rtx_SET (target, x));
13725 }
13726 return true;
13727
13728 default:
13729 return false;
13730 }
13731 }
13732
13733 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13734 whose ONE_VAR element is VAR, and other elements are zero. Return true
13735 if successful. */
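/* Illustrative example (editorial addition): for V4SImode with the
   variable value in element 0 this can emit a single

     (set (reg:V4SI target)
          (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI var))
                          (const_vector:V4SI [0 0 0 0])
                          (const_int 1)))

   For other element positions the value is placed in element 0 first
   and then shuffled into place. */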
13736
13737 static bool
13738 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13739 rtx target, rtx var, int one_var)
13740 {
13741 machine_mode vsimode;
13742 rtx new_target;
13743 rtx x, tmp;
13744 bool use_vector_set = false;
13745 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13746
13747 switch (mode)
13748 {
13749 case E_V2DImode:
13750 /* For SSE4.1, we normally use vector set. But if the second
13751 element is zero and inter-unit moves are OK, we use movq
13752 instead. */
13753 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13754 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13755 && one_var == 0));
13756 break;
13757 case E_V16QImode:
13758 case E_V4SImode:
13759 case E_V4SFmode:
13760 use_vector_set = TARGET_SSE4_1;
13761 break;
13762 case E_V8HImode:
13763 use_vector_set = TARGET_SSE2;
13764 break;
13765 case E_V8QImode:
13766 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
13767 break;
13768 case E_V4HImode:
13769 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13770 break;
13771 case E_V32QImode:
13772 case E_V16HImode:
13773 use_vector_set = TARGET_AVX;
13774 break;
13775 case E_V8SImode:
13776 use_vector_set = TARGET_AVX;
13777 gen_vec_set_0 = gen_vec_setv8si_0;
13778 break;
13779 case E_V8SFmode:
13780 use_vector_set = TARGET_AVX;
13781 gen_vec_set_0 = gen_vec_setv8sf_0;
13782 break;
13783 case E_V4DFmode:
13784 use_vector_set = TARGET_AVX;
13785 gen_vec_set_0 = gen_vec_setv4df_0;
13786 break;
13787 case E_V4DImode:
13788 /* Use ix86_expand_vector_set in 64bit mode only. */
13789 use_vector_set = TARGET_AVX && TARGET_64BIT;
13790 gen_vec_set_0 = gen_vec_setv4di_0;
13791 break;
13792 case E_V16SImode:
13793 use_vector_set = TARGET_AVX512F && one_var == 0;
13794 gen_vec_set_0 = gen_vec_setv16si_0;
13795 break;
13796 case E_V16SFmode:
13797 use_vector_set = TARGET_AVX512F && one_var == 0;
13798 gen_vec_set_0 = gen_vec_setv16sf_0;
13799 break;
13800 case E_V8DFmode:
13801 use_vector_set = TARGET_AVX512F && one_var == 0;
13802 gen_vec_set_0 = gen_vec_setv8df_0;
13803 break;
13804 case E_V8DImode:
13805 /* Use ix86_expand_vector_set in 64bit mode only. */
13806 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13807 gen_vec_set_0 = gen_vec_setv8di_0;
13808 break;
13809 default:
13810 break;
13811 }
13812
13813 if (use_vector_set)
13814 {
13815 if (gen_vec_set_0 && one_var == 0)
13816 {
13817 var = force_reg (GET_MODE_INNER (mode), var);
13818 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13819 return true;
13820 }
13821 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13822 var = force_reg (GET_MODE_INNER (mode), var);
13823 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13824 return true;
13825 }
13826
13827 switch (mode)
13828 {
13829 case E_V2SFmode:
13830 case E_V2SImode:
13831 if (!mmx_ok)
13832 return false;
13833 /* FALLTHRU */
13834
13835 case E_V2DFmode:
13836 case E_V2DImode:
13837 if (one_var != 0)
13838 return false;
13839 var = force_reg (GET_MODE_INNER (mode), var);
13840 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13841 emit_insn (gen_rtx_SET (target, x));
13842 return true;
13843
13844 case E_V4SFmode:
13845 case E_V4SImode:
13846 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13847 new_target = gen_reg_rtx (mode);
13848 else
13849 new_target = target;
13850 var = force_reg (GET_MODE_INNER (mode), var);
13851 x = gen_rtx_VEC_DUPLICATE (mode, var);
13852 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13853 emit_insn (gen_rtx_SET (new_target, x));
13854 if (one_var != 0)
13855 {
13856 /* We need to shuffle the value to the correct position, so
13857 create a new pseudo to store the intermediate result. */
13858
13859 /* With SSE2, we can use the integer shuffle insns. */
13860 if (mode != V4SFmode && TARGET_SSE2)
13861 {
13862 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13863 const1_rtx,
13864 GEN_INT (one_var == 1 ? 0 : 1),
13865 GEN_INT (one_var == 2 ? 0 : 1),
13866 GEN_INT (one_var == 3 ? 0 : 1)));
13867 if (target != new_target)
13868 emit_move_insn (target, new_target);
13869 return true;
13870 }
13871
13872 /* Otherwise convert the intermediate result to V4SFmode and
13873 use the SSE1 shuffle instructions. */
13874 if (mode != V4SFmode)
13875 {
13876 tmp = gen_reg_rtx (V4SFmode);
13877 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13878 }
13879 else
13880 tmp = new_target;
13881
13882 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13883 const1_rtx,
13884 GEN_INT (one_var == 1 ? 0 : 1),
13885 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13886 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13887
13888 if (mode != V4SFmode)
13889 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13890 else if (tmp != target)
13891 emit_move_insn (target, tmp);
13892 }
13893 else if (target != new_target)
13894 emit_move_insn (target, new_target);
13895 return true;
13896
13897 case E_V8HImode:
13898 case E_V16QImode:
13899 vsimode = V4SImode;
13900 goto widen;
13901 case E_V4HImode:
13902 case E_V8QImode:
13903 if (!mmx_ok)
13904 return false;
13905 vsimode = V2SImode;
13906 goto widen;
13907 widen:
13908 if (one_var != 0)
13909 return false;
13910
13911 /* Zero extend the variable element to SImode and recurse. */
13912 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13913
13914 x = gen_reg_rtx (vsimode);
13915 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13916 var, one_var))
13917 gcc_unreachable ();
13918
13919 emit_move_insn (target, gen_lowpart (mode, x));
13920 return true;
13921
13922 default:
13923 return false;
13924 }
13925 }
13926
13927 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13928 consisting of the values in VALS. It is known that all elements
13929 except ONE_VAR are constants. Return true if successful. */
13930
13931 static bool
13932 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13933 rtx target, rtx vals, int one_var)
13934 {
13935 rtx var = XVECEXP (vals, 0, one_var);
13936 machine_mode wmode;
13937 rtx const_vec, x;
13938
13939 const_vec = copy_rtx (vals);
13940 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13941 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13942
13943 switch (mode)
13944 {
13945 case E_V2DFmode:
13946 case E_V2DImode:
13947 case E_V2SFmode:
13948 case E_V2SImode:
13949 /* For the two element vectors, it's just as easy to use
13950 the general case. */
13951 return false;
13952
13953 case E_V4DImode:
13954 /* Use ix86_expand_vector_set in 64bit mode only. */
13955 if (!TARGET_64BIT)
13956 return false;
13957 /* FALLTHRU */
13958 case E_V4DFmode:
13959 case E_V8SFmode:
13960 case E_V8SImode:
13961 case E_V16HImode:
13962 case E_V32QImode:
13963 case E_V4SFmode:
13964 case E_V4SImode:
13965 case E_V8HImode:
13966 case E_V4HImode:
13967 break;
13968
13969 case E_V16QImode:
13970 if (TARGET_SSE4_1)
13971 break;
13972 wmode = V8HImode;
13973 goto widen;
13974 case E_V8QImode:
13975 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
13976 break;
13977 wmode = V4HImode;
13978 goto widen;
13979 widen:
13980 /* There's no way to set one QImode entry easily. Combine
13981 the variable value with its adjacent constant value, and
13982 promote to an HImode set. */
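/* Illustrative example (editorial addition): if the variable byte is at
   an odd index the combined HImode value is
   (var << 8) | (adjacent_const & 0xff); at an even index it is
   var | (adjacent_const << 8). The HImode pair is then inserted at
   position one_var >> 1. */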
13983 x = XVECEXP (vals, 0, one_var ^ 1);
13984 if (one_var & 1)
13985 {
13986 var = convert_modes (HImode, QImode, var, true);
13987 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13988 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13989 x = GEN_INT (INTVAL (x) & 0xff);
13990 }
13991 else
13992 {
13993 var = convert_modes (HImode, QImode, var, true);
13994 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13995 }
13996 if (x != const0_rtx)
13997 var = expand_simple_binop (HImode, IOR, var, x, var,
13998 1, OPTAB_LIB_WIDEN);
13999
14000 x = gen_reg_rtx (wmode);
14001 emit_move_insn (x, gen_lowpart (wmode, const_vec));
14002 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
14003
14004 emit_move_insn (target, gen_lowpart (mode, x));
14005 return true;
14006
14007 default:
14008 return false;
14009 }
14010
14011 emit_move_insn (target, const_vec);
14012 ix86_expand_vector_set (mmx_ok, target, var, one_var);
14013 return true;
14014 }
14015
14016 /* A subroutine of ix86_expand_vector_init_general. Use vector
14017 concatenate to handle the most general case: all values variable,
14018 and none identical. */
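/* Illustrative sketch (editorial addition): a V8SImode build from eight
   scalar operands is split into two V4SImode halves, each half is
   initialized recursively via ix86_expand_vector_init, and the result
   is formed by a single (vec_concat:V8SI lo hi). */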
14019
14020 static void
14021 ix86_expand_vector_init_concat (machine_mode mode,
14022 rtx target, rtx *ops, int n)
14023 {
14024 machine_mode half_mode = VOIDmode;
14025 rtx half[2];
14026 rtvec v;
14027 int i, j;
14028
14029 switch (n)
14030 {
14031 case 2:
14032 switch (mode)
14033 {
14034 case E_V16SImode:
14035 half_mode = V8SImode;
14036 break;
14037 case E_V16SFmode:
14038 half_mode = V8SFmode;
14039 break;
14040 case E_V8DImode:
14041 half_mode = V4DImode;
14042 break;
14043 case E_V8DFmode:
14044 half_mode = V4DFmode;
14045 break;
14046 case E_V8SImode:
14047 half_mode = V4SImode;
14048 break;
14049 case E_V8SFmode:
14050 half_mode = V4SFmode;
14051 break;
14052 case E_V4DImode:
14053 half_mode = V2DImode;
14054 break;
14055 case E_V4DFmode:
14056 half_mode = V2DFmode;
14057 break;
14058 case E_V4SImode:
14059 half_mode = V2SImode;
14060 break;
14061 case E_V4SFmode:
14062 half_mode = V2SFmode;
14063 break;
14064 case E_V2DImode:
14065 half_mode = DImode;
14066 break;
14067 case E_V2SImode:
14068 half_mode = SImode;
14069 break;
14070 case E_V2DFmode:
14071 half_mode = DFmode;
14072 break;
14073 case E_V2SFmode:
14074 half_mode = SFmode;
14075 break;
14076 default:
14077 gcc_unreachable ();
14078 }
14079
14080 if (!register_operand (ops[1], half_mode))
14081 ops[1] = force_reg (half_mode, ops[1]);
14082 if (!register_operand (ops[0], half_mode))
14083 ops[0] = force_reg (half_mode, ops[0]);
14084 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
14085 ops[1])));
14086 break;
14087
14088 case 4:
14089 switch (mode)
14090 {
14091 case E_V4DImode:
14092 half_mode = V2DImode;
14093 break;
14094 case E_V4DFmode:
14095 half_mode = V2DFmode;
14096 break;
14097 case E_V4SImode:
14098 half_mode = V2SImode;
14099 break;
14100 case E_V4SFmode:
14101 half_mode = V2SFmode;
14102 break;
14103 default:
14104 gcc_unreachable ();
14105 }
14106 goto half;
14107
14108 case 8:
14109 switch (mode)
14110 {
14111 case E_V8DImode:
14112 half_mode = V4DImode;
14113 break;
14114 case E_V8DFmode:
14115 half_mode = V4DFmode;
14116 break;
14117 case E_V8SImode:
14118 half_mode = V4SImode;
14119 break;
14120 case E_V8SFmode:
14121 half_mode = V4SFmode;
14122 break;
14123 default:
14124 gcc_unreachable ();
14125 }
14126 goto half;
14127
14128 case 16:
14129 switch (mode)
14130 {
14131 case E_V16SImode:
14132 half_mode = V8SImode;
14133 break;
14134 case E_V16SFmode:
14135 half_mode = V8SFmode;
14136 break;
14137 default:
14138 gcc_unreachable ();
14139 }
14140 goto half;
14141
14142 half:
14143 /* FIXME: We process inputs backward to help RA. PR 36222. */
14144 i = n - 1;
14145 for (j = 1; j != -1; j--)
14146 {
14147 half[j] = gen_reg_rtx (half_mode);
14148 switch (n >> 1)
14149 {
14150 case 2:
14151 v = gen_rtvec (2, ops[i-1], ops[i]);
14152 i -= 2;
14153 break;
14154 case 4:
14155 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
14156 i -= 4;
14157 break;
14158 case 8:
14159 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
14160 ops[i-3], ops[i-2], ops[i-1], ops[i]);
14161 i -= 8;
14162 break;
14163 default:
14164 gcc_unreachable ();
14165 }
14166 ix86_expand_vector_init (false, half[j],
14167 gen_rtx_PARALLEL (half_mode, v));
14168 }
14169
14170 ix86_expand_vector_init_concat (mode, target, half, 2);
14171 break;
14172
14173 default:
14174 gcc_unreachable ();
14175 }
14176 }
14177
14178 /* A subroutine of ix86_expand_vector_init_general. Use vector
14179 interleave to handle the most general case: all values variable,
14180 and none identical. */
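/* Illustrative sketch (editorial addition): for V8HImode each pair of
   elements is placed in the low 32 bits of its own vector (the first
   element via a paradoxical SImode move, the second via a vec_set at
   position 1); the resulting vectors, viewed as V4SImode, are merged
   with vec_interleave_lowv4si, and the two V2DImode intermediates are
   merged with vec_interleave_lowv2di into the final vector. */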
14181
14182 static void
14183 ix86_expand_vector_init_interleave (machine_mode mode,
14184 rtx target, rtx *ops, int n)
14185 {
14186 machine_mode first_imode, second_imode, third_imode, inner_mode;
14187 int i, j;
14188 rtx op0, op1;
14189 rtx (*gen_load_even) (rtx, rtx, rtx);
14190 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
14191 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
14192
14193 switch (mode)
14194 {
14195 case E_V8HImode:
14196 gen_load_even = gen_vec_setv8hi;
14197 gen_interleave_first_low = gen_vec_interleave_lowv4si;
14198 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14199 inner_mode = HImode;
14200 first_imode = V4SImode;
14201 second_imode = V2DImode;
14202 third_imode = VOIDmode;
14203 break;
14204 case E_V16QImode:
14205 gen_load_even = gen_vec_setv16qi;
14206 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
14207 gen_interleave_second_low = gen_vec_interleave_lowv4si;
14208 inner_mode = QImode;
14209 first_imode = V8HImode;
14210 second_imode = V4SImode;
14211 third_imode = V2DImode;
14212 break;
14213 default:
14214 gcc_unreachable ();
14215 }
14216
14217 for (i = 0; i < n; i++)
14218 {
14219 /* Extend the odd element to SImode using a paradoxical SUBREG. */
14220 op0 = gen_reg_rtx (SImode);
14221 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
14222
14223 /* Insert the SImode value as the low element of a V4SImode vector. */
14224 op1 = gen_reg_rtx (V4SImode);
14225 op0 = gen_rtx_VEC_MERGE (V4SImode,
14226 gen_rtx_VEC_DUPLICATE (V4SImode,
14227 op0),
14228 CONST0_RTX (V4SImode),
14229 const1_rtx);
14230 emit_insn (gen_rtx_SET (op1, op0));
14231
14232 /* Cast the V4SImode vector back to a vector in the original mode. */
14233 op0 = gen_reg_rtx (mode);
14234 emit_move_insn (op0, gen_lowpart (mode, op1));
14235
14236 /* Load even elements into the second position. */
14237 emit_insn (gen_load_even (op0,
14238 force_reg (inner_mode,
14239 ops [i + i + 1]),
14240 const1_rtx));
14241
14242 /* Cast vector to FIRST_IMODE vector. */
14243 ops[i] = gen_reg_rtx (first_imode);
14244 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
14245 }
14246
14247 /* Interleave low FIRST_IMODE vectors. */
14248 for (i = j = 0; i < n; i += 2, j++)
14249 {
14250 op0 = gen_reg_rtx (first_imode);
14251 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
14252
14253 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
14254 ops[j] = gen_reg_rtx (second_imode);
14255 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
14256 }
14257
14258 /* Interleave low SECOND_IMODE vectors. */
14259 switch (second_imode)
14260 {
14261 case E_V4SImode:
14262 for (i = j = 0; i < n / 2; i += 2, j++)
14263 {
14264 op0 = gen_reg_rtx (second_imode);
14265 emit_insn (gen_interleave_second_low (op0, ops[i],
14266 ops[i + 1]));
14267
14268 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
14269 vector. */
14270 ops[j] = gen_reg_rtx (third_imode);
14271 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
14272 }
14273 second_imode = V2DImode;
14274 gen_interleave_second_low = gen_vec_interleave_lowv2di;
14275 /* FALLTHRU */
14276
14277 case E_V2DImode:
14278 op0 = gen_reg_rtx (second_imode);
14279 emit_insn (gen_interleave_second_low (op0, ops[0],
14280 ops[1]));
14281
14282 /* Cast the SECOND_IMODE vector back to a vector in the original
14283 mode. */
14284 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
14285 break;
14286
14287 default:
14288 gcc_unreachable ();
14289 }
14290 }
14291
14292 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
14293 all values variable, and none identical. */
14294
14295 static void
14296 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
14297 rtx target, rtx vals)
14298 {
14299 rtx ops[64], op0, op1, op2, op3, op4, op5;
14300 machine_mode half_mode = VOIDmode;
14301 machine_mode quarter_mode = VOIDmode;
14302 int n, i;
14303
14304 switch (mode)
14305 {
14306 case E_V2SFmode:
14307 case E_V2SImode:
14308 if (!mmx_ok && !TARGET_SSE)
14309 break;
14310 /* FALLTHRU */
14311
14312 case E_V16SImode:
14313 case E_V16SFmode:
14314 case E_V8DFmode:
14315 case E_V8DImode:
14316 case E_V8SFmode:
14317 case E_V8SImode:
14318 case E_V4DFmode:
14319 case E_V4DImode:
14320 case E_V4SFmode:
14321 case E_V4SImode:
14322 case E_V2DFmode:
14323 case E_V2DImode:
14324 n = GET_MODE_NUNITS (mode);
14325 for (i = 0; i < n; i++)
14326 ops[i] = XVECEXP (vals, 0, i);
14327 ix86_expand_vector_init_concat (mode, target, ops, n);
14328 return;
14329
14330 case E_V2TImode:
14331 for (i = 0; i < 2; i++)
14332 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14333 op0 = gen_reg_rtx (V4DImode);
14334 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
14335 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14336 return;
14337
14338 case E_V4TImode:
14339 for (i = 0; i < 4; i++)
14340 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
14341 ops[4] = gen_reg_rtx (V4DImode);
14342 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
14343 ops[5] = gen_reg_rtx (V4DImode);
14344 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
14345 op0 = gen_reg_rtx (V8DImode);
14346 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
14347 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
14348 return;
14349
14350 case E_V32QImode:
14351 half_mode = V16QImode;
14352 goto half;
14353
14354 case E_V16HImode:
14355 half_mode = V8HImode;
14356 goto half;
14357
14358 half:
14359 n = GET_MODE_NUNITS (mode);
14360 for (i = 0; i < n; i++)
14361 ops[i] = XVECEXP (vals, 0, i);
14362 op0 = gen_reg_rtx (half_mode);
14363 op1 = gen_reg_rtx (half_mode);
14364 ix86_expand_vector_init_interleave (half_mode, op0, ops,
14365 n >> 2);
14366 ix86_expand_vector_init_interleave (half_mode, op1,
14367 &ops [n >> 1], n >> 2);
14368 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
14369 return;
14370
14371 case E_V64QImode:
14372 quarter_mode = V16QImode;
14373 half_mode = V32QImode;
14374 goto quarter;
14375
14376 case E_V32HImode:
14377 quarter_mode = V8HImode;
14378 half_mode = V16HImode;
14379 goto quarter;
14380
14381 quarter:
14382 n = GET_MODE_NUNITS (mode);
14383 for (i = 0; i < n; i++)
14384 ops[i] = XVECEXP (vals, 0, i);
14385 op0 = gen_reg_rtx (quarter_mode);
14386 op1 = gen_reg_rtx (quarter_mode);
14387 op2 = gen_reg_rtx (quarter_mode);
14388 op3 = gen_reg_rtx (quarter_mode);
14389 op4 = gen_reg_rtx (half_mode);
14390 op5 = gen_reg_rtx (half_mode);
14391 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14392 n >> 3);
14393 ix86_expand_vector_init_interleave (quarter_mode, op1,
14394 &ops [n >> 2], n >> 3);
14395 ix86_expand_vector_init_interleave (quarter_mode, op2,
14396 &ops [n >> 1], n >> 3);
14397 ix86_expand_vector_init_interleave (quarter_mode, op3,
14398 &ops [(n >> 1) | (n >> 2)], n >> 3);
14399 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14400 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14401 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14402 return;
14403
14404 case E_V16QImode:
14405 if (!TARGET_SSE4_1)
14406 break;
14407 /* FALLTHRU */
14408
14409 case E_V8HImode:
14410 if (!TARGET_SSE2)
14411 break;
14412
14413 /* Don't use ix86_expand_vector_init_interleave if we can't
14414 move from GPR to SSE register directly. */
14415 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14416 break;
14417
14418 n = GET_MODE_NUNITS (mode);
14419 for (i = 0; i < n; i++)
14420 ops[i] = XVECEXP (vals, 0, i);
14421 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14422 return;
14423
14424 case E_V4HImode:
14425 case E_V8QImode:
14426 break;
14427
14428 default:
14429 gcc_unreachable ();
14430 }
14431
14432 {
14433 int i, j, n_elts, n_words, n_elt_per_word;
14434 machine_mode inner_mode;
14435 rtx words[4], shift;
14436
14437 inner_mode = GET_MODE_INNER (mode);
14438 n_elts = GET_MODE_NUNITS (mode);
14439 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14440 n_elt_per_word = n_elts / n_words;
14441 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14442
14443 for (i = 0; i < n_words; ++i)
14444 {
14445 rtx word = NULL_RTX;
14446
14447 for (j = 0; j < n_elt_per_word; ++j)
14448 {
14449 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14450 elt = convert_modes (word_mode, inner_mode, elt, true);
14451
14452 if (j == 0)
14453 word = elt;
14454 else
14455 {
14456 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14457 word, 1, OPTAB_LIB_WIDEN);
14458 word = expand_simple_binop (word_mode, IOR, word, elt,
14459 word, 1, OPTAB_LIB_WIDEN);
14460 }
14461 }
14462
14463 words[i] = word;
14464 }
14465
14466 if (n_words == 1)
14467 emit_move_insn (target, gen_lowpart (mode, words[0]));
14468 else if (n_words == 2)
14469 {
14470 rtx tmp = gen_reg_rtx (mode);
14471 emit_clobber (tmp);
14472 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14473 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14474 emit_move_insn (target, tmp);
14475 }
14476 else if (n_words == 4)
14477 {
14478 rtx tmp = gen_reg_rtx (V4SImode);
14479 gcc_assert (word_mode == SImode);
14480 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14481 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14482 emit_move_insn (target, gen_lowpart (mode, tmp));
14483 }
14484 else
14485 gcc_unreachable ();
14486 }
14487 }
14488
14489 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14490 instructions unless MMX_OK is true. */
14491
14492 void
14493 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14494 {
14495 machine_mode mode = GET_MODE (target);
14496 machine_mode inner_mode = GET_MODE_INNER (mode);
14497 int n_elts = GET_MODE_NUNITS (mode);
14498 int n_var = 0, one_var = -1;
14499 bool all_same = true, all_const_zero = true;
14500 int i;
14501 rtx x;
14502
14503 /* First handle initialization from vector (rather than scalar) elements. */
14504 if (n_elts != XVECLEN (vals, 0))
14505 {
14506 rtx subtarget = target;
14507 x = XVECEXP (vals, 0, 0);
14508 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14509 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14510 {
14511 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14512 if (inner_mode == QImode || inner_mode == HImode)
14513 {
14514 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14515 mode = mode_for_vector (SImode, n_bits / 4).require ();
14516 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14517 ops[0] = gen_lowpart (inner_mode, ops[0]);
14518 ops[1] = gen_lowpart (inner_mode, ops[1]);
14519 subtarget = gen_reg_rtx (mode);
14520 }
14521 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14522 if (subtarget != target)
14523 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14524 return;
14525 }
14526 gcc_unreachable ();
14527 }
14528
14529 for (i = 0; i < n_elts; ++i)
14530 {
14531 x = XVECEXP (vals, 0, i);
14532 if (!(CONST_SCALAR_INT_P (x)
14533 || CONST_DOUBLE_P (x)
14534 || CONST_FIXED_P (x)))
14535 n_var++, one_var = i;
14536 else if (x != CONST0_RTX (inner_mode))
14537 all_const_zero = false;
14538 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14539 all_same = false;
14540 }
14541
14542 /* Constants are best loaded from the constant pool. */
14543 if (n_var == 0)
14544 {
14545 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14546 return;
14547 }
14548
14549 /* If all values are identical, broadcast the value. */
14550 if (all_same
14551 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14552 XVECEXP (vals, 0, 0)))
14553 return;
14554
14555 /* Values where only one field is non-constant are best loaded from
14556 the pool and overwritten via move later. */
14557 if (n_var == 1)
14558 {
14559 if (all_const_zero
14560 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14561 XVECEXP (vals, 0, one_var),
14562 one_var))
14563 return;
14564
14565 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14566 return;
14567 }
14568
14569 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14570 }
14571
14572 /* Implemented as
14573 V setg (V v, int idx, T val)
14574 {
14575 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
14576 V valv = (V){val, val, val, val, val, val, val, val};
14577 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
14578 v = (v & ~mask) | (valv & mask);
14579 return v;
14580 }. */
14581 void
14582 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
14583 {
14584 rtx vec[64];
14585 machine_mode mode = GET_MODE (target);
14586 machine_mode cmp_mode = mode;
14587 int n_elts = GET_MODE_NUNITS (mode);
14588 rtx valv,idxv,constv,idx_tmp;
14589 bool ok = false;
14590
14591 /* 512-bit vector byte/word broadcast and comparison are only available
14592 under TARGET_AVX512BW; without it, break the 512-bit vector into
14593 two 256-bit vectors. */
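/* Editorial note (illustrative): the high half is handled with an
   adjusted index, idx_hi = idx - n_elts/2, computed via
   ix86_expand_binary_operator below; both halves are processed by the
   recursive calls and re-joined with a vec_concat. */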
14594 if ((mode == V32HImode || mode == V64QImode) && !TARGET_AVX512BW)
14595 {
14596 gcc_assert (TARGET_AVX512F);
14597 rtx vhi, vlo, idx_hi;
14598 machine_mode half_mode;
14599 rtx (*extract_hi)(rtx, rtx);
14600 rtx (*extract_lo)(rtx, rtx);
14601
14602 if (mode == V32HImode)
14603 {
14604 half_mode = V16HImode;
14605 extract_hi = gen_vec_extract_hi_v32hi;
14606 extract_lo = gen_vec_extract_lo_v32hi;
14607 }
14608 else
14609 {
14610 half_mode = V32QImode;
14611 extract_hi = gen_vec_extract_hi_v64qi;
14612 extract_lo = gen_vec_extract_lo_v64qi;
14613 }
14614
14615 vhi = gen_reg_rtx (half_mode);
14616 vlo = gen_reg_rtx (half_mode);
14617 idx_hi = gen_reg_rtx (GET_MODE (idx));
14618 emit_insn (extract_hi (vhi, target));
14619 emit_insn (extract_lo (vlo, target));
14620 vec[0] = idx_hi;
14621 vec[1] = idx;
14622 vec[2] = GEN_INT (n_elts/2);
14623 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
14624 ix86_expand_vector_set_var (vhi, val, idx_hi);
14625 ix86_expand_vector_set_var (vlo, val, idx);
14626 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
14627 return;
14628 }
14629
14630 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
14631 {
14632 switch (mode)
14633 {
14634 case E_V2DFmode:
14635 cmp_mode = V2DImode;
14636 break;
14637 case E_V4DFmode:
14638 cmp_mode = V4DImode;
14639 break;
14640 case E_V8DFmode:
14641 cmp_mode = V8DImode;
14642 break;
14643 case E_V4SFmode:
14644 cmp_mode = V4SImode;
14645 break;
14646 case E_V8SFmode:
14647 cmp_mode = V8SImode;
14648 break;
14649 case E_V16SFmode:
14650 cmp_mode = V16SImode;
14651 break;
14652 default:
14653 gcc_unreachable ();
14654 }
14655 }
14656
14657 for (int i = 0; i != n_elts; i++)
14658 vec[i] = GEN_INT (i);
14659 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
14660 valv = gen_reg_rtx (mode);
14661 idxv = gen_reg_rtx (cmp_mode);
14662 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
14663
14664 ok = ix86_expand_vector_init_duplicate (false, mode, valv, val);
14665 gcc_assert (ok);
14666 ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp);
14667 gcc_assert (ok);
14668 vec[0] = target;
14669 vec[1] = valv;
14670 vec[2] = target;
14671 vec[3] = gen_rtx_EQ (mode, idxv, constv);
14672 vec[4] = idxv;
14673 vec[5] = constv;
14674 ok = ix86_expand_int_vcond (vec);
14675 gcc_assert (ok);
14676 }
14677
14678 void
14679 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14680 {
14681 machine_mode mode = GET_MODE (target);
14682 machine_mode inner_mode = GET_MODE_INNER (mode);
14683 machine_mode half_mode;
14684 bool use_vec_merge = false;
14685 rtx tmp;
14686 static rtx (*gen_extract[6][2]) (rtx, rtx)
14687 = {
14688 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14689 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14690 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14691 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14692 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14693 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14694 };
14695 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14696 = {
14697 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14698 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14699 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14700 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14701 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14702 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14703 };
14704 int i, j, n;
14705 machine_mode mmode = VOIDmode;
14706 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14707
14708 switch (mode)
14709 {
14710 case E_V2SImode:
14711 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14712 if (use_vec_merge)
14713 break;
14714 /* FALLTHRU */
14715
14716 case E_V2SFmode:
14717 if (mmx_ok)
14718 {
14719 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14720 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14721 if (elt == 0)
14722 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14723 else
14724 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14725 emit_insn (gen_rtx_SET (target, tmp));
14726 return;
14727 }
14728 break;
14729
14730 case E_V2DImode:
14731 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14732 if (use_vec_merge)
14733 break;
14734
14735 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14736 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14737 if (elt == 0)
14738 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14739 else
14740 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14741 emit_insn (gen_rtx_SET (target, tmp));
14742 return;
14743
14744 case E_V2DFmode:
14745 /* NB: For ELT == 0, use standard scalar operation patterns which
14746 preserve the rest of the vector for combiner:
14747
14748 (vec_merge:V2DF
14749 (vec_duplicate:V2DF (reg:DF))
14750 (reg:V2DF)
14751 (const_int 1))
14752 */
14753 if (elt == 0)
14754 goto do_vec_merge;
14755
14756 {
14757 rtx op0, op1;
14758
14759 /* For the two element vectors, we implement a VEC_CONCAT with
14760 the extraction of the other element. */
14761
14762 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14763 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14764
14765 if (elt == 0)
14766 op0 = val, op1 = tmp;
14767 else
14768 op0 = tmp, op1 = val;
14769
14770 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14771 emit_insn (gen_rtx_SET (target, tmp));
14772 }
14773 return;
14774
14775 case E_V4SFmode:
14776 use_vec_merge = TARGET_SSE4_1;
14777 if (use_vec_merge)
14778 break;
14779
14780 switch (elt)
14781 {
14782 case 0:
14783 use_vec_merge = true;
14784 break;
14785
14786 case 1:
14787 /* tmp = target = A B C D */
14788 tmp = copy_to_reg (target);
14789 /* target = A A B B */
14790 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14791 /* target = X A B B */
14792 ix86_expand_vector_set (false, target, val, 0);
14793 /* target = A X C D */
14794 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14795 const1_rtx, const0_rtx,
14796 GEN_INT (2+4), GEN_INT (3+4)));
14797 return;
14798
14799 case 2:
14800 /* tmp = target = A B C D */
14801 tmp = copy_to_reg (target);
14802 /* tmp = X B C D */
14803 ix86_expand_vector_set (false, tmp, val, 0);
14804 /* target = A B X D */
14805 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14806 const0_rtx, const1_rtx,
14807 GEN_INT (0+4), GEN_INT (3+4)));
14808 return;
14809
14810 case 3:
14811 /* tmp = target = A B C D */
14812 tmp = copy_to_reg (target);
14813 /* tmp = X B C D */
14814 ix86_expand_vector_set (false, tmp, val, 0);
14815 /* target = A B C X */
14816 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14817 const0_rtx, const1_rtx,
14818 GEN_INT (2+4), GEN_INT (0+4)));
14819 return;
14820
14821 default:
14822 gcc_unreachable ();
14823 }
14824 break;
14825
14826 case E_V4SImode:
14827 use_vec_merge = TARGET_SSE4_1;
14828 if (use_vec_merge)
14829 break;
14830
14831 /* Element 0 handled by vec_merge below. */
14832 if (elt == 0)
14833 {
14834 use_vec_merge = true;
14835 break;
14836 }
14837
14838 if (TARGET_SSE2)
14839 {
14840 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14841 store into element 0, then shuffle them back. */
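/* Illustrative example (editorial addition): to set element 2 of
   {A,B,C,D} the order vector becomes {2,1,0,3}; the first pshufd
   yields {C,B,A,D}, element 0 is then overwritten giving {X,B,A,D},
   and applying the same pshufd again restores the layout as
   {A,B,X,D}. */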
14842
14843 rtx order[4];
14844
14845 order[0] = GEN_INT (elt);
14846 order[1] = const1_rtx;
14847 order[2] = const2_rtx;
14848 order[3] = GEN_INT (3);
14849 order[elt] = const0_rtx;
14850
14851 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14852 order[1], order[2], order[3]));
14853
14854 ix86_expand_vector_set (false, target, val, 0);
14855
14856 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14857 order[1], order[2], order[3]));
14858 }
14859 else
14860 {
14861 /* For SSE1, we have to reuse the V4SF code. */
14862 rtx t = gen_reg_rtx (V4SFmode);
14863 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14864 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14865 emit_move_insn (target, gen_lowpart (mode, t));
14866 }
14867 return;
14868
14869 case E_V8HImode:
14870 use_vec_merge = TARGET_SSE2;
14871 break;
14872 case E_V4HImode:
14873 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14874 break;
14875
14876 case E_V16QImode:
14877 use_vec_merge = TARGET_SSE4_1;
14878 break;
14879
14880 case E_V8QImode:
14881 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14882 break;
14883
14884 case E_V32QImode:
14885 half_mode = V16QImode;
14886 j = 0;
14887 n = 16;
14888 goto half;
14889
14890 case E_V16HImode:
14891 half_mode = V8HImode;
14892 j = 1;
14893 n = 8;
14894 goto half;
14895
14896 case E_V8SImode:
14897 half_mode = V4SImode;
14898 j = 2;
14899 n = 4;
14900 goto half;
14901
14902 case E_V4DImode:
14903 half_mode = V2DImode;
14904 j = 3;
14905 n = 2;
14906 goto half;
14907
14908 case E_V8SFmode:
14909 half_mode = V4SFmode;
14910 j = 4;
14911 n = 4;
14912 goto half;
14913
14914 case E_V4DFmode:
14915 half_mode = V2DFmode;
14916 j = 5;
14917 n = 2;
14918 goto half;
14919
14920 half:
14921 /* Compute offset. */
14922 i = elt / n;
14923 elt %= n;
14924
14925 gcc_assert (i <= 1);
14926
14927 /* Extract the half. */
14928 tmp = gen_reg_rtx (half_mode);
14929 emit_insn (gen_extract[j][i] (tmp, target));
14930
14931 /* Put val in tmp at elt. */
14932 ix86_expand_vector_set (false, tmp, val, elt);
14933
14934 /* Put it back. */
14935 emit_insn (gen_insert[j][i] (target, target, tmp));
14936 return;
14937
14938 case E_V8DFmode:
14939 if (TARGET_AVX512F)
14940 {
14941 mmode = QImode;
14942 gen_blendm = gen_avx512f_blendmv8df;
14943 }
14944 break;
14945
14946 case E_V8DImode:
14947 if (TARGET_AVX512F)
14948 {
14949 mmode = QImode;
14950 gen_blendm = gen_avx512f_blendmv8di;
14951 }
14952 break;
14953
14954 case E_V16SFmode:
14955 if (TARGET_AVX512F)
14956 {
14957 mmode = HImode;
14958 gen_blendm = gen_avx512f_blendmv16sf;
14959 }
14960 break;
14961
14962 case E_V16SImode:
14963 if (TARGET_AVX512F)
14964 {
14965 mmode = HImode;
14966 gen_blendm = gen_avx512f_blendmv16si;
14967 }
14968 break;
14969
14970 case E_V32HImode:
14971 if (TARGET_AVX512BW)
14972 {
14973 mmode = SImode;
14974 gen_blendm = gen_avx512bw_blendmv32hi;
14975 }
14976 else if (TARGET_AVX512F)
14977 {
14978 half_mode = E_V8HImode;
14979 n = 8;
14980 goto quarter;
14981 }
14982 break;
14983
14984 case E_V64QImode:
14985 if (TARGET_AVX512BW)
14986 {
14987 mmode = DImode;
14988 gen_blendm = gen_avx512bw_blendmv64qi;
14989 }
14990 else if (TARGET_AVX512F)
14991 {
14992 half_mode = E_V16QImode;
14993 n = 16;
14994 goto quarter;
14995 }
14996 break;
14997
14998 quarter:
14999 /* Compute offset. */
15000 i = elt / n;
15001 elt %= n;
15002
15003 gcc_assert (i <= 3);
15004
15005 {
15006 /* Extract the quarter. */
15007 tmp = gen_reg_rtx (V4SImode);
15008 rtx tmp2 = gen_lowpart (V16SImode, target);
15009 rtx mask = gen_reg_rtx (QImode);
15010
15011 emit_move_insn (mask, constm1_rtx);
15012 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
15013 tmp, mask));
15014
15015 tmp2 = gen_reg_rtx (half_mode);
15016 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
15017 tmp = tmp2;
15018
15019 /* Put val in tmp at elt. */
15020 ix86_expand_vector_set (false, tmp, val, elt);
15021
15022 /* Put it back. */
15023 tmp2 = gen_reg_rtx (V16SImode);
15024 rtx tmp3 = gen_lowpart (V16SImode, target);
15025 mask = gen_reg_rtx (HImode);
15026 emit_move_insn (mask, constm1_rtx);
15027 tmp = gen_lowpart (V4SImode, tmp);
15028 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
15029 tmp3, mask));
15030 emit_move_insn (target, gen_lowpart (mode, tmp2));
15031 }
15032 return;
15033
15034 default:
15035 break;
15036 }
15037
15038 if (mmode != VOIDmode)
15039 {
15040 tmp = gen_reg_rtx (mode);
15041 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
15042 /* The avx512*_blendm<mode> expanders have a different operand order
15043 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
15044 elements where the mask is set and the second otherwise; in
15045 {sse,avx}*_*blend* the first input operand is used for elements
15046 where the mask is clear and the second otherwise. */
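/* Editorial note (illustrative): the mask is the single bit 1 << ELT in
   the mask mode MMODE -- e.g. setting element 3 of a V16SFmode vector
   uses the HImode value 0x8 -- so the blend takes element ELT from the
   broadcast in TMP and every other element from TARGET. */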
15047 emit_insn (gen_blendm (target, target, tmp,
15048 force_reg (mmode,
15049 gen_int_mode (HOST_WIDE_INT_1U << elt,
15050 mmode))));
15051 }
15052 else if (use_vec_merge)
15053 {
15054 do_vec_merge:
15055 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
15056 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
15057 GEN_INT (HOST_WIDE_INT_1U << elt));
15058 emit_insn (gen_rtx_SET (target, tmp));
15059 }
15060 else
15061 {
15062 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15063
15064 emit_move_insn (mem, target);
15065
15066 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
15067 emit_move_insn (tmp, val);
15068
15069 emit_move_insn (target, mem);
15070 }
15071 }
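
/* As a concrete illustration of the blendm path above: to set element 5 of
a V16SFmode TARGET, VAL is broadcast into a scratch register and merged with
the HImode mask 1 << 5 (0x0020), so vblendmps replaces element 5 only and
leaves the other fifteen elements of TARGET untouched.  When no suitable
blend or vec_merge form exists, the fallback spills TARGET to a stack slot,
stores VAL into the selected element and reloads the whole vector.  */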
15072
15073 void
15074 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
15075 {
15076 machine_mode mode = GET_MODE (vec);
15077 machine_mode inner_mode = GET_MODE_INNER (mode);
15078 bool use_vec_extr = false;
15079 rtx tmp;
15080
15081 switch (mode)
15082 {
15083 case E_V2SImode:
15084 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15085 if (use_vec_extr)
15086 break;
15087 /* FALLTHRU */
15088
15089 case E_V2SFmode:
15090 if (!mmx_ok)
15091 break;
15092 /* FALLTHRU */
15093
15094 case E_V2DFmode:
15095 case E_V2DImode:
15096 case E_V2TImode:
15097 case E_V4TImode:
15098 use_vec_extr = true;
15099 break;
15100
15101 case E_V4SFmode:
15102 use_vec_extr = TARGET_SSE4_1;
15103 if (use_vec_extr)
15104 break;
15105
15106 switch (elt)
15107 {
15108 case 0:
15109 tmp = vec;
15110 break;
15111
15112 case 1:
15113 case 3:
15114 tmp = gen_reg_rtx (mode);
15115 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
15116 GEN_INT (elt), GEN_INT (elt),
15117 GEN_INT (elt+4), GEN_INT (elt+4)));
15118 break;
15119
15120 case 2:
15121 tmp = gen_reg_rtx (mode);
15122 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
15123 break;
15124
15125 default:
15126 gcc_unreachable ();
15127 }
15128 vec = tmp;
15129 use_vec_extr = true;
15130 elt = 0;
15131 break;
15132
15133 case E_V4SImode:
15134 use_vec_extr = TARGET_SSE4_1;
15135 if (use_vec_extr)
15136 break;
15137
15138 if (TARGET_SSE2)
15139 {
15140 switch (elt)
15141 {
15142 case 0:
15143 tmp = vec;
15144 break;
15145
15146 case 1:
15147 case 3:
15148 tmp = gen_reg_rtx (mode);
15149 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
15150 GEN_INT (elt), GEN_INT (elt),
15151 GEN_INT (elt), GEN_INT (elt)));
15152 break;
15153
15154 case 2:
15155 tmp = gen_reg_rtx (mode);
15156 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
15157 break;
15158
15159 default:
15160 gcc_unreachable ();
15161 }
15162 vec = tmp;
15163 use_vec_extr = true;
15164 elt = 0;
15165 }
15166 else
15167 {
15168 /* For SSE1, we have to reuse the V4SF code. */
15169 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
15170 gen_lowpart (V4SFmode, vec), elt);
15171 return;
15172 }
15173 break;
15174
15175 case E_V8HImode:
15176 use_vec_extr = TARGET_SSE2;
15177 break;
15178 case E_V4HImode:
15179 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
15180 break;
15181
15182 case E_V16QImode:
15183 use_vec_extr = TARGET_SSE4_1;
15184 if (!use_vec_extr
15185 && TARGET_SSE2
15186 && elt == 0
15187 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
15188 {
15189 tmp = gen_reg_rtx (SImode);
15190 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
15191 0);
15192 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
15193 return;
15194 }
15195 break;
15196
15197 case E_V8SFmode:
15198 if (TARGET_AVX)
15199 {
15200 tmp = gen_reg_rtx (V4SFmode);
15201 if (elt < 4)
15202 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
15203 else
15204 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
15205 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15206 return;
15207 }
15208 break;
15209
15210 case E_V4DFmode:
15211 if (TARGET_AVX)
15212 {
15213 tmp = gen_reg_rtx (V2DFmode);
15214 if (elt < 2)
15215 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
15216 else
15217 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
15218 ix86_expand_vector_extract (false, target, tmp, elt & 1);
15219 return;
15220 }
15221 break;
15222
15223 case E_V32QImode:
15224 if (TARGET_AVX)
15225 {
15226 tmp = gen_reg_rtx (V16QImode);
15227 if (elt < 16)
15228 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
15229 else
15230 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
15231 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15232 return;
15233 }
15234 break;
15235
15236 case E_V16HImode:
15237 if (TARGET_AVX)
15238 {
15239 tmp = gen_reg_rtx (V8HImode);
15240 if (elt < 8)
15241 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
15242 else
15243 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
15244 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15245 return;
15246 }
15247 break;
15248
15249 case E_V8SImode:
15250 if (TARGET_AVX)
15251 {
15252 tmp = gen_reg_rtx (V4SImode);
15253 if (elt < 4)
15254 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
15255 else
15256 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
15257 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15258 return;
15259 }
15260 break;
15261
15262 case E_V4DImode:
15263 if (TARGET_AVX)
15264 {
15265 tmp = gen_reg_rtx (V2DImode);
15266 if (elt < 2)
15267 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
15268 else
15269 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
15270 ix86_expand_vector_extract (false, target, tmp, elt & 1);
15271 return;
15272 }
15273 break;
15274
15275 case E_V32HImode:
15276 if (TARGET_AVX512BW)
15277 {
15278 tmp = gen_reg_rtx (V16HImode);
15279 if (elt < 16)
15280 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
15281 else
15282 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
15283 ix86_expand_vector_extract (false, target, tmp, elt & 15);
15284 return;
15285 }
15286 break;
15287
15288 case E_V64QImode:
15289 if (TARGET_AVX512BW)
15290 {
15291 tmp = gen_reg_rtx (V32QImode);
15292 if (elt < 32)
15293 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
15294 else
15295 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
15296 ix86_expand_vector_extract (false, target, tmp, elt & 31);
15297 return;
15298 }
15299 break;
15300
15301 case E_V16SFmode:
15302 tmp = gen_reg_rtx (V8SFmode);
15303 if (elt < 8)
15304 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
15305 else
15306 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
15307 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15308 return;
15309
15310 case E_V8DFmode:
15311 tmp = gen_reg_rtx (V4DFmode);
15312 if (elt < 4)
15313 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
15314 else
15315 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
15316 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15317 return;
15318
15319 case E_V16SImode:
15320 tmp = gen_reg_rtx (V8SImode);
15321 if (elt < 8)
15322 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
15323 else
15324 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
15325 ix86_expand_vector_extract (false, target, tmp, elt & 7);
15326 return;
15327
15328 case E_V8DImode:
15329 tmp = gen_reg_rtx (V4DImode);
15330 if (elt < 4)
15331 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
15332 else
15333 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
15334 ix86_expand_vector_extract (false, target, tmp, elt & 3);
15335 return;
15336
15337 case E_V8QImode:
15338 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15339 /* ??? Could extract the appropriate HImode element and shift. */
15340 break;
15341
15342 default:
15343 break;
15344 }
15345
15346 if (use_vec_extr)
15347 {
15348 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
15349 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
15350
15351 /* Let the rtl optimizers know about the zero extension performed. */
15352 if (inner_mode == QImode || inner_mode == HImode)
15353 {
15354 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
15355 target = gen_lowpart (SImode, target);
15356 }
15357
15358 emit_insn (gen_rtx_SET (target, tmp));
15359 }
15360 else
15361 {
15362 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
15363
15364 emit_move_insn (mem, vec);
15365
15366 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
15367 emit_move_insn (target, tmp);
15368 }
15369 }
15370
15371 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
15372 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
15373 The upper bits of DEST are undefined, though they shouldn't cause
15374 exceptions (some bits from src or all zeros are ok). */
15375
15376 static void
15377 emit_reduc_half (rtx dest, rtx src, int i)
15378 {
15379 rtx tem, d = dest;
15380 switch (GET_MODE (src))
15381 {
15382 case E_V4SFmode:
15383 if (i == 128)
15384 tem = gen_sse_movhlps (dest, src, src);
15385 else
15386 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
15387 GEN_INT (1 + 4), GEN_INT (1 + 4));
15388 break;
15389 case E_V2DFmode:
15390 tem = gen_vec_interleave_highv2df (dest, src, src);
15391 break;
15392 case E_V16QImode:
15393 case E_V8HImode:
15394 case E_V4SImode:
15395 case E_V2DImode:
15396 d = gen_reg_rtx (V1TImode);
15397 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
15398 GEN_INT (i / 2));
15399 break;
15400 case E_V8SFmode:
15401 if (i == 256)
15402 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
15403 else
15404 tem = gen_avx_shufps256 (dest, src, src,
15405 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
15406 break;
15407 case E_V4DFmode:
15408 if (i == 256)
15409 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
15410 else
15411 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
15412 break;
15413 case E_V32QImode:
15414 case E_V16HImode:
15415 case E_V8SImode:
15416 case E_V4DImode:
15417 if (i == 256)
15418 {
15419 if (GET_MODE (dest) != V4DImode)
15420 d = gen_reg_rtx (V4DImode);
15421 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
15422 gen_lowpart (V4DImode, src),
15423 const1_rtx);
15424 }
15425 else
15426 {
15427 d = gen_reg_rtx (V2TImode);
15428 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
15429 GEN_INT (i / 2));
15430 }
15431 break;
15432 case E_V64QImode:
15433 case E_V32HImode:
15434 if (i < 64)
15435 {
15436 d = gen_reg_rtx (V4TImode);
15437 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
15438 GEN_INT (i / 2));
15439 break;
15440 }
15441 /* FALLTHRU */
15442 case E_V16SImode:
15443 case E_V16SFmode:
15444 case E_V8DImode:
15445 case E_V8DFmode:
15446 if (i > 128)
15447 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
15448 gen_lowpart (V16SImode, src),
15449 gen_lowpart (V16SImode, src),
15450 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
15451 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
15452 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
15453 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
15454 GEN_INT (0xC), GEN_INT (0xD),
15455 GEN_INT (0xE), GEN_INT (0xF),
15456 GEN_INT (0x10), GEN_INT (0x11),
15457 GEN_INT (0x12), GEN_INT (0x13),
15458 GEN_INT (0x14), GEN_INT (0x15),
15459 GEN_INT (0x16), GEN_INT (0x17));
15460 else
15461 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
15462 gen_lowpart (V16SImode, src),
15463 GEN_INT (i == 128 ? 0x2 : 0x1),
15464 GEN_INT (0x3),
15465 GEN_INT (0x3),
15466 GEN_INT (0x3),
15467 GEN_INT (i == 128 ? 0x6 : 0x5),
15468 GEN_INT (0x7),
15469 GEN_INT (0x7),
15470 GEN_INT (0x7),
15471 GEN_INT (i == 128 ? 0xA : 0x9),
15472 GEN_INT (0xB),
15473 GEN_INT (0xB),
15474 GEN_INT (0xB),
15475 GEN_INT (i == 128 ? 0xE : 0xD),
15476 GEN_INT (0xF),
15477 GEN_INT (0xF),
15478 GEN_INT (0xF));
15479 break;
15480 default:
15481 gcc_unreachable ();
15482 }
15483 emit_insn (tem);
15484 if (d != dest)
15485 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15486 }
15487
15488 /* Expand a vector reduction. FN is the binary pattern to reduce;
15489 DEST is the destination; IN is the input vector. */
15490
15491 void
15492 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15493 {
15494 rtx half, dst, vec = in;
15495 machine_mode mode = GET_MODE (in);
15496 int i;
15497
15498 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15499 if (TARGET_SSE4_1
15500 && mode == V8HImode
15501 && fn == gen_uminv8hi3)
15502 {
15503 emit_insn (gen_sse4_1_phminposuw (dest, in));
15504 return;
15505 }
15506
15507 for (i = GET_MODE_BITSIZE (mode);
15508 i > GET_MODE_UNIT_BITSIZE (mode);
15509 i >>= 1)
15510 {
15511 half = gen_reg_rtx (mode);
15512 emit_reduc_half (half, vec, i);
15513 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15514 dst = dest;
15515 else
15516 dst = gen_reg_rtx (mode);
15517 emit_insn (fn (dst, half, vec));
15518 vec = dst;
15519 }
15520 }
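
/* As an illustration, a PLUS reduction of a V8SImode vector
{a,b,c,d,e,f,g,h} iterates with i = 256, 128 and 64:
i == 256: half = {e,f,g,h,...}, vec = {a+e,b+f,c+g,d+h,...}
i == 128: half = {c+g,d+h,...}, vec = {a+e+c+g,b+f+d+h,...}
i ==  64: half = {b+f+d+h,...}, and since i is now twice the unit size
the final combination is written directly into DEST.
Only element 0 of DEST carries the reduction result; the remaining
elements are not meaningful.  */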
15521
15522 /* Output code to perform a conditional jump to LABEL if the C2 flag
15523 in the FP status register is set. */
15524
15525 void
15526 ix86_emit_fp_unordered_jump (rtx label)
15527 {
15528 rtx reg = gen_reg_rtx (HImode);
15529 rtx_insn *insn;
15530 rtx temp;
15531
15532 emit_insn (gen_x86_fnstsw_1 (reg));
15533
15534 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15535 {
15536 emit_insn (gen_x86_sahf_1 (reg));
15537
15538 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15539 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15540 }
15541 else
15542 {
15543 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15544
15545 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15546 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15547 }
15548
15549 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15550 gen_rtx_LABEL_REF (VOIDmode, label),
15551 pc_rtx);
15552 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15553 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15554 JUMP_LABEL (insn) = label;
15555 }
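
/* C2 is bit 10 of the x87 status word, i.e. bit 2 of the byte that fnstsw
leaves in the upper half of REG.  The SAHF path copies that byte into
EFLAGS, where C2 lands in PF, which is what the UNORDERED condition above
tests; the fallback tests the bit in the status byte directly, hence the
0x04 mask.  */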
15556
15557 /* Output code to perform a sinh XFmode calculation. */
15558
15559 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15560 {
15561 rtx e1 = gen_reg_rtx (XFmode);
15562 rtx e2 = gen_reg_rtx (XFmode);
15563 rtx scratch = gen_reg_rtx (HImode);
15564 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15565 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15566 rtx cst1, tmp;
15567 rtx_code_label *jump_label = gen_label_rtx ();
15568 rtx_insn *insn;
15569
15570 /* scratch = fxam (op1) */
15571 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15572
15573 /* e1 = expm1 (|op1|) */
15574 emit_insn (gen_absxf2 (e2, op1));
15575 emit_insn (gen_expm1xf2 (e1, e2));
15576
15577 /* e2 = e1 / (e1 + 1.0) + e1 */
15578 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15579 emit_insn (gen_addxf3 (e2, e1, cst1));
15580 emit_insn (gen_divxf3 (e2, e1, e2));
15581 emit_insn (gen_addxf3 (e2, e2, e1));
15582
15583 /* flags = signbit (op1) */
15584 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15585
15586 /* if (flags) then e2 = -e2 */
15587 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15588 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15589 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15590 pc_rtx);
15591 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15592 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15593 JUMP_LABEL (insn) = jump_label;
15594
15595 emit_insn (gen_negxf2 (e2, e2));
15596
15597 emit_label (jump_label);
15598 LABEL_NUSES (jump_label) = 1;
15599
15600 /* op0 = 0.5 * e2 */
15601 half = force_reg (XFmode, half);
15602 emit_insn (gen_mulxf3 (op0, e2, half));
15603 }
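
/* The sequence above follows from writing u = expm1 (|x|), so that
e^|x| = u + 1 and e^-|x| = 1 / (u + 1):
sinh (|x|) = (e^|x| - e^-|x|) / 2 = 0.5 * (u / (u + 1) + u).
Using expm1 rather than exp avoids the cancellation in e^|x| - e^-|x|
for small |x|; the sign of the argument is then restored from the fxam
sign bit.  */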
15604
15605 /* Output code to perform a cosh XFmode calculation. */
15606
15607 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15608 {
15609 rtx e1 = gen_reg_rtx (XFmode);
15610 rtx e2 = gen_reg_rtx (XFmode);
15611 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15612 rtx cst1;
15613
15614 /* e1 = exp (op1) */
15615 emit_insn (gen_expxf2 (e1, op1));
15616
15617 /* e2 = e1 + 1.0 / e1 */
15618 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15619 emit_insn (gen_divxf3 (e2, cst1, e1));
15620 emit_insn (gen_addxf3 (e2, e1, e2));
15621
15622 /* op0 = 0.5 * e2 */
15623 half = force_reg (XFmode, half);
15624 emit_insn (gen_mulxf3 (op0, e2, half));
15625 }
15626
15627 /* Output code to perform a tanh XFmode calculation. */
15628
15629 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15630 {
15631 rtx e1 = gen_reg_rtx (XFmode);
15632 rtx e2 = gen_reg_rtx (XFmode);
15633 rtx scratch = gen_reg_rtx (HImode);
15634 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15635 rtx cst2, tmp;
15636 rtx_code_label *jump_label = gen_label_rtx ();
15637 rtx_insn *insn;
15638
15639 /* scratch = fxam (op1) */
15640 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15641
15642 /* e1 = expm1 (-|2 * op1|) */
15643 emit_insn (gen_addxf3 (e2, op1, op1));
15644 emit_insn (gen_absxf2 (e2, e2));
15645 emit_insn (gen_negxf2 (e2, e2));
15646 emit_insn (gen_expm1xf2 (e1, e2));
15647
15648 /* e2 = e1 / (e1 + 2.0) */
15649 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15650 emit_insn (gen_addxf3 (e2, e1, cst2));
15651 emit_insn (gen_divxf3 (e2, e1, e2));
15652
15653 /* flags = signbit (op1) */
15654 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15655
15656 /* if (!flags) then e2 = -e2 */
15657 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15658 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15659 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15660 pc_rtx);
15661 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15662 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15663 JUMP_LABEL (insn) = jump_label;
15664
15665 emit_insn (gen_negxf2 (e2, e2));
15666
15667 emit_label (jump_label);
15668 LABEL_NUSES (jump_label) = 1;
15669
15670 emit_move_insn (op0, e2);
15671 }
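
/* Derivation of the sequence above: with u = expm1 (-|2 * x|), so that
e^(-2*|x|) = u + 1,
tanh (|x|) = (1 - e^(-2*|x|)) / (1 + e^(-2*|x|)) = -u / (u + 2),
i.e. minus the value computed into e2.  The conditional negation therefore
yields tanh (|x|) for nonnegative x and keeps -tanh (|x|) = tanh (x) for
negative x.  */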
15672
15673 /* Output code to perform an asinh XFmode calculation. */
15674
15675 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15676 {
15677 rtx e1 = gen_reg_rtx (XFmode);
15678 rtx e2 = gen_reg_rtx (XFmode);
15679 rtx scratch = gen_reg_rtx (HImode);
15680 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15681 rtx cst1, tmp;
15682 rtx_code_label *jump_label = gen_label_rtx ();
15683 rtx_insn *insn;
15684
15685 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15686 emit_insn (gen_mulxf3 (e1, op1, op1));
15687 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15688 emit_insn (gen_addxf3 (e2, e1, cst1));
15689 emit_insn (gen_sqrtxf2 (e2, e2));
15690 emit_insn (gen_addxf3 (e2, e2, cst1));
15691
15692 /* e1 = e1 / e2 */
15693 emit_insn (gen_divxf3 (e1, e1, e2));
15694
15695 /* scratch = fxam (op1) */
15696 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15697
15698 /* e1 = e1 + |op1| */
15699 emit_insn (gen_absxf2 (e2, op1));
15700 emit_insn (gen_addxf3 (e1, e1, e2));
15701
15702 /* e2 = log1p (e1) */
15703 ix86_emit_i387_log1p (e2, e1);
15704
15705 /* flags = signbit (op1) */
15706 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15707
15708 /* if (flags) then e2 = -e2 */
15709 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15710 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15711 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15712 pc_rtx);
15713 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15714 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15715 JUMP_LABEL (insn) = jump_label;
15716
15717 emit_insn (gen_negxf2 (e2, e2));
15718
15719 emit_label (jump_label);
15720 LABEL_NUSES (jump_label) = 1;
15721
15722 emit_move_insn (op0, e2);
15723 }
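
/* The sequence above uses
asinh (|x|) = log (|x| + sqrt (x*x + 1))
= log1p (|x| + x*x / (sqrt (x*x + 1) + 1)),
which follows from sqrt (t) - 1 = (t - 1) / (sqrt (t) + 1) with t = x*x + 1.
Going through log1p keeps precision for small |x|, and the fxam sign bit
restores the sign since asinh is odd.  */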
15724
15725 /* Output code to perform an acosh XFmode calculation. */
15726
15727 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15728 {
15729 rtx e1 = gen_reg_rtx (XFmode);
15730 rtx e2 = gen_reg_rtx (XFmode);
15731 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15732
15733 /* e2 = sqrt (op1 + 1.0) */
15734 emit_insn (gen_addxf3 (e2, op1, cst1));
15735 emit_insn (gen_sqrtxf2 (e2, e2));
15736
15737 /* e1 = sqrt (op1 - 1.0) */
15738 emit_insn (gen_subxf3 (e1, op1, cst1));
15739 emit_insn (gen_sqrtxf2 (e1, e1));
15740
15741 /* e1 = e1 * e2 */
15742 emit_insn (gen_mulxf3 (e1, e1, e2));
15743
15744 /* e1 = e1 + op1 */
15745 emit_insn (gen_addxf3 (e1, e1, op1));
15746
15747 /* op0 = log (e1) */
15748 emit_insn (gen_logxf2 (op0, e1));
15749 }
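
/* acosh (x) = log (x + sqrt (x*x - 1)); factoring
sqrt (x*x - 1) = sqrt (x - 1) * sqrt (x + 1) as above avoids forming x*x,
which would overflow for large x and lose accuracy to cancellation when x
is close to 1.  */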
15750
15751 /* Output code to perform an atanh XFmode calculation. */
15752
15753 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15754 {
15755 rtx e1 = gen_reg_rtx (XFmode);
15756 rtx e2 = gen_reg_rtx (XFmode);
15757 rtx scratch = gen_reg_rtx (HImode);
15758 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15759 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15760 rtx cst1, tmp;
15761 rtx_code_label *jump_label = gen_label_rtx ();
15762 rtx_insn *insn;
15763
15764 /* scratch = fxam (op1) */
15765 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15766
15767 /* e2 = |op1| */
15768 emit_insn (gen_absxf2 (e2, op1));
15769
15770 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15771 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15772 emit_insn (gen_addxf3 (e1, e2, cst1));
15773 emit_insn (gen_addxf3 (e2, e2, e2));
15774 emit_insn (gen_negxf2 (e2, e2));
15775 emit_insn (gen_divxf3 (e1, e2, e1));
15776
15777 /* e2 = log1p (e1) */
15778 ix86_emit_i387_log1p (e2, e1);
15779
15780 /* flags = signbit (op1) */
15781 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15782
15783 /* if (!flags) then e2 = -e2 */
15784 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15785 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15786 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15787 pc_rtx);
15788 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15789 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15790 JUMP_LABEL (insn) = jump_label;
15791
15792 emit_insn (gen_negxf2 (e2, e2));
15793
15794 emit_label (jump_label);
15795 LABEL_NUSES (jump_label) = 1;
15796
15797 /* op0 = 0.5 * e2 */
15798 half = force_reg (XFmode, half);
15799 emit_insn (gen_mulxf3 (op0, e2, half));
15800 }
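
/* Derivation of the sequence above:
log1p (-(2*|x|) / (|x| + 1)) = log ((1 - |x|) / (1 + |x|)) = -2 * atanh (|x|),
so e2 holds -2 * atanh (|x|).  The conditional negation and the final
multiply by 0.5 then give atanh (|x|) for nonnegative x and
-atanh (|x|) = atanh (x) for negative x.  */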
15801
15802 /* Output code to perform a log1p XFmode calculation. */
15803
15804 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15805 {
15806 rtx_code_label *label1 = gen_label_rtx ();
15807 rtx_code_label *label2 = gen_label_rtx ();
15808
15809 rtx tmp = gen_reg_rtx (XFmode);
15810 rtx res = gen_reg_rtx (XFmode);
15811 rtx cst, cstln2, cst1;
15812 rtx_insn *insn;
15813
15814 cst = const_double_from_real_value
15815 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15816 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15817
15818 emit_insn (gen_absxf2 (tmp, op1));
15819
15820 cst = force_reg (XFmode, cst);
15821 ix86_expand_branch (GE, tmp, cst, label1);
15822 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15823 insn = get_last_insn ();
15824 JUMP_LABEL (insn) = label1;
15825
15826 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15827 emit_jump (label2);
15828
15829 emit_label (label1);
15830 LABEL_NUSES (label1) = 1;
15831
15832 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15833 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15834 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15835
15836 emit_label (label2);
15837 LABEL_NUSES (label2) = 1;
15838
15839 emit_move_insn (op0, res);
15840 }
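
/* The cutoff constant is approximately 1 - sqrt (2) / 2; fyl2xp1 is only
specified for arguments smaller than that in magnitude, so larger
arguments take the fyl2x path on op1 + 1.0 instead.  With ln (2) loaded
via fldln2, both instructions turn the base-2 logarithm into a natural
one: ln (2) * log2 (1 + x) = log1p (x).  */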
15841
15842 /* Emit code for round calculation. */
15843 void ix86_emit_i387_round (rtx op0, rtx op1)
15844 {
15845 machine_mode inmode = GET_MODE (op1);
15846 machine_mode outmode = GET_MODE (op0);
15847 rtx e1 = gen_reg_rtx (XFmode);
15848 rtx e2 = gen_reg_rtx (XFmode);
15849 rtx scratch = gen_reg_rtx (HImode);
15850 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15851 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15852 rtx res = gen_reg_rtx (outmode);
15853 rtx_code_label *jump_label = gen_label_rtx ();
15854 rtx (*floor_insn) (rtx, rtx);
15855 rtx (*neg_insn) (rtx, rtx);
15856 rtx_insn *insn;
15857 rtx tmp;
15858
15859 switch (inmode)
15860 {
15861 case E_SFmode:
15862 case E_DFmode:
15863 tmp = gen_reg_rtx (XFmode);
15864
15865 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15866 op1 = tmp;
15867 break;
15868 case E_XFmode:
15869 break;
15870 default:
15871 gcc_unreachable ();
15872 }
15873
15874 switch (outmode)
15875 {
15876 case E_SFmode:
15877 floor_insn = gen_frndintxf2_floor;
15878 neg_insn = gen_negsf2;
15879 break;
15880 case E_DFmode:
15881 floor_insn = gen_frndintxf2_floor;
15882 neg_insn = gen_negdf2;
15883 break;
15884 case E_XFmode:
15885 floor_insn = gen_frndintxf2_floor;
15886 neg_insn = gen_negxf2;
15887 break;
15888 case E_HImode:
15889 floor_insn = gen_lfloorxfhi2;
15890 neg_insn = gen_neghi2;
15891 break;
15892 case E_SImode:
15893 floor_insn = gen_lfloorxfsi2;
15894 neg_insn = gen_negsi2;
15895 break;
15896 case E_DImode:
15897 floor_insn = gen_lfloorxfdi2;
15898 neg_insn = gen_negdi2;
15899 break;
15900 default:
15901 gcc_unreachable ();
15902 }
15903
15904 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15905
15906 /* scratch = fxam(op1) */
15907 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15908
15909 /* e1 = fabs(op1) */
15910 emit_insn (gen_absxf2 (e1, op1));
15911
15912 /* e2 = e1 + 0.5 */
15913 half = force_reg (XFmode, half);
15914 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15915
15916 /* res = floor(e2) */
15917 switch (outmode)
15918 {
15919 case E_SFmode:
15920 case E_DFmode:
15921 {
15922 tmp = gen_reg_rtx (XFmode);
15923
15924 emit_insn (floor_insn (tmp, e2));
15925 emit_insn (gen_rtx_SET (res,
15926 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15927 UNSPEC_TRUNC_NOOP)));
15928 }
15929 break;
15930 default:
15931 emit_insn (floor_insn (res, e2));
15932 }
15933
15934 /* flags = signbit(a) */
15935 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15936
15937 /* if (flags) then res = -res */
15938 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15939 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15940 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15941 pc_rtx);
15942 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15943 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15944 JUMP_LABEL (insn) = jump_label;
15945
15946 emit_insn (neg_insn (res, res));
15947
15948 emit_label (jump_label);
15949 LABEL_NUSES (jump_label) = 1;
15950
15951 emit_move_insn (op0, res);
15952 }
15953
15954 /* Output code to perform a Newton-Raphson approximation of a single precision
15955 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15956
15957 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15958 {
15959 rtx x0, x1, e0, e1;
15960
15961 x0 = gen_reg_rtx (mode);
15962 e0 = gen_reg_rtx (mode);
15963 e1 = gen_reg_rtx (mode);
15964 x1 = gen_reg_rtx (mode);
15965
15966 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15967
15968 b = force_reg (mode, b);
15969
15970 /* x0 = rcp(b) estimate */
15971 if (mode == V16SFmode || mode == V8DFmode)
15972 {
15973 if (TARGET_AVX512ER)
15974 {
15975 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15976 UNSPEC_RCP28)));
15977 /* res = a * x0 */
15978 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15979 return;
15980 }
15981 else
15982 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15983 UNSPEC_RCP14)));
15984 }
15985 else
15986 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15987 UNSPEC_RCP)));
15988
15989 /* e0 = x0 * b */
15990 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15991
15992 /* e0 = x0 * e0 */
15993 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15994
15995 /* e1 = x0 + x0 */
15996 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15997
15998 /* x1 = e1 - e0 */
15999 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
16000
16001 /* res = a * x1 */
16002 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
16003 }
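
/* The tail of the sequence is one Newton-Raphson step for f (x) = 1/x - b:
x1 = x0 * (2 - b * x0) = (x0 + x0) - (x0 * b) * x0,
which is exactly the e1 - e0 computed above.  Each step roughly squares
the relative error, so the roughly 12-bit rcp (or 14-bit rcp14) estimate
becomes accurate enough for a single precision result, while the 28-bit
rcp28 estimate on AVX512ER needs no refinement, which is why that path
returns early.  */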
16004
16005 /* Output code to perform a Newton-Raphson approximation of a
16006 single precision floating point [reciprocal] square root. */
16007
16008 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
16009 {
16010 rtx x0, e0, e1, e2, e3, mthree, mhalf;
16011 REAL_VALUE_TYPE r;
16012 int unspec;
16013
16014 x0 = gen_reg_rtx (mode);
16015 e0 = gen_reg_rtx (mode);
16016 e1 = gen_reg_rtx (mode);
16017 e2 = gen_reg_rtx (mode);
16018 e3 = gen_reg_rtx (mode);
16019
16020 if (TARGET_AVX512ER && mode == V16SFmode)
16021 {
16022 if (recip)
16023 /* res = rsqrt28(a) estimate */
16024 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16025 UNSPEC_RSQRT28)));
16026 else
16027 {
16028 /* x0 = rsqrt28(a) estimate */
16029 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16030 UNSPEC_RSQRT28)));
16031 /* res = rcp28(x0) estimate */
16032 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
16033 UNSPEC_RCP28)));
16034 }
16035 return;
16036 }
16037
16038 real_from_integer (&r, VOIDmode, -3, SIGNED);
16039 mthree = const_double_from_real_value (r, SFmode);
16040
16041 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
16042 mhalf = const_double_from_real_value (r, SFmode);
16043 unspec = UNSPEC_RSQRT;
16044
16045 if (VECTOR_MODE_P (mode))
16046 {
16047 mthree = ix86_build_const_vector (mode, true, mthree);
16048 mhalf = ix86_build_const_vector (mode, true, mhalf);
16049 /* There is no 512-bit rsqrt. There is however rsqrt14. */
16050 if (GET_MODE_SIZE (mode) == 64)
16051 unspec = UNSPEC_RSQRT14;
16052 }
16053
16054 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
16055 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
16056
16057 a = force_reg (mode, a);
16058
16059 /* x0 = rsqrt(a) estimate */
16060 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
16061 unspec)));
16062
16063 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
16064 if (!recip)
16065 {
16066 rtx zero = force_reg (mode, CONST0_RTX(mode));
16067 rtx mask;
16068
16069 /* Handle masked compare. */
16070 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
16071 {
16072 mask = gen_reg_rtx (HImode);
16073 /* Imm value 0x4 corresponds to not-equal comparison. */
16074 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
16075 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
16076 }
16077 else
16078 {
16079 mask = gen_reg_rtx (mode);
16080 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
16081 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
16082 }
16083 }
16084
16085 mthree = force_reg (mode, mthree);
16086
16087 /* e0 = x0 * a */
16088 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
16089
16090 unsigned vector_size = GET_MODE_SIZE (mode);
16091 if (TARGET_FMA
16092 || (TARGET_AVX512F && vector_size == 64)
16093 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
16094 emit_insn (gen_rtx_SET (e2,
16095 gen_rtx_FMA (mode, e0, x0, mthree)));
16096 else
16097 {
16098 /* e1 = e0 * x0 */
16099 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
16100
16101 /* e2 = e1 - 3. */
16102 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
16103 }
16104
16105 mhalf = force_reg (mode, mhalf);
16106 if (recip)
16107 /* e3 = -.5 * x0 */
16108 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
16109 else
16110 /* e3 = -.5 * e0 */
16111 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
16112 /* ret = e2 * e3 */
16113 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
16114 }
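
/* The refinement above is one Newton-Raphson step for f (y) = 1/(y*y) - a:
y1 = 0.5 * y0 * (3 - a * y0 * y0) = -0.5 * y0 * (a * y0 * y0 - 3),
with e2 = a * x0 * x0 - 3 and e3 supplying the -0.5 * x0 factor (or
-0.5 * a * x0 for sqrt, since sqrt (a) = a * rsqrt (a)).  The masking of
x0 for a == 0.0 is needed because rsqrt (0.0) is +inf and 0.0 * inf would
otherwise turn sqrt (0.0) into a NaN; zeroing the estimate makes the final
product come out as 0.0.  */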
16115
16116 /* Expand fabs (OP0) and return a new rtx that holds the result. The
16117 mask for masking out the sign-bit is stored in *SMASK, if that is
16118 non-null. */
16119
16120 static rtx
16121 ix86_expand_sse_fabs (rtx op0, rtx *smask)
16122 {
16123 machine_mode vmode, mode = GET_MODE (op0);
16124 rtx xa, mask;
16125
16126 xa = gen_reg_rtx (mode);
16127 if (mode == SFmode)
16128 vmode = V4SFmode;
16129 else if (mode == DFmode)
16130 vmode = V2DFmode;
16131 else
16132 vmode = mode;
16133 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
16134 if (!VECTOR_MODE_P (mode))
16135 {
16136 /* We need to generate a scalar mode mask in this case. */
16137 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16138 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16139 mask = gen_reg_rtx (mode);
16140 emit_insn (gen_rtx_SET (mask, tmp));
16141 }
16142 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
16143
16144 if (smask)
16145 *smask = mask;
16146
16147 return xa;
16148 }
16149
16150 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
16151 swapping the operands if SWAP_OPERANDS is true. The expanded
16152 code is a forward jump to a newly created label in case the
16153 comparison is true. The generated label rtx is returned. */
16154 static rtx_code_label *
16155 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
16156 bool swap_operands)
16157 {
16158 bool unordered_compare = ix86_unordered_fp_compare (code);
16159 rtx_code_label *label;
16160 rtx tmp, reg;
16161
16162 if (swap_operands)
16163 std::swap (op0, op1);
16164
16165 label = gen_label_rtx ();
16166 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
16167 if (unordered_compare)
16168 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
16169 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
16170 emit_insn (gen_rtx_SET (reg, tmp));
16171 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
16172 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
16173 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
16174 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
16175 JUMP_LABEL (tmp) = label;
16176
16177 return label;
16178 }
16179
16180 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
16181 using comparison code CODE. Operands are swapped for the comparison if
16182 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
16183 static rtx
16184 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
16185 bool swap_operands)
16186 {
16187 rtx (*insn)(rtx, rtx, rtx, rtx);
16188 machine_mode mode = GET_MODE (op0);
16189 rtx mask = gen_reg_rtx (mode);
16190
16191 if (swap_operands)
16192 std::swap (op0, op1);
16193
16194 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
16195
16196 emit_insn (insn (mask, op0, op1,
16197 gen_rtx_fmt_ee (code, mode, op0, op1)));
16198 return mask;
16199 }
16200
16201 /* Expand copysign from SIGN to the positive value ABS_VALUE
16202 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
16203 the sign-bit. */
16204
16205 static void
16206 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
16207 {
16208 machine_mode mode = GET_MODE (sign);
16209 rtx sgn = gen_reg_rtx (mode);
16210 if (mask == NULL_RTX)
16211 {
16212 machine_mode vmode;
16213
16214 if (mode == SFmode)
16215 vmode = V4SFmode;
16216 else if (mode == DFmode)
16217 vmode = V2DFmode;
16218 else
16219 vmode = mode;
16220
16221 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
16222 if (!VECTOR_MODE_P (mode))
16223 {
16224 /* We need to generate a scalar mode mask in this case. */
16225 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
16226 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
16227 mask = gen_reg_rtx (mode);
16228 emit_insn (gen_rtx_SET (mask, tmp));
16229 }
16230 }
16231 else
16232 mask = gen_rtx_NOT (mode, mask);
16233 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
16234 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
16235 }
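
/* This is the usual bit-level copysign: SGN isolates the sign bit of SIGN
(the 0x8000000000000000 bit for DFmode), either with a freshly built
sign-bit mask or with the complement of the fabs MASK passed in, and ORing
it into the already nonnegative ABS_VALUE transplants the sign without a
branch.  */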
16236
16237 /* Expand SSE sequence for computing lround from OP1 storing
16238 into OP0. */
16239
16240 void
16241 ix86_expand_lround (rtx op0, rtx op1)
16242 {
16243 /* C code for the stuff we're doing below:
16244 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
16245 return (long)tmp;
16246 */
16247 machine_mode mode = GET_MODE (op1);
16248 const struct real_format *fmt;
16249 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16250 rtx adj;
16251
16252 /* load nextafter (0.5, 0.0) */
16253 fmt = REAL_MODE_FORMAT (mode);
16254 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16255 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16256
16257 /* adj = copysign (0.5, op1) */
16258 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
16259 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
16260
16261 /* adj = op1 + adj */
16262 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
16263
16264 /* op0 = (imode)adj */
16265 expand_fix (op0, adj, 0);
16266 }
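
/* Using nextafter (0.5, 0.0) instead of 0.5 matters at the halfway
boundary.  For DFmode, take x = 0.5 - 2**-54, the largest double below
0.5: x + 0.5 is not representable and rounds up to 1.0, so truncation
would return 1 even though lround (x) is 0.  With the predecessor of 0.5
the sum is exactly 1 - 2**-53 and truncates to 0, while for x >= 0.5 the
sum still rounds up to the next integer, so halfway cases keep rounding
away from zero.  */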
16267
16268 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
16269 storing into OPERAND0. */
16270
16271 void
16272 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
16273 {
16274 /* C code for the stuff we're doing below (for do_floor):
16275 xi = (long)op1;
16276 xi -= (double)xi > op1 ? 1 : 0;
16277 return xi;
16278 */
16279 machine_mode fmode = GET_MODE (op1);
16280 machine_mode imode = GET_MODE (op0);
16281 rtx ireg, freg, tmp;
16282 rtx_code_label *label;
16283
16284 /* reg = (long)op1 */
16285 ireg = gen_reg_rtx (imode);
16286 expand_fix (ireg, op1, 0);
16287
16288 /* freg = (double)reg */
16289 freg = gen_reg_rtx (fmode);
16290 expand_float (freg, ireg, 0);
16291
16292 /* ireg = (freg > op1) ? ireg - 1 : ireg */
16293 label = ix86_expand_sse_compare_and_jump (UNLE,
16294 freg, op1, !do_floor);
16295 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
16296 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
16297 emit_move_insn (ireg, tmp);
16298
16299 emit_label (label);
16300 LABEL_NUSES (label) = 1;
16301
16302 emit_move_insn (op0, ireg);
16303 }
16304
16305 /* Generate and return a rtx of mode MODE for 2**n where n is the number
16306 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
16307
16308 static rtx
16309 ix86_gen_TWO52 (machine_mode mode)
16310 {
16311 const struct real_format *fmt;
16312 REAL_VALUE_TYPE TWO52r;
16313 rtx TWO52;
16314
16315 fmt = REAL_MODE_FORMAT (mode);
16316 real_2expN (&TWO52r, fmt->p - 1, mode);
16317 TWO52 = const_double_from_real_value (TWO52r, mode);
16318 TWO52 = force_reg (mode, TWO52);
16319
16320 return TWO52;
16321 }
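
/* For DFmode fmt->p is 53, so TWO52 is 2**52, the magnitude at which the
spacing between adjacent doubles becomes exactly 1.  Adding and then
subtracting it therefore rounds any nonnegative x < 2**52 to an integer in
the current rounding mode; e.g. 3.7 + 2**52 rounds to 4503599627370500.0
and subtracting 2**52 leaves 4.0.  This is the rounding primitive used by
the expanders below, and also why they bail out once fabs (x) >= 2**52,
since such values are already integers.  */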
16322
16323 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
16324
16325 void
16326 ix86_expand_rint (rtx operand0, rtx operand1)
16327 {
16328 /* C code for the stuff we're doing below:
16329 xa = fabs (operand1);
16330 if (!isless (xa, 2**52))
16331 return operand1;
16332 two52 = 2**52;
16333 if (flag_rounding_math)
16334 {
16335 two52 = copysign (two52, operand1);
16336 xa = operand1;
16337 }
16338 xa = xa + two52 - two52;
16339 return copysign (xa, operand1);
16340 */
16341 machine_mode mode = GET_MODE (operand0);
16342 rtx res, xa, TWO52, mask;
16343 rtx_code_label *label;
16344
16345 TWO52 = ix86_gen_TWO52 (mode);
16346
16347 /* Temporary for holding the result, initialized to the input
16348 operand to ease control flow. */
16349 res = copy_to_reg (operand1);
16350
16351 /* xa = abs (operand1) */
16352 xa = ix86_expand_sse_fabs (res, &mask);
16353
16354 /* if (!isless (xa, TWO52)) goto label; */
16355 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16356
16357 if (flag_rounding_math)
16358 {
16359 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
16360 xa = res;
16361 }
16362
16363 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16364 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16365
16366 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16367 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16368 xa = ix86_expand_sse_fabs (xa, NULL);
16369
16370 ix86_sse_copysign_to_positive (res, xa, res, mask);
16371
16372 emit_label (label);
16373 LABEL_NUSES (label) = 1;
16374
16375 emit_move_insn (operand0, res);
16376 }
16377
16378 /* Expand SSE2 sequence for computing floor or ceil
16379 from OPERAND1 storing into OPERAND0. */
16380 void
16381 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
16382 {
16383 /* C code for the stuff we expand below.
16384 double xa = fabs (x), x2;
16385 if (!isless (xa, TWO52))
16386 return x;
16387 x2 = (double)(long)x;
16388
16389 Compensate. Floor:
16390 if (x2 > x)
16391 x2 -= 1;
16392 Compensate. Ceil:
16393 if (x2 < x)
16394 x2 += 1;
16395
16396 if (HONOR_SIGNED_ZEROS (mode))
16397 return copysign (x2, x);
16398 return x2;
16399 */
16400 machine_mode mode = GET_MODE (operand0);
16401 rtx xa, xi, TWO52, tmp, one, res, mask;
16402 rtx_code_label *label;
16403
16404 TWO52 = ix86_gen_TWO52 (mode);
16405
16406 /* Temporary for holding the result, initialized to the input
16407 operand to ease control flow. */
16408 res = copy_to_reg (operand1);
16409
16410 /* xa = abs (operand1) */
16411 xa = ix86_expand_sse_fabs (res, &mask);
16412
16413 /* if (!isless (xa, TWO52)) goto label; */
16414 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16415
16416 /* xa = (double)(long)x */
16417 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16418 expand_fix (xi, res, 0);
16419 expand_float (xa, xi, 0);
16420
16421 /* generate 1.0 */
16422 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16423
16424 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16425 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16426 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16427 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16428 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16429 if (HONOR_SIGNED_ZEROS (mode))
16430 {
16431 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16432 if (do_floor && flag_rounding_math)
16433 tmp = ix86_expand_sse_fabs (tmp, NULL);
16434
16435 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16436 }
16437 emit_move_insn (res, tmp);
16438
16439 emit_label (label);
16440 LABEL_NUSES (label) = 1;
16441
16442 emit_move_insn (operand0, res);
16443 }
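
/* Worked example for the floor case: for x = -2.5, xi = (long) x is -2 and
xa becomes -2.0; since -2.0 > -2.5 the compare mask is all ones, the AND
with 1.0 yields 1.0, and the subtraction gives -3.0 = floor (-2.5).  For
ceil the comparison operands are swapped and 1.0 is added instead, so
e.g. 2.5 becomes 3.0.  */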
16444
16445 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
16446 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16447 that is only available on 64bit targets. */
16448 void
16449 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
16450 {
16451 /* C code for the stuff we expand below.
16452 double xa = fabs (x), x2;
16453 if (!isless (xa, TWO52))
16454 return x;
16455 xa = xa + TWO52 - TWO52;
16456 x2 = copysign (xa, x);
16457
16458 Compensate. Floor:
16459 if (x2 > x)
16460 x2 -= 1;
16461 Compensate. Ceil:
16462 if (x2 < x)
16463 x2 += 1;
16464
16465 if (HONOR_SIGNED_ZEROS (mode))
16466 x2 = copysign (x2, x);
16467 return x2;
16468 */
16469 machine_mode mode = GET_MODE (operand0);
16470 rtx xa, TWO52, tmp, one, res, mask;
16471 rtx_code_label *label;
16472
16473 TWO52 = ix86_gen_TWO52 (mode);
16474
16475 /* Temporary for holding the result, initialized to the input
16476 operand to ease control flow. */
16477 res = copy_to_reg (operand1);
16478
16479 /* xa = abs (operand1) */
16480 xa = ix86_expand_sse_fabs (res, &mask);
16481
16482 /* if (!isless (xa, TWO52)) goto label; */
16483 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16484
16485 /* xa = xa + TWO52 - TWO52; */
16486 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16487 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
16488
16489 /* xa = copysign (xa, operand1) */
16490 ix86_sse_copysign_to_positive (xa, xa, res, mask);
16491
16492 /* generate 1.0 */
16493 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16494
16495 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16496 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
16497 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16498 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
16499 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16500 if (HONOR_SIGNED_ZEROS (mode))
16501 {
16502 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16503 if (do_floor && flag_rounding_math)
16504 tmp = ix86_expand_sse_fabs (tmp, NULL);
16505
16506 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
16507 }
16508 emit_move_insn (res, tmp);
16509
16510 emit_label (label);
16511 LABEL_NUSES (label) = 1;
16512
16513 emit_move_insn (operand0, res);
16514 }
16515
16516 /* Expand SSE sequence for computing trunc
16517 from OPERAND1 storing into OPERAND0. */
16518 void
16519 ix86_expand_trunc (rtx operand0, rtx operand1)
16520 {
16521 /* C code for SSE variant we expand below.
16522 double xa = fabs (x), x2;
16523 if (!isless (xa, TWO52))
16524 return x;
16525 x2 = (double)(long)x;
16526 if (HONOR_SIGNED_ZEROS (mode))
16527 return copysign (x2, x);
16528 return x2;
16529 */
16530 machine_mode mode = GET_MODE (operand0);
16531 rtx xa, xi, TWO52, res, mask;
16532 rtx_code_label *label;
16533
16534 TWO52 = ix86_gen_TWO52 (mode);
16535
16536 /* Temporary for holding the result, initialized to the input
16537 operand to ease control flow. */
16538 res = copy_to_reg (operand1);
16539
16540 /* xa = abs (operand1) */
16541 xa = ix86_expand_sse_fabs (res, &mask);
16542
16543 /* if (!isless (xa, TWO52)) goto label; */
16544 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16545
16546 /* xa = (double)(long)x */
16547 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16548 expand_fix (xi, res, 0);
16549 expand_float (xa, xi, 0);
16550
16551 if (HONOR_SIGNED_ZEROS (mode))
16552 ix86_sse_copysign_to_positive (xa, xa, res, mask);
16553
16554 emit_move_insn (res, xa);
16555
16556 emit_label (label);
16557 LABEL_NUSES (label) = 1;
16558
16559 emit_move_insn (operand0, res);
16560 }
16561
16562 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16563 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16564 that is only available on 64bit targets. */
16565 void
16566 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16567 {
16568 machine_mode mode = GET_MODE (operand0);
16569 rtx xa, xa2, TWO52, tmp, one, res, mask;
16570 rtx_code_label *label;
16571
16572 /* C code for SSE variant we expand below.
16573 double xa = fabs (x), x2;
16574 if (!isless (xa, TWO52))
16575 return x;
16576 xa2 = xa + TWO52 - TWO52;
16577 Compensate:
16578 if (xa2 > xa)
16579 xa2 -= 1.0;
16580 x2 = copysign (xa2, x);
16581 return x2;
16582 */
16583
16584 TWO52 = ix86_gen_TWO52 (mode);
16585
16586 /* Temporary for holding the result, initialized to the input
16587 operand to ease control flow. */
16588 res = copy_to_reg (operand1);
16589
16590 /* xa = abs (operand1) */
16591 xa = ix86_expand_sse_fabs (res, &mask);
16592
16593 /* if (!isless (xa, TWO52)) goto label; */
16594 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16595
16596 /* xa2 = xa + TWO52 - TWO52; */
16597 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16598 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16599
16600 /* generate 1.0 */
16601 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16602
16603 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
16604 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
16605 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
16606 tmp = expand_simple_binop (mode, MINUS,
16607 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16608 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16609 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
16610 tmp = ix86_expand_sse_fabs (tmp, NULL);
16611
16612 /* res = copysign (xa2, operand1) */
16613 ix86_sse_copysign_to_positive (res, tmp, res, mask);
16614
16615 emit_label (label);
16616 LABEL_NUSES (label) = 1;
16617
16618 emit_move_insn (operand0, res);
16619 }
16620
16621 /* Expand SSE sequence for computing round
16622 from OPERAND1 storing into OPERAND0. */
16623 void
16624 ix86_expand_round (rtx operand0, rtx operand1)
16625 {
16626 /* C code for the stuff we're doing below:
16627 double xa = fabs (x);
16628 if (!isless (xa, TWO52))
16629 return x;
16630 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16631 return copysign (xa, x);
16632 */
16633 machine_mode mode = GET_MODE (operand0);
16634 rtx res, TWO52, xa, xi, half, mask;
16635 rtx_code_label *label;
16636 const struct real_format *fmt;
16637 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16638
16639 /* Temporary for holding the result, initialized to the input
16640 operand to ease control flow. */
16641 res = copy_to_reg (operand1);
16642
16643 TWO52 = ix86_gen_TWO52 (mode);
16644 xa = ix86_expand_sse_fabs (res, &mask);
16645 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16646
16647 /* load nextafter (0.5, 0.0) */
16648 fmt = REAL_MODE_FORMAT (mode);
16649 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16650 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16651
16652 /* xa = xa + 0.5 */
16653 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16654 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16655
16656 /* xa = (double)(int64_t)xa */
16657 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
16658 expand_fix (xi, xa, 0);
16659 expand_float (xa, xi, 0);
16660
16661 /* res = copysign (xa, operand1) */
16662 ix86_sse_copysign_to_positive (res, xa, res, mask);
16663
16664 emit_label (label);
16665 LABEL_NUSES (label) = 1;
16666
16667 emit_move_insn (operand0, res);
16668 }
16669
16670 /* Expand SSE sequence for computing round from OPERAND1 storing
16671 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16672 that is only available on 64bit targets. */
16673 void
16674 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16675 {
16676 /* C code for the stuff we expand below.
16677 double xa = fabs (x), xa2, x2;
16678 if (!isless (xa, TWO52))
16679 return x;
16680 Using the absolute value and copying back sign makes
16681 -0.0 -> -0.0 correct.
16682 xa2 = xa + TWO52 - TWO52;
16683 Compensate.
16684 dxa = xa2 - xa;
16685 if (dxa <= -0.5)
16686 xa2 += 1;
16687 else if (dxa > 0.5)
16688 xa2 -= 1;
16689 x2 = copysign (xa2, x);
16690 return x2;
16691 */
16692 machine_mode mode = GET_MODE (operand0);
16693 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16694 rtx_code_label *label;
16695
16696 TWO52 = ix86_gen_TWO52 (mode);
16697
16698 /* Temporary for holding the result, initialized to the input
16699 operand to ease control flow. */
16700 res = copy_to_reg (operand1);
16701
16702 /* xa = abs (operand1) */
16703 xa = ix86_expand_sse_fabs (res, &mask);
16704
16705 /* if (!isless (xa, TWO52)) goto label; */
16706 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16707
16708 /* xa2 = xa + TWO52 - TWO52; */
16709 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16710 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16711
16712 /* dxa = xa2 - xa; */
16713 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16714
16715 /* generate 0.5, 1.0 and -0.5 */
16716 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16717 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16718 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16719 0, OPTAB_DIRECT);
16720
16721 /* Compensate. */
16722 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16723 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16724 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16725 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16726 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16727 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16728 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16729 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16730
16731 /* res = copysign (xa2, operand1) */
16732 ix86_sse_copysign_to_positive (res, xa2, res, mask);
16733
16734 emit_label (label);
16735 LABEL_NUSES (label) = 1;
16736
16737 emit_move_insn (operand0, res);
16738 }
16739
16740 /* Expand SSE sequence for computing round
16741 from OP1 storing into OP0 using sse4 round insn. */
16742 void
16743 ix86_expand_round_sse4 (rtx op0, rtx op1)
16744 {
16745 machine_mode mode = GET_MODE (op0);
16746 rtx e1, e2, res, half;
16747 const struct real_format *fmt;
16748 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16749 rtx (*gen_copysign) (rtx, rtx, rtx);
16750 rtx (*gen_round) (rtx, rtx, rtx);
16751
16752 switch (mode)
16753 {
16754 case E_SFmode:
16755 gen_copysign = gen_copysignsf3;
16756 gen_round = gen_sse4_1_roundsf2;
16757 break;
16758 case E_DFmode:
16759 gen_copysign = gen_copysigndf3;
16760 gen_round = gen_sse4_1_rounddf2;
16761 break;
16762 default:
16763 gcc_unreachable ();
16764 }
16765
16766 /* round (a) = trunc (a + copysign (0.5, a)) */
16767
16768 /* load nextafter (0.5, 0.0) */
16769 fmt = REAL_MODE_FORMAT (mode);
16770 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16771 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16772 half = const_double_from_real_value (pred_half, mode);
16773
16774 /* e1 = copysign (0.5, op1) */
16775 e1 = gen_reg_rtx (mode);
16776 emit_insn (gen_copysign (e1, half, op1));
16777
16778 /* e2 = op1 + e1 */
16779 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16780
16781 /* res = trunc (e2) */
16782 res = gen_reg_rtx (mode);
16783 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16784
16785 emit_move_insn (op0, res);
16786 }
16787
16788 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16789 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16790 insn every time. */
16791
16792 static GTY(()) rtx_insn *vselect_insn;
16793
16794 /* Initialize vselect_insn. */
16795
16796 static void
16797 init_vselect_insn (void)
16798 {
16799 unsigned i;
16800 rtx x;
16801
16802 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16803 for (i = 0; i < MAX_VECT_LEN; ++i)
16804 XVECEXP (x, 0, i) = const0_rtx;
16805 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16806 const0_rtx), x);
16807 x = gen_rtx_SET (const0_rtx, x);
16808 start_sequence ();
16809 vselect_insn = emit_insn (x);
16810 end_sequence ();
16811 }
16812
16813 /* Construct (set target (vec_select op0 (parallel perm))) and
16814 return true if that's a valid instruction in the active ISA. */
16815
16816 static bool
16817 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16818 unsigned nelt, bool testing_p)
16819 {
16820 unsigned int i;
16821 rtx x, save_vconcat;
16822 int icode;
16823
16824 if (vselect_insn == NULL_RTX)
16825 init_vselect_insn ();
16826
16827 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16828 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16829 for (i = 0; i < nelt; ++i)
16830 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16831 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16832 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16833 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16834 SET_DEST (PATTERN (vselect_insn)) = target;
16835 icode = recog_memoized (vselect_insn);
16836
16837 if (icode >= 0 && !testing_p)
16838 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16839
16840 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16841 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16842 INSN_CODE (vselect_insn) = -1;
16843
16844 return icode >= 0;
16845 }
16846
16847 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16848
16849 static bool
16850 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16851 const unsigned char *perm, unsigned nelt,
16852 bool testing_p)
16853 {
16854 machine_mode v2mode;
16855 rtx x;
16856 bool ok;
16857
16858 if (vselect_insn == NULL_RTX)
16859 init_vselect_insn ();
16860
16861 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16862 return false;
16863 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16864 PUT_MODE (x, v2mode);
16865 XEXP (x, 0) = op0;
16866 XEXP (x, 1) = op1;
16867 ok = expand_vselect (target, x, perm, nelt, testing_p);
16868 XEXP (x, 0) = const0_rtx;
16869 XEXP (x, 1) = const0_rtx;
16870 return ok;
16871 }
16872
16873 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16874 using movss or movsd. */
16875 static bool
16876 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16877 {
16878 machine_mode vmode = d->vmode;
16879 unsigned i, nelt = d->nelt;
16880 rtx x;
16881
16882 if (d->one_operand_p)
16883 return false;
16884
16885 if (!(TARGET_SSE && vmode == V4SFmode)
16886 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
16887 && !(TARGET_SSE2 && vmode == V2DFmode))
16888 return false;
16889
16890 /* Only the first element is changed. */
16891 if (d->perm[0] != nelt && d->perm[0] != 0)
16892 return false;
16893 for (i = 1; i < nelt; ++i)
16894 if (d->perm[i] != i + nelt - d->perm[0])
16895 return false;
16896
16897 if (d->testing_p)
16898 return true;
16899
16900 if (d->perm[0] == nelt)
16901 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16902 else
16903 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16904
16905 emit_insn (gen_rtx_SET (d->target, x));
16906
16907 return true;
16908 }
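
/* Example: for V4SFmode and d->perm == { 4, 1, 2, 3 }, element 0 is taken
from op1 and the rest from op0, which is exactly the movss pattern
(vec_merge op1 op0 1); { 0, 5, 6, 7 } is the mirrored case.  V2DFmode
maps to movsd in the same way.  */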
16909
16910 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16911 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16912
16913 static bool
16914 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16915 {
16916 machine_mode mmode, vmode = d->vmode;
16917 unsigned i, nelt = d->nelt;
16918 unsigned HOST_WIDE_INT mask;
16919 rtx target, op0, op1, maskop, x;
16920 rtx rperm[32], vperm;
16921
16922 if (d->one_operand_p)
16923 return false;
16924 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16925 && (TARGET_AVX512BW
16926 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16927 ;
16928 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16929 ;
16930 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16931 ;
16932 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16933 ;
16934 else
16935 return false;
16936
16937 /* This is a blend, not a permute. Elements must stay in their
16938 respective lanes. */
16939 for (i = 0; i < nelt; ++i)
16940 {
16941 unsigned e = d->perm[i];
16942 if (!(e == i || e == i + nelt))
16943 return false;
16944 }
16945
16946 if (d->testing_p)
16947 return true;
16948
16949 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16950 decision should be extracted elsewhere, so that we only try that
16951 sequence once all budget==3 options have been tried. */
16952 target = d->target;
16953 op0 = d->op0;
16954 op1 = d->op1;
16955 mask = 0;
16956
16957 switch (vmode)
16958 {
16959 case E_V8DFmode:
16960 case E_V16SFmode:
16961 case E_V4DFmode:
16962 case E_V8SFmode:
16963 case E_V2DFmode:
16964 case E_V4SFmode:
16965 case E_V8HImode:
16966 case E_V8SImode:
16967 case E_V32HImode:
16968 case E_V64QImode:
16969 case E_V16SImode:
16970 case E_V8DImode:
16971 for (i = 0; i < nelt; ++i)
16972 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16973 break;
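/* E.g. for V4SFmode and perm { 0, 5, 2, 7 } this yields mask 0b1010; set
   bits select the corresponding element from op1, clear bits from op0 in
   the VEC_MERGE emitted at the end of this function.  */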
16974
16975 case E_V2DImode:
16976 for (i = 0; i < 2; ++i)
16977 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16978 vmode = V8HImode;
16979 goto do_subreg;
16980
16981 case E_V4SImode:
16982 for (i = 0; i < 4; ++i)
16983 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16984 vmode = V8HImode;
16985 goto do_subreg;
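/* Each SImode element maps to two HImode elements of the pblendw mask, so
   e.g. perm { 0, 5, 2, 7 } becomes the immediate 0b11001100.  */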
16986
16987 case E_V16QImode:
16988 /* See if bytes move in pairs so we can use pblendw with
16989 an immediate argument, rather than pblendvb with a vector
16990 argument. */
16991 for (i = 0; i < 16; i += 2)
16992 if (d->perm[i] + 1 != d->perm[i + 1])
16993 {
16994 use_pblendvb:
16995 for (i = 0; i < nelt; ++i)
16996 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16997
16998 finish_pblendvb:
16999 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17000 vperm = force_reg (vmode, vperm);
17001
17002 if (GET_MODE_SIZE (vmode) == 16)
17003 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
17004 else
17005 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
17006 if (target != d->target)
17007 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17008 return true;
17009 }
17010
17011 for (i = 0; i < 8; ++i)
17012 mask |= (d->perm[i * 2] >= 16) << i;
17013 vmode = V8HImode;
17014 /* FALLTHRU */
17015
17016 do_subreg:
17017 target = gen_reg_rtx (vmode);
17018 op0 = gen_lowpart (vmode, op0);
17019 op1 = gen_lowpart (vmode, op1);
17020 break;
17021
17022 case E_V32QImode:
17023 /* See if bytes move in pairs. If not, vpblendvb must be used. */
17024 for (i = 0; i < 32; i += 2)
17025 if (d->perm[i] + 1 != d->perm[i + 1])
17026 goto use_pblendvb;
17027 /* See if bytes move in quadruplets. If yes, vpblendd
17028 with immediate can be used. */
17029 for (i = 0; i < 32; i += 4)
17030 if (d->perm[i] + 2 != d->perm[i + 2])
17031 break;
17032 if (i < 32)
17033 {
17034 /* See if bytes move the same in both lanes. If yes,
17035 vpblendw with immediate can be used. */
17036 for (i = 0; i < 16; i += 2)
17037 if (d->perm[i] + 16 != d->perm[i + 16])
17038 goto use_pblendvb;
17039
17040 /* Use vpblendw. */
17041 for (i = 0; i < 16; ++i)
17042 mask |= (d->perm[i * 2] >= 32) << i;
17043 vmode = V16HImode;
17044 goto do_subreg;
17045 }
17046
17047 /* Use vpblendd. */
17048 for (i = 0; i < 8; ++i)
17049 mask |= (d->perm[i * 4] >= 32) << i;
17050 vmode = V8SImode;
17051 goto do_subreg;
17052
17053 case E_V16HImode:
17054 /* See if words move in pairs. If yes, vpblendd can be used. */
17055 for (i = 0; i < 16; i += 2)
17056 if (d->perm[i] + 1 != d->perm[i + 1])
17057 break;
17058 if (i < 16)
17059 {
17060 /* See if words move the same in both lanes. If not,
17061 vpblendvb must be used. */
17062 for (i = 0; i < 8; i++)
17063 if (d->perm[i] + 8 != d->perm[i + 8])
17064 {
17065 /* Use vpblendvb. */
17066 for (i = 0; i < 32; ++i)
17067 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
17068
17069 vmode = V32QImode;
17070 nelt = 32;
17071 target = gen_reg_rtx (vmode);
17072 op0 = gen_lowpart (vmode, op0);
17073 op1 = gen_lowpart (vmode, op1);
17074 goto finish_pblendvb;
17075 }
17076
17077 /* Use vpblendw. */
17078 for (i = 0; i < 16; ++i)
17079 mask |= (d->perm[i] >= 16) << i;
17080 break;
17081 }
17082
17083 /* Use vpblendd. */
17084 for (i = 0; i < 8; ++i)
17085 mask |= (d->perm[i * 2] >= 16) << i;
17086 vmode = V8SImode;
17087 goto do_subreg;
17088
17089 case E_V4DImode:
17090 /* Use vpblendd. */
17091 for (i = 0; i < 4; ++i)
17092 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
17093 vmode = V8SImode;
17094 goto do_subreg;
17095
17096 default:
17097 gcc_unreachable ();
17098 }
17099
17100 switch (vmode)
17101 {
17102 case E_V8DFmode:
17103 case E_V8DImode:
17104 mmode = QImode;
17105 break;
17106 case E_V16SFmode:
17107 case E_V16SImode:
17108 mmode = HImode;
17109 break;
17110 case E_V32HImode:
17111 mmode = SImode;
17112 break;
17113 case E_V64QImode:
17114 mmode = DImode;
17115 break;
17116 default:
17117 mmode = VOIDmode;
17118 }
17119
17120 if (mmode != VOIDmode)
17121 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
17122 else
17123 maskop = GEN_INT (mask);
17124
17125 /* This matches five different patterns with the different modes. */
17126 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
17127 x = gen_rtx_SET (target, x);
17128 emit_insn (x);
17129 if (target != d->target)
17130 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17131
17132 return true;
17133 }
17134
17135 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17136 in terms of the variable form of vpermilps.
17137
17138 Note that we will have already failed the immediate input vpermilps,
17139 which requires that the high and low part shuffle be identical; the
17140 variable form doesn't require that. */
17141
17142 static bool
17143 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
17144 {
17145 rtx rperm[8], vperm;
17146 unsigned i;
17147
17148 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
17149 return false;
17150
17151 /* We can only permute within the 128-bit lane. */
17152 for (i = 0; i < 8; ++i)
17153 {
17154 unsigned e = d->perm[i];
17155 if (i < 4 ? e >= 4 : e < 4)
17156 return false;
17157 }
17158
17159 if (d->testing_p)
17160 return true;
17161
17162 for (i = 0; i < 8; ++i)
17163 {
17164 unsigned e = d->perm[i];
17165
17166 /* Within each 128-bit lane, the elements of op0 are numbered
17167 from 0 and the elements of op1 are numbered from 4. */
17168 if (e >= 8 + 4)
17169 e -= 8;
17170 else if (e >= 4)
17171 e -= 4;
17172
17173 rperm[i] = GEN_INT (e);
17174 }
17175
17176 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
17177 vperm = force_reg (V8SImode, vperm);
17178 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
17179
17180 return true;
17181 }
17182
17183 /* Return true if permutation D can be performed as a VMODE permutation
17184 instead. */
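/* For example, the V16QImode permutation
   { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 } moves bytes only
   in aligned pairs, so it is also valid as the V8HImode permutation
   { 1, 0, 3, 2, 5, 4, 7, 6 }.  */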
17185
17186 static bool
17187 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
17188 {
17189 unsigned int i, j, chunk;
17190
17191 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
17192 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
17193 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
17194 return false;
17195
17196 if (GET_MODE_NUNITS (vmode) >= d->nelt)
17197 return true;
17198
17199 chunk = d->nelt / GET_MODE_NUNITS (vmode);
17200 for (i = 0; i < d->nelt; i += chunk)
17201 if (d->perm[i] & (chunk - 1))
17202 return false;
17203 else
17204 for (j = 1; j < chunk; ++j)
17205 if (d->perm[i] + j != d->perm[i + j])
17206 return false;
17207
17208 return true;
17209 }
17210
17211 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17212 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
17213
17214 static bool
17215 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
17216 {
17217 unsigned i, nelt, eltsz, mask;
17218 unsigned char perm[64];
17219 machine_mode vmode = V16QImode;
17220 rtx rperm[64], vperm, target, op0, op1;
17221
17222 nelt = d->nelt;
17223
17224 if (!d->one_operand_p)
17225 {
17226 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
17227 {
17228 if (TARGET_AVX2
17229 && valid_perm_using_mode_p (V2TImode, d))
17230 {
17231 if (d->testing_p)
17232 return true;
17233
17234 /* Use vperm2i128 insn. The pattern uses
17235 V4DImode instead of V2TImode. */
17236 target = d->target;
17237 if (d->vmode != V4DImode)
17238 target = gen_reg_rtx (V4DImode);
17239 op0 = gen_lowpart (V4DImode, d->op0);
17240 op1 = gen_lowpart (V4DImode, d->op1);
17241 rperm[0]
17242 = GEN_INT ((d->perm[0] / (nelt / 2))
17243 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
17244 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
17245 if (target != d->target)
17246 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17247 return true;
17248 }
17249 return false;
17250 }
17251 }
17252 else
17253 {
17254 if (GET_MODE_SIZE (d->vmode) == 16)
17255 {
17256 if (!TARGET_SSSE3)
17257 return false;
17258 }
17259 else if (GET_MODE_SIZE (d->vmode) == 32)
17260 {
17261 if (!TARGET_AVX2)
17262 return false;
17263
17264 /* V4DImode should be already handled through
17265 expand_vselect by vpermq instruction. */
17266 gcc_assert (d->vmode != V4DImode);
17267
17268 vmode = V32QImode;
17269 if (d->vmode == V8SImode
17270 || d->vmode == V16HImode
17271 || d->vmode == V32QImode)
17272 {
17273 /* First see if vpermq can be used for
17274 V8SImode/V16HImode/V32QImode. */
17275 if (valid_perm_using_mode_p (V4DImode, d))
17276 {
17277 for (i = 0; i < 4; i++)
17278 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
17279 if (d->testing_p)
17280 return true;
17281 target = gen_reg_rtx (V4DImode);
17282 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
17283 perm, 4, false))
17284 {
17285 emit_move_insn (d->target,
17286 gen_lowpart (d->vmode, target));
17287 return true;
17288 }
17289 return false;
17290 }
17291
17292 /* Next see if vpermd can be used. */
17293 if (valid_perm_using_mode_p (V8SImode, d))
17294 vmode = V8SImode;
17295 }
17296 /* Or if vpermps can be used. */
17297 else if (d->vmode == V8SFmode)
17298 vmode = V8SImode;
17299
17300 if (vmode == V32QImode)
17301 {
17302 /* vpshufb only works intra lanes; it is not
17303 possible to shuffle bytes between the lanes. */
17304 for (i = 0; i < nelt; ++i)
17305 if ((d->perm[i] ^ i) & (nelt / 2))
17306 return false;
17307 }
17308 }
17309 else if (GET_MODE_SIZE (d->vmode) == 64)
17310 {
17311 if (!TARGET_AVX512BW)
17312 return false;
17313
17314 /* If vpermq didn't work, vpshufb won't work either. */
17315 if (d->vmode == V8DFmode || d->vmode == V8DImode)
17316 return false;
17317
17318 vmode = V64QImode;
17319 if (d->vmode == V16SImode
17320 || d->vmode == V32HImode
17321 || d->vmode == V64QImode)
17322 {
17323 /* First see if vpermq can be used for
17324 V16SImode/V32HImode/V64QImode. */
17325 if (valid_perm_using_mode_p (V8DImode, d))
17326 {
17327 for (i = 0; i < 8; i++)
17328 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
17329 if (d->testing_p)
17330 return true;
17331 target = gen_reg_rtx (V8DImode);
17332 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
17333 perm, 8, false))
17334 {
17335 emit_move_insn (d->target,
17336 gen_lowpart (d->vmode, target));
17337 return true;
17338 }
17339 return false;
17340 }
17341
17342 /* Next see if vpermd can be used. */
17343 if (valid_perm_using_mode_p (V16SImode, d))
17344 vmode = V16SImode;
17345 }
17346 /* Or if vpermps can be used. */
17347 else if (d->vmode == V16SFmode)
17348 vmode = V16SImode;
17349 if (vmode == V64QImode)
17350 {
17351 /* vpshufb only works intra lanes; it is not
17352 possible to shuffle bytes between the lanes. */
17353 for (i = 0; i < nelt; ++i)
17354 if ((d->perm[i] ^ i) & (3 * nelt / 4))
17355 return false;
17356 }
17357 }
17358 else
17359 return false;
17360 }
17361
17362 if (d->testing_p)
17363 return true;
17364
17365 if (vmode == V8SImode)
17366 for (i = 0; i < 8; ++i)
17367 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
17368 else if (vmode == V16SImode)
17369 for (i = 0; i < 16; ++i)
17370 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
17371 else
17372 {
17373 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
17374 if (!d->one_operand_p)
17375 mask = 2 * nelt - 1;
17376 else if (vmode == V16QImode)
17377 mask = nelt - 1;
17378 else if (vmode == V64QImode)
17379 mask = nelt / 4 - 1;
17380 else
17381 mask = nelt / 2 - 1;
17382
17383 for (i = 0; i < nelt; ++i)
17384 {
17385 unsigned j, e = d->perm[i] & mask;
17386 for (j = 0; j < eltsz; ++j)
17387 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
17388 }
17389 }
17390
17391 vperm = gen_rtx_CONST_VECTOR (vmode,
17392 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
17393 vperm = force_reg (vmode, vperm);
17394
17395 target = d->target;
17396 if (d->vmode != vmode)
17397 target = gen_reg_rtx (vmode);
17398 op0 = gen_lowpart (vmode, d->op0);
17399 if (d->one_operand_p)
17400 {
17401 if (vmode == V16QImode)
17402 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
17403 else if (vmode == V32QImode)
17404 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
17405 else if (vmode == V64QImode)
17406 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
17407 else if (vmode == V8SFmode)
17408 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
17409 else if (vmode == V8SImode)
17410 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
17411 else if (vmode == V16SFmode)
17412 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
17413 else if (vmode == V16SImode)
17414 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
17415 else
17416 gcc_unreachable ();
17417 }
17418 else
17419 {
17420 op1 = gen_lowpart (vmode, d->op1);
17421 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
17422 }
17423 if (target != d->target)
17424 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17425
17426 return true;
17427 }
17428
17429 /* For V*[QHS]Imode permutations, check whether the same permutation
17430 can instead be performed in a 2x, 4x or 8x wider inner mode. */
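/* E.g. the V8HImode permutation { 2, 3, 6, 7, 0, 1, 4, 5 } is rewritten as
   the V4SImode permutation { 1, 3, 0, 2 }; the recursive call below then
   checks whether that can in turn be done in V2DImode (not possible here,
   since the new element 0 is odd).  */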
17431
17432 static bool
17433 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
17434 struct expand_vec_perm_d *nd)
17435 {
17436 int i;
17437 machine_mode mode = VOIDmode;
17438
17439 switch (d->vmode)
17440 {
17441 case E_V16QImode: mode = V8HImode; break;
17442 case E_V32QImode: mode = V16HImode; break;
17443 case E_V64QImode: mode = V32HImode; break;
17444 case E_V8HImode: mode = V4SImode; break;
17445 case E_V16HImode: mode = V8SImode; break;
17446 case E_V32HImode: mode = V16SImode; break;
17447 case E_V4SImode: mode = V2DImode; break;
17448 case E_V8SImode: mode = V4DImode; break;
17449 case E_V16SImode: mode = V8DImode; break;
17450 default: return false;
17451 }
17452 for (i = 0; i < d->nelt; i += 2)
17453 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
17454 return false;
17455 nd->vmode = mode;
17456 nd->nelt = d->nelt / 2;
17457 for (i = 0; i < nd->nelt; i++)
17458 nd->perm[i] = d->perm[2 * i] / 2;
17459 if (GET_MODE_INNER (mode) != DImode)
17460 canonicalize_vector_int_perm (nd, nd);
17461 if (nd != d)
17462 {
17463 nd->one_operand_p = d->one_operand_p;
17464 nd->testing_p = d->testing_p;
17465 if (d->op0 == d->op1)
17466 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
17467 else
17468 {
17469 nd->op0 = gen_lowpart (nd->vmode, d->op0);
17470 nd->op1 = gen_lowpart (nd->vmode, d->op1);
17471 }
17472 if (d->testing_p)
17473 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
17474 else
17475 nd->target = gen_reg_rtx (nd->vmode);
17476 }
17477 return true;
17478 }
17479
17480 /* Try to expand one-operand permutation with constant mask. */
17481
17482 static bool
17483 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
17484 {
17485 machine_mode mode = GET_MODE (d->op0);
17486 machine_mode maskmode = mode;
17487 rtx (*gen) (rtx, rtx, rtx) = NULL;
17488 rtx target, op0, mask;
17489 rtx vec[64];
17490
17491 if (!rtx_equal_p (d->op0, d->op1))
17492 return false;
17493
17494 if (!TARGET_AVX512F)
17495 return false;
17496
17497 switch (mode)
17498 {
17499 case E_V16SImode:
17500 gen = gen_avx512f_permvarv16si;
17501 break;
17502 case E_V16SFmode:
17503 gen = gen_avx512f_permvarv16sf;
17504 maskmode = V16SImode;
17505 break;
17506 case E_V8DImode:
17507 gen = gen_avx512f_permvarv8di;
17508 break;
17509 case E_V8DFmode:
17510 gen = gen_avx512f_permvarv8df;
17511 maskmode = V8DImode;
17512 break;
17513 default:
17514 return false;
17515 }
17516
17517 target = d->target;
17518 op0 = d->op0;
17519 for (int i = 0; i < d->nelt; ++i)
17520 vec[i] = GEN_INT (d->perm[i]);
17521 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17522 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17523 return true;
17524 }
17525
17526 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17527
17528 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
17529 in a single instruction. */
17530
17531 static bool
17532 expand_vec_perm_1 (struct expand_vec_perm_d *d)
17533 {
17534 unsigned i, nelt = d->nelt;
17535 struct expand_vec_perm_d nd;
17536
17537 /* Check plain VEC_SELECT first, because AVX has instructions that could
17538 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17539 input where SEL+CONCAT may not. */
17540 if (d->one_operand_p)
17541 {
17542 int mask = nelt - 1;
17543 bool identity_perm = true;
17544 bool broadcast_perm = true;
17545
17546 for (i = 0; i < nelt; i++)
17547 {
17548 nd.perm[i] = d->perm[i] & mask;
17549 if (nd.perm[i] != i)
17550 identity_perm = false;
17551 if (nd.perm[i])
17552 broadcast_perm = false;
17553 }
17554
17555 if (identity_perm)
17556 {
17557 if (!d->testing_p)
17558 emit_move_insn (d->target, d->op0);
17559 return true;
17560 }
17561 else if (broadcast_perm && TARGET_AVX2)
17562 {
17563 /* Use vpbroadcast{b,w,d}. */
17564 rtx (*gen) (rtx, rtx) = NULL;
17565 switch (d->vmode)
17566 {
17567 case E_V64QImode:
17568 if (TARGET_AVX512BW)
17569 gen = gen_avx512bw_vec_dupv64qi_1;
17570 break;
17571 case E_V32QImode:
17572 gen = gen_avx2_pbroadcastv32qi_1;
17573 break;
17574 case E_V32HImode:
17575 if (TARGET_AVX512BW)
17576 gen = gen_avx512bw_vec_dupv32hi_1;
17577 break;
17578 case E_V16HImode:
17579 gen = gen_avx2_pbroadcastv16hi_1;
17580 break;
17581 case E_V16SImode:
17582 if (TARGET_AVX512F)
17583 gen = gen_avx512f_vec_dupv16si_1;
17584 break;
17585 case E_V8SImode:
17586 gen = gen_avx2_pbroadcastv8si_1;
17587 break;
17588 case E_V16QImode:
17589 gen = gen_avx2_pbroadcastv16qi;
17590 break;
17591 case E_V8HImode:
17592 gen = gen_avx2_pbroadcastv8hi;
17593 break;
17594 case E_V16SFmode:
17595 if (TARGET_AVX512F)
17596 gen = gen_avx512f_vec_dupv16sf_1;
17597 break;
17598 case E_V8SFmode:
17599 gen = gen_avx2_vec_dupv8sf_1;
17600 break;
17601 case E_V8DFmode:
17602 if (TARGET_AVX512F)
17603 gen = gen_avx512f_vec_dupv8df_1;
17604 break;
17605 case E_V8DImode:
17606 if (TARGET_AVX512F)
17607 gen = gen_avx512f_vec_dupv8di_1;
17608 break;
17609 /* For other modes prefer other shuffles this function creates. */
17610 default: break;
17611 }
17612 if (gen != NULL)
17613 {
17614 if (!d->testing_p)
17615 emit_insn (gen (d->target, d->op0));
17616 return true;
17617 }
17618 }
17619
17620 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17621 return true;
17622
17623 /* There are plenty of patterns in sse.md that are written for
17624 SEL+CONCAT and are not replicated for a single op. Perhaps
17625 that should be changed, to avoid the nastiness here. */
17626
17627 /* Recognize interleave style patterns, which means incrementing
17628 every other permutation operand. */
17629 for (i = 0; i < nelt; i += 2)
17630 {
17631 nd.perm[i] = d->perm[i] & mask;
17632 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17633 }
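/* E.g. the one-operand V4SImode permutation { 0, 0, 1, 1 } becomes
   { 0, 4, 1, 5 } on (vec_concat op0 op0), i.e. the interleave-low
   (punpckldq) pattern.  */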
17634 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17635 d->testing_p))
17636 return true;
17637
17638 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17639 if (nelt >= 4)
17640 {
17641 for (i = 0; i < nelt; i += 4)
17642 {
17643 nd.perm[i + 0] = d->perm[i + 0] & mask;
17644 nd.perm[i + 1] = d->perm[i + 1] & mask;
17645 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17646 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17647 }
17648
17649 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17650 d->testing_p))
17651 return true;
17652 }
17653 }
17654
17655 /* Try movss/movsd instructions. */
17656 if (expand_vec_perm_movs (d))
17657 return true;
17658
17659 /* Finally, try the fully general two operand permute. */
17660 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17661 d->testing_p))
17662 return true;
17663
17664 /* Recognize interleave style patterns with reversed operands. */
17665 if (!d->one_operand_p)
17666 {
17667 for (i = 0; i < nelt; ++i)
17668 {
17669 unsigned e = d->perm[i];
17670 if (e >= nelt)
17671 e -= nelt;
17672 else
17673 e += nelt;
17674 nd.perm[i] = e;
17675 }
17676
17677 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17678 d->testing_p))
17679 return true;
17680 }
17681
17682 /* Try the SSE4.1 blend variable merge instructions. */
17683 if (expand_vec_perm_blend (d))
17684 return true;
17685
17686 /* Try one of the AVX vpermil variable permutations. */
17687 if (expand_vec_perm_vpermil (d))
17688 return true;
17689
17690 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17691 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17692 if (expand_vec_perm_pshufb (d))
17693 return true;
17694
17695 /* Try the AVX2 vpalignr instruction. */
17696 if (expand_vec_perm_palignr (d, true))
17697 return true;
17698
17699 /* Try the AVX512F vperm{s,d} instructions. */
17700 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17701 return true;
17702
17703 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17704 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17705 return true;
17706
17707 /* See if we can get the same permutation in different vector integer
17708 mode. */
17709 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17710 {
17711 if (!d->testing_p)
17712 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17713 return true;
17714 }
17715 return false;
17716 }
17717
17718 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17719 in terms of a pair of pshuflw + pshufhw instructions. */
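/* E.g. { 3, 1, 0, 2, 5, 7, 6, 4 } is handled as a pshuflw reordering the
   low half as { 3, 1, 0, 2 } followed by a pshufhw reordering the high half
   as { 5, 7, 6, 4 }; each insn leaves the other 64-bit half untouched.  */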
17720
17721 static bool
17722 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17723 {
17724 unsigned char perm2[MAX_VECT_LEN];
17725 unsigned i;
17726 bool ok;
17727
17728 if (d->vmode != V8HImode || !d->one_operand_p)
17729 return false;
17730
17731 /* The two permutations only operate in 64-bit lanes. */
17732 for (i = 0; i < 4; ++i)
17733 if (d->perm[i] >= 4)
17734 return false;
17735 for (i = 4; i < 8; ++i)
17736 if (d->perm[i] < 4)
17737 return false;
17738
17739 if (d->testing_p)
17740 return true;
17741
17742 /* Emit the pshuflw. */
17743 memcpy (perm2, d->perm, 4);
17744 for (i = 4; i < 8; ++i)
17745 perm2[i] = i;
17746 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17747 gcc_assert (ok);
17748
17749 /* Emit the pshufhw. */
17750 memcpy (perm2 + 4, d->perm + 4, 4);
17751 for (i = 0; i < 4; ++i)
17752 perm2[i] = i;
17753 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17754 gcc_assert (ok);
17755
17756 return true;
17757 }
17758
17759 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17760 the permutation using the SSSE3 palignr instruction. This succeeds
17761 when all of the elements in PERM fit within one vector and we merely
17762 need to shift them down so that a single vector permutation has a
17763 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
17764 the vpalignr instruction itself can perform the requested permutation. */
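/* E.g. the two-operand V8HImode permutation { 3, 4, 5, 6, 7, 8, 9, 10 } has
   min == 3, so a palignr of (op1:op0) by 3 elements already produces the
   requested result and no follow-up permutation is needed.  */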
17765
17766 static bool
17767 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17768 {
17769 unsigned i, nelt = d->nelt;
17770 unsigned min, max, minswap, maxswap;
17771 bool in_order, ok, swap = false;
17772 rtx shift, target;
17773 struct expand_vec_perm_d dcopy;
17774
17775 /* Even with AVX, palignr only operates on 128-bit vectors;
17776 with AVX2, palignr operates on both 128-bit lanes. */
17777 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17778 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17779 return false;
17780
17781 min = 2 * nelt;
17782 max = 0;
17783 minswap = 2 * nelt;
17784 maxswap = 0;
17785 for (i = 0; i < nelt; ++i)
17786 {
17787 unsigned e = d->perm[i];
17788 unsigned eswap = d->perm[i] ^ nelt;
17789 if (GET_MODE_SIZE (d->vmode) == 32)
17790 {
17791 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17792 eswap = e ^ (nelt / 2);
17793 }
17794 if (e < min)
17795 min = e;
17796 if (e > max)
17797 max = e;
17798 if (eswap < minswap)
17799 minswap = eswap;
17800 if (eswap > maxswap)
17801 maxswap = eswap;
17802 }
17803 if (min == 0
17804 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17805 {
17806 if (d->one_operand_p
17807 || minswap == 0
17808 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17809 ? nelt / 2 : nelt))
17810 return false;
17811 swap = true;
17812 min = minswap;
17813 max = maxswap;
17814 }
17815
17816 /* Given that we have SSSE3, we know we'll be able to implement the
17817 single operand permutation after the palignr with pshufb for
17818 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17819 first. */
17820 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17821 return true;
17822
17823 dcopy = *d;
17824 if (swap)
17825 {
17826 dcopy.op0 = d->op1;
17827 dcopy.op1 = d->op0;
17828 for (i = 0; i < nelt; ++i)
17829 dcopy.perm[i] ^= nelt;
17830 }
17831
17832 in_order = true;
17833 for (i = 0; i < nelt; ++i)
17834 {
17835 unsigned e = dcopy.perm[i];
17836 if (GET_MODE_SIZE (d->vmode) == 32
17837 && e >= nelt
17838 && (e & (nelt / 2 - 1)) < min)
17839 e = e - min - (nelt / 2);
17840 else
17841 e = e - min;
17842 if (e != i)
17843 in_order = false;
17844 dcopy.perm[i] = e;
17845 }
17846 dcopy.one_operand_p = true;
17847
17848 if (single_insn_only_p && !in_order)
17849 return false;
17850
17851 /* For AVX2, test whether we can permute the result in one instruction. */
17852 if (d->testing_p)
17853 {
17854 if (in_order)
17855 return true;
17856 dcopy.op1 = dcopy.op0;
17857 return expand_vec_perm_1 (&dcopy);
17858 }
17859
17860 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17861 if (GET_MODE_SIZE (d->vmode) == 16)
17862 {
17863 target = gen_reg_rtx (TImode);
17864 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17865 gen_lowpart (TImode, dcopy.op0), shift));
17866 }
17867 else
17868 {
17869 target = gen_reg_rtx (V2TImode);
17870 emit_insn (gen_avx2_palignrv2ti (target,
17871 gen_lowpart (V2TImode, dcopy.op1),
17872 gen_lowpart (V2TImode, dcopy.op0),
17873 shift));
17874 }
17875
17876 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17877
17878 /* Test for the degenerate case where the alignment by itself
17879 produces the desired permutation. */
17880 if (in_order)
17881 {
17882 emit_move_insn (d->target, dcopy.op0);
17883 return true;
17884 }
17885
17886 ok = expand_vec_perm_1 (&dcopy);
17887 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17888
17889 return ok;
17890 }
17891
17892 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17893 the permutation using the SSE4_1 pblendv instruction. Potentially
17894 reduces the permutation from two pshufb insns and an ior to one pshufb and a pblendv. */
17895
17896 static bool
17897 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17898 {
17899 unsigned i, which, nelt = d->nelt;
17900 struct expand_vec_perm_d dcopy, dcopy1;
17901 machine_mode vmode = d->vmode;
17902 bool ok;
17903
17904 /* Use the same checks as in expand_vec_perm_blend. */
17905 if (d->one_operand_p)
17906 return false;
17907 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17908 ;
17909 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17910 ;
17911 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17912 ;
17913 else
17914 return false;
17915
17916 /* Figure out which permutation elements do not stay in their
17917 respective lanes. */
17918 for (i = 0, which = 0; i < nelt; ++i)
17919 {
17920 unsigned e = d->perm[i];
17921 if (e != i)
17922 which |= (e < nelt ? 1 : 2);
17923 }
17924 /* We can pblend the part where elements do not stay in their
17925 respective lanes only when these elements are all in one
17926 half of the permutation.
17927 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not at their respective
17928 lanes, but both 8 and 9 are >= 8.
17929 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not at their
17930 respective lanes, and 8 is >= 8 but 2 is not. */
17931 if (which != 1 && which != 2)
17932 return false;
17933 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17934 return true;
17935
17936 /* First we apply a one-operand permutation to the part whose
17937 elements do not stay in their respective lanes. */
17938 dcopy = *d;
17939 if (which == 2)
17940 dcopy.op0 = dcopy.op1 = d->op1;
17941 else
17942 dcopy.op0 = dcopy.op1 = d->op0;
17943 if (!d->testing_p)
17944 dcopy.target = gen_reg_rtx (vmode);
17945 dcopy.one_operand_p = true;
17946
17947 for (i = 0; i < nelt; ++i)
17948 dcopy.perm[i] = d->perm[i] & (nelt - 1);
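/* For the { 0 1 8 3 4 5 9 7 } example above, dcopy permutes op1 alone with
   { 0 1 0 3 4 5 1 7 }; dcopy1 below then blends positions 2 and 6 of that
   intermediate result into op0 using { 0 1 10 3 4 5 14 7 }.  */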
17949
17950 ok = expand_vec_perm_1 (&dcopy);
17951 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17952 return false;
17953 else
17954 gcc_assert (ok);
17955 if (d->testing_p)
17956 return true;
17957
17958 /* Next we put permuted elements into their positions. */
17959 dcopy1 = *d;
17960 if (which == 2)
17961 dcopy1.op1 = dcopy.target;
17962 else
17963 dcopy1.op0 = dcopy.target;
17964
17965 for (i = 0; i < nelt; ++i)
17966 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17967
17968 ok = expand_vec_perm_blend (&dcopy1);
17969 gcc_assert (ok);
17970
17971 return true;
17972 }
17973
17974 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17975
17976 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17977 a two vector permutation into a single vector permutation by using
17978 an interleave operation to merge the vectors. */
17979
17980 static bool
17981 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17982 {
17983 struct expand_vec_perm_d dremap, dfinal;
17984 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17985 unsigned HOST_WIDE_INT contents;
17986 unsigned char remap[2 * MAX_VECT_LEN];
17987 rtx_insn *seq;
17988 bool ok, same_halves = false;
17989
17990 if (GET_MODE_SIZE (d->vmode) == 16)
17991 {
17992 if (d->one_operand_p)
17993 return false;
17994 }
17995 else if (GET_MODE_SIZE (d->vmode) == 32)
17996 {
17997 if (!TARGET_AVX)
17998 return false;
17999 /* For 32-byte modes allow even d->one_operand_p.
18000 The lack of cross-lane shuffling in some instructions
18001 might prevent a single insn shuffle. */
18002 dfinal = *d;
18003 dfinal.testing_p = true;
18004 /* If expand_vec_perm_interleave3 can expand this into
18005 a 3 insn sequence, give up and let it be expanded as
18006 a 3 insn sequence. While that is one insn longer,
18007 it doesn't need a memory operand, and in the common
18008 case that both the interleave low and interleave high
18009 permutations of the same operands are adjacent, it
18010 needs only 4 insns for both after CSE. */
18011 if (expand_vec_perm_interleave3 (&dfinal))
18012 return false;
18013 }
18014 else
18015 return false;
18016
18017 /* Examine from whence the elements come. */
18018 contents = 0;
18019 for (i = 0; i < nelt; ++i)
18020 contents |= HOST_WIDE_INT_1U << d->perm[i];
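/* Bit K of CONTENTS is now set iff some element of the result comes from
   position K of the virtual double-width vector { op0, op1 }.  */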
18021
18022 memset (remap, 0xff, sizeof (remap));
18023 dremap = *d;
18024
18025 if (GET_MODE_SIZE (d->vmode) == 16)
18026 {
18027 unsigned HOST_WIDE_INT h1, h2, h3, h4;
18028
18029 /* Split the two input vectors into 4 halves. */
18030 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
18031 h2 = h1 << nelt2;
18032 h3 = h2 << nelt2;
18033 h4 = h3 << nelt2;
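/* h1/h2 cover the low/high half of op0, h3/h4 the low/high half of op1.  */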
18034
18035 /* If the elements all come from the low halves, use interleave low;
18036 similarly interleave high for the high halves. If the elements are
18037 from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
18038 if ((contents & (h1 | h3)) == contents)
18039 {
18040 /* punpckl* */
18041 for (i = 0; i < nelt2; ++i)
18042 {
18043 remap[i] = i * 2;
18044 remap[i + nelt] = i * 2 + 1;
18045 dremap.perm[i * 2] = i;
18046 dremap.perm[i * 2 + 1] = i + nelt;
18047 }
18048 if (!TARGET_SSE2 && d->vmode == V4SImode)
18049 dremap.vmode = V4SFmode;
18050 }
18051 else if ((contents & (h2 | h4)) == contents)
18052 {
18053 /* punpckh* */
18054 for (i = 0; i < nelt2; ++i)
18055 {
18056 remap[i + nelt2] = i * 2;
18057 remap[i + nelt + nelt2] = i * 2 + 1;
18058 dremap.perm[i * 2] = i + nelt2;
18059 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
18060 }
18061 if (!TARGET_SSE2 && d->vmode == V4SImode)
18062 dremap.vmode = V4SFmode;
18063 }
18064 else if ((contents & (h1 | h4)) == contents)
18065 {
18066 /* shufps */
18067 for (i = 0; i < nelt2; ++i)
18068 {
18069 remap[i] = i;
18070 remap[i + nelt + nelt2] = i + nelt2;
18071 dremap.perm[i] = i;
18072 dremap.perm[i + nelt2] = i + nelt + nelt2;
18073 }
18074 if (nelt != 4)
18075 {
18076 /* shufpd */
18077 dremap.vmode = V2DImode;
18078 dremap.nelt = 2;
18079 dremap.perm[0] = 0;
18080 dremap.perm[1] = 3;
18081 }
18082 }
18083 else if ((contents & (h2 | h3)) == contents)
18084 {
18085 /* shufps */
18086 for (i = 0; i < nelt2; ++i)
18087 {
18088 remap[i + nelt2] = i;
18089 remap[i + nelt] = i + nelt2;
18090 dremap.perm[i] = i + nelt2;
18091 dremap.perm[i + nelt2] = i + nelt;
18092 }
18093 if (nelt != 4)
18094 {
18095 /* shufpd */
18096 dremap.vmode = V2DImode;
18097 dremap.nelt = 2;
18098 dremap.perm[0] = 1;
18099 dremap.perm[1] = 2;
18100 }
18101 }
18102 else
18103 return false;
18104 }
18105 else
18106 {
18107 unsigned int nelt4 = nelt / 4, nzcnt = 0;
18108 unsigned HOST_WIDE_INT q[8];
18109 unsigned int nonzero_halves[4];
18110
18111 /* Split the two input vectors into 8 quarters. */
18112 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
18113 for (i = 1; i < 8; ++i)
18114 q[i] = q[0] << (nelt4 * i);
18115 for (i = 0; i < 4; ++i)
18116 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
18117 {
18118 nonzero_halves[nzcnt] = i;
18119 ++nzcnt;
18120 }
18121
18122 if (nzcnt == 1)
18123 {
18124 gcc_assert (d->one_operand_p);
18125 nonzero_halves[1] = nonzero_halves[0];
18126 same_halves = true;
18127 }
18128 else if (d->one_operand_p)
18129 {
18130 gcc_assert (nonzero_halves[0] == 0);
18131 gcc_assert (nonzero_halves[1] == 1);
18132 }
18133
18134 if (nzcnt <= 2)
18135 {
18136 if (d->perm[0] / nelt2 == nonzero_halves[1])
18137 {
18138 /* Attempt to increase the likelihood that dfinal
18139 shuffle will be intra-lane. */
18140 std::swap (nonzero_halves[0], nonzero_halves[1]);
18141 }
18142
18143 /* vperm2f128 or vperm2i128. */
18144 for (i = 0; i < nelt2; ++i)
18145 {
18146 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
18147 remap[i + nonzero_halves[0] * nelt2] = i;
18148 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
18149 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
18150 }
18151
18152 if (d->vmode != V8SFmode
18153 && d->vmode != V4DFmode
18154 && d->vmode != V8SImode)
18155 {
18156 dremap.vmode = V8SImode;
18157 dremap.nelt = 8;
18158 for (i = 0; i < 4; ++i)
18159 {
18160 dremap.perm[i] = i + nonzero_halves[0] * 4;
18161 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
18162 }
18163 }
18164 }
18165 else if (d->one_operand_p)
18166 return false;
18167 else if (TARGET_AVX2
18168 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
18169 {
18170 /* vpunpckl* */
18171 for (i = 0; i < nelt4; ++i)
18172 {
18173 remap[i] = i * 2;
18174 remap[i + nelt] = i * 2 + 1;
18175 remap[i + nelt2] = i * 2 + nelt2;
18176 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
18177 dremap.perm[i * 2] = i;
18178 dremap.perm[i * 2 + 1] = i + nelt;
18179 dremap.perm[i * 2 + nelt2] = i + nelt2;
18180 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
18181 }
18182 }
18183 else if (TARGET_AVX2
18184 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
18185 {
18186 /* vpunpckh* */
18187 for (i = 0; i < nelt4; ++i)
18188 {
18189 remap[i + nelt4] = i * 2;
18190 remap[i + nelt + nelt4] = i * 2 + 1;
18191 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
18192 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
18193 dremap.perm[i * 2] = i + nelt4;
18194 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
18195 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
18196 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
18197 }
18198 }
18199 else
18200 return false;
18201 }
18202
18203 /* Use the remapping array set up above to move the elements from their
18204 swizzled locations into their final destinations. */
18205 dfinal = *d;
18206 for (i = 0; i < nelt; ++i)
18207 {
18208 unsigned e = remap[d->perm[i]];
18209 gcc_assert (e < nelt);
18210 /* If same_halves is true, both halves of the remapped vector are the
18211 same. Avoid cross-lane accesses if possible. */
18212 if (same_halves && i >= nelt2)
18213 {
18214 gcc_assert (e < nelt2);
18215 dfinal.perm[i] = e + nelt2;
18216 }
18217 else
18218 dfinal.perm[i] = e;
18219 }
18220 if (!d->testing_p)
18221 {
18222 dremap.target = gen_reg_rtx (dremap.vmode);
18223 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
18224 }
18225 dfinal.op1 = dfinal.op0;
18226 dfinal.one_operand_p = true;
18227
18228 /* Test if the final remap can be done with a single insn. For V4SFmode or
18229 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
18230 start_sequence ();
18231 ok = expand_vec_perm_1 (&dfinal);
18232 seq = get_insns ();
18233 end_sequence ();
18234
18235 if (!ok)
18236 return false;
18237
18238 if (d->testing_p)
18239 return true;
18240
18241 if (dremap.vmode != dfinal.vmode)
18242 {
18243 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
18244 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
18245 }
18246
18247 ok = expand_vec_perm_1 (&dremap);
18248 gcc_assert (ok);
18249
18250 emit_insn (seq);
18251 return true;
18252 }
18253
18254 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18255 a single vector cross-lane permutation into vpermq followed
18256 by any of the single insn permutations. */
18257
18258 static bool
18259 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
18260 {
18261 struct expand_vec_perm_d dremap, dfinal;
18262 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
18263 unsigned contents[2];
18264 bool ok;
18265
18266 if (!(TARGET_AVX2
18267 && (d->vmode == V32QImode || d->vmode == V16HImode)
18268 && d->one_operand_p))
18269 return false;
18270
18271 contents[0] = 0;
18272 contents[1] = 0;
18273 for (i = 0; i < nelt2; ++i)
18274 {
18275 contents[0] |= 1u << (d->perm[i] / nelt4);
18276 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
18277 }
18278
18279 for (i = 0; i < 2; ++i)
18280 {
18281 unsigned int cnt = 0;
18282 for (j = 0; j < 4; ++j)
18283 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
18284 return false;
18285 }
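/* Each half of the result may thus reference at most two of the four 64-bit
   quarters of op0; the vpermq emitted below gathers those quarters so that
   the final shuffle only has to work intra-lane.  */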
18286
18287 if (d->testing_p)
18288 return true;
18289
18290 dremap = *d;
18291 dremap.vmode = V4DImode;
18292 dremap.nelt = 4;
18293 dremap.target = gen_reg_rtx (V4DImode);
18294 dremap.op0 = gen_lowpart (V4DImode, d->op0);
18295 dremap.op1 = dremap.op0;
18296 dremap.one_operand_p = true;
18297 for (i = 0; i < 2; ++i)
18298 {
18299 unsigned int cnt = 0;
18300 for (j = 0; j < 4; ++j)
18301 if ((contents[i] & (1u << j)) != 0)
18302 dremap.perm[2 * i + cnt++] = j;
18303 for (; cnt < 2; ++cnt)
18304 dremap.perm[2 * i + cnt] = 0;
18305 }
18306
18307 dfinal = *d;
18308 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
18309 dfinal.op1 = dfinal.op0;
18310 dfinal.one_operand_p = true;
18311 for (i = 0, j = 0; i < nelt; ++i)
18312 {
18313 if (i == nelt2)
18314 j = 2;
18315 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
18316 if ((d->perm[i] / nelt4) == dremap.perm[j])
18317 ;
18318 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
18319 dfinal.perm[i] |= nelt4;
18320 else
18321 gcc_unreachable ();
18322 }
18323
18324 ok = expand_vec_perm_1 (&dremap);
18325 gcc_assert (ok);
18326
18327 ok = expand_vec_perm_1 (&dfinal);
18328 gcc_assert (ok);
18329
18330 return true;
18331 }
18332
18333 static bool canonicalize_perm (struct expand_vec_perm_d *d);
18334
18335 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
18336 a vector permutation using two instructions, vperm2f128 resp.
18337 vperm2i128 followed by any single in-lane permutation. */
18338
18339 static bool
18340 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
18341 {
18342 struct expand_vec_perm_d dfirst, dsecond;
18343 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
18344 bool ok;
18345
18346 if (!TARGET_AVX
18347 || GET_MODE_SIZE (d->vmode) != 32
18348 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
18349 return false;
18350
18351 dsecond = *d;
18352 dsecond.one_operand_p = false;
18353 dsecond.testing_p = true;
18354
18355 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
18356 immediate. For perm < 16 the second permutation uses
18357 d->op0 as first operand, for perm >= 16 it uses d->op1
18358 as first operand. The second operand is the result of
18359 vperm2[fi]128. */
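/* PERM acts as a 5-bit search index: bits 0-1 select which 128-bit lane of
   { op0, op1 } feeds the low lane of the vperm2[fi]128 result, bits 2-3
   select the lane feeding its high lane (0/1 = low/high lane of op0,
   2/3 = low/high lane of op1), and bit 4 selects whether the second shuffle
   reads d->op0 or d->op1 alongside that result.  */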
18360 for (perm = 0; perm < 32; perm++)
18361 {
18362 /* Ignore permutations which do not move anything cross-lane. */
18363 if (perm < 16)
18364 {
18365 /* The second shuffle for e.g. V4DFmode has
18366 0123 and ABCD operands.
18367 Ignore AB23, as 23 is already in the second lane
18368 of the first operand. */
18369 if ((perm & 0xc) == (1 << 2)) continue;
18370 /* And 01CD, as 01 is in the first lane of the first
18371 operand. */
18372 if ((perm & 3) == 0) continue;
18373 /* And 4567, as then the vperm2[fi]128 doesn't change
18374 anything on the original 4567 second operand. */
18375 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
18376 }
18377 else
18378 {
18379 /* The second shuffle for e.g. V4DFmode has
18380 4567 and ABCD operands.
18381 Ignore AB67, as 67 is already in the second lane
18382 of the first operand. */
18383 if ((perm & 0xc) == (3 << 2)) continue;
18384 /* And 45CD, as 45 is in the first lane of the first
18385 operand. */
18386 if ((perm & 3) == 2) continue;
18387 /* And 0123, as then the vperm2[fi]128 doesn't change
18388 anything on the original 0123 first operand. */
18389 if ((perm & 0xf) == (1 << 2)) continue;
18390 }
18391
18392 for (i = 0; i < nelt; i++)
18393 {
18394 j = d->perm[i] / nelt2;
18395 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
18396 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
18397 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
18398 dsecond.perm[i] = d->perm[i] & (nelt - 1);
18399 else
18400 break;
18401 }
18402
18403 if (i == nelt)
18404 {
18405 start_sequence ();
18406 ok = expand_vec_perm_1 (&dsecond);
18407 end_sequence ();
18408 }
18409 else
18410 ok = false;
18411
18412 if (ok)
18413 {
18414 if (d->testing_p)
18415 return true;
18416
18417 /* Found a usable second shuffle. dfirst will be
18418 vperm2f128 on d->op0 and d->op1. */
18419 dsecond.testing_p = false;
18420 dfirst = *d;
18421 dfirst.target = gen_reg_rtx (d->vmode);
18422 for (i = 0; i < nelt; i++)
18423 dfirst.perm[i] = (i & (nelt2 - 1))
18424 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
18425
18426 canonicalize_perm (&dfirst);
18427 ok = expand_vec_perm_1 (&dfirst);
18428 gcc_assert (ok);
18429
18430 /* And dsecond is some single insn shuffle, taking
18431 d->op0 and result of vperm2f128 (if perm < 16) or
18432 d->op1 and result of vperm2f128 (otherwise). */
18433 if (perm >= 16)
18434 dsecond.op0 = dsecond.op1;
18435 dsecond.op1 = dfirst.target;
18436
18437 ok = expand_vec_perm_1 (&dsecond);
18438 gcc_assert (ok);
18439
18440 return true;
18441 }
18442
18443 /* For one operand, the only useful vperm2f128 permutation is 0x01
18444 aka lanes swap. */
18445 if (d->one_operand_p)
18446 return false;
18447 }
18448
18449 return false;
18450 }
18451
18452 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18453 a two vector permutation using 2 intra-lane interleave insns
18454 and cross-lane shuffle for 32-byte vectors. */
18455
18456 static bool
18457 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
18458 {
18459 unsigned i, nelt;
18460 rtx (*gen) (rtx, rtx, rtx);
18461
18462 if (d->one_operand_p)
18463 return false;
18464 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
18465 ;
18466 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
18467 ;
18468 else
18469 return false;
18470
18471 nelt = d->nelt;
18472 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
18473 return false;
18474 for (i = 0; i < nelt; i += 2)
18475 if (d->perm[i] != d->perm[0] + i / 2
18476 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
18477 return false;
18478
18479 if (d->testing_p)
18480 return true;
18481
18482 switch (d->vmode)
18483 {
18484 case E_V32QImode:
18485 if (d->perm[0])
18486 gen = gen_vec_interleave_highv32qi;
18487 else
18488 gen = gen_vec_interleave_lowv32qi;
18489 break;
18490 case E_V16HImode:
18491 if (d->perm[0])
18492 gen = gen_vec_interleave_highv16hi;
18493 else
18494 gen = gen_vec_interleave_lowv16hi;
18495 break;
18496 case E_V8SImode:
18497 if (d->perm[0])
18498 gen = gen_vec_interleave_highv8si;
18499 else
18500 gen = gen_vec_interleave_lowv8si;
18501 break;
18502 case E_V4DImode:
18503 if (d->perm[0])
18504 gen = gen_vec_interleave_highv4di;
18505 else
18506 gen = gen_vec_interleave_lowv4di;
18507 break;
18508 case E_V8SFmode:
18509 if (d->perm[0])
18510 gen = gen_vec_interleave_highv8sf;
18511 else
18512 gen = gen_vec_interleave_lowv8sf;
18513 break;
18514 case E_V4DFmode:
18515 if (d->perm[0])
18516 gen = gen_vec_interleave_highv4df;
18517 else
18518 gen = gen_vec_interleave_lowv4df;
18519 break;
18520 default:
18521 gcc_unreachable ();
18522 }
18523
18524 emit_insn (gen (d->target, d->op0, d->op1));
18525 return true;
18526 }
18527
18528 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18529 a single vector permutation using a single intra-lane vector
18530 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18531 the non-swapped and swapped vectors together. */
18532
18533 static bool
18534 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18535 {
18536 struct expand_vec_perm_d dfirst, dsecond;
18537 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18538 rtx_insn *seq;
18539 bool ok;
18540 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18541
18542 if (!TARGET_AVX
18543 || TARGET_AVX2
18544 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18545 || !d->one_operand_p)
18546 return false;
18547
18548 dfirst = *d;
18549 for (i = 0; i < nelt; i++)
18550 dfirst.perm[i] = 0xff;
18551 for (i = 0, msk = 0; i < nelt; i++)
18552 {
18553 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18554 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18555 return false;
18556 dfirst.perm[j] = d->perm[i];
18557 if (j != i)
18558 msk |= (1 << i);
18559 }
18560 for (i = 0; i < nelt; i++)
18561 if (dfirst.perm[i] == 0xff)
18562 dfirst.perm[i] = i;
18563
18564 if (!d->testing_p)
18565 dfirst.target = gen_reg_rtx (dfirst.vmode);
18566
18567 start_sequence ();
18568 ok = expand_vec_perm_1 (&dfirst);
18569 seq = get_insns ();
18570 end_sequence ();
18571
18572 if (!ok)
18573 return false;
18574
18575 if (d->testing_p)
18576 return true;
18577
18578 emit_insn (seq);
18579
18580 dsecond = *d;
18581 dsecond.op0 = dfirst.target;
18582 dsecond.op1 = dfirst.target;
18583 dsecond.one_operand_p = true;
18584 dsecond.target = gen_reg_rtx (dsecond.vmode);
18585 for (i = 0; i < nelt; i++)
18586 dsecond.perm[i] = i ^ nelt2;
18587
18588 ok = expand_vec_perm_1 (&dsecond);
18589 gcc_assert (ok);
18590
18591 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18592 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18593 return true;
18594 }
18595
18596 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18597 permutation using two vperm2f128, followed by a vshufpd insn blending
18598 the two vectors together. */
18599
18600 static bool
18601 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18602 {
18603 struct expand_vec_perm_d dfirst, dsecond, dthird;
18604 bool ok;
18605
18606 if (!TARGET_AVX || (d->vmode != V4DFmode))
18607 return false;
18608
18609 if (d->testing_p)
18610 return true;
18611
18612 dfirst = *d;
18613 dsecond = *d;
18614 dthird = *d;
18615
18616 dfirst.perm[0] = (d->perm[0] & ~1);
18617 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18618 dfirst.perm[2] = (d->perm[2] & ~1);
18619 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18620 dsecond.perm[0] = (d->perm[1] & ~1);
18621 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18622 dsecond.perm[2] = (d->perm[3] & ~1);
18623 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18624 dthird.perm[0] = (d->perm[0] % 2);
18625 dthird.perm[1] = (d->perm[1] % 2) + 4;
18626 dthird.perm[2] = (d->perm[2] % 2) + 2;
18627 dthird.perm[3] = (d->perm[3] % 2) + 6;
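/* E.g. for d->perm { 2, 4, 1, 7 }: dfirst is { 2, 3, 0, 1 }, dsecond is
   { 4, 5, 6, 7 }, and dthird, operating on the two intermediate results, is
   { 0, 4, 3, 7 }, which reproduces the requested { 2, 4, 1, 7 }.  */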
18628
18629 dfirst.target = gen_reg_rtx (dfirst.vmode);
18630 dsecond.target = gen_reg_rtx (dsecond.vmode);
18631 dthird.op0 = dfirst.target;
18632 dthird.op1 = dsecond.target;
18633 dthird.one_operand_p = false;
18634
18635 canonicalize_perm (&dfirst);
18636 canonicalize_perm (&dsecond);
18637
18638 ok = expand_vec_perm_1 (&dfirst)
18639 && expand_vec_perm_1 (&dsecond)
18640 && expand_vec_perm_1 (&dthird);
18641
18642 gcc_assert (ok);
18643
18644 return true;
18645 }
18646
18647 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
18648
18649 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18650 a two vector permutation using two intra-lane vector
18651 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18652 the non-swapped and swapped vectors together. */
18653
18654 static bool
18655 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
18656 {
18657 struct expand_vec_perm_d dfirst, dsecond, dthird;
18658 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
18659 rtx_insn *seq1, *seq2;
18660 bool ok;
18661 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18662
18663 if (!TARGET_AVX
18664 || TARGET_AVX2
18665 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18666 || d->one_operand_p)
18667 return false;
18668
18669 dfirst = *d;
18670 dsecond = *d;
18671 for (i = 0; i < nelt; i++)
18672 {
18673 dfirst.perm[i] = 0xff;
18674 dsecond.perm[i] = 0xff;
18675 }
18676 for (i = 0, msk = 0; i < nelt; i++)
18677 {
18678 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18679 if (j == i)
18680 {
18681 dfirst.perm[j] = d->perm[i];
18682 which1 |= (d->perm[i] < nelt ? 1 : 2);
18683 }
18684 else
18685 {
18686 dsecond.perm[j] = d->perm[i];
18687 which2 |= (d->perm[i] < nelt ? 1 : 2);
18688 msk |= (1U << i);
18689 }
18690 }
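/* MSK now has bit I set for each result element that needs the lane-swapped
   shuffle (dsecond, lane-swapped via dthird below); it becomes the immediate
   of the final vblendps/vblendpd.  */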
18691 if (msk == 0 || msk == (1U << nelt) - 1)
18692 return false;
18693
18694 if (!d->testing_p)
18695 {
18696 dfirst.target = gen_reg_rtx (dfirst.vmode);
18697 dsecond.target = gen_reg_rtx (dsecond.vmode);
18698 }
18699
18700 for (i = 0; i < nelt; i++)
18701 {
18702 if (dfirst.perm[i] == 0xff)
18703 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
18704 if (dsecond.perm[i] == 0xff)
18705 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
18706 }
18707 canonicalize_perm (&dfirst);
18708 start_sequence ();
18709 ok = ix86_expand_vec_perm_const_1 (&dfirst);
18710 seq1 = get_insns ();
18711 end_sequence ();
18712
18713 if (!ok)
18714 return false;
18715
18716 canonicalize_perm (&dsecond);
18717 start_sequence ();
18718 ok = ix86_expand_vec_perm_const_1 (&dsecond);
18719 seq2 = get_insns ();
18720 end_sequence ();
18721
18722 if (!ok)
18723 return false;
18724
18725 if (d->testing_p)
18726 return true;
18727
18728 emit_insn (seq1);
18729 emit_insn (seq2);
18730
18731 dthird = *d;
18732 dthird.op0 = dsecond.target;
18733 dthird.op1 = dsecond.target;
18734 dthird.one_operand_p = true;
18735 dthird.target = gen_reg_rtx (dthird.vmode);
18736 for (i = 0; i < nelt; i++)
18737 dthird.perm[i] = i ^ nelt2;
18738
18739 ok = expand_vec_perm_1 (&dthird);
18740 gcc_assert (ok);
18741
18742 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18743 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
18744 return true;
18745 }
18746
18747 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18748 permutation with two pshufb insns and an ior. We should have already
18749 failed all two instruction sequences. */
18750
18751 static bool
18752 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18753 {
18754 rtx rperm[2][16], vperm, l, h, op, m128;
18755 unsigned int i, nelt, eltsz;
18756
18757 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18758 return false;
18759 gcc_assert (!d->one_operand_p);
18760
18761 if (d->testing_p)
18762 return true;
18763
18764 nelt = d->nelt;
18765 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18766
18767 /* Generate two permutation masks. If the required element is within
18768 the given vector it is shuffled into the proper lane. If the required
18769 element is in the other vector, force a zero into the lane by setting
18770 bit 7 in the permutation mask. */
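/* E.g. if d->perm[1] == 9 for a V8HImode permutation (halfword 1 of op1),
   bytes 2-3 of the op0 mask are set to -128 while bytes 2-3 of the op1 mask
   select bytes 2-3 of op1; the final ior merges the two partial results.  */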
18771 m128 = GEN_INT (-128);
18772 for (i = 0; i < nelt; ++i)
18773 {
18774 unsigned j, e = d->perm[i];
18775 unsigned which = (e >= nelt);
18776 if (e >= nelt)
18777 e -= nelt;
18778
18779 for (j = 0; j < eltsz; ++j)
18780 {
18781 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18782 rperm[1-which][i*eltsz + j] = m128;
18783 }
18784 }
18785
18786 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18787 vperm = force_reg (V16QImode, vperm);
18788
18789 l = gen_reg_rtx (V16QImode);
18790 op = gen_lowpart (V16QImode, d->op0);
18791 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18792
18793 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18794 vperm = force_reg (V16QImode, vperm);
18795
18796 h = gen_reg_rtx (V16QImode);
18797 op = gen_lowpart (V16QImode, d->op1);
18798 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18799
18800 op = d->target;
18801 if (d->vmode != V16QImode)
18802 op = gen_reg_rtx (V16QImode);
18803 emit_insn (gen_iorv16qi3 (op, l, h));
18804 if (op != d->target)
18805 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18806
18807 return true;
18808 }
18809
18810 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
18811 with two vpshufb insns, vpermq and vpor. We should have already failed
18812 all two or three instruction sequences. */
18813
18814 static bool
18815 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18816 {
18817 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18818 unsigned int i, nelt, eltsz;
18819
18820 if (!TARGET_AVX2
18821 || !d->one_operand_p
18822 || (d->vmode != V32QImode && d->vmode != V16HImode))
18823 return false;
18824
18825 if (d->testing_p)
18826 return true;
18827
18828 nelt = d->nelt;
18829 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18830
18831 /* Generate two permutation masks. If the required element is within
18832 the same lane, it is shuffled in. If the required element is from
18833 the other lane, force a zero by setting bit 7 in the permutation mask.
18834 The other mask has non-negative entries for elements that are
18835 requested from the other lane, but those entries are also moved to
18836 the other lane, so that the two V2TImode halves of the vpshufb
18837 result can afterwards be swapped. */
18838 m128 = GEN_INT (-128);
18839 for (i = 0; i < nelt; ++i)
18840 {
18841 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18842 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18843
18844 for (j = 0; j < eltsz; ++j)
18845 {
18846 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18847 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18848 }
18849 }
18850
18851 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18852 vperm = force_reg (V32QImode, vperm);
18853
18854 h = gen_reg_rtx (V32QImode);
18855 op = gen_lowpart (V32QImode, d->op0);
18856 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18857
18858 /* Swap the 128-bit lanes of h into hp. */
18859 hp = gen_reg_rtx (V4DImode);
18860 op = gen_lowpart (V4DImode, h);
18861 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18862 const1_rtx));
18863
18864 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18865 vperm = force_reg (V32QImode, vperm);
18866
18867 l = gen_reg_rtx (V32QImode);
18868 op = gen_lowpart (V32QImode, d->op0);
18869 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18870
18871 op = d->target;
18872 if (d->vmode != V32QImode)
18873 op = gen_reg_rtx (V32QImode);
18874 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18875 if (op != d->target)
18876 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18877
18878 return true;
18879 }
18880
18881 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18882 and extract-odd permutations of two V32QImode or V16HImode operands
18883 with two vpshufb insns, vpor and vpermq. We should have already
18884 failed all two or three instruction sequences. */
18885
18886 static bool
18887 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18888 {
18889 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18890 unsigned int i, nelt, eltsz;
18891
18892 if (!TARGET_AVX2
18893 || d->one_operand_p
18894 || (d->vmode != V32QImode && d->vmode != V16HImode))
18895 return false;
18896
18897 for (i = 0; i < d->nelt; ++i)
18898 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18899 return false;
18900
18901 if (d->testing_p)
18902 return true;
18903
18904 nelt = d->nelt;
18905 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18906
18907 /* Generate two permutation masks. In the first permutation mask
18908 the first quarter will contain indexes for the first half
18909 of the op0, the second quarter will contain bit 7 set, third quarter
18910 will contain indexes for the second half of the op0 and the
18911 last quarter bit 7 set. In the second permutation mask
18912 the first quarter will contain bit 7 set, the second quarter
18913 indexes for the first half of the op1, the third quarter bit 7 set
18914 and last quarter indexes for the second half of the op1.
18915 I.e. the first mask e.g. for V32QImode extract even will be:
18916 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18917 (all values masked with 0xf except for -128) and second mask
18918 for extract even will be
18919 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18920 m128 = GEN_INT (-128);
18921 for (i = 0; i < nelt; ++i)
18922 {
18923 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18924 unsigned which = d->perm[i] >= nelt;
18925 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18926
18927 for (j = 0; j < eltsz; ++j)
18928 {
18929 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18930 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18931 }
18932 }
18933
18934 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18935 vperm = force_reg (V32QImode, vperm);
18936
18937 l = gen_reg_rtx (V32QImode);
18938 op = gen_lowpart (V32QImode, d->op0);
18939 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18940
18941 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18942 vperm = force_reg (V32QImode, vperm);
18943
18944 h = gen_reg_rtx (V32QImode);
18945 op = gen_lowpart (V32QImode, d->op1);
18946 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18947
18948 ior = gen_reg_rtx (V32QImode);
18949 emit_insn (gen_iorv32qi3 (ior, l, h));
18950
18951 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18952 op = gen_reg_rtx (V4DImode);
18953 ior = gen_lowpart (V4DImode, ior);
18954 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18955 const1_rtx, GEN_INT (3)));
18956 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18957
18958 return true;
18959 }
18960
18961 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18962 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18963 with two "and" and "pack" or two "shift" and "pack" insns. We should
18964 have already failed all two instruction sequences. */
18965
18966 static bool
18967 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18968 {
18969 rtx op, dop0, dop1, t;
18970 unsigned i, odd, c, s, nelt = d->nelt;
18971 bool end_perm = false;
18972 machine_mode half_mode;
18973 rtx (*gen_and) (rtx, rtx, rtx);
18974 rtx (*gen_pack) (rtx, rtx, rtx);
18975 rtx (*gen_shift) (rtx, rtx, rtx);
18976
18977 if (d->one_operand_p)
18978 return false;
18979
18980 switch (d->vmode)
18981 {
18982 case E_V8HImode:
18983 /* Required for "pack". */
18984 if (!TARGET_SSE4_1)
18985 return false;
18986 c = 0xffff;
18987 s = 16;
18988 half_mode = V4SImode;
18989 gen_and = gen_andv4si3;
18990 gen_pack = gen_sse4_1_packusdw;
18991 gen_shift = gen_lshrv4si3;
18992 break;
18993 case E_V16QImode:
18994 /* No check as all instructions are SSE2. */
18995 c = 0xff;
18996 s = 8;
18997 half_mode = V8HImode;
18998 gen_and = gen_andv8hi3;
18999 gen_pack = gen_sse2_packuswb;
19000 gen_shift = gen_lshrv8hi3;
19001 break;
19002 case E_V16HImode:
19003 if (!TARGET_AVX2)
19004 return false;
19005 c = 0xffff;
19006 s = 16;
19007 half_mode = V8SImode;
19008 gen_and = gen_andv8si3;
19009 gen_pack = gen_avx2_packusdw;
19010 gen_shift = gen_lshrv8si3;
19011 end_perm = true;
19012 break;
19013 case E_V32QImode:
19014 if (!TARGET_AVX2)
19015 return false;
19016 c = 0xff;
19017 s = 8;
19018 half_mode = V16HImode;
19019 gen_and = gen_andv16hi3;
19020 gen_pack = gen_avx2_packuswb;
19021 gen_shift = gen_lshrv16hi3;
19022 end_perm = true;
19023 break;
19024 default:
19025 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
19026 general shuffles. */
19027 return false;
19028 }
19029
19030 /* Check that permutation is even or odd. */
19031 odd = d->perm[0];
19032 if (odd > 1)
19033 return false;
19034
19035 for (i = 1; i < nelt; ++i)
19036 if (d->perm[i] != 2 * i + odd)
19037 return false;
19038
19039 if (d->testing_p)
19040 return true;
19041
19042 dop0 = gen_reg_rtx (half_mode);
19043 dop1 = gen_reg_rtx (half_mode);
19044 if (odd == 0)
19045 {
19046 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
19047 t = force_reg (half_mode, t);
19048 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
19049 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
19050 }
19051 else
19052 {
19053 emit_insn (gen_shift (dop0,
19054 gen_lowpart (half_mode, d->op0),
19055 GEN_INT (s)));
19056 emit_insn (gen_shift (dop1,
19057 gen_lowpart (half_mode, d->op1),
19058 GEN_INT (s)));
19059 }
19060 /* In the AVX2 256-bit case we need to permute the pack result. */
19061 if (TARGET_AVX2 && end_perm)
19062 {
19063 op = gen_reg_rtx (d->vmode);
19064 t = gen_reg_rtx (V4DImode);
19065 emit_insn (gen_pack (op, dop0, dop1));
19066 emit_insn (gen_avx2_permv4di_1 (t,
19067 gen_lowpart (V4DImode, op),
19068 const0_rtx,
19069 const2_rtx,
19070 const1_rtx,
19071 GEN_INT (3)));
19072 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
19073 }
19074 else
19075 emit_insn (gen_pack (d->target, dop0, dop1));
19076
19077 return true;
19078 }
19079
19080 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
19081 and extract-odd permutations of two V64QI operands
19082 with two "shift", two "trunc" and one "concat" insns for "odd"
19083 and two "trunc" and one "concat" insn for "even".
19084 We should have already failed all two instruction sequences. */
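/* A rough sketch of the resulting code for the "odd" case (register
   allocation illustrative):
     vpsrlw       $8, %zmm0, %zmm2
     vpsrlw       $8, %zmm1, %zmm3
     vpmovwb      %zmm2, %ymm0
     vpmovwb      %zmm3, %ymm1
     vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   The "even" case simply drops the two shifts. */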
19085
19086 static bool
19087 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
19088 {
19089 rtx t1, t2, t3, t4;
19090 unsigned i, odd, nelt = d->nelt;
19091
19092 if (!TARGET_AVX512BW
19093 || d->one_operand_p
19094 || d->vmode != V64QImode)
19095 return false;
19096
19097 /* Check that permutation is even or odd. */
19098 odd = d->perm[0];
19099 if (odd > 1)
19100 return false;
19101
19102 for (i = 1; i < nelt; ++i)
19103 if (d->perm[i] != 2 * i + odd)
19104 return false;
19105
19106 if (d->testing_p)
19107 return true;
19108
19109
19110 if (odd)
19111 {
19112 t1 = gen_reg_rtx (V32HImode);
19113 t2 = gen_reg_rtx (V32HImode);
19114 emit_insn (gen_lshrv32hi3 (t1,
19115 gen_lowpart (V32HImode, d->op0),
19116 GEN_INT (8)));
19117 emit_insn (gen_lshrv32hi3 (t2,
19118 gen_lowpart (V32HImode, d->op1),
19119 GEN_INT (8)));
19120 }
19121 else
19122 {
19123 t1 = gen_lowpart (V32HImode, d->op0);
19124 t2 = gen_lowpart (V32HImode, d->op1);
19125 }
19126
19127 t3 = gen_reg_rtx (V32QImode);
19128 t4 = gen_reg_rtx (V32QImode);
19129 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
19130 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
19131 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
19132
19133 return true;
19134 }
19135
19136 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
19137 and extract-odd permutations. */
19138
19139 static bool
19140 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
19141 {
19142 rtx t1, t2, t3, t4, t5;
19143
19144 switch (d->vmode)
19145 {
19146 case E_V4DFmode:
19147 if (d->testing_p)
19148 break;
19149 t1 = gen_reg_rtx (V4DFmode);
19150 t2 = gen_reg_rtx (V4DFmode);
19151
19152 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19153 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
19154 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
19155
19156 /* Now an unpck[lh]pd will produce the result required. */
19157 if (odd)
19158 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
19159 else
19160 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
19161 emit_insn (t3);
19162 break;
19163
19164 case E_V8SFmode:
19165 {
19166 int mask = odd ? 0xdd : 0x88;
19167
19168 if (d->testing_p)
19169 break;
19170 t1 = gen_reg_rtx (V8SFmode);
19171 t2 = gen_reg_rtx (V8SFmode);
19172 t3 = gen_reg_rtx (V8SFmode);
19173
19174 /* Shuffle within the 128-bit lanes to produce:
19175 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
19176 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
19177 GEN_INT (mask)));
19178
19179 /* Shuffle the lanes around to produce:
19180 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
19181 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
19182 GEN_INT (0x3)));
19183
19184 /* Shuffle within the 128-bit lanes to produce:
19185 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
19186 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
19187
19188 /* Shuffle within the 128-bit lanes to produce:
19189 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
19190 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
19191
19192 /* Shuffle the lanes around to produce:
19193 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
19194 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
19195 GEN_INT (0x20)));
19196 }
19197 break;
19198
19199 case E_V2DFmode:
19200 case E_V4SFmode:
19201 case E_V2DImode:
19202 case E_V2SImode:
19203 case E_V4SImode:
19204 /* These are always directly implementable by expand_vec_perm_1. */
19205 gcc_unreachable ();
19206
19207 case E_V2SFmode:
19208 gcc_assert (TARGET_MMX_WITH_SSE);
19209 /* We have no suitable instructions. */
19210 if (d->testing_p)
19211 return false;
19212 break;
19213
19214 case E_V4HImode:
19215 if (d->testing_p)
19216 break;
19217 /* We need 2*log2(N)-1 operations to achieve odd/even
19218 with interleave. */
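/* E.g. for V4HImode operands { a0 a1 a2 a3 } and { b0 b1 b2 b3 }:
   punpckhwd gives { a2 b2 a3 b3 }, punpcklwd gives { a0 b0 a1 b1 }, and
   a final punpcklwd (resp. punpckhwd) of those two yields the even
   result { a0 a2 b0 b2 } (resp. the odd result { a1 a3 b1 b3 }). */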
19219 t1 = gen_reg_rtx (V4HImode);
19220 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
19221 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
19222 if (odd)
19223 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
19224 else
19225 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
19226 emit_insn (t2);
19227 break;
19228
19229 case E_V8HImode:
19230 if (TARGET_SSE4_1)
19231 return expand_vec_perm_even_odd_pack (d);
19232 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
19233 return expand_vec_perm_pshufb2 (d);
19234 else
19235 {
19236 if (d->testing_p)
19237 break;
19238 /* We need 2*log2(N)-1 operations to achieve odd/even
19239 with interleave. */
19240 t1 = gen_reg_rtx (V8HImode);
19241 t2 = gen_reg_rtx (V8HImode);
19242 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
19243 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
19244 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
19245 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
19246 if (odd)
19247 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
19248 else
19249 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
19250 emit_insn (t3);
19251 }
19252 break;
19253
19254 case E_V16QImode:
19255 return expand_vec_perm_even_odd_pack (d);
19256
19257 case E_V16HImode:
19258 case E_V32QImode:
19259 return expand_vec_perm_even_odd_pack (d);
19260
19261 case E_V64QImode:
19262 return expand_vec_perm_even_odd_trunc (d);
19263
19264 case E_V4DImode:
19265 if (!TARGET_AVX2)
19266 {
19267 struct expand_vec_perm_d d_copy = *d;
19268 d_copy.vmode = V4DFmode;
19269 if (d->testing_p)
19270 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
19271 else
19272 d_copy.target = gen_reg_rtx (V4DFmode);
19273 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
19274 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
19275 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
19276 {
19277 if (!d->testing_p)
19278 emit_move_insn (d->target,
19279 gen_lowpart (V4DImode, d_copy.target));
19280 return true;
19281 }
19282 return false;
19283 }
19284
19285 if (d->testing_p)
19286 break;
19287
19288 t1 = gen_reg_rtx (V4DImode);
19289 t2 = gen_reg_rtx (V4DImode);
19290
19291 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19292 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
19293 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
19294
19295 /* Now a vpunpck[lh]qdq will produce the required result. */
19296 if (odd)
19297 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
19298 else
19299 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
19300 emit_insn (t3);
19301 break;
19302
19303 case E_V8SImode:
19304 if (!TARGET_AVX2)
19305 {
19306 struct expand_vec_perm_d d_copy = *d;
19307 d_copy.vmode = V8SFmode;
19308 if (d->testing_p)
19309 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
19310 else
19311 d_copy.target = gen_reg_rtx (V8SFmode);
19312 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
19313 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
19314 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
19315 {
19316 if (!d->testing_p)
19317 emit_move_insn (d->target,
19318 gen_lowpart (V8SImode, d_copy.target));
19319 return true;
19320 }
19321 return false;
19322 }
19323
19324 if (d->testing_p)
19325 break;
19326
19327 t1 = gen_reg_rtx (V8SImode);
19328 t2 = gen_reg_rtx (V8SImode);
19329 t3 = gen_reg_rtx (V4DImode);
19330 t4 = gen_reg_rtx (V4DImode);
19331 t5 = gen_reg_rtx (V4DImode);
19332
19333 /* Shuffle the lanes around into
19334 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
19335 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
19336 gen_lowpart (V4DImode, d->op1),
19337 GEN_INT (0x20)));
19338 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
19339 gen_lowpart (V4DImode, d->op1),
19340 GEN_INT (0x31)));
19341
19342 /* Swap the 2nd and 3rd position in each lane into
19343 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
19344 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
19345 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19346 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
19347 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19348
19349 /* Now a vpunpck[lh]qdq will produce
19350 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
19351 if (odd)
19352 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
19353 gen_lowpart (V4DImode, t2));
19354 else
19355 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
19356 gen_lowpart (V4DImode, t2));
19357 emit_insn (t3);
19358 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
19359 break;
19360
19361 default:
19362 gcc_unreachable ();
19363 }
19364
19365 return true;
19366 }
19367
19368 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
19369 extract-even and extract-odd permutations. */
19370
19371 static bool
19372 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
19373 {
19374 unsigned i, odd, nelt = d->nelt;
19375
19376 odd = d->perm[0];
19377 if (odd != 0 && odd != 1)
19378 return false;
19379
19380 for (i = 1; i < nelt; ++i)
19381 if (d->perm[i] != 2 * i + odd)
19382 return false;
19383
19384 return expand_vec_perm_even_odd_1 (d, odd);
19385 }
19386
19387 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
19388 permutations. We assume that expand_vec_perm_1 has already failed. */
19389
19390 static bool
19391 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
19392 {
19393 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
19394 machine_mode vmode = d->vmode;
19395 unsigned char perm2[4];
19396 rtx op0 = d->op0, dest;
19397 bool ok;
19398
19399 switch (vmode)
19400 {
19401 case E_V4DFmode:
19402 case E_V8SFmode:
19403 /* These are special-cased in sse.md so that we can optionally
19404 use the vbroadcast instruction. They expand to two insns
19405 if the input happens to be in a register. */
19406 gcc_unreachable ();
19407
19408 case E_V2DFmode:
19409 case E_V2SFmode:
19410 case E_V4SFmode:
19411 case E_V2DImode:
19412 case E_V2SImode:
19413 case E_V4SImode:
19414 /* These are always implementable using standard shuffle patterns. */
19415 gcc_unreachable ();
19416
19417 case E_V8HImode:
19418 case E_V16QImode:
19419 /* These can be implemented via interleave. We save one insn by
19420 stopping once we have promoted to V4SImode and then use pshufd. */
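/* For instance, broadcasting element 5 of a V16QImode vector: a low byte
   interleave of the operand with itself doubles that byte into word 5,
   a high word interleave then moves the doubled word into dword 1 of
   the V4SImode view, and the final pshufd with the reduced index 1
   replicates it across the whole vector. */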
19421 if (d->testing_p)
19422 return true;
19423 do
19424 {
19425 rtx dest;
19426 rtx (*gen) (rtx, rtx, rtx)
19427 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
19428 : gen_vec_interleave_lowv8hi;
19429
19430 if (elt >= nelt2)
19431 {
19432 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
19433 : gen_vec_interleave_highv8hi;
19434 elt -= nelt2;
19435 }
19436 nelt2 /= 2;
19437
19438 dest = gen_reg_rtx (vmode);
19439 emit_insn (gen (dest, op0, op0));
19440 vmode = get_mode_wider_vector (vmode);
19441 op0 = gen_lowpart (vmode, dest);
19442 }
19443 while (vmode != V4SImode);
19444
19445 memset (perm2, elt, 4);
19446 dest = gen_reg_rtx (V4SImode);
19447 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
19448 gcc_assert (ok);
19449 if (!d->testing_p)
19450 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
19451 return true;
19452
19453 case E_V64QImode:
19454 case E_V32QImode:
19455 case E_V16HImode:
19456 case E_V8SImode:
19457 case E_V4DImode:
19458 /* For AVX2 broadcasts of the first element vpbroadcast* or
19459 vpermq should be used by expand_vec_perm_1. */
19460 gcc_assert (!TARGET_AVX2 || d->perm[0]);
19461 return false;
19462
19463 default:
19464 gcc_unreachable ();
19465 }
19466 }
19467
19468 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
19469 broadcast permutations. */
19470
19471 static bool
19472 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
19473 {
19474 unsigned i, elt, nelt = d->nelt;
19475
19476 if (!d->one_operand_p)
19477 return false;
19478
19479 elt = d->perm[0];
19480 for (i = 1; i < nelt; ++i)
19481 if (d->perm[i] != elt)
19482 return false;
19483
19484 return expand_vec_perm_broadcast_1 (d);
19485 }
19486
19487 /* Implement arbitrary permutations of two V64QImode operands
19488 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
19489 static bool
19490 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
19491 {
19492 if (!TARGET_AVX512BW || d->vmode != V64QImode)
19493 return false;
19494
19495 if (d->testing_p)
19496 return true;
19497
19498 struct expand_vec_perm_d ds[2];
19499 rtx rperm[128], vperm, target0, target1;
19500 unsigned int i, nelt;
19501 machine_mode vmode;
19502
19503 nelt = d->nelt;
19504 vmode = V64QImode;
19505
19506 for (i = 0; i < 2; i++)
19507 {
19508 ds[i] = *d;
19509 ds[i].vmode = V32HImode;
19510 ds[i].nelt = 32;
19511 ds[i].target = gen_reg_rtx (V32HImode);
19512 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
19513 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
19514 }
19515
19516 /* Prepare permutations such that the first one takes care of
19517 putting the even bytes into the right positions or one position
19518 higher (ds[0]) and the second one takes care of
19519 putting the odd bytes into the right positions or one position
19520 lower (ds[1]). */
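/* For example (made-up index): if d->perm[0] == 13, then ds[0].perm[0]
   becomes 6, i.e. the V32HImode permutation moves the word holding
   bytes 12-13 into word slot 0, and rperm[0] becomes (0 & 14) + (13 & 1)
   == 1, so the following vpshufb picks the high byte of that word -- the
   original byte 13 -- into result byte 0. The mask for the other half
   holds -1 (0xff, bit 7 set) there, so its vpshufb writes zero at that
   position and the final vpor keeps the byte from the even half. */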
19521
19522 for (i = 0; i < nelt; i++)
19523 {
19524 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
19525 if (i & 1)
19526 {
19527 rperm[i] = constm1_rtx;
19528 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19529 }
19530 else
19531 {
19532 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
19533 rperm[i + 64] = constm1_rtx;
19534 }
19535 }
19536
19537 bool ok = expand_vec_perm_1 (&ds[0]);
19538 gcc_assert (ok);
19539 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
19540
19541 ok = expand_vec_perm_1 (&ds[1]);
19542 gcc_assert (ok);
19543 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
19544
19545 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
19546 vperm = force_reg (vmode, vperm);
19547 target0 = gen_reg_rtx (V64QImode);
19548 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
19549
19550 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
19551 vperm = force_reg (vmode, vperm);
19552 target1 = gen_reg_rtx (V64QImode);
19553 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
19554
19555 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
19556 return true;
19557 }
19558
19559 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
19560 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
19561 all the shorter instruction sequences. */
19562
19563 static bool
19564 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
19565 {
19566 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
19567 unsigned int i, nelt, eltsz;
19568 bool used[4];
19569
19570 if (!TARGET_AVX2
19571 || d->one_operand_p
19572 || (d->vmode != V32QImode && d->vmode != V16HImode))
19573 return false;
19574
19575 if (d->testing_p)
19576 return true;
19577
19578 nelt = d->nelt;
19579 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19580
19581 /* Generate 4 permutation masks. If the required element is within
19582 the same lane, it is shuffled in. If the required element is from the
19583 other lane, force a zero by setting bit 7 in the permutation mask.
19584 The other mask has non-negative elements wherever an element is
19585 requested from the other lane, but stores them at the mirrored position
19586 in the other lane, so that swapping the two V2TImode halves of the
19587 vpshufb result puts each such element into its proper place. */
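/* The per-index bookkeeping matches expand_vec_perm_vpshufb2_vpermq
   above, except that in-lane and cross-lane bytes of each operand get
   their own mask: rperm[0]/rperm[1] for op0 and rperm[2]/rperm[3] for
   op1. Masks that end up unused do not emit a vpshufb at all. */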
19588 m128 = GEN_INT (-128);
19589 for (i = 0; i < 32; ++i)
19590 {
19591 rperm[0][i] = m128;
19592 rperm[1][i] = m128;
19593 rperm[2][i] = m128;
19594 rperm[3][i] = m128;
19595 }
19596 used[0] = false;
19597 used[1] = false;
19598 used[2] = false;
19599 used[3] = false;
19600 for (i = 0; i < nelt; ++i)
19601 {
19602 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
19603 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
19604 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
19605
19606 for (j = 0; j < eltsz; ++j)
19607 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
19608 used[which] = true;
19609 }
19610
19611 for (i = 0; i < 2; ++i)
19612 {
19613 if (!used[2 * i + 1])
19614 {
19615 h[i] = NULL_RTX;
19616 continue;
19617 }
19618 vperm = gen_rtx_CONST_VECTOR (V32QImode,
19619 gen_rtvec_v (32, rperm[2 * i + 1]));
19620 vperm = force_reg (V32QImode, vperm);
19621 h[i] = gen_reg_rtx (V32QImode);
19622 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19623 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
19624 }
19625
19626 /* Swap the 128-bit lanes of h[X]. */
19627 for (i = 0; i < 2; ++i)
19628 {
19629 if (h[i] == NULL_RTX)
19630 continue;
19631 op = gen_reg_rtx (V4DImode);
19632 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
19633 const2_rtx, GEN_INT (3), const0_rtx,
19634 const1_rtx));
19635 h[i] = gen_lowpart (V32QImode, op);
19636 }
19637
19638 for (i = 0; i < 2; ++i)
19639 {
19640 if (!used[2 * i])
19641 {
19642 l[i] = NULL_RTX;
19643 continue;
19644 }
19645 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19646 vperm = force_reg (V32QImode, vperm);
19647 l[i] = gen_reg_rtx (V32QImode);
19648 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19649 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19650 }
19651
19652 for (i = 0; i < 2; ++i)
19653 {
19654 if (h[i] && l[i])
19655 {
19656 op = gen_reg_rtx (V32QImode);
19657 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19658 l[i] = op;
19659 }
19660 else if (h[i])
19661 l[i] = h[i];
19662 }
19663
19664 gcc_assert (l[0] && l[1]);
19665 op = d->target;
19666 if (d->vmode != V32QImode)
19667 op = gen_reg_rtx (V32QImode);
19668 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19669 if (op != d->target)
19670 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19671 return true;
19672 }
19673
19674 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19675 taken care of, perform the expansion in D and return true on success. */
19676
19677 static bool
19678 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19679 {
19680 /* Try a single instruction expansion. */
19681 if (expand_vec_perm_1 (d))
19682 return true;
19683
19684 /* Try sequences of two instructions. */
19685
19686 if (expand_vec_perm_pshuflw_pshufhw (d))
19687 return true;
19688
19689 if (expand_vec_perm_palignr (d, false))
19690 return true;
19691
19692 if (expand_vec_perm_interleave2 (d))
19693 return true;
19694
19695 if (expand_vec_perm_broadcast (d))
19696 return true;
19697
19698 if (expand_vec_perm_vpermq_perm_1 (d))
19699 return true;
19700
19701 if (expand_vec_perm_vperm2f128 (d))
19702 return true;
19703
19704 if (expand_vec_perm_pblendv (d))
19705 return true;
19706
19707 /* Try sequences of three instructions. */
19708
19709 if (expand_vec_perm_even_odd_pack (d))
19710 return true;
19711
19712 if (expand_vec_perm_2vperm2f128_vshuf (d))
19713 return true;
19714
19715 if (expand_vec_perm_pshufb2 (d))
19716 return true;
19717
19718 if (expand_vec_perm_interleave3 (d))
19719 return true;
19720
19721 if (expand_vec_perm_vperm2f128_vblend (d))
19722 return true;
19723
19724 /* Try sequences of four instructions. */
19725
19726 if (expand_vec_perm_even_odd_trunc (d))
19727 return true;
19728 if (expand_vec_perm_vpshufb2_vpermq (d))
19729 return true;
19730
19731 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19732 return true;
19733
19734 if (expand_vec_perm_vpermt2_vpshub2 (d))
19735 return true;
19736
19737 /* ??? Look for narrow permutations whose element orderings would
19738 allow the promotion to a wider mode. */
19739
19740 /* ??? Look for sequences of interleave or a wider permute that place
19741 the data into the correct lanes for a half-vector shuffle like
19742 pshuf[lh]w or vpermilps. */
19743
19744 /* ??? Look for sequences of interleave that produce the desired results.
19745 The combinatorics of punpck[lh] get pretty ugly... */
19746
19747 if (expand_vec_perm_even_odd (d))
19748 return true;
19749
19750 /* Even longer sequences. */
19751 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19752 return true;
19753
19754 /* See if we can get the same permutation in a different vector integer
19755 mode. */
19756 struct expand_vec_perm_d nd;
19757 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19758 {
19759 if (!d->testing_p)
19760 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19761 return true;
19762 }
19763
19764 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
19765 if (expand_vec_perm2_vperm2f128_vblend (d))
19766 return true;
19767
19768 return false;
19769 }
19770
19771 /* If a permutation only uses one operand, make it clear. Returns true
19772 if the permutation references both operands. */
19773
19774 static bool
19775 canonicalize_perm (struct expand_vec_perm_d *d)
19776 {
19777 int i, which, nelt = d->nelt;
19778
19779 for (i = which = 0; i < nelt; ++i)
19780 which |= (d->perm[i] < nelt ? 1 : 2);
19781
19782 d->one_operand_p = true;
19783 switch (which)
19784 {
19785 default:
19786 gcc_unreachable ();
19787
19788 case 3:
19789 if (!rtx_equal_p (d->op0, d->op1))
19790 {
19791 d->one_operand_p = false;
19792 break;
19793 }
19794 /* The elements of PERM do not suggest that only the first operand
19795 is used, but both operands are identical. Allow easier matching
19796 of the permutation by folding the permutation into the single
19797 input vector. */
19798 /* FALLTHRU */
19799
19800 case 2:
19801 for (i = 0; i < nelt; ++i)
19802 d->perm[i] &= nelt - 1;
19803 d->op0 = d->op1;
19804 break;
19805
19806 case 1:
19807 d->op1 = d->op0;
19808 break;
19809 }
19810
19811 return (which == 3);
19812 }
19813
19814 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19815
19816 bool
19817 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19818 rtx op1, const vec_perm_indices &sel)
19819 {
19820 struct expand_vec_perm_d d;
19821 unsigned char perm[MAX_VECT_LEN];
19822 unsigned int i, nelt, which;
19823 bool two_args;
19824
19825 d.target = target;
19826 d.op0 = op0;
19827 d.op1 = op1;
19828
19829 d.vmode = vmode;
19830 gcc_assert (VECTOR_MODE_P (d.vmode));
19831 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19832 d.testing_p = !target;
19833
19834 gcc_assert (sel.length () == nelt);
19835 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19836
19837 /* Given sufficient ISA support we can just return true here
19838 for selected vector modes. */
19839 switch (d.vmode)
19840 {
19841 case E_V16SFmode:
19842 case E_V16SImode:
19843 case E_V8DImode:
19844 case E_V8DFmode:
19845 if (!TARGET_AVX512F)
19846 return false;
19847 /* All implementable with a single vperm[it]2 insn. */
19848 if (d.testing_p)
19849 return true;
19850 break;
19851 case E_V32HImode:
19852 if (!TARGET_AVX512BW)
19853 return false;
19854 if (d.testing_p)
19855 /* All implementable with a single vperm[it]2 insn. */
19856 return true;
19857 break;
19858 case E_V64QImode:
19859 if (!TARGET_AVX512BW)
19860 return false;
19861 if (d.testing_p)
19862 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19863 return true;
19864 break;
19865 case E_V8SImode:
19866 case E_V8SFmode:
19867 case E_V4DFmode:
19868 case E_V4DImode:
19869 if (!TARGET_AVX)
19870 return false;
19871 if (d.testing_p && TARGET_AVX512VL)
19872 /* All implementable with a single vperm[it]2 insn. */
19873 return true;
19874 break;
19875 case E_V16HImode:
19876 if (!TARGET_SSE2)
19877 return false;
19878 if (d.testing_p && TARGET_AVX2)
19879 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19880 return true;
19881 break;
19882 case E_V32QImode:
19883 if (!TARGET_SSE2)
19884 return false;
19885 if (d.testing_p && TARGET_AVX2)
19886 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19887 return true;
19888 break;
19889 case E_V8HImode:
19890 case E_V16QImode:
19891 if (!TARGET_SSE2)
19892 return false;
19893 /* Fall through. */
19894 case E_V4SImode:
19895 case E_V4SFmode:
19896 if (!TARGET_SSE)
19897 return false;
19898 /* All implementable with a single vpperm insn. */
19899 if (d.testing_p && TARGET_XOP)
19900 return true;
19901 /* All implementable with 2 pshufb + 1 ior. */
19902 if (d.testing_p && TARGET_SSSE3)
19903 return true;
19904 break;
19905 case E_V2SFmode:
19906 case E_V2SImode:
19907 case E_V4HImode:
19908 if (!TARGET_MMX_WITH_SSE)
19909 return false;
19910 break;
19911 case E_V2DImode:
19912 case E_V2DFmode:
19913 if (!TARGET_SSE)
19914 return false;
19915 /* All implementable with shufpd or unpck[lh]pd. */
19916 if (d.testing_p)
19917 return true;
19918 break;
19919 default:
19920 return false;
19921 }
19922
19923 for (i = which = 0; i < nelt; ++i)
19924 {
19925 unsigned char e = sel[i];
19926 gcc_assert (e < 2 * nelt);
19927 d.perm[i] = e;
19928 perm[i] = e;
19929 which |= (e < nelt ? 1 : 2);
19930 }
19931
19932 if (d.testing_p)
19933 {
19934 /* For all elements from the second vector, fold the elements to the first. */
19935 if (which == 2)
19936 for (i = 0; i < nelt; ++i)
19937 d.perm[i] -= nelt;
19938
19939 /* Check whether the mask can be applied to the vector type. */
19940 d.one_operand_p = (which != 3);
19941
19942 /* Implementable with shufps or pshufd. */
19943 if (d.one_operand_p
19944 && (d.vmode == V4SFmode || d.vmode == V2SFmode
19945 || d.vmode == V4SImode || d.vmode == V2SImode))
19946 return true;
19947
19948 /* Otherwise we have to go through the motions and see if we can
19949 figure out how to generate the requested permutation. */
19950 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19951 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19952 if (!d.one_operand_p)
19953 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19954
19955 start_sequence ();
19956 bool ret = ix86_expand_vec_perm_const_1 (&d);
19957 end_sequence ();
19958
19959 return ret;
19960 }
19961
19962 two_args = canonicalize_perm (&d);
19963
19964 /* If one of the operands is a zero vector, try to match pmovzx. */
19965 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
19966 {
19967 struct expand_vec_perm_d dzero = d;
19968 if (d.op0 == CONST0_RTX (vmode))
19969 {
19970 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
19971 std::swap (dzero.op0, dzero.op1);
19972 for (i = 0; i < nelt; ++i)
19973 dzero.perm[i] ^= nelt;
19974 }
19975 else
19976 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
19977
19978 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
19979 dzero.perm, nelt, dzero.testing_p))
19980 return true;
19981 }
19982
19983 /* Force operands into registers. */
19984 rtx nop0 = force_reg (vmode, d.op0);
19985 if (d.op0 == d.op1)
19986 d.op1 = nop0;
19987 d.op0 = nop0;
19988 d.op1 = force_reg (vmode, d.op1);
19989
19990 if (ix86_expand_vec_perm_const_1 (&d))
19991 return true;
19992
19993 /* If the selector says both arguments are needed, but the operands are the
19994 same, the above tried to expand with one_operand_p and flattened selector.
19995 If that didn't work, retry without one_operand_p; we succeeded with that
19996 during testing. */
19997 if (two_args && d.one_operand_p)
19998 {
19999 d.one_operand_p = false;
20000 memcpy (d.perm, perm, sizeof (perm));
20001 return ix86_expand_vec_perm_const_1 (&d);
20002 }
20003
20004 return false;
20005 }
20006
20007 void
20008 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
20009 {
20010 struct expand_vec_perm_d d;
20011 unsigned i, nelt;
20012
20013 d.target = targ;
20014 d.op0 = op0;
20015 d.op1 = op1;
20016 d.vmode = GET_MODE (targ);
20017 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
20018 d.one_operand_p = false;
20019 d.testing_p = false;
20020
20021 for (i = 0; i < nelt; ++i)
20022 d.perm[i] = i * 2 + odd;
20023
20024 /* We'll either be able to implement the permutation directly... */
20025 if (expand_vec_perm_1 (&d))
20026 return;
20027
20028 /* ... or we use the special-case patterns. */
20029 expand_vec_perm_even_odd_1 (&d, odd);
20030 }
20031
20032 static void
20033 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
20034 {
20035 struct expand_vec_perm_d d;
20036 unsigned i, nelt, base;
20037 bool ok;
20038
20039 d.target = targ;
20040 d.op0 = op0;
20041 d.op1 = op1;
20042 d.vmode = GET_MODE (targ);
20043 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
20044 d.one_operand_p = false;
20045 d.testing_p = false;
20046
20047 base = high_p ? nelt / 2 : 0;
20048 for (i = 0; i < nelt / 2; ++i)
20049 {
20050 d.perm[i * 2] = i + base;
20051 d.perm[i * 2 + 1] = i + base + nelt;
20052 }
20053
20054 /* Note that for AVX this isn't one instruction. */
20055 ok = ix86_expand_vec_perm_const_1 (&d);
20056 gcc_assert (ok);
20057 }
20058
20059 /* Optimize vector MUL generation for V8QI, V16QI and V32QI
20060 under TARGET_AVX512BW. E.g. for v16qi a * b, it generates
20061
20062 vpmovzxbw ymm2, xmm0
20063 vpmovzxbw ymm3, xmm1
20064 vpmullw ymm4, ymm2, ymm3
20065 vpmovwb xmm0, ymm4
20066
20067 which takes fewer instructions than ix86_expand_vecop_qihi.
20068 Return true on success. */
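/* The v32qi case uses the same shape one register width up,
   e.g. (register choice illustrative)

   vpmovzxbw zmm2, ymm0
   vpmovzxbw zmm3, ymm1
   vpmullw zmm4, zmm2, zmm3
   vpmovwb ymm0, zmm4  */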
20069
20070 bool
20071 ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
20072 {
20073 machine_mode himode, qimode = GET_MODE (dest);
20074 rtx hop1, hop2, hdest;
20075 rtx (*gen_extend)(rtx, rtx);
20076 rtx (*gen_truncate)(rtx, rtx);
20077
20078 /* There's no V64HImode multiplication instruction. */
20079 if (qimode == E_V64QImode)
20080 return false;
20081
20082 /* vpmovwb is only available under AVX512BW. */
20083 if (!TARGET_AVX512BW)
20084 return false;
20085 if ((qimode == V8QImode || qimode == V16QImode)
20086 && !TARGET_AVX512VL)
20087 return false;
20088 /* Don't generate zmm instructions when 128/256-bit vector width is preferred. */
20089 if (qimode == V32QImode
20090 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
20091 return false;
20092
20093 switch (qimode)
20094 {
20095 case E_V8QImode:
20096 himode = V8HImode;
20097 gen_extend = gen_zero_extendv8qiv8hi2;
20098 gen_truncate = gen_truncv8hiv8qi2;
20099 break;
20100 case E_V16QImode:
20101 himode = V16HImode;
20102 gen_extend = gen_zero_extendv16qiv16hi2;
20103 gen_truncate = gen_truncv16hiv16qi2;
20104 break;
20105 case E_V32QImode:
20106 himode = V32HImode;
20107 gen_extend = gen_zero_extendv32qiv32hi2;
20108 gen_truncate = gen_truncv32hiv32qi2;
20109 break;
20110 default:
20111 gcc_unreachable ();
20112 }
20113
20114 hop1 = gen_reg_rtx (himode);
20115 hop2 = gen_reg_rtx (himode);
20116 hdest = gen_reg_rtx (himode);
20117 emit_insn (gen_extend (hop1, op1));
20118 emit_insn (gen_extend (hop2, op2));
20119 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
20120 hop1, hop2)));
20121 emit_insn (gen_truncate (dest, hdest));
20122 return true;
20123 }
20124
20125 /* Expand a vector shift by a constant for a V*QImode operand in terms of the
20126 same operation on V*HImode. Return true on success. */
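/* E.g. (illustrative) a logical right shift of a v16qi by 2 becomes a
   vpsrlw by 2 on the V8HImode view followed by a vpand with a vector of
   0x3f, which clears the two bits leaked into each low byte from the
   neighbouring high byte; the arithmetic variant additionally xors and
   subtracts a vector of 0x20 afterwards to sign-extend each byte. */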
20127 bool
20128 ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
20129 {
20130 machine_mode qimode, himode;
20131 HOST_WIDE_INT and_constant, xor_constant;
20132 HOST_WIDE_INT shift_amount;
20133 rtx vec_const_and, vec_const_xor;
20134 rtx tmp, op1_subreg;
20135 rtx (*gen_shift) (rtx, rtx, rtx);
20136 rtx (*gen_and) (rtx, rtx, rtx);
20137 rtx (*gen_xor) (rtx, rtx, rtx);
20138 rtx (*gen_sub) (rtx, rtx, rtx);
20139
20140 /* Only optimize shift by constant. */
20141 if (!CONST_INT_P (op2))
20142 return false;
20143
20144 qimode = GET_MODE (dest);
20145 shift_amount = INTVAL (op2);
20146 /* Do nothing when the shift amount is greater than or equal to 8. */
20147 if (shift_amount > 7)
20148 return false;
20149
20150 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
20151 /* Record sign bit. */
20152 xor_constant = 1 << (8 - shift_amount - 1);
20153
20154 /* Mask to zero the bits shifted in from the neighbouring byte: the low bits for a left shift, the high bits for a right shift. */
20155 and_constant
20156 = (code == ASHIFT ? 256 - (1 << shift_amount)
20157 : (1 << (8 - shift_amount)) - 1);
20158
20159 switch (qimode)
20160 {
20161 case V16QImode:
20162 himode = V8HImode;
20163 gen_shift =
20164 ((code == ASHIFT)
20165 ? gen_ashlv8hi3
20166 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
20167 gen_and = gen_andv16qi3;
20168 gen_xor = gen_xorv16qi3;
20169 gen_sub = gen_subv16qi3;
20170 break;
20171 case V32QImode:
20172 himode = V16HImode;
20173 gen_shift =
20174 ((code == ASHIFT)
20175 ? gen_ashlv16hi3
20176 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
20177 gen_and = gen_andv32qi3;
20178 gen_xor = gen_xorv32qi3;
20179 gen_sub = gen_subv32qi3;
20180 break;
20181 case V64QImode:
20182 himode = V32HImode;
20183 gen_shift =
20184 ((code == ASHIFT)
20185 ? gen_ashlv32hi3
20186 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
20187 gen_and = gen_andv64qi3;
20188 gen_xor = gen_xorv64qi3;
20189 gen_sub = gen_subv64qi3;
20190 break;
20191 default:
20192 gcc_unreachable ();
20193 }
20194
20195 tmp = gen_reg_rtx (himode);
20196 vec_const_and = gen_reg_rtx (qimode);
20197 op1_subreg = lowpart_subreg (himode, op1, qimode);
20198
20199 /* For ASHIFT and LSHIFTRT, perform operation like
20200 vpsllw/vpsrlw $shift_amount, %op1, %dest.
20201 vpand %vec_const_and, %dest. */
20202 emit_insn (gen_shift (tmp, op1_subreg, op2));
20203 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
20204 emit_move_insn (vec_const_and,
20205 ix86_build_const_vector (qimode, true,
20206 gen_int_mode (and_constant, QImode)));
20207 emit_insn (gen_and (dest, dest, vec_const_and));
20208
20209 /* For ASHIFTRT, perform extra operation like
20210 vpxor %vec_const_xor, %dest, %dest
20211 vpsubb %vec_const_xor, %dest, %dest */
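/* (dest ^ C) - C with C == 1 << (7 - shift_amount) sign-extends each
   byte from bit (7 - shift_amount), turning the logical per-byte shift
   produced above into an arithmetic one. */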
20212 if (code == ASHIFTRT)
20213 {
20214 vec_const_xor = gen_reg_rtx (qimode);
20215 emit_move_insn (vec_const_xor,
20216 ix86_build_const_vector (qimode, true,
20217 gen_int_mode (xor_constant, QImode)));
20218 emit_insn (gen_xor (dest, dest, vec_const_xor));
20219 emit_insn (gen_sub (dest, dest, vec_const_xor));
20220 }
20221 return true;
20222 }
20223
20224 /* Expand a vector operation CODE for a V*QImode in terms of the
20225 same operation on V*HImode. */
20226
20227 void
20228 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
20229 {
20230 machine_mode qimode = GET_MODE (dest);
20231 machine_mode himode;
20232 rtx (*gen_il) (rtx, rtx, rtx);
20233 rtx (*gen_ih) (rtx, rtx, rtx);
20234 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
20235 struct expand_vec_perm_d d;
20236 bool ok, full_interleave;
20237 bool uns_p = false;
20238 int i;
20239
20240 switch (qimode)
20241 {
20242 case E_V16QImode:
20243 himode = V8HImode;
20244 gen_il = gen_vec_interleave_lowv16qi;
20245 gen_ih = gen_vec_interleave_highv16qi;
20246 break;
20247 case E_V32QImode:
20248 himode = V16HImode;
20249 gen_il = gen_avx2_interleave_lowv32qi;
20250 gen_ih = gen_avx2_interleave_highv32qi;
20251 break;
20252 case E_V64QImode:
20253 himode = V32HImode;
20254 gen_il = gen_avx512bw_interleave_lowv64qi;
20255 gen_ih = gen_avx512bw_interleave_highv64qi;
20256 break;
20257 default:
20258 gcc_unreachable ();
20259 }
20260
20261 op2_l = op2_h = op2;
20262 switch (code)
20263 {
20264 case MULT:
20265 /* Unpack data such that we've got a source byte in each low byte of
20266 each word. We don't care what goes into the high byte of each word.
20267 Rather than trying to get zero in there, it is most convenient to let
20268 it be a copy of the low byte. */
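/* This works because only the low byte of each 16-bit product is kept
   when the results are merged below, and the low 8 bits of
   (a + 256 * x) * (b + 256 * y) do not depend on x or y. */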
20269 op2_l = gen_reg_rtx (qimode);
20270 op2_h = gen_reg_rtx (qimode);
20271 emit_insn (gen_il (op2_l, op2, op2));
20272 emit_insn (gen_ih (op2_h, op2, op2));
20273
20274 op1_l = gen_reg_rtx (qimode);
20275 op1_h = gen_reg_rtx (qimode);
20276 emit_insn (gen_il (op1_l, op1, op1));
20277 emit_insn (gen_ih (op1_h, op1, op1));
20278 full_interleave = qimode == V16QImode;
20279 break;
20280
20281 case ASHIFT:
20282 case LSHIFTRT:
20283 uns_p = true;
20284 /* FALLTHRU */
20285 case ASHIFTRT:
20286 op1_l = gen_reg_rtx (himode);
20287 op1_h = gen_reg_rtx (himode);
20288 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
20289 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
20290 full_interleave = true;
20291 break;
20292 default:
20293 gcc_unreachable ();
20294 }
20295
20296 /* Perform the operation. */
20297 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
20298 1, OPTAB_DIRECT);
20299 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
20300 1, OPTAB_DIRECT);
20301 gcc_assert (res_l && res_h);
20302
20303 /* Merge the data back into the right place. */
20304 d.target = dest;
20305 d.op0 = gen_lowpart (qimode, res_l);
20306 d.op1 = gen_lowpart (qimode, res_h);
20307 d.vmode = qimode;
20308 d.nelt = GET_MODE_NUNITS (qimode);
20309 d.one_operand_p = false;
20310 d.testing_p = false;
20311
20312 if (full_interleave)
20313 {
20314 /* For SSE2, we used a full interleave, so the desired
20315 results are in the even elements. */
20316 for (i = 0; i < d.nelt; ++i)
20317 d.perm[i] = i * 2;
20318 }
20319 else
20320 {
20321 /* For AVX, the interleave used above was not cross-lane. So the
20322 extraction is of the evens, but with the second and third quarters
20323 swapped. Happily, that is even one insn shorter than plain even
20324 extraction. For AVX512BW we have 4 lanes. We extract evens from within
20325 a lane, always first from the first and then from the second source
20326 operand; the index bits above the low 4 bits remain the same.
20327 Thus, for d.nelt == 32 we want permutation
20328 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
20329 and for d.nelt == 64 we want permutation
20330 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
20331 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
20332 for (i = 0; i < d.nelt; ++i)
20333 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
20334 }
20335
20336 ok = ix86_expand_vec_perm_const_1 (&d);
20337 gcc_assert (ok);
20338
20339 set_unique_reg_note (get_last_insn (), REG_EQUAL,
20340 gen_rtx_fmt_ee (code, qimode, op1, op2));
20341 }
20342
20343 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
20344 if op is CONST_VECTOR with all odd elements equal to their
20345 preceding element. */
20346
20347 static bool
20348 const_vector_equal_evenodd_p (rtx op)
20349 {
20350 machine_mode mode = GET_MODE (op);
20351 int i, nunits = GET_MODE_NUNITS (mode);
20352 if (GET_CODE (op) != CONST_VECTOR
20353 || nunits != CONST_VECTOR_NUNITS (op))
20354 return false;
20355 for (i = 0; i < nunits; i += 2)
20356 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
20357 return false;
20358 return true;
20359 }
20360
20361 void
20362 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
20363 bool uns_p, bool odd_p)
20364 {
20365 machine_mode mode = GET_MODE (op1);
20366 machine_mode wmode = GET_MODE (dest);
20367 rtx x;
20368 rtx orig_op1 = op1, orig_op2 = op2;
20369
20370 if (!nonimmediate_operand (op1, mode))
20371 op1 = force_reg (mode, op1);
20372 if (!nonimmediate_operand (op2, mode))
20373 op2 = force_reg (mode, op2);
20374
20375 /* We only play even/odd games with vectors of SImode. */
20376 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
20377
20378 /* If we're looking for the odd results, shift those members down to
20379 the even slots. For some cpus this is faster than a PSHUFD. */
20380 if (odd_p)
20381 {
20382 /* For XOP use vpmacsdqh, but only for smult, as it is only
20383 signed. */
20384 if (TARGET_XOP && mode == V4SImode && !uns_p)
20385 {
20386 x = force_reg (wmode, CONST0_RTX (wmode));
20387 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
20388 return;
20389 }
20390
20391 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
20392 if (!const_vector_equal_evenodd_p (orig_op1))
20393 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
20394 x, NULL, 1, OPTAB_DIRECT);
20395 if (!const_vector_equal_evenodd_p (orig_op2))
20396 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
20397 x, NULL, 1, OPTAB_DIRECT);
20398 op1 = gen_lowpart (mode, op1);
20399 op2 = gen_lowpart (mode, op2);
20400 }
20401
20402 if (mode == V16SImode)
20403 {
20404 if (uns_p)
20405 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
20406 else
20407 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
20408 }
20409 else if (mode == V8SImode)
20410 {
20411 if (uns_p)
20412 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
20413 else
20414 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
20415 }
20416 else if (uns_p)
20417 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
20418 else if (TARGET_SSE4_1)
20419 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
20420 else
20421 {
20422 rtx s1, s2, t0, t1, t2;
20423
20424 /* The easiest way to implement this without PMULDQ is to go through
20425 the motions as if we are performing a full 64-bit multiply, with
20426 the exception that we need to do less shuffling of the elements. */
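/* Writing the sign-extended operands as A = lo(A) + S_A * 2^32 with
   S_A in { 0, -1 } (and likewise B), we have, modulo 2^64,
   A * B = lo(A) * lo(B) + ((S_A * lo(B) + S_B * lo(A)) << 32),
   which is what the sequence below computes. */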
20427
20428 /* Compute the sign-extension, aka highparts, of the two operands. */
20429 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
20430 op1, pc_rtx, pc_rtx);
20431 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
20432 op2, pc_rtx, pc_rtx);
20433
20434 /* Multiply LO(A) * HI(B), and vice-versa. */
20435 t1 = gen_reg_rtx (wmode);
20436 t2 = gen_reg_rtx (wmode);
20437 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
20438 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
20439
20440 /* Multiply LO(A) * LO(B). */
20441 t0 = gen_reg_rtx (wmode);
20442 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
20443
20444 /* Combine and shift the highparts into place. */
20445 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
20446 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
20447 1, OPTAB_DIRECT);
20448
20449 /* Combine high and low parts. */
20450 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
20451 return;
20452 }
20453 emit_insn (x);
20454 }
20455
20456 void
20457 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
20458 bool uns_p, bool high_p)
20459 {
20460 machine_mode wmode = GET_MODE (dest);
20461 machine_mode mode = GET_MODE (op1);
20462 rtx t1, t2, t3, t4, mask;
20463
20464 switch (mode)
20465 {
20466 case E_V4SImode:
20467 t1 = gen_reg_rtx (mode);
20468 t2 = gen_reg_rtx (mode);
20469 if (TARGET_XOP && !uns_p)
20470 {
20471 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
20472 shuffle the elements once so that all elements are in the right
20473 place for immediate use: { A C B D }. */
20474 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
20475 const1_rtx, GEN_INT (3)));
20476 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
20477 const1_rtx, GEN_INT (3)));
20478 }
20479 else
20480 {
20481 /* Put the elements into place for the multiply. */
20482 ix86_expand_vec_interleave (t1, op1, op1, high_p);
20483 ix86_expand_vec_interleave (t2, op2, op2, high_p);
20484 high_p = false;
20485 }
20486 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
20487 break;
20488
20489 case E_V8SImode:
20490 /* Shuffle the elements between the lanes. After this we
20491 have { A B E F | C D G H } for each operand. */
20492 t1 = gen_reg_rtx (V4DImode);
20493 t2 = gen_reg_rtx (V4DImode);
20494 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
20495 const0_rtx, const2_rtx,
20496 const1_rtx, GEN_INT (3)));
20497 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
20498 const0_rtx, const2_rtx,
20499 const1_rtx, GEN_INT (3)));
20500
20501 /* Shuffle the elements within the lanes. After this we
20502 have { A A B B | C C D D } or { E E F F | G G H H }. */
20503 t3 = gen_reg_rtx (V8SImode);
20504 t4 = gen_reg_rtx (V8SImode);
20505 mask = GEN_INT (high_p
20506 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
20507 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
20508 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
20509 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
20510
20511 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
20512 break;
20513
20514 case E_V8HImode:
20515 case E_V16HImode:
20516 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
20517 uns_p, OPTAB_DIRECT);
20518 t2 = expand_binop (mode,
20519 uns_p ? umul_highpart_optab : smul_highpart_optab,
20520 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
20521 gcc_assert (t1 && t2);
20522
20523 t3 = gen_reg_rtx (mode);
20524 ix86_expand_vec_interleave (t3, t1, t2, high_p);
20525 emit_move_insn (dest, gen_lowpart (wmode, t3));
20526 break;
20527
20528 case E_V16QImode:
20529 case E_V32QImode:
20530 case E_V32HImode:
20531 case E_V16SImode:
20532 case E_V64QImode:
20533 t1 = gen_reg_rtx (wmode);
20534 t2 = gen_reg_rtx (wmode);
20535 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
20536 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
20537
20538 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
20539 break;
20540
20541 default:
20542 gcc_unreachable ();
20543 }
20544 }
20545
20546 void
20547 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
20548 {
20549 rtx res_1, res_2, res_3, res_4;
20550
20551 res_1 = gen_reg_rtx (V4SImode);
20552 res_2 = gen_reg_rtx (V4SImode);
20553 res_3 = gen_reg_rtx (V2DImode);
20554 res_4 = gen_reg_rtx (V2DImode);
20555 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
20556 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
20557
20558 /* Move the results in element 2 down to element 1; we don't care
20559 what goes in elements 2 and 3. Then we can merge the parts
20560 back together with an interleave.
20561
20562 Note that two other sequences were tried:
20563 (1) Use interleaves at the start instead of psrldq, which allows
20564 us to use a single shufps to merge things back at the end.
20565 (2) Use shufps here to combine the two vectors, then pshufd to
20566 put the elements in the correct order.
20567 In both cases the cost of the reformatting stall was too high
20568 and the overall sequence slower. */
20569
20570 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
20571 const0_rtx, const2_rtx,
20572 const0_rtx, const0_rtx));
20573 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
20574 const0_rtx, const2_rtx,
20575 const0_rtx, const0_rtx));
20576 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
20577
20578 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
20579 }
20580
20581 void
20582 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
20583 {
20584 machine_mode mode = GET_MODE (op0);
20585 rtx t1, t2, t3, t4, t5, t6;
20586
20587 if (TARGET_AVX512DQ && mode == V8DImode)
20588 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
20589 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
20590 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
20591 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
20592 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
20593 else if (TARGET_XOP && mode == V2DImode)
20594 {
20595 /* op1: A,B,C,D, op2: E,F,G,H */
20596 op1 = gen_lowpart (V4SImode, op1);
20597 op2 = gen_lowpart (V4SImode, op2);
20598
20599 t1 = gen_reg_rtx (V4SImode);
20600 t2 = gen_reg_rtx (V4SImode);
20601 t3 = gen_reg_rtx (V2DImode);
20602 t4 = gen_reg_rtx (V2DImode);
20603
20604 /* t1: B,A,D,C */
20605 emit_insn (gen_sse2_pshufd_1 (t1, op1,
20606 GEN_INT (1),
20607 GEN_INT (0),
20608 GEN_INT (3),
20609 GEN_INT (2)));
20610
20611 /* t2: (B*E),(A*F),(D*G),(C*H) */
20612 emit_insn (gen_mulv4si3 (t2, t1, op2));
20613
20614 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
20615 emit_insn (gen_xop_phadddq (t3, t2));
20616
20617 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
20618 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
20619
20620 /* Multiply lower parts and add all. */
20621 t5 = gen_reg_rtx (V2DImode);
20622 emit_insn (gen_vec_widen_umult_even_v4si (t5,
20623 gen_lowpart (V4SImode, op1),
20624 gen_lowpart (V4SImode, op2)));
20625 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
20626 }
20627 else
20628 {
20629 machine_mode nmode;
20630 rtx (*umul) (rtx, rtx, rtx);
20631
20632 if (mode == V2DImode)
20633 {
20634 umul = gen_vec_widen_umult_even_v4si;
20635 nmode = V4SImode;
20636 }
20637 else if (mode == V4DImode)
20638 {
20639 umul = gen_vec_widen_umult_even_v8si;
20640 nmode = V8SImode;
20641 }
20642 else if (mode == V8DImode)
20643 {
20644 umul = gen_vec_widen_umult_even_v16si;
20645 nmode = V16SImode;
20646 }
20647 else
20648 gcc_unreachable ();
20649
20650
20651 /* Multiply low parts. */
20652 t1 = gen_reg_rtx (mode);
20653 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
20654
20655 /* Shift input vectors right 32 bits so we can multiply high parts. */
20656 t6 = GEN_INT (32);
20657 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
20658 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
20659
20660 /* Multiply high parts by low parts. */
20661 t4 = gen_reg_rtx (mode);
20662 t5 = gen_reg_rtx (mode);
20663 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
20664 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
20665
20666 /* Combine and shift the highparts back. */
20667 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
20668 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
20669
20670 /* Combine high and low parts. */
20671 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
20672 }
20673
20674 set_unique_reg_note (get_last_insn (), REG_EQUAL,
20675 gen_rtx_MULT (mode, op1, op2));
20676 }
20677
20678 /* Return true if control transfer instruction INSN
20679 should be encoded with the notrack prefix. */
20680
20681 bool
20682 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
20683 {
20684 if (!insn || !((flag_cf_protection & CF_BRANCH)))
20685 return false;
20686
20687 if (CALL_P (insn))
20688 {
20689 rtx call = get_call_rtx_from (insn);
20690 gcc_assert (call != NULL_RTX);
20691 rtx addr = XEXP (call, 0);
20692
20693 /* Do not emit 'notrack' if it's not an indirect call. */
20694 if (MEM_P (addr)
20695 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
20696 return false;
20697 else
20698 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
20699 }
20700
20701 if (JUMP_P (insn) && !flag_cet_switch)
20702 {
20703 rtx target = JUMP_LABEL (insn);
20704 if (target == NULL_RTX || ANY_RETURN_P (target))
20705 return false;
20706
20707       /* Check whether the jump is a switch table jump.  */
20708 rtx_insn *label = as_a<rtx_insn *> (target);
20709 rtx_insn *table = next_insn (label);
20710 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
20711 return false;
20712 else
20713 return true;
20714 }
20715 return false;
20716 }
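/* Illustrative sketch, not part of the original file: with
   -fcf-protection=branch, an indirect call through a function pointer whose
   type carries the nocf_check attribute gets a REG_CALL_NOCF_CHECK note and
   is therefore recognized above as a notrack call.  Roughly:

     typedef void (*untracked_fn) (void) __attribute__ ((nocf_check));

     void
     dispatch (untracked_fn fp)
     {
       fp ();   /* expected to assemble as a notrack-prefixed indirect call */
     }

   Direct calls and ordinary tracked indirect calls make this predicate
   return false.  */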
20717
20718 /* Calculate integer abs() using only SSE2 instructions. */
20719
20720 void
20721 ix86_expand_sse2_abs (rtx target, rtx input)
20722 {
20723 machine_mode mode = GET_MODE (target);
20724 rtx tmp0, tmp1, x;
20725
20726 switch (mode)
20727 {
20728 case E_V2DImode:
20729 case E_V4DImode:
20730       /* For 64-bit signed integer X, with SSE4.2 use
20731          pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20732          Otherwise handle it similarly to V4SImode, except use 64 as W
20733          instead of 32, and form the sign mask with a logical right shift
20734          plus negation (the arithmetic right shift is unimplemented).  */
20735 if (TARGET_SSE4_2)
20736 {
20737 tmp0 = gen_reg_rtx (mode);
20738 tmp1 = gen_reg_rtx (mode);
20739 emit_move_insn (tmp1, CONST0_RTX (mode));
20740 if (mode == E_V2DImode)
20741 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
20742 else
20743 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
20744 }
20745 else
20746 {
20747 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
20748 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
20749 - 1), NULL, 0, OPTAB_DIRECT);
20750 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
20751 }
20752
20753 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20754 NULL, 0, OPTAB_DIRECT);
20755 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20756 target, 0, OPTAB_DIRECT);
20757 break;
20758
20759 case E_V4SImode:
20760 /* For 32-bit signed integer X, the best way to calculate the absolute
20761 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
20762 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
20763 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
20764 NULL, 0, OPTAB_DIRECT);
20765 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
20766 NULL, 0, OPTAB_DIRECT);
20767 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
20768 target, 0, OPTAB_DIRECT);
20769 break;
20770
20771 case E_V8HImode:
20772 /* For 16-bit signed integer X, the best way to calculate the absolute
20773 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20774 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20775
20776 x = expand_simple_binop (mode, SMAX, tmp0, input,
20777 target, 0, OPTAB_DIRECT);
20778 break;
20779
20780 case E_V16QImode:
20781 /* For 8-bit signed integer X, the best way to calculate the absolute
20782 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20783 as SSE2 provides the PMINUB insn. */
20784 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
20785
20786 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
20787 target, 0, OPTAB_DIRECT);
20788 break;
20789
20790 default:
20791 gcc_unreachable ();
20792 }
20793
20794 if (x != target)
20795 emit_move_insn (target, x);
20796 }
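/* Illustrative sketch, not part of the original file: the V4SImode branch
   above is the vector form of the classic branch-free scalar abs, assuming
   <stdint.h> and an arithmetic right shift of signed values (what GCC
   produces on x86):

     int32_t
     abs32 (int32_t x)
     {
       int32_t m = x >> 31;      /* 0 for x >= 0, -1 for x < 0 */
       return (x ^ m) - m;       /* flip the bits and add 1 when negative */
     }

   The 64-bit path without SSE4.2 forms the same mask m by a logical shift
   right by 63 followed by negation, since no 64-bit vector arithmetic right
   shift is available there.  */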
20797
20798 /* Expand an extract from a vector register through pextr insn.
20799 Return true if successful. */
20800
20801 bool
20802 ix86_expand_pextr (rtx *operands)
20803 {
20804 rtx dst = operands[0];
20805 rtx src = operands[1];
20806
20807 unsigned int size = INTVAL (operands[2]);
20808 unsigned int pos = INTVAL (operands[3]);
20809
20810 if (SUBREG_P (dst))
20811 {
20812 /* Reject non-lowpart subregs. */
20813 if (SUBREG_BYTE (dst) > 0)
20814 return false;
20815 dst = SUBREG_REG (dst);
20816 }
20817
20818 if (SUBREG_P (src))
20819 {
20820 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
20821 src = SUBREG_REG (src);
20822 }
20823
20824 switch (GET_MODE (src))
20825 {
20826 case E_V16QImode:
20827 case E_V8HImode:
20828 case E_V4SImode:
20829 case E_V2DImode:
20830 case E_V1TImode:
20831 {
20832 machine_mode srcmode, dstmode;
20833 rtx d, pat;
20834
20835 if (!int_mode_for_size (size, 0).exists (&dstmode))
20836 return false;
20837
20838 switch (dstmode)
20839 {
20840 case E_QImode:
20841 if (!TARGET_SSE4_1)
20842 return false;
20843 srcmode = V16QImode;
20844 break;
20845
20846 case E_HImode:
20847 if (!TARGET_SSE2)
20848 return false;
20849 srcmode = V8HImode;
20850 break;
20851
20852 case E_SImode:
20853 if (!TARGET_SSE4_1)
20854 return false;
20855 srcmode = V4SImode;
20856 break;
20857
20858 case E_DImode:
20859 gcc_assert (TARGET_64BIT);
20860 if (!TARGET_SSE4_1)
20861 return false;
20862 srcmode = V2DImode;
20863 break;
20864
20865 default:
20866 return false;
20867 }
20868
20869 /* Reject extractions from misaligned positions. */
20870 if (pos & (size-1))
20871 return false;
20872
20873 if (GET_MODE (dst) == dstmode)
20874 d = dst;
20875 else
20876 d = gen_reg_rtx (dstmode);
20877
20878 /* Construct insn pattern. */
20879 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20880 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20881
20882 /* Let the rtl optimizers know about the zero extension performed. */
20883 if (dstmode == QImode || dstmode == HImode)
20884 {
20885 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20886 d = gen_lowpart (SImode, d);
20887 }
20888
20889 emit_insn (gen_rtx_SET (d, pat));
20890
20891 if (d != dst)
20892 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20893 return true;
20894 }
20895
20896 default:
20897 return false;
20898 }
20899 }
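/* Illustrative sketch, not part of the original file: at the intrinsics
   level the vec_select pattern built above corresponds to the pextr*
   family, e.g. (assuming <immintrin.h> and <stdint.h>, SSE4.1 for the
   32-bit form and SSE2 for the 16-bit form):

     uint32_t lane2 (__m128i v) { return (uint32_t) _mm_extract_epi32 (v, 2); }  /* pextrd $2 */
     uint16_t lane5 (__m128i v) { return (uint16_t) _mm_extract_epi16 (v, 5); }  /* pextrw $5 */

   GEN_INT (pos / size) above plays the role of the immediate lane index,
   which is why misaligned bit positions are rejected.  */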
20900
20901 /* Expand an insert into a vector register through pinsr insn.
20902 Return true if successful. */
20903
20904 bool
20905 ix86_expand_pinsr (rtx *operands)
20906 {
20907 rtx dst = operands[0];
20908 rtx src = operands[3];
20909
20910 unsigned int size = INTVAL (operands[1]);
20911 unsigned int pos = INTVAL (operands[2]);
20912
20913 if (SUBREG_P (dst))
20914 {
20915 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20916 dst = SUBREG_REG (dst);
20917 }
20918
20919 switch (GET_MODE (dst))
20920 {
20921 case E_V16QImode:
20922 case E_V8HImode:
20923 case E_V4SImode:
20924 case E_V2DImode:
20925 case E_V1TImode:
20926 {
20927 machine_mode srcmode, dstmode;
20928 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20929 rtx d;
20930
20931 if (!int_mode_for_size (size, 0).exists (&srcmode))
20932 return false;
20933
20934 switch (srcmode)
20935 {
20936 case E_QImode:
20937 if (!TARGET_SSE4_1)
20938 return false;
20939 dstmode = V16QImode;
20940 pinsr = gen_sse4_1_pinsrb;
20941 break;
20942
20943 case E_HImode:
20944 if (!TARGET_SSE2)
20945 return false;
20946 dstmode = V8HImode;
20947 pinsr = gen_sse2_pinsrw;
20948 break;
20949
20950 case E_SImode:
20951 if (!TARGET_SSE4_1)
20952 return false;
20953 dstmode = V4SImode;
20954 pinsr = gen_sse4_1_pinsrd;
20955 break;
20956
20957 case E_DImode:
20958 gcc_assert (TARGET_64BIT);
20959 if (!TARGET_SSE4_1)
20960 return false;
20961 dstmode = V2DImode;
20962 pinsr = gen_sse4_1_pinsrq;
20963 break;
20964
20965 default:
20966 return false;
20967 }
20968
20969 /* Reject insertions to misaligned positions. */
20970 if (pos & (size-1))
20971 return false;
20972
20973 if (SUBREG_P (src))
20974 {
20975 unsigned int srcpos = SUBREG_BYTE (src);
20976
20977 if (srcpos > 0)
20978 {
20979 rtx extr_ops[4];
20980
20981 extr_ops[0] = gen_reg_rtx (srcmode);
20982 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20983 extr_ops[2] = GEN_INT (size);
20984 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20985
20986 if (!ix86_expand_pextr (extr_ops))
20987 return false;
20988
20989 src = extr_ops[0];
20990 }
20991 else
20992 src = gen_lowpart (srcmode, SUBREG_REG (src));
20993 }
20994
20995 if (GET_MODE (dst) == dstmode)
20996 d = dst;
20997 else
20998 d = gen_reg_rtx (dstmode);
20999
21000 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
21001 gen_lowpart (srcmode, src),
21002 GEN_INT (1 << (pos / size))));
21003 if (d != dst)
21004 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
21005 return true;
21006 }
21007
21008 default:
21009 return false;
21010 }
21011 }
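/* Illustrative sketch, not part of the original file: the pinsr expansion
   above corresponds to the insert intrinsics, e.g. (assuming <immintrin.h>,
   SSE4.1 for the 32-bit form and SSE2 for the 16-bit form):

     __m128i set_lane2 (__m128i v, int x)   { return _mm_insert_epi32 (v, x, 2); }  /* pinsrd $2 */
     __m128i set_lane5 (__m128i v, short x) { return _mm_insert_epi16 (v, x, 5); }  /* pinsrw $5 */

   The pinsr patterns used here take a one-hot merge mask rather than a raw
   lane index, which is why the code passes GEN_INT (1 << (pos / size)).  */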
21012
21013 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
21014    of the upper halves against the lower halves down to SSE register size.  */
21015
21016 machine_mode
21017 ix86_split_reduction (machine_mode mode)
21018 {
21019 /* Reduce lowpart against highpart until we reach SSE reg width to
21020 avoid cross-lane operations. */
21021 switch (mode)
21022 {
21023 case E_V8DImode:
21024 case E_V4DImode:
21025 return V2DImode;
21026 case E_V16SImode:
21027 case E_V8SImode:
21028 return V4SImode;
21029 case E_V32HImode:
21030 case E_V16HImode:
21031 return V8HImode;
21032 case E_V64QImode:
21033 case E_V32QImode:
21034 return V16QImode;
21035 case E_V16SFmode:
21036 case E_V8SFmode:
21037 return V4SFmode;
21038 case E_V8DFmode:
21039 case E_V4DFmode:
21040 return V2DFmode;
21041 default:
21042 return mode;
21043 }
21044 }
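/* Illustrative sketch, not part of the original file: a caller reducing a
   V8SImode sum would first fold the upper 128-bit half onto the lower one
   and then keep reducing within a single SSE register, roughly (assuming
   <immintrin.h> and AVX2):

     __m128i lo = _mm256_castsi256_si128 (v);        /* lower half, no insn */
     __m128i hi = _mm256_extracti128_si256 (v, 1);   /* upper half */
     __m128i s  = _mm_add_epi32 (lo, hi);            /* continue in V4SImode */

   which matches the V8SImode -> V4SImode entry in the switch above.  */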
21045
21046 /* Generate call to __divmoddi4. */
21047
21048 void
21049 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
21050 rtx op0, rtx op1,
21051 rtx *quot_p, rtx *rem_p)
21052 {
21053 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
21054
21055 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
21056 mode, op0, mode, op1, mode,
21057 XEXP (rem, 0), Pmode);
21058 *quot_p = quot;
21059 *rem_p = rem;
21060 }
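/* Illustrative sketch, not part of the original file: libgcc's helper has
   the shape

     long long __divmoddi4 (long long num, long long den, long long *rem);

   so the single library call emitted above returns the quotient in the
   usual value register while the remainder is written through the stack
   slot whose address is passed as the third argument.  */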
21061
21062 #include "gt-i386-expand.h"