gcc/config/i386/i386-expand.cc
1 /* Copyright (C) 1988-2022 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
101
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105 {
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuses to split volatile memory addresses,
136 but we still have to handle them. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157
158 rtx tmp = simplify_gen_subreg (half_mode, op,
159 GET_MODE (op) == VOIDmode
160 ? mode : GET_MODE (op), byte);
161 /* simplify_gen_subreg will return NULL RTX for the
162 high half of the paradoxical subreg. */
163 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
164 }
165 }
166 }
167
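/* Added illustrative note (not part of the upstream file): for a TImode
   pseudo such as (reg:TI 100), split_double_mode hands back the DImode
   word halves roughly as

     lo_half[i] = (subreg:DI (reg:TI 100) 0)
     hi_half[i] = (subreg:DI (reg:TI 100) 8)

   while an offsettable MEM is split via adjust_address into two DImode
   MEMs at offsets 0 and 8, which also covers volatile memory.  */
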
168 /* Emit the double word assignment DST = { LO, HI }. */
169
170 void
171 split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
172 {
173 rtx dlo, dhi;
174 int deleted_move_count = 0;
175 split_double_mode (mode, &dst, 1, &dlo, &dhi);
176 if (!rtx_equal_p (dlo, hi))
177 {
178 if (!rtx_equal_p (dlo, lo))
179 emit_move_insn (dlo, lo);
180 else
181 deleted_move_count++;
182 if (!rtx_equal_p (dhi, hi))
183 emit_move_insn (dhi, hi);
184 else
185 deleted_move_count++;
186 }
187 else if (!rtx_equal_p (lo, dhi))
188 {
189 if (!rtx_equal_p (dhi, hi))
190 emit_move_insn (dhi, hi);
191 else
192 deleted_move_count++;
193 if (!rtx_equal_p (dlo, lo))
194 emit_move_insn (dlo, lo);
195 else
196 deleted_move_count++;
197 }
198 else if (mode == TImode)
199 emit_insn (gen_swapdi (dlo, dhi));
200 else
201 emit_insn (gen_swapsi (dlo, dhi));
202
203 if (deleted_move_count == 2)
204 emit_note (NOTE_INSN_DELETED);
205 }
206
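/* Added illustrative note (not from the upstream sources): the swap
   branch above covers the case where the halves are already present but
   crossed, e.g. DST is the DImode pair edx:eax while LO currently lives
   in edx and HI in eax; neither ordering of the two moves would be
   correct, so a register swap is emitted instead (gen_swapsi in this
   example, gen_swapdi for a TImode destination).  */
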
207
208 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
209 for the target. */
210
211 void
212 ix86_expand_clear (rtx dest)
213 {
214 rtx tmp;
215
216 /* We play register width games, which are only valid after reload. */
217 gcc_assert (reload_completed);
218
219 /* Avoid HImode and its attendant prefix byte. */
220 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
221 dest = gen_rtx_REG (SImode, REGNO (dest));
222 tmp = gen_rtx_SET (dest, const0_rtx);
223
224 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
225 {
226 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
227 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
228 }
229
230 emit_insn (tmp);
231 }
232
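/* Added example (illustrative, not generated verbatim): with the flags
   clobber attached, the move pattern is free to assemble to the short
   form

     xorl  %eax, %eax

   whereas without it (TARGET_USE_MOV0 and not optimizing for size) the
   plain set stays

     movl  $0, %eax

   A QImode or HImode destination is first widened to SImode to avoid
   the operand-size prefix.  */
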
233 /* Return true if V can be broadcast from an integer of WIDTH bits,
234 which is returned in VAL_BROADCAST. Otherwise, return false. */
235
236 static bool
237 ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
238 HOST_WIDE_INT &val_broadcast)
239 {
240 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
241 val_broadcast = wi::extract_uhwi (val, 0, width);
242 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
243 {
244 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
245 if (val_broadcast != each)
246 return false;
247 }
248 val_broadcast = sext_hwi (val_broadcast, width);
249 return true;
250 }
251
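/* Added worked example (illustrative): for v = 0x1212121212121212 the
   function succeeds with WIDTH == 8 and VAL_BROADCAST == 0x12, since
   every 8-bit chunk matches; for v = 0x1234123412341234 it fails at
   WIDTH == 8 but succeeds at WIDTH == 16 with VAL_BROADCAST == 0x1234.  */
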
252 /* Convert the CONST_WIDE_INT operand OP to a broadcast in MODE. */
253
254 static rtx
255 ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
256 {
257 /* Don't use integer vector broadcast if we can't move from GPR to SSE
258 register directly. */
259 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
260 return nullptr;
261
262 /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
263 broadcast only if vector broadcast is available. */
264 if (!TARGET_AVX
265 || !CONST_WIDE_INT_P (op)
266 || standard_sse_constant_p (op, mode))
267 return nullptr;
268
269 HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
270 HOST_WIDE_INT val_broadcast;
271 scalar_int_mode broadcast_mode;
272 if (TARGET_AVX2
273 && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
274 val_broadcast))
275 broadcast_mode = QImode;
276 else if (TARGET_AVX2
277 && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
278 val_broadcast))
279 broadcast_mode = HImode;
280 else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
281 val_broadcast))
282 broadcast_mode = SImode;
283 else if (TARGET_64BIT
284 && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
285 val_broadcast))
286 broadcast_mode = DImode;
287 else
288 return nullptr;
289
290 /* Check if OP can be broadcast from VAL. */
291 for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
292 if (val != CONST_WIDE_INT_ELT (op, i))
293 return nullptr;
294
295 unsigned int nunits = (GET_MODE_SIZE (mode)
296 / GET_MODE_SIZE (broadcast_mode));
297 machine_mode vector_mode;
298 if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
299 gcc_unreachable ();
300 rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
301 bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
302 target,
303 GEN_INT (val_broadcast));
304 gcc_assert (ok);
305 target = lowpart_subreg (mode, target, vector_mode);
306 return target;
307 }
308
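/* Added illustrative sketch (not from the upstream sources): moving a
   256-bit (OImode) CONST_WIDE_INT whose 64-bit elements are all
   0x1234567812345678 can, with AVX2, be done as a broadcast: the 32-bit
   value 0x12345678 is splat into a V8SImode scratch SSE register
   (e.g. via vpbroadcastd) and the OImode lowpart of that register is
   used as the source, instead of loading the constant from memory.  */
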
309 void
310 ix86_expand_move (machine_mode mode, rtx operands[])
311 {
312 rtx op0, op1;
313 rtx tmp, addend = NULL_RTX;
314 enum tls_model model;
315
316 op0 = operands[0];
317 op1 = operands[1];
318
319 /* Avoid complex sets of likely spilled hard registers before reload. */
320 if (!ix86_hardreg_mov_ok (op0, op1))
321 {
322 tmp = gen_reg_rtx (mode);
323 operands[0] = tmp;
324 ix86_expand_move (mode, operands);
325 operands[0] = op0;
326 operands[1] = tmp;
327 op1 = tmp;
328 }
329
330 switch (GET_CODE (op1))
331 {
332 case CONST:
333 tmp = XEXP (op1, 0);
334
335 if (GET_CODE (tmp) != PLUS
336 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
337 break;
338
339 op1 = XEXP (tmp, 0);
340 addend = XEXP (tmp, 1);
341 /* FALLTHRU */
342
343 case SYMBOL_REF:
344 model = SYMBOL_REF_TLS_MODEL (op1);
345
346 if (model)
347 op1 = legitimize_tls_address (op1, model, true);
348 else if (ix86_force_load_from_GOT_p (op1))
349 {
350 /* Load the external function address via GOT slot to avoid PLT. */
351 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
352 (TARGET_64BIT
353 ? UNSPEC_GOTPCREL
354 : UNSPEC_GOT));
355 op1 = gen_rtx_CONST (Pmode, op1);
356 op1 = gen_const_mem (Pmode, op1);
357 set_mem_alias_set (op1, ix86_GOT_alias_set ());
358 }
359 else
360 {
361 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
362 if (tmp)
363 {
364 op1 = tmp;
365 if (!addend)
366 break;
367 }
368 else
369 {
370 op1 = operands[1];
371 break;
372 }
373 }
374
375 if (addend)
376 {
377 op1 = force_operand (op1, NULL_RTX);
378 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
379 op0, 1, OPTAB_DIRECT);
380 }
381 else
382 op1 = force_operand (op1, op0);
383
384 if (op1 == op0)
385 return;
386
387 op1 = convert_to_mode (mode, op1, 1);
388
389 default:
390 break;
391 }
392
393 if ((flag_pic || MACHOPIC_INDIRECT)
394 && symbolic_operand (op1, mode))
395 {
396 if (TARGET_MACHO && !TARGET_64BIT)
397 {
398 #if TARGET_MACHO
399 /* dynamic-no-pic */
400 if (MACHOPIC_INDIRECT)
401 {
402 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
403 ? op0 : gen_reg_rtx (Pmode);
404 op1 = machopic_indirect_data_reference (op1, temp);
405 if (MACHOPIC_PURE)
406 op1 = machopic_legitimize_pic_address (op1, mode,
407 temp == op1 ? 0 : temp);
408 }
409 if (op0 != op1 && GET_CODE (op0) != MEM)
410 {
411 rtx insn = gen_rtx_SET (op0, op1);
412 emit_insn (insn);
413 return;
414 }
415 if (GET_CODE (op0) == MEM)
416 op1 = force_reg (Pmode, op1);
417 else
418 {
419 rtx temp = op0;
420 if (GET_CODE (temp) != REG)
421 temp = gen_reg_rtx (Pmode);
422 temp = legitimize_pic_address (op1, temp);
423 if (temp == op0)
424 return;
425 op1 = temp;
426 }
427 /* dynamic-no-pic */
428 #endif
429 }
430 else
431 {
432 if (MEM_P (op0))
433 op1 = force_reg (mode, op1);
434 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
435 {
436 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
437 op1 = legitimize_pic_address (op1, reg);
438 if (op0 == op1)
439 return;
440 op1 = convert_to_mode (mode, op1, 1);
441 }
442 }
443 }
444 else
445 {
446 if (MEM_P (op0)
447 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
448 || !push_operand (op0, mode))
449 && MEM_P (op1))
450 op1 = force_reg (mode, op1);
451
452 if (push_operand (op0, mode)
453 && ! general_no_elim_operand (op1, mode))
454 op1 = copy_to_mode_reg (mode, op1);
455
456 /* Force large constants in 64bit compilation into a register
457 to get them CSEed. */
458 if (can_create_pseudo_p ()
459 && (mode == DImode) && TARGET_64BIT
460 && immediate_operand (op1, mode)
461 && !x86_64_zext_immediate_operand (op1, VOIDmode)
462 && !register_operand (op0, mode)
463 && optimize)
464 op1 = copy_to_mode_reg (mode, op1);
465
466 if (can_create_pseudo_p ())
467 {
468 if (CONST_DOUBLE_P (op1))
469 {
470 /* If we are loading a floating point constant to a
471 register, force the value to memory now, since we'll
472 get better code out the back end. */
473
474 op1 = validize_mem (force_const_mem (mode, op1));
475 if (!register_operand (op0, mode))
476 {
477 rtx temp = gen_reg_rtx (mode);
478 emit_insn (gen_rtx_SET (temp, op1));
479 emit_move_insn (op0, temp);
480 return;
481 }
482 }
483 else if (GET_MODE_SIZE (mode) >= 16)
484 {
485 rtx tmp = ix86_convert_const_wide_int_to_broadcast
486 (GET_MODE (op0), op1);
487 if (tmp != nullptr)
488 op1 = tmp;
489 }
490 }
491 }
492
493 emit_insn (gen_rtx_SET (op0, op1));
494 }
495
496 /* OP is a memref of a CONST_VECTOR; return the scalar constant
497 if the CONST_VECTOR is a vec_duplicate, else return NULL. */
498 static rtx
499 ix86_broadcast_from_constant (machine_mode mode, rtx op)
500 {
501 int nunits = GET_MODE_NUNITS (mode);
502 if (nunits < 2)
503 return nullptr;
504
505 /* Don't use integer vector broadcast if we can't move from GPR to SSE
506 register directly. */
507 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
508 && INTEGRAL_MODE_P (mode))
509 return nullptr;
510
511 /* Convert CONST_VECTOR to a non-standard SSE constant integer
512 broadcast only if vector broadcast is available. */
513 if (!(TARGET_AVX2
514 || (TARGET_AVX
515 && (GET_MODE_INNER (mode) == SImode
516 || GET_MODE_INNER (mode) == DImode))
517 || FLOAT_MODE_P (mode))
518 || standard_sse_constant_p (op, mode))
519 return nullptr;
520
521 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
522 We can still put a 64-bit integer constant in memory when
523 AVX512 embedded broadcast is available. */
524 if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
525 && (!TARGET_AVX512F
526 || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
527 return nullptr;
528
529 if (GET_MODE_INNER (mode) == TImode)
530 return nullptr;
531
532 rtx constant = get_pool_constant (XEXP (op, 0));
533 if (GET_CODE (constant) != CONST_VECTOR)
534 return nullptr;
535
536 /* There could be some rtx like
537 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
538 but with "*.LC1" referring to a V2DI constant vector. */
539 if (GET_MODE (constant) != mode)
540 {
541 constant = simplify_subreg (mode, constant, GET_MODE (constant),
542 0);
543 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
544 return nullptr;
545 }
546
547 rtx first = XVECEXP (constant, 0, 0);
548
549 for (int i = 1; i < nunits; ++i)
550 {
551 rtx tmp = XVECEXP (constant, 0, i);
552 /* Vector duplicate value. */
553 if (!rtx_equal_p (tmp, first))
554 return nullptr;
555 }
556
557 return first;
558 }
559
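/* Added illustrative note (not from the upstream sources): if OP is a
   constant-pool reference such as

     (mem/u/c:V8SF (symbol_ref/u:DI ("*.LC0")))

   and the pooled CONST_VECTOR is { 1.5, 1.5, ..., 1.5 }, the function
   returns the scalar 1.5 so that the caller can, for example, emit a
   vbroadcastss from a scalar constant instead of loading the full
   32-byte vector.  */
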
560 void
561 ix86_expand_vector_move (machine_mode mode, rtx operands[])
562 {
563 rtx op0 = operands[0], op1 = operands[1];
564 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
565 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
566 unsigned int align = (TARGET_IAMCU
567 ? GET_MODE_BITSIZE (mode)
568 : GET_MODE_ALIGNMENT (mode));
569
570 if (push_operand (op0, VOIDmode))
571 op0 = emit_move_resolve_push (mode, op0);
572
573 /* Force constants other than zero into memory. We do not know how
574 the instructions used to build constants modify the upper 64 bits
575 of the register; once we have that information we may be able
576 to handle some of them more efficiently. */
577 if (can_create_pseudo_p ()
578 && (CONSTANT_P (op1)
579 || (SUBREG_P (op1)
580 && CONSTANT_P (SUBREG_REG (op1))))
581 && ((register_operand (op0, mode)
582 && !standard_sse_constant_p (op1, mode))
583 /* ix86_expand_vector_move_misalign() does not like constants. */
584 || (SSE_REG_MODE_P (mode)
585 && MEM_P (op0)
586 && MEM_ALIGN (op0) < align)))
587 {
588 if (SUBREG_P (op1))
589 {
590 machine_mode imode = GET_MODE (SUBREG_REG (op1));
591 rtx r = force_const_mem (imode, SUBREG_REG (op1));
592 if (r)
593 r = validize_mem (r);
594 else
595 r = force_reg (imode, SUBREG_REG (op1));
596 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
597 }
598 else
599 {
600 machine_mode mode = GET_MODE (op0);
601 rtx tmp = ix86_convert_const_wide_int_to_broadcast
602 (mode, op1);
603 if (tmp == nullptr)
604 op1 = validize_mem (force_const_mem (mode, op1));
605 else
606 op1 = tmp;
607 }
608 }
609
610 if (can_create_pseudo_p ()
611 && GET_MODE_SIZE (mode) >= 16
612 && VECTOR_MODE_P (mode)
613 && (MEM_P (op1)
614 && SYMBOL_REF_P (XEXP (op1, 0))
615 && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
616 {
617 rtx first = ix86_broadcast_from_constant (mode, op1);
618 if (first != nullptr)
619 {
620 /* Broadcast to XMM/YMM/ZMM register from an integer
621 constant or scalar mem. */
622 op1 = gen_reg_rtx (mode);
623 if (FLOAT_MODE_P (mode)
624 || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
625 first = force_const_mem (GET_MODE_INNER (mode), first);
626 bool ok = ix86_expand_vector_init_duplicate (false, mode,
627 op1, first);
628 gcc_assert (ok);
629 emit_move_insn (op0, op1);
630 return;
631 }
632 }
633
634 /* We need to check memory alignment for SSE mode since attribute
635 can make operands unaligned. */
636 if (can_create_pseudo_p ()
637 && SSE_REG_MODE_P (mode)
638 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
639 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
640 {
641 rtx tmp[2];
642
643 /* ix86_expand_vector_move_misalign() does not like both
644 arguments in memory. */
645 if (!register_operand (op0, mode)
646 && !register_operand (op1, mode))
647 {
648 rtx scratch = ix86_gen_scratch_sse_rtx (mode);
649 emit_move_insn (scratch, op1);
650 op1 = scratch;
651 }
652
653 tmp[0] = op0; tmp[1] = op1;
654 ix86_expand_vector_move_misalign (mode, tmp);
655 return;
656 }
657
658 /* Special case TImode to V1TImode conversions, via V2DI. */
659 if (mode == V1TImode
660 && SUBREG_P (op1)
661 && GET_MODE (SUBREG_REG (op1)) == TImode
662 && TARGET_64BIT && TARGET_SSE
663 && can_create_pseudo_p ())
664 {
665 rtx tmp = gen_reg_rtx (V2DImode);
666 rtx lo = gen_reg_rtx (DImode);
667 rtx hi = gen_reg_rtx (DImode);
668 emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
669 emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
670 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
671 emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
672 return;
673 }
674
675 /* If operand0 is a hard register, make operand1 a pseudo. */
676 if (can_create_pseudo_p ()
677 && !ix86_hardreg_mov_ok (op0, op1))
678 {
679 rtx tmp = gen_reg_rtx (GET_MODE (op0));
680 emit_move_insn (tmp, op1);
681 emit_move_insn (op0, tmp);
682 return;
683 }
684
685 /* If neither operand is a register, move operand1 via a scratch register. */
686 if (can_create_pseudo_p ()
687 && !register_operand (op0, mode)
688 && !register_operand (op1, mode))
689 {
690 rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
691 emit_move_insn (tmp, op1);
692 emit_move_insn (op0, tmp);
693 return;
694 }
695
696 emit_insn (gen_rtx_SET (op0, op1));
697 }
698
699 /* Split 32-byte AVX unaligned load and store if needed. */
700
701 static void
702 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
703 {
704 rtx m;
705 rtx (*extract) (rtx, rtx, rtx);
706 machine_mode mode;
707
708 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
709 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
710 {
711 emit_insn (gen_rtx_SET (op0, op1));
712 return;
713 }
714
715 rtx orig_op0 = NULL_RTX;
716 mode = GET_MODE (op0);
717 switch (GET_MODE_CLASS (mode))
718 {
719 case MODE_VECTOR_INT:
720 case MODE_INT:
721 if (mode != V32QImode)
722 {
723 if (!MEM_P (op0))
724 {
725 orig_op0 = op0;
726 op0 = gen_reg_rtx (V32QImode);
727 }
728 else
729 op0 = gen_lowpart (V32QImode, op0);
730 op1 = gen_lowpart (V32QImode, op1);
731 mode = V32QImode;
732 }
733 break;
734 case MODE_VECTOR_FLOAT:
735 break;
736 default:
737 gcc_unreachable ();
738 }
739
740 switch (mode)
741 {
742 default:
743 gcc_unreachable ();
744 case E_V32QImode:
745 extract = gen_avx_vextractf128v32qi;
746 mode = V16QImode;
747 break;
748 case E_V16BFmode:
749 extract = gen_avx_vextractf128v16bf;
750 mode = V8BFmode;
751 break;
752 case E_V16HFmode:
753 extract = gen_avx_vextractf128v16hf;
754 mode = V8HFmode;
755 break;
756 case E_V8SFmode:
757 extract = gen_avx_vextractf128v8sf;
758 mode = V4SFmode;
759 break;
760 case E_V4DFmode:
761 extract = gen_avx_vextractf128v4df;
762 mode = V2DFmode;
763 break;
764 }
765
766 if (MEM_P (op1))
767 {
768 rtx r = gen_reg_rtx (mode);
769 m = adjust_address (op1, mode, 0);
770 emit_move_insn (r, m);
771 m = adjust_address (op1, mode, 16);
772 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
773 emit_move_insn (op0, r);
774 }
775 else if (MEM_P (op0))
776 {
777 m = adjust_address (op0, mode, 0);
778 emit_insn (extract (m, op1, const0_rtx));
779 m = adjust_address (op0, mode, 16);
780 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
781 }
782 else
783 gcc_unreachable ();
784
785 if (orig_op0)
786 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
787 }
788
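/* Added example (illustrative): with -mavx256-split-unaligned-load, an
   unaligned 32-byte load that would otherwise be a single
   vmovups (%rax), %ymm0 is instead emitted roughly as

     vmovups     (%rax), %xmm0
     vinsertf128 $1, 16(%rax), %ymm0, %ymm0

   and an unaligned 32-byte store is split the same way, storing the low
   half and then the upper 16 bytes with vextractf128 $1.  */
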
789 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
790 straight to ix86_expand_vector_move. */
791 /* Code generation for scalar reg-reg moves of single and double precision data:
792 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
793 movaps reg, reg
794 else
795 movss reg, reg
796 if (x86_sse_partial_reg_dependency == true)
797 movapd reg, reg
798 else
799 movsd reg, reg
800
801 Code generation for scalar loads of double precision data:
802 if (x86_sse_split_regs == true)
803 movlpd mem, reg (gas syntax)
804 else
805 movsd mem, reg
806
807 Code generation for unaligned packed loads of single precision data
808 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
809 if (x86_sse_unaligned_move_optimal)
810 movups mem, reg
811
812 if (x86_sse_partial_reg_dependency == true)
813 {
814 xorps reg, reg
815 movlps mem, reg
816 movhps mem+8, reg
817 }
818 else
819 {
820 movlps mem, reg
821 movhps mem+8, reg
822 }
823
824 Code generation for unaligned packed loads of double precision data
825 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
826 if (x86_sse_unaligned_move_optimal)
827 movupd mem, reg
828
829 if (x86_sse_split_regs == true)
830 {
831 movlpd mem, reg
832 movhpd mem+8, reg
833 }
834 else
835 {
836 movsd mem, reg
837 movhpd mem+8, reg
838 }
839 */
840
841 void
842 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
843 {
844 rtx op0, op1, m;
845
846 op0 = operands[0];
847 op1 = operands[1];
848
849 /* Use unaligned load/store for AVX512 or when optimizing for size. */
850 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
851 {
852 emit_insn (gen_rtx_SET (op0, op1));
853 return;
854 }
855
856 if (TARGET_AVX)
857 {
858 if (GET_MODE_SIZE (mode) == 32)
859 ix86_avx256_split_vector_move_misalign (op0, op1);
860 else
861 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
862 emit_insn (gen_rtx_SET (op0, op1));
863 return;
864 }
865
866 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
867 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
868 {
869 emit_insn (gen_rtx_SET (op0, op1));
870 return;
871 }
872
873 /* ??? If we have typed data, then it would appear that using
874 movdqu is the only way to get unaligned data loaded with
875 integer type. */
876 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
877 {
878 emit_insn (gen_rtx_SET (op0, op1));
879 return;
880 }
881
882 if (MEM_P (op1))
883 {
884 if (TARGET_SSE2 && mode == V2DFmode)
885 {
886 rtx zero;
887
888 /* When SSE registers are split into halves, we can avoid
889 writing to the top half twice. */
890 if (TARGET_SSE_SPLIT_REGS)
891 {
892 emit_clobber (op0);
893 zero = op0;
894 }
895 else
896 {
897 /* ??? Not sure about the best option for the Intel chips.
898 The following would seem to satisfy; the register is
899 entirely cleared, breaking the dependency chain. We
900 then store to the upper half, with a dependency depth
901 of one. A rumor has it that Intel recommends two movsd
902 followed by an unpacklpd, but this is unconfirmed. And
903 given that the dependency depth of the unpacklpd would
904 still be one, I'm not sure why this would be better. */
905 zero = CONST0_RTX (V2DFmode);
906 }
907
908 m = adjust_address (op1, DFmode, 0);
909 emit_insn (gen_sse2_loadlpd (op0, zero, m));
910 m = adjust_address (op1, DFmode, 8);
911 emit_insn (gen_sse2_loadhpd (op0, op0, m));
912 }
913 else
914 {
915 rtx t;
916
917 if (mode != V4SFmode)
918 t = gen_reg_rtx (V4SFmode);
919 else
920 t = op0;
921
922 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
923 emit_move_insn (t, CONST0_RTX (V4SFmode));
924 else
925 emit_clobber (t);
926
927 m = adjust_address (op1, V2SFmode, 0);
928 emit_insn (gen_sse_loadlps (t, t, m));
929 m = adjust_address (op1, V2SFmode, 8);
930 emit_insn (gen_sse_loadhps (t, t, m));
931 if (mode != V4SFmode)
932 emit_move_insn (op0, gen_lowpart (mode, t));
933 }
934 }
935 else if (MEM_P (op0))
936 {
937 if (TARGET_SSE2 && mode == V2DFmode)
938 {
939 m = adjust_address (op0, DFmode, 0);
940 emit_insn (gen_sse2_storelpd (m, op1));
941 m = adjust_address (op0, DFmode, 8);
942 emit_insn (gen_sse2_storehpd (m, op1));
943 }
944 else
945 {
946 if (mode != V4SFmode)
947 op1 = gen_lowpart (V4SFmode, op1);
948
949 m = adjust_address (op0, V2SFmode, 0);
950 emit_insn (gen_sse_storelps (m, op1));
951 m = adjust_address (op0, V2SFmode, 8);
952 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
953 }
954 }
955 else
956 gcc_unreachable ();
957 }
958
959 /* Move bits 64:95 to bits 32:63. */
960
961 void
962 ix86_move_vector_high_sse_to_mmx (rtx op)
963 {
964 rtx mask = gen_rtx_PARALLEL (VOIDmode,
965 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
966 GEN_INT (0), GEN_INT (0)));
967 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
968 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
969 rtx insn = gen_rtx_SET (dest, op);
970 emit_insn (insn);
971 }
972
973 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
974
975 void
976 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
977 {
978 rtx op0 = operands[0];
979 rtx op1 = operands[1];
980 rtx op2 = operands[2];
981
982 machine_mode dmode = GET_MODE (op0);
983 machine_mode smode = GET_MODE (op1);
984 machine_mode inner_dmode = GET_MODE_INNER (dmode);
985 machine_mode inner_smode = GET_MODE_INNER (smode);
986
987 /* Get the corresponding SSE mode for destination. */
988 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
989 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
990 nunits).require ();
991 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
992 nunits / 2).require ();
993
994 /* Get the corresponding SSE mode for source. */
995 nunits = 16 / GET_MODE_SIZE (inner_smode);
996 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
997 nunits).require ();
998
999 /* Generate SSE pack with signed/unsigned saturation. */
1000 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
1001 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
1002 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
1003
1004 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1005 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1006 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
1007 op1, op2));
1008 emit_insn (insn);
1009
1010 ix86_move_vector_high_sse_to_mmx (op0);
1011 }
1012
1013 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
1014
1015 void
1016 ix86_split_mmx_punpck (rtx operands[], bool high_p)
1017 {
1018 rtx op0 = operands[0];
1019 rtx op1 = operands[1];
1020 rtx op2 = operands[2];
1021 machine_mode mode = GET_MODE (op0);
1022 rtx mask;
1023 /* The corresponding SSE mode. */
1024 machine_mode sse_mode, double_sse_mode;
1025
1026 switch (mode)
1027 {
1028 case E_V4QImode:
1029 case E_V8QImode:
1030 sse_mode = V16QImode;
1031 double_sse_mode = V32QImode;
1032 mask = gen_rtx_PARALLEL (VOIDmode,
1033 gen_rtvec (16,
1034 GEN_INT (0), GEN_INT (16),
1035 GEN_INT (1), GEN_INT (17),
1036 GEN_INT (2), GEN_INT (18),
1037 GEN_INT (3), GEN_INT (19),
1038 GEN_INT (4), GEN_INT (20),
1039 GEN_INT (5), GEN_INT (21),
1040 GEN_INT (6), GEN_INT (22),
1041 GEN_INT (7), GEN_INT (23)));
1042 break;
1043
1044 case E_V4HImode:
1045 case E_V2HImode:
1046 sse_mode = V8HImode;
1047 double_sse_mode = V16HImode;
1048 mask = gen_rtx_PARALLEL (VOIDmode,
1049 gen_rtvec (8,
1050 GEN_INT (0), GEN_INT (8),
1051 GEN_INT (1), GEN_INT (9),
1052 GEN_INT (2), GEN_INT (10),
1053 GEN_INT (3), GEN_INT (11)));
1054 break;
1055
1056 case E_V2SImode:
1057 sse_mode = V4SImode;
1058 double_sse_mode = V8SImode;
1059 mask = gen_rtx_PARALLEL (VOIDmode,
1060 gen_rtvec (4,
1061 GEN_INT (0), GEN_INT (4),
1062 GEN_INT (1), GEN_INT (5)));
1063 break;
1064
1065 case E_V2SFmode:
1066 sse_mode = V4SFmode;
1067 double_sse_mode = V8SFmode;
1068 mask = gen_rtx_PARALLEL (VOIDmode,
1069 gen_rtvec (4,
1070 GEN_INT (0), GEN_INT (4),
1071 GEN_INT (1), GEN_INT (5)));
1072 break;
1073
1074 default:
1075 gcc_unreachable ();
1076 }
1077
1078 /* Generate SSE punpcklXX. */
1079 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1080 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1081 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1082
1083 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1084 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1085 rtx insn = gen_rtx_SET (dest, op2);
1086 emit_insn (insn);
1087
1088 /* Move high bits to low bits. */
1089 if (high_p)
1090 {
1091 if (sse_mode == V4SFmode)
1092 {
1093 mask = gen_rtx_PARALLEL (VOIDmode,
1094 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1095 GEN_INT (4), GEN_INT (5)));
1096 op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
1097 op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1098 }
1099 else
1100 {
1101 int sz = GET_MODE_SIZE (mode);
1102
1103 if (sz == 4)
1104 mask = gen_rtx_PARALLEL (VOIDmode,
1105 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1106 GEN_INT (0), GEN_INT (1)));
1107 else if (sz == 8)
1108 mask = gen_rtx_PARALLEL (VOIDmode,
1109 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1110 GEN_INT (0), GEN_INT (1)));
1111 else
1112 gcc_unreachable ();
1113
1114 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
1115 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1116 }
1117
1118 insn = gen_rtx_SET (dest, op1);
1119 emit_insn (insn);
1120 }
1121 }
1122
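/* Added illustrative note (not from the upstream sources): e.g. for a
   V8QImode punpcklbw, the two 8-byte operands are viewed as V16QImode,
   concatenated to V32QImode and selected with the interleave mask
   above, which is the RTL shape the SSE punpcklbw pattern matches.  For
   the high-part variants the interesting bytes then sit in bits 64..127
   of the SSE register, and the trailing pshufd-style select moves them
   back down to the low half where the MMX-sized result is expected.  */
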
1123 /* Helper function of ix86_fixup_binary_operands to canonicalize
1124 operand order. Returns true if the operands should be swapped. */
1125
1126 static bool
1127 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1128 rtx operands[])
1129 {
1130 rtx dst = operands[0];
1131 rtx src1 = operands[1];
1132 rtx src2 = operands[2];
1133
1134 /* If the operation is not commutative, we can't do anything. */
1135 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1136 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1137 return false;
1138
1139 /* Highest priority is that src1 should match dst. */
1140 if (rtx_equal_p (dst, src1))
1141 return false;
1142 if (rtx_equal_p (dst, src2))
1143 return true;
1144
1145 /* Next highest priority is that immediate constants come second. */
1146 if (immediate_operand (src2, mode))
1147 return false;
1148 if (immediate_operand (src1, mode))
1149 return true;
1150
1151 /* Lowest priority is that memory references should come second. */
1152 if (MEM_P (src2))
1153 return false;
1154 if (MEM_P (src1))
1155 return true;
1156
1157 return false;
1158 }
1159
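/* Added example (illustrative): for a commutative PLUS written as
   r1 = 3 + r1, both rules above ask for a swap -- src2 matches the
   destination and the immediate should come second -- so the operands
   are reordered to r1 = r1 + 3, which the two-address add pattern can
   accept directly.  */
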
1160
1161 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1162 destination to use for the operation. If different from the true
1163 destination in operands[0], a copy operation will be required. */
1164
1165 rtx
1166 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1167 rtx operands[])
1168 {
1169 rtx dst = operands[0];
1170 rtx src1 = operands[1];
1171 rtx src2 = operands[2];
1172
1173 /* Canonicalize operand order. */
1174 if (ix86_swap_binary_operands_p (code, mode, operands))
1175 {
1176 /* It is invalid to swap operands of different modes. */
1177 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1178
1179 std::swap (src1, src2);
1180 }
1181
1182 /* Both source operands cannot be in memory. */
1183 if (MEM_P (src1) && MEM_P (src2))
1184 {
1185 /* Optimization: Only read from memory once. */
1186 if (rtx_equal_p (src1, src2))
1187 {
1188 src2 = force_reg (mode, src2);
1189 src1 = src2;
1190 }
1191 else if (rtx_equal_p (dst, src1))
1192 src2 = force_reg (mode, src2);
1193 else
1194 src1 = force_reg (mode, src1);
1195 }
1196
1197 /* If the destination is memory, and we do not have matching source
1198 operands, do things in registers. */
1199 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1200 dst = gen_reg_rtx (mode);
1201
1202 /* Source 1 cannot be a constant. */
1203 if (CONSTANT_P (src1))
1204 src1 = force_reg (mode, src1);
1205
1206 /* Source 1 cannot be a non-matching memory. */
1207 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1208 src1 = force_reg (mode, src1);
1209
1210 /* Improve address combine. */
1211 if (code == PLUS
1212 && GET_MODE_CLASS (mode) == MODE_INT
1213 && MEM_P (src2))
1214 src2 = force_reg (mode, src2);
1215
1216 operands[1] = src1;
1217 operands[2] = src2;
1218 return dst;
1219 }
1220
1221 /* Similarly, but assume that the destination has already been
1222 set up properly. */
1223
1224 void
1225 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1226 machine_mode mode, rtx operands[])
1227 {
1228 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1229 gcc_assert (dst == operands[0]);
1230 }
1231
1232 /* Attempt to expand a binary operator. Make the expansion closer to the
1233 actual machine than just general_operand, which would allow 3 separate
1234 memory references (one output, two inputs) in a single insn. */
1235
1236 void
1237 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1238 rtx operands[])
1239 {
1240 rtx src1, src2, dst, op, clob;
1241
1242 dst = ix86_fixup_binary_operands (code, mode, operands);
1243 src1 = operands[1];
1244 src2 = operands[2];
1245
1246 /* Emit the instruction. */
1247
1248 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1249
1250 if (reload_completed
1251 && code == PLUS
1252 && !rtx_equal_p (dst, src1))
1253 {
1254 /* This is going to be an LEA; avoid splitting it later. */
1255 emit_insn (op);
1256 }
1257 else
1258 {
1259 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1260 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1261 }
1262
1263 /* Fix up the destination if needed. */
1264 if (dst != operands[0])
1265 emit_move_insn (operands[0], dst);
1266 }
1267
1268 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1269 the given OPERANDS. */
1270
1271 void
1272 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1273 rtx operands[])
1274 {
1275 rtx op1 = NULL_RTX, op2 = NULL_RTX;
1276 if (SUBREG_P (operands[1]))
1277 {
1278 op1 = operands[1];
1279 op2 = operands[2];
1280 }
1281 else if (SUBREG_P (operands[2]))
1282 {
1283 op1 = operands[2];
1284 op2 = operands[1];
1285 }
1286 /* Optimize (__m128i) d | (__m128i) e and similar code
1287 when d and e are float vectors into float vector logical
1288 insn. In C/C++ without using intrinsics there is no other way
1289 to express vector logical operation on float vectors than
1290 to cast them temporarily to integer vectors. */
1291 if (op1
1292 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1293 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
1294 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1295 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1296 && SUBREG_BYTE (op1) == 0
1297 && (GET_CODE (op2) == CONST_VECTOR
1298 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1299 && SUBREG_BYTE (op2) == 0))
1300 && can_create_pseudo_p ())
1301 {
1302 rtx dst;
1303 switch (GET_MODE (SUBREG_REG (op1)))
1304 {
1305 case E_V4SFmode:
1306 case E_V8SFmode:
1307 case E_V16SFmode:
1308 case E_V2DFmode:
1309 case E_V4DFmode:
1310 case E_V8DFmode:
1311 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1312 if (GET_CODE (op2) == CONST_VECTOR)
1313 {
1314 op2 = gen_lowpart (GET_MODE (dst), op2);
1315 op2 = force_reg (GET_MODE (dst), op2);
1316 }
1317 else
1318 {
1319 op1 = operands[1];
1320 op2 = SUBREG_REG (operands[2]);
1321 if (!vector_operand (op2, GET_MODE (dst)))
1322 op2 = force_reg (GET_MODE (dst), op2);
1323 }
1324 op1 = SUBREG_REG (op1);
1325 if (!vector_operand (op1, GET_MODE (dst)))
1326 op1 = force_reg (GET_MODE (dst), op1);
1327 emit_insn (gen_rtx_SET (dst,
1328 gen_rtx_fmt_ee (code, GET_MODE (dst),
1329 op1, op2)));
1330 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1331 return;
1332 default:
1333 break;
1334 }
1335 }
1336 if (!vector_operand (operands[1], mode))
1337 operands[1] = force_reg (mode, operands[1]);
1338 if (!vector_operand (operands[2], mode))
1339 operands[2] = force_reg (mode, operands[2]);
1340 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1341 emit_insn (gen_rtx_SET (operands[0],
1342 gen_rtx_fmt_ee (code, mode, operands[1],
1343 operands[2])));
1344 }
1345
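/* Added example of the source pattern handled above (illustrative):

     typedef float v4sf __attribute__ ((vector_size (16)));
     typedef int   v4si __attribute__ ((vector_size (16)));

     v4sf a, b;
     v4si x = (v4si) a | (v4si) b;

   Without intrinsics the casts are the only way to express a logical OR
   of float vectors, and the code above rewrites such an IOR to operate
   on the float-vector mode (e.g. orps rather than por) when that is not
   penalized on the target.  */
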
1346 /* Return TRUE or FALSE depending on whether the binary operator meets the
1347 appropriate constraints. */
1348
1349 bool
1350 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1351 rtx operands[3])
1352 {
1353 rtx dst = operands[0];
1354 rtx src1 = operands[1];
1355 rtx src2 = operands[2];
1356
1357 /* Both source operands cannot be in memory. */
1358 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1359 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1360 return false;
1361
1362 /* Canonicalize operand order for commutative operators. */
1363 if (ix86_swap_binary_operands_p (code, mode, operands))
1364 std::swap (src1, src2);
1365
1366 /* If the destination is memory, we must have a matching source operand. */
1367 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1368 return false;
1369
1370 /* Source 1 cannot be a constant. */
1371 if (CONSTANT_P (src1))
1372 return false;
1373
1374 /* Source 1 cannot be a non-matching memory. */
1375 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1376 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1377 return (code == AND
1378 && (mode == HImode
1379 || mode == SImode
1380 || (TARGET_64BIT && mode == DImode))
1381 && satisfies_constraint_L (src2));
1382
1383 return true;
1384 }
1385
1386 /* Attempt to expand a unary operator. Make the expansion closer to the
1387 actual machine than just general_operand, which would allow 2 separate
1388 memory references (one output, one input) in a single insn. */
1389
1390 void
1391 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1392 rtx operands[])
1393 {
1394 bool matching_memory = false;
1395 rtx src, dst, op, clob;
1396
1397 dst = operands[0];
1398 src = operands[1];
1399
1400 /* If the destination is memory, and we do not have matching source
1401 operands, do things in registers. */
1402 if (MEM_P (dst))
1403 {
1404 if (rtx_equal_p (dst, src))
1405 matching_memory = true;
1406 else
1407 dst = gen_reg_rtx (mode);
1408 }
1409
1410 /* When source operand is memory, destination must match. */
1411 if (MEM_P (src) && !matching_memory)
1412 src = force_reg (mode, src);
1413
1414 /* Emit the instruction. */
1415
1416 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1417
1418 if (code == NOT)
1419 emit_insn (op);
1420 else
1421 {
1422 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1423 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1424 }
1425
1426 /* Fix up the destination if needed. */
1427 if (dst != operands[0])
1428 emit_move_insn (operands[0], dst);
1429 }
1430
1431 /* Predict just emitted jump instruction to be taken with probability PROB. */
1432
1433 static void
1434 predict_jump (int prob)
1435 {
1436 rtx_insn *insn = get_last_insn ();
1437 gcc_assert (JUMP_P (insn));
1438 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1439 }
1440
1441 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1442 divisor are within the range [0-255]. */
1443
1444 void
1445 ix86_split_idivmod (machine_mode mode, rtx operands[],
1446 bool unsigned_p)
1447 {
1448 rtx_code_label *end_label, *qimode_label;
1449 rtx div, mod;
1450 rtx_insn *insn;
1451 rtx scratch, tmp0, tmp1, tmp2;
1452 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1453
1454 operands[2] = force_reg (mode, operands[2]);
1455 operands[3] = force_reg (mode, operands[3]);
1456
1457 switch (mode)
1458 {
1459 case E_SImode:
1460 if (GET_MODE (operands[0]) == SImode)
1461 {
1462 if (GET_MODE (operands[1]) == SImode)
1463 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1464 else
1465 gen_divmod4_1
1466 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1467 }
1468 else
1469 gen_divmod4_1
1470 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1471 break;
1472
1473 case E_DImode:
1474 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1475 break;
1476
1477 default:
1478 gcc_unreachable ();
1479 }
1480
1481 end_label = gen_label_rtx ();
1482 qimode_label = gen_label_rtx ();
1483
1484 scratch = gen_reg_rtx (mode);
1485
1486 /* Use 8bit unsigned divmod if dividend and divisor are within
1487 the range [0-255]. */
1488 emit_move_insn (scratch, operands[2]);
1489 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1490 scratch, 1, OPTAB_DIRECT);
1491 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1492 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1493 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1494 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1495 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1496 pc_rtx);
1497 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1498 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1499 JUMP_LABEL (insn) = qimode_label;
1500
1501 /* Generate original signed/unsigned divmod. */
1502 emit_insn (gen_divmod4_1 (operands[0], operands[1],
1503 operands[2], operands[3]));
1504
1505 /* Branch to the end. */
1506 emit_jump_insn (gen_jump (end_label));
1507 emit_barrier ();
1508
1509 /* Generate 8bit unsigned divide. */
1510 emit_label (qimode_label);
1511 /* Don't use operands[0] for result of 8bit divide since not all
1512 registers support QImode ZERO_EXTRACT. */
1513 tmp0 = lowpart_subreg (HImode, scratch, mode);
1514 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1515 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1516 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1517
1518 if (unsigned_p)
1519 {
1520 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1521 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1522 }
1523 else
1524 {
1525 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1526 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1527 }
1528 if (mode == SImode)
1529 {
1530 if (GET_MODE (operands[0]) != SImode)
1531 div = gen_rtx_ZERO_EXTEND (DImode, div);
1532 if (GET_MODE (operands[1]) != SImode)
1533 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1534 }
1535
1536 /* Extract remainder from AH. */
1537 scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1538 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1539 GEN_INT (8), GEN_INT (8));
1540 insn = emit_move_insn (operands[1], tmp1);
1541 set_unique_reg_note (insn, REG_EQUAL, mod);
1542
1543 /* Zero extend quotient from AL. */
1544 tmp1 = gen_lowpart (QImode, tmp0);
1545 insn = emit_insn (gen_extend_insn
1546 (operands[0], tmp1,
1547 GET_MODE (operands[0]), QImode, 1));
1548 set_unique_reg_note (insn, REG_EQUAL, div);
1549
1550 emit_label (end_label);
1551 }
1552
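/* Added sketch of the emitted control flow (illustrative, not verbatim):

     mov   <dividend>, <scratch>
     or    <divisor>,  <scratch>
     test  $-0x100, <scratch>      # any bits above the low 8 set?
     jz    .Lqimode                # no: both values fit in 8 bits
     ...full 32/64-bit signed or unsigned division...
     jmp   .Lend
   .Lqimode:
     ...16-bit by 8-bit unsigned divide; AL = quotient, AH = remainder...
     ...zero-extend AL into the quotient, extract AH as the remainder...
   .Lend:
*/
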
1553 /* Emit x86 binary operator CODE in mode MODE, where the first operand
1554 matches the destination. The RTX includes a clobber of FLAGS_REG. */
1555
1556 void
1557 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1558 rtx dst, rtx src)
1559 {
1560 rtx op, clob;
1561
1562 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1563 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1564
1565 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1566 }
1567
1568 /* Return true if the nearest preceding def of REGNO1 or REGNO2 is that of REGNO1. */
1569
1570 static bool
1571 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1572 {
1573 rtx_insn *prev = insn;
1574 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1575
1576 if (insn == start)
1577 return false;
1578 while (prev && prev != start)
1579 {
1580 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1581 {
1582 prev = PREV_INSN (prev);
1583 continue;
1584 }
1585 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1586 return true;
1587 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1588 return false;
1589 prev = PREV_INSN (prev);
1590 }
1591
1592 /* None of the regs is defined in the bb. */
1593 return false;
1594 }
1595
1596 /* INSN_UID of the last insn emitted by zero store peephole2s. */
1597 int ix86_last_zero_store_uid;
1598
1599 /* Split lea instructions into a sequence of instructions
1600 which are executed on the ALU to avoid AGU stalls.
1601 It is assumed that the flags register may be clobbered
1602 at the lea position. */
1603
1604 void
1605 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1606 {
1607 unsigned int regno0, regno1, regno2;
1608 struct ix86_address parts;
1609 rtx target, tmp;
1610 int ok, adds;
1611
1612 ok = ix86_decompose_address (operands[1], &parts);
1613 gcc_assert (ok);
1614
1615 target = gen_lowpart (mode, operands[0]);
1616
1617 regno0 = true_regnum (target);
1618 regno1 = INVALID_REGNUM;
1619 regno2 = INVALID_REGNUM;
1620
1621 if (parts.base)
1622 {
1623 parts.base = gen_lowpart (mode, parts.base);
1624 regno1 = true_regnum (parts.base);
1625 }
1626
1627 if (parts.index)
1628 {
1629 parts.index = gen_lowpart (mode, parts.index);
1630 regno2 = true_regnum (parts.index);
1631 }
1632
1633 if (parts.disp)
1634 parts.disp = gen_lowpart (mode, parts.disp);
1635
1636 if (parts.scale > 1)
1637 {
1638 /* Case r1 = r1 + ... */
1639 if (regno1 == regno0)
1640 {
1641 /* In the case r1 = r1 + C * r2 we would
1642 have to use multiplication, which is very
1643 expensive. Assume the cost model is wrong if we
1644 have such a case here. */
1645 gcc_assert (regno2 != regno0);
1646
1647 for (adds = parts.scale; adds > 0; adds--)
1648 ix86_emit_binop (PLUS, mode, target, parts.index);
1649 }
1650 else
1651 {
1652 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1653 if (regno0 != regno2)
1654 emit_insn (gen_rtx_SET (target, parts.index));
1655
1656 /* Use shift for scaling, but emit it as MULT instead
1657 to avoid it being immediately peephole2 optimized back
1658 into lea. */
1659 ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1660
1661 if (parts.base)
1662 ix86_emit_binop (PLUS, mode, target, parts.base);
1663
1664 if (parts.disp && parts.disp != const0_rtx)
1665 ix86_emit_binop (PLUS, mode, target, parts.disp);
1666 }
1667 }
1668 else if (!parts.base && !parts.index)
1669 {
1670 gcc_assert (parts.disp);
1671 emit_insn (gen_rtx_SET (target, parts.disp));
1672 }
1673 else
1674 {
1675 if (!parts.base)
1676 {
1677 if (regno0 != regno2)
1678 emit_insn (gen_rtx_SET (target, parts.index));
1679 }
1680 else if (!parts.index)
1681 {
1682 if (regno0 != regno1)
1683 emit_insn (gen_rtx_SET (target, parts.base));
1684 }
1685 else
1686 {
1687 if (regno0 == regno1)
1688 tmp = parts.index;
1689 else if (regno0 == regno2)
1690 tmp = parts.base;
1691 else
1692 {
1693 rtx tmp1;
1694
1695 /* Find better operand for SET instruction, depending
1696 on which definition is farther from the insn. */
1697 if (find_nearest_reg_def (insn, regno1, regno2))
1698 tmp = parts.index, tmp1 = parts.base;
1699 else
1700 tmp = parts.base, tmp1 = parts.index;
1701
1702 emit_insn (gen_rtx_SET (target, tmp));
1703
1704 if (parts.disp && parts.disp != const0_rtx)
1705 ix86_emit_binop (PLUS, mode, target, parts.disp);
1706
1707 ix86_emit_binop (PLUS, mode, target, tmp1);
1708 return;
1709 }
1710
1711 ix86_emit_binop (PLUS, mode, target, tmp);
1712 }
1713
1714 if (parts.disp && parts.disp != const0_rtx)
1715 ix86_emit_binop (PLUS, mode, target, parts.disp);
1716 }
1717 }
1718
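/* Added example (illustrative): an AGU-bound

     lea 0x4(%rbx,%rcx,2), %rax

   can be split by the code above into plain ALU instructions, roughly

     mov %rcx, %rax
     add %rax, %rax          (the scale, emitted as a MULT, later a shift)
     add %rbx, %rax
     add $0x4, %rax

   For the unscaled base+index form, find_nearest_reg_def picks which
   operand to copy first so the initial move does not have to wait on
   the most recently defined register.  */
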
1719 /* Post-reload splitter for converting an SF or DFmode value in an
1720 SSE register into an unsigned SImode. */
1721
1722 void
1723 ix86_split_convert_uns_si_sse (rtx operands[])
1724 {
1725 machine_mode vecmode;
1726 rtx value, large, zero_or_two31, input, two31, x;
1727
1728 large = operands[1];
1729 zero_or_two31 = operands[2];
1730 input = operands[3];
1731 two31 = operands[4];
1732 vecmode = GET_MODE (large);
1733 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1734
1735 /* Load up the value into the low element. We must ensure that the other
1736 elements are valid floats -- zero is the easiest such value. */
1737 if (MEM_P (input))
1738 {
1739 if (vecmode == V4SFmode)
1740 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1741 else
1742 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1743 }
1744 else
1745 {
1746 input = gen_rtx_REG (vecmode, REGNO (input));
1747 emit_move_insn (value, CONST0_RTX (vecmode));
1748 if (vecmode == V4SFmode)
1749 emit_insn (gen_sse_movss (value, value, input));
1750 else
1751 emit_insn (gen_sse2_movsd (value, value, input));
1752 }
1753
1754 emit_move_insn (large, two31);
1755 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1756
1757 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1758 emit_insn (gen_rtx_SET (large, x));
1759
1760 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1761 emit_insn (gen_rtx_SET (zero_or_two31, x));
1762
1763 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1764 emit_insn (gen_rtx_SET (value, x));
1765
1766 large = gen_rtx_REG (V4SImode, REGNO (large));
1767 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1768
1769 x = gen_rtx_REG (V4SImode, REGNO (value));
1770 if (vecmode == V4SFmode)
1771 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1772 else
1773 emit_insn (gen_sse2_cvttpd2dq (x, value));
1774 value = x;
1775
1776 emit_insn (gen_xorv4si3 (value, value, large));
1777 }
1778
1779 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1780 machine_mode mode, rtx target,
1781 rtx var, int one_var);
1782
1783 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1784 Expects the 64-bit DImode to be supplied in a pair of integral
1785 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1786 -mfpmath=sse, !optimize_size only. */
1787
1788 void
1789 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1790 {
1791 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1792 rtx int_xmm, fp_xmm;
1793 rtx biases, exponents;
1794 rtx x;
1795
1796 int_xmm = gen_reg_rtx (V4SImode);
1797 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1798 emit_insn (gen_movdi_to_sse (int_xmm, input));
1799 else if (TARGET_SSE_SPLIT_REGS)
1800 {
1801 emit_clobber (int_xmm);
1802 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1803 }
1804 else
1805 {
1806 x = gen_reg_rtx (V2DImode);
1807 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1808 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1809 }
1810
1811 x = gen_rtx_CONST_VECTOR (V4SImode,
1812 gen_rtvec (4, GEN_INT (0x43300000UL),
1813 GEN_INT (0x45300000UL),
1814 const0_rtx, const0_rtx));
1815 exponents = validize_mem (force_const_mem (V4SImode, x));
1816
1817 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1818 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1819
1820 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1821 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1822 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1823 (0x1.0p84 + double(fp_value_hi_xmm)).
1824 Note these exponents differ by 32. */
1825
1826 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1827
1828 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1829 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1830 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1831 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1832 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1833 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1834 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1835 biases = validize_mem (force_const_mem (V2DFmode, biases));
1836 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1837
1838 /* Add the upper and lower DFmode values together. */
1839 if (TARGET_SSE3)
1840 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1841 else
1842 {
1843 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1844 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1845 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1846 }
1847
1848 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1849 }
1850
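/* Added worked example for the bias trick above (illustrative): for the
   DImode input 0x0000000200000005 (hi word 2, lo word 5), the low word
   is interleaved under exponent pattern 0x43300000, giving the double
   0x1p52 + 5.0, and the high word under 0x45300000, giving
   0x1p84 + 2.0 * 0x1p32.  Subtracting the 0x1p52 and 0x1p84 biases
   leaves exactly 5.0 and 2.0 * 2^32, and the final horizontal add
   produces the full unsigned value 2 * 2^32 + 5 = 8589934597.0.  */
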
1851 /* Not used, but eases macroization of patterns. */
1852 void
1853 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1854 {
1855 gcc_unreachable ();
1856 }
1857
1858 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1859
1860 /* Convert an unsigned SImode value into a DFmode. Only currently used
1861 for SSE, but applicable anywhere. */
1862
1863 void
1864 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1865 {
1866 REAL_VALUE_TYPE TWO31r;
1867 rtx x, fp;
1868
1869 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1870 NULL, 1, OPTAB_DIRECT);
1871
1872 fp = gen_reg_rtx (DFmode);
1873 emit_insn (gen_floatsidf2 (fp, x));
1874
1875 real_ldexp (&TWO31r, &dconst1, 31);
1876 x = const_double_from_real_value (TWO31r, DFmode);
1877
1878 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1879
1880 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1881 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1882 x = ix86_expand_sse_fabs (x, NULL);
1883
1884 if (x != target)
1885 emit_move_insn (target, x);
1886 }
1887
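/* Added worked example (illustrative): the PLUS of -2^31 reinterprets
   the unsigned input as input - 2^31 in signed SImode, e.g. 0x80000005
   becomes 5 while 0x00000005 becomes 5 - 2^31; converting that signed
   value to double and adding 2^31.0 back recovers 2147483653.0
   resp. 5.0 exactly, since any 32-bit integer fits in the 53-bit
   significand.  */
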
1888 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1889 32-bit mode; otherwise we have a direct convert instruction. */
1890
1891 void
1892 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1893 {
1894 REAL_VALUE_TYPE TWO32r;
1895 rtx fp_lo, fp_hi, x;
1896
1897 fp_lo = gen_reg_rtx (DFmode);
1898 fp_hi = gen_reg_rtx (DFmode);
1899
1900 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1901
1902 real_ldexp (&TWO32r, &dconst1, 32);
1903 x = const_double_from_real_value (TWO32r, DFmode);
1904 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1905
1906 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1907
1908 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1909 0, OPTAB_DIRECT);
1910 if (x != target)
1911 emit_move_insn (target, x);
1912 }
1913
1914 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1915 For x86_32, -mfpmath=sse, !optimize_size only. */
1916 void
1917 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1918 {
1919 REAL_VALUE_TYPE ONE16r;
1920 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1921
1922 real_ldexp (&ONE16r, &dconst1, 16);
1923 x = const_double_from_real_value (ONE16r, SFmode);
1924 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1925 NULL, 0, OPTAB_DIRECT);
1926 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1927 NULL, 0, OPTAB_DIRECT);
1928 fp_hi = gen_reg_rtx (SFmode);
1929 fp_lo = gen_reg_rtx (SFmode);
1930 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1931 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1932 if (TARGET_FMA)
1933 {
1934 x = validize_mem (force_const_mem (SFmode, x));
1935 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
1936 emit_move_insn (target, fp_hi);
1937 }
1938 else
1939 {
1940 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1941 0, OPTAB_DIRECT);
1942 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1943 0, OPTAB_DIRECT);
1944 if (!rtx_equal_p (target, fp_hi))
1945 emit_move_insn (target, fp_hi);
1946 }
1947 }
1948
1949 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1950 a vector of unsigned ints VAL to vector of floats TARGET. */
1951
1952 void
1953 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1954 {
1955 rtx tmp[8];
1956 REAL_VALUE_TYPE TWO16r;
1957 machine_mode intmode = GET_MODE (val);
1958 machine_mode fltmode = GET_MODE (target);
1959 rtx (*cvt) (rtx, rtx);
1960
1961 if (intmode == V4SImode)
1962 cvt = gen_floatv4siv4sf2;
1963 else
1964 cvt = gen_floatv8siv8sf2;
1965 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1966 tmp[0] = force_reg (intmode, tmp[0]);
1967 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1968 OPTAB_DIRECT);
1969 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1970 NULL_RTX, 1, OPTAB_DIRECT);
1971 tmp[3] = gen_reg_rtx (fltmode);
1972 emit_insn (cvt (tmp[3], tmp[1]));
1973 tmp[4] = gen_reg_rtx (fltmode);
1974 emit_insn (cvt (tmp[4], tmp[2]));
1975 real_ldexp (&TWO16r, &dconst1, 16);
1976 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1977 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1978 if (TARGET_FMA)
1979 {
1980 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
1981 emit_move_insn (target, tmp[6]);
1982 }
1983 else
1984 {
1985 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
1986 NULL_RTX, 1, OPTAB_DIRECT);
1987 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
1988 target, 1, OPTAB_DIRECT);
1989 if (tmp[7] != target)
1990 emit_move_insn (target, tmp[7]);
1991 }
1992 }
1993
1994 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1995 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1996 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1997 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
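/* A worked example of the adjustment (a per-element sketch): for
   val = 3000000000.0, which is >= 0x1p31,
     adjusted = 3000000000.0 - 2147483648.0 = 852516352.0
     fix_trunc (adjusted)    = 0x32d05e00
     0x32d05e00 ^ 0x80000000 = 0xb2d05e00 = 3000000000.
   Elements below 0x1p31 are left alone and their XOR mask lane is zero.  */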
1998
1999 rtx
2000 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
2001 {
2002 REAL_VALUE_TYPE TWO31r;
2003 rtx two31r, tmp[4];
2004 machine_mode mode = GET_MODE (val);
2005 machine_mode scalarmode = GET_MODE_INNER (mode);
2006 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2007 rtx (*cmp) (rtx, rtx, rtx, rtx);
2008 int i;
2009
2010 for (i = 0; i < 3; i++)
2011 tmp[i] = gen_reg_rtx (mode);
2012 real_ldexp (&TWO31r, &dconst1, 31);
2013 two31r = const_double_from_real_value (TWO31r, scalarmode);
2014 two31r = ix86_build_const_vector (mode, 1, two31r);
2015 two31r = force_reg (mode, two31r);
2016 switch (mode)
2017 {
2018 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2019 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2020 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2021 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2022 default: gcc_unreachable ();
2023 }
2024 tmp[3] = gen_rtx_LE (mode, two31r, val);
2025 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2026 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2027 0, OPTAB_DIRECT);
2028 if (intmode == V4SImode || TARGET_AVX2)
2029 *xorp = expand_simple_binop (intmode, ASHIFT,
2030 gen_lowpart (intmode, tmp[0]),
2031 GEN_INT (31), NULL_RTX, 0,
2032 OPTAB_DIRECT);
2033 else
2034 {
2035 rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2036 two31 = ix86_build_const_vector (intmode, 1, two31);
2037 *xorp = expand_simple_binop (intmode, AND,
2038 gen_lowpart (intmode, tmp[0]),
2039 two31, NULL_RTX, 0,
2040 OPTAB_DIRECT);
2041 }
2042 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2043 0, OPTAB_DIRECT);
2044 }
2045
2046 /* Generate code for floating point ABS or NEG. */
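/* Both operations are implemented as bitwise mask operations on the sign
   bit; e.g. for SFmode the masks built below amount to (a sketch, not the
   emitted RTL):
     abs:  x & 0x7fffffff   (clear the sign bit)
     neg:  x ^ 0x80000000   (flip the sign bit)
   widened to the containing vector mode when the work is done in SSE
   registers.  */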
2047
2048 void
2049 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2050 rtx operands[])
2051 {
2052 rtx set, dst, src;
2053 bool use_sse = false;
2054 bool vector_mode = VECTOR_MODE_P (mode);
2055 machine_mode vmode = mode;
2056 rtvec par;
2057
2058 if (vector_mode || mode == TFmode || mode == HFmode)
2059 {
2060 use_sse = true;
2061 if (mode == HFmode)
2062 vmode = V8HFmode;
2063 }
2064 else if (TARGET_SSE_MATH)
2065 {
2066 use_sse = SSE_FLOAT_MODE_P (mode);
2067 if (mode == SFmode)
2068 vmode = V4SFmode;
2069 else if (mode == DFmode)
2070 vmode = V2DFmode;
2071 }
2072
2073 dst = operands[0];
2074 src = operands[1];
2075
2076 set = gen_rtx_fmt_e (code, mode, src);
2077 set = gen_rtx_SET (dst, set);
2078
2079 if (use_sse)
2080 {
2081 rtx mask, use, clob;
2082
2083 /* NEG and ABS performed with SSE use bitwise mask operations.
2084 Create the appropriate mask now. */
2085 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2086 use = gen_rtx_USE (VOIDmode, mask);
2087 if (vector_mode || mode == TFmode)
2088 par = gen_rtvec (2, set, use);
2089 else
2090 {
2091 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2092 par = gen_rtvec (3, set, use, clob);
2093 }
2094 }
2095 else
2096 {
2097 rtx clob;
2098
2099 /* Changing the sign of FP values can be done with the integer unit too. */
2100 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2101 par = gen_rtvec (2, set, clob);
2102 }
2103
2104 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2105 }
2106
2107 /* Deconstruct a floating point ABS or NEG operation
2108 with integer registers into integer operations. */
2109
2110 void
2111 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2112 rtx operands[])
2113 {
2114 enum rtx_code absneg_op;
2115 rtx dst, set;
2116
2117 gcc_assert (operands_match_p (operands[0], operands[1]));
2118
2119 switch (mode)
2120 {
2121 case E_SFmode:
2122 dst = gen_lowpart (SImode, operands[0]);
2123
2124 if (code == ABS)
2125 {
2126 set = gen_int_mode (0x7fffffff, SImode);
2127 absneg_op = AND;
2128 }
2129 else
2130 {
2131 set = gen_int_mode (0x80000000, SImode);
2132 absneg_op = XOR;
2133 }
2134 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2135 break;
2136
2137 case E_DFmode:
2138 if (TARGET_64BIT)
2139 {
2140 dst = gen_lowpart (DImode, operands[0]);
2141 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2142
2143 if (code == ABS)
2144 set = const0_rtx;
2145 else
2146 set = gen_rtx_NOT (DImode, dst);
2147 }
2148 else
2149 {
2150 dst = gen_highpart (SImode, operands[0]);
2151
2152 if (code == ABS)
2153 {
2154 set = gen_int_mode (0x7fffffff, SImode);
2155 absneg_op = AND;
2156 }
2157 else
2158 {
2159 set = gen_int_mode (0x80000000, SImode);
2160 absneg_op = XOR;
2161 }
2162 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2163 }
2164 break;
2165
2166 case E_XFmode:
2167 dst = gen_rtx_REG (SImode,
2168 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2169 if (code == ABS)
2170 {
2171 set = GEN_INT (0x7fff);
2172 absneg_op = AND;
2173 }
2174 else
2175 {
2176 set = GEN_INT (0x8000);
2177 absneg_op = XOR;
2178 }
2179 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2180 break;
2181
2182 default:
2183 gcc_unreachable ();
2184 }
2185
2186 set = gen_rtx_SET (dst, set);
2187
2188 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2189 rtvec par = gen_rtvec (2, set, clob);
2190
2191 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2192 }
2193
2194 /* Expand a copysign operation. Special case operand 0 being a constant. */
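/* The general lowering used below is, roughly,
     dest = (op1 & ~signmask) | (op2 & signmask)
   i.e. magnitude from operand 1 and sign from operand 2; a nonzero constant
   operand 1 has its absolute value materialized directly, and a zero
   constant degenerates to dest = signmask & op2.  */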
2195
2196 void
2197 ix86_expand_copysign (rtx operands[])
2198 {
2199 machine_mode mode, vmode;
2200 rtx dest, vdest, op0, op1, mask, op2, op3;
2201
2202 mode = GET_MODE (operands[0]);
2203
2204 if (mode == HFmode)
2205 vmode = V8HFmode;
2206 else if (mode == SFmode)
2207 vmode = V4SFmode;
2208 else if (mode == DFmode)
2209 vmode = V2DFmode;
2210 else if (mode == TFmode)
2211 vmode = mode;
2212 else
2213 gcc_unreachable ();
2214
2215 if (rtx_equal_p (operands[1], operands[2]))
2216 {
2217 emit_move_insn (operands[0], operands[1]);
2218 return;
2219 }
2220
2221 dest = operands[0];
2222 vdest = lowpart_subreg (vmode, dest, mode);
2223 if (vdest == NULL_RTX)
2224 vdest = gen_reg_rtx (vmode);
2225 else
2226 dest = NULL_RTX;
2227 op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
2228 mask = ix86_build_signbit_mask (vmode, 0, 0);
2229
2230 if (CONST_DOUBLE_P (operands[1]))
2231 {
2232 op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
2233 /* Optimize for 0; simplify b = copysignf (0.0f, a) to b = mask & a. */
2234 if (op0 == CONST0_RTX (mode))
2235 {
2236 emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
2237 if (dest)
2238 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2239 return;
2240 }
2241
2242 if (GET_MODE_SIZE (mode) < 16)
2243 op0 = ix86_build_const_vector (vmode, false, op0);
2244 op0 = force_reg (vmode, op0);
2245 }
2246 else
2247 op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
2248
2249 op2 = gen_reg_rtx (vmode);
2250 op3 = gen_reg_rtx (vmode);
2251 emit_move_insn (op2, gen_rtx_AND (vmode,
2252 gen_rtx_NOT (vmode, mask),
2253 op0));
2254 emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
2255 emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2256 if (dest)
2257 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2258 }
2259
2260 /* Expand an xorsign operation. */
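/* xorsign (a, b) computes a * copysign (1.0, b) without a multiply; the
   expansion below is essentially (sketched in scalar terms)
     dest = a ^ (b & signmask)
   so only the sign bit of B is xored into A.  */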
2261
2262 void
2263 ix86_expand_xorsign (rtx operands[])
2264 {
2265 machine_mode mode, vmode;
2266 rtx dest, vdest, op0, op1, mask, x, temp;
2267
2268 dest = operands[0];
2269 op0 = operands[1];
2270 op1 = operands[2];
2271
2272 mode = GET_MODE (dest);
2273
2274 if (mode == HFmode)
2275 vmode = V8HFmode;
2276 else if (mode == SFmode)
2277 vmode = V4SFmode;
2278 else if (mode == DFmode)
2279 vmode = V2DFmode;
2280 else
2281 gcc_unreachable ();
2282
2283 temp = gen_reg_rtx (vmode);
2284 mask = ix86_build_signbit_mask (vmode, 0, 0);
2285
2286 op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
2287 x = gen_rtx_AND (vmode, op1, mask);
2288 emit_insn (gen_rtx_SET (temp, x));
2289
2290 op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
2291 x = gen_rtx_XOR (vmode, temp, op0);
2292
2293 vdest = lowpart_subreg (vmode, dest, mode);
2294 if (vdest == NULL_RTX)
2295 vdest = gen_reg_rtx (vmode);
2296 else
2297 dest = NULL_RTX;
2298 emit_insn (gen_rtx_SET (vdest, x));
2299
2300 if (dest)
2301 emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2302 }
2303
2304 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2305
2306 void
2307 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2308 {
2309 machine_mode mode = GET_MODE (op0);
2310 rtx tmp;
2311
2312 /* Handle the special case of a vector comparison with a boolean result;
2313 transform it using the ptest instruction. */
2314 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2315 || mode == OImode)
2316 {
2317 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2318 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2319
2320 gcc_assert (code == EQ || code == NE);
2321
2322 if (mode == OImode)
2323 {
2324 op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2325 op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2326 mode = p_mode;
2327 }
2328 /* Generate XOR since we can't check that one operand is the zero vector. */
2329 tmp = gen_reg_rtx (mode);
2330 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2331 tmp = gen_lowpart (p_mode, tmp);
2332 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2333 gen_rtx_UNSPEC (CCmode,
2334 gen_rtvec (2, tmp, tmp),
2335 UNSPEC_PTEST)));
2336 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2337 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2338 gen_rtx_LABEL_REF (VOIDmode, label),
2339 pc_rtx);
2340 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2341 return;
2342 }
2343
2344 switch (mode)
2345 {
2346 case E_HFmode:
2347 case E_SFmode:
2348 case E_DFmode:
2349 case E_XFmode:
2350 case E_QImode:
2351 case E_HImode:
2352 case E_SImode:
2353 simple:
2354 tmp = ix86_expand_compare (code, op0, op1);
2355 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2356 gen_rtx_LABEL_REF (VOIDmode, label),
2357 pc_rtx);
2358 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2359 return;
2360
2361 case E_DImode:
2362 if (TARGET_64BIT)
2363 goto simple;
2364 /* FALLTHRU */
2365 case E_TImode:
2366 /* DI and TI mode equality/inequality comparisons may be performed
2367 on SSE registers. Avoid splitting them, except when optimizing
2368 for size. */
2369 if ((code == EQ || code == NE)
2370 && !optimize_insn_for_size_p ())
2371 goto simple;
2372
2373 /* Expand DImode branch into multiple compare+branch. */
2374 {
2375 rtx lo[2], hi[2];
2376 rtx_code_label *label2;
2377 enum rtx_code code1, code2, code3;
2378 machine_mode submode;
2379
2380 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2381 {
2382 std::swap (op0, op1);
2383 code = swap_condition (code);
2384 }
2385
2386 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2387 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2388
2389 submode = mode == DImode ? SImode : DImode;
2390
2391 /* If we are doing a less-than or greater-or-equal comparison,
2392 op1 is a constant and the low word is zero, then we can just
2393 examine the high word. Similarly for a low word of -1 and
2394 less-or-equal or greater-than. */
2395
2396 if (CONST_INT_P (hi[1]))
2397 switch (code)
2398 {
2399 case LT: case LTU: case GE: case GEU:
2400 if (lo[1] == const0_rtx)
2401 {
2402 ix86_expand_branch (code, hi[0], hi[1], label);
2403 return;
2404 }
2405 break;
2406 case LE: case LEU: case GT: case GTU:
2407 if (lo[1] == constm1_rtx)
2408 {
2409 ix86_expand_branch (code, hi[0], hi[1], label);
2410 return;
2411 }
2412 break;
2413 default:
2414 break;
2415 }
2416
2417 /* Emulate comparisons that do not depend on the Zero flag with a
2418 double-word subtraction. Note that only the Overflow, Sign
2419 and Carry flags are valid, so swap the arguments and condition
2420 of comparisons that would otherwise test the Zero flag. */
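/* E.g. a signed DImode a < b on 32-bit is lowered to roughly
     cmpl  lo[1], lo[0]        ; compare the low words, producing a borrow
     sbbl  hi[1], hi[0]        ; result discarded, only the flags are kept
     jl    label               ; test LT on the combined flags
   The LE/LEU/GT/GTU cases are first reduced to this shape by swapping the
   operands and the condition, as described above.  */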
2421
2422 switch (code)
2423 {
2424 case LE: case LEU: case GT: case GTU:
2425 std::swap (lo[0], lo[1]);
2426 std::swap (hi[0], hi[1]);
2427 code = swap_condition (code);
2428 /* FALLTHRU */
2429
2430 case LT: case LTU: case GE: case GEU:
2431 {
2432 bool uns = (code == LTU || code == GEU);
2433 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2434 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2435
2436 if (!nonimmediate_operand (lo[0], submode))
2437 lo[0] = force_reg (submode, lo[0]);
2438 if (!x86_64_general_operand (lo[1], submode))
2439 lo[1] = force_reg (submode, lo[1]);
2440
2441 if (!register_operand (hi[0], submode))
2442 hi[0] = force_reg (submode, hi[0]);
2443 if ((uns && !nonimmediate_operand (hi[1], submode))
2444 || (!uns && !x86_64_general_operand (hi[1], submode)))
2445 hi[1] = force_reg (submode, hi[1]);
2446
2447 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2448
2449 tmp = gen_rtx_SCRATCH (submode);
2450 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2451
2452 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2453 ix86_expand_branch (code, tmp, const0_rtx, label);
2454 return;
2455 }
2456
2457 default:
2458 break;
2459 }
2460
2461 /* Otherwise, we need two or three jumps. */
2462
2463 label2 = gen_label_rtx ();
2464
2465 code1 = code;
2466 code2 = swap_condition (code);
2467 code3 = unsigned_condition (code);
2468
2469 switch (code)
2470 {
2471 case LT: case GT: case LTU: case GTU:
2472 break;
2473
2474 case LE: code1 = LT; code2 = GT; break;
2475 case GE: code1 = GT; code2 = LT; break;
2476 case LEU: code1 = LTU; code2 = GTU; break;
2477 case GEU: code1 = GTU; code2 = LTU; break;
2478
2479 case EQ: code1 = UNKNOWN; code2 = NE; break;
2480 case NE: code2 = UNKNOWN; break;
2481
2482 default:
2483 gcc_unreachable ();
2484 }
2485
2486 /*
2487 * a < b =>
2488 * if (hi(a) < hi(b)) goto true;
2489 * if (hi(a) > hi(b)) goto false;
2490 * if (lo(a) < lo(b)) goto true;
2491 * false:
2492 */
2493
2494 if (code1 != UNKNOWN)
2495 ix86_expand_branch (code1, hi[0], hi[1], label);
2496 if (code2 != UNKNOWN)
2497 ix86_expand_branch (code2, hi[0], hi[1], label2);
2498
2499 ix86_expand_branch (code3, lo[0], lo[1], label);
2500
2501 if (code2 != UNKNOWN)
2502 emit_label (label2);
2503 return;
2504 }
2505
2506 default:
2507 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2508 goto simple;
2509 }
2510 }
2511
2512 /* Figure out whether to use unordered fp comparisons. */
2513
2514 static bool
2515 ix86_unordered_fp_compare (enum rtx_code code)
2516 {
2517 if (!TARGET_IEEE_FP)
2518 return false;
2519
2520 switch (code)
2521 {
2522 case LT:
2523 case LE:
2524 case GT:
2525 case GE:
2526 case LTGT:
2527 return false;
2528
2529 case EQ:
2530 case NE:
2531
2532 case UNORDERED:
2533 case ORDERED:
2534 case UNLT:
2535 case UNLE:
2536 case UNGT:
2537 case UNGE:
2538 case UNEQ:
2539 return true;
2540
2541 default:
2542 gcc_unreachable ();
2543 }
2544 }
2545
2546 /* Return a comparison we can do that is equivalent to
2547 swap_condition (code), except possibly for orderedness.
2548 But never change orderedness if TARGET_IEEE_FP, returning
2549 UNKNOWN in that case if necessary. */
2550
2551 static enum rtx_code
2552 ix86_fp_swap_condition (enum rtx_code code)
2553 {
2554 switch (code)
2555 {
2556 case GT: /* GTU - CF=0 & ZF=0 */
2557 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2558 case GE: /* GEU - CF=0 */
2559 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2560 case UNLT: /* LTU - CF=1 */
2561 return TARGET_IEEE_FP ? UNKNOWN : GT;
2562 case UNLE: /* LEU - CF=1 | ZF=1 */
2563 return TARGET_IEEE_FP ? UNKNOWN : GE;
2564 default:
2565 return swap_condition (code);
2566 }
2567 }
2568
2569 /* Return the cost of comparison CODE using the best strategy for performance.
2570 All of the following functions use the number of instructions as the cost metric.
2571 In the future this should be tweaked to compute bytes for optimize_size and
2572 take into account the performance of various instructions on various CPUs. */
2573
2574 static int
2575 ix86_fp_comparison_cost (enum rtx_code code)
2576 {
2577 int arith_cost;
2578
2579 /* The cost of code using bit-twiddling on %ah. */
2580 switch (code)
2581 {
2582 case UNLE:
2583 case UNLT:
2584 case LTGT:
2585 case GT:
2586 case GE:
2587 case UNORDERED:
2588 case ORDERED:
2589 case UNEQ:
2590 arith_cost = 4;
2591 break;
2592 case LT:
2593 case NE:
2594 case EQ:
2595 case UNGE:
2596 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2597 break;
2598 case LE:
2599 case UNGT:
2600 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2601 break;
2602 default:
2603 gcc_unreachable ();
2604 }
2605
2606 switch (ix86_fp_comparison_strategy (code))
2607 {
2608 case IX86_FPCMP_COMI:
2609 return arith_cost > 4 ? 3 : 2;
2610 case IX86_FPCMP_SAHF:
2611 return arith_cost > 4 ? 4 : 3;
2612 default:
2613 return arith_cost;
2614 }
2615 }
2616
2617 /* Swap, force into registers, or otherwise massage the two operands
2618 to a fp comparison. The operands are updated in place; the new
2619 comparison code is returned. */
2620
2621 static enum rtx_code
2622 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2623 {
2624 bool unordered_compare = ix86_unordered_fp_compare (code);
2625 rtx op0 = *pop0, op1 = *pop1;
2626 machine_mode op_mode = GET_MODE (op0);
2627 bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
2628
2629 if (op_mode == BFmode)
2630 {
2631 rtx op = gen_lowpart (HImode, op0);
2632 if (CONST_INT_P (op))
2633 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2634 op0, BFmode);
2635 else
2636 {
2637 rtx t1 = gen_reg_rtx (SImode);
2638 emit_insn (gen_zero_extendhisi2 (t1, op));
2639 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2640 op = gen_lowpart (SFmode, t1);
2641 }
2642 *pop0 = op;
2643 op = gen_lowpart (HImode, op1);
2644 if (CONST_INT_P (op))
2645 op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2646 op1, BFmode);
2647 else
2648 {
2649 rtx t1 = gen_reg_rtx (SImode);
2650 emit_insn (gen_zero_extendhisi2 (t1, op));
2651 emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2652 op = gen_lowpart (SFmode, t1);
2653 }
2654 *pop1 = op;
2655 return ix86_prepare_fp_compare_args (code, pop0, pop1);
2656 }
2657
2658 /* All of the unordered compare instructions only work on registers.
2659 The same is true of the fcomi compare instructions. The XFmode
2660 compare instructions require registers except when comparing
2661 against zero or when converting operand 1 from fixed point to
2662 floating point. */
2663
2664 if (!is_sse
2665 && (unordered_compare
2666 || (op_mode == XFmode
2667 && ! (standard_80387_constant_p (op0) == 1
2668 || standard_80387_constant_p (op1) == 1)
2669 && GET_CODE (op1) != FLOAT)
2670 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2671 {
2672 op0 = force_reg (op_mode, op0);
2673 op1 = force_reg (op_mode, op1);
2674 }
2675 else
2676 {
2677 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2678 things around if they appear profitable, otherwise force op0
2679 into a register. */
2680
2681 if (standard_80387_constant_p (op0) == 0
2682 || (MEM_P (op0)
2683 && ! (standard_80387_constant_p (op1) == 0
2684 || MEM_P (op1))))
2685 {
2686 enum rtx_code new_code = ix86_fp_swap_condition (code);
2687 if (new_code != UNKNOWN)
2688 {
2689 std::swap (op0, op1);
2690 code = new_code;
2691 }
2692 }
2693
2694 if (!REG_P (op0))
2695 op0 = force_reg (op_mode, op0);
2696
2697 if (CONSTANT_P (op1))
2698 {
2699 int tmp = standard_80387_constant_p (op1);
2700 if (tmp == 0)
2701 op1 = validize_mem (force_const_mem (op_mode, op1));
2702 else if (tmp == 1)
2703 {
2704 if (TARGET_CMOVE)
2705 op1 = force_reg (op_mode, op1);
2706 }
2707 else
2708 op1 = force_reg (op_mode, op1);
2709 }
2710 }
2711
2712 /* Try to rearrange the comparison to make it cheaper. */
2713 if (ix86_fp_comparison_cost (code)
2714 > ix86_fp_comparison_cost (swap_condition (code))
2715 && (REG_P (op1) || can_create_pseudo_p ()))
2716 {
2717 std::swap (op0, op1);
2718 code = swap_condition (code);
2719 if (!REG_P (op0))
2720 op0 = force_reg (op_mode, op0);
2721 }
2722
2723 *pop0 = op0;
2724 *pop1 = op1;
2725 return code;
2726 }
2727
2728 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2729
2730 static rtx
2731 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2732 {
2733 bool unordered_compare = ix86_unordered_fp_compare (code);
2734 machine_mode cmp_mode;
2735 rtx tmp, scratch;
2736
2737 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2738
2739 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2740 if (unordered_compare)
2741 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2742
2743 /* Do fcomi/sahf based test when profitable. */
2744 switch (ix86_fp_comparison_strategy (code))
2745 {
2746 case IX86_FPCMP_COMI:
2747 cmp_mode = CCFPmode;
2748 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2749 break;
2750
2751 case IX86_FPCMP_SAHF:
2752 cmp_mode = CCFPmode;
2753 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2754 scratch = gen_reg_rtx (HImode);
2755 emit_insn (gen_rtx_SET (scratch, tmp));
2756 emit_insn (gen_x86_sahf_1 (scratch));
2757 break;
2758
2759 case IX86_FPCMP_ARITH:
2760 cmp_mode = CCNOmode;
2761 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2762 scratch = gen_reg_rtx (HImode);
2763 emit_insn (gen_rtx_SET (scratch, tmp));
2764
2765 /* In the unordered case, we have to check C2 for NaNs, which
2766 doesn't work out to anything nice combination-wise.
2767 So do some bit twiddling on the value we've got in AH to come
2768 up with an appropriate set of condition codes. */
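/* For reference, after fnstsw the relevant condition bits land in AH as
   C0 = 0x01, C2 = 0x04 and C3 = 0x40, so the masks used below mean roughly:
     0x45 = C0|C2|C3   (any of "below", "unordered", "equal")
     0x05 = C0|C2      ("below" or "unordered")
     0x44 = C2|C3, 0x40 = C3, 0x04 = C2, 0x01 = C0.  */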
2769
2770 switch (code)
2771 {
2772 case GT:
2773 case UNGT:
2774 if (code == GT || !TARGET_IEEE_FP)
2775 {
2776 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2777 code = EQ;
2778 }
2779 else
2780 {
2781 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2782 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2783 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2784 cmp_mode = CCmode;
2785 code = GEU;
2786 }
2787 break;
2788 case LT:
2789 case UNLT:
2790 if (code == LT && TARGET_IEEE_FP)
2791 {
2792 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2793 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2794 cmp_mode = CCmode;
2795 code = EQ;
2796 }
2797 else
2798 {
2799 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2800 code = NE;
2801 }
2802 break;
2803 case GE:
2804 case UNGE:
2805 if (code == GE || !TARGET_IEEE_FP)
2806 {
2807 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2808 code = EQ;
2809 }
2810 else
2811 {
2812 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2813 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2814 code = NE;
2815 }
2816 break;
2817 case LE:
2818 case UNLE:
2819 if (code == LE && TARGET_IEEE_FP)
2820 {
2821 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2822 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2823 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2824 cmp_mode = CCmode;
2825 code = LTU;
2826 }
2827 else
2828 {
2829 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2830 code = NE;
2831 }
2832 break;
2833 case EQ:
2834 case UNEQ:
2835 if (code == EQ && TARGET_IEEE_FP)
2836 {
2837 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2838 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2839 cmp_mode = CCmode;
2840 code = EQ;
2841 }
2842 else
2843 {
2844 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2845 code = NE;
2846 }
2847 break;
2848 case NE:
2849 case LTGT:
2850 if (code == NE && TARGET_IEEE_FP)
2851 {
2852 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2853 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2854 GEN_INT (0x40)));
2855 code = NE;
2856 }
2857 else
2858 {
2859 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2860 code = EQ;
2861 }
2862 break;
2863
2864 case UNORDERED:
2865 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2866 code = NE;
2867 break;
2868 case ORDERED:
2869 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2870 code = EQ;
2871 break;
2872
2873 default:
2874 gcc_unreachable ();
2875 }
2876 break;
2877
2878 default:
2879 gcc_unreachable();
2880 }
2881
2882 /* Return the test that should be put into the flags user, i.e.
2883 the bcc, scc, or cmov instruction. */
2884 return gen_rtx_fmt_ee (code, VOIDmode,
2885 gen_rtx_REG (cmp_mode, FLAGS_REG),
2886 const0_rtx);
2887 }
2888
2889 /* Generate insn patterns to do an integer compare of OPERANDS. */
2890
2891 static rtx
2892 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2893 {
2894 machine_mode cmpmode;
2895 rtx tmp, flags;
2896
2897 /* Swap operands to emit carry flag comparison. */
2898 if ((code == GTU || code == LEU)
2899 && nonimmediate_operand (op1, VOIDmode))
2900 {
2901 std::swap (op0, op1);
2902 code = swap_condition (code);
2903 }
2904
2905 cmpmode = SELECT_CC_MODE (code, op0, op1);
2906 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2907
2908 /* This is very simple, but making the interface the same as in the
2909 FP case makes the rest of the code easier. */
2910 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2911 emit_insn (gen_rtx_SET (flags, tmp));
2912
2913 /* Return the test that should be put into the flags user, i.e.
2914 the bcc, scc, or cmov instruction. */
2915 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2916 }
2917
2918 static rtx
2919 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2920 {
2921 rtx ret;
2922
2923 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2924 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2925
2926 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2927 {
2928 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2929 ret = ix86_expand_fp_compare (code, op0, op1);
2930 }
2931 else
2932 ret = ix86_expand_int_compare (code, op0, op1);
2933
2934 return ret;
2935 }
2936
2937 void
2938 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2939 {
2940 rtx ret;
2941
2942 gcc_assert (GET_MODE (dest) == QImode);
2943
2944 ret = ix86_expand_compare (code, op0, op1);
2945 PUT_MODE (ret, QImode);
2946 emit_insn (gen_rtx_SET (dest, ret));
2947 }
2948
2949 /* Expand floating point op0 <=> op1, i.e.
2950 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
2951
2952 void
2953 ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
2954 {
2955 gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
2956 rtx gt = ix86_expand_fp_compare (GT, op0, op1);
2957 rtx l0 = gen_label_rtx ();
2958 rtx l1 = gen_label_rtx ();
2959 rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
2960 rtx lend = gen_label_rtx ();
2961 rtx tmp;
2962 rtx_insn *jmp;
2963 if (l2)
2964 {
2965 rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
2966 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
2967 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
2968 gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
2969 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2970 add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());
2971 }
2972 rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
2973 gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
2974 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
2975 gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
2976 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2977 add_reg_br_prob_note (jmp, profile_probability::unlikely ());
2978 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
2979 gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
2980 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2981 add_reg_br_prob_note (jmp, profile_probability::even ());
2982 emit_move_insn (dest, constm1_rtx);
2983 emit_jump (lend);
2984 emit_label (l0);
2985 emit_move_insn (dest, const0_rtx);
2986 emit_jump (lend);
2987 emit_label (l1);
2988 emit_move_insn (dest, const1_rtx);
2989 emit_jump (lend);
2990 if (l2)
2991 {
2992 emit_label (l2);
2993 emit_move_insn (dest, const2_rtx);
2994 }
2995 emit_label (lend);
2996 }
2997
2998 /* Expand a comparison setting or clearing the carry flag. Return true when
2999 successful and set *POP to the comparison for the operation. */
3000 static bool
3001 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3002 {
3003 machine_mode mode
3004 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3005
3006 /* Do not handle double-mode compares that go through the special path. */
3007 if (mode == (TARGET_64BIT ? TImode : DImode))
3008 return false;
3009
3010 if (SCALAR_FLOAT_MODE_P (mode))
3011 {
3012 rtx compare_op;
3013 rtx_insn *compare_seq;
3014
3015 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3016
3017 /* Shortcut: the following common codes never translate
3018 into carry-flag compares. */
3019 if (code == EQ || code == NE || code == UNEQ || code == LTGT
3020 || code == ORDERED || code == UNORDERED)
3021 return false;
3022
3023 /* These comparisons require the zero flag; swap operands so they won't. */
3024 if ((code == GT || code == UNLE || code == LE || code == UNGT)
3025 && !TARGET_IEEE_FP)
3026 {
3027 std::swap (op0, op1);
3028 code = swap_condition (code);
3029 }
3030
3031 /* Try to expand the comparison and verify that we end up with
3032 a carry-flag-based comparison. This fails only when we decide
3033 to expand the comparison using arithmetic, which is not a very
3034 common scenario. */
3035 start_sequence ();
3036 compare_op = ix86_expand_fp_compare (code, op0, op1);
3037 compare_seq = get_insns ();
3038 end_sequence ();
3039
3040 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3041 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3042 else
3043 code = GET_CODE (compare_op);
3044
3045 if (code != LTU && code != GEU)
3046 return false;
3047
3048 emit_insn (compare_seq);
3049 *pop = compare_op;
3050 return true;
3051 }
3052
3053 if (!INTEGRAL_MODE_P (mode))
3054 return false;
3055
3056 switch (code)
3057 {
3058 case LTU:
3059 case GEU:
3060 break;
3061
3062 /* Convert a==0 into (unsigned)a<1. */
3063 case EQ:
3064 case NE:
3065 if (op1 != const0_rtx)
3066 return false;
3067 op1 = const1_rtx;
3068 code = (code == EQ ? LTU : GEU);
3069 break;
3070
3071 /* Convert a>b into b<a or a>=b+1. */
3072 case GTU:
3073 case LEU:
3074 if (CONST_INT_P (op1))
3075 {
3076 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3077 /* Bail out on overflow. We could still swap the operands, but that
3078 would force loading the constant into a register. */
3079 if (op1 == const0_rtx
3080 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3081 return false;
3082 code = (code == GTU ? GEU : LTU);
3083 }
3084 else
3085 {
3086 std::swap (op0, op1);
3087 code = (code == GTU ? LTU : GEU);
3088 }
3089 break;
3090
3091 /* Convert a>=0 into (unsigned)a<0x80000000. */
3092 case LT:
3093 case GE:
3094 if (mode == DImode || op1 != const0_rtx)
3095 return false;
3096 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3097 code = (code == LT ? GEU : LTU);
3098 break;
3099 case LE:
3100 case GT:
3101 if (mode == DImode || op1 != constm1_rtx)
3102 return false;
3103 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3104 code = (code == LE ? GEU : LTU);
3105 break;
3106
3107 default:
3108 return false;
3109 }
3110 /* Swapping operands may cause a constant to appear as the first operand. */
3111 if (!nonimmediate_operand (op0, VOIDmode))
3112 {
3113 if (!can_create_pseudo_p ())
3114 return false;
3115 op0 = force_reg (mode, op0);
3116 }
3117 *pop = ix86_expand_compare (code, op0, op1);
3118 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3119 return true;
3120 }
3121
3122 /* Expand a conditional increment or decrement using adc/sbb instructions.
3123 The default case using setcc followed by a conditional move can be
3124 done by generic code. */
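/* E.g. "x += (a < b)" for unsigned operands can be emitted as roughly
     cmpl  b, a                ; CF = (a < b)
     adcl  $0, x               ; x += 0 + CF
   and "x -= (a < b)" likewise with sbbl, which is what the code below
   arranges by massaging the comparison into a carry-flag test.  */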
3125 bool
3126 ix86_expand_int_addcc (rtx operands[])
3127 {
3128 enum rtx_code code = GET_CODE (operands[1]);
3129 rtx flags;
3130 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3131 rtx compare_op;
3132 rtx val = const0_rtx;
3133 bool fpcmp = false;
3134 machine_mode mode;
3135 rtx op0 = XEXP (operands[1], 0);
3136 rtx op1 = XEXP (operands[1], 1);
3137
3138 if (operands[3] != const1_rtx
3139 && operands[3] != constm1_rtx)
3140 return false;
3141 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3142 return false;
3143 code = GET_CODE (compare_op);
3144
3145 flags = XEXP (compare_op, 0);
3146
3147 if (GET_MODE (flags) == CCFPmode)
3148 {
3149 fpcmp = true;
3150 code = ix86_fp_compare_code_to_integer (code);
3151 }
3152
3153 if (code != LTU)
3154 {
3155 val = constm1_rtx;
3156 if (fpcmp)
3157 PUT_CODE (compare_op,
3158 reverse_condition_maybe_unordered
3159 (GET_CODE (compare_op)));
3160 else
3161 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3162 }
3163
3164 mode = GET_MODE (operands[0]);
3165
3166 /* Construct either adc or sbb insn. */
3167 if ((code == LTU) == (operands[3] == constm1_rtx))
3168 insn = gen_sub3_carry;
3169 else
3170 insn = gen_add3_carry;
3171
3172 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3173
3174 return true;
3175 }
3176
3177 bool
3178 ix86_expand_int_movcc (rtx operands[])
3179 {
3180 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3181 rtx_insn *compare_seq;
3182 rtx compare_op;
3183 machine_mode mode = GET_MODE (operands[0]);
3184 bool sign_bit_compare_p = false;
3185 bool negate_cc_compare_p = false;
3186 rtx op0 = XEXP (operands[1], 0);
3187 rtx op1 = XEXP (operands[1], 1);
3188 rtx op2 = operands[2];
3189 rtx op3 = operands[3];
3190
3191 if (GET_MODE (op0) == TImode
3192 || (GET_MODE (op0) == DImode
3193 && !TARGET_64BIT))
3194 return false;
3195
3196 if (GET_MODE (op0) == BFmode
3197 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3198 return false;
3199
3200 start_sequence ();
3201 compare_op = ix86_expand_compare (code, op0, op1);
3202 compare_seq = get_insns ();
3203 end_sequence ();
3204
3205 compare_code = GET_CODE (compare_op);
3206
3207 if ((op1 == const0_rtx && (code == GE || code == LT))
3208 || (op1 == constm1_rtx && (code == GT || code == LE)))
3209 sign_bit_compare_p = true;
3210
3211 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3212 but if op1 is a constant, the latter form allows more optimizations,
3213 either through the handling of two constant arms, or through the
3214 one-constant, one-variable case. On the other hand, for cmov the
3215 former might be better, as we don't need to load the constant into
3216 another register. */
3217 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3218 op2 = op1;
3219 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3220 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3221 op3 = op1;
3222
3223 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3224 HImode insns, we'd be swallowed in word prefix ops. */
3225
3226 if ((mode != HImode || TARGET_FAST_PREFIX)
3227 && (mode != (TARGET_64BIT ? TImode : DImode))
3228 && CONST_INT_P (op2)
3229 && CONST_INT_P (op3))
3230 {
3231 rtx out = operands[0];
3232 HOST_WIDE_INT ct = INTVAL (op2);
3233 HOST_WIDE_INT cf = INTVAL (op3);
3234 HOST_WIDE_INT diff;
3235
3236 if ((mode == SImode
3237 || (TARGET_64BIT && mode == DImode))
3238 && (GET_MODE (op0) == SImode
3239 || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3240 {
3241 /* Special case x != 0 ? -1 : y. */
3242 if (code == NE && op1 == const0_rtx && ct == -1)
3243 {
3244 negate_cc_compare_p = true;
3245 std::swap (ct, cf);
3246 code = EQ;
3247 }
3248 else if (code == EQ && op1 == const0_rtx && cf == -1)
3249 negate_cc_compare_p = true;
3250 }
3251
3252 diff = ct - cf;
3253 /* Sign-bit compares are better done using shifts than using
3254 sbb. */
3255 if (sign_bit_compare_p
3256 || negate_cc_compare_p
3257 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3258 {
3259 /* Detect overlap between destination and compare sources. */
3260 rtx tmp = out;
3261
3262 if (negate_cc_compare_p)
3263 {
3264 if (GET_MODE (op0) == DImode)
3265 emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3266 else
3267 emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3268 gen_lowpart (SImode, op0)));
3269
3270 tmp = gen_reg_rtx (mode);
3271 if (mode == DImode)
3272 emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3273 else
3274 emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3275 tmp)));
3276 }
3277 else if (!sign_bit_compare_p)
3278 {
3279 rtx flags;
3280 bool fpcmp = false;
3281
3282 compare_code = GET_CODE (compare_op);
3283
3284 flags = XEXP (compare_op, 0);
3285
3286 if (GET_MODE (flags) == CCFPmode)
3287 {
3288 fpcmp = true;
3289 compare_code
3290 = ix86_fp_compare_code_to_integer (compare_code);
3291 }
3292
3293 /* To simplify rest of code, restrict to the GEU case. */
3294 if (compare_code == LTU)
3295 {
3296 std::swap (ct, cf);
3297 compare_code = reverse_condition (compare_code);
3298 code = reverse_condition (code);
3299 }
3300 else
3301 {
3302 if (fpcmp)
3303 PUT_CODE (compare_op,
3304 reverse_condition_maybe_unordered
3305 (GET_CODE (compare_op)));
3306 else
3307 PUT_CODE (compare_op,
3308 reverse_condition (GET_CODE (compare_op)));
3309 }
3310 diff = ct - cf;
3311
3312 if (reg_overlap_mentioned_p (out, compare_op))
3313 tmp = gen_reg_rtx (mode);
3314
3315 if (mode == DImode)
3316 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3317 else
3318 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3319 flags, compare_op));
3320 }
3321 else
3322 {
3323 if (code == GT || code == GE)
3324 code = reverse_condition (code);
3325 else
3326 {
3327 std::swap (ct, cf);
3328 diff = ct - cf;
3329 }
3330 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3331 }
3332
3333 if (diff == 1)
3334 {
3335 /*
3336 * cmpl op0,op1
3337 * sbbl dest,dest
3338 * [addl dest, ct]
3339 *
3340 * Size 5 - 8.
3341 */
3342 if (ct)
3343 tmp = expand_simple_binop (mode, PLUS,
3344 tmp, GEN_INT (ct),
3345 copy_rtx (tmp), 1, OPTAB_DIRECT);
3346 }
3347 else if (cf == -1)
3348 {
3349 /*
3350 * cmpl op0,op1
3351 * sbbl dest,dest
3352 * orl $ct, dest
3353 *
3354 * Size 8.
3355 */
3356 tmp = expand_simple_binop (mode, IOR,
3357 tmp, GEN_INT (ct),
3358 copy_rtx (tmp), 1, OPTAB_DIRECT);
3359 }
3360 else if (diff == -1 && ct)
3361 {
3362 /*
3363 * cmpl op0,op1
3364 * sbbl dest,dest
3365 * notl dest
3366 * [addl dest, cf]
3367 *
3368 * Size 8 - 11.
3369 */
3370 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3371 if (cf)
3372 tmp = expand_simple_binop (mode, PLUS,
3373 copy_rtx (tmp), GEN_INT (cf),
3374 copy_rtx (tmp), 1, OPTAB_DIRECT);
3375 }
3376 else
3377 {
3378 /*
3379 * cmpl op0,op1
3380 * sbbl dest,dest
3381 * [notl dest]
3382 * andl cf - ct, dest
3383 * [addl dest, ct]
3384 *
3385 * Size 8 - 11.
3386 */
3387
3388 if (cf == 0)
3389 {
3390 cf = ct;
3391 ct = 0;
3392 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3393 }
3394
3395 tmp = expand_simple_binop (mode, AND,
3396 copy_rtx (tmp),
3397 gen_int_mode (cf - ct, mode),
3398 copy_rtx (tmp), 1, OPTAB_DIRECT);
3399 if (ct)
3400 tmp = expand_simple_binop (mode, PLUS,
3401 copy_rtx (tmp), GEN_INT (ct),
3402 copy_rtx (tmp), 1, OPTAB_DIRECT);
3403 }
3404
3405 if (!rtx_equal_p (tmp, out))
3406 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3407
3408 return true;
3409 }
3410
3411 if (diff < 0)
3412 {
3413 machine_mode cmp_mode = GET_MODE (op0);
3414 enum rtx_code new_code;
3415
3416 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3417 {
3418 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3419
3420 /* We may be reversing a non-trapping
3421 comparison to a trapping comparison. */
3422 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3423 && code != EQ && code != NE
3424 && code != ORDERED && code != UNORDERED)
3425 new_code = UNKNOWN;
3426 else
3427 new_code = reverse_condition_maybe_unordered (code);
3428 }
3429 else
3430 new_code = ix86_reverse_condition (code, cmp_mode);
3431 if (new_code != UNKNOWN)
3432 {
3433 std::swap (ct, cf);
3434 diff = -diff;
3435 code = new_code;
3436 }
3437 }
3438
3439 compare_code = UNKNOWN;
3440 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3441 && CONST_INT_P (op1))
3442 {
3443 if (op1 == const0_rtx
3444 && (code == LT || code == GE))
3445 compare_code = code;
3446 else if (op1 == constm1_rtx)
3447 {
3448 if (code == LE)
3449 compare_code = LT;
3450 else if (code == GT)
3451 compare_code = GE;
3452 }
3453 }
3454
3455 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3456 if (compare_code != UNKNOWN
3457 && GET_MODE (op0) == GET_MODE (out)
3458 && (cf == -1 || ct == -1))
3459 {
3460 /* If lea code below could be used, only optimize
3461 if it results in a 2 insn sequence. */
3462
3463 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3464 || diff == 3 || diff == 5 || diff == 9)
3465 || (compare_code == LT && ct == -1)
3466 || (compare_code == GE && cf == -1))
3467 {
3468 /*
3469 * notl op1 (if necessary)
3470 * sarl $31, op1
3471 * orl cf, op1
3472 */
3473 if (ct != -1)
3474 {
3475 cf = ct;
3476 ct = -1;
3477 code = reverse_condition (code);
3478 }
3479
3480 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3481
3482 out = expand_simple_binop (mode, IOR,
3483 out, GEN_INT (cf),
3484 out, 1, OPTAB_DIRECT);
3485 if (out != operands[0])
3486 emit_move_insn (operands[0], out);
3487
3488 return true;
3489 }
3490 }
3491
3492
3493 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3494 || diff == 3 || diff == 5 || diff == 9)
3495 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3496 && (mode != DImode
3497 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3498 {
3499 /*
3500 * xorl dest,dest
3501 * cmpl op1,op2
3502 * setcc dest
3503 * lea cf(dest*(ct-cf)),dest
3504 *
3505 * Size 14.
3506 *
3507 * This also catches the degenerate setcc-only case.
3508 */
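/*
 * A concrete instance (a sketch): for dest = (a != b) ? 5 : 2 we have
 * ct = 5, cf = 2, diff = 3, and the sequence becomes
 *   xorl  dest, dest
 *   cmpl  b, a
 *   setne dest                 ; dest = 0 or 1
 *   leal  2(dest,dest,2), dest ; dest = dest*3 + 2 = 2 or 5
 */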
3509
3510 rtx tmp;
3511 int nops;
3512
3513 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3514
3515 nops = 0;
3516 /* On x86_64 the lea instruction operates on Pmode, so we need
3517 to get the arithmetic done in the proper mode to match. */
3518 if (diff == 1)
3519 tmp = copy_rtx (out);
3520 else
3521 {
3522 rtx out1;
3523 out1 = copy_rtx (out);
3524 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3525 nops++;
3526 if (diff & 1)
3527 {
3528 tmp = gen_rtx_PLUS (mode, tmp, out1);
3529 nops++;
3530 }
3531 }
3532 if (cf != 0)
3533 {
3534 tmp = plus_constant (mode, tmp, cf);
3535 nops++;
3536 }
3537 if (!rtx_equal_p (tmp, out))
3538 {
3539 if (nops == 1)
3540 out = force_operand (tmp, copy_rtx (out));
3541 else
3542 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3543 }
3544 if (!rtx_equal_p (out, operands[0]))
3545 emit_move_insn (operands[0], copy_rtx (out));
3546
3547 return true;
3548 }
3549
3550 /*
3551 * General case: Jumpful:
3552 * xorl dest,dest cmpl op1, op2
3553 * cmpl op1, op2 movl ct, dest
3554 * setcc dest jcc 1f
3555 * decl dest movl cf, dest
3556 * andl (cf-ct),dest 1:
3557 * addl ct,dest
3558 *
3559 * Size 20. Size 14.
3560 *
3561 * This is reasonably steep, but branch mispredict costs are
3562 * high on modern CPUs, so consider failing only if optimizing
3563 * for space.
3564 */
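/*
 * E.g. dest = (a < b) ? 7 : 3 (ct = 7, cf = 3), sketched:
 *   setcc gives 1/0, decl turns that into 0/-1, then
 *   andl (cf - ct) = -4 gives 0/-4, and addl ct = 7 gives 7/3.
 */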
3565
3566 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3567 && BRANCH_COST (optimize_insn_for_speed_p (),
3568 false) >= 2)
3569 {
3570 if (cf == 0)
3571 {
3572 machine_mode cmp_mode = GET_MODE (op0);
3573 enum rtx_code new_code;
3574
3575 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3576 {
3577 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3578
3579 /* We may be reversing a non-trapping
3580 comparison to a trapping comparison. */
3581 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3582 && code != EQ && code != NE
3583 && code != ORDERED && code != UNORDERED)
3584 new_code = UNKNOWN;
3585 else
3586 new_code = reverse_condition_maybe_unordered (code);
3587
3588 }
3589 else
3590 {
3591 new_code = ix86_reverse_condition (code, cmp_mode);
3592 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3593 compare_code = reverse_condition (compare_code);
3594 }
3595
3596 if (new_code != UNKNOWN)
3597 {
3598 cf = ct;
3599 ct = 0;
3600 code = new_code;
3601 }
3602 }
3603
3604 if (compare_code != UNKNOWN)
3605 {
3606 /* notl op1 (if needed)
3607 sarl $31, op1
3608 andl (cf-ct), op1
3609 addl ct, op1
3610
3611 For x < 0 (resp. x <= -1) there will be no notl,
3612 so if possible swap the constants to get rid of the
3613 complement.
3614 True/false will be -1/0 while code below (store flag
3615 followed by decrement) is 0/-1, so the constants need
3616 to be exchanged once more. */
3617
3618 if (compare_code == GE || !cf)
3619 {
3620 code = reverse_condition (code);
3621 compare_code = LT;
3622 }
3623 else
3624 std::swap (ct, cf);
3625
3626 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3627 }
3628 else
3629 {
3630 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3631
3632 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3633 constm1_rtx,
3634 copy_rtx (out), 1, OPTAB_DIRECT);
3635 }
3636
3637 out = expand_simple_binop (mode, AND, copy_rtx (out),
3638 gen_int_mode (cf - ct, mode),
3639 copy_rtx (out), 1, OPTAB_DIRECT);
3640 if (ct)
3641 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3642 copy_rtx (out), 1, OPTAB_DIRECT);
3643 if (!rtx_equal_p (out, operands[0]))
3644 emit_move_insn (operands[0], copy_rtx (out));
3645
3646 return true;
3647 }
3648 }
3649
3650 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3651 {
3652 /* Try a few more things with specific constants and a variable. */
3653
3654 optab op;
3655 rtx var, orig_out, out, tmp;
3656
3657 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3658 return false;
3659
3660 operands[2] = op2;
3661 operands[3] = op3;
3662
3663 /* If one of the two operands is an interesting constant, load a
3664 constant with the above and mask it in with a logical operation. */
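/* E.g. dest = cond ? x : 0 is handled by recursing to produce
   tmp = cond ? -1 : 0 and then emitting dest = tmp & x; the dual case
   with -1 as the interesting constant uses IOR with 0 instead.  */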
3665
3666 if (CONST_INT_P (operands[2]))
3667 {
3668 var = operands[3];
3669 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3670 operands[3] = constm1_rtx, op = and_optab;
3671 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3672 operands[3] = const0_rtx, op = ior_optab;
3673 else
3674 return false;
3675 }
3676 else if (CONST_INT_P (operands[3]))
3677 {
3678 var = operands[2];
3679 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3680 {
3681 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3682 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3683 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3684 operands[1] = simplify_gen_relational (LT, VOIDmode,
3685 GET_MODE (op0),
3686 op0, const0_rtx);
3687
3688 operands[2] = constm1_rtx;
3689 op = and_optab;
3690 }
3691 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3692 operands[2] = const0_rtx, op = ior_optab;
3693 else
3694 return false;
3695 }
3696 else
3697 return false;
3698
3699 orig_out = operands[0];
3700 tmp = gen_reg_rtx (mode);
3701 operands[0] = tmp;
3702
3703 /* Recurse to get the constant loaded. */
3704 if (!ix86_expand_int_movcc (operands))
3705 return false;
3706
3707 /* Mask in the interesting variable. */
3708 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3709 OPTAB_WIDEN);
3710 if (!rtx_equal_p (out, orig_out))
3711 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3712
3713 return true;
3714 }
3715
3716 /*
3717 * For comparison with above,
3718 *
3719 * movl cf,dest
3720 * movl ct,tmp
3721 * cmpl op1,op2
3722 * cmovcc tmp,dest
3723 *
3724 * Size 15.
3725 */
3726
3727 if (! nonimmediate_operand (operands[2], mode))
3728 operands[2] = force_reg (mode, operands[2]);
3729 if (! nonimmediate_operand (operands[3], mode))
3730 operands[3] = force_reg (mode, operands[3]);
3731
3732 if (! register_operand (operands[2], VOIDmode)
3733 && (mode == QImode
3734 || ! register_operand (operands[3], VOIDmode)))
3735 operands[2] = force_reg (mode, operands[2]);
3736
3737 if (mode == QImode
3738 && ! register_operand (operands[3], VOIDmode))
3739 operands[3] = force_reg (mode, operands[3]);
3740
3741 emit_insn (compare_seq);
3742 emit_insn (gen_rtx_SET (operands[0],
3743 gen_rtx_IF_THEN_ELSE (mode,
3744 compare_op, operands[2],
3745 operands[3])));
3746 return true;
3747 }
3748
3749 /* Detect conditional moves that exactly match min/max operational
3750 semantics. Note that this is IEEE safe, as long as we don't
3751 interchange the operands.
3752
3753 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3754 and TRUE if the operation is successful and instructions are emitted. */
3755
3756 static bool
3757 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3758 rtx cmp_op1, rtx if_true, rtx if_false)
3759 {
3760 machine_mode mode;
3761 bool is_min;
3762 rtx tmp;
3763
3764 if (code == LT)
3765 ;
3766 else if (code == UNGE)
3767 std::swap (if_true, if_false);
3768 else
3769 return false;
3770
3771 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3772 is_min = true;
3773 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3774 is_min = false;
3775 else
3776 return false;
3777
3778 mode = GET_MODE (dest);
3779
3780 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3781 but MODE may be a vector mode and thus not appropriate. */
3782 if (!flag_finite_math_only || flag_signed_zeros)
3783 {
3784 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3785 rtvec v;
3786
3787 if_true = force_reg (mode, if_true);
3788 v = gen_rtvec (2, if_true, if_false);
3789 tmp = gen_rtx_UNSPEC (mode, v, u);
3790 }
3791 else
3792 {
3793 code = is_min ? SMIN : SMAX;
3794 if (MEM_P (if_true) && MEM_P (if_false))
3795 if_true = force_reg (mode, if_true);
3796 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3797 }
3798
3799 emit_insn (gen_rtx_SET (dest, tmp));
3800 return true;
3801 }
3802
3803 /* Return true if MODE is valid for a vector compare to a mask register;
3804 the same holds for a conditional vector move with a mask register. */
3805 static bool
3806 ix86_valid_mask_cmp_mode (machine_mode mode)
3807 {
3808 /* XOP has its own vector conditional movement. */
3809 if (TARGET_XOP && !TARGET_AVX512F)
3810 return false;
3811
3812 /* HFmode only supports vcmpsh whose dest is mask register. */
3813 if (TARGET_AVX512FP16 && mode == HFmode)
3814 return true;
3815
3816 /* AVX512F is needed for mask operation. */
3817 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3818 return false;
3819
3820 /* AVX512BW is needed for vector QI/HImode,
3821 AVX512VL is needed for 128/256-bit vector. */
3822 machine_mode inner_mode = GET_MODE_INNER (mode);
3823 int vector_size = GET_MODE_SIZE (mode);
3824 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3825 return false;
3826
3827 return vector_size == 64 || TARGET_AVX512VL;
3828 }
3829
3830 /* Return true if integer mask comparison should be used. */
3831 static bool
3832 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3833 rtx op_true, rtx op_false)
3834 {
3835 int vector_size = GET_MODE_SIZE (mode);
3836
3837 if (cmp_mode == HFmode)
3838 return true;
3839 else if (vector_size < 16)
3840 return false;
3841 else if (vector_size == 64)
3842 return true;
3843 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3844 return true;
3845
3846 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3847 gcc_assert (!op_true == !op_false);
3848
3849 /* When op_true/op_false is NULL or cmp_mode is not a valid mask cmp mode,
3850 a vector dest is required. */
3851 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3852 return false;
3853
3854 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3855 if (op_false == CONST0_RTX (mode)
3856 || op_true == CONST0_RTX (mode)
3857 || (INTEGRAL_MODE_P (mode)
3858 && (op_true == CONSTM1_RTX (mode)
3859 || op_false == CONSTM1_RTX (mode))))
3860 return false;
3861
3862 return true;
3863 }
3864
3865 /* Expand an SSE comparison. Return the register with the result. */
3866
3867 static rtx
3868 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3869 rtx op_true, rtx op_false)
3870 {
3871 machine_mode mode = GET_MODE (dest);
3872 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3873
3874 /* In the general case the result of the comparison can differ from the operands' type. */
3875 machine_mode cmp_mode;
3876
3877 /* In AVX512F the result of comparison is an integer mask. */
3878 bool maskcmp = false;
3879 rtx x;
3880
3881 if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
3882 {
3883 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3884 maskcmp = true;
3885 cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
3886 }
3887 else
3888 cmp_mode = cmp_ops_mode;
3889
3890 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3891
3892 bool (*op1_predicate)(rtx, machine_mode)
3893 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3894
3895 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3896 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3897
3898 if (optimize
3899 || (maskcmp && cmp_mode != mode)
3900 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3901 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3902 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3903
3904 if (maskcmp)
3905 {
3906 bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
3907 gcc_assert (ok);
3908 return dest;
3909 }
3910
3911 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3912
3913 if (cmp_mode != mode)
3914 {
3915 x = force_reg (cmp_ops_mode, x);
3916 convert_move (dest, x, false);
3917 }
3918 else
3919 emit_insn (gen_rtx_SET (dest, x));
3920
3921 return dest;
3922 }
3923
3924 /* Emit x86 binary operand CODE in mode MODE for SSE vector
3925 instructions that can be performed using GP registers. */
3926
3927 static void
3928 ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
3929 rtx dst, rtx src1, rtx src2)
3930 {
3931 rtx tmp;
3932
3933 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
3934
3935 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
3936 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
3937 {
3938 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
3939 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
3940 }
3941
3942 emit_insn (tmp);
3943 }
3944
3945 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3946 operations. This is used for both scalar and vector conditional moves. */
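/* When CMP is an ordinary vector of per-element all-ones/all-zeros masks
   (rather than an AVX512 mask register), the lowering is conceptually
     dest = (cmp & op_true) | (~cmp & op_false)
   with the special cases below collapsing one or both halves when an arm
   is all zeros or all ones, or using blend/XOP instructions when available.  */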
3947
3948 void
3949 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3950 {
3951 machine_mode mode = GET_MODE (dest);
3952 machine_mode cmpmode = GET_MODE (cmp);
3953 rtx x;
3954
3955 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3956 if (rtx_equal_p (op_true, op_false))
3957 {
3958 emit_move_insn (dest, op_true);
3959 return;
3960 }
3961
3962 /* If we have an integer mask and an FP value then we need
3963 to cast the mask to the FP mode. */
3964 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3965 {
3966 cmp = force_reg (cmpmode, cmp);
3967 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3968 }
3969
3970 /* In AVX512F the result of comparison is an integer mask. */
3971 if (mode != cmpmode
3972 && GET_MODE_CLASS (cmpmode) == MODE_INT)
3973 {
3974 gcc_assert (ix86_valid_mask_cmp_mode (mode));
3975 /* Using scalar/vector move with mask register. */
3976 cmp = force_reg (cmpmode, cmp);
3977 /* Optimize for mask zero. */
3978 op_true = (op_true != CONST0_RTX (mode)
3979 ? force_reg (mode, op_true) : op_true);
3980 op_false = (op_false != CONST0_RTX (mode)
3981 ? force_reg (mode, op_false) : op_false);
3982 if (op_true == CONST0_RTX (mode))
3983 {
3984 if (cmpmode == E_DImode && !TARGET_64BIT)
3985 {
3986 x = gen_reg_rtx (cmpmode);
3987 emit_insn (gen_knotdi (x, cmp));
3988 }
3989 else
3990 x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
3991 cmp = x;
3992 /* Swap op_true and op_false. */
3993 std::swap (op_true, op_false);
3994 }
3995
3996 if (mode == HFmode)
3997 emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
3998 else
3999 emit_insn (gen_rtx_SET (dest,
4000 gen_rtx_VEC_MERGE (mode,
4001 op_true, op_false, cmp)));
4002 return;
4003 }
4004
4005 if (vector_all_ones_operand (op_true, mode)
4006 && op_false == CONST0_RTX (mode))
4007 {
4008 emit_move_insn (dest, cmp);
4009 return;
4010 }
4011 else if (op_false == CONST0_RTX (mode))
4012 {
4013 x = expand_simple_binop (mode, AND, cmp, op_true,
4014 dest, 1, OPTAB_DIRECT);
4015 if (x != dest)
4016 emit_move_insn (dest, x);
4017 return;
4018 }
4019 else if (op_true == CONST0_RTX (mode))
4020 {
4021 op_false = force_reg (mode, op_false);
4022 x = gen_rtx_NOT (mode, cmp);
4023 ix86_emit_vec_binop (AND, mode, dest, x, op_false);
4024 return;
4025 }
4026 else if (vector_all_ones_operand (op_true, mode))
4027 {
4028 x = expand_simple_binop (mode, IOR, cmp, op_false,
4029 dest, 1, OPTAB_DIRECT);
4030 if (x != dest)
4031 emit_move_insn (dest, x);
4032 return;
4033 }
4034
4035 if (TARGET_XOP)
4036 {
4037 op_true = force_reg (mode, op_true);
4038
4039 if (GET_MODE_SIZE (mode) < 16
4040 || !nonimmediate_operand (op_false, mode))
4041 op_false = force_reg (mode, op_false);
4042
4043 emit_insn (gen_rtx_SET (dest,
4044 gen_rtx_IF_THEN_ELSE (mode, cmp,
4045 op_true, op_false)));
4046 return;
4047 }
4048
4049 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4050 machine_mode blend_mode = mode;
4051
4052 if (GET_MODE_SIZE (mode) < 16
4053 || !vector_operand (op_true, mode))
4054 op_true = force_reg (mode, op_true);
4055
4056 op_false = force_reg (mode, op_false);
4057
4058 switch (mode)
4059 {
4060 case E_V2SFmode:
4061 if (TARGET_SSE4_1)
4062 gen = gen_mmx_blendvps;
4063 break;
4064 case E_V4SFmode:
4065 if (TARGET_SSE4_1)
4066 gen = gen_sse4_1_blendvps;
4067 break;
4068 case E_V2DFmode:
4069 if (TARGET_SSE4_1)
4070 gen = gen_sse4_1_blendvpd;
4071 break;
4072 case E_SFmode:
4073 if (TARGET_SSE4_1)
4074 gen = gen_sse4_1_blendvss;
4075 break;
4076 case E_DFmode:
4077 if (TARGET_SSE4_1)
4078 gen = gen_sse4_1_blendvsd;
4079 break;
4080 case E_V8QImode:
4081 case E_V4HImode:
4082 case E_V2SImode:
4083 if (TARGET_SSE4_1)
4084 {
4085 gen = gen_mmx_pblendvb_v8qi;
4086 blend_mode = V8QImode;
4087 }
4088 break;
4089 case E_V4QImode:
4090 case E_V2HImode:
4091 if (TARGET_SSE4_1)
4092 {
4093 gen = gen_mmx_pblendvb_v4qi;
4094 blend_mode = V4QImode;
4095 }
4096 break;
4097 case E_V2QImode:
4098 if (TARGET_SSE4_1)
4099 gen = gen_mmx_pblendvb_v2qi;
4100 break;
4101 case E_V16QImode:
4102 case E_V8HImode:
4103 case E_V8HFmode:
4104 case E_V8BFmode:
4105 case E_V4SImode:
4106 case E_V2DImode:
4107 case E_V1TImode:
4108 if (TARGET_SSE4_1)
4109 {
4110 gen = gen_sse4_1_pblendvb;
4111 blend_mode = V16QImode;
4112 }
4113 break;
4114 case E_V8SFmode:
4115 if (TARGET_AVX)
4116 gen = gen_avx_blendvps256;
4117 break;
4118 case E_V4DFmode:
4119 if (TARGET_AVX)
4120 gen = gen_avx_blendvpd256;
4121 break;
4122 case E_V32QImode:
4123 case E_V16HImode:
4124 case E_V16HFmode:
4125 case E_V16BFmode:
4126 case E_V8SImode:
4127 case E_V4DImode:
4128 if (TARGET_AVX2)
4129 {
4130 gen = gen_avx2_pblendvb;
4131 blend_mode = V32QImode;
4132 }
4133 break;
4134
4135 case E_V64QImode:
4136 gen = gen_avx512bw_blendmv64qi;
4137 break;
4138 case E_V32HImode:
4139 gen = gen_avx512bw_blendmv32hi;
4140 break;
4141 case E_V32HFmode:
4142 gen = gen_avx512bw_blendmv32hf;
4143 break;
4144 case E_V32BFmode:
4145 gen = gen_avx512bw_blendmv32bf;
4146 break;
4147 case E_V16SImode:
4148 gen = gen_avx512f_blendmv16si;
4149 break;
4150 case E_V8DImode:
4151 gen = gen_avx512f_blendmv8di;
4152 break;
4153 case E_V8DFmode:
4154 gen = gen_avx512f_blendmv8df;
4155 break;
4156 case E_V16SFmode:
4157 gen = gen_avx512f_blendmv16sf;
4158 break;
4159
4160 default:
4161 break;
4162 }
4163
4164 if (gen != NULL)
4165 {
4166 if (blend_mode == mode)
4167 x = dest;
4168 else
4169 {
4170 x = gen_reg_rtx (blend_mode);
4171 op_false = gen_lowpart (blend_mode, op_false);
4172 op_true = gen_lowpart (blend_mode, op_true);
4173 cmp = gen_lowpart (blend_mode, cmp);
4174 }
4175
4176 emit_insn (gen (x, op_false, op_true, cmp));
4177
4178 if (x != dest)
4179 emit_move_insn (dest, gen_lowpart (mode, x));
4180 }
4181 else
4182 {
4183 rtx t2, t3;
4184
4185 t2 = expand_simple_binop (mode, AND, op_true, cmp,
4186 NULL, 1, OPTAB_DIRECT);
4187
4188 t3 = gen_reg_rtx (mode);
4189 x = gen_rtx_NOT (mode, cmp);
4190 ix86_emit_vec_binop (AND, mode, t3, x, op_false);
4191
4192 x = expand_simple_binop (mode, IOR, t3, t2,
4193 dest, 1, OPTAB_DIRECT);
4194 if (x != dest)
4195 emit_move_insn (dest, x);
4196 }
4197 }
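/* Roughly, the routine above picks one of three strategies: a
   mask-register vec_merge (AVX-512 blendm or masked scalar move), a
   variable blend such as blendvps/blendvpd/pblendvb (or vpcmov on XOP)
   when the ISA provides one for the mode, and otherwise the classic
   (cmp & op_true) | (~cmp & op_false) and/andn/or sequence emitted at
   the end.  */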
4198
4199 /* Swap, force into registers, or otherwise massage the two operands
4200 to an sse comparison with a mask result. Thus we differ a bit from
4201 ix86_prepare_fp_compare_args which expects to produce a flags result.
4202
4203 The DEST operand exists to help determine whether to commute commutative
4204 operators. The POP0/POP1 operands are updated in place. The new
4205 comparison code is returned, or UNKNOWN if not implementable. */
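/* As an informal example of the swapping done below: before AVX,
   cmpps/cmpsd can only encode the EQ/LT/LE/UNORD/NEQ/NLT/NLE/ORD
   predicates, so a GT comparison is rewritten as LT with the operands
   exchanged.  */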
4206
4207 static enum rtx_code
4208 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4209 rtx *pop0, rtx *pop1)
4210 {
4211 switch (code)
4212 {
4213 case LTGT:
4214 case UNEQ:
4215 /* AVX supports all the needed comparisons. */
4216 if (TARGET_AVX)
4217 break;
4218 /* We have no LTGT as an operator. We could implement it with
4219 NE & ORDERED, but this requires an extra temporary. It's
4220 not clear that it's worth it. */
4221 return UNKNOWN;
4222
4223 case LT:
4224 case LE:
4225 case UNGT:
4226 case UNGE:
4227 /* These are supported directly. */
4228 break;
4229
4230 case EQ:
4231 case NE:
4232 case UNORDERED:
4233 case ORDERED:
4234 /* AVX has 3-operand comparisons, no need to swap anything. */
4235 if (TARGET_AVX)
4236 break;
4237 /* For commutative operators, try to canonicalize the destination
4238 operand to be first in the comparison - this helps reload to
4239 avoid extra moves. */
4240 if (!dest || !rtx_equal_p (dest, *pop1))
4241 break;
4242 /* FALLTHRU */
4243
4244 case GE:
4245 case GT:
4246 case UNLE:
4247 case UNLT:
4248 /* These are not supported directly before AVX, and furthermore
4249 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4250 comparison operands to transform into something that is
4251 supported. */
4252 std::swap (*pop0, *pop1);
4253 code = swap_condition (code);
4254 break;
4255
4256 default:
4257 gcc_unreachable ();
4258 }
4259
4260 return code;
4261 }
4262
4263 /* Expand a floating-point conditional move. Return true if successful. */
4264
4265 bool
4266 ix86_expand_fp_movcc (rtx operands[])
4267 {
4268 machine_mode mode = GET_MODE (operands[0]);
4269 enum rtx_code code = GET_CODE (operands[1]);
4270 rtx tmp, compare_op;
4271 rtx op0 = XEXP (operands[1], 0);
4272 rtx op1 = XEXP (operands[1], 1);
4273
4274 if (GET_MODE (op0) == BFmode
4275 && !ix86_fp_comparison_operator (operands[1], VOIDmode))
4276 return false;
4277
4278 if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
4279 {
4280 machine_mode cmode;
4281
4282 /* Since we have no cmove for SSE registers, don't force bad register
4283 allocation just to gain access to it. Deny movcc when the
4284 comparison mode doesn't match the move mode. */
4285 cmode = GET_MODE (op0);
4286 if (cmode == VOIDmode)
4287 cmode = GET_MODE (op1);
4288 if (cmode != mode)
4289 return false;
4290
4291 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4292 if (code == UNKNOWN)
4293 return false;
4294
4295 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4296 operands[2], operands[3]))
4297 return true;
4298
4299 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4300 operands[2], operands[3]);
4301 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4302 return true;
4303 }
4304
4305 if (GET_MODE (op0) == TImode
4306 || (GET_MODE (op0) == DImode
4307 && !TARGET_64BIT))
4308 return false;
4309
4310 /* The floating point conditional move instructions don't directly
4311 support conditions resulting from a signed integer comparison. */
4312
4313 compare_op = ix86_expand_compare (code, op0, op1);
4314 if (!fcmov_comparison_operator (compare_op, VOIDmode))
4315 {
4316 tmp = gen_reg_rtx (QImode);
4317 ix86_expand_setcc (tmp, code, op0, op1);
4318
4319 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4320 }
4321
4322 emit_insn (gen_rtx_SET (operands[0],
4323 gen_rtx_IF_THEN_ELSE (mode, compare_op,
4324 operands[2], operands[3])));
4325
4326 return true;
4327 }
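/* A rough illustration of the two paths above: with SSE math, DFmode
   "a < b ? x : y" is expanded as a cmpsd-style mask followed by
   ix86_expand_sse_movcc, while the x87/fcmov path emits an IF_THEN_ELSE
   and first reduces conditions fcmov cannot encode to a setcc result
   tested against zero.  */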
4328
4329 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4330
4331 static int
4332 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4333 {
4334 switch (code)
4335 {
4336 case EQ:
4337 return 0;
4338 case LT:
4339 case LTU:
4340 return 1;
4341 case LE:
4342 case LEU:
4343 return 2;
4344 case NE:
4345 return 4;
4346 case GE:
4347 case GEU:
4348 return 5;
4349 case GT:
4350 case GTU:
4351 return 6;
4352 default:
4353 gcc_unreachable ();
4354 }
4355 }
4356
4357 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4358
4359 static int
4360 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4361 {
4362 switch (code)
4363 {
4364 case EQ:
4365 return 0x00;
4366 case NE:
4367 return 0x04;
4368 case GT:
4369 return 0x0e;
4370 case LE:
4371 return 0x02;
4372 case GE:
4373 return 0x0d;
4374 case LT:
4375 return 0x01;
4376 case UNLE:
4377 return 0x0a;
4378 case UNLT:
4379 return 0x09;
4380 case UNGE:
4381 return 0x05;
4382 case UNGT:
4383 return 0x06;
4384 case UNEQ:
4385 return 0x18;
4386 case LTGT:
4387 return 0x0c;
4388 case ORDERED:
4389 return 0x07;
4390 case UNORDERED:
4391 return 0x03;
4392 default:
4393 gcc_unreachable ();
4394 }
4395 }
4396
4397 /* Return immediate value to be used in UNSPEC_PCMP
4398 for comparison CODE in MODE. */
4399
4400 static int
4401 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4402 {
4403 if (FLOAT_MODE_P (mode))
4404 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4405 return ix86_int_cmp_code_to_pcmp_immediate (code);
4406 }
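/* Informational note on the encodings above: for floating-point modes the
   returned value follows the vcmpps/vcmppd immediate predicate encoding
   (e.g. 0x0e is "greater-than, ordered, signaling" and 0x03 is
   "unordered"), while for integer modes it is the 3-bit vpcmp[u]b/w/d/q
   predicate (0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT, 6 = NLE).  */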
4407
4408 /* Expand AVX-512 vector comparison. */
4409
4410 bool
4411 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4412 {
4413 machine_mode mask_mode = GET_MODE (dest);
4414 machine_mode cmp_mode = GET_MODE (cmp_op0);
4415 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4416 int unspec_code;
4417 rtx unspec;
4418
4419 switch (code)
4420 {
4421 case LEU:
4422 case GTU:
4423 case GEU:
4424 case LTU:
4425 unspec_code = UNSPEC_UNSIGNED_PCMP;
4426 break;
4427
4428 default:
4429 unspec_code = UNSPEC_PCMP;
4430 }
4431
4432 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4433 unspec_code);
4434 emit_insn (gen_rtx_SET (dest, unspec));
4435
4436 return true;
4437 }
4438
4439 /* Expand fp vector comparison. */
4440
4441 bool
4442 ix86_expand_fp_vec_cmp (rtx operands[])
4443 {
4444 enum rtx_code code = GET_CODE (operands[1]);
4445 rtx cmp;
4446
4447 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4448 &operands[2], &operands[3]);
4449 if (code == UNKNOWN)
4450 {
4451 rtx temp;
4452 switch (GET_CODE (operands[1]))
4453 {
4454 case LTGT:
4455 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4456 operands[3], NULL, NULL);
4457 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4458 operands[3], NULL, NULL);
4459 code = AND;
4460 break;
4461 case UNEQ:
4462 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4463 operands[3], NULL, NULL);
4464 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4465 operands[3], NULL, NULL);
4466 code = IOR;
4467 break;
4468 default:
4469 gcc_unreachable ();
4470 }
4471 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4472 OPTAB_DIRECT);
4473 }
4474 else
4475 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4476 NULL, NULL);
4477
4478 if (operands[0] != cmp)
4479 emit_move_insn (operands[0], cmp);
4480
4481 return true;
4482 }
4483
4484 static rtx
4485 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4486 rtx op_true, rtx op_false, bool *negate)
4487 {
4488 machine_mode data_mode = GET_MODE (dest);
4489 machine_mode mode = GET_MODE (cop0);
4490 rtx x;
4491
4492 *negate = false;
4493
4494 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4495 if (TARGET_XOP
4496 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4497 && GET_MODE_SIZE (mode) <= 16)
4498 ;
4499 /* AVX512F supports all of the comparisons
4500 on all 128/256/512-bit vector int types. */
4501 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4502 ;
4503 else
4504 {
4505 /* Canonicalize the comparison to EQ, GT, GTU. */
4506 switch (code)
4507 {
4508 case EQ:
4509 case GT:
4510 case GTU:
4511 break;
4512
4513 case NE:
4514 case LE:
4515 case LEU:
4516 code = reverse_condition (code);
4517 *negate = true;
4518 break;
4519
4520 case GE:
4521 case GEU:
4522 code = reverse_condition (code);
4523 *negate = true;
4524 /* FALLTHRU */
4525
4526 case LT:
4527 case LTU:
4528 std::swap (cop0, cop1);
4529 code = swap_condition (code);
4530 break;
4531
4532 default:
4533 gcc_unreachable ();
4534 }
4535
4536 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4537 if (mode == V2DImode)
4538 {
4539 switch (code)
4540 {
4541 case EQ:
4542 /* SSE4.1 supports EQ. */
4543 if (!TARGET_SSE4_1)
4544 return NULL;
4545 break;
4546
4547 case GT:
4548 case GTU:
4549 /* SSE4.2 supports GT/GTU. */
4550 if (!TARGET_SSE4_2)
4551 return NULL;
4552 break;
4553
4554 default:
4555 gcc_unreachable ();
4556 }
4557 }
4558
4559 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4560 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4561 if (*negate)
4562 std::swap (optrue, opfalse);
4563
4564 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y), when
4565 not using integer masks, into min (x, y) == x ? -1 : 0 (i.e.
4566 min (x, y) == x). While we add one instruction (the minimum),
4567 we remove the need for two instructions in the negation, as the
4568 result already comes out with the desired polarity.
4569 When using masks, do it for SI/DImode element types, as it is shorter
4570 than the two subtractions. */
4571 if ((code != EQ
4572 && GET_MODE_SIZE (mode) != 64
4573 && vector_all_ones_operand (opfalse, data_mode)
4574 && optrue == CONST0_RTX (data_mode))
4575 || (code == GTU
4576 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4577 /* Don't do it, though, if we are not using integer masks and
4578 would already end up with the right values in the registers. */
4579 && (GET_MODE_SIZE (mode) == 64
4580 || !vector_all_ones_operand (optrue, data_mode)
4581 || opfalse != CONST0_RTX (data_mode))))
4582 {
4583 rtx (*gen) (rtx, rtx, rtx) = NULL;
4584
4585 switch (mode)
4586 {
4587 case E_V16SImode:
4588 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4589 break;
4590 case E_V8DImode:
4591 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4592 cop0 = force_reg (mode, cop0);
4593 cop1 = force_reg (mode, cop1);
4594 break;
4595 case E_V32QImode:
4596 if (TARGET_AVX2)
4597 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4598 break;
4599 case E_V16HImode:
4600 if (TARGET_AVX2)
4601 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4602 break;
4603 case E_V8SImode:
4604 if (TARGET_AVX2)
4605 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4606 break;
4607 case E_V4DImode:
4608 if (TARGET_AVX512VL)
4609 {
4610 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4611 cop0 = force_reg (mode, cop0);
4612 cop1 = force_reg (mode, cop1);
4613 }
4614 break;
4615 case E_V16QImode:
4616 if (code == GTU && TARGET_SSE2)
4617 gen = gen_uminv16qi3;
4618 else if (code == GT && TARGET_SSE4_1)
4619 gen = gen_sminv16qi3;
4620 break;
4621 case E_V8QImode:
4622 if (code == GTU && TARGET_SSE2)
4623 gen = gen_uminv8qi3;
4624 else if (code == GT && TARGET_SSE4_1)
4625 gen = gen_sminv8qi3;
4626 break;
4627 case E_V4QImode:
4628 if (code == GTU && TARGET_SSE2)
4629 gen = gen_uminv4qi3;
4630 else if (code == GT && TARGET_SSE4_1)
4631 gen = gen_sminv4qi3;
4632 break;
4633 case E_V2QImode:
4634 if (code == GTU && TARGET_SSE2)
4635 gen = gen_uminv2qi3;
4636 else if (code == GT && TARGET_SSE4_1)
4637 gen = gen_sminv2qi3;
4638 break;
4639 case E_V8HImode:
4640 if (code == GTU && TARGET_SSE4_1)
4641 gen = gen_uminv8hi3;
4642 else if (code == GT && TARGET_SSE2)
4643 gen = gen_sminv8hi3;
4644 break;
4645 case E_V4HImode:
4646 if (code == GTU && TARGET_SSE4_1)
4647 gen = gen_uminv4hi3;
4648 else if (code == GT && TARGET_SSE2)
4649 gen = gen_sminv4hi3;
4650 break;
4651 case E_V2HImode:
4652 if (code == GTU && TARGET_SSE4_1)
4653 gen = gen_uminv2hi3;
4654 else if (code == GT && TARGET_SSE2)
4655 gen = gen_sminv2hi3;
4656 break;
4657 case E_V4SImode:
4658 if (TARGET_SSE4_1)
4659 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4660 break;
4661 case E_V2SImode:
4662 if (TARGET_SSE4_1)
4663 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4664 break;
4665 case E_V2DImode:
4666 if (TARGET_AVX512VL)
4667 {
4668 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4669 cop0 = force_reg (mode, cop0);
4670 cop1 = force_reg (mode, cop1);
4671 }
4672 break;
4673 default:
4674 break;
4675 }
4676
4677 if (gen)
4678 {
4679 rtx tem = gen_reg_rtx (mode);
4680 if (!vector_operand (cop0, mode))
4681 cop0 = force_reg (mode, cop0);
4682 if (!vector_operand (cop1, mode))
4683 cop1 = force_reg (mode, cop1);
4684 *negate = !*negate;
4685 emit_insn (gen (tem, cop0, cop1));
4686 cop1 = tem;
4687 code = EQ;
4688 }
4689 }
4690
4691 /* Unsigned parallel compare is not supported by the hardware.
4692 Play some tricks to turn this into a signed comparison, or into
4693 a comparison against 0 for the narrow element modes. */
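      /* Roughly (not literal output): for SI/DI elements the sign bit of
         both operands is flipped by subtracting the sign-bit mask and a
         signed PCMPGT is used; for QI/HI elements a saturating unsigned
         subtraction is emitted and its result is compared against zero,
         with the polarity then negated.  */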
4694 if (code == GTU)
4695 {
4696 cop0 = force_reg (mode, cop0);
4697
4698 switch (mode)
4699 {
4700 case E_V16SImode:
4701 case E_V8DImode:
4702 case E_V8SImode:
4703 case E_V4DImode:
4704 case E_V4SImode:
4705 case E_V2SImode:
4706 case E_V2DImode:
4707 {
4708 rtx t1, t2, mask;
4709
4710 /* Subtract (-(INT MAX) - 1) from both operands to make
4711 them signed. */
4712 mask = ix86_build_signbit_mask (mode, true, false);
4713 t1 = gen_reg_rtx (mode);
4714 emit_insn (gen_sub3_insn (t1, cop0, mask));
4715
4716 t2 = gen_reg_rtx (mode);
4717 emit_insn (gen_sub3_insn (t2, cop1, mask));
4718
4719 cop0 = t1;
4720 cop1 = t2;
4721 code = GT;
4722 }
4723 break;
4724
4725 case E_V64QImode:
4726 case E_V32HImode:
4727 case E_V32QImode:
4728 case E_V16HImode:
4729 case E_V16QImode:
4730 case E_V8QImode:
4731 case E_V4QImode:
4732 case E_V2QImode:
4733 case E_V8HImode:
4734 case E_V4HImode:
4735 case E_V2HImode:
4736 /* Perform a parallel unsigned saturating subtraction. */
4737 x = gen_reg_rtx (mode);
4738 emit_insn (gen_rtx_SET
4739 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4740 cop0 = x;
4741 cop1 = CONST0_RTX (mode);
4742 code = EQ;
4743 *negate = !*negate;
4744 break;
4745
4746 default:
4747 gcc_unreachable ();
4748 }
4749 }
4750 }
4751
4752 if (*negate)
4753 std::swap (op_true, op_false);
4754
4755 /* Allow the comparison to be done in one mode, but the movcc to
4756 happen in another mode. */
4757 if (data_mode == mode)
4758 {
4759 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4760 op_true, op_false);
4761 }
4762 else
4763 {
4764 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4765 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4766 op_true, op_false);
4767 if (GET_MODE (x) == mode)
4768 x = gen_lowpart (data_mode, x);
4769 }
4770
4771 return x;
4772 }
4773
4774 /* Expand integer vector comparison. */
4775
4776 bool
4777 ix86_expand_int_vec_cmp (rtx operands[])
4778 {
4779 rtx_code code = GET_CODE (operands[1]);
4780 bool negate = false;
4781 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4782 operands[3], NULL, NULL, &negate);
4783
4784 if (!cmp)
4785 return false;
4786
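  /* The canonicalization above may have computed the inverse comparison;
     since every element of CMP is either all-ones or all-zeros, comparing
     it for equality with zero flips each element to the desired value.  */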
4787 if (negate)
4788 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4789 CONST0_RTX (GET_MODE (cmp)),
4790 NULL, NULL, &negate);
4791
4792 gcc_assert (!negate);
4793
4794 if (operands[0] != cmp)
4795 emit_move_insn (operands[0], cmp);
4796
4797 return true;
4798 }
4799
4800 /* Expand a floating-point vector conditional move; a vcond operation
4801 rather than a movcc operation. */
4802
4803 bool
4804 ix86_expand_fp_vcond (rtx operands[])
4805 {
4806 enum rtx_code code = GET_CODE (operands[3]);
4807 rtx cmp;
4808
4809 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4810 &operands[4], &operands[5]);
4811 if (code == UNKNOWN)
4812 {
4813 rtx temp;
4814 switch (GET_CODE (operands[3]))
4815 {
4816 case LTGT:
4817 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4818 operands[5], operands[0], operands[0]);
4819 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4820 operands[5], operands[1], operands[2]);
4821 code = AND;
4822 break;
4823 case UNEQ:
4824 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4825 operands[5], operands[0], operands[0]);
4826 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4827 operands[5], operands[1], operands[2]);
4828 code = IOR;
4829 break;
4830 default:
4831 gcc_unreachable ();
4832 }
4833 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4834 OPTAB_DIRECT);
4835 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4836 return true;
4837 }
4838
4839 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4840 operands[5], operands[1], operands[2]))
4841 return true;
4842
4843 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4844 operands[1], operands[2]);
4845 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4846 return true;
4847 }
4848
4849 /* Expand a signed/unsigned integral vector conditional move. */
4850
4851 bool
4852 ix86_expand_int_vcond (rtx operands[])
4853 {
4854 machine_mode data_mode = GET_MODE (operands[0]);
4855 machine_mode mode = GET_MODE (operands[4]);
4856 enum rtx_code code = GET_CODE (operands[3]);
4857 bool negate = false;
4858 rtx x, cop0, cop1;
4859
4860 cop0 = operands[4];
4861 cop1 = operands[5];
4862
4863 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4864 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4865 if ((code == LT || code == GE)
4866 && data_mode == mode
4867 && cop1 == CONST0_RTX (mode)
4868 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4869 && GET_MODE_UNIT_SIZE (data_mode) > 1
4870 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4871 && (GET_MODE_SIZE (data_mode) == 16
4872 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4873 {
4874 rtx negop = operands[2 - (code == LT)];
4875 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4876 if (negop == CONST1_RTX (data_mode))
4877 {
4878 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4879 operands[0], 1, OPTAB_DIRECT);
4880 if (res != operands[0])
4881 emit_move_insn (operands[0], res);
4882 return true;
4883 }
4884 else if (GET_MODE_INNER (data_mode) != DImode
4885 && vector_all_ones_operand (negop, data_mode))
4886 {
4887 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4888 operands[0], 0, OPTAB_DIRECT);
4889 if (res != operands[0])
4890 emit_move_insn (operands[0], res);
4891 return true;
4892 }
4893 }
4894
4895 if (!nonimmediate_operand (cop1, mode))
4896 cop1 = force_reg (mode, cop1);
4897 if (!general_operand (operands[1], data_mode))
4898 operands[1] = force_reg (data_mode, operands[1]);
4899 if (!general_operand (operands[2], data_mode))
4900 operands[2] = force_reg (data_mode, operands[2]);
4901
4902 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4903 operands[1], operands[2], &negate);
4904
4905 if (!x)
4906 return false;
4907
4908 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4909 operands[2-negate]);
4910 return true;
4911 }
4912
4913 static bool
4914 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4915 struct expand_vec_perm_d *d)
4916 {
4917 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4918 expanders, so args are either in d, or in op0, op1 etc. */
4919 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4920 machine_mode maskmode = mode;
4921 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4922
4923 switch (mode)
4924 {
4925 case E_V16QImode:
4926 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4927 gen = gen_avx512vl_vpermt2varv16qi3;
4928 break;
4929 case E_V32QImode:
4930 if (TARGET_AVX512VL && TARGET_AVX512VBMI)
4931 gen = gen_avx512vl_vpermt2varv32qi3;
4932 break;
4933 case E_V64QImode:
4934 if (TARGET_AVX512VBMI)
4935 gen = gen_avx512bw_vpermt2varv64qi3;
4936 break;
4937 case E_V8HImode:
4938 if (TARGET_AVX512VL && TARGET_AVX512BW)
4939 gen = gen_avx512vl_vpermt2varv8hi3;
4940 break;
4941 case E_V16HImode:
4942 if (TARGET_AVX512VL && TARGET_AVX512BW)
4943 gen = gen_avx512vl_vpermt2varv16hi3;
4944 break;
4945 case E_V32HImode:
4946 if (TARGET_AVX512BW)
4947 gen = gen_avx512bw_vpermt2varv32hi3;
4948 break;
4949 case E_V4SImode:
4950 if (TARGET_AVX512VL)
4951 gen = gen_avx512vl_vpermt2varv4si3;
4952 break;
4953 case E_V8SImode:
4954 if (TARGET_AVX512VL)
4955 gen = gen_avx512vl_vpermt2varv8si3;
4956 break;
4957 case E_V16SImode:
4958 if (TARGET_AVX512F)
4959 gen = gen_avx512f_vpermt2varv16si3;
4960 break;
4961 case E_V4SFmode:
4962 if (TARGET_AVX512VL)
4963 {
4964 gen = gen_avx512vl_vpermt2varv4sf3;
4965 maskmode = V4SImode;
4966 }
4967 break;
4968 case E_V8SFmode:
4969 if (TARGET_AVX512VL)
4970 {
4971 gen = gen_avx512vl_vpermt2varv8sf3;
4972 maskmode = V8SImode;
4973 }
4974 break;
4975 case E_V16SFmode:
4976 if (TARGET_AVX512F)
4977 {
4978 gen = gen_avx512f_vpermt2varv16sf3;
4979 maskmode = V16SImode;
4980 }
4981 break;
4982 case E_V2DImode:
4983 if (TARGET_AVX512VL)
4984 gen = gen_avx512vl_vpermt2varv2di3;
4985 break;
4986 case E_V4DImode:
4987 if (TARGET_AVX512VL)
4988 gen = gen_avx512vl_vpermt2varv4di3;
4989 break;
4990 case E_V8DImode:
4991 if (TARGET_AVX512F)
4992 gen = gen_avx512f_vpermt2varv8di3;
4993 break;
4994 case E_V2DFmode:
4995 if (TARGET_AVX512VL)
4996 {
4997 gen = gen_avx512vl_vpermt2varv2df3;
4998 maskmode = V2DImode;
4999 }
5000 break;
5001 case E_V4DFmode:
5002 if (TARGET_AVX512VL)
5003 {
5004 gen = gen_avx512vl_vpermt2varv4df3;
5005 maskmode = V4DImode;
5006 }
5007 break;
5008 case E_V8DFmode:
5009 if (TARGET_AVX512F)
5010 {
5011 gen = gen_avx512f_vpermt2varv8df3;
5012 maskmode = V8DImode;
5013 }
5014 break;
5015 default:
5016 break;
5017 }
5018
5019 if (gen == NULL)
5020 return false;
5021
5022 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5023 expanders, so args are either in d, or in op0, op1 etc. */
5024 if (d)
5025 {
5026 rtx vec[64];
5027 target = d->target;
5028 op0 = d->op0;
5029 op1 = d->op1;
5030 for (int i = 0; i < d->nelt; ++i)
5031 vec[i] = GEN_INT (d->perm[i]);
5032 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
5033 }
5034
5035 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5036 return true;
5037 }
5038
5039 /* Expand a variable vector permutation. */
5040
5041 void
5042 ix86_expand_vec_perm (rtx operands[])
5043 {
5044 rtx target = operands[0];
5045 rtx op0 = operands[1];
5046 rtx op1 = operands[2];
5047 rtx mask = operands[3];
5048 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5049 machine_mode mode = GET_MODE (op0);
5050 machine_mode maskmode = GET_MODE (mask);
5051 int w, e, i;
5052 bool one_operand_shuffle = rtx_equal_p (op0, op1);
5053
5054 /* Number of elements in the vector. */
5055 w = GET_MODE_NUNITS (mode);
5056 e = GET_MODE_UNIT_SIZE (mode);
5057 gcc_assert (w <= 64);
5058
5059 /* For an HFmode vector, convert it to HImode using a subreg. */
5060 if (GET_MODE_INNER (mode) == HFmode)
5061 {
5062 machine_mode orig_mode = mode;
5063 mode = mode_for_vector (HImode, w).require ();
5064 target = lowpart_subreg (mode, target, orig_mode);
5065 op0 = lowpart_subreg (mode, op0, orig_mode);
5066 op1 = lowpart_subreg (mode, op1, orig_mode);
5067 }
5068
5069 if (TARGET_AVX512F && one_operand_shuffle)
5070 {
5071 rtx (*gen) (rtx, rtx, rtx) = NULL;
5072 switch (mode)
5073 {
5074 case E_V16SImode:
5075 gen = gen_avx512f_permvarv16si;
5076 break;
5077 case E_V16SFmode:
5078 gen = gen_avx512f_permvarv16sf;
5079 break;
5080 case E_V8DImode:
5081 gen = gen_avx512f_permvarv8di;
5082 break;
5083 case E_V8DFmode:
5084 gen = gen_avx512f_permvarv8df;
5085 break;
5086 default:
5087 break;
5088 }
5089 if (gen != NULL)
5090 {
5091 emit_insn (gen (target, op0, mask));
5092 return;
5093 }
5094 }
5095
5096 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5097 return;
5098
5099 if (TARGET_AVX2)
5100 {
5101 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5102 {
5103 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5104 a constant shuffle operand. With a tiny bit of effort we can
5105 use VPERMD instead. A re-interpretation stall for V4DFmode is
5106 unfortunate but there's no avoiding it.
5107 Similarly, for V16HImode we don't have instructions for variable
5108 shuffling, while for V32QImode we can, after preparing suitable
5109 masks, use vpshufb; vpshufb; vpermq; vpor. */
5110
5111 if (mode == V16HImode)
5112 {
5113 maskmode = mode = V32QImode;
5114 w = 32;
5115 e = 1;
5116 }
5117 else
5118 {
5119 maskmode = mode = V8SImode;
5120 w = 8;
5121 e = 4;
5122 }
5123 t1 = gen_reg_rtx (maskmode);
5124
5125 /* Replicate the low bits of the V4DImode mask into V8SImode:
5126 mask = { A B C D }
5127 t1 = { A A B B C C D D }. */
5128 for (i = 0; i < w / 2; ++i)
5129 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5130 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5131 vt = force_reg (maskmode, vt);
5132 mask = gen_lowpart (maskmode, mask);
5133 if (maskmode == V8SImode)
5134 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5135 else
5136 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5137
5138 /* Multiply the shuffle indices by two. */
5139 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5140 OPTAB_DIRECT);
5141
5142 /* Add one to the odd shuffle indices:
5143 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5144 for (i = 0; i < w / 2; ++i)
5145 {
5146 vec[i * 2] = const0_rtx;
5147 vec[i * 2 + 1] = const1_rtx;
5148 }
5149 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5150 vt = validize_mem (force_const_mem (maskmode, vt));
5151 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5152 OPTAB_DIRECT);
5153
5154 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5155 operands[3] = mask = t1;
5156 target = gen_reg_rtx (mode);
5157 op0 = gen_lowpart (mode, op0);
5158 op1 = gen_lowpart (mode, op1);
5159 }
5160
5161 switch (mode)
5162 {
5163 case E_V8SImode:
5164 /* The VPERMD and VPERMPS instructions already properly ignore
5165 the high bits of the shuffle elements. No need for us to
5166 perform an AND ourselves. */
5167 if (one_operand_shuffle)
5168 {
5169 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5170 if (target != operands[0])
5171 emit_move_insn (operands[0],
5172 gen_lowpart (GET_MODE (operands[0]), target));
5173 }
5174 else
5175 {
5176 t1 = gen_reg_rtx (V8SImode);
5177 t2 = gen_reg_rtx (V8SImode);
5178 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5179 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5180 goto merge_two;
5181 }
5182 return;
5183
5184 case E_V8SFmode:
5185 mask = gen_lowpart (V8SImode, mask);
5186 if (one_operand_shuffle)
5187 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5188 else
5189 {
5190 t1 = gen_reg_rtx (V8SFmode);
5191 t2 = gen_reg_rtx (V8SFmode);
5192 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5193 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5194 goto merge_two;
5195 }
5196 return;
5197
5198 case E_V4SImode:
5199 /* By combining the two 128-bit input vectors into one 256-bit
5200 input vector, we can use VPERMD and VPERMPS for the full
5201 two-operand shuffle. */
5202 t1 = gen_reg_rtx (V8SImode);
5203 t2 = gen_reg_rtx (V8SImode);
5204 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5205 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5206 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5207 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5208 return;
5209
5210 case E_V4SFmode:
5211 t1 = gen_reg_rtx (V8SFmode);
5212 t2 = gen_reg_rtx (V8SImode);
5213 mask = gen_lowpart (V4SImode, mask);
5214 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5215 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5216 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5217 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5218 return;
5219
5220 case E_V32QImode:
5221 t1 = gen_reg_rtx (V32QImode);
5222 t2 = gen_reg_rtx (V32QImode);
5223 t3 = gen_reg_rtx (V32QImode);
5224 vt2 = GEN_INT (-128);
5225 vt = gen_const_vec_duplicate (V32QImode, vt2);
5226 vt = force_reg (V32QImode, vt);
5227 for (i = 0; i < 32; i++)
5228 vec[i] = i < 16 ? vt2 : const0_rtx;
5229 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5230 vt2 = force_reg (V32QImode, vt2);
5231 /* From mask create two adjusted masks, which contain the same
5232 bits as mask in the low 7 bits of each vector element.
5233 The first mask will have the most significant bit clear
5234 if it requests element from the same 128-bit lane
5235 and MSB set if it requests element from the other 128-bit lane.
5236 The second mask will have the opposite values of the MSB,
5237 and additionally will have its 128-bit lanes swapped.
5238 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5239 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5240 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5241 stands for the other 12 bytes. */
5242 /* The bit that tells whether an element is from the same lane or the
5243 other lane is bit 4, so shift it up by 3 to the MSB position. */
5244 t5 = gen_reg_rtx (V4DImode);
5245 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5246 GEN_INT (3)));
5247 /* Clear MSB bits from the mask just in case it had them set. */
5248 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5249 /* After this t1 will have MSB set for elements from other lane. */
5250 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5251 /* Clear bits other than MSB. */
5252 emit_insn (gen_andv32qi3 (t1, t1, vt));
5253 /* Or in the lower bits from mask into t3. */
5254 emit_insn (gen_iorv32qi3 (t3, t1, t2));
5255 /* And invert MSB bits in t1, so MSB is set for elements from the same
5256 lane. */
5257 emit_insn (gen_xorv32qi3 (t1, t1, vt));
5258 /* Swap 128-bit lanes in t3. */
5259 t6 = gen_reg_rtx (V4DImode);
5260 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5261 const2_rtx, GEN_INT (3),
5262 const0_rtx, const1_rtx));
5263 /* And or in the lower bits from mask into t1. */
5264 emit_insn (gen_iorv32qi3 (t1, t1, t2));
5265 if (one_operand_shuffle)
5266 {
5267 /* Each of these shuffles will put 0s in places where
5268 element from the other 128-bit lane is needed, otherwise
5269 will shuffle in the requested value. */
5270 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5271 gen_lowpart (V32QImode, t6)));
5272 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5273 /* For t3 the 128-bit lanes are swapped again. */
5274 t7 = gen_reg_rtx (V4DImode);
5275 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5276 const2_rtx, GEN_INT (3),
5277 const0_rtx, const1_rtx));
5278 /* And ORing both together leads to the result. */
5279 emit_insn (gen_iorv32qi3 (target, t1,
5280 gen_lowpart (V32QImode, t7)));
5281 if (target != operands[0])
5282 emit_move_insn (operands[0],
5283 gen_lowpart (GET_MODE (operands[0]), target));
5284 return;
5285 }
5286
5287 t4 = gen_reg_rtx (V32QImode);
5288 /* Similarly to the one_operand_shuffle code above, just repeated
5289 twice, once for each operand; the merge_two: code below will
5290 merge the two results together. */
5291 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5292 gen_lowpart (V32QImode, t6)));
5293 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5294 gen_lowpart (V32QImode, t6)));
5295 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5296 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5297 t7 = gen_reg_rtx (V4DImode);
5298 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5299 const2_rtx, GEN_INT (3),
5300 const0_rtx, const1_rtx));
5301 t8 = gen_reg_rtx (V4DImode);
5302 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5303 const2_rtx, GEN_INT (3),
5304 const0_rtx, const1_rtx));
5305 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5306 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5307 t1 = t4;
5308 t2 = t3;
5309 goto merge_two;
5310
5311 default:
5312 gcc_assert (GET_MODE_SIZE (mode) <= 16);
5313 break;
5314 }
5315 }
5316
5317 if (TARGET_XOP)
5318 {
5319 /* The XOP VPPERM insn supports three inputs. By ignoring the
5320 one_operand_shuffle special case, we avoid creating another
5321 set of constant vectors in memory. */
5322 one_operand_shuffle = false;
5323
5324 /* mask = mask & {2*w-1, ...} */
5325 vt = GEN_INT (2*w - 1);
5326 }
5327 else
5328 {
5329 /* mask = mask & {w-1, ...} */
5330 vt = GEN_INT (w - 1);
5331 }
5332
5333 vt = gen_const_vec_duplicate (maskmode, vt);
5334 mask = expand_simple_binop (maskmode, AND, mask, vt,
5335 NULL_RTX, 0, OPTAB_DIRECT);
5336
5337 /* For non-QImode operations, convert the word permutation control
5338 into a byte permutation control. */
5339 if (mode != V16QImode)
5340 {
5341 mask = expand_simple_binop (maskmode, ASHIFT, mask,
5342 GEN_INT (exact_log2 (e)),
5343 NULL_RTX, 0, OPTAB_DIRECT);
5344
5345 /* Convert mask to vector of chars. */
5346 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5347
5348 /* Replicate each of the input bytes into byte positions:
5349 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5350 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5351 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5352 for (i = 0; i < 16; ++i)
5353 vec[i] = GEN_INT (i/e * e);
5354 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5355 vt = validize_mem (force_const_mem (V16QImode, vt));
5356 if (TARGET_XOP)
5357 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5358 else
5359 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5360
5361 /* Convert it into the byte positions by doing
5362 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5363 for (i = 0; i < 16; ++i)
5364 vec[i] = GEN_INT (i % e);
5365 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5366 vt = validize_mem (force_const_mem (V16QImode, vt));
5367 emit_insn (gen_addv16qi3 (mask, mask, vt));
5368 }
5369
5370 /* The actual shuffle operations all operate on V16QImode. */
5371 op0 = gen_lowpart (V16QImode, op0);
5372 op1 = gen_lowpart (V16QImode, op1);
5373
5374 if (TARGET_XOP)
5375 {
5376 if (GET_MODE (target) != V16QImode)
5377 target = gen_reg_rtx (V16QImode);
5378 emit_insn (gen_xop_pperm (target, op0, op1, mask));
5379 if (target != operands[0])
5380 emit_move_insn (operands[0],
5381 gen_lowpart (GET_MODE (operands[0]), target));
5382 }
5383 else if (one_operand_shuffle)
5384 {
5385 if (GET_MODE (target) != V16QImode)
5386 target = gen_reg_rtx (V16QImode);
5387 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5388 if (target != operands[0])
5389 emit_move_insn (operands[0],
5390 gen_lowpart (GET_MODE (operands[0]), target));
5391 }
5392 else
5393 {
5394 rtx xops[6];
5395 bool ok;
5396
5397 /* Shuffle the two input vectors independently. */
5398 t1 = gen_reg_rtx (V16QImode);
5399 t2 = gen_reg_rtx (V16QImode);
5400 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5401 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5402
5403 merge_two:
5404 /* Then merge them together. The key is whether any given control
5405 element contained a bit set that indicates the second word. */
5406 mask = operands[3];
5407 vt = GEN_INT (w);
5408 if (maskmode == V2DImode && !TARGET_SSE4_1)
5409 {
5410 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5411 more shuffle to convert the V2DI input mask into a V4SI
5412 input mask, at which point the masking that
5413 ix86_expand_int_vcond does will work as desired. */
5414 rtx t3 = gen_reg_rtx (V4SImode);
5415 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5416 const0_rtx, const0_rtx,
5417 const2_rtx, const2_rtx));
5418 mask = t3;
5419 maskmode = V4SImode;
5420 e = w = 4;
5421 }
5422
5423 vt = gen_const_vec_duplicate (maskmode, vt);
5424 vt = force_reg (maskmode, vt);
5425 mask = expand_simple_binop (maskmode, AND, mask, vt,
5426 NULL_RTX, 0, OPTAB_DIRECT);
5427
5428 if (GET_MODE (target) != mode)
5429 target = gen_reg_rtx (mode);
5430 xops[0] = target;
5431 xops[1] = gen_lowpart (mode, t2);
5432 xops[2] = gen_lowpart (mode, t1);
5433 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5434 xops[4] = mask;
5435 xops[5] = vt;
5436 ok = ix86_expand_int_vcond (xops);
5437 gcc_assert (ok);
5438 if (target != operands[0])
5439 emit_move_insn (operands[0],
5440 gen_lowpart (GET_MODE (operands[0]), target));
5441 }
5442 }
5443
5444 /* Unpack SRC into DEST, widening to the next wider integer vector type.
5445 UNSIGNED_P is true if we should do zero extension, else sign extension.
5446 HIGH_P is true if we want the N/2 high elements, else the low elements. */
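/* A sketch of the strategy below: with SSE4.1 or later the low half is
   extended directly with pmovsx/pmovzx; for the high half the source is
   first shifted right by half its width (e.g. psrldq by 8 bytes for a
   128-bit vector) or, for 256/512-bit vectors, extracted with vextract*.
   Without SSE4.1 the halves are interleaved (punpckl/punpckh) with either
   zero or a pcmpgt-computed sign mask.  */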
5447
5448 void
5449 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5450 {
5451 machine_mode imode = GET_MODE (src);
5452 rtx tmp;
5453
5454 if (TARGET_SSE4_1)
5455 {
5456 rtx (*unpack)(rtx, rtx);
5457 rtx (*extract)(rtx, rtx) = NULL;
5458 machine_mode halfmode = BLKmode;
5459
5460 switch (imode)
5461 {
5462 case E_V64QImode:
5463 if (unsigned_p)
5464 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5465 else
5466 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5467 halfmode = V32QImode;
5468 extract
5469 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5470 break;
5471 case E_V32QImode:
5472 if (unsigned_p)
5473 unpack = gen_avx2_zero_extendv16qiv16hi2;
5474 else
5475 unpack = gen_avx2_sign_extendv16qiv16hi2;
5476 halfmode = V16QImode;
5477 extract
5478 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5479 break;
5480 case E_V32HImode:
5481 if (unsigned_p)
5482 unpack = gen_avx512f_zero_extendv16hiv16si2;
5483 else
5484 unpack = gen_avx512f_sign_extendv16hiv16si2;
5485 halfmode = V16HImode;
5486 extract
5487 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5488 break;
5489 case E_V16HImode:
5490 if (unsigned_p)
5491 unpack = gen_avx2_zero_extendv8hiv8si2;
5492 else
5493 unpack = gen_avx2_sign_extendv8hiv8si2;
5494 halfmode = V8HImode;
5495 extract
5496 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5497 break;
5498 case E_V16SImode:
5499 if (unsigned_p)
5500 unpack = gen_avx512f_zero_extendv8siv8di2;
5501 else
5502 unpack = gen_avx512f_sign_extendv8siv8di2;
5503 halfmode = V8SImode;
5504 extract
5505 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5506 break;
5507 case E_V8SImode:
5508 if (unsigned_p)
5509 unpack = gen_avx2_zero_extendv4siv4di2;
5510 else
5511 unpack = gen_avx2_sign_extendv4siv4di2;
5512 halfmode = V4SImode;
5513 extract
5514 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5515 break;
5516 case E_V16QImode:
5517 if (unsigned_p)
5518 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5519 else
5520 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5521 break;
5522 case E_V8HImode:
5523 if (unsigned_p)
5524 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5525 else
5526 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5527 break;
5528 case E_V4SImode:
5529 if (unsigned_p)
5530 unpack = gen_sse4_1_zero_extendv2siv2di2;
5531 else
5532 unpack = gen_sse4_1_sign_extendv2siv2di2;
5533 break;
5534 case E_V8QImode:
5535 if (unsigned_p)
5536 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5537 else
5538 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5539 break;
5540 case E_V4HImode:
5541 if (unsigned_p)
5542 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5543 else
5544 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5545 break;
5546 case E_V4QImode:
5547 if (unsigned_p)
5548 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5549 else
5550 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5551 break;
5552 default:
5553 gcc_unreachable ();
5554 }
5555
5556 if (GET_MODE_SIZE (imode) >= 32)
5557 {
5558 tmp = gen_reg_rtx (halfmode);
5559 emit_insn (extract (tmp, src));
5560 }
5561 else if (high_p)
5562 {
5563 switch (GET_MODE_SIZE (imode))
5564 {
5565 case 16:
5566 /* Shift higher 8 bytes to lower 8 bytes. */
5567 tmp = gen_reg_rtx (V1TImode);
5568 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5569 GEN_INT (64)));
5570 break;
5571 case 8:
5572 /* Shift higher 4 bytes to lower 4 bytes. */
5573 tmp = gen_reg_rtx (V1DImode);
5574 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5575 GEN_INT (32)));
5576 break;
5577 case 4:
5578 /* Shift higher 2 bytes to lower 2 bytes. */
5579 tmp = gen_reg_rtx (V1SImode);
5580 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5581 GEN_INT (16)));
5582 break;
5583 default:
5584 gcc_unreachable ();
5585 }
5586
5587 tmp = gen_lowpart (imode, tmp);
5588 }
5589 else
5590 tmp = src;
5591
5592 emit_insn (unpack (dest, tmp));
5593 }
5594 else
5595 {
5596 rtx (*unpack)(rtx, rtx, rtx);
5597
5598 switch (imode)
5599 {
5600 case E_V16QImode:
5601 if (high_p)
5602 unpack = gen_vec_interleave_highv16qi;
5603 else
5604 unpack = gen_vec_interleave_lowv16qi;
5605 break;
5606 case E_V8HImode:
5607 if (high_p)
5608 unpack = gen_vec_interleave_highv8hi;
5609 else
5610 unpack = gen_vec_interleave_lowv8hi;
5611 break;
5612 case E_V4SImode:
5613 if (high_p)
5614 unpack = gen_vec_interleave_highv4si;
5615 else
5616 unpack = gen_vec_interleave_lowv4si;
5617 break;
5618 case E_V8QImode:
5619 if (high_p)
5620 unpack = gen_mmx_punpckhbw;
5621 else
5622 unpack = gen_mmx_punpcklbw;
5623 break;
5624 case E_V4HImode:
5625 if (high_p)
5626 unpack = gen_mmx_punpckhwd;
5627 else
5628 unpack = gen_mmx_punpcklwd;
5629 break;
5630 case E_V4QImode:
5631 if (high_p)
5632 unpack = gen_mmx_punpckhbw_low;
5633 else
5634 unpack = gen_mmx_punpcklbw_low;
5635 break;
5636 default:
5637 gcc_unreachable ();
5638 }
5639
5640 if (unsigned_p)
5641 tmp = force_reg (imode, CONST0_RTX (imode));
5642 else
5643 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5644 src, pc_rtx, pc_rtx);
5645
5646 rtx tmp2 = gen_reg_rtx (imode);
5647 emit_insn (unpack (tmp2, src, tmp));
5648 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5649 }
5650 }
5651
5652 /* Return true if MEM is a constant-pool reference containing a
5653 CONST_VECTOR permutation index; if so, assign the index to PERM. */
5654 bool
5655 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5656 {
5657 machine_mode mode = GET_MODE (mem);
5658 int nelt = GET_MODE_NUNITS (mode);
5659
5660 if (!INTEGRAL_MODE_P (mode))
5661 return false;
5662
5663 /* Needs to be constant pool. */
5664 if (!(MEM_P (mem))
5665 || !SYMBOL_REF_P (XEXP (mem, 0))
5666 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5667 return false;
5668
5669 rtx constant = get_pool_constant (XEXP (mem, 0));
5670
5671 if (GET_CODE (constant) != CONST_VECTOR)
5672 return false;
5673
5674 /* There could be some rtx like
5675 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5676 where "*.LC1" refers to a V2DI constant vector. */
5677 if (GET_MODE (constant) != mode)
5678 {
5679 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5680
5681 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5682 return false;
5683 }
5684
5685 for (int i = 0; i != nelt; i++)
5686 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5687
5688 return true;
5689 }
5690
5691 /* Split OPERAND into word-sized parts, storing them in PARTS. Similar to
5692 split_double_mode, but works for floating-point parameters and
5693 non-offsettable memories. For pushes, it returns just stack offsets;
5694 the values will be saved in the right order. Up to four parts are generated. */
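/* For instance (informal): on a 32-bit target an XFmode value is returned
   as three SImode parts and a TFmode value as four, while on a 64-bit
   target XFmode and TFmode are returned as a DImode low part plus an
   SImode or DImode upper part respectively.  */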
5695
5696 static int
5697 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5698 {
5699 int size;
5700
5701 if (!TARGET_64BIT)
5702 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5703 else
5704 size = (GET_MODE_SIZE (mode) + 4) / 8;
5705
5706 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5707 gcc_assert (size >= 2 && size <= 4);
5708
5709 /* Optimize constant pool references to immediates. This is used by fp
5710 moves, which force all constants to memory to allow combining. */
5711 if (MEM_P (operand) && MEM_READONLY_P (operand))
5712 operand = avoid_constant_pool_reference (operand);
5713
5714 if (MEM_P (operand) && !offsettable_memref_p (operand))
5715 {
5716 /* The only non-offsettable memories we handle are pushes. */
5717 int ok = push_operand (operand, VOIDmode);
5718
5719 gcc_assert (ok);
5720
5721 operand = copy_rtx (operand);
5722 PUT_MODE (operand, word_mode);
5723 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5724 return size;
5725 }
5726
5727 if (GET_CODE (operand) == CONST_VECTOR)
5728 {
5729 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5730 /* Caution: if we looked through a constant pool memory above,
5731 the operand may actually have a different mode now. That's
5732 ok, since we want to pun this all the way back to an integer. */
5733 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5734 gcc_assert (operand != NULL);
5735 mode = imode;
5736 }
5737
5738 if (!TARGET_64BIT)
5739 {
5740 if (mode == DImode)
5741 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5742 else
5743 {
5744 int i;
5745
5746 if (REG_P (operand))
5747 {
5748 gcc_assert (reload_completed);
5749 for (i = 0; i < size; i++)
5750 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5751 }
5752 else if (offsettable_memref_p (operand))
5753 {
5754 operand = adjust_address (operand, SImode, 0);
5755 parts[0] = operand;
5756 for (i = 1; i < size; i++)
5757 parts[i] = adjust_address (operand, SImode, 4 * i);
5758 }
5759 else if (CONST_DOUBLE_P (operand))
5760 {
5761 const REAL_VALUE_TYPE *r;
5762 long l[4];
5763
5764 r = CONST_DOUBLE_REAL_VALUE (operand);
5765 switch (mode)
5766 {
5767 case E_TFmode:
5768 real_to_target (l, r, mode);
5769 parts[3] = gen_int_mode (l[3], SImode);
5770 parts[2] = gen_int_mode (l[2], SImode);
5771 break;
5772 case E_XFmode:
5773 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5774 long double may not be 80-bit. */
5775 real_to_target (l, r, mode);
5776 parts[2] = gen_int_mode (l[2], SImode);
5777 break;
5778 case E_DFmode:
5779 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5780 break;
5781 default:
5782 gcc_unreachable ();
5783 }
5784 parts[1] = gen_int_mode (l[1], SImode);
5785 parts[0] = gen_int_mode (l[0], SImode);
5786 }
5787 else
5788 gcc_unreachable ();
5789 }
5790 }
5791 else
5792 {
5793 if (mode == TImode)
5794 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5795 if (mode == XFmode || mode == TFmode)
5796 {
5797 machine_mode upper_mode = mode == XFmode ? SImode : DImode;
5798 if (REG_P (operand))
5799 {
5800 gcc_assert (reload_completed);
5801 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5802 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5803 }
5804 else if (offsettable_memref_p (operand))
5805 {
5806 operand = adjust_address (operand, DImode, 0);
5807 parts[0] = operand;
5808 parts[1] = adjust_address (operand, upper_mode, 8);
5809 }
5810 else if (CONST_DOUBLE_P (operand))
5811 {
5812 long l[4];
5813
5814 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5815
5816 /* real_to_target puts 32-bit pieces in each long. */
5817 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5818 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5819 << 32), DImode);
5820
5821 if (upper_mode == SImode)
5822 parts[1] = gen_int_mode (l[2], SImode);
5823 else
5824 parts[1]
5825 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5826 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5827 << 32), DImode);
5828 }
5829 else
5830 gcc_unreachable ();
5831 }
5832 }
5833
5834 return size;
5835 }
5836
5837 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5838 The destination parts are collected in operands 2 and up and the
5839 corresponding source parts in operands 6 and up, in the correct
5840 order, before the individual part moves are emitted. */
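/* For example (a sketch): a DImode register<->memory move on a 32-bit
   target is rewritten as two SImode moves, with the code below choosing
   an order that does not clobber an address register still needed by the
   source, and with pushes handled specially since the stack pointer
   moves between the parts.  */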
5841
5842 void
5843 ix86_split_long_move (rtx operands[])
5844 {
5845 rtx part[2][4];
5846 int nparts, i, j;
5847 int push = 0;
5848 int collisions = 0;
5849 machine_mode mode = GET_MODE (operands[0]);
5850 bool collisionparts[4];
5851
5852 /* The DFmode expanders may ask us to move a double.
5853 For a 64-bit target this is a single move. By hiding that fact
5854 here we simplify the i386.md splitters. */
5855 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5856 {
5857 /* Optimize constant pool reference to immediates. This is used by
5858 fp moves, that force all constants to memory to allow combining. */
5859
5860 if (MEM_P (operands[1])
5861 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5862 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5863 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5864 if (push_operand (operands[0], VOIDmode))
5865 {
5866 operands[0] = copy_rtx (operands[0]);
5867 PUT_MODE (operands[0], word_mode);
5868 }
5869 else
5870 operands[0] = gen_lowpart (DImode, operands[0]);
5871 operands[1] = gen_lowpart (DImode, operands[1]);
5872 emit_move_insn (operands[0], operands[1]);
5873 return;
5874 }
5875
5876 /* The only non-offsettable memory we handle is a push. */
5877 if (push_operand (operands[0], VOIDmode))
5878 push = 1;
5879 else
5880 gcc_assert (!MEM_P (operands[0])
5881 || offsettable_memref_p (operands[0]));
5882
5883 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5884 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5885
5886 /* When emitting a push, take care of source operands on the stack. */
5887 if (push && MEM_P (operands[1])
5888 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5889 {
5890 rtx src_base = XEXP (part[1][nparts - 1], 0);
5891
5892 /* Compensate for the stack decrement by 4. */
5893 if (!TARGET_64BIT && nparts == 3
5894 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5895 src_base = plus_constant (Pmode, src_base, 4);
5896
5897 /* src_base refers to the stack pointer and is
5898 automatically decreased by each emitted push. */
5899 for (i = 0; i < nparts; i++)
5900 part[1][i] = change_address (part[1][i],
5901 GET_MODE (part[1][i]), src_base);
5902 }
5903
5904 /* We need to do the copy in the right order in case an address register
5905 of the source overlaps the destination. */
5906 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5907 {
5908 rtx tmp;
5909
5910 for (i = 0; i < nparts; i++)
5911 {
5912 collisionparts[i]
5913 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5914 if (collisionparts[i])
5915 collisions++;
5916 }
5917
5918 /* A collision in the middle part can be handled by reordering. */
5919 if (collisions == 1 && nparts == 3 && collisionparts [1])
5920 {
5921 std::swap (part[0][1], part[0][2]);
5922 std::swap (part[1][1], part[1][2]);
5923 }
5924 else if (collisions == 1
5925 && nparts == 4
5926 && (collisionparts [1] || collisionparts [2]))
5927 {
5928 if (collisionparts [1])
5929 {
5930 std::swap (part[0][1], part[0][2]);
5931 std::swap (part[1][1], part[1][2]);
5932 }
5933 else
5934 {
5935 std::swap (part[0][2], part[0][3]);
5936 std::swap (part[1][2], part[1][3]);
5937 }
5938 }
5939
5940 /* If there are more collisions, we can't handle them by reordering.
5941 Do an lea into the last part and use only one colliding move. */
5942 else if (collisions > 1)
5943 {
5944 rtx base, addr;
5945
5946 collisions = 1;
5947
5948 base = part[0][nparts - 1];
5949
5950 /* Handle the case when the last part isn't valid for lea.
5951 Happens in 64-bit mode storing the 12-byte XFmode. */
5952 if (GET_MODE (base) != Pmode)
5953 base = gen_rtx_REG (Pmode, REGNO (base));
5954
5955 addr = XEXP (part[1][0], 0);
5956 if (TARGET_TLS_DIRECT_SEG_REFS)
5957 {
5958 struct ix86_address parts;
5959 int ok = ix86_decompose_address (addr, &parts);
5960 gcc_assert (ok);
5961 /* It is not valid to use %gs: or %fs: in lea. */
5962 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5963 }
5964 emit_insn (gen_rtx_SET (base, addr));
5965 part[1][0] = replace_equiv_address (part[1][0], base);
5966 for (i = 1; i < nparts; i++)
5967 {
5968 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5969 part[1][i] = replace_equiv_address (part[1][i], tmp);
5970 }
5971 }
5972 }
5973
5974 if (push)
5975 {
5976 if (!TARGET_64BIT)
5977 {
5978 if (nparts == 3)
5979 {
5980 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5981 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5982 emit_move_insn (part[0][2], part[1][2]);
5983 }
5984 else if (nparts == 4)
5985 {
5986 emit_move_insn (part[0][3], part[1][3]);
5987 emit_move_insn (part[0][2], part[1][2]);
5988 }
5989 }
5990 else
5991 {
5992           /* In 64-bit mode we don't have a 32-bit push available.  If this is a
5993              register, that is OK - we just use the wider counterpart.  We also
5994              retype memory - this comes from an attempt to avoid a REX prefix when
5995              moving the second half of a TFmode value.  */
5996 if (GET_MODE (part[1][1]) == SImode)
5997 {
5998 switch (GET_CODE (part[1][1]))
5999 {
6000 case MEM:
6001 part[1][1] = adjust_address (part[1][1], DImode, 0);
6002 break;
6003
6004 case REG:
6005 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
6006 break;
6007
6008 default:
6009 gcc_unreachable ();
6010 }
6011
6012 if (GET_MODE (part[1][0]) == SImode)
6013 part[1][0] = part[1][1];
6014 }
6015 }
6016 emit_move_insn (part[0][1], part[1][1]);
6017 emit_move_insn (part[0][0], part[1][0]);
6018 return;
6019 }
6020
6021   /* Choose the correct order so as not to overwrite the source before it is copied.  */
6022 if ((REG_P (part[0][0])
6023 && REG_P (part[1][1])
6024 && (REGNO (part[0][0]) == REGNO (part[1][1])
6025 || (nparts == 3
6026 && REGNO (part[0][0]) == REGNO (part[1][2]))
6027 || (nparts == 4
6028 && REGNO (part[0][0]) == REGNO (part[1][3]))))
6029 || (collisions > 0
6030 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
6031 {
6032 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
6033 {
6034 operands[2 + i] = part[0][j];
6035 operands[6 + i] = part[1][j];
6036 }
6037 }
6038 else
6039 {
6040 for (i = 0; i < nparts; i++)
6041 {
6042 operands[2 + i] = part[0][i];
6043 operands[6 + i] = part[1][i];
6044 }
6045 }
6046
6047 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6048 if (optimize_insn_for_size_p ())
6049 {
6050 for (j = 0; j < nparts - 1; j++)
6051 if (CONST_INT_P (operands[6 + j])
6052 && operands[6 + j] != const0_rtx
6053 && REG_P (operands[2 + j]))
6054 for (i = j; i < nparts - 1; i++)
6055 if (CONST_INT_P (operands[7 + i])
6056 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6057 operands[7 + i] = operands[2 + j];
6058 }
6059
6060 for (i = 0; i < nparts; i++)
6061 emit_move_insn (operands[2 + i], operands[6 + i]);
6062
6063 return;
6064 }
6065
6066 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6067 left shift by a constant, either using a single shift or
6068 a sequence of add instructions. */
6069
6070 static void
6071 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6072 {
6073 if (count == 1
6074 || (count * ix86_cost->add <= ix86_cost->shift_const
6075 && !optimize_insn_for_size_p ()))
6076 {
6077 while (count-- > 0)
6078 emit_insn (gen_add2_insn (operand, operand));
6079 }
6080 else
6081 {
6082 rtx (*insn)(rtx, rtx, rtx);
6083
6084 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6085 emit_insn (insn (operand, operand, GEN_INT (count)));
6086 }
6087 }
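
/* Illustrative example (not part of the implementation): with a 32-bit
   half register and COUNT == 2, on a target where two adds cost no more
   than one immediate shift, the routine above emits roughly

       addl  %eax, %eax        # operand <<= 1
       addl  %eax, %eax        # operand <<= 1

   while for larger counts, or when optimizing for size, it emits a single

       sall  $COUNT, %eax

   The actual choice depends on ix86_cost, so these sequences are only a
   sketch.  */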
6088
6089 void
6090 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6091 {
6092 rtx (*gen_ashl3)(rtx, rtx, rtx);
6093 rtx (*gen_shld)(rtx, rtx, rtx);
6094 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6095 machine_mode half_mode;
6096
6097 rtx low[2], high[2];
6098 int count;
6099
6100 if (CONST_INT_P (operands[2]))
6101 {
6102 split_double_mode (mode, operands, 2, low, high);
6103 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6104
6105 if (count >= half_width)
6106 {
6107 emit_move_insn (high[0], low[1]);
6108 emit_move_insn (low[0], const0_rtx);
6109
6110 if (count > half_width)
6111 ix86_expand_ashl_const (high[0], count - half_width, mode);
6112 }
6113 else
6114 {
6115 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6116
6117 if (!rtx_equal_p (operands[0], operands[1]))
6118 emit_move_insn (operands[0], operands[1]);
6119
6120 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6121 ix86_expand_ashl_const (low[0], count, mode);
6122 }
6123 return;
6124 }
6125
6126 split_double_mode (mode, operands, 1, low, high);
6127 half_mode = mode == DImode ? SImode : DImode;
6128
6129 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6130
6131 if (operands[1] == const1_rtx)
6132 {
6133       /* Assuming we've chosen QImode-capable registers, then 1 << N
6134 can be done with two 32/64-bit shifts, no branches, no cmoves. */
6135 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6136 {
6137 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6138
6139 ix86_expand_clear (low[0]);
6140 ix86_expand_clear (high[0]);
6141 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6142
6143 d = gen_lowpart (QImode, low[0]);
6144 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6145 s = gen_rtx_EQ (QImode, flags, const0_rtx);
6146 emit_insn (gen_rtx_SET (d, s));
6147
6148 d = gen_lowpart (QImode, high[0]);
6149 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6150 s = gen_rtx_NE (QImode, flags, const0_rtx);
6151 emit_insn (gen_rtx_SET (d, s));
6152 }
6153
6154 /* Otherwise, we can get the same results by manually performing
6155 a bit extract operation on bit 5/6, and then performing the two
6156 shifts. The two methods of getting 0/1 into low/high are exactly
6157 the same size. Avoiding the shift in the bit extract case helps
6158 pentium4 a bit; no one else seems to care much either way. */
6159 else
6160 {
6161 rtx (*gen_lshr3)(rtx, rtx, rtx);
6162 rtx (*gen_and3)(rtx, rtx, rtx);
6163 rtx (*gen_xor3)(rtx, rtx, rtx);
6164 HOST_WIDE_INT bits;
6165 rtx x;
6166
6167 if (mode == DImode)
6168 {
6169 gen_lshr3 = gen_lshrsi3;
6170 gen_and3 = gen_andsi3;
6171 gen_xor3 = gen_xorsi3;
6172 bits = 5;
6173 }
6174 else
6175 {
6176 gen_lshr3 = gen_lshrdi3;
6177 gen_and3 = gen_anddi3;
6178 gen_xor3 = gen_xordi3;
6179 bits = 6;
6180 }
6181
6182 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6183 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6184 else
6185 x = gen_lowpart (half_mode, operands[2]);
6186 emit_insn (gen_rtx_SET (high[0], x));
6187
6188 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6189 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6190 emit_move_insn (low[0], high[0]);
6191 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6192 }
6193
6194 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6195 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6196 return;
6197 }
6198
6199 if (operands[1] == constm1_rtx)
6200 {
6201 /* For -1 << N, we can avoid the shld instruction, because we
6202 know that we're shifting 0...31/63 ones into a -1. */
6203 emit_move_insn (low[0], constm1_rtx);
6204 if (optimize_insn_for_size_p ())
6205 emit_move_insn (high[0], low[0]);
6206 else
6207 emit_move_insn (high[0], constm1_rtx);
6208 }
6209 else
6210 {
6211 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6212
6213 if (!rtx_equal_p (operands[0], operands[1]))
6214 emit_move_insn (operands[0], operands[1]);
6215
6216 split_double_mode (mode, operands, 1, low, high);
6217 emit_insn (gen_shld (high[0], low[0], operands[2]));
6218 }
6219
6220 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6221
6222 if (TARGET_CMOVE && scratch)
6223 {
6224 ix86_expand_clear (scratch);
6225 emit_insn (gen_x86_shift_adj_1
6226 (half_mode, high[0], low[0], operands[2], scratch));
6227 }
6228 else
6229 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
6230 }
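
/* Illustrative sketch of the non-constant path above, assuming -m32 with a
   DImode value in EDX:EAX and the shift count in CL:

       shldl %cl, %eax, %edx   # high = high:low << (cl & 31)
       sall  %cl, %eax         # low <<= (cl & 31)
       # followed by x86_shift_adj_1 (cmov-based) or x86_shift_adj_2
       # (branch-based) to handle cl >= 32 by moving low into high and
       # clearing low.

   Register assignment and the exact adjustment code vary; this is only an
   example of the shape of the expansion.  */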
6231
6232 void
6233 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6234 {
6235 rtx (*gen_ashr3)(rtx, rtx, rtx)
6236 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6237 rtx (*gen_shrd)(rtx, rtx, rtx);
6238 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6239
6240 rtx low[2], high[2];
6241 int count;
6242
6243 if (CONST_INT_P (operands[2]))
6244 {
6245 split_double_mode (mode, operands, 2, low, high);
6246 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6247
6248 if (count == GET_MODE_BITSIZE (mode) - 1)
6249 {
6250 emit_move_insn (high[0], high[1]);
6251 emit_insn (gen_ashr3 (high[0], high[0],
6252 GEN_INT (half_width - 1)));
6253 emit_move_insn (low[0], high[0]);
6254
6255 }
6256 else if (count >= half_width)
6257 {
6258 emit_move_insn (low[0], high[1]);
6259 emit_move_insn (high[0], low[0]);
6260 emit_insn (gen_ashr3 (high[0], high[0],
6261 GEN_INT (half_width - 1)));
6262
6263 if (count > half_width)
6264 emit_insn (gen_ashr3 (low[0], low[0],
6265 GEN_INT (count - half_width)));
6266 }
6267 else
6268 {
6269 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6270
6271 if (!rtx_equal_p (operands[0], operands[1]))
6272 emit_move_insn (operands[0], operands[1]);
6273
6274 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6275 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6276 }
6277 }
6278 else
6279 {
6280 machine_mode half_mode;
6281
6282 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6283
6284 if (!rtx_equal_p (operands[0], operands[1]))
6285 emit_move_insn (operands[0], operands[1]);
6286
6287 split_double_mode (mode, operands, 1, low, high);
6288 half_mode = mode == DImode ? SImode : DImode;
6289
6290 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6291 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6292
6293 if (TARGET_CMOVE && scratch)
6294 {
6295 emit_move_insn (scratch, high[0]);
6296 emit_insn (gen_ashr3 (scratch, scratch,
6297 GEN_INT (half_width - 1)));
6298 emit_insn (gen_x86_shift_adj_1
6299 (half_mode, low[0], high[0], operands[2], scratch));
6300 }
6301 else
6302 emit_insn (gen_x86_shift_adj_3
6303 (half_mode, low[0], high[0], operands[2]));
6304 }
6305 }
6306
6307 void
6308 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6309 {
6310 rtx (*gen_lshr3)(rtx, rtx, rtx)
6311 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6312 rtx (*gen_shrd)(rtx, rtx, rtx);
6313 int half_width = GET_MODE_BITSIZE (mode) >> 1;
6314
6315 rtx low[2], high[2];
6316 int count;
6317
6318 if (CONST_INT_P (operands[2]))
6319 {
6320 split_double_mode (mode, operands, 2, low, high);
6321 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6322
6323 if (count >= half_width)
6324 {
6325 emit_move_insn (low[0], high[1]);
6326 ix86_expand_clear (high[0]);
6327
6328 if (count > half_width)
6329 emit_insn (gen_lshr3 (low[0], low[0],
6330 GEN_INT (count - half_width)));
6331 }
6332 else
6333 {
6334 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6335
6336 if (!rtx_equal_p (operands[0], operands[1]))
6337 emit_move_insn (operands[0], operands[1]);
6338
6339 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6340 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6341 }
6342 }
6343 else
6344 {
6345 machine_mode half_mode;
6346
6347 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6348
6349 if (!rtx_equal_p (operands[0], operands[1]))
6350 emit_move_insn (operands[0], operands[1]);
6351
6352 split_double_mode (mode, operands, 1, low, high);
6353 half_mode = mode == DImode ? SImode : DImode;
6354
6355 emit_insn (gen_shrd (low[0], high[0], operands[2]));
6356 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6357
6358 if (TARGET_CMOVE && scratch)
6359 {
6360 ix86_expand_clear (scratch);
6361 emit_insn (gen_x86_shift_adj_1
6362 (half_mode, low[0], high[0], operands[2], scratch));
6363 }
6364 else
6365 emit_insn (gen_x86_shift_adj_2
6366 (half_mode, low[0], high[0], operands[2]));
6367 }
6368 }
6369
6370 /* Expand move of V1TI mode register X to a new TI mode register. */
6371 static rtx
6372 ix86_expand_v1ti_to_ti (rtx x)
6373 {
6374 rtx result = gen_reg_rtx (TImode);
6375 if (TARGET_SSE2)
6376 {
6377 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
6378 rtx lo = gen_lowpart (DImode, result);
6379 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6380 rtx hi = gen_highpart (DImode, result);
6381 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6382 }
6383 else
6384 emit_move_insn (result, gen_lowpart (TImode, x));
6385 return result;
6386 }
6387
6388 /* Expand move of TI mode register X to a new V1TI mode register. */
6389 static rtx
6390 ix86_expand_ti_to_v1ti (rtx x)
6391 {
6392 if (TARGET_SSE2)
6393 {
6394 rtx lo = gen_lowpart (DImode, x);
6395 rtx hi = gen_highpart (DImode, x);
6396 rtx tmp = gen_reg_rtx (V2DImode);
6397 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6398 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
6399 }
6400
6401 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
6402 }
6403
6404 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6405 void
6406 ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
6407 {
6408 rtx op1 = force_reg (V1TImode, operands[1]);
6409
6410 if (!CONST_INT_P (operands[2]))
6411 {
6412 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6413 rtx tmp2 = gen_reg_rtx (TImode);
6414 rtx (*shift) (rtx, rtx, rtx)
6415 = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
6416 emit_insn (shift (tmp2, tmp1, operands[2]));
6417 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6418 emit_move_insn (operands[0], tmp3);
6419 return;
6420 }
6421
6422 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6423
6424 if (bits == 0)
6425 {
6426 emit_move_insn (operands[0], op1);
6427 return;
6428 }
6429
6430 if ((bits & 7) == 0)
6431 {
6432 rtx tmp = gen_reg_rtx (V1TImode);
6433 if (code == ASHIFT)
6434 emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
6435 else
6436 emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
6437 emit_move_insn (operands[0], tmp);
6438 return;
6439 }
6440
6441 rtx tmp1 = gen_reg_rtx (V1TImode);
6442 if (code == ASHIFT)
6443 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
6444 else
6445 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6446
6447 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6448 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6449
6450 /* tmp3 will be the V2DImode result. */
6451 rtx tmp3 = gen_reg_rtx (V2DImode);
6452
6453 if (bits > 64)
6454 {
6455 if (code == ASHIFT)
6456 emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6457 else
6458 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
6459 }
6460 else
6461 {
6462 /* tmp4 is operands[1], in V2DImode. */
6463 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6464
6465 rtx tmp5 = gen_reg_rtx (V2DImode);
6466 if (code == ASHIFT)
6467 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
6468 else
6469 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6470
6471 rtx tmp6 = gen_reg_rtx (V2DImode);
6472 if (code == ASHIFT)
6473 emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6474 else
6475 emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
6476
6477 emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
6478 }
6479
6480 /* Convert the result back to V1TImode and store in operands[0]. */
6481 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6482 emit_move_insn (operands[0], tmp7);
6483 }
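
/* For reference, the general constant case above (0 < bits < 64 with bits
   not a multiple of 8) expands a logical left shift roughly to

       pslldq  $8, tmp          # whole value moved left by 64 bits
       psllq   $bits, lo        # per-qword shift of the original value
       psrlq   $(64-bits), tmp  # bits crossing the 64-bit lane boundary
       por                      # combine the two halves

   with the mirrored psrldq/psrlq/psllq sequence for a logical right shift.
   This is an informal sketch; the insns actually emitted come from the
   gen_* calls above.  */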
6484
6485 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6486 void
6487 ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
6488 {
6489 rtx op1 = force_reg (V1TImode, operands[1]);
6490
6491 if (!CONST_INT_P (operands[2]))
6492 {
6493 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6494 rtx tmp2 = gen_reg_rtx (TImode);
6495 rtx (*rotate) (rtx, rtx, rtx)
6496 = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
6497 emit_insn (rotate (tmp2, tmp1, operands[2]));
6498 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6499 emit_move_insn (operands[0], tmp3);
6500 return;
6501 }
6502
6503 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6504
6505 if (bits == 0)
6506 {
6507 emit_move_insn (operands[0], op1);
6508 return;
6509 }
6510
6511 if (code == ROTATERT)
6512 bits = 128 - bits;
6513
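  /* Note on the pshufd immediates used below: dword I of the pshufd result
     is selected by bits [2*I+1:2*I] of the immediate.  Hence 0x93 selects
     {3,0,1,2} and rotates the 128-bit value left by 32 bits, 0x4e selects
     {2,3,0,1} and swaps the 64-bit halves (rotate by 64), and 0x39 selects
     {1,2,3,0}, a rotate left by 96 (i.e. right by 32).  */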
6514 if ((bits & 31) == 0)
6515 {
6516 rtx tmp2 = gen_reg_rtx (V4SImode);
6517 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6518 if (bits == 32)
6519 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
6520 else if (bits == 64)
6521 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
6522 else
6523 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
6524 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
6525 return;
6526 }
6527
6528 if ((bits & 7) == 0)
6529 {
6530 rtx tmp1 = gen_reg_rtx (V1TImode);
6531 rtx tmp2 = gen_reg_rtx (V1TImode);
6532 rtx tmp3 = gen_reg_rtx (V1TImode);
6533
6534 emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
6535 emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
6536 emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
6537 emit_move_insn (operands[0], tmp3);
6538 return;
6539 }
6540
6541 rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6542
6543 rtx lobits;
6544 rtx hibits;
6545
6546 switch (bits >> 5)
6547 {
6548 case 0:
6549 lobits = op1_v4si;
6550 hibits = gen_reg_rtx (V4SImode);
6551 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
6552 break;
6553
6554 case 1:
6555 lobits = gen_reg_rtx (V4SImode);
6556 hibits = gen_reg_rtx (V4SImode);
6557 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
6558 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
6559 break;
6560
6561 case 2:
6562 lobits = gen_reg_rtx (V4SImode);
6563 hibits = gen_reg_rtx (V4SImode);
6564 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
6565 emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
6566 break;
6567
6568 default:
6569 lobits = gen_reg_rtx (V4SImode);
6570 emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
6571 hibits = op1_v4si;
6572 break;
6573 }
6574
6575 rtx tmp1 = gen_reg_rtx (V4SImode);
6576 rtx tmp2 = gen_reg_rtx (V4SImode);
6577 rtx tmp3 = gen_reg_rtx (V4SImode);
6578
6579 emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
6580 emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
6581 emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
6582
6583 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6584 }
6585
6586 /* Expand V1TI mode ashiftrt by constant. */
6587 void
6588 ix86_expand_v1ti_ashiftrt (rtx operands[])
6589 {
6590 rtx op1 = force_reg (V1TImode, operands[1]);
6591
6592 if (!CONST_INT_P (operands[2]))
6593 {
6594 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6595 rtx tmp2 = gen_reg_rtx (TImode);
6596 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6597 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6598 emit_move_insn (operands[0], tmp3);
6599 return;
6600 }
6601
6602 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6603
6604 if (bits == 0)
6605 {
6606 emit_move_insn (operands[0], op1);
6607 return;
6608 }
6609
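  /* For bits == 127 only the sign of the 128-bit value matters: pshufd with
     immediate 0xff broadcasts the top dword into all four lanes, and an
     arithmetic V4SI shift by 31 then yields 0 or -1 in every lane, which is
     exactly the 128-bit arithmetic shift by 127.  */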
6610 if (bits == 127)
6611 {
6612 /* Two operations. */
6613 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6614 rtx tmp2 = gen_reg_rtx (V4SImode);
6615 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6616
6617 rtx tmp3 = gen_reg_rtx (V4SImode);
6618 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6619
6620 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6621 return;
6622 }
6623
6624 if (bits == 64)
6625 {
6626 /* Three operations. */
6627 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6628 rtx tmp2 = gen_reg_rtx (V4SImode);
6629 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6630
6631 rtx tmp3 = gen_reg_rtx (V4SImode);
6632 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6633
6634 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6635 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6636 rtx tmp6 = gen_reg_rtx (V2DImode);
6637 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6638
6639 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6640 return;
6641 }
6642
6643 if (bits == 96)
6644 {
6645 /* Three operations. */
6646 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6647 rtx tmp2 = gen_reg_rtx (V4SImode);
6648 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6649
6650 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6651 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6652 rtx tmp5 = gen_reg_rtx (V2DImode);
6653 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6654
6655 rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
6656 rtx tmp7 = gen_reg_rtx (V4SImode);
6657 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6658
6659 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6660 return;
6661 }
6662
6663 if (bits >= 111)
6664 {
6665 /* Three operations. */
6666 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6667 rtx tmp2 = gen_reg_rtx (V4SImode);
6668 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6669
6670 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6671 rtx tmp4 = gen_reg_rtx (V8HImode);
6672 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6673
6674 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6675 rtx tmp6 = gen_reg_rtx (V4SImode);
6676 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6677
6678 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6679 return;
6680 }
6681
6682 if (TARGET_AVX2 || TARGET_SSE4_1)
6683 {
6684 /* Three operations. */
6685 if (bits == 32)
6686 {
6687 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6688 rtx tmp2 = gen_reg_rtx (V4SImode);
6689 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6690
6691 rtx tmp3 = gen_reg_rtx (V1TImode);
6692 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6693
6694 if (TARGET_AVX2)
6695 {
6696 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6697 rtx tmp5 = gen_reg_rtx (V4SImode);
6698 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6699 GEN_INT (7)));
6700
6701 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6702 }
6703 else
6704 {
6705 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6706 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6707 rtx tmp6 = gen_reg_rtx (V8HImode);
6708 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6709 GEN_INT (0x3f)));
6710
6711 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6712 }
6713 return;
6714 }
6715
6716 /* Three operations. */
6717 if (bits == 8 || bits == 16 || bits == 24)
6718 {
6719 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6720 rtx tmp2 = gen_reg_rtx (V4SImode);
6721 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6722
6723 rtx tmp3 = gen_reg_rtx (V1TImode);
6724 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6725
6726 if (TARGET_AVX2)
6727 {
6728 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6729 rtx tmp5 = gen_reg_rtx (V4SImode);
6730 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6731 GEN_INT (7)));
6732
6733 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6734 }
6735 else
6736 {
6737 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6738 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6739 rtx tmp6 = gen_reg_rtx (V8HImode);
6740 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6741 GEN_INT (0x3f)));
6742
6743 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6744 }
6745 return;
6746 }
6747 }
6748
6749 if (bits > 96)
6750 {
6751 /* Four operations. */
6752 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6753 rtx tmp2 = gen_reg_rtx (V4SImode);
6754 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6755
6756 rtx tmp3 = gen_reg_rtx (V4SImode);
6757 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6758
6759 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6760 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6761 rtx tmp6 = gen_reg_rtx (V2DImode);
6762 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6763
6764 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
6765 rtx tmp8 = gen_reg_rtx (V4SImode);
6766 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6767
6768 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
6769 return;
6770 }
6771
6772 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
6773 {
6774 /* Four operations. */
6775 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6776 rtx tmp2 = gen_reg_rtx (V4SImode);
6777 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6778
6779 rtx tmp3 = gen_reg_rtx (V4SImode);
6780 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6781
6782 rtx tmp4 = gen_reg_rtx (V1TImode);
6783 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6784
6785 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6786 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
6787 rtx tmp7 = gen_reg_rtx (V8HImode);
6788 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
6789 GEN_INT (bits == 48 ? 0x1f : 0x07)));
6790
6791 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6792 return;
6793 }
6794
6795 if ((bits & 7) == 0)
6796 {
6797 /* Five operations. */
6798 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6799 rtx tmp2 = gen_reg_rtx (V4SImode);
6800 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6801
6802 rtx tmp3 = gen_reg_rtx (V4SImode);
6803 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6804
6805 rtx tmp4 = gen_reg_rtx (V1TImode);
6806 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6807
6808 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6809 rtx tmp6 = gen_reg_rtx (V1TImode);
6810 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
6811
6812 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6813 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
6814 rtx tmp9 = gen_reg_rtx (V2DImode);
6815 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
6816
6817 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
6818 return;
6819 }
6820
6821 if (TARGET_AVX2 && bits < 32)
6822 {
6823 /* Six operations. */
6824 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6825 rtx tmp2 = gen_reg_rtx (V4SImode);
6826 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6827
6828 rtx tmp3 = gen_reg_rtx (V1TImode);
6829 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6830
6831 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6832 rtx tmp5 = gen_reg_rtx (V2DImode);
6833 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6834
6835 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6836 rtx tmp7 = gen_reg_rtx (V2DImode);
6837 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6838
6839 rtx tmp8 = gen_reg_rtx (V2DImode);
6840 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6841
6842 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
6843 rtx tmp10 = gen_reg_rtx (V4SImode);
6844 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
6845
6846 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
6847 return;
6848 }
6849
6850 if (TARGET_SSE4_1 && bits < 15)
6851 {
6852 /* Six operations. */
6853 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6854 rtx tmp2 = gen_reg_rtx (V4SImode);
6855 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6856
6857 rtx tmp3 = gen_reg_rtx (V1TImode);
6858 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6859
6860 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6861 rtx tmp5 = gen_reg_rtx (V2DImode);
6862 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6863
6864 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6865 rtx tmp7 = gen_reg_rtx (V2DImode);
6866 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6867
6868 rtx tmp8 = gen_reg_rtx (V2DImode);
6869 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6870
6871 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6872 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
6873 rtx tmp11 = gen_reg_rtx (V8HImode);
6874 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
6875
6876 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
6877 return;
6878 }
6879
6880 if (bits == 1)
6881 {
6882 /* Eight operations. */
6883 rtx tmp1 = gen_reg_rtx (V1TImode);
6884 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6885
6886 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6887 rtx tmp3 = gen_reg_rtx (V2DImode);
6888 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
6889
6890 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6891 rtx tmp5 = gen_reg_rtx (V2DImode);
6892 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
6893
6894 rtx tmp6 = gen_reg_rtx (V2DImode);
6895 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
6896
6897 rtx tmp7 = gen_reg_rtx (V2DImode);
6898 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
6899
6900 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
6901 rtx tmp9 = gen_reg_rtx (V4SImode);
6902 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
6903
6904 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
6905 rtx tmp11 = gen_reg_rtx (V2DImode);
6906 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
6907
6908 rtx tmp12 = gen_reg_rtx (V2DImode);
6909 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
6910
6911 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
6912 return;
6913 }
6914
6915 if (bits > 64)
6916 {
6917 /* Eight operations. */
6918 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6919 rtx tmp2 = gen_reg_rtx (V4SImode);
6920 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6921
6922 rtx tmp3 = gen_reg_rtx (V4SImode);
6923 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6924
6925 rtx tmp4 = gen_reg_rtx (V1TImode);
6926 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6927
6928 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6929 rtx tmp6 = gen_reg_rtx (V2DImode);
6930 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
6931
6932 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6933 rtx tmp8 = gen_reg_rtx (V1TImode);
6934 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
6935
6936 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6937 rtx tmp10 = gen_reg_rtx (V2DImode);
6938 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
6939
6940 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
6941 rtx tmp12 = gen_reg_rtx (V2DImode);
6942 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
6943
6944 rtx tmp13 = gen_reg_rtx (V2DImode);
6945 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
6946
6947 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
6948 }
6949 else
6950 {
6951 /* Nine operations. */
6952 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6953 rtx tmp2 = gen_reg_rtx (V4SImode);
6954 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6955
6956 rtx tmp3 = gen_reg_rtx (V4SImode);
6957 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6958
6959 rtx tmp4 = gen_reg_rtx (V1TImode);
6960 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6961
6962 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6963 rtx tmp6 = gen_reg_rtx (V2DImode);
6964 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
6965
6966 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6967 rtx tmp8 = gen_reg_rtx (V2DImode);
6968 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
6969
6970 rtx tmp9 = gen_reg_rtx (V2DImode);
6971 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
6972
6973 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6974 rtx tmp11 = gen_reg_rtx (V1TImode);
6975 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
6976
6977 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
6978 rtx tmp13 = gen_reg_rtx (V2DImode);
6979 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
6980
6981 rtx tmp14 = gen_reg_rtx (V2DImode);
6982 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
6983
6984 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
6985 }
6986 }
6987
6988 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
6989 DImode for constant loop counts. */
6990
6991 static machine_mode
6992 counter_mode (rtx count_exp)
6993 {
6994 if (GET_MODE (count_exp) != VOIDmode)
6995 return GET_MODE (count_exp);
6996 if (!CONST_INT_P (count_exp))
6997 return Pmode;
6998 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
6999 return DImode;
7000 return SImode;
7001 }
7002
7003 /* When ISSETMEM is FALSE, output a simple loop that moves the memory pointed
7004    to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall
7005    size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
7006    equivalent loop that sets the memory to VALUE (assumed to be in MODE).
7007
7008    The size is rounded down to a whole number of chunks moved at once.
7009    SRCMEM and DESTMEM provide the MEM rtxes used to feed proper aliasing info.  */
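
/* In rough C-like pseudocode (names illustrative only), the emitted loop is

     size = count & ~(piece_size - 1);   // piece_size = MODE size * UNROLL
     if (piece_size == 1 && size == 0)
       goto out;
     iter = 0;
     do
       {
         copy (or set) UNROLL chunks of MODE at dest + iter / src + iter;
         iter += piece_size;
       }
     while (iter < size);
     dest += iter;
     if (!ISSETMEM)
       src += iter;
   out:;
*/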
7010
7011
7012 static void
7013 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
7014 rtx destptr, rtx srcptr, rtx value,
7015 rtx count, machine_mode mode, int unroll,
7016 int expected_size, bool issetmem)
7017 {
7018 rtx_code_label *out_label, *top_label;
7019 rtx iter, tmp;
7020 machine_mode iter_mode = counter_mode (count);
7021 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
7022 rtx piece_size = GEN_INT (piece_size_n);
7023 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7024 rtx size;
7025 int i;
7026
7027 top_label = gen_label_rtx ();
7028 out_label = gen_label_rtx ();
7029 iter = gen_reg_rtx (iter_mode);
7030
7031 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7032 NULL, 1, OPTAB_DIRECT);
7033 /* Those two should combine. */
7034 if (piece_size == const1_rtx)
7035 {
7036 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7037 true, out_label);
7038 predict_jump (REG_BR_PROB_BASE * 10 / 100);
7039 }
7040 emit_move_insn (iter, const0_rtx);
7041
7042 emit_label (top_label);
7043
7044 tmp = convert_modes (Pmode, iter_mode, iter, true);
7045
7046 /* This assert could be relaxed - in this case we'll need to compute
7047      the smallest power of two containing PIECE_SIZE_N and pass it to
7048 offset_address. */
7049 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7050 destmem = offset_address (destmem, tmp, piece_size_n);
7051 destmem = adjust_address (destmem, mode, 0);
7052
7053 if (!issetmem)
7054 {
7055 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7056 srcmem = adjust_address (srcmem, mode, 0);
7057
7058 /* When unrolling for chips that reorder memory reads and writes,
7059          we can save registers by using a single temporary.
7060          Also, using four temporaries is overkill in 32-bit mode.  */
7061 if (!TARGET_64BIT && 0)
7062 {
7063 for (i = 0; i < unroll; i++)
7064 {
7065 if (i)
7066 {
7067 destmem = adjust_address (copy_rtx (destmem), mode,
7068 GET_MODE_SIZE (mode));
7069 srcmem = adjust_address (copy_rtx (srcmem), mode,
7070 GET_MODE_SIZE (mode));
7071 }
7072 emit_move_insn (destmem, srcmem);
7073 }
7074 }
7075 else
7076 {
7077 rtx tmpreg[4];
7078 gcc_assert (unroll <= 4);
7079 for (i = 0; i < unroll; i++)
7080 {
7081 tmpreg[i] = gen_reg_rtx (mode);
7082 if (i)
7083 srcmem = adjust_address (copy_rtx (srcmem), mode,
7084 GET_MODE_SIZE (mode));
7085 emit_move_insn (tmpreg[i], srcmem);
7086 }
7087 for (i = 0; i < unroll; i++)
7088 {
7089 if (i)
7090 destmem = adjust_address (copy_rtx (destmem), mode,
7091 GET_MODE_SIZE (mode));
7092 emit_move_insn (destmem, tmpreg[i]);
7093 }
7094 }
7095 }
7096 else
7097 for (i = 0; i < unroll; i++)
7098 {
7099 if (i)
7100 destmem = adjust_address (copy_rtx (destmem), mode,
7101 GET_MODE_SIZE (mode));
7102 emit_move_insn (destmem, value);
7103 }
7104
7105 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
7106 true, OPTAB_LIB_WIDEN);
7107 if (tmp != iter)
7108 emit_move_insn (iter, tmp);
7109
7110 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
7111 true, top_label);
7112 if (expected_size != -1)
7113 {
7114 expected_size /= GET_MODE_SIZE (mode) * unroll;
7115 if (expected_size == 0)
7116 predict_jump (0);
7117 else if (expected_size > REG_BR_PROB_BASE)
7118 predict_jump (REG_BR_PROB_BASE - 1);
7119 else
7120 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
7121 / expected_size);
7122 }
7123 else
7124 predict_jump (REG_BR_PROB_BASE * 80 / 100);
7125 iter = ix86_zero_extend_to_Pmode (iter);
7126 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
7127 true, OPTAB_LIB_WIDEN);
7128 if (tmp != destptr)
7129 emit_move_insn (destptr, tmp);
7130 if (!issetmem)
7131 {
7132 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
7133 true, OPTAB_LIB_WIDEN);
7134 if (tmp != srcptr)
7135 emit_move_insn (srcptr, tmp);
7136 }
7137 emit_label (out_label);
7138 }
7139
7140 /* Divide COUNTREG by SCALE. */
7141 static rtx
7142 scale_counter (rtx countreg, int scale)
7143 {
7144 rtx sc;
7145
7146 if (scale == 1)
7147 return countreg;
7148 if (CONST_INT_P (countreg))
7149 return GEN_INT (INTVAL (countreg) / scale);
7150 gcc_assert (REG_P (countreg));
7151
7152 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7153 GEN_INT (exact_log2 (scale)),
7154 NULL, 1, OPTAB_DIRECT);
7155 return sc;
7156 }
7157
7158 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7159 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7160 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7161    For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
7162 ORIG_VALUE is the original value passed to memset to fill the memory with.
7163 Other arguments have same meaning as for previous function. */
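
/* As an example, a memcpy of a constant byte count that is a multiple of 4,
   expanded with MODE == SImode, is expected to boil down to something like

       mov  ecx, count / 4
       rep  movsd

   and the corresponding memset to "rep stosd" with the promoted value in
   eax.  This only sketches the intended shape; the rep_mov/rep_stos
   patterns referenced below define the insns that are really emitted.  */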
7164
7165 static void
7166 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
7167 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
7168 rtx count,
7169 machine_mode mode, bool issetmem)
7170 {
7171 rtx destexp;
7172 rtx srcexp;
7173 rtx countreg;
7174 HOST_WIDE_INT rounded_count;
7175
7176 /* If possible, it is shorter to use rep movs.
7177 TODO: Maybe it is better to move this logic to decide_alg. */
7178 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
7179 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7180 && (!issetmem || orig_value == const0_rtx))
7181 mode = SImode;
7182
7183 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
7184 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
7185
7186 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
7187 GET_MODE_SIZE (mode)));
7188 if (mode != QImode)
7189 {
7190 destexp = gen_rtx_ASHIFT (Pmode, countreg,
7191 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7192 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
7193 }
7194 else
7195 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
7196 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
7197 {
7198 rounded_count
7199 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7200 destmem = shallow_copy_rtx (destmem);
7201 set_mem_size (destmem, rounded_count);
7202 }
7203 else if (MEM_SIZE_KNOWN_P (destmem))
7204 clear_mem_size (destmem);
7205
7206 if (issetmem)
7207 {
7208 value = force_reg (mode, gen_lowpart (mode, value));
7209 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
7210 }
7211 else
7212 {
7213 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
7214 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
7215 if (mode != QImode)
7216 {
7217 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
7218 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
7219 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
7220 }
7221 else
7222 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
7223 if (CONST_INT_P (count))
7224 {
7225 rounded_count
7226 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
7227 srcmem = shallow_copy_rtx (srcmem);
7228 set_mem_size (srcmem, rounded_count);
7229 }
7230 else
7231 {
7232 if (MEM_SIZE_KNOWN_P (srcmem))
7233 clear_mem_size (srcmem);
7234 }
7235 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
7236 destexp, srcexp));
7237 }
7238 }
7239
7240 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7241 DESTMEM.
7242    SRCMEM is passed by pointer so the updated rtx is returned to the caller.
7243    The return value is the updated DESTMEM.  */
7244 static rtx
7245 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
7246 HOST_WIDE_INT size_to_move)
7247 {
7248 rtx dst = destmem, src = *srcmem, tempreg;
7249 enum insn_code code;
7250 machine_mode move_mode;
7251 int piece_size, i;
7252
7253 /* Find the widest mode in which we could perform moves.
7254      Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
7255      it until a move of that size is supported.  */
7256 piece_size = 1 << floor_log2 (size_to_move);
7257 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
7258 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7259 {
7260 gcc_assert (piece_size > 1);
7261 piece_size >>= 1;
7262 }
7263
7264 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7265 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7266 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7267 {
7268 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7269 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7270 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
7271 {
7272 move_mode = word_mode;
7273 piece_size = GET_MODE_SIZE (move_mode);
7274 code = optab_handler (mov_optab, move_mode);
7275 }
7276 }
7277 gcc_assert (code != CODE_FOR_nothing);
7278
7279 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7280 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
7281
7282   /* Emit moves.  We'll need SIZE_TO_MOVE / PIECE_SIZE moves.  */
7283 gcc_assert (size_to_move % piece_size == 0);
7284
7285 for (i = 0; i < size_to_move; i += piece_size)
7286 {
7287 /* We move from memory to memory, so we'll need to do it via
7288 a temporary register. */
7289 tempreg = gen_reg_rtx (move_mode);
7290 emit_insn (GEN_FCN (code) (tempreg, src));
7291 emit_insn (GEN_FCN (code) (dst, tempreg));
7292
7293 emit_move_insn (destptr,
7294 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7295 emit_move_insn (srcptr,
7296 plus_constant (Pmode, copy_rtx (srcptr), piece_size));
7297
7298 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7299 piece_size);
7300 src = adjust_automodify_address_nv (src, move_mode, srcptr,
7301 piece_size);
7302 }
7303
7304 /* Update DST and SRC rtx. */
7305 *srcmem = src;
7306 return dst;
7307 }
7308
7309 /* Helper function for the string operations below.  Test whether VARIABLE
7310    is aligned to VALUE bytes; if so, jump to the returned label.  */
7311
7312 static rtx_code_label *
7313 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7314 {
7315 rtx_code_label *label = gen_label_rtx ();
7316 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7317 if (GET_MODE (variable) == DImode)
7318 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7319 else
7320 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7321 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7322 1, label);
7323 if (epilogue)
7324 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7325 else
7326 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7327 return label;
7328 }
7329
7330
7331 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7332
7333 static void
7334 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
7335 rtx destptr, rtx srcptr, rtx count, int max_size)
7336 {
7337 rtx src, dest;
7338 if (CONST_INT_P (count))
7339 {
7340 HOST_WIDE_INT countval = INTVAL (count);
7341 HOST_WIDE_INT epilogue_size = countval % max_size;
7342 int i;
7343
7344 /* For now MAX_SIZE should be a power of 2. This assert could be
7345 relaxed, but it'll require a bit more complicated epilogue
7346 expanding. */
7347 gcc_assert ((max_size & (max_size - 1)) == 0);
7348 for (i = max_size; i >= 1; i >>= 1)
7349 {
7350 if (epilogue_size & i)
7351 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7352 }
7353 return;
7354 }
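  /* Worked example for the constant-count case above: with max_size == 8
     and an epilogue size of 7 bytes, the loop over the set bits emits three
     moves of 4, 2 and 1 bytes via emit_memmov.  */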
7355 if (max_size > 8)
7356 {
7357 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
7358 count, 1, OPTAB_DIRECT);
7359 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
7360 count, QImode, 1, 4, false);
7361 return;
7362 }
7363
7364   /* When single string operations are available, we can cheaply advance the
7365      dest and src pointers.  Otherwise we save code size by maintaining an offset
7366      (zero is readily available from the preceding rep operation) and using x86 addressing modes.
7367 */
7368 if (TARGET_SINGLE_STRINGOP)
7369 {
7370 if (max_size > 4)
7371 {
7372 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7373 src = change_address (srcmem, SImode, srcptr);
7374 dest = change_address (destmem, SImode, destptr);
7375 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7376 emit_label (label);
7377 LABEL_NUSES (label) = 1;
7378 }
7379 if (max_size > 2)
7380 {
7381 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7382 src = change_address (srcmem, HImode, srcptr);
7383 dest = change_address (destmem, HImode, destptr);
7384 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7385 emit_label (label);
7386 LABEL_NUSES (label) = 1;
7387 }
7388 if (max_size > 1)
7389 {
7390 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7391 src = change_address (srcmem, QImode, srcptr);
7392 dest = change_address (destmem, QImode, destptr);
7393 emit_insn (gen_strmov (destptr, dest, srcptr, src));
7394 emit_label (label);
7395 LABEL_NUSES (label) = 1;
7396 }
7397 }
7398 else
7399 {
7400 rtx offset = force_reg (Pmode, const0_rtx);
7401 rtx tmp;
7402
7403 if (max_size > 4)
7404 {
7405 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7406 src = change_address (srcmem, SImode, srcptr);
7407 dest = change_address (destmem, SImode, destptr);
7408 emit_move_insn (dest, src);
7409 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
7410 true, OPTAB_LIB_WIDEN);
7411 if (tmp != offset)
7412 emit_move_insn (offset, tmp);
7413 emit_label (label);
7414 LABEL_NUSES (label) = 1;
7415 }
7416 if (max_size > 2)
7417 {
7418 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7419 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7420 src = change_address (srcmem, HImode, tmp);
7421 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7422 dest = change_address (destmem, HImode, tmp);
7423 emit_move_insn (dest, src);
7424 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
7425 true, OPTAB_LIB_WIDEN);
7426 if (tmp != offset)
7427 emit_move_insn (offset, tmp);
7428 emit_label (label);
7429 LABEL_NUSES (label) = 1;
7430 }
7431 if (max_size > 1)
7432 {
7433 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7434 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
7435 src = change_address (srcmem, QImode, tmp);
7436 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
7437 dest = change_address (destmem, QImode, tmp);
7438 emit_move_insn (dest, src);
7439 emit_label (label);
7440 LABEL_NUSES (label) = 1;
7441 }
7442 }
7443 }
7444
7445 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
7446 with value PROMOTED_VAL.
7447    DESTPTR is advanced as the stores are emitted.
7448    The return value is the updated DESTMEM.  */
7449 static rtx
7450 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
7451 HOST_WIDE_INT size_to_move)
7452 {
7453 rtx dst = destmem;
7454 enum insn_code code;
7455 machine_mode move_mode;
7456 int piece_size, i;
7457
7458   /* Find the widest mode in which we could perform moves.
7459      Normally this is the mode of PROMOTED_VAL; narrow it when SIZE_TO_MOVE
7460      is smaller than the size of that mode.  */
7461 move_mode = GET_MODE (promoted_val);
7462 if (move_mode == VOIDmode)
7463 move_mode = QImode;
7464 if (size_to_move < GET_MODE_SIZE (move_mode))
7465 {
7466 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
7467 move_mode = int_mode_for_size (move_bits, 0).require ();
7468 promoted_val = gen_lowpart (move_mode, promoted_val);
7469 }
7470 piece_size = GET_MODE_SIZE (move_mode);
7471 code = optab_handler (mov_optab, move_mode);
7472 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
7473
7474 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
7475
7476   /* Emit moves.  We'll need SIZE_TO_MOVE / PIECE_SIZE moves.  */
7477 gcc_assert (size_to_move % piece_size == 0);
7478
7479 for (i = 0; i < size_to_move; i += piece_size)
7480 {
7481 if (piece_size <= GET_MODE_SIZE (word_mode))
7482 {
7483 emit_insn (gen_strset (destptr, dst, promoted_val));
7484 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7485 piece_size);
7486 continue;
7487 }
7488
7489 emit_insn (GEN_FCN (code) (dst, promoted_val));
7490
7491 emit_move_insn (destptr,
7492 plus_constant (Pmode, copy_rtx (destptr), piece_size));
7493
7494 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
7495 piece_size);
7496 }
7497
7498 /* Update DST rtx. */
7499 return dst;
7500 }
7501 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
7502 static void
7503 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7504 rtx count, int max_size)
7505 {
7506 count = expand_simple_binop (counter_mode (count), AND, count,
7507 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7508 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7509 gen_lowpart (QImode, value), count, QImode,
7510 1, max_size / 2, true);
7511 }
7512
7513 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
7514 static void
7515 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
7516 rtx count, int max_size)
7517 {
7518 rtx dest;
7519
7520 if (CONST_INT_P (count))
7521 {
7522 HOST_WIDE_INT countval = INTVAL (count);
7523 HOST_WIDE_INT epilogue_size = countval % max_size;
7524 int i;
7525
7526 /* For now MAX_SIZE should be a power of 2. This assert could be
7527 relaxed, but it'll require a bit more complicated epilogue
7528 expanding. */
7529 gcc_assert ((max_size & (max_size - 1)) == 0);
7530 for (i = max_size; i >= 1; i >>= 1)
7531 {
7532 if (epilogue_size & i)
7533 {
7534 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7535 destmem = emit_memset (destmem, destptr, vec_value, i);
7536 else
7537 destmem = emit_memset (destmem, destptr, value, i);
7538 }
7539 }
7540 return;
7541 }
7542 if (max_size > 32)
7543 {
7544 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
7545 return;
7546 }
7547 if (max_size > 16)
7548 {
7549 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
7550 if (TARGET_64BIT)
7551 {
7552 dest = change_address (destmem, DImode, destptr);
7553 emit_insn (gen_strset (destptr, dest, value));
7554 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
7555 emit_insn (gen_strset (destptr, dest, value));
7556 }
7557 else
7558 {
7559 dest = change_address (destmem, SImode, destptr);
7560 emit_insn (gen_strset (destptr, dest, value));
7561 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7562 emit_insn (gen_strset (destptr, dest, value));
7563 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
7564 emit_insn (gen_strset (destptr, dest, value));
7565 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
7566 emit_insn (gen_strset (destptr, dest, value));
7567 }
7568 emit_label (label);
7569 LABEL_NUSES (label) = 1;
7570 }
7571 if (max_size > 8)
7572 {
7573 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
7574 if (TARGET_64BIT)
7575 {
7576 dest = change_address (destmem, DImode, destptr);
7577 emit_insn (gen_strset (destptr, dest, value));
7578 }
7579 else
7580 {
7581 dest = change_address (destmem, SImode, destptr);
7582 emit_insn (gen_strset (destptr, dest, value));
7583 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
7584 emit_insn (gen_strset (destptr, dest, value));
7585 }
7586 emit_label (label);
7587 LABEL_NUSES (label) = 1;
7588 }
7589 if (max_size > 4)
7590 {
7591 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
7592 dest = change_address (destmem, SImode, destptr);
7593 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
7594 emit_label (label);
7595 LABEL_NUSES (label) = 1;
7596 }
7597 if (max_size > 2)
7598 {
7599 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
7600 dest = change_address (destmem, HImode, destptr);
7601 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
7602 emit_label (label);
7603 LABEL_NUSES (label) = 1;
7604 }
7605 if (max_size > 1)
7606 {
7607 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
7608 dest = change_address (destmem, QImode, destptr);
7609 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
7610 emit_label (label);
7611 LABEL_NUSES (label) = 1;
7612 }
7613 }
7614
7615 /* Decrease COUNTREG by VALUE.  */
7616 static void
7617 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7618 {
7619 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7620 }
7621
7622 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
7623 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
7624 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7625 ignored.
7626 Return value is updated DESTMEM. */
7627
7628 static rtx
7629 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
7630 rtx destptr, rtx srcptr, rtx value,
7631 rtx vec_value, rtx count, int align,
7632 int desired_alignment, bool issetmem)
7633 {
7634 int i;
7635 for (i = 1; i < desired_alignment; i <<= 1)
7636 {
7637 if (align <= i)
7638 {
7639 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
7640 if (issetmem)
7641 {
7642 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
7643 destmem = emit_memset (destmem, destptr, vec_value, i);
7644 else
7645 destmem = emit_memset (destmem, destptr, value, i);
7646 }
7647 else
7648 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
7649 ix86_adjust_counter (count, i);
7650 emit_label (label);
7651 LABEL_NUSES (label) = 1;
7652 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
7653 }
7654 }
7655 return destmem;
7656 }
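
/* Illustrative trace of the prologue loop above: with ALIGN == 1 and
   DESIRED_ALIGNMENT == 16 it emits conditional fixups for the 1, 2, 4 and
   8 byte cases, each guarded by an aligntest on DESTPTR, so that after the
   emitted code runs DESTPTR is 16-byte aligned and COUNT has been
   decreased by the number of bytes handled.  */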
7657
7658 /* Test if COUNT & SIZE is nonzero and, if so, expand a cpymem
7659    or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
7660 and jump to DONE_LABEL. */
7661 static void
7662 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
7663 rtx destptr, rtx srcptr,
7664 rtx value, rtx vec_value,
7665 rtx count, int size,
7666 rtx done_label, bool issetmem)
7667 {
7668 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
7669 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
7670 rtx modesize;
7671 int n;
7672
7673 /* If we do not have vector value to copy, we must reduce size. */
7674 if (issetmem)
7675 {
7676 if (!vec_value)
7677 {
7678 if (GET_MODE (value) == VOIDmode && size > 8)
7679 mode = Pmode;
7680 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
7681 mode = GET_MODE (value);
7682 }
7683 else
7684 mode = GET_MODE (vec_value), value = vec_value;
7685 }
7686 else
7687 {
7688 /* Choose appropriate vector mode. */
7689 if (size >= 32)
7690 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
7691 else if (size >= 16)
7692 mode = TARGET_SSE ? V16QImode : DImode;
7693 srcmem = change_address (srcmem, mode, srcptr);
7694 }
7695 destmem = change_address (destmem, mode, destptr);
7696 modesize = GEN_INT (GET_MODE_SIZE (mode));
7697 gcc_assert (GET_MODE_SIZE (mode) <= size);
7698 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7699 {
7700 if (issetmem)
7701 emit_move_insn (destmem, gen_lowpart (mode, value));
7702 else
7703 {
7704 emit_move_insn (destmem, srcmem);
7705 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7706 }
7707 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7708 }
7709
7710 destmem = offset_address (destmem, count, 1);
7711 destmem = offset_address (destmem, GEN_INT (-2 * size),
7712 GET_MODE_SIZE (mode));
7713 if (!issetmem)
7714 {
7715 srcmem = offset_address (srcmem, count, 1);
7716 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
7717 GET_MODE_SIZE (mode));
7718 }
7719 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
7720 {
7721 if (issetmem)
7722 emit_move_insn (destmem, gen_lowpart (mode, value));
7723 else
7724 {
7725 emit_move_insn (destmem, srcmem);
7726 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7727 }
7728 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7729 }
7730 emit_jump_insn (gen_jump (done_label));
7731 emit_barrier ();
7732
7733 emit_label (label);
7734 LABEL_NUSES (label) = 1;
7735 }
7736
7737 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
7738 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
7739 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
7740 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
7741 DONE_LABEL is a label after the whole copying sequence. The label is created
7742 on demand if *DONE_LABEL is NULL.
7743 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
7744 bounds after the initial copies.
7745
7746 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
7747 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
7748 we will dispatch to a library call for large blocks.
7749
7750 In pseudocode we do:
7751
7752 if (COUNT < SIZE)
7753 {
7754 Assume that SIZE is 4. Bigger sizes are handled analogously
7755 if (COUNT & 4)
7756 {
7757 copy 4 bytes from SRCPTR to DESTPTR
7758 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
7759 goto done_label
7760 }
7761 if (!COUNT)
7762 goto done_label;
7763 copy 1 byte from SRCPTR to DESTPTR
7764 if (COUNT & 2)
7765 {
7766 copy 2 bytes from SRCPTR to DESTPTR
7767 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
7768 }
7769 }
7770 else
7771 {
7772 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
7773 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
7774
7775 OLD_DESTPTR = DESTPTR;
7776 Align DESTPTR up to DESIRED_ALIGN
7777 SRCPTR += DESTPTR - OLD_DESTPTR
7778 COUNT -= DESTPTR - OLD_DESTPTR
7779 if (DYNAMIC_CHECK)
7780 Round COUNT down to multiple of SIZE
7781 << optional caller supplied zero size guard is here >>
7782 << optional caller supplied dynamic check is here >>
7783 << caller supplied main copy loop is here >>
7784 }
7785 done_label:
7786 */
7787 static void
7788 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
7789 rtx *destptr, rtx *srcptr,
7790 machine_mode mode,
7791 rtx value, rtx vec_value,
7792 rtx *count,
7793 rtx_code_label **done_label,
7794 int size,
7795 int desired_align,
7796 int align,
7797 unsigned HOST_WIDE_INT *min_size,
7798 bool dynamic_check,
7799 bool issetmem)
7800 {
7801 rtx_code_label *loop_label = NULL, *label;
7802 int n;
7803 rtx modesize;
7804 int prolog_size = 0;
7805 rtx mode_value;
7806
7807 /* Choose the proper value to copy. */
7808 if (issetmem && VECTOR_MODE_P (mode))
7809 mode_value = vec_value;
7810 else
7811 mode_value = value;
7812 gcc_assert (GET_MODE_SIZE (mode) <= size);
7813
7814 /* See if block is big or small, handle small blocks. */
7815 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
7816 {
7817 int size2 = size;
7818 loop_label = gen_label_rtx ();
7819
7820 if (!*done_label)
7821 *done_label = gen_label_rtx ();
7822
7823 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
7824 1, loop_label);
7825 size2 >>= 1;
7826
7827 /* Handle sizes > 3. */
7828 for (;size2 > 2; size2 >>= 1)
7829 expand_small_cpymem_or_setmem (destmem, srcmem,
7830 *destptr, *srcptr,
7831 value, vec_value,
7832 *count,
7833 size2, *done_label, issetmem);
7834 /* Nothing to copy? Jump to DONE_LABEL if so. */
7835 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
7836 1, *done_label);
7837
7838 /* Do a byte copy. */
7839 destmem = change_address (destmem, QImode, *destptr);
7840 if (issetmem)
7841 emit_move_insn (destmem, gen_lowpart (QImode, value));
7842 else
7843 {
7844 srcmem = change_address (srcmem, QImode, *srcptr);
7845 emit_move_insn (destmem, srcmem);
7846 }
7847
7848 /* Handle sizes 2 and 3. */
7849 label = ix86_expand_aligntest (*count, 2, false);
7850 destmem = change_address (destmem, HImode, *destptr);
7851 destmem = offset_address (destmem, *count, 1);
7852 destmem = offset_address (destmem, GEN_INT (-2), 2);
7853 if (issetmem)
7854 emit_move_insn (destmem, gen_lowpart (HImode, value));
7855 else
7856 {
7857 srcmem = change_address (srcmem, HImode, *srcptr);
7858 srcmem = offset_address (srcmem, *count, 1);
7859 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
7860 emit_move_insn (destmem, srcmem);
7861 }
7862
7863 emit_label (label);
7864 LABEL_NUSES (label) = 1;
7865 emit_jump_insn (gen_jump (*done_label));
7866 emit_barrier ();
7867 }
7868 else
7869 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
7870 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
7871
7872 /* Start memcpy for COUNT >= SIZE. */
7873 if (loop_label)
7874 {
7875 emit_label (loop_label);
7876 LABEL_NUSES (loop_label) = 1;
7877 }
7878
7879 /* Copy first desired_align bytes. */
7880 if (!issetmem)
7881 srcmem = change_address (srcmem, mode, *srcptr);
7882 destmem = change_address (destmem, mode, *destptr);
7883 modesize = GEN_INT (GET_MODE_SIZE (mode));
7884 for (n = 0; prolog_size < desired_align - align; n++)
7885 {
7886 if (issetmem)
7887 emit_move_insn (destmem, mode_value);
7888 else
7889 {
7890 emit_move_insn (destmem, srcmem);
7891 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
7892 }
7893 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
7894 prolog_size += GET_MODE_SIZE (mode);
7895 }
7896
7897
7898 /* Copy last SIZE bytes. */
7899 destmem = offset_address (destmem, *count, 1);
7900 destmem = offset_address (destmem,
7901 GEN_INT (-size - prolog_size),
7902 1);
7903 if (issetmem)
7904 emit_move_insn (destmem, mode_value);
7905 else
7906 {
7907 srcmem = offset_address (srcmem, *count, 1);
7908 srcmem = offset_address (srcmem,
7909 GEN_INT (-size - prolog_size),
7910 1);
7911 emit_move_insn (destmem, srcmem);
7912 }
7913 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
7914 {
7915 destmem = offset_address (destmem, modesize, 1);
7916 if (issetmem)
7917 emit_move_insn (destmem, mode_value);
7918 else
7919 {
7920 srcmem = offset_address (srcmem, modesize, 1);
7921 emit_move_insn (destmem, srcmem);
7922 }
7923 }
7924
7925 /* Align destination. */
7926 if (desired_align > 1 && desired_align > align)
7927 {
7928 rtx saveddest = *destptr;
7929
7930 gcc_assert (desired_align <= size);
7931 /* Align destptr up, placing it in a new register. */
7932 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
7933 GEN_INT (prolog_size),
7934 NULL_RTX, 1, OPTAB_DIRECT);
7935 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
7936 REG_POINTER (*destptr) = 1;
7937 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
7938 GEN_INT (-desired_align),
7939 *destptr, 1, OPTAB_DIRECT);
7940 /* See how many bytes we skipped. */
7941 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
7942 *destptr,
7943 saveddest, 1, OPTAB_DIRECT);
7944 /* Adjust srcptr and count. */
7945 if (!issetmem)
7946 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
7947 saveddest, *srcptr, 1, OPTAB_DIRECT);
7948 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7949 saveddest, *count, 1, OPTAB_DIRECT);
7950 /* We copied at most size + prolog_size. */
7951 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
7952 *min_size
7953 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
7954 else
7955 *min_size = 0;
7956
7957 /* Our loops always round down the block size, but for dispatch to a
7958 library call we need the precise value. */
7959 if (dynamic_check)
7960 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
7961 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
7962 }
7963 else
7964 {
7965 gcc_assert (prolog_size == 0);
7966 /* Decrease count, so we won't end up copying the last word twice. */
7967 if (!CONST_INT_P (*count))
7968 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
7969 constm1_rtx, *count, 1, OPTAB_DIRECT);
7970 else
7971 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
7972 (unsigned HOST_WIDE_INT)size));
7973 if (*min_size)
7974 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
7975 }
7976 }
7977
7978
7979 /* This function is like the previous one, except here we know how many bytes
7980 need to be copied. That allows us to update alignment not only of DST, which
7981 is returned, but also of SRC, which is passed as a pointer for that
7982 reason. */
7983 static rtx
7984 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
7985 rtx srcreg, rtx value, rtx vec_value,
7986 int desired_align, int align_bytes,
7987 bool issetmem)
7988 {
7989 rtx src = NULL;
7990 rtx orig_dst = dst;
7991 rtx orig_src = NULL;
7992 int piece_size = 1;
7993 int copied_bytes = 0;
7994
7995 if (!issetmem)
7996 {
7997 gcc_assert (srcp != NULL);
7998 src = *srcp;
7999 orig_src = src;
8000 }
8001
8002 for (piece_size = 1;
8003 piece_size <= desired_align && copied_bytes < align_bytes;
8004 piece_size <<= 1)
8005 {
8006 if (align_bytes & piece_size)
8007 {
8008 if (issetmem)
8009 {
8010 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
8011 dst = emit_memset (dst, destreg, vec_value, piece_size);
8012 else
8013 dst = emit_memset (dst, destreg, value, piece_size);
8014 }
8015 else
8016 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
8017 copied_bytes += piece_size;
8018 }
8019 }
8020 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
8021 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8022 if (MEM_SIZE_KNOWN_P (orig_dst))
8023 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
8024
8025 if (!issetmem)
8026 {
8027 int src_align_bytes = get_mem_align_offset (src, desired_align
8028 * BITS_PER_UNIT);
8029 if (src_align_bytes >= 0)
8030 src_align_bytes = desired_align - src_align_bytes;
8031 if (src_align_bytes >= 0)
8032 {
8033 unsigned int src_align;
8034 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
8035 {
8036 if ((src_align_bytes & (src_align - 1))
8037 == (align_bytes & (src_align - 1)))
8038 break;
8039 }
8040 if (src_align > (unsigned int) desired_align)
8041 src_align = desired_align;
8042 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
8043 set_mem_align (src, src_align * BITS_PER_UNIT);
8044 }
8045 if (MEM_SIZE_KNOWN_P (orig_src))
8046 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
8047 *srcp = src;
8048 }
8049
8050 return dst;
8051 }
8052
8053 /* Return true if ALG can be used in current context.
8054 Assume we expand memset if MEMSET is true. */
8055 static bool
8056 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
8057 {
8058 if (alg == no_stringop)
8059 return false;
8060 if (alg == vector_loop)
8061 return TARGET_SSE || TARGET_AVX;
8062 /* Algorithms using the rep prefix want at least edi and ecx;
8063 additionally, memset wants eax and memcpy wants esi. Don't
8064 consider such algorithms if the user has appropriated those
8065 registers for their own purposes, or if we have a non-default
8066 address space, since some string insns cannot override the segment. */
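/* For example (illustrative, not exhaustive): building with -ffixed-ecx, or
   copying to/from a __seg_fs/__seg_gs qualified pointer (a non-default
   address space), disqualifies the rep_prefix_* algorithms below and leaves
   only the loop variants or a library call.  */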
8067 if (alg == rep_prefix_1_byte
8068 || alg == rep_prefix_4_byte
8069 || alg == rep_prefix_8_byte)
8070 {
8071 if (have_as)
8072 return false;
8073 if (fixed_regs[CX_REG]
8074 || fixed_regs[DI_REG]
8075 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8076 return false;
8077 }
8078 return true;
8079 }
8080
8081 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
8082 static enum stringop_alg
8083 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
8084 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
8085 bool memset, bool zero_memset, bool have_as,
8086 int *dynamic_check, bool *noalign, bool recur)
8087 {
8088 const struct stringop_algs *algs;
8089 bool optimize_for_speed;
8090 int max = 0;
8091 const struct processor_costs *cost;
8092 int i;
8093 bool any_alg_usable_p = false;
8094
8095 *noalign = false;
8096 *dynamic_check = -1;
8097
8098 /* Even if the string operation call is cold, we still might spend a lot
8099 of time processing large blocks. */
8100 if (optimize_function_for_size_p (cfun)
8101 || (optimize_insn_for_size_p ()
8102 && (max_size < 256
8103 || (expected_size != -1 && expected_size < 256))))
8104 optimize_for_speed = false;
8105 else
8106 optimize_for_speed = true;
8107
8108 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
8109 if (memset)
8110 algs = &cost->memset[TARGET_64BIT != 0];
8111 else
8112 algs = &cost->memcpy[TARGET_64BIT != 0];
8113
8114 /* Find the maximal size covered by a usable user-defined algorithm. */
8115 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8116 {
8117 enum stringop_alg candidate = algs->size[i].alg;
8118 bool usable = alg_usable_p (candidate, memset, have_as);
8119 any_alg_usable_p |= usable;
8120
8121 if (candidate != libcall && candidate && usable)
8122 max = algs->size[i].max;
8123 }
8124
8125 /* If the expected size is not known but the maximal size is small enough
8126 that the inline version is a win, set the expected size into
8127 the range. */
8128 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
8129 && expected_size == -1)
8130 expected_size = min_size / 2 + max_size / 2;
8131
8132 /* If user specified the algorithm, honor it if possible. */
8133 if (ix86_stringop_alg != no_stringop
8134 && alg_usable_p (ix86_stringop_alg, memset, have_as))
8135 return ix86_stringop_alg;
8136 /* rep; movq or rep; movl is the smallest variant. */
8137 else if (!optimize_for_speed)
8138 {
8139 *noalign = true;
8140 if (!count || (count & 3) || (memset && !zero_memset))
8141 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
8142 ? rep_prefix_1_byte : loop_1_byte;
8143 else
8144 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
8145 ? rep_prefix_4_byte : loop;
8146 }
8147 /* Very tiny blocks are best handled via the loop; REP is expensive to
8148 set up. */
8149 else if (expected_size != -1 && expected_size < 4)
8150 return loop_1_byte;
8151 else if (expected_size != -1)
8152 {
8153 enum stringop_alg alg = libcall;
8154 bool alg_noalign = false;
8155 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
8156 {
8157 /* We get here if the algorithms that were not libcall-based
8158 were rep-prefix based and we are unable to use rep prefixes
8159 based on global register usage. Break out of the loop and
8160 use the heuristic below. */
8161 if (algs->size[i].max == 0)
8162 break;
8163 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
8164 {
8165 enum stringop_alg candidate = algs->size[i].alg;
8166
8167 if (candidate != libcall
8168 && alg_usable_p (candidate, memset, have_as))
8169 {
8170 alg = candidate;
8171 alg_noalign = algs->size[i].noalign;
8172 }
8173 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8174 last non-libcall inline algorithm. */
8175 if (TARGET_INLINE_ALL_STRINGOPS)
8176 {
8177 /* When the current size is best copied by a libcall,
8178 but we are still forced to inline, run the heuristic below
8179 that will pick code for medium sized blocks. */
8180 if (alg != libcall)
8181 {
8182 *noalign = alg_noalign;
8183 return alg;
8184 }
8185 else if (!any_alg_usable_p)
8186 break;
8187 }
8188 else if (alg_usable_p (candidate, memset, have_as)
8189 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8190 && candidate == rep_prefix_1_byte
8191 /* NB: If min_size != max_size, size is
8192 unknown. */
8193 && min_size != max_size))
8194 {
8195 *noalign = algs->size[i].noalign;
8196 return candidate;
8197 }
8198 }
8199 }
8200 }
8201 /* When asked to inline the call anyway, try to pick a meaningful choice.
8202 We look for the maximal size of block that is faster to copy by hand and
8203 assume blocks of at most that size, guessing that the average size will
8204 be roughly half of the block.
8205
8206 If this turns out to be bad, we might simply specify the preferred
8207 choice in ix86_costs. */
8208 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8209 && (algs->unknown_size == libcall
8210 || !alg_usable_p (algs->unknown_size, memset, have_as)))
8211 {
8212 enum stringop_alg alg;
8213 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
8214
8215 /* If there aren't any usable algorithms or if recursing already,
8216 then recursing on smaller sizes or same size isn't going to
8217 find anything. Just return the simple byte-at-a-time copy loop. */
8218 if (!any_alg_usable_p || recur)
8219 {
8220 /* Pick something reasonable. */
8221 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
8222 *dynamic_check = 128;
8223 return loop_1_byte;
8224 }
8225 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
8226 zero_memset, have_as, dynamic_check, noalign, true);
8227 gcc_assert (*dynamic_check == -1);
8228 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
8229 *dynamic_check = max;
8230 else
8231 gcc_assert (alg != libcall);
8232 return alg;
8233 }
8234 return (alg_usable_p (algs->unknown_size, memset, have_as)
8235 ? algs->unknown_size : libcall);
8236 }
8237
8238 /* Decide on alignment. We know that the operand is already aligned to ALIGN
8239 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8240 static int
8241 decide_alignment (int align,
8242 enum stringop_alg alg,
8243 int expected_size,
8244 machine_mode move_mode)
8245 {
8246 int desired_align = 0;
8247
8248 gcc_assert (alg != no_stringop);
8249
8250 if (alg == libcall)
8251 return 0;
8252 if (move_mode == VOIDmode)
8253 return 0;
8254
8255 desired_align = GET_MODE_SIZE (move_mode);
8256 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
8257 copying a whole cacheline at once. */
8258 if (TARGET_CPU_P (PENTIUMPRO)
8259 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8260 desired_align = 8;
8261
8262 if (optimize_size)
8263 desired_align = 1;
8264 if (desired_align < align)
8265 desired_align = align;
8266 if (expected_size != -1 && expected_size < 4)
8267 desired_align = align;
8268
8269 return desired_align;
8270 }
8271
8272
8273 /* Helper function for memset. For the QImode value 0xXY produce
8274 0xXYXYXYXY of the width specified by MODE. This is essentially
8275 a * 0x01010101, but we can do slightly better than
8276 synth_mult by unwinding the sequence by hand on CPUs with
8277 slow multiply. */
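/* A standalone illustration (assumes 64-bit arithmetic; the byte 0xab is a
   hypothetical example input):

     unsigned long long v = 0xab;
     v |= v << 8;        now 0xabab
     v |= v << 16;       now 0xabababab           (SImode result)
     v |= v << 32;       now 0xabababababababab   (DImode result)

   which mirrors the CONST_INT_P path below; the non-constant path builds the
   same value with shifts and IORs, or with a multiply by 0x01010101.  */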
8278 static rtx
8279 promote_duplicated_reg (machine_mode mode, rtx val)
8280 {
8281 machine_mode valmode = GET_MODE (val);
8282 rtx tmp;
8283 int nops = mode == DImode ? 3 : 2;
8284
8285 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
8286 if (val == const0_rtx)
8287 return copy_to_mode_reg (mode, CONST0_RTX (mode));
8288 if (CONST_INT_P (val))
8289 {
8290 HOST_WIDE_INT v = INTVAL (val) & 255;
8291
8292 v |= v << 8;
8293 v |= v << 16;
8294 if (mode == DImode)
8295 v |= (v << 16) << 16;
8296 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
8297 }
8298
8299 if (valmode == VOIDmode)
8300 valmode = QImode;
8301 if (valmode != QImode)
8302 val = gen_lowpart (QImode, val);
8303 if (mode == QImode)
8304 return val;
8305 if (!TARGET_PARTIAL_REG_STALL)
8306 nops--;
8307 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
8308 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
8309 <= (ix86_cost->shift_const + ix86_cost->add) * nops
8310 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
8311 {
8312 rtx reg = convert_modes (mode, QImode, val, true);
8313 tmp = promote_duplicated_reg (mode, const1_rtx);
8314 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
8315 OPTAB_DIRECT);
8316 }
8317 else
8318 {
8319 rtx reg = convert_modes (mode, QImode, val, true);
8320
8321 if (!TARGET_PARTIAL_REG_STALL)
8322 emit_insn (gen_insv_1 (mode, reg, reg));
8323 else
8324 {
8325 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
8326 NULL, 1, OPTAB_DIRECT);
8327 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
8328 OPTAB_DIRECT);
8329 }
8330 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
8331 NULL, 1, OPTAB_DIRECT);
8332 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8333 if (mode == SImode)
8334 return reg;
8335 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
8336 NULL, 1, OPTAB_DIRECT);
8337 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
8338 return reg;
8339 }
8340 }
8341
8342 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
8343 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
8344 prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
8345 static rtx
8346 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8347 int align)
8348 {
8349 rtx promoted_val;
8350
8351 if (TARGET_64BIT
8352 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8353 promoted_val = promote_duplicated_reg (DImode, val);
8354 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8355 promoted_val = promote_duplicated_reg (SImode, val);
8356 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8357 promoted_val = promote_duplicated_reg (HImode, val);
8358 else
8359 promoted_val = val;
8360
8361 return promoted_val;
8362 }
8363
8364 /* Copy the address to a Pmode register. This is used for x32 to
8365 truncate DImode TLS address to a SImode register. */
8366
8367 static rtx
8368 ix86_copy_addr_to_reg (rtx addr)
8369 {
8370 rtx reg;
8371 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8372 {
8373 reg = copy_addr_to_reg (addr);
8374 REG_POINTER (reg) = 1;
8375 return reg;
8376 }
8377 else
8378 {
8379 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8380 reg = copy_to_mode_reg (DImode, addr);
8381 REG_POINTER (reg) = 1;
8382 return gen_rtx_SUBREG (SImode, reg, 0);
8383 }
8384 }
8385
8386 /* Expand a string move (memcpy) or store (memset) operation. Use i386 string
8387 operations when profitable. The code depends upon architecture, block size
8388 and alignment, but always has one of the following overall structures:
8389
8390 Aligned move sequence:
8391
8392 1) Prologue guard: Conditional that jumps up to epilogues for small
8393 blocks that can be handled by epilogue alone. This is faster
8394 but also needed for correctness, since the prologue assumes the block
8395 is larger than the desired alignment.
8396
8397 Optional dynamic check for size and libcall for large
8398 blocks is emitted here too, with -minline-stringops-dynamically.
8399
8400 2) Prologue: copy first few bytes in order to get destination
8401 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8402 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8403 copied. We emit either a jump tree on power of two sized
8404 blocks, or a byte loop.
8405
8406 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8407 with specified algorithm.
8408
8409 4) Epilogue: code copying tail of the block that is too small to be
8410 handled by main body (or up to size guarded by prologue guard).
8411
8412 Misaligned move sequence
8413
8414 1) misaligned move prologue/epilogue containing:
8415 a) Prologue handling small memory blocks and jumping to done_label
8416 (skipped if blocks are known to be large enough)
8417 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
8418 needed by single possibly misaligned move
8419 (skipped if alignment is not needed)
8420 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8421
8422 2) Zero size guard dispatching to done_label, if needed
8423
8424 3) Dispatch to a library call, if needed.
8425
8426 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8427 with the specified algorithm. */
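/* As an illustrative sketch only (not literal output), an aligned cpymem
   sequence with SIZE_NEEDED == 16 and DESIRED_ALIGN == 16 corresponds
   roughly to:

     if (count < 16) goto epilogue;                  1) prologue guard
     while (dst % 16) { *dst++ = *src++; count--; }  2) prologue
     for (; count >= 16; count -= 16)                3) main body
       { copy 16 bytes; dst += 16; src += 16; }
   epilogue:                                         4) epilogue
     copy the remaining count % 16 bytes;
*/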
8428 bool
8429 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8430 rtx align_exp, rtx expected_align_exp,
8431 rtx expected_size_exp, rtx min_size_exp,
8432 rtx max_size_exp, rtx probable_max_size_exp,
8433 bool issetmem)
8434 {
8435 rtx destreg;
8436 rtx srcreg = NULL;
8437 rtx_code_label *label = NULL;
8438 rtx tmp;
8439 rtx_code_label *jump_around_label = NULL;
8440 HOST_WIDE_INT align = 1;
8441 unsigned HOST_WIDE_INT count = 0;
8442 HOST_WIDE_INT expected_size = -1;
8443 int size_needed = 0, epilogue_size_needed;
8444 int desired_align = 0, align_bytes = 0;
8445 enum stringop_alg alg;
8446 rtx promoted_val = NULL;
8447 rtx vec_promoted_val = NULL;
8448 bool force_loopy_epilogue = false;
8449 int dynamic_check;
8450 bool need_zero_guard = false;
8451 bool noalign;
8452 machine_mode move_mode = VOIDmode;
8453 machine_mode wider_mode;
8454 int unroll_factor = 1;
8455 /* TODO: Once value ranges are available, fill in proper data. */
8456 unsigned HOST_WIDE_INT min_size = 0;
8457 unsigned HOST_WIDE_INT max_size = -1;
8458 unsigned HOST_WIDE_INT probable_max_size = -1;
8459 bool misaligned_prologue_used = false;
8460 bool have_as;
8461
8462 if (CONST_INT_P (align_exp))
8463 align = INTVAL (align_exp);
8464 /* i386 can do misaligned access at reasonably increased cost. */
8465 if (CONST_INT_P (expected_align_exp)
8466 && INTVAL (expected_align_exp) > align)
8467 align = INTVAL (expected_align_exp);
8468 /* ALIGN is the minimum of destination and source alignment, but we care here
8469 just about destination alignment. */
8470 else if (!issetmem
8471 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8472 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8473
8474 if (CONST_INT_P (count_exp))
8475 {
8476 min_size = max_size = probable_max_size = count = expected_size
8477 = INTVAL (count_exp);
8478 /* When COUNT is 0, there is nothing to do. */
8479 if (!count)
8480 return true;
8481 }
8482 else
8483 {
8484 if (min_size_exp)
8485 min_size = INTVAL (min_size_exp);
8486 if (max_size_exp)
8487 max_size = INTVAL (max_size_exp);
8488 if (probable_max_size_exp)
8489 probable_max_size = INTVAL (probable_max_size_exp);
8490 if (CONST_INT_P (expected_size_exp))
8491 expected_size = INTVAL (expected_size_exp);
8492 }
8493
8494 /* Make sure we don't need to care about overflow later on. */
8495 if (count > (HOST_WIDE_INT_1U << 30))
8496 return false;
8497
8498 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8499 if (!issetmem)
8500 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8501
8502 /* Step 0: Decide on preferred algorithm, desired alignment and
8503 size of chunks to be copied by main loop. */
8504 alg = decide_alg (count, expected_size, min_size, probable_max_size,
8505 issetmem,
8506 issetmem && val_exp == const0_rtx, have_as,
8507 &dynamic_check, &noalign, false);
8508
8509 if (dump_file)
8510 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8511 stringop_alg_names[alg]);
8512
8513 if (alg == libcall)
8514 return false;
8515 gcc_assert (alg != no_stringop);
8516
8517 /* For now the vector version of memset is generated only for memory zeroing, as
8518 creating a promoted vector value is very cheap in this case. */
8519 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8520 alg = unrolled_loop;
8521
8522 if (!count)
8523 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8524 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8525 if (!issetmem)
8526 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8527
8528 unroll_factor = 1;
8529 move_mode = word_mode;
8530 switch (alg)
8531 {
8532 case libcall:
8533 case no_stringop:
8534 case last_alg:
8535 gcc_unreachable ();
8536 case loop_1_byte:
8537 need_zero_guard = true;
8538 move_mode = QImode;
8539 break;
8540 case loop:
8541 need_zero_guard = true;
8542 break;
8543 case unrolled_loop:
8544 need_zero_guard = true;
8545 unroll_factor = (TARGET_64BIT ? 4 : 2);
8546 break;
8547 case vector_loop:
8548 need_zero_guard = true;
8549 unroll_factor = 4;
8550 /* Find the widest supported mode. */
8551 move_mode = word_mode;
8552 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8553 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8554 move_mode = wider_mode;
8555
8556 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8557 move_mode = TImode;
8558
8559 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8560 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8561 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8562 {
8563 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8564 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8565 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8566 move_mode = word_mode;
8567 }
8568 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8569 break;
8570 case rep_prefix_8_byte:
8571 move_mode = DImode;
8572 break;
8573 case rep_prefix_4_byte:
8574 move_mode = SImode;
8575 break;
8576 case rep_prefix_1_byte:
8577 move_mode = QImode;
8578 break;
8579 }
8580 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8581 epilogue_size_needed = size_needed;
8582
8583 /* If we are going to emit any library calls conditionally, make sure any
8584 pending stack adjustments happen before the first conditional branch;
8585 otherwise they will be emitted before the library call only and won't
8586 happen on the other branches. */
8587 if (dynamic_check != -1)
8588 do_pending_stack_adjust ();
8589
8590 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8591 if (!TARGET_ALIGN_STRINGOPS || noalign)
8592 align = desired_align;
8593
8594 /* Step 1: Prologue guard. */
8595
8596 /* Alignment code needs count to be in register. */
8597 if (CONST_INT_P (count_exp) && desired_align > align)
8598 {
8599 if (INTVAL (count_exp) > desired_align
8600 && INTVAL (count_exp) > size_needed)
8601 {
8602 align_bytes
8603 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8604 if (align_bytes <= 0)
8605 align_bytes = 0;
8606 else
8607 align_bytes = desired_align - align_bytes;
8608 }
8609 if (align_bytes == 0)
8610 count_exp = force_reg (counter_mode (count_exp), count_exp);
8611 }
8612 gcc_assert (desired_align >= 1 && align >= 1);
8613
8614 /* Misaligned move sequences handle both prologue and epilogue at once.
8615 Default code generation results in smaller code for large alignments
8616 and also avoids redundant work when sizes are known precisely. */
8617 misaligned_prologue_used
8618 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8619 && MAX (desired_align, epilogue_size_needed) <= 32
8620 && desired_align <= epilogue_size_needed
8621 && ((desired_align > align && !align_bytes)
8622 || (!count && epilogue_size_needed > 1)));
8623
8624 /* Do the cheap promotion to allow better CSE across the
8625 main loop and epilogue (i.e. one load of the big constant in
8626 front of all the code).
8627 For now the misaligned move sequences do not have a fast path
8628 without broadcasting. */
8629 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8630 {
8631 if (alg == vector_loop)
8632 {
8633 gcc_assert (val_exp == const0_rtx);
8634 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8635 promoted_val = promote_duplicated_reg_to_size (val_exp,
8636 GET_MODE_SIZE (word_mode),
8637 desired_align, align);
8638 }
8639 else
8640 {
8641 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8642 desired_align, align);
8643 }
8644 }
8645 /* Misaligned move sequences handle both prologues and epilogues at once.
8646 Default code generation results in smaller code for large alignments and
8647 also avoids redundant work when sizes are known precisely. */
8648 if (misaligned_prologue_used)
8649 {
8650 /* The misaligned move prologue handles small blocks by itself. */
8651 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8652 (dst, src, &destreg, &srcreg,
8653 move_mode, promoted_val, vec_promoted_val,
8654 &count_exp,
8655 &jump_around_label,
8656 desired_align < align
8657 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8658 desired_align, align, &min_size, dynamic_check, issetmem);
8659 if (!issetmem)
8660 src = change_address (src, BLKmode, srcreg);
8661 dst = change_address (dst, BLKmode, destreg);
8662 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8663 epilogue_size_needed = 0;
8664 if (need_zero_guard
8665 && min_size < (unsigned HOST_WIDE_INT) size_needed)
8666 {
8667 /* It is possible that we copied enough so the main loop will not
8668 execute. */
8669 gcc_assert (size_needed > 1);
8670 if (jump_around_label == NULL_RTX)
8671 jump_around_label = gen_label_rtx ();
8672 emit_cmp_and_jump_insns (count_exp,
8673 GEN_INT (size_needed),
8674 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8675 if (expected_size == -1
8676 || expected_size < (desired_align - align) / 2 + size_needed)
8677 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8678 else
8679 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8680 }
8681 }
8682 /* Ensure that alignment prologue won't copy past end of block. */
8683 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8684 {
8685 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8686 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8687 Make sure it is a power of 2. */
8688 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
8689
8690 /* To improve performance of small blocks, we jump around the VAL
8691 promoting code. This means that if the promoted VAL is not constant,
8692 we might not use it in the epilogue and have to use the byte
8693 loop variant. */
8694 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8695 force_loopy_epilogue = true;
8696 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8697 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8698 {
8699 /* If main algorithm works on QImode, no epilogue is needed.
8700 For small sizes just don't align anything. */
8701 if (size_needed == 1)
8702 desired_align = align;
8703 else
8704 goto epilogue;
8705 }
8706 else if (!count
8707 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8708 {
8709 label = gen_label_rtx ();
8710 emit_cmp_and_jump_insns (count_exp,
8711 GEN_INT (epilogue_size_needed),
8712 LTU, 0, counter_mode (count_exp), 1, label);
8713 if (expected_size == -1 || expected_size < epilogue_size_needed)
8714 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8715 else
8716 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8717 }
8718 }
8719
8720 /* Emit code to decide on runtime whether library call or inline should be
8721 used. */
8722 if (dynamic_check != -1)
8723 {
8724 if (!issetmem && CONST_INT_P (count_exp))
8725 {
8726 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8727 {
8728 emit_block_copy_via_libcall (dst, src, count_exp);
8729 count_exp = const0_rtx;
8730 goto epilogue;
8731 }
8732 }
8733 else
8734 {
8735 rtx_code_label *hot_label = gen_label_rtx ();
8736 if (jump_around_label == NULL_RTX)
8737 jump_around_label = gen_label_rtx ();
8738 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
8739 LEU, 0, counter_mode (count_exp),
8740 1, hot_label);
8741 predict_jump (REG_BR_PROB_BASE * 90 / 100);
8742 if (issetmem)
8743 set_storage_via_libcall (dst, count_exp, val_exp);
8744 else
8745 emit_block_copy_via_libcall (dst, src, count_exp);
8746 emit_jump (jump_around_label);
8747 emit_label (hot_label);
8748 }
8749 }
8750
8751 /* Step 2: Alignment prologue. */
8752 /* Do the expensive promotion once we branched off the small blocks. */
8753 if (issetmem && !promoted_val)
8754 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8755 desired_align, align);
8756
8757 if (desired_align > align && !misaligned_prologue_used)
8758 {
8759 if (align_bytes == 0)
8760 {
8761 /* Except for the first move in the prologue, we no longer know
8762 the constant offset in the aliasing info. It doesn't seem worth
8763 the pain to maintain it for the first move, so throw away
8764 the info early. */
8765 dst = change_address (dst, BLKmode, destreg);
8766 if (!issetmem)
8767 src = change_address (src, BLKmode, srcreg);
8768 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
8769 promoted_val, vec_promoted_val,
8770 count_exp, align, desired_align,
8771 issetmem);
8772 /* At most desired_align - align bytes are copied. */
8773 if (min_size < (unsigned)(desired_align - align))
8774 min_size = 0;
8775 else
8776 min_size -= desired_align - align;
8777 }
8778 else
8779 {
8780 /* If we know how many bytes need to be stored before dst is
8781 sufficiently aligned, maintain aliasing info accurately. */
8782 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
8783 srcreg,
8784 promoted_val,
8785 vec_promoted_val,
8786 desired_align,
8787 align_bytes,
8788 issetmem);
8789
8790 count_exp = plus_constant (counter_mode (count_exp),
8791 count_exp, -align_bytes);
8792 count -= align_bytes;
8793 min_size -= align_bytes;
8794 max_size -= align_bytes;
8795 }
8796 if (need_zero_guard
8797 && min_size < (unsigned HOST_WIDE_INT) size_needed
8798 && (count < (unsigned HOST_WIDE_INT) size_needed
8799 || (align_bytes == 0
8800 && count < ((unsigned HOST_WIDE_INT) size_needed
8801 + desired_align - align))))
8802 {
8803 /* It is possible that we copied enough so the main loop will not
8804 execute. */
8805 gcc_assert (size_needed > 1);
8806 if (label == NULL_RTX)
8807 label = gen_label_rtx ();
8808 emit_cmp_and_jump_insns (count_exp,
8809 GEN_INT (size_needed),
8810 LTU, 0, counter_mode (count_exp), 1, label);
8811 if (expected_size == -1
8812 || expected_size < (desired_align - align) / 2 + size_needed)
8813 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8814 else
8815 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8816 }
8817 }
8818 if (label && size_needed == 1)
8819 {
8820 emit_label (label);
8821 LABEL_NUSES (label) = 1;
8822 label = NULL;
8823 epilogue_size_needed = 1;
8824 if (issetmem)
8825 promoted_val = val_exp;
8826 }
8827 else if (label == NULL_RTX && !misaligned_prologue_used)
8828 epilogue_size_needed = size_needed;
8829
8830 /* Step 3: Main loop. */
8831
8832 switch (alg)
8833 {
8834 case libcall:
8835 case no_stringop:
8836 case last_alg:
8837 gcc_unreachable ();
8838 case loop_1_byte:
8839 case loop:
8840 case unrolled_loop:
8841 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
8842 count_exp, move_mode, unroll_factor,
8843 expected_size, issetmem);
8844 break;
8845 case vector_loop:
8846 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
8847 vec_promoted_val, count_exp, move_mode,
8848 unroll_factor, expected_size, issetmem);
8849 break;
8850 case rep_prefix_8_byte:
8851 case rep_prefix_4_byte:
8852 case rep_prefix_1_byte:
8853 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
8854 val_exp, count_exp, move_mode, issetmem);
8855 break;
8856 }
8857 /* Adjust properly the offset of src and dest memory for aliasing. */
8858 if (CONST_INT_P (count_exp))
8859 {
8860 if (!issetmem)
8861 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
8862 (count / size_needed) * size_needed);
8863 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
8864 (count / size_needed) * size_needed);
8865 }
8866 else
8867 {
8868 if (!issetmem)
8869 src = change_address (src, BLKmode, srcreg);
8870 dst = change_address (dst, BLKmode, destreg);
8871 }
8872
8873 /* Step 4: Epilogue to copy the remaining bytes. */
8874 epilogue:
8875 if (label)
8876 {
8877 /* When the main loop is done, COUNT_EXP might hold original count,
8878 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
8879 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
8880 bytes. Compensate if needed. */
8881
8882 if (size_needed < epilogue_size_needed)
8883 {
8884 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
8885 GEN_INT (size_needed - 1), count_exp, 1,
8886 OPTAB_DIRECT);
8887 if (tmp != count_exp)
8888 emit_move_insn (count_exp, tmp);
8889 }
8890 emit_label (label);
8891 LABEL_NUSES (label) = 1;
8892 }
8893
8894 if (count_exp != const0_rtx && epilogue_size_needed > 1)
8895 {
8896 if (force_loopy_epilogue)
8897 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
8898 epilogue_size_needed);
8899 else
8900 {
8901 if (issetmem)
8902 expand_setmem_epilogue (dst, destreg, promoted_val,
8903 vec_promoted_val, count_exp,
8904 epilogue_size_needed);
8905 else
8906 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
8907 epilogue_size_needed);
8908 }
8909 }
8910 if (jump_around_label)
8911 emit_label (jump_around_label);
8912 return true;
8913 }
8914
8915 /* Expand cmpstrn or memcmp. */
8916
8917 bool
8918 ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
8919 rtx length, rtx align, bool is_cmpstrn)
8920 {
8921 /* Expand strncmp and memcmp only with -minline-all-stringops since
8922 "repz cmpsb" can be much slower than strncmp and memcmp functions
8923 implemented with vector instructions, see
8924
8925 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
8926 */
8927 if (!TARGET_INLINE_ALL_STRINGOPS)
8928 return false;
8929
8930 /* Can't use this if the user has appropriated ecx, esi or edi. */
8931 if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
8932 return false;
8933
8934 if (is_cmpstrn)
8935 {
8936 /* For strncmp, length is the maximum length, which can be larger
8937 than actual string lengths. We can expand the cmpstrn pattern
8938 to "repz cmpsb" only if one of the strings is a constant so
8939 that expand_builtin_strncmp() can write the length argument to
8940 be the minimum of the const string length and the actual length
8941 argument. Otherwise, "repz cmpsb" may pass the 0 byte. */
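/* For instance (illustrative only): strncmp (p, "abc", n) can be expanded
   here, because expand_builtin_strncmp () has already capped the length by
   the constant string's length, so "repz cmpsb" cannot run past the
   terminating NUL; strncmp (p, q, n) with two non-constant strings is
   rejected below.  */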
8942 tree t1 = MEM_EXPR (src1);
8943 tree t2 = MEM_EXPR (src2);
8944 if (!((t1 && TREE_CODE (t1) == MEM_REF
8945 && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
8946 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
8947 == STRING_CST))
8948 || (t2 && TREE_CODE (t2) == MEM_REF
8949 && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
8950 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
8951 == STRING_CST))))
8952 return false;
8953 }
8954
8955 rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
8956 rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
8957 if (addr1 != XEXP (src1, 0))
8958 src1 = replace_equiv_address_nv (src1, addr1);
8959 if (addr2 != XEXP (src2, 0))
8960 src2 = replace_equiv_address_nv (src2, addr2);
8961
8962 /* NB: Make a copy of the data length so that the cmpstrnqi patterns
8963 do not change the original data length. */
8964 length = ix86_zero_extend_to_Pmode (length);
8965 rtx lengthreg = gen_reg_rtx (Pmode);
8966 emit_move_insn (lengthreg, length);
8967
8968 /* If we are testing strict equality, we can use known alignment to
8969 good advantage. This may be possible with combine, particularly
8970 once cc0 is dead. */
8971 if (CONST_INT_P (length))
8972 {
8973 if (length == const0_rtx)
8974 {
8975 emit_move_insn (result, const0_rtx);
8976 return true;
8977 }
8978 emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
8979 src1, src2));
8980 }
8981 else
8982 {
8983 emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
8984 emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
8985 src1, src2));
8986 }
8987
8988 rtx out = gen_lowpart (QImode, result);
8989 emit_insn (gen_cmpintqi (out));
8990 emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
8991
8992 return true;
8993 }
8994
8995 /* Expand the appropriate insns for doing strlen if not just doing
8996 repnz; scasb
8997
8998 out = result, initialized with the start address
8999 align_rtx = alignment of the address.
9000 scratch = scratch register, initialized with the start address when
9001 not aligned, otherwise undefined
9002
9003 This is just the body. It needs the initializations mentioned above and
9004 some address computing at the end. These things are done in i386.md. */
9005
9006 static void
9007 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
9008 {
9009 int align;
9010 rtx tmp;
9011 rtx_code_label *align_2_label = NULL;
9012 rtx_code_label *align_3_label = NULL;
9013 rtx_code_label *align_4_label = gen_label_rtx ();
9014 rtx_code_label *end_0_label = gen_label_rtx ();
9015 rtx mem;
9016 rtx tmpreg = gen_reg_rtx (SImode);
9017 rtx scratch = gen_reg_rtx (SImode);
9018 rtx cmp;
9019
9020 align = 0;
9021 if (CONST_INT_P (align_rtx))
9022 align = INTVAL (align_rtx);
9023
9024 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9025
9026 /* Is there a known alignment and is it less than 4? */
9027 if (align < 4)
9028 {
9029 rtx scratch1 = gen_reg_rtx (Pmode);
9030 emit_move_insn (scratch1, out);
9031 /* Is there a known alignment and is it not 2? */
9032 if (align != 2)
9033 {
9034 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
9035 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
9036
9037 /* Leave just the 3 lower bits. */
9038 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
9039 NULL_RTX, 0, OPTAB_WIDEN);
9040
9041 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9042 Pmode, 1, align_4_label);
9043 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
9044 Pmode, 1, align_2_label);
9045 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
9046 Pmode, 1, align_3_label);
9047 }
9048 else
9049 {
9050 /* Since the alignment is 2, we have to check 2 or 0 bytes;
9051 check whether it is aligned to a 4-byte boundary. */
9052
9053 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
9054 NULL_RTX, 0, OPTAB_WIDEN);
9055
9056 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
9057 Pmode, 1, align_4_label);
9058 }
9059
9060 mem = change_address (src, QImode, out);
9061
9062 /* Now compare the bytes. */
9063
9064 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
9065 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
9066 QImode, 1, end_0_label);
9067
9068 /* Increment the address. */
9069 emit_insn (gen_add2_insn (out, const1_rtx));
9070
9071 /* Not needed with an alignment of 2 */
9072 if (align != 2)
9073 {
9074 emit_label (align_2_label);
9075
9076 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9077 end_0_label);
9078
9079 emit_insn (gen_add2_insn (out, const1_rtx));
9080
9081 emit_label (align_3_label);
9082 }
9083
9084 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
9085 end_0_label);
9086
9087 emit_insn (gen_add2_insn (out, const1_rtx));
9088 }
9089
9090 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
9091 align this loop; doing so only makes the program larger and does not
9092 help to speed it up. */
9093 emit_label (align_4_label);
9094
9095 mem = change_address (src, SImode, out);
9096 emit_move_insn (scratch, mem);
9097 emit_insn (gen_add2_insn (out, GEN_INT (4)));
9098
9099 /* This formula yields a nonzero result iff one of the bytes is zero.
9100 This saves three branches inside the loop and many cycles. */
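/* In C terms the instruction sequence below computes

     tmpreg = (scratch - 0x01010101) & ~scratch & 0x80808080;

   e.g. (illustrative arithmetic) for scratch = 0x11002233:
     0x11002233 - 0x01010101 = 0x0FFF2132
     & ~0x11002233 (= 0xEEFFDDCC) = 0x0EFF0100
     & 0x80808080 = 0x00800000 != 0, flagging the zero byte.  */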
9101
9102 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
9103 emit_insn (gen_one_cmplsi2 (scratch, scratch));
9104 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
9105 emit_insn (gen_andsi3 (tmpreg, tmpreg,
9106 gen_int_mode (0x80808080, SImode)));
9107 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
9108 align_4_label);
9109
9110 if (TARGET_CMOVE)
9111 {
9112 rtx reg = gen_reg_rtx (SImode);
9113 rtx reg2 = gen_reg_rtx (Pmode);
9114 emit_move_insn (reg, tmpreg);
9115 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
9116
9117 /* If zero is not in the first two bytes, move two bytes forward. */
9118 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9119 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9120 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9121 emit_insn (gen_rtx_SET (tmpreg,
9122 gen_rtx_IF_THEN_ELSE (SImode, tmp,
9123 reg,
9124 tmpreg)));
9125 /* Emit lea manually to avoid clobbering of flags. */
9126 emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
9127
9128 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9129 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
9130 emit_insn (gen_rtx_SET (out,
9131 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
9132 reg2,
9133 out)));
9134 }
9135 else
9136 {
9137 rtx_code_label *end_2_label = gen_label_rtx ();
9138 /* Is zero in the first two bytes? */
9139
9140 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
9141 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
9142 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
9143 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9144 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
9145 pc_rtx);
9146 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9147 JUMP_LABEL (tmp) = end_2_label;
9148
9149 /* Not in the first two. Move two bytes forward. */
9150 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
9151 emit_insn (gen_add2_insn (out, const2_rtx));
9152
9153 emit_label (end_2_label);
9154
9155 }
9156
9157 /* Avoid branch in fixing the byte. */
9158 tmpreg = gen_lowpart (QImode, tmpreg);
9159 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
9160 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
9161 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
9162 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
9163
9164 emit_label (end_0_label);
9165 }
9166
9167 /* Expand strlen. */
9168
9169 bool
9170 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9171 {
9172 if (TARGET_UNROLL_STRLEN
9173 && TARGET_INLINE_ALL_STRINGOPS
9174 && eoschar == const0_rtx
9175 && optimize > 1)
9176 {
9177 /* The generic case of the strlen expander is long. Avoid its
9178 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
9179 rtx addr = force_reg (Pmode, XEXP (src, 0));
9180 /* It seems that some optimizers do not combine a call like
9181 foo (strlen (bar), strlen (bar));
9182 when the move and the subtraction are done here. They do calculate
9183 the length just once when these instructions are done inside of
9184 output_strlen_unroll (). But since &bar[strlen (bar)] is
9185 often used, and this uses one fewer register for the lifetime of
9186 output_strlen_unroll (), this is better. */
9187
9188 emit_move_insn (out, addr);
9189
9190 ix86_expand_strlensi_unroll_1 (out, src, align);
9191
9192 /* strlensi_unroll_1 returns the address of the zero at the end of
9193 the string, like memchr(), so compute the length by subtracting
9194 the start address. */
9195 emit_insn (gen_sub2_insn (out, addr));
9196 return true;
9197 }
9198 else
9199 return false;
9200 }
9201
9202 /* For a given symbol (function), construct code to compute the address of its
9203 PLT entry in the large x86-64 PIC model. */
9204
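/* A rough sketch of the emitted sequence (the assembly syntax is illustrative;
   TMP stands for the fresh register allocated below):

     movabs $symbol@PLTOFF, TMP
     add    <PIC register holding the GOT base>, TMP

   giving the absolute address of SYMBOL's PLT entry.  */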
9205 static rtx
9206 construct_plt_address (rtx symbol)
9207 {
9208 rtx tmp, unspec;
9209
9210 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9211 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9212 gcc_assert (Pmode == DImode);
9213
9214 tmp = gen_reg_rtx (Pmode);
9215 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9216
9217 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9218 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9219 return tmp;
9220 }
9221
9222 /* Additional registers that are clobbered by SYSV calls. */
9223
9224 static int const x86_64_ms_sysv_extra_clobbered_registers
9225 [NUM_X86_64_MS_CLOBBERED_REGS] =
9226 {
9227 SI_REG, DI_REG,
9228 XMM6_REG, XMM7_REG,
9229 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
9230 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
9231 };
9232
9233 rtx_insn *
9234 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
9235 rtx callarg2,
9236 rtx pop, bool sibcall)
9237 {
9238 rtx vec[3];
9239 rtx use = NULL, call;
9240 unsigned int vec_len = 0;
9241 tree fndecl;
9242
9243 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9244 {
9245 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
9246 if (fndecl
9247 && (lookup_attribute ("interrupt",
9248 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
9249 error ("interrupt service routine cannot be called directly");
9250 }
9251 else
9252 fndecl = NULL_TREE;
9253
9254 if (pop == const0_rtx)
9255 pop = NULL;
9256 gcc_assert (!TARGET_64BIT || !pop);
9257
9258 rtx addr = XEXP (fnaddr, 0);
9259 if (TARGET_MACHO && !TARGET_64BIT)
9260 {
9261 #if TARGET_MACHO
9262 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
9263 fnaddr = machopic_indirect_call_target (fnaddr);
9264 #endif
9265 }
9266 else
9267 {
9268 /* Static functions and indirect calls don't need the pic register. Also,
9269 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9270 it an indirect call. */
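/* For example (illustrative, x86-64, small PIC model): with -fno-plt or the
   "noplt" attribute the code below turns the call into an indirect call
   through the GOT slot, roughly "call *foo@GOTPCREL(%rip)", instead of a
   direct "call foo@PLT".  */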
9271 if (flag_pic
9272 && GET_CODE (addr) == SYMBOL_REF
9273 && ix86_call_use_plt_p (addr))
9274 {
9275 if (flag_plt
9276 && (SYMBOL_REF_DECL (addr) == NULL_TREE
9277 || !lookup_attribute ("noplt",
9278 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
9279 {
9280 if (!TARGET_64BIT
9281 || (ix86_cmodel == CM_LARGE_PIC
9282 && DEFAULT_ABI != MS_ABI))
9283 {
9284 use_reg (&use, gen_rtx_REG (Pmode,
9285 REAL_PIC_OFFSET_TABLE_REGNUM));
9286 if (ix86_use_pseudo_pic_reg ())
9287 emit_move_insn (gen_rtx_REG (Pmode,
9288 REAL_PIC_OFFSET_TABLE_REGNUM),
9289 pic_offset_table_rtx);
9290 }
9291 }
9292 else if (!TARGET_PECOFF && !TARGET_MACHO)
9293 {
9294 if (TARGET_64BIT
9295 && ix86_cmodel == CM_LARGE_PIC
9296 && DEFAULT_ABI != MS_ABI)
9297 {
9298 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9299 UNSPEC_GOT);
9300 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9301 fnaddr = force_reg (Pmode, fnaddr);
9302 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
9303 }
9304 else if (TARGET_64BIT)
9305 {
9306 fnaddr = gen_rtx_UNSPEC (Pmode,
9307 gen_rtvec (1, addr),
9308 UNSPEC_GOTPCREL);
9309 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9310 }
9311 else
9312 {
9313 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
9314 UNSPEC_GOT);
9315 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
9316 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
9317 fnaddr);
9318 }
9319 fnaddr = gen_const_mem (Pmode, fnaddr);
9320 /* Pmode may not be the same as word_mode for x32, which
9321 doesn't support indirect branch via 32-bit memory slot.
9322 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9323 indirect branch via x32 GOT slot is OK. */
9324 if (GET_MODE (fnaddr) != word_mode)
9325 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
9326 fnaddr = gen_rtx_MEM (QImode, fnaddr);
9327 }
9328 }
9329 }
9330
9331 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9332 parameters passed in vector registers. */
9333 if (TARGET_64BIT
9334 && (INTVAL (callarg2) > 0
9335 || (INTVAL (callarg2) == 0
9336 && (TARGET_SSE || !flag_skip_rax_setup))))
9337 {
9338 rtx al = gen_rtx_REG (QImode, AX_REG);
9339 emit_move_insn (al, callarg2);
9340 use_reg (&use, al);
9341 }
9342
9343 if (ix86_cmodel == CM_LARGE_PIC
9344 && !TARGET_PECOFF
9345 && MEM_P (fnaddr)
9346 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
9347 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
9348 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
9349 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9350 branch via x32 GOT slot is OK. */
9351 else if (!(TARGET_X32
9352 && MEM_P (fnaddr)
9353 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
9354 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
9355 && (sibcall
9356 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
9357 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
9358 {
9359 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
9360 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
9361 }
9362
9363 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
9364
9365 if (retval)
9366 call = gen_rtx_SET (retval, call);
9367 vec[vec_len++] = call;
9368
9369 if (pop)
9370 {
9371 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
9372 pop = gen_rtx_SET (stack_pointer_rtx, pop);
9373 vec[vec_len++] = pop;
9374 }
9375
9376 if (cfun->machine->no_caller_saved_registers
9377 && (!fndecl
9378 || (!TREE_THIS_VOLATILE (fndecl)
9379 && !lookup_attribute ("no_caller_saved_registers",
9380 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
9381 {
9382 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
9383 bool is_64bit_ms_abi = (TARGET_64BIT
9384 && ix86_function_abi (fndecl) == MS_ABI);
9385 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
9386
9387 /* If there are no caller-saved registers, add all registers
9388 that are clobbered by the call which returns. */
9389 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
9390 if (!fixed_regs[i]
9391 && (ix86_call_used_regs[i] == 1
9392 || (ix86_call_used_regs[i] & c_mask))
9393 && !STACK_REGNO_P (i)
9394 && !MMX_REGNO_P (i))
9395 clobber_reg (&use,
9396 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
9397 }
9398 else if (TARGET_64BIT_MS_ABI
9399 && (!callarg2 || INTVAL (callarg2) != -2))
9400 {
9401 unsigned i;
9402
9403 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
9404 {
9405 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
9406 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
9407
9408 clobber_reg (&use, gen_rtx_REG (mode, regno));
9409 }
9410
9411 /* Set here, but it may get cleared later. */
9412 if (TARGET_CALL_MS2SYSV_XLOGUES)
9413 {
9414 if (!TARGET_SSE)
9415 ;
9416
9417 /* Don't break hot-patched functions. */
9418 else if (ix86_function_ms_hook_prologue (current_function_decl))
9419 ;
9420
9421 /* TODO: Cases not yet examined. */
9422 else if (flag_split_stack)
9423 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9424
9425 else
9426 {
9427 gcc_assert (!reload_completed);
9428 cfun->machine->call_ms2sysv = true;
9429 }
9430 }
9431 }
9432
9433 if (TARGET_MACHO && TARGET_64BIT && !sibcall
9434 && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
9435 || !fndecl || TREE_PUBLIC (fndecl)))
9436 {
9437 /* We allow public functions defined in a TU to bind locally for PIC
9438 code (the default) on 64bit Mach-O.
9439 If such functions are not inlined, we cannot tell at compile-time if
9440 they will be called via the lazy symbol resolver (this can depend on
9441 options given at link-time). Therefore, we must assume that the lazy
9442 resolver could be used which clobbers R11 and R10. */
9443 clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
9444 clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
9445 }
9446
9447 if (vec_len > 1)
9448 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
9449 rtx_insn *call_insn = emit_call_insn (call);
9450 if (use)
9451 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
9452
9453 return call_insn;
9454 }
9455
9456 /* Split a simple return that pops POPC bytes from the stack into an
9457 indirect branch with an explicit stack adjustment. */
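/* The emitted sequence is conceptually:
	pop	%ecx		# return address -> ECX
	add	$POPC, %esp	# drop the stacked argument bytes
	jmp	*%ecx
   with CFA notes so the unwinder keeps tracking the CFA and the return
   address (now in ECX) across the sequence.  */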
9458
9459 void
9460 ix86_split_simple_return_pop_internal (rtx popc)
9461 {
9462 struct machine_function *m = cfun->machine;
9463 rtx ecx = gen_rtx_REG (SImode, CX_REG);
9464 rtx_insn *insn;
9465
9466 /* There is no "pascal" calling convention in any 64bit ABI. */
9467 gcc_assert (!TARGET_64BIT);
9468
9469 insn = emit_insn (gen_pop (ecx));
9470 m->fs.cfa_offset -= UNITS_PER_WORD;
9471 m->fs.sp_offset -= UNITS_PER_WORD;
9472
9473 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
9474 x = gen_rtx_SET (stack_pointer_rtx, x);
9475 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9476 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
9477 RTX_FRAME_RELATED_P (insn) = 1;
9478
9479 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
9480 x = gen_rtx_SET (stack_pointer_rtx, x);
9481 insn = emit_insn (x);
9482 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
9483 RTX_FRAME_RELATED_P (insn) = 1;
9484
9485 /* Now return address is in ECX. */
9486 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
9487 }
9488
9489 /* Errors in the source file can cause expand_expr to return const0_rtx
9490 where we expect a vector. To avoid crashing, use one of the vector
9491 clear instructions. */
9492
9493 static rtx
9494 safe_vector_operand (rtx x, machine_mode mode)
9495 {
9496 if (x == const0_rtx)
9497 x = CONST0_RTX (mode);
9498 return x;
9499 }
9500
9501 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
9502
9503 static rtx
9504 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
9505 {
9506 rtx pat;
9507 tree arg0 = CALL_EXPR_ARG (exp, 0);
9508 tree arg1 = CALL_EXPR_ARG (exp, 1);
9509 rtx op0 = expand_normal (arg0);
9510 rtx op1 = expand_normal (arg1);
9511 machine_mode tmode = insn_data[icode].operand[0].mode;
9512 machine_mode mode0 = insn_data[icode].operand[1].mode;
9513 machine_mode mode1 = insn_data[icode].operand[2].mode;
9514
9515 if (VECTOR_MODE_P (mode0))
9516 op0 = safe_vector_operand (op0, mode0);
9517 if (VECTOR_MODE_P (mode1))
9518 op1 = safe_vector_operand (op1, mode1);
9519
9520 if (optimize || !target
9521 || GET_MODE (target) != tmode
9522 || !insn_data[icode].operand[0].predicate (target, tmode))
9523 target = gen_reg_rtx (tmode);
9524
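  /* Some binop builtins pass an SImode value where the insn pattern
     expects a TImode operand; load it into the low element of an XMM
     register with the upper elements zeroed and view that register as
     TImode.  */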
9525 if (GET_MODE (op1) == SImode && mode1 == TImode)
9526 {
9527 rtx x = gen_reg_rtx (V4SImode);
9528 emit_insn (gen_sse2_loadd (x, op1));
9529 op1 = gen_lowpart (TImode, x);
9530 }
9531
9532 if (!insn_data[icode].operand[1].predicate (op0, mode0))
9533 op0 = copy_to_mode_reg (mode0, op0);
9534 if (!insn_data[icode].operand[2].predicate (op1, mode1))
9535 op1 = copy_to_mode_reg (mode1, op1);
9536
9537 pat = GEN_FCN (icode) (target, op0, op1);
9538 if (! pat)
9539 return 0;
9540
9541 emit_insn (pat);
9542
9543 return target;
9544 }
9545
9546 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9547
9548 static rtx
9549 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
9550 enum ix86_builtin_func_type m_type,
9551 enum rtx_code sub_code)
9552 {
9553 rtx pat;
9554 unsigned int i, nargs;
9555 bool comparison_p = false;
9556 bool tf_p = false;
9557 bool last_arg_constant = false;
9558 int num_memory = 0;
9559 rtx xops[4];
9560
9561 machine_mode tmode = insn_data[icode].operand[0].mode;
9562
9563 switch (m_type)
9564 {
9565 case MULTI_ARG_4_DF2_DI_I:
9566 case MULTI_ARG_4_DF2_DI_I1:
9567 case MULTI_ARG_4_SF2_SI_I:
9568 case MULTI_ARG_4_SF2_SI_I1:
9569 nargs = 4;
9570 last_arg_constant = true;
9571 break;
9572
9573 case MULTI_ARG_3_SF:
9574 case MULTI_ARG_3_DF:
9575 case MULTI_ARG_3_SF2:
9576 case MULTI_ARG_3_DF2:
9577 case MULTI_ARG_3_DI:
9578 case MULTI_ARG_3_SI:
9579 case MULTI_ARG_3_SI_DI:
9580 case MULTI_ARG_3_HI:
9581 case MULTI_ARG_3_HI_SI:
9582 case MULTI_ARG_3_QI:
9583 case MULTI_ARG_3_DI2:
9584 case MULTI_ARG_3_SI2:
9585 case MULTI_ARG_3_HI2:
9586 case MULTI_ARG_3_QI2:
9587 nargs = 3;
9588 break;
9589
9590 case MULTI_ARG_2_SF:
9591 case MULTI_ARG_2_DF:
9592 case MULTI_ARG_2_DI:
9593 case MULTI_ARG_2_SI:
9594 case MULTI_ARG_2_HI:
9595 case MULTI_ARG_2_QI:
9596 nargs = 2;
9597 break;
9598
9599 case MULTI_ARG_2_DI_IMM:
9600 case MULTI_ARG_2_SI_IMM:
9601 case MULTI_ARG_2_HI_IMM:
9602 case MULTI_ARG_2_QI_IMM:
9603 nargs = 2;
9604 last_arg_constant = true;
9605 break;
9606
9607 case MULTI_ARG_1_SF:
9608 case MULTI_ARG_1_DF:
9609 case MULTI_ARG_1_SF2:
9610 case MULTI_ARG_1_DF2:
9611 case MULTI_ARG_1_DI:
9612 case MULTI_ARG_1_SI:
9613 case MULTI_ARG_1_HI:
9614 case MULTI_ARG_1_QI:
9615 case MULTI_ARG_1_SI_DI:
9616 case MULTI_ARG_1_HI_DI:
9617 case MULTI_ARG_1_HI_SI:
9618 case MULTI_ARG_1_QI_DI:
9619 case MULTI_ARG_1_QI_SI:
9620 case MULTI_ARG_1_QI_HI:
9621 nargs = 1;
9622 break;
9623
9624 case MULTI_ARG_2_DI_CMP:
9625 case MULTI_ARG_2_SI_CMP:
9626 case MULTI_ARG_2_HI_CMP:
9627 case MULTI_ARG_2_QI_CMP:
9628 nargs = 2;
9629 comparison_p = true;
9630 break;
9631
9632 case MULTI_ARG_2_SF_TF:
9633 case MULTI_ARG_2_DF_TF:
9634 case MULTI_ARG_2_DI_TF:
9635 case MULTI_ARG_2_SI_TF:
9636 case MULTI_ARG_2_HI_TF:
9637 case MULTI_ARG_2_QI_TF:
9638 nargs = 2;
9639 tf_p = true;
9640 break;
9641
9642 default:
9643 gcc_unreachable ();
9644 }
9645
9646 if (optimize || !target
9647 || GET_MODE (target) != tmode
9648 || !insn_data[icode].operand[0].predicate (target, tmode))
9649 target = gen_reg_rtx (tmode);
9650 else if (memory_operand (target, tmode))
9651 num_memory++;
9652
9653 gcc_assert (nargs <= ARRAY_SIZE (xops));
9654
9655 for (i = 0; i < nargs; i++)
9656 {
9657 tree arg = CALL_EXPR_ARG (exp, i);
9658 rtx op = expand_normal (arg);
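      /* Comparison patterns carry an extra leading comparison rtx, so
	 argument I maps to insn operand I + 2 rather than I + 1.  */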
9659 int adjust = (comparison_p) ? 1 : 0;
9660 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
9661
9662 if (last_arg_constant && i == nargs - 1)
9663 {
9664 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
9665 {
9666 enum insn_code new_icode = icode;
9667 switch (icode)
9668 {
9669 case CODE_FOR_xop_vpermil2v2df3:
9670 case CODE_FOR_xop_vpermil2v4sf3:
9671 case CODE_FOR_xop_vpermil2v4df3:
9672 case CODE_FOR_xop_vpermil2v8sf3:
9673 error ("the last argument must be a 2-bit immediate");
9674 return gen_reg_rtx (tmode);
9675 case CODE_FOR_xop_rotlv2di3:
9676 new_icode = CODE_FOR_rotlv2di3;
9677 goto xop_rotl;
9678 case CODE_FOR_xop_rotlv4si3:
9679 new_icode = CODE_FOR_rotlv4si3;
9680 goto xop_rotl;
9681 case CODE_FOR_xop_rotlv8hi3:
9682 new_icode = CODE_FOR_rotlv8hi3;
9683 goto xop_rotl;
9684 case CODE_FOR_xop_rotlv16qi3:
9685 new_icode = CODE_FOR_rotlv16qi3;
9686 xop_rotl:
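		      /* Reduce a constant rotate count modulo the element
			 width; for a variable count fall back to the
			 generic rotate pattern, which takes the same
			 operands.  */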
9687 if (CONST_INT_P (op))
9688 {
9689 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
9690 op = GEN_INT (INTVAL (op) & mask);
9691 gcc_checking_assert
9692 (insn_data[icode].operand[i + 1].predicate (op, mode));
9693 }
9694 else
9695 {
9696 gcc_checking_assert
9697 (nargs == 2
9698 && insn_data[new_icode].operand[0].mode == tmode
9699 && insn_data[new_icode].operand[1].mode == tmode
9700 && insn_data[new_icode].operand[2].mode == mode
9701 && insn_data[new_icode].operand[0].predicate
9702 == insn_data[icode].operand[0].predicate
9703 && insn_data[new_icode].operand[1].predicate
9704 == insn_data[icode].operand[1].predicate);
9705 icode = new_icode;
9706 goto non_constant;
9707 }
9708 break;
9709 default:
9710 gcc_unreachable ();
9711 }
9712 }
9713 }
9714 else
9715 {
9716 non_constant:
9717 if (VECTOR_MODE_P (mode))
9718 op = safe_vector_operand (op, mode);
9719
9720 /* If we aren't optimizing, only allow one memory operand to be
9721 generated. */
9722 if (memory_operand (op, mode))
9723 num_memory++;
9724
9725 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
9726
9727 if (optimize
9728 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
9729 || num_memory > 1)
9730 op = force_reg (mode, op);
9731 }
9732
9733 xops[i] = op;
9734 }
9735
9736 switch (nargs)
9737 {
9738 case 1:
9739 pat = GEN_FCN (icode) (target, xops[0]);
9740 break;
9741
9742 case 2:
9743 if (tf_p)
9744 pat = GEN_FCN (icode) (target, xops[0], xops[1],
9745 GEN_INT ((int)sub_code));
9746 else if (! comparison_p)
9747 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
9748 else
9749 {
9750 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
9751 xops[0], xops[1]);
9752
9753 pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
9754 }
9755 break;
9756
9757 case 3:
9758 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
9759 break;
9760
9761 case 4:
9762 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
9763 break;
9764
9765 default:
9766 gcc_unreachable ();
9767 }
9768
9769 if (! pat)
9770 return 0;
9771
9772 emit_insn (pat);
9773 return target;
9774 }
9775
9776 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9777 insns with vec_merge. */
9778
9779 static rtx
9780 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
9781 rtx target)
9782 {
9783 rtx pat;
9784 tree arg0 = CALL_EXPR_ARG (exp, 0);
9785 rtx op1, op0 = expand_normal (arg0);
9786 machine_mode tmode = insn_data[icode].operand[0].mode;
9787 machine_mode mode0 = insn_data[icode].operand[1].mode;
9788
9789 if (optimize || !target
9790 || GET_MODE (target) != tmode
9791 || !insn_data[icode].operand[0].predicate (target, tmode))
9792 target = gen_reg_rtx (tmode);
9793
9794 if (VECTOR_MODE_P (mode0))
9795 op0 = safe_vector_operand (op0, mode0);
9796
9797 if ((optimize && !register_operand (op0, mode0))
9798 || !insn_data[icode].operand[1].predicate (op0, mode0))
9799 op0 = copy_to_mode_reg (mode0, op0);
9800
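  /* The vec_merge pattern reads the source twice: one copy feeds the
     scalar operation and the other supplies the untouched upper elements,
     so pass the same value for both operands.  */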
9801 op1 = op0;
9802 if (!insn_data[icode].operand[2].predicate (op1, mode0))
9803 op1 = copy_to_mode_reg (mode0, op1);
9804
9805 pat = GEN_FCN (icode) (target, op0, op1);
9806 if (! pat)
9807 return 0;
9808 emit_insn (pat);
9809 return target;
9810 }
9811
9812 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
9813
9814 static rtx
9815 ix86_expand_sse_compare (const struct builtin_description *d,
9816 tree exp, rtx target, bool swap)
9817 {
9818 rtx pat;
9819 tree arg0 = CALL_EXPR_ARG (exp, 0);
9820 tree arg1 = CALL_EXPR_ARG (exp, 1);
9821 rtx op0 = expand_normal (arg0);
9822 rtx op1 = expand_normal (arg1);
9823 rtx op2;
9824 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9825 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9826 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9827 enum rtx_code comparison = d->comparison;
9828
9829 if (VECTOR_MODE_P (mode0))
9830 op0 = safe_vector_operand (op0, mode0);
9831 if (VECTOR_MODE_P (mode1))
9832 op1 = safe_vector_operand (op1, mode1);
9833
9834 /* Swap operands if we have a comparison that isn't available in
9835 hardware. */
9836 if (swap)
9837 std::swap (op0, op1);
9838
9839 if (optimize || !target
9840 || GET_MODE (target) != tmode
9841 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9842 target = gen_reg_rtx (tmode);
9843
9844 if ((optimize && !register_operand (op0, mode0))
9845 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
9846 op0 = copy_to_mode_reg (mode0, op0);
9847 if ((optimize && !register_operand (op1, mode1))
9848 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
9849 op1 = copy_to_mode_reg (mode1, op1);
9850
9851 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
9852 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9853 if (! pat)
9854 return 0;
9855 emit_insn (pat);
9856 return target;
9857 }
9858
9859 /* Subroutine of ix86_expand_sse_comi and ix86_expand_sse_comi_round to
9860 handle ordered EQ or unordered NE: generate a PF jump when needed. */
9861
9862 static rtx
9863 ix86_ssecom_setcc (const enum rtx_code comparison,
9864 bool check_unordered, machine_mode mode,
9865 rtx set_dst, rtx target)
9866 {
9867
9868 rtx_code_label *label = NULL;
9869
9870 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
9871 sufficient when an operand is NaN. */
9872 if (check_unordered)
9873 {
9874 gcc_assert (comparison == EQ || comparison == NE);
9875
9876 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
9877 label = gen_label_rtx ();
9878 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
9879 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
9880 gen_rtx_LABEL_REF (VOIDmode, label),
9881 pc_rtx);
9882 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
9883 }
9884
9885 /* NB: The comparison sets CCFPmode; check a different CCmode that is
9886 a subset of CCFPmode. */
9887 if (GET_MODE (set_dst) != mode)
9888 {
9889 gcc_assert (mode == CCAmode || mode == CCCmode
9890 || mode == CCOmode || mode == CCPmode
9891 || mode == CCSmode || mode == CCZmode);
9892 set_dst = gen_rtx_REG (mode, FLAGS_REG);
9893 }
9894
9895 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
9896 gen_rtx_fmt_ee (comparison, QImode,
9897 set_dst,
9898 const0_rtx)));
9899
9900 if (label)
9901 emit_label (label);
9902
9903 return SUBREG_REG (target);
9904 }
9905
9906 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
9907
9908 static rtx
9909 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
9910 rtx target)
9911 {
9912 rtx pat, set_dst;
9913 tree arg0 = CALL_EXPR_ARG (exp, 0);
9914 tree arg1 = CALL_EXPR_ARG (exp, 1);
9915 rtx op0 = expand_normal (arg0);
9916 rtx op1 = expand_normal (arg1);
9917 enum insn_code icode = d->icode;
9918 const struct insn_data_d *insn_p = &insn_data[icode];
9919 machine_mode mode0 = insn_p->operand[0].mode;
9920 machine_mode mode1 = insn_p->operand[1].mode;
9921
9922 if (VECTOR_MODE_P (mode0))
9923 op0 = safe_vector_operand (op0, mode0);
9924 if (VECTOR_MODE_P (mode1))
9925 op1 = safe_vector_operand (op1, mode1);
9926
9927 enum rtx_code comparison = d->comparison;
9928 rtx const_val = const0_rtx;
9929
9930 bool check_unordered = false;
9931 machine_mode mode = CCFPmode;
9932 switch (comparison)
9933 {
9934 case LE: /* -> GE */
9935 case LT: /* -> GT */
9936 std::swap (op0, op1);
9937 comparison = swap_condition (comparison);
9938 /* FALLTHRU */
9939 case GT:
9940 case GE:
9941 break;
9942 case EQ:
9943 check_unordered = true;
9944 mode = CCZmode;
9945 break;
9946 case NE:
9947 check_unordered = true;
9948 mode = CCZmode;
9949 const_val = const1_rtx;
9950 break;
9951 default:
9952 gcc_unreachable ();
9953 }
9954
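  /* Preload TARGET with the value the builtin must return when the
     operands compare unordered (0 for EQ, 1 for NE); ix86_ssecom_setcc
     jumps over the setcc on PF in that case.  Writing the setcc result
     into the low byte via STRICT_LOW_PART leaves the upper bytes of the
     preset value intact, so the int result is already zero-extended.  */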
9955 target = gen_reg_rtx (SImode);
9956 emit_move_insn (target, const_val);
9957 target = gen_rtx_SUBREG (QImode, target, 0);
9958
9959 if ((optimize && !register_operand (op0, mode0))
9960 || !insn_p->operand[0].predicate (op0, mode0))
9961 op0 = copy_to_mode_reg (mode0, op0);
9962 if ((optimize && !register_operand (op1, mode1))
9963 || !insn_p->operand[1].predicate (op1, mode1))
9964 op1 = copy_to_mode_reg (mode1, op1);
9965
9966 pat = GEN_FCN (icode) (op0, op1);
9967 if (! pat)
9968 return 0;
9969
9970 set_dst = SET_DEST (pat);
9971 emit_insn (pat);
9972 return ix86_ssecom_setcc (comparison, check_unordered, mode,
9973 set_dst, target);
9974 }
9975
9976 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
9977
9978 static rtx
9979 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
9980 rtx target)
9981 {
9982 rtx pat;
9983 tree arg0 = CALL_EXPR_ARG (exp, 0);
9984 rtx op1, op0 = expand_normal (arg0);
9985 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9986 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9987
9988 if (optimize || target == 0
9989 || GET_MODE (target) != tmode
9990 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9991 target = gen_reg_rtx (tmode);
9992
9993 if (VECTOR_MODE_P (mode0))
9994 op0 = safe_vector_operand (op0, mode0);
9995
9996 if ((optimize && !register_operand (op0, mode0))
9997 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9998 op0 = copy_to_mode_reg (mode0, op0);
9999
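  /* For round insns the COMPARISON field of D is reused to hold the
     immediate rounding operand.  */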
10000 op1 = GEN_INT (d->comparison);
10001
10002 pat = GEN_FCN (d->icode) (target, op0, op1);
10003 if (! pat)
10004 return 0;
10005 emit_insn (pat);
10006 return target;
10007 }
10008
10009 static rtx
10010 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
10011 tree exp, rtx target)
10012 {
10013 rtx pat;
10014 tree arg0 = CALL_EXPR_ARG (exp, 0);
10015 tree arg1 = CALL_EXPR_ARG (exp, 1);
10016 rtx op0 = expand_normal (arg0);
10017 rtx op1 = expand_normal (arg1);
10018 rtx op2;
10019 machine_mode tmode = insn_data[d->icode].operand[0].mode;
10020 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
10021 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
10022
10023 if (optimize || target == 0
10024 || GET_MODE (target) != tmode
10025 || !insn_data[d->icode].operand[0].predicate (target, tmode))
10026 target = gen_reg_rtx (tmode);
10027
10028 op0 = safe_vector_operand (op0, mode0);
10029 op1 = safe_vector_operand (op1, mode1);
10030
10031 if ((optimize && !register_operand (op0, mode0))
10032 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10033 op0 = copy_to_mode_reg (mode0, op0);
10034 if ((optimize && !register_operand (op1, mode1))
10035 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10036 op1 = copy_to_mode_reg (mode1, op1);
10037
10038 op2 = GEN_INT (d->comparison);
10039
10040 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
10041 if (! pat)
10042 return 0;
10043 emit_insn (pat);
10044 return target;
10045 }
10046
10047 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
10048
10049 static rtx
10050 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
10051 rtx target)
10052 {
10053 rtx pat;
10054 tree arg0 = CALL_EXPR_ARG (exp, 0);
10055 tree arg1 = CALL_EXPR_ARG (exp, 1);
10056 rtx op0 = expand_normal (arg0);
10057 rtx op1 = expand_normal (arg1);
10058 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
10059 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
10060 enum rtx_code comparison = d->comparison;
10061
10062 if (VECTOR_MODE_P (mode0))
10063 op0 = safe_vector_operand (op0, mode0);
10064 if (VECTOR_MODE_P (mode1))
10065 op1 = safe_vector_operand (op1, mode1);
10066
10067 target = gen_reg_rtx (SImode);
10068 emit_move_insn (target, const0_rtx);
10069 target = gen_rtx_SUBREG (QImode, target, 0);
10070
10071 if ((optimize && !register_operand (op0, mode0))
10072 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
10073 op0 = copy_to_mode_reg (mode0, op0);
10074 if ((optimize && !register_operand (op1, mode1))
10075 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
10076 op1 = copy_to_mode_reg (mode1, op1);
10077
10078 pat = GEN_FCN (d->icode) (op0, op1);
10079 if (! pat)
10080 return 0;
10081 emit_insn (pat);
10082 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10083 gen_rtx_fmt_ee (comparison, QImode,
10084 SET_DEST (pat),
10085 const0_rtx)));
10086
10087 return SUBREG_REG (target);
10088 }
10089
10090 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10091
10092 static rtx
10093 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
10094 tree exp, rtx target)
10095 {
10096 rtx pat;
10097 tree arg0 = CALL_EXPR_ARG (exp, 0);
10098 tree arg1 = CALL_EXPR_ARG (exp, 1);
10099 tree arg2 = CALL_EXPR_ARG (exp, 2);
10100 tree arg3 = CALL_EXPR_ARG (exp, 3);
10101 tree arg4 = CALL_EXPR_ARG (exp, 4);
10102 rtx scratch0, scratch1;
10103 rtx op0 = expand_normal (arg0);
10104 rtx op1 = expand_normal (arg1);
10105 rtx op2 = expand_normal (arg2);
10106 rtx op3 = expand_normal (arg3);
10107 rtx op4 = expand_normal (arg4);
10108 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
10109
10110 tmode0 = insn_data[d->icode].operand[0].mode;
10111 tmode1 = insn_data[d->icode].operand[1].mode;
10112 modev2 = insn_data[d->icode].operand[2].mode;
10113 modei3 = insn_data[d->icode].operand[3].mode;
10114 modev4 = insn_data[d->icode].operand[4].mode;
10115 modei5 = insn_data[d->icode].operand[5].mode;
10116 modeimm = insn_data[d->icode].operand[6].mode;
10117
10118 if (VECTOR_MODE_P (modev2))
10119 op0 = safe_vector_operand (op0, modev2);
10120 if (VECTOR_MODE_P (modev4))
10121 op2 = safe_vector_operand (op2, modev4);
10122
10123 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10124 op0 = copy_to_mode_reg (modev2, op0);
10125 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
10126 op1 = copy_to_mode_reg (modei3, op1);
10127 if ((optimize && !register_operand (op2, modev4))
10128 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
10129 op2 = copy_to_mode_reg (modev4, op2);
10130 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
10131 op3 = copy_to_mode_reg (modei5, op3);
10132
10133 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
10134 {
10135 error ("the fifth argument must be an 8-bit immediate");
10136 return const0_rtx;
10137 }
10138
10139 if (d->code == IX86_BUILTIN_PCMPESTRI128)
10140 {
10141 if (optimize || !target
10142 || GET_MODE (target) != tmode0
10143 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10144 target = gen_reg_rtx (tmode0);
10145
10146 scratch1 = gen_reg_rtx (tmode1);
10147
10148 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
10149 }
10150 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
10151 {
10152 if (optimize || !target
10153 || GET_MODE (target) != tmode1
10154 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10155 target = gen_reg_rtx (tmode1);
10156
10157 scratch0 = gen_reg_rtx (tmode0);
10158
10159 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
10160 }
10161 else
10162 {
10163 gcc_assert (d->flag);
10164
10165 scratch0 = gen_reg_rtx (tmode0);
10166 scratch1 = gen_reg_rtx (tmode1);
10167
10168 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
10169 }
10170
10171 if (! pat)
10172 return 0;
10173
10174 emit_insn (pat);
10175
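  /* For the flag-returning variants, D->FLAG gives the CC mode in which
     to test the flags register; materialize the result of that test as
     an int.  */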
10176 if (d->flag)
10177 {
10178 target = gen_reg_rtx (SImode);
10179 emit_move_insn (target, const0_rtx);
10180 target = gen_rtx_SUBREG (QImode, target, 0);
10181
10182 emit_insn
10183 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10184 gen_rtx_fmt_ee (EQ, QImode,
10185 gen_rtx_REG ((machine_mode) d->flag,
10186 FLAGS_REG),
10187 const0_rtx)));
10188 return SUBREG_REG (target);
10189 }
10190 else
10191 return target;
10192 }
10193
10194
10195 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10196
10197 static rtx
10198 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
10199 tree exp, rtx target)
10200 {
10201 rtx pat;
10202 tree arg0 = CALL_EXPR_ARG (exp, 0);
10203 tree arg1 = CALL_EXPR_ARG (exp, 1);
10204 tree arg2 = CALL_EXPR_ARG (exp, 2);
10205 rtx scratch0, scratch1;
10206 rtx op0 = expand_normal (arg0);
10207 rtx op1 = expand_normal (arg1);
10208 rtx op2 = expand_normal (arg2);
10209 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
10210
10211 tmode0 = insn_data[d->icode].operand[0].mode;
10212 tmode1 = insn_data[d->icode].operand[1].mode;
10213 modev2 = insn_data[d->icode].operand[2].mode;
10214 modev3 = insn_data[d->icode].operand[3].mode;
10215 modeimm = insn_data[d->icode].operand[4].mode;
10216
10217 if (VECTOR_MODE_P (modev2))
10218 op0 = safe_vector_operand (op0, modev2);
10219 if (VECTOR_MODE_P (modev3))
10220 op1 = safe_vector_operand (op1, modev3);
10221
10222 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
10223 op0 = copy_to_mode_reg (modev2, op0);
10224 if ((optimize && !register_operand (op1, modev3))
10225 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
10226 op1 = copy_to_mode_reg (modev3, op1);
10227
10228 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
10229 {
10230 error ("the third argument must be an 8-bit immediate");
10231 return const0_rtx;
10232 }
10233
10234 if (d->code == IX86_BUILTIN_PCMPISTRI128)
10235 {
10236 if (optimize || !target
10237 || GET_MODE (target) != tmode0
10238 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
10239 target = gen_reg_rtx (tmode0);
10240
10241 scratch1 = gen_reg_rtx (tmode1);
10242
10243 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
10244 }
10245 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
10246 {
10247 if (optimize || !target
10248 || GET_MODE (target) != tmode1
10249 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
10250 target = gen_reg_rtx (tmode1);
10251
10252 scratch0 = gen_reg_rtx (tmode0);
10253
10254 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
10255 }
10256 else
10257 {
10258 gcc_assert (d->flag);
10259
10260 scratch0 = gen_reg_rtx (tmode0);
10261 scratch1 = gen_reg_rtx (tmode1);
10262
10263 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
10264 }
10265
10266 if (! pat)
10267 return 0;
10268
10269 emit_insn (pat);
10270
10271 if (d->flag)
10272 {
10273 target = gen_reg_rtx (SImode);
10274 emit_move_insn (target, const0_rtx);
10275 target = gen_rtx_SUBREG (QImode, target, 0);
10276
10277 emit_insn
10278 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10279 gen_rtx_fmt_ee (EQ, QImode,
10280 gen_rtx_REG ((machine_mode) d->flag,
10281 FLAGS_REG),
10282 const0_rtx)));
10283 return SUBREG_REG (target);
10284 }
10285 else
10286 return target;
10287 }
10288
10289 /* Fix up modeless constants to fit the required mode. */
10290
10291 static rtx
10292 fixup_modeless_constant (rtx x, machine_mode mode)
10293 {
10294 if (GET_MODE (x) == VOIDmode)
10295 x = convert_to_mode (mode, x, 1);
10296 return x;
10297 }
10298
10299 /* Subroutine of ix86_expand_builtin to take care of insns with
10300 variable number of operands. */
10301
10302 static rtx
10303 ix86_expand_args_builtin (const struct builtin_description *d,
10304 tree exp, rtx target)
10305 {
10306 rtx pat, real_target;
10307 unsigned int i, nargs;
10308 unsigned int nargs_constant = 0;
10309 unsigned int mask_pos = 0;
10310 int num_memory = 0;
10311 rtx xops[6];
10312 bool second_arg_count = false;
10313 enum insn_code icode = d->icode;
10314 const struct insn_data_d *insn_p = &insn_data[icode];
10315 machine_mode tmode = insn_p->operand[0].mode;
10316 machine_mode rmode = VOIDmode;
10317 bool swap = false;
10318 enum rtx_code comparison = d->comparison;
10319
10320 switch ((enum ix86_builtin_func_type) d->flag)
10321 {
10322 case V2DF_FTYPE_V2DF_ROUND:
10323 case V4DF_FTYPE_V4DF_ROUND:
10324 case V8DF_FTYPE_V8DF_ROUND:
10325 case V4SF_FTYPE_V4SF_ROUND:
10326 case V8SF_FTYPE_V8SF_ROUND:
10327 case V16SF_FTYPE_V16SF_ROUND:
10328 case V8HF_FTYPE_V8HF_ROUND:
10329 case V16HF_FTYPE_V16HF_ROUND:
10330 case V32HF_FTYPE_V32HF_ROUND:
10331 case V4SI_FTYPE_V4SF_ROUND:
10332 case V8SI_FTYPE_V8SF_ROUND:
10333 case V16SI_FTYPE_V16SF_ROUND:
10334 return ix86_expand_sse_round (d, exp, target);
10335 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10336 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10337 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10338 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10339 case INT_FTYPE_V8SF_V8SF_PTEST:
10340 case INT_FTYPE_V4DI_V4DI_PTEST:
10341 case INT_FTYPE_V4DF_V4DF_PTEST:
10342 case INT_FTYPE_V4SF_V4SF_PTEST:
10343 case INT_FTYPE_V2DI_V2DI_PTEST:
10344 case INT_FTYPE_V2DF_V2DF_PTEST:
10345 return ix86_expand_sse_ptest (d, exp, target);
10346 case FLOAT128_FTYPE_FLOAT128:
10347 case FLOAT_FTYPE_FLOAT:
10348 case INT_FTYPE_INT:
10349 case UINT_FTYPE_UINT:
10350 case UINT16_FTYPE_UINT16:
10351 case UINT64_FTYPE_INT:
10352 case UINT64_FTYPE_UINT64:
10353 case INT64_FTYPE_INT64:
10354 case INT64_FTYPE_V4SF:
10355 case INT64_FTYPE_V2DF:
10356 case INT_FTYPE_V16QI:
10357 case INT_FTYPE_V8QI:
10358 case INT_FTYPE_V8SF:
10359 case INT_FTYPE_V4DF:
10360 case INT_FTYPE_V4SF:
10361 case INT_FTYPE_V2DF:
10362 case INT_FTYPE_V32QI:
10363 case V16QI_FTYPE_V16QI:
10364 case V8SI_FTYPE_V8SF:
10365 case V8SI_FTYPE_V4SI:
10366 case V8HI_FTYPE_V8HI:
10367 case V8HI_FTYPE_V16QI:
10368 case V8QI_FTYPE_V8QI:
10369 case V8SF_FTYPE_V8SF:
10370 case V8SF_FTYPE_V8SI:
10371 case V8SF_FTYPE_V4SF:
10372 case V8SF_FTYPE_V8HI:
10373 case V4SI_FTYPE_V4SI:
10374 case V4SI_FTYPE_V16QI:
10375 case V4SI_FTYPE_V4SF:
10376 case V4SI_FTYPE_V8SI:
10377 case V4SI_FTYPE_V8HI:
10378 case V4SI_FTYPE_V4DF:
10379 case V4SI_FTYPE_V2DF:
10380 case V4HI_FTYPE_V4HI:
10381 case V4DF_FTYPE_V4DF:
10382 case V4DF_FTYPE_V4SI:
10383 case V4DF_FTYPE_V4SF:
10384 case V4DF_FTYPE_V2DF:
10385 case V4SF_FTYPE_V4SF:
10386 case V4SF_FTYPE_V4SI:
10387 case V4SF_FTYPE_V8SF:
10388 case V4SF_FTYPE_V4DF:
10389 case V4SF_FTYPE_V8HI:
10390 case V4SF_FTYPE_V2DF:
10391 case V2DI_FTYPE_V2DI:
10392 case V2DI_FTYPE_V16QI:
10393 case V2DI_FTYPE_V8HI:
10394 case V2DI_FTYPE_V4SI:
10395 case V2DF_FTYPE_V2DF:
10396 case V2DF_FTYPE_V4SI:
10397 case V2DF_FTYPE_V4DF:
10398 case V2DF_FTYPE_V4SF:
10399 case V2DF_FTYPE_V2SI:
10400 case V2SI_FTYPE_V2SI:
10401 case V2SI_FTYPE_V4SF:
10402 case V2SI_FTYPE_V2SF:
10403 case V2SI_FTYPE_V2DF:
10404 case V2SF_FTYPE_V2SF:
10405 case V2SF_FTYPE_V2SI:
10406 case V32QI_FTYPE_V32QI:
10407 case V32QI_FTYPE_V16QI:
10408 case V16HI_FTYPE_V16HI:
10409 case V16HI_FTYPE_V8HI:
10410 case V8SI_FTYPE_V8SI:
10411 case V16HI_FTYPE_V16QI:
10412 case V8SI_FTYPE_V16QI:
10413 case V4DI_FTYPE_V16QI:
10414 case V8SI_FTYPE_V8HI:
10415 case V4DI_FTYPE_V8HI:
10416 case V4DI_FTYPE_V4SI:
10417 case V4DI_FTYPE_V2DI:
10418 case UQI_FTYPE_UQI:
10419 case UHI_FTYPE_UHI:
10420 case USI_FTYPE_USI:
10421 case USI_FTYPE_UQI:
10422 case USI_FTYPE_UHI:
10423 case UDI_FTYPE_UDI:
10424 case UHI_FTYPE_V16QI:
10425 case USI_FTYPE_V32QI:
10426 case UDI_FTYPE_V64QI:
10427 case V16QI_FTYPE_UHI:
10428 case V32QI_FTYPE_USI:
10429 case V64QI_FTYPE_UDI:
10430 case V8HI_FTYPE_UQI:
10431 case V16HI_FTYPE_UHI:
10432 case V32HI_FTYPE_USI:
10433 case V4SI_FTYPE_UQI:
10434 case V8SI_FTYPE_UQI:
10435 case V4SI_FTYPE_UHI:
10436 case V8SI_FTYPE_UHI:
10437 case UQI_FTYPE_V8HI:
10438 case UHI_FTYPE_V16HI:
10439 case USI_FTYPE_V32HI:
10440 case UQI_FTYPE_V4SI:
10441 case UQI_FTYPE_V8SI:
10442 case UHI_FTYPE_V16SI:
10443 case UQI_FTYPE_V2DI:
10444 case UQI_FTYPE_V4DI:
10445 case UQI_FTYPE_V8DI:
10446 case V16SI_FTYPE_UHI:
10447 case V2DI_FTYPE_UQI:
10448 case V4DI_FTYPE_UQI:
10449 case V16SI_FTYPE_INT:
10450 case V16SF_FTYPE_V8SF:
10451 case V16SI_FTYPE_V8SI:
10452 case V16SF_FTYPE_V4SF:
10453 case V16SI_FTYPE_V4SI:
10454 case V16SI_FTYPE_V16SF:
10455 case V16SI_FTYPE_V16SI:
10456 case V64QI_FTYPE_V64QI:
10457 case V32HI_FTYPE_V32HI:
10458 case V16SF_FTYPE_V16SF:
10459 case V8DI_FTYPE_UQI:
10460 case V8DI_FTYPE_V8DI:
10461 case V8DF_FTYPE_V4DF:
10462 case V8DF_FTYPE_V2DF:
10463 case V8DF_FTYPE_V8DF:
10464 case V4DI_FTYPE_V4DI:
10465 case V16BF_FTYPE_V16SF:
10466 case V8BF_FTYPE_V8SF:
10467 case V8BF_FTYPE_V4SF:
10468 nargs = 1;
10469 break;
10470 case V4SF_FTYPE_V4SF_VEC_MERGE:
10471 case V2DF_FTYPE_V2DF_VEC_MERGE:
10472 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10473 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10474 case V16QI_FTYPE_V16QI_V16QI:
10475 case V16QI_FTYPE_V8HI_V8HI:
10476 case V16HF_FTYPE_V16HF_V16HF:
10477 case V16SF_FTYPE_V16SF_V16SF:
10478 case V8QI_FTYPE_V8QI_V8QI:
10479 case V8QI_FTYPE_V4HI_V4HI:
10480 case V8HI_FTYPE_V8HI_V8HI:
10481 case V8HI_FTYPE_V16QI_V16QI:
10482 case V8HI_FTYPE_V4SI_V4SI:
10483 case V8HF_FTYPE_V8HF_V8HF:
10484 case V8SF_FTYPE_V8SF_V8SF:
10485 case V8SF_FTYPE_V8SF_V8SI:
10486 case V8DF_FTYPE_V8DF_V8DF:
10487 case V4SI_FTYPE_V4SI_V4SI:
10488 case V4SI_FTYPE_V8HI_V8HI:
10489 case V4SI_FTYPE_V2DF_V2DF:
10490 case V4HI_FTYPE_V4HI_V4HI:
10491 case V4HI_FTYPE_V8QI_V8QI:
10492 case V4HI_FTYPE_V2SI_V2SI:
10493 case V4DF_FTYPE_V4DF_V4DF:
10494 case V4DF_FTYPE_V4DF_V4DI:
10495 case V4SF_FTYPE_V4SF_V4SF:
10496 case V4SF_FTYPE_V4SF_V4SI:
10497 case V4SF_FTYPE_V4SF_V2SI:
10498 case V4SF_FTYPE_V4SF_V2DF:
10499 case V4SF_FTYPE_V4SF_UINT:
10500 case V4SF_FTYPE_V4SF_DI:
10501 case V4SF_FTYPE_V4SF_SI:
10502 case V2DI_FTYPE_V2DI_V2DI:
10503 case V2DI_FTYPE_V16QI_V16QI:
10504 case V2DI_FTYPE_V4SI_V4SI:
10505 case V2DI_FTYPE_V2DI_V16QI:
10506 case V2SI_FTYPE_V2SI_V2SI:
10507 case V2SI_FTYPE_V4HI_V4HI:
10508 case V2SI_FTYPE_V2SF_V2SF:
10509 case V2DF_FTYPE_V2DF_V2DF:
10510 case V2DF_FTYPE_V2DF_V4SF:
10511 case V2DF_FTYPE_V2DF_V2DI:
10512 case V2DF_FTYPE_V2DF_DI:
10513 case V2DF_FTYPE_V2DF_SI:
10514 case V2DF_FTYPE_V2DF_UINT:
10515 case V2SF_FTYPE_V2SF_V2SF:
10516 case V1DI_FTYPE_V1DI_V1DI:
10517 case V1DI_FTYPE_V8QI_V8QI:
10518 case V1DI_FTYPE_V2SI_V2SI:
10519 case V32QI_FTYPE_V16HI_V16HI:
10520 case V16HI_FTYPE_V8SI_V8SI:
10521 case V64QI_FTYPE_V64QI_V64QI:
10522 case V32QI_FTYPE_V32QI_V32QI:
10523 case V16HI_FTYPE_V32QI_V32QI:
10524 case V16HI_FTYPE_V16HI_V16HI:
10525 case V8SI_FTYPE_V4DF_V4DF:
10526 case V8SI_FTYPE_V8SI_V8SI:
10527 case V8SI_FTYPE_V16HI_V16HI:
10528 case V4DI_FTYPE_V4DI_V4DI:
10529 case V4DI_FTYPE_V8SI_V8SI:
10530 case V4DI_FTYPE_V32QI_V32QI:
10531 case V8DI_FTYPE_V64QI_V64QI:
10532 if (comparison == UNKNOWN)
10533 return ix86_expand_binop_builtin (icode, exp, target);
10534 nargs = 2;
10535 break;
10536 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10537 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10538 gcc_assert (comparison != UNKNOWN);
10539 nargs = 2;
10540 swap = true;
10541 break;
10542 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10543 case V16HI_FTYPE_V16HI_SI_COUNT:
10544 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10545 case V8SI_FTYPE_V8SI_SI_COUNT:
10546 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10547 case V4DI_FTYPE_V4DI_INT_COUNT:
10548 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10549 case V8HI_FTYPE_V8HI_SI_COUNT:
10550 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10551 case V4SI_FTYPE_V4SI_SI_COUNT:
10552 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10553 case V4HI_FTYPE_V4HI_SI_COUNT:
10554 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10555 case V2DI_FTYPE_V2DI_SI_COUNT:
10556 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10557 case V2SI_FTYPE_V2SI_SI_COUNT:
10558 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10559 case V1DI_FTYPE_V1DI_SI_COUNT:
10560 nargs = 2;
10561 second_arg_count = true;
10562 break;
10563 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10564 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10565 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10566 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10567 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10568 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10569 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10570 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10571 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10572 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10573 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10574 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10575 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10576 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10577 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10578 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10579 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10580 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10581 nargs = 4;
10582 second_arg_count = true;
10583 break;
10584 case UINT64_FTYPE_UINT64_UINT64:
10585 case UINT_FTYPE_UINT_UINT:
10586 case UINT_FTYPE_UINT_USHORT:
10587 case UINT_FTYPE_UINT_UCHAR:
10588 case UINT16_FTYPE_UINT16_INT:
10589 case UINT8_FTYPE_UINT8_INT:
10590 case UQI_FTYPE_UQI_UQI:
10591 case UHI_FTYPE_UHI_UHI:
10592 case USI_FTYPE_USI_USI:
10593 case UDI_FTYPE_UDI_UDI:
10594 case V16SI_FTYPE_V8DF_V8DF:
10595 case V32BF_FTYPE_V16SF_V16SF:
10596 case V16BF_FTYPE_V8SF_V8SF:
10597 case V8BF_FTYPE_V4SF_V4SF:
10598 case V16BF_FTYPE_V16SF_UHI:
10599 case V8BF_FTYPE_V8SF_UQI:
10600 case V8BF_FTYPE_V4SF_UQI:
10601 nargs = 2;
10602 break;
10603 case V2DI_FTYPE_V2DI_INT_CONVERT:
10604 nargs = 2;
10605 rmode = V1TImode;
10606 nargs_constant = 1;
10607 break;
10608 case V4DI_FTYPE_V4DI_INT_CONVERT:
10609 nargs = 2;
10610 rmode = V2TImode;
10611 nargs_constant = 1;
10612 break;
10613 case V8DI_FTYPE_V8DI_INT_CONVERT:
10614 nargs = 2;
10615 rmode = V4TImode;
10616 nargs_constant = 1;
10617 break;
10618 case V8HI_FTYPE_V8HI_INT:
10619 case V8HI_FTYPE_V8SF_INT:
10620 case V16HI_FTYPE_V16SF_INT:
10621 case V8HI_FTYPE_V4SF_INT:
10622 case V8SF_FTYPE_V8SF_INT:
10623 case V4SF_FTYPE_V16SF_INT:
10624 case V16SF_FTYPE_V16SF_INT:
10625 case V4SI_FTYPE_V4SI_INT:
10626 case V4SI_FTYPE_V8SI_INT:
10627 case V4HI_FTYPE_V4HI_INT:
10628 case V4DF_FTYPE_V4DF_INT:
10629 case V4DF_FTYPE_V8DF_INT:
10630 case V4SF_FTYPE_V4SF_INT:
10631 case V4SF_FTYPE_V8SF_INT:
10632 case V2DI_FTYPE_V2DI_INT:
10633 case V2DF_FTYPE_V2DF_INT:
10634 case V2DF_FTYPE_V4DF_INT:
10635 case V16HI_FTYPE_V16HI_INT:
10636 case V8SI_FTYPE_V8SI_INT:
10637 case V16SI_FTYPE_V16SI_INT:
10638 case V4SI_FTYPE_V16SI_INT:
10639 case V4DI_FTYPE_V4DI_INT:
10640 case V2DI_FTYPE_V4DI_INT:
10641 case V4DI_FTYPE_V8DI_INT:
10642 case UQI_FTYPE_UQI_UQI_CONST:
10643 case UHI_FTYPE_UHI_UQI:
10644 case USI_FTYPE_USI_UQI:
10645 case UDI_FTYPE_UDI_UQI:
10646 nargs = 2;
10647 nargs_constant = 1;
10648 break;
10649 case V16QI_FTYPE_V16QI_V16QI_V16QI:
10650 case V8SF_FTYPE_V8SF_V8SF_V8SF:
10651 case V4DF_FTYPE_V4DF_V4DF_V4DF:
10652 case V4SF_FTYPE_V4SF_V4SF_V4SF:
10653 case V2DF_FTYPE_V2DF_V2DF_V2DF:
10654 case V32QI_FTYPE_V32QI_V32QI_V32QI:
10655 case UHI_FTYPE_V16SI_V16SI_UHI:
10656 case UQI_FTYPE_V8DI_V8DI_UQI:
10657 case V16HI_FTYPE_V16SI_V16HI_UHI:
10658 case V16QI_FTYPE_V16SI_V16QI_UHI:
10659 case V16QI_FTYPE_V8DI_V16QI_UQI:
10660 case V32HF_FTYPE_V32HF_V32HF_USI:
10661 case V16SF_FTYPE_V16SF_V16SF_UHI:
10662 case V16SF_FTYPE_V4SF_V16SF_UHI:
10663 case V16SI_FTYPE_SI_V16SI_UHI:
10664 case V16SI_FTYPE_V16HI_V16SI_UHI:
10665 case V16SI_FTYPE_V16QI_V16SI_UHI:
10666 case V8SF_FTYPE_V4SF_V8SF_UQI:
10667 case V4DF_FTYPE_V2DF_V4DF_UQI:
10668 case V8SI_FTYPE_V4SI_V8SI_UQI:
10669 case V8SI_FTYPE_SI_V8SI_UQI:
10670 case V4SI_FTYPE_V4SI_V4SI_UQI:
10671 case V4SI_FTYPE_SI_V4SI_UQI:
10672 case V4DI_FTYPE_V2DI_V4DI_UQI:
10673 case V4DI_FTYPE_DI_V4DI_UQI:
10674 case V2DI_FTYPE_V2DI_V2DI_UQI:
10675 case V2DI_FTYPE_DI_V2DI_UQI:
10676 case V64QI_FTYPE_V64QI_V64QI_UDI:
10677 case V64QI_FTYPE_V16QI_V64QI_UDI:
10678 case V64QI_FTYPE_QI_V64QI_UDI:
10679 case V32QI_FTYPE_V32QI_V32QI_USI:
10680 case V32QI_FTYPE_V16QI_V32QI_USI:
10681 case V32QI_FTYPE_QI_V32QI_USI:
10682 case V16QI_FTYPE_V16QI_V16QI_UHI:
10683 case V16QI_FTYPE_QI_V16QI_UHI:
10684 case V32HI_FTYPE_V8HI_V32HI_USI:
10685 case V32HI_FTYPE_HI_V32HI_USI:
10686 case V16HI_FTYPE_V8HI_V16HI_UHI:
10687 case V16HI_FTYPE_HI_V16HI_UHI:
10688 case V8HI_FTYPE_V8HI_V8HI_UQI:
10689 case V8HI_FTYPE_HI_V8HI_UQI:
10690 case V16HF_FTYPE_V16HF_V16HF_UHI:
10691 case V8SF_FTYPE_V8HI_V8SF_UQI:
10692 case V4SF_FTYPE_V8HI_V4SF_UQI:
10693 case V8SI_FTYPE_V8HF_V8SI_UQI:
10694 case V8SF_FTYPE_V8HF_V8SF_UQI:
10695 case V8SI_FTYPE_V8SF_V8SI_UQI:
10696 case V4SI_FTYPE_V4SF_V4SI_UQI:
10697 case V4SI_FTYPE_V8HF_V4SI_UQI:
10698 case V4SF_FTYPE_V8HF_V4SF_UQI:
10699 case V4DI_FTYPE_V8HF_V4DI_UQI:
10700 case V4DI_FTYPE_V4SF_V4DI_UQI:
10701 case V2DI_FTYPE_V8HF_V2DI_UQI:
10702 case V2DI_FTYPE_V4SF_V2DI_UQI:
10703 case V8HF_FTYPE_V8HF_V8HF_UQI:
10704 case V8HF_FTYPE_V8HF_V8HF_V8HF:
10705 case V8HF_FTYPE_V8HI_V8HF_UQI:
10706 case V8HF_FTYPE_V8SI_V8HF_UQI:
10707 case V8HF_FTYPE_V8SF_V8HF_UQI:
10708 case V8HF_FTYPE_V4SI_V8HF_UQI:
10709 case V8HF_FTYPE_V4SF_V8HF_UQI:
10710 case V8HF_FTYPE_V4DI_V8HF_UQI:
10711 case V8HF_FTYPE_V4DF_V8HF_UQI:
10712 case V8HF_FTYPE_V2DI_V8HF_UQI:
10713 case V8HF_FTYPE_V2DF_V8HF_UQI:
10714 case V4SF_FTYPE_V4DI_V4SF_UQI:
10715 case V4SF_FTYPE_V2DI_V4SF_UQI:
10716 case V4DF_FTYPE_V4DI_V4DF_UQI:
10717 case V4DF_FTYPE_V8HF_V4DF_UQI:
10718 case V2DF_FTYPE_V8HF_V2DF_UQI:
10719 case V2DF_FTYPE_V2DI_V2DF_UQI:
10720 case V16QI_FTYPE_V8HI_V16QI_UQI:
10721 case V16QI_FTYPE_V16HI_V16QI_UHI:
10722 case V16QI_FTYPE_V4SI_V16QI_UQI:
10723 case V16QI_FTYPE_V8SI_V16QI_UQI:
10724 case V8HI_FTYPE_V8HF_V8HI_UQI:
10725 case V8HI_FTYPE_V4SI_V8HI_UQI:
10726 case V8HI_FTYPE_V8SI_V8HI_UQI:
10727 case V16QI_FTYPE_V2DI_V16QI_UQI:
10728 case V16QI_FTYPE_V4DI_V16QI_UQI:
10729 case V8HI_FTYPE_V2DI_V8HI_UQI:
10730 case V8HI_FTYPE_V4DI_V8HI_UQI:
10731 case V4SI_FTYPE_V2DI_V4SI_UQI:
10732 case V4SI_FTYPE_V4DI_V4SI_UQI:
10733 case V32QI_FTYPE_V32HI_V32QI_USI:
10734 case UHI_FTYPE_V16QI_V16QI_UHI:
10735 case USI_FTYPE_V32QI_V32QI_USI:
10736 case UDI_FTYPE_V64QI_V64QI_UDI:
10737 case UQI_FTYPE_V8HI_V8HI_UQI:
10738 case UHI_FTYPE_V16HI_V16HI_UHI:
10739 case USI_FTYPE_V32HI_V32HI_USI:
10740 case UQI_FTYPE_V4SI_V4SI_UQI:
10741 case UQI_FTYPE_V8SI_V8SI_UQI:
10742 case UQI_FTYPE_V2DI_V2DI_UQI:
10743 case UQI_FTYPE_V4DI_V4DI_UQI:
10744 case V4SF_FTYPE_V2DF_V4SF_UQI:
10745 case V4SF_FTYPE_V4DF_V4SF_UQI:
10746 case V16SI_FTYPE_V16SI_V16SI_UHI:
10747 case V16SI_FTYPE_V4SI_V16SI_UHI:
10748 case V2DI_FTYPE_V4SI_V2DI_UQI:
10749 case V2DI_FTYPE_V8HI_V2DI_UQI:
10750 case V2DI_FTYPE_V16QI_V2DI_UQI:
10751 case V4DI_FTYPE_V4DI_V4DI_UQI:
10752 case V4DI_FTYPE_V4SI_V4DI_UQI:
10753 case V4DI_FTYPE_V8HI_V4DI_UQI:
10754 case V4DI_FTYPE_V16QI_V4DI_UQI:
10755 case V4DI_FTYPE_V4DF_V4DI_UQI:
10756 case V2DI_FTYPE_V2DF_V2DI_UQI:
10757 case V4SI_FTYPE_V4DF_V4SI_UQI:
10758 case V4SI_FTYPE_V2DF_V4SI_UQI:
10759 case V4SI_FTYPE_V8HI_V4SI_UQI:
10760 case V4SI_FTYPE_V16QI_V4SI_UQI:
10761 case V4DI_FTYPE_V4DI_V4DI_V4DI:
10762 case V8DF_FTYPE_V2DF_V8DF_UQI:
10763 case V8DF_FTYPE_V4DF_V8DF_UQI:
10764 case V8DF_FTYPE_V8DF_V8DF_UQI:
10765 case V8SF_FTYPE_V8SF_V8SF_UQI:
10766 case V8SF_FTYPE_V8SI_V8SF_UQI:
10767 case V4DF_FTYPE_V4DF_V4DF_UQI:
10768 case V4SF_FTYPE_V4SF_V4SF_UQI:
10769 case V2DF_FTYPE_V2DF_V2DF_UQI:
10770 case V2DF_FTYPE_V4SF_V2DF_UQI:
10771 case V2DF_FTYPE_V4SI_V2DF_UQI:
10772 case V4SF_FTYPE_V4SI_V4SF_UQI:
10773 case V4DF_FTYPE_V4SF_V4DF_UQI:
10774 case V4DF_FTYPE_V4SI_V4DF_UQI:
10775 case V8SI_FTYPE_V8SI_V8SI_UQI:
10776 case V8SI_FTYPE_V8HI_V8SI_UQI:
10777 case V8SI_FTYPE_V16QI_V8SI_UQI:
10778 case V8DF_FTYPE_V8SI_V8DF_UQI:
10779 case V8DI_FTYPE_DI_V8DI_UQI:
10780 case V16SF_FTYPE_V8SF_V16SF_UHI:
10781 case V16SI_FTYPE_V8SI_V16SI_UHI:
10782 case V16HF_FTYPE_V16HI_V16HF_UHI:
10783 case V16HF_FTYPE_V16HF_V16HF_V16HF:
10784 case V16HI_FTYPE_V16HF_V16HI_UHI:
10785 case V16HI_FTYPE_V16HI_V16HI_UHI:
10786 case V8HI_FTYPE_V16QI_V8HI_UQI:
10787 case V16HI_FTYPE_V16QI_V16HI_UHI:
10788 case V32HI_FTYPE_V32HI_V32HI_USI:
10789 case V32HI_FTYPE_V32QI_V32HI_USI:
10790 case V8DI_FTYPE_V16QI_V8DI_UQI:
10791 case V8DI_FTYPE_V2DI_V8DI_UQI:
10792 case V8DI_FTYPE_V4DI_V8DI_UQI:
10793 case V8DI_FTYPE_V8DI_V8DI_UQI:
10794 case V8DI_FTYPE_V8HI_V8DI_UQI:
10795 case V8DI_FTYPE_V8SI_V8DI_UQI:
10796 case V8HI_FTYPE_V8DI_V8HI_UQI:
10797 case V8SI_FTYPE_V8DI_V8SI_UQI:
10798 case V4SI_FTYPE_V4SI_V4SI_V4SI:
10799 case V16SI_FTYPE_V16SI_V16SI_V16SI:
10800 case V8DI_FTYPE_V8DI_V8DI_V8DI:
10801 case V32HI_FTYPE_V32HI_V32HI_V32HI:
10802 case V2DI_FTYPE_V2DI_V2DI_V2DI:
10803 case V16HI_FTYPE_V16HI_V16HI_V16HI:
10804 case V8SI_FTYPE_V8SI_V8SI_V8SI:
10805 case V8HI_FTYPE_V8HI_V8HI_V8HI:
10806 case V32BF_FTYPE_V16SF_V16SF_USI:
10807 case V16BF_FTYPE_V8SF_V8SF_UHI:
10808 case V8BF_FTYPE_V4SF_V4SF_UQI:
10809 case V16BF_FTYPE_V16SF_V16BF_UHI:
10810 case V8BF_FTYPE_V8SF_V8BF_UQI:
10811 case V8BF_FTYPE_V4SF_V8BF_UQI:
10812 case V16SF_FTYPE_V16SF_V32BF_V32BF:
10813 case V8SF_FTYPE_V8SF_V16BF_V16BF:
10814 case V4SF_FTYPE_V4SF_V8BF_V8BF:
10815 nargs = 3;
10816 break;
10817 case V32QI_FTYPE_V32QI_V32QI_INT:
10818 case V16HI_FTYPE_V16HI_V16HI_INT:
10819 case V16QI_FTYPE_V16QI_V16QI_INT:
10820 case V4DI_FTYPE_V4DI_V4DI_INT:
10821 case V8HI_FTYPE_V8HI_V8HI_INT:
10822 case V8SI_FTYPE_V8SI_V8SI_INT:
10823 case V8SI_FTYPE_V8SI_V4SI_INT:
10824 case V8SF_FTYPE_V8SF_V8SF_INT:
10825 case V8SF_FTYPE_V8SF_V4SF_INT:
10826 case V4SI_FTYPE_V4SI_V4SI_INT:
10827 case V4DF_FTYPE_V4DF_V4DF_INT:
10828 case V16SF_FTYPE_V16SF_V16SF_INT:
10829 case V16SF_FTYPE_V16SF_V4SF_INT:
10830 case V16SI_FTYPE_V16SI_V4SI_INT:
10831 case V4DF_FTYPE_V4DF_V2DF_INT:
10832 case V4SF_FTYPE_V4SF_V4SF_INT:
10833 case V2DI_FTYPE_V2DI_V2DI_INT:
10834 case V4DI_FTYPE_V4DI_V2DI_INT:
10835 case V2DF_FTYPE_V2DF_V2DF_INT:
10836 case UQI_FTYPE_V8DI_V8UDI_INT:
10837 case UQI_FTYPE_V8DF_V8DF_INT:
10838 case UQI_FTYPE_V2DF_V2DF_INT:
10839 case UQI_FTYPE_V4SF_V4SF_INT:
10840 case UHI_FTYPE_V16SI_V16SI_INT:
10841 case UHI_FTYPE_V16SF_V16SF_INT:
10842 case V64QI_FTYPE_V64QI_V64QI_INT:
10843 case V32HI_FTYPE_V32HI_V32HI_INT:
10844 case V16SI_FTYPE_V16SI_V16SI_INT:
10845 case V8DI_FTYPE_V8DI_V8DI_INT:
10846 nargs = 3;
10847 nargs_constant = 1;
10848 break;
10849 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
10850 nargs = 3;
10851 rmode = V4DImode;
10852 nargs_constant = 1;
10853 break;
10854 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
10855 nargs = 3;
10856 rmode = V2DImode;
10857 nargs_constant = 1;
10858 break;
10859 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
10860 nargs = 3;
10861 rmode = DImode;
10862 nargs_constant = 1;
10863 break;
10864 case V2DI_FTYPE_V2DI_UINT_UINT:
10865 nargs = 3;
10866 nargs_constant = 2;
10867 break;
10868 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
10869 nargs = 3;
10870 rmode = V8DImode;
10871 nargs_constant = 1;
10872 break;
10873 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
10874 nargs = 5;
10875 rmode = V8DImode;
10876 mask_pos = 2;
10877 nargs_constant = 1;
10878 break;
10879 case QI_FTYPE_V8DF_INT_UQI:
10880 case QI_FTYPE_V4DF_INT_UQI:
10881 case QI_FTYPE_V2DF_INT_UQI:
10882 case HI_FTYPE_V16SF_INT_UHI:
10883 case QI_FTYPE_V8SF_INT_UQI:
10884 case QI_FTYPE_V4SF_INT_UQI:
10885 case QI_FTYPE_V8HF_INT_UQI:
10886 case HI_FTYPE_V16HF_INT_UHI:
10887 case SI_FTYPE_V32HF_INT_USI:
10888 case V4SI_FTYPE_V4SI_V4SI_UHI:
10889 case V8SI_FTYPE_V8SI_V8SI_UHI:
10890 nargs = 3;
10891 mask_pos = 1;
10892 nargs_constant = 1;
10893 break;
10894 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
10895 nargs = 5;
10896 rmode = V4DImode;
10897 mask_pos = 2;
10898 nargs_constant = 1;
10899 break;
10900 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
10901 nargs = 5;
10902 rmode = V2DImode;
10903 mask_pos = 2;
10904 nargs_constant = 1;
10905 break;
10906 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
10907 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
10908 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
10909 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
10910 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
10911 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
10912 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
10913 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
10914 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
10915 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
10916 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
10917 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
10918 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
10919 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
10920 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
10921 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
10922 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
10923 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
10924 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
10925 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
10926 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
10927 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
10928 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
10929 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
10930 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
10931 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
10932 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
10933 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
10934 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
10935 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
10936 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
10937 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
10938 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
10939 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
10940 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
10941 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
10942 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
10943 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
10944 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
10945 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
10946 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
10947 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
10948 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
10949 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
10950 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
10951 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
10952 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
10953 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
10954 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
10955 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
10956 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
10957 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
10958 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
10959 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
10960 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
10961 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
10962 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
10963 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
10964 nargs = 4;
10965 break;
10966 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
10967 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
10968 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
10969 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
10970 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
10971 nargs = 4;
10972 nargs_constant = 1;
10973 break;
10974 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
10975 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
10976 case QI_FTYPE_V4DF_V4DF_INT_UQI:
10977 case QI_FTYPE_V8SF_V8SF_INT_UQI:
10978 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
10979 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
10980 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
10981 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
10982 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
10983 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
10984 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
10985 case USI_FTYPE_V32QI_V32QI_INT_USI:
10986 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
10987 case USI_FTYPE_V32HI_V32HI_INT_USI:
10988 case USI_FTYPE_V32HF_V32HF_INT_USI:
10989 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
10990 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
10991 nargs = 4;
10992 mask_pos = 1;
10993 nargs_constant = 1;
10994 break;
10995 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
10996 nargs = 4;
10997 nargs_constant = 2;
10998 break;
10999 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
11000 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
11001 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
11002 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
11003 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
11004 nargs = 4;
11005 break;
11006 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
11007 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
11008 mask_pos = 1;
11009 nargs = 4;
11010 nargs_constant = 1;
11011 break;
11012 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
11013 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
11014 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
11015 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
11016 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
11017 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
11018 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
11019 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
11020 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
11021 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
11022 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
11023 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
11024 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
11025 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
11026 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
11027 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
11028 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
11029 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
11030 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
11031 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
11032 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
11033 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
11034 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
11035 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
11036 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
11037 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
11038 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
11039 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
11040 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
11041 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
11042 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
11043 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
11044 nargs = 4;
11045 mask_pos = 2;
11046 nargs_constant = 1;
11047 break;
11048 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
11049 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
11050 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
11051 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
11052 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
11053 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
11054 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
11055 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
11056 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
11057 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
11058 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
11059 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
11060 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
11061 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
11062 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
11063 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
11064 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
11065 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
11066 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
11067 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
11068 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
11069 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
11070 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
11071 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
11072 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
11073 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
11074 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
11075 nargs = 5;
11076 mask_pos = 2;
11077 nargs_constant = 1;
11078 break;
11079 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
11080 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
11081 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
11082 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
11083 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
11084 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
11085 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
11086 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
11087 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
11088 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
11089 nargs = 5;
11090 mask_pos = 1;
11091 nargs_constant = 1;
11092 break;
11093 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
11094 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
11095 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
11096 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
11097 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
11098 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
11099 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
11100 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
11101 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
11102 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
11103 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
11104 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
11105 nargs = 5;
11106 mask_pos = 1;
11107 nargs_constant = 2;
11108 break;
11109
11110 default:
11111 gcc_unreachable ();
11112 }
11113
11114 gcc_assert (nargs <= ARRAY_SIZE (xops));
11115
11116 if (comparison != UNKNOWN)
11117 {
11118 gcc_assert (nargs == 2);
11119 return ix86_expand_sse_compare (d, exp, target, swap);
11120 }
11121
11122 if (rmode == VOIDmode || rmode == tmode)
11123 {
11124 if (optimize
11125 || target == 0
11126 || GET_MODE (target) != tmode
11127 || !insn_p->operand[0].predicate (target, tmode))
11128 target = gen_reg_rtx (tmode);
11129 else if (memory_operand (target, tmode))
11130 num_memory++;
11131 real_target = target;
11132 }
11133 else
11134 {
11135 real_target = gen_reg_rtx (tmode);
11136 target = lowpart_subreg (rmode, real_target, tmode);
11137 }
11138
11139 for (i = 0; i < nargs; i++)
11140 {
11141 tree arg = CALL_EXPR_ARG (exp, i);
11142 rtx op = expand_normal (arg);
11143 machine_mode mode = insn_p->operand[i + 1].mode;
11144 bool match = insn_p->operand[i + 1].predicate (op, mode);
11145
11146 if (second_arg_count && i == 1)
11147 {
11148 /* SIMD shift insns take either an 8-bit immediate or a
11149 register as the count, but the builtin functions take an
11150 int.  If the count doesn't match, put it in a register.
11151 The instructions use a 64-bit count; if OP is only
11152 32-bit, zero-extend it, since negative shift counts
11153 are undefined behavior and zero-extension is more
11154 efficient. */
11155 if (!match)
11156 {
11157 if (SCALAR_INT_MODE_P (GET_MODE (op)))
11158 op = convert_modes (mode, GET_MODE (op), op, 1);
11159 else
11160 op = lowpart_subreg (mode, op, GET_MODE (op));
11161 if (!insn_p->operand[i + 1].predicate (op, mode))
11162 op = copy_to_reg (op);
11163 }
11164 }
11165 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
11166 || (!mask_pos && (nargs - i) <= nargs_constant))
11167 {
11168 if (!match)
11169 switch (icode)
11170 {
11171 case CODE_FOR_avx_vinsertf128v4di:
11172 case CODE_FOR_avx_vextractf128v4di:
11173 error ("the last argument must be a 1-bit immediate");
11174 return const0_rtx;
11175
11176 case CODE_FOR_avx512f_cmpv8di3_mask:
11177 case CODE_FOR_avx512f_cmpv16si3_mask:
11178 case CODE_FOR_avx512f_ucmpv8di3_mask:
11179 case CODE_FOR_avx512f_ucmpv16si3_mask:
11180 case CODE_FOR_avx512vl_cmpv4di3_mask:
11181 case CODE_FOR_avx512vl_cmpv8si3_mask:
11182 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11183 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11184 case CODE_FOR_avx512vl_cmpv2di3_mask:
11185 case CODE_FOR_avx512vl_cmpv4si3_mask:
11186 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11187 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11188 error ("the last argument must be a 3-bit immediate");
11189 return const0_rtx;
11190
11191 case CODE_FOR_sse4_1_roundsd:
11192 case CODE_FOR_sse4_1_roundss:
11193
11194 case CODE_FOR_sse4_1_roundpd:
11195 case CODE_FOR_sse4_1_roundps:
11196 case CODE_FOR_avx_roundpd256:
11197 case CODE_FOR_avx_roundps256:
11198
11199 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11200 case CODE_FOR_sse4_1_roundps_sfix:
11201 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11202 case CODE_FOR_avx_roundps_sfix256:
11203
11204 case CODE_FOR_sse4_1_blendps:
11205 case CODE_FOR_avx_blendpd256:
11206 case CODE_FOR_avx_vpermilv4df:
11207 case CODE_FOR_avx_vpermilv4df_mask:
11208 case CODE_FOR_avx512f_getmantv8df_mask:
11209 case CODE_FOR_avx512f_getmantv16sf_mask:
11210 case CODE_FOR_avx512vl_getmantv16hf_mask:
11211 case CODE_FOR_avx512vl_getmantv8sf_mask:
11212 case CODE_FOR_avx512vl_getmantv4df_mask:
11213 case CODE_FOR_avx512fp16_getmantv8hf_mask:
11214 case CODE_FOR_avx512vl_getmantv4sf_mask:
11215 case CODE_FOR_avx512vl_getmantv2df_mask:
11216 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11217 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11218 case CODE_FOR_avx512dq_rangepv4df_mask:
11219 case CODE_FOR_avx512dq_rangepv8sf_mask:
11220 case CODE_FOR_avx512dq_rangepv2df_mask:
11221 case CODE_FOR_avx512dq_rangepv4sf_mask:
11222 case CODE_FOR_avx_shufpd256_mask:
11223 error ("the last argument must be a 4-bit immediate");
11224 return const0_rtx;
11225
11226 case CODE_FOR_sha1rnds4:
11227 case CODE_FOR_sse4_1_blendpd:
11228 case CODE_FOR_avx_vpermilv2df:
11229 case CODE_FOR_avx_vpermilv2df_mask:
11230 case CODE_FOR_xop_vpermil2v2df3:
11231 case CODE_FOR_xop_vpermil2v4sf3:
11232 case CODE_FOR_xop_vpermil2v4df3:
11233 case CODE_FOR_xop_vpermil2v8sf3:
11234 case CODE_FOR_avx512f_vinsertf32x4_mask:
11235 case CODE_FOR_avx512f_vinserti32x4_mask:
11236 case CODE_FOR_avx512f_vextractf32x4_mask:
11237 case CODE_FOR_avx512f_vextracti32x4_mask:
11238 case CODE_FOR_sse2_shufpd:
11239 case CODE_FOR_sse2_shufpd_mask:
11240 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11241 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11242 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11243 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11244 error ("the last argument must be a 2-bit immediate");
11245 return const0_rtx;
11246
11247 case CODE_FOR_avx_vextractf128v4df:
11248 case CODE_FOR_avx_vextractf128v8sf:
11249 case CODE_FOR_avx_vextractf128v8si:
11250 case CODE_FOR_avx_vinsertf128v4df:
11251 case CODE_FOR_avx_vinsertf128v8sf:
11252 case CODE_FOR_avx_vinsertf128v8si:
11253 case CODE_FOR_avx512f_vinsertf64x4_mask:
11254 case CODE_FOR_avx512f_vinserti64x4_mask:
11255 case CODE_FOR_avx512f_vextractf64x4_mask:
11256 case CODE_FOR_avx512f_vextracti64x4_mask:
11257 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11258 case CODE_FOR_avx512dq_vinserti32x8_mask:
11259 case CODE_FOR_avx512vl_vinsertv4df:
11260 case CODE_FOR_avx512vl_vinsertv4di:
11261 case CODE_FOR_avx512vl_vinsertv8sf:
11262 case CODE_FOR_avx512vl_vinsertv8si:
11263 error ("the last argument must be a 1-bit immediate");
11264 return const0_rtx;
11265
11266 case CODE_FOR_avx_vmcmpv2df3:
11267 case CODE_FOR_avx_vmcmpv4sf3:
11268 case CODE_FOR_avx_cmpv2df3:
11269 case CODE_FOR_avx_cmpv4sf3:
11270 case CODE_FOR_avx_cmpv4df3:
11271 case CODE_FOR_avx_cmpv8sf3:
11272 case CODE_FOR_avx512f_cmpv8df3_mask:
11273 case CODE_FOR_avx512f_cmpv16sf3_mask:
11274 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11275 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11276 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11277 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11278 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11279 error ("the last argument must be a 5-bit immediate");
11280 return const0_rtx;
11281
11282 default:
11283 switch (nargs_constant)
11284 {
11285 case 2:
11286 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11287 (!mask_pos && (nargs - i) == nargs_constant))
11288 {
11289 error ("the next to last argument must be an 8-bit immediate");
11290 break;
11291 }
11292 /* FALLTHRU */
11293 case 1:
11294 error ("the last argument must be an 8-bit immediate");
11295 break;
11296 default:
11297 gcc_unreachable ();
11298 }
11299 return const0_rtx;
11300 }
11301 }
11302 else
11303 {
11304 if (VECTOR_MODE_P (mode))
11305 op = safe_vector_operand (op, mode);
11306
11307 /* If we aren't optimizing, only allow one memory operand to
11308 be generated. */
11309 if (memory_operand (op, mode))
11310 num_memory++;
11311
11312 op = fixup_modeless_constant (op, mode);
11313
11314 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11315 {
11316 if (optimize || !match || num_memory > 1)
11317 op = copy_to_mode_reg (mode, op);
11318 }
11319 else
11320 {
11321 op = copy_to_reg (op);
11322 op = lowpart_subreg (mode, op, GET_MODE (op));
11323 }
11324 }
11325
11326 xops[i] = op;
11327 }
11328
11329 switch (nargs)
11330 {
11331 case 1:
11332 pat = GEN_FCN (icode) (real_target, xops[0]);
11333 break;
11334 case 2:
11335 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11336 break;
11337 case 3:
11338 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11339 break;
11340 case 4:
11341 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11342 xops[2], xops[3]);
11343 break;
11344 case 5:
11345 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11346 xops[2], xops[3], xops[4]);
11347 break;
11348 case 6:
11349 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11350 xops[2], xops[3], xops[4], xops[5]);
11351 break;
11352 default:
11353 gcc_unreachable ();
11354 }
11355
11356 if (! pat)
11357 return 0;
11358
11359 emit_insn (pat);
11360 return target;
11361 }
11362
11363 /* Transform a pattern of the following layout:
11364 (set A
11365 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
11366 into:
11367 (set A B)
11368 i.e. drop the embedded-rounding operand C. */
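/* A sketch of the transformation (editor's illustration; the register
   numbers and the inner operation are made up):
     (set (reg:V2DF 100)
          (unspec [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
                   (const_int 4)] UNSPEC_EMBEDDED_ROUNDING))
   is rewritten to the plain
     (set (reg:V2DF 100) (plus:V2DF (reg:V2DF 101) (reg:V2DF 102)))
   by taking element 0 of the unspec vector as the new SET_SRC.  */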
11369
11370 static rtx
11371 ix86_erase_embedded_rounding (rtx pat)
11372 {
11373 if (GET_CODE (pat) == INSN)
11374 pat = PATTERN (pat);
11375
11376 gcc_assert (GET_CODE (pat) == SET);
11377 rtx src = SET_SRC (pat);
11378 gcc_assert (XVECLEN (src, 0) == 2);
11379 rtx p0 = XVECEXP (src, 0, 0);
11380 gcc_assert (GET_CODE (src) == UNSPEC
11381 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11382 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11383 return res;
11384 }
11385
11386 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
11387 with rounding. */
11388 static rtx
11389 ix86_expand_sse_comi_round (const struct builtin_description *d,
11390 tree exp, rtx target)
11391 {
11392 rtx pat, set_dst;
11393 tree arg0 = CALL_EXPR_ARG (exp, 0);
11394 tree arg1 = CALL_EXPR_ARG (exp, 1);
11395 tree arg2 = CALL_EXPR_ARG (exp, 2);
11396 tree arg3 = CALL_EXPR_ARG (exp, 3);
11397 rtx op0 = expand_normal (arg0);
11398 rtx op1 = expand_normal (arg1);
11399 rtx op2 = expand_normal (arg2);
11400 rtx op3 = expand_normal (arg3);
11401 enum insn_code icode = d->icode;
11402 const struct insn_data_d *insn_p = &insn_data[icode];
11403 machine_mode mode0 = insn_p->operand[0].mode;
11404 machine_mode mode1 = insn_p->operand[1].mode;
11405
11406 /* See avxintrin.h for values. */
11407 static const enum rtx_code comparisons[32] =
11408 {
11409 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11410 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
11411 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
11412 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
11413 };
11414 static const bool ordereds[32] =
11415 {
11416 true, true, true, false, false, false, false, true,
11417 false, false, false, true, true, true, true, false,
11418 true, true, true, false, false, false, false, true,
11419 false, false, false, true, true, true, true, false
11420 };
11421 static const bool non_signalings[32] =
11422 {
11423 true, false, false, true, true, false, false, true,
11424 true, false, false, true, true, false, false, true,
11425 false, true, true, false, false, true, true, false,
11426 false, true, true, false, false, true, true, false
11427 };
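/* Editor's note (illustrative): the three tables above decode the
   avxintrin.h _CMP_* predicate value in parallel.  For instance,
   index 0x01 (_CMP_LT_OS) yields LT / ordered / signaling, while
   index 0x03 (_CMP_UNORD_Q) yields UNORDERED / unordered / quiet
   (non-signaling).  */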
11428
11429 if (!CONST_INT_P (op2))
11430 {
11431 error ("the third argument must be a comparison constant");
11432 return const0_rtx;
11433 }
11434 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
11435 {
11436 error ("incorrect comparison mode");
11437 return const0_rtx;
11438 }
11439
11440 if (!insn_p->operand[2].predicate (op3, SImode))
11441 {
11442 error ("incorrect rounding operand");
11443 return const0_rtx;
11444 }
11445
11446 if (VECTOR_MODE_P (mode0))
11447 op0 = safe_vector_operand (op0, mode0);
11448 if (VECTOR_MODE_P (mode1))
11449 op1 = safe_vector_operand (op1, mode1);
11450
11451 enum rtx_code comparison = comparisons[INTVAL (op2)];
11452 bool ordered = ordereds[INTVAL (op2)];
11453 bool non_signaling = non_signalings[INTVAL (op2)];
11454 rtx const_val = const0_rtx;
11455
11456 bool check_unordered = false;
11457 machine_mode mode = CCFPmode;
11458 switch (comparison)
11459 {
11460 case ORDERED:
11461 if (!ordered)
11462 {
11463 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11464 if (!non_signaling)
11465 ordered = true;
11466 mode = CCSmode;
11467 }
11468 else
11469 {
11470 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11471 if (non_signaling)
11472 ordered = false;
11473 mode = CCPmode;
11474 }
11475 comparison = NE;
11476 break;
11477 case UNORDERED:
11478 if (ordered)
11479 {
11480 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11481 if (non_signaling)
11482 ordered = false;
11483 mode = CCSmode;
11484 }
11485 else
11486 {
11487 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11488 if (!non_signaling)
11489 ordered = true;
11490 mode = CCPmode;
11491 }
11492 comparison = EQ;
11493 break;
11494
11495 case LE: /* -> GE */
11496 case LT: /* -> GT */
11497 case UNGE: /* -> UNLE */
11498 case UNGT: /* -> UNLT */
11499 std::swap (op0, op1);
11500 comparison = swap_condition (comparison);
11501 /* FALLTHRU */
11502 case GT:
11503 case GE:
11504 case UNEQ:
11505 case UNLT:
11506 case UNLE:
11507 case LTGT:
11508 /* These are supported by CCFPmode. NB: Use ordered/signaling
11509 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11510 with NAN operands. */
11511 if (ordered == non_signaling)
11512 ordered = !ordered;
11513 break;
11514 case EQ:
11515 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11516 _CMP_EQ_OQ/_CMP_EQ_OS. */
11517 check_unordered = true;
11518 mode = CCZmode;
11519 break;
11520 case NE:
11521 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11522 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11523 gcc_assert (!ordered);
11524 check_unordered = true;
11525 mode = CCZmode;
11526 const_val = const1_rtx;
11527 break;
11528 default:
11529 gcc_unreachable ();
11530 }
11531
11532 target = gen_reg_rtx (SImode);
11533 emit_move_insn (target, const_val);
11534 target = gen_rtx_SUBREG (QImode, target, 0);
11535
11536 if ((optimize && !register_operand (op0, mode0))
11537 || !insn_p->operand[0].predicate (op0, mode0))
11538 op0 = copy_to_mode_reg (mode0, op0);
11539 if ((optimize && !register_operand (op1, mode1))
11540 || !insn_p->operand[1].predicate (op1, mode1))
11541 op1 = copy_to_mode_reg (mode1, op1);
11542
11543 /*
11544 1. COMI: ordered and signaling.
11545 2. UCOMI: unordered and non-signaling.
11546 */
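/* Editor's illustration (assuming the usual avx512fintrin.h wrapper):
     _mm_comi_round_ss (a, b, _CMP_LT_OS, _MM_FROUND_NO_EXC)
   keeps the signaling COMI pattern, while a quiet predicate such as
   _CMP_LT_OQ is non-signaling and is switched to the UCOMI pattern by
   the icode adjustment just below.  */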
11547 if (non_signaling)
11548 icode = (icode == CODE_FOR_sse_comi_round
11549 ? CODE_FOR_sse_ucomi_round
11550 : CODE_FOR_sse2_ucomi_round);
11551
11552 pat = GEN_FCN (icode) (op0, op1, op3);
11553 if (! pat)
11554 return 0;
11555
11556 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11557 if (INTVAL (op3) == NO_ROUND)
11558 {
11559 pat = ix86_erase_embedded_rounding (pat);
11560 if (! pat)
11561 return 0;
11562
11563 set_dst = SET_DEST (pat);
11564 }
11565 else
11566 {
11567 gcc_assert (GET_CODE (pat) == SET);
11568 set_dst = SET_DEST (pat);
11569 }
11570
11571 emit_insn (pat);
11572
11573 return ix86_ssecom_setcc (comparison, check_unordered, mode,
11574 set_dst, target);
11575 }
11576
11577 static rtx
11578 ix86_expand_round_builtin (const struct builtin_description *d,
11579 tree exp, rtx target)
11580 {
11581 rtx pat;
11582 unsigned int i, nargs;
11583 rtx xops[6];
11584 enum insn_code icode = d->icode;
11585 const struct insn_data_d *insn_p = &insn_data[icode];
11586 machine_mode tmode = insn_p->operand[0].mode;
11587 unsigned int nargs_constant = 0;
11588 unsigned int redundant_embed_rnd = 0;
11589
11590 switch ((enum ix86_builtin_func_type) d->flag)
11591 {
11592 case UINT64_FTYPE_V2DF_INT:
11593 case UINT64_FTYPE_V4SF_INT:
11594 case UINT64_FTYPE_V8HF_INT:
11595 case UINT_FTYPE_V2DF_INT:
11596 case UINT_FTYPE_V4SF_INT:
11597 case UINT_FTYPE_V8HF_INT:
11598 case INT64_FTYPE_V2DF_INT:
11599 case INT64_FTYPE_V4SF_INT:
11600 case INT64_FTYPE_V8HF_INT:
11601 case INT_FTYPE_V2DF_INT:
11602 case INT_FTYPE_V4SF_INT:
11603 case INT_FTYPE_V8HF_INT:
11604 nargs = 2;
11605 break;
11606 case V32HF_FTYPE_V32HF_V32HF_INT:
11607 case V8HF_FTYPE_V8HF_V8HF_INT:
11608 case V8HF_FTYPE_V8HF_INT_INT:
11609 case V8HF_FTYPE_V8HF_UINT_INT:
11610 case V8HF_FTYPE_V8HF_INT64_INT:
11611 case V8HF_FTYPE_V8HF_UINT64_INT:
11612 case V4SF_FTYPE_V4SF_UINT_INT:
11613 case V4SF_FTYPE_V4SF_UINT64_INT:
11614 case V2DF_FTYPE_V2DF_UINT64_INT:
11615 case V4SF_FTYPE_V4SF_INT_INT:
11616 case V4SF_FTYPE_V4SF_INT64_INT:
11617 case V2DF_FTYPE_V2DF_INT64_INT:
11618 case V4SF_FTYPE_V4SF_V4SF_INT:
11619 case V2DF_FTYPE_V2DF_V2DF_INT:
11620 case V4SF_FTYPE_V4SF_V2DF_INT:
11621 case V2DF_FTYPE_V2DF_V4SF_INT:
11622 nargs = 3;
11623 break;
11624 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
11625 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
11626 case V32HI_FTYPE_V32HF_V32HI_USI_INT:
11627 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
11628 case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
11629 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
11630 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
11631 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
11632 case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
11633 case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
11634 case V32HF_FTYPE_V32HI_V32HF_USI_INT:
11635 case V32HF_FTYPE_V32HF_V32HF_USI_INT:
11636 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
11637 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
11638 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
11639 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
11640 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
11641 case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
11642 case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
11643 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
11644 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
11645 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
11646 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
11647 case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
11648 case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
11649 case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
11650 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
11651 nargs = 4;
11652 break;
11653 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
11654 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
11655 nargs_constant = 2;
11656 nargs = 4;
11657 break;
11658 case INT_FTYPE_V4SF_V4SF_INT_INT:
11659 case INT_FTYPE_V2DF_V2DF_INT_INT:
11660 return ix86_expand_sse_comi_round (d, exp, target);
11661 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
11662 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
11663 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
11664 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
11665 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
11666 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
11667 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
11668 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
11669 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
11670 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
11671 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
11672 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
11673 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
11674 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
11675 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
11676 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
11677 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
11678 nargs = 5;
11679 break;
11680 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
11681 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
11682 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
11683 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
11684 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
11685 nargs_constant = 4;
11686 nargs = 5;
11687 break;
11688 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
11689 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
11690 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
11691 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
11692 case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
11693 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
11694 nargs_constant = 3;
11695 nargs = 5;
11696 break;
11697 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
11698 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
11699 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
11700 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
11701 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
11702 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
11703 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
11704 nargs = 6;
11705 nargs_constant = 4;
11706 break;
11707 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
11708 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
11709 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
11710 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
11711 nargs = 6;
11712 nargs_constant = 3;
11713 break;
11714 default:
11715 gcc_unreachable ();
11716 }
11717 gcc_assert (nargs <= ARRAY_SIZE (xops));
11718
11719 if (optimize
11720 || target == 0
11721 || GET_MODE (target) != tmode
11722 || !insn_p->operand[0].predicate (target, tmode))
11723 target = gen_reg_rtx (tmode);
11724
11725 for (i = 0; i < nargs; i++)
11726 {
11727 tree arg = CALL_EXPR_ARG (exp, i);
11728 rtx op = expand_normal (arg);
11729 machine_mode mode = insn_p->operand[i + 1].mode;
11730 bool match = insn_p->operand[i + 1].predicate (op, mode);
11731
11732 if (i == nargs - nargs_constant)
11733 {
11734 if (!match)
11735 {
11736 switch (icode)
11737 {
11738 case CODE_FOR_avx512f_getmantv8df_mask_round:
11739 case CODE_FOR_avx512f_getmantv16sf_mask_round:
11740 case CODE_FOR_avx512bw_getmantv32hf_mask_round:
11741 case CODE_FOR_avx512f_vgetmantv2df_round:
11742 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
11743 case CODE_FOR_avx512f_vgetmantv4sf_round:
11744 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
11745 case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
11746 error ("the immediate argument must be a 4-bit immediate");
11747 return const0_rtx;
11748 case CODE_FOR_avx512f_cmpv8df3_mask_round:
11749 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
11750 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
11751 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
11752 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
11753 case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
11754 error ("the immediate argument must be a 5-bit immediate");
11755 return const0_rtx;
11756 default:
11757 error ("the immediate argument must be an 8-bit immediate");
11758 return const0_rtx;
11759 }
11760 }
11761 }
11762 else if (i == nargs-1)
11763 {
11764 if (!insn_p->operand[nargs].predicate (op, SImode))
11765 {
11766 error ("incorrect rounding operand");
11767 return const0_rtx;
11768 }
11769
11770 /* If there is no rounding, use the normal version of the pattern. */
11771 if (INTVAL (op) == NO_ROUND)
11772 {
11773 /* Skip erasing embedded rounding for the expanders below, which
11774 generate multiple insns.  In ix86_erase_embedded_rounding
11775 the pattern would be transformed into a single set, and emit_insn
11776 appends that set instead of inserting it into the chain, so the
11777 insns emitted inside the define_expand would be ignored. */
11778 switch (icode)
11779 {
11780 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
11781 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
11782 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
11783 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
11784 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
11785 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
11786 redundant_embed_rnd = 0;
11787 break;
11788 default:
11789 redundant_embed_rnd = 1;
11790 break;
11791 }
11792 }
11793 }
11794 else
11795 {
11796 if (VECTOR_MODE_P (mode))
11797 op = safe_vector_operand (op, mode);
11798
11799 op = fixup_modeless_constant (op, mode);
11800
11801 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11802 {
11803 if (optimize || !match)
11804 op = copy_to_mode_reg (mode, op);
11805 }
11806 else
11807 {
11808 op = copy_to_reg (op);
11809 op = lowpart_subreg (mode, op, GET_MODE (op));
11810 }
11811 }
11812
11813 xops[i] = op;
11814 }
11815
11816 switch (nargs)
11817 {
11818 case 1:
11819 pat = GEN_FCN (icode) (target, xops[0]);
11820 break;
11821 case 2:
11822 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
11823 break;
11824 case 3:
11825 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
11826 break;
11827 case 4:
11828 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11829 xops[2], xops[3]);
11830 break;
11831 case 5:
11832 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11833 xops[2], xops[3], xops[4]);
11834 break;
11835 case 6:
11836 pat = GEN_FCN (icode) (target, xops[0], xops[1],
11837 xops[2], xops[3], xops[4], xops[5]);
11838 break;
11839 default:
11840 gcc_unreachable ();
11841 }
11842
11843 if (!pat)
11844 return 0;
11845
11846 if (redundant_embed_rnd)
11847 pat = ix86_erase_embedded_rounding (pat);
11848
11849 emit_insn (pat);
11850 return target;
11851 }
11852
11853 /* Subroutine of ix86_expand_builtin to take care of special insns
11854 with variable number of operands. */
11855
11856 static rtx
11857 ix86_expand_special_args_builtin (const struct builtin_description *d,
11858 tree exp, rtx target)
11859 {
11860 tree arg;
11861 rtx pat, op;
11862 unsigned int i, nargs, arg_adjust, memory;
11863 unsigned int constant = 100;
11864 bool aligned_mem = false;
11865 rtx xops[4];
11866 enum insn_code icode = d->icode;
11867 const struct insn_data_d *insn_p = &insn_data[icode];
11868 machine_mode tmode = insn_p->operand[0].mode;
11869 enum { load, store } klass;
11870
11871 switch ((enum ix86_builtin_func_type) d->flag)
11872 {
11873 case VOID_FTYPE_VOID:
11874 emit_insn (GEN_FCN (icode) (target));
11875 return 0;
11876 case VOID_FTYPE_UINT64:
11877 case VOID_FTYPE_UNSIGNED:
11878 nargs = 0;
11879 klass = store;
11880 memory = 0;
11881 break;
11882
11883 case INT_FTYPE_VOID:
11884 case USHORT_FTYPE_VOID:
11885 case UINT64_FTYPE_VOID:
11886 case UINT_FTYPE_VOID:
11887 case UINT8_FTYPE_VOID:
11888 case UNSIGNED_FTYPE_VOID:
11889 nargs = 0;
11890 klass = load;
11891 memory = 0;
11892 break;
11893 case UINT64_FTYPE_PUNSIGNED:
11894 case V2DI_FTYPE_PV2DI:
11895 case V4DI_FTYPE_PV4DI:
11896 case V32QI_FTYPE_PCCHAR:
11897 case V16QI_FTYPE_PCCHAR:
11898 case V8SF_FTYPE_PCV4SF:
11899 case V8SF_FTYPE_PCFLOAT:
11900 case V4SF_FTYPE_PCFLOAT:
11901 case V4SF_FTYPE_PCFLOAT16:
11902 case V4SF_FTYPE_PCBFLOAT16:
11903 case V4SF_FTYPE_PCV8BF:
11904 case V4SF_FTYPE_PCV8HF:
11905 case V8SF_FTYPE_PCFLOAT16:
11906 case V8SF_FTYPE_PCBFLOAT16:
11907 case V8SF_FTYPE_PCV16HF:
11908 case V8SF_FTYPE_PCV16BF:
11909 case V4DF_FTYPE_PCV2DF:
11910 case V4DF_FTYPE_PCDOUBLE:
11911 case V2DF_FTYPE_PCDOUBLE:
11912 case VOID_FTYPE_PVOID:
11913 case V8DI_FTYPE_PV8DI:
11914 nargs = 1;
11915 klass = load;
11916 memory = 0;
11917 switch (icode)
11918 {
11919 case CODE_FOR_sse4_1_movntdqa:
11920 case CODE_FOR_avx2_movntdqa:
11921 case CODE_FOR_avx512f_movntdqa:
11922 aligned_mem = true;
11923 break;
11924 default:
11925 break;
11926 }
11927 break;
11928 case VOID_FTYPE_PV2SF_V4SF:
11929 case VOID_FTYPE_PV8DI_V8DI:
11930 case VOID_FTYPE_PV4DI_V4DI:
11931 case VOID_FTYPE_PV2DI_V2DI:
11932 case VOID_FTYPE_PCHAR_V32QI:
11933 case VOID_FTYPE_PCHAR_V16QI:
11934 case VOID_FTYPE_PFLOAT_V16SF:
11935 case VOID_FTYPE_PFLOAT_V8SF:
11936 case VOID_FTYPE_PFLOAT_V4SF:
11937 case VOID_FTYPE_PDOUBLE_V8DF:
11938 case VOID_FTYPE_PDOUBLE_V4DF:
11939 case VOID_FTYPE_PDOUBLE_V2DF:
11940 case VOID_FTYPE_PLONGLONG_LONGLONG:
11941 case VOID_FTYPE_PULONGLONG_ULONGLONG:
11942 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
11943 case VOID_FTYPE_PINT_INT:
11944 nargs = 1;
11945 klass = store;
11946 /* Reserve memory operand for target. */
11947 memory = ARRAY_SIZE (xops);
11948 switch (icode)
11949 {
11950 /* These builtins and instructions require the memory
11951 to be properly aligned. */
11952 case CODE_FOR_avx_movntv4di:
11953 case CODE_FOR_sse2_movntv2di:
11954 case CODE_FOR_avx_movntv8sf:
11955 case CODE_FOR_sse_movntv4sf:
11956 case CODE_FOR_sse4a_vmmovntv4sf:
11957 case CODE_FOR_avx_movntv4df:
11958 case CODE_FOR_sse2_movntv2df:
11959 case CODE_FOR_sse4a_vmmovntv2df:
11960 case CODE_FOR_sse2_movntidi:
11961 case CODE_FOR_sse_movntq:
11962 case CODE_FOR_sse2_movntisi:
11963 case CODE_FOR_avx512f_movntv16sf:
11964 case CODE_FOR_avx512f_movntv8df:
11965 case CODE_FOR_avx512f_movntv8di:
11966 aligned_mem = true;
11967 break;
11968 default:
11969 break;
11970 }
11971 break;
11972 case VOID_FTYPE_PVOID_PCVOID:
11973 nargs = 1;
11974 klass = store;
11975 memory = 0;
11976
11977 break;
11978 case V4SF_FTYPE_V4SF_PCV2SF:
11979 case V2DF_FTYPE_V2DF_PCDOUBLE:
11980 nargs = 2;
11981 klass = load;
11982 memory = 1;
11983 break;
11984 case V8SF_FTYPE_PCV8SF_V8SI:
11985 case V4DF_FTYPE_PCV4DF_V4DI:
11986 case V4SF_FTYPE_PCV4SF_V4SI:
11987 case V2DF_FTYPE_PCV2DF_V2DI:
11988 case V8SI_FTYPE_PCV8SI_V8SI:
11989 case V4DI_FTYPE_PCV4DI_V4DI:
11990 case V4SI_FTYPE_PCV4SI_V4SI:
11991 case V2DI_FTYPE_PCV2DI_V2DI:
11992 case VOID_FTYPE_INT_INT64:
11993 nargs = 2;
11994 klass = load;
11995 memory = 0;
11996 break;
11997 case VOID_FTYPE_PV8DF_V8DF_UQI:
11998 case VOID_FTYPE_PV4DF_V4DF_UQI:
11999 case VOID_FTYPE_PV2DF_V2DF_UQI:
12000 case VOID_FTYPE_PV16SF_V16SF_UHI:
12001 case VOID_FTYPE_PV8SF_V8SF_UQI:
12002 case VOID_FTYPE_PV4SF_V4SF_UQI:
12003 case VOID_FTYPE_PV8DI_V8DI_UQI:
12004 case VOID_FTYPE_PV4DI_V4DI_UQI:
12005 case VOID_FTYPE_PV2DI_V2DI_UQI:
12006 case VOID_FTYPE_PV16SI_V16SI_UHI:
12007 case VOID_FTYPE_PV8SI_V8SI_UQI:
12008 case VOID_FTYPE_PV4SI_V4SI_UQI:
12009 case VOID_FTYPE_PV64QI_V64QI_UDI:
12010 case VOID_FTYPE_PV32HI_V32HI_USI:
12011 case VOID_FTYPE_PV32QI_V32QI_USI:
12012 case VOID_FTYPE_PV16QI_V16QI_UHI:
12013 case VOID_FTYPE_PV16HI_V16HI_UHI:
12014 case VOID_FTYPE_PV8HI_V8HI_UQI:
12015 switch (icode)
12016 {
12017 /* These builtins and instructions require the memory
12018 to be properly aligned. */
12019 case CODE_FOR_avx512f_storev16sf_mask:
12020 case CODE_FOR_avx512f_storev16si_mask:
12021 case CODE_FOR_avx512f_storev8df_mask:
12022 case CODE_FOR_avx512f_storev8di_mask:
12023 case CODE_FOR_avx512vl_storev8sf_mask:
12024 case CODE_FOR_avx512vl_storev8si_mask:
12025 case CODE_FOR_avx512vl_storev4df_mask:
12026 case CODE_FOR_avx512vl_storev4di_mask:
12027 case CODE_FOR_avx512vl_storev4sf_mask:
12028 case CODE_FOR_avx512vl_storev4si_mask:
12029 case CODE_FOR_avx512vl_storev2df_mask:
12030 case CODE_FOR_avx512vl_storev2di_mask:
12031 aligned_mem = true;
12032 break;
12033 default:
12034 break;
12035 }
12036 /* FALLTHRU */
12037 case VOID_FTYPE_PV8SF_V8SI_V8SF:
12038 case VOID_FTYPE_PV4DF_V4DI_V4DF:
12039 case VOID_FTYPE_PV4SF_V4SI_V4SF:
12040 case VOID_FTYPE_PV2DF_V2DI_V2DF:
12041 case VOID_FTYPE_PV8SI_V8SI_V8SI:
12042 case VOID_FTYPE_PV4DI_V4DI_V4DI:
12043 case VOID_FTYPE_PV4SI_V4SI_V4SI:
12044 case VOID_FTYPE_PV2DI_V2DI_V2DI:
12045 case VOID_FTYPE_PV8SI_V8DI_UQI:
12046 case VOID_FTYPE_PV8HI_V8DI_UQI:
12047 case VOID_FTYPE_PV16HI_V16SI_UHI:
12048 case VOID_FTYPE_PUDI_V8DI_UQI:
12049 case VOID_FTYPE_PV16QI_V16SI_UHI:
12050 case VOID_FTYPE_PV4SI_V4DI_UQI:
12051 case VOID_FTYPE_PUDI_V2DI_UQI:
12052 case VOID_FTYPE_PUDI_V4DI_UQI:
12053 case VOID_FTYPE_PUSI_V2DI_UQI:
12054 case VOID_FTYPE_PV8HI_V8SI_UQI:
12055 case VOID_FTYPE_PUDI_V4SI_UQI:
12056 case VOID_FTYPE_PUSI_V4DI_UQI:
12057 case VOID_FTYPE_PUHI_V2DI_UQI:
12058 case VOID_FTYPE_PUDI_V8SI_UQI:
12059 case VOID_FTYPE_PUSI_V4SI_UQI:
12060 case VOID_FTYPE_PCHAR_V64QI_UDI:
12061 case VOID_FTYPE_PCHAR_V32QI_USI:
12062 case VOID_FTYPE_PCHAR_V16QI_UHI:
12063 case VOID_FTYPE_PSHORT_V32HI_USI:
12064 case VOID_FTYPE_PSHORT_V16HI_UHI:
12065 case VOID_FTYPE_PSHORT_V8HI_UQI:
12066 case VOID_FTYPE_PINT_V16SI_UHI:
12067 case VOID_FTYPE_PINT_V8SI_UQI:
12068 case VOID_FTYPE_PINT_V4SI_UQI:
12069 case VOID_FTYPE_PINT64_V8DI_UQI:
12070 case VOID_FTYPE_PINT64_V4DI_UQI:
12071 case VOID_FTYPE_PINT64_V2DI_UQI:
12072 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
12073 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
12074 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
12075 case VOID_FTYPE_PFLOAT_V16SF_UHI:
12076 case VOID_FTYPE_PFLOAT_V8SF_UQI:
12077 case VOID_FTYPE_PFLOAT_V4SF_UQI:
12078 case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
12079 case VOID_FTYPE_PV32QI_V32HI_USI:
12080 case VOID_FTYPE_PV16QI_V16HI_UHI:
12081 case VOID_FTYPE_PUDI_V8HI_UQI:
12082 nargs = 2;
12083 klass = store;
12084 /* Reserve memory operand for target. */
12085 memory = ARRAY_SIZE (xops);
12086 break;
12087 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
12088 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
12089 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
12090 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
12091 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
12092 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
12093 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
12094 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
12095 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
12096 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
12097 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
12098 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
12099 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
12100 case V32HI_FTYPE_PCV32HI_V32HI_USI:
12101 case V32QI_FTYPE_PCV32QI_V32QI_USI:
12102 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
12103 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
12104 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
12105 switch (icode)
12106 {
12107 /* These builtins and instructions require the memory
12108 to be properly aligned. */
12109 case CODE_FOR_avx512f_loadv16sf_mask:
12110 case CODE_FOR_avx512f_loadv16si_mask:
12111 case CODE_FOR_avx512f_loadv8df_mask:
12112 case CODE_FOR_avx512f_loadv8di_mask:
12113 case CODE_FOR_avx512vl_loadv8sf_mask:
12114 case CODE_FOR_avx512vl_loadv8si_mask:
12115 case CODE_FOR_avx512vl_loadv4df_mask:
12116 case CODE_FOR_avx512vl_loadv4di_mask:
12117 case CODE_FOR_avx512vl_loadv4sf_mask:
12118 case CODE_FOR_avx512vl_loadv4si_mask:
12119 case CODE_FOR_avx512vl_loadv2df_mask:
12120 case CODE_FOR_avx512vl_loadv2di_mask:
12121 case CODE_FOR_avx512bw_loadv64qi_mask:
12122 case CODE_FOR_avx512vl_loadv32qi_mask:
12123 case CODE_FOR_avx512vl_loadv16qi_mask:
12124 case CODE_FOR_avx512bw_loadv32hi_mask:
12125 case CODE_FOR_avx512vl_loadv16hi_mask:
12126 case CODE_FOR_avx512vl_loadv8hi_mask:
12127 aligned_mem = true;
12128 break;
12129 default:
12130 break;
12131 }
12132 /* FALLTHRU */
12133 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
12134 case V32QI_FTYPE_PCCHAR_V32QI_USI:
12135 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
12136 case V32HI_FTYPE_PCSHORT_V32HI_USI:
12137 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
12138 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
12139 case V16SI_FTYPE_PCINT_V16SI_UHI:
12140 case V8SI_FTYPE_PCINT_V8SI_UQI:
12141 case V4SI_FTYPE_PCINT_V4SI_UQI:
12142 case V8DI_FTYPE_PCINT64_V8DI_UQI:
12143 case V4DI_FTYPE_PCINT64_V4DI_UQI:
12144 case V2DI_FTYPE_PCINT64_V2DI_UQI:
12145 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
12146 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
12147 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
12148 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
12149 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
12150 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
12151 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
12152 nargs = 3;
12153 klass = load;
12154 memory = 0;
12155 break;
12156 case INT_FTYPE_PINT_INT_INT_INT:
12157 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
12158 nargs = 4;
12159 klass = load;
12160 memory = 0;
12161 constant = 3;
12162 break;
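/* Editor's note (illustrative; the intrinsic name is assumed from the
   CMPccXADD support this file implements): a wrapper call along the lines of
     int r = _cmpccxadd_epi32 (ptr, a, b, cc);
   maps onto INT_FTYPE_PINT_INT_INT_INT above, and the `constant = 3'
   recorded there makes the code below check that the fourth argument is
   one of the enum _CMPCCX_ENUM constants.  */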
12163 default:
12164 gcc_unreachable ();
12165 }
12166
12167 gcc_assert (nargs <= ARRAY_SIZE (xops));
12168
12169 if (klass == store)
12170 {
12171 arg = CALL_EXPR_ARG (exp, 0);
12172 op = expand_normal (arg);
12173 gcc_assert (target == 0);
12174 if (memory)
12175 {
12176 op = ix86_zero_extend_to_Pmode (op);
12177 target = gen_rtx_MEM (tmode, op);
12178 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
12179 on it. Try to improve it using get_pointer_alignment,
12180 and if the special builtin is one that requires strict
12181 mode alignment, also from its GET_MODE_ALIGNMENT.
12182 Failure to do so could lead to ix86_legitimate_combined_insn
12183 rejecting all changes to such insns. */
12184 unsigned int align = get_pointer_alignment (arg);
12185 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
12186 align = GET_MODE_ALIGNMENT (tmode);
12187 if (MEM_ALIGN (target) < align)
12188 set_mem_align (target, align);
12189 }
12190 else
12191 target = force_reg (tmode, op);
12192 arg_adjust = 1;
12193 }
12194 else
12195 {
12196 arg_adjust = 0;
12197 if (optimize
12198 || target == 0
12199 || !register_operand (target, tmode)
12200 || GET_MODE (target) != tmode)
12201 target = gen_reg_rtx (tmode);
12202 }
12203
12204 for (i = 0; i < nargs; i++)
12205 {
12206 machine_mode mode = insn_p->operand[i + 1].mode;
12207
12208 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
12209 op = expand_normal (arg);
12210
12211 if (i == memory)
12212 {
12213 /* This must be the memory operand. */
12214 op = ix86_zero_extend_to_Pmode (op);
12215 op = gen_rtx_MEM (mode, op);
12216 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
12217 on it. Try to improve it using get_pointer_alignment,
12218 and if the special builtin is one that requires strict
12219 mode alignment, also from its GET_MODE_ALIGNMENT.
12220 Failure to do so could lead to ix86_legitimate_combined_insn
12221 rejecting all changes to such insns. */
12222 unsigned int align = get_pointer_alignment (arg);
12223 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
12224 align = GET_MODE_ALIGNMENT (mode);
12225 if (MEM_ALIGN (op) < align)
12226 set_mem_align (op, align);
12227 }
12228 else if (i == constant)
12229 {
12230 /* This must be the constant. */
12231 if (!insn_p->operand[nargs].predicate (op, SImode))
12232 {
12233 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12234 return const0_rtx;
12235 }
12236 }
12237 else
12238 {
12239 /* This must be a register. */
12240 if (VECTOR_MODE_P (mode))
12241 op = safe_vector_operand (op, mode);
12242
12243 op = fixup_modeless_constant (op, mode);
12244
12245 /* NB: A 3-operand load implies it is a mask load or v{p}expand*,
12246 and the mask operand should be the last one.
12247 Keep an all-ones mask; it will be simplified by the expander. */
12248 if (nargs == 3 && i == 2 && klass == load
12249 && constm1_operand (op, mode)
12250 && insn_p->operand[i].predicate (op, mode))
12251 ;
12252 else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
12253 op = copy_to_mode_reg (mode, op);
12254 else
12255 {
12256 op = copy_to_reg (op);
12257 op = lowpart_subreg (mode, op, GET_MODE (op));
12258 }
12259 }
12260
12261 xops[i] = op;
12262 }
12263
12264 switch (nargs)
12265 {
12266 case 0:
12267 pat = GEN_FCN (icode) (target);
12268 break;
12269 case 1:
12270 pat = GEN_FCN (icode) (target, xops[0]);
12271 break;
12272 case 2:
12273 pat = GEN_FCN (icode) (target, xops[0], xops[1]);
12274 break;
12275 case 3:
12276 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
12277 break;
12278 case 4:
12279 pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
12280 break;
12281 default:
12282 gcc_unreachable ();
12283 }
12284
12285 if (! pat)
12286 return 0;
12287
12288 emit_insn (pat);
12289 return klass == store ? 0 : target;
12290 }
12291
12292 /* Return the integer constant in ARG. Constrain it to be in the range
12293 of the subparts of VEC_TYPE; issue an error if not. */
12294
12295 static int
12296 get_element_number (tree vec_type, tree arg)
12297 {
12298 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12299
12300 if (!tree_fits_uhwi_p (arg)
12301 || (elt = tree_to_uhwi (arg), elt > max))
12302 {
12303 error ("selector must be an integer constant in the range "
12304 "[0, %wi]", max);
12305 return 0;
12306 }
12307
12308 return elt;
12309 }
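/* Editor's example (illustrative; the builtin name follows the usual
   __builtin_ia32_vec_ext_* naming): for a V4SF vector type max is 3, so
     __builtin_ia32_vec_ext_v4sf (v, 5)
   is rejected with "selector must be an integer constant in the range
   [0, 3]" and element 0 is used instead.  */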
12310
12311 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12312 ix86_expand_vector_init. We DO have language-level syntax for this, in
12313 the form of (type){ init-list }. Except that since we can't place emms
12314 instructions from inside the compiler, we can't allow the use of MMX
12315 registers unless the user explicitly asks for it. So we do *not* define
12316 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12317 we have builtins invoked by mmintrin.h that give us license to emit
12318 these sorts of instructions. */
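/* Editor's illustration (assuming the usual mmintrin.h definitions):
     __m64 x = _mm_set_pi32 (1, 2);
   expands through __builtin_ia32_vec_init_v2si and therefore lands in
   this helper rather than in a vec_init pattern from mmx.md.  */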
12319
12320 static rtx
12321 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12322 {
12323 machine_mode tmode = TYPE_MODE (type);
12324 machine_mode inner_mode = GET_MODE_INNER (tmode);
12325 int i, n_elt = GET_MODE_NUNITS (tmode);
12326 rtvec v = rtvec_alloc (n_elt);
12327
12328 gcc_assert (VECTOR_MODE_P (tmode));
12329 gcc_assert (call_expr_nargs (exp) == n_elt);
12330
12331 for (i = 0; i < n_elt; ++i)
12332 {
12333 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12334 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12335 }
12336
12337 if (!target || !register_operand (target, tmode))
12338 target = gen_reg_rtx (tmode);
12339
12340 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12341 return target;
12342 }
12343
12344 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12345 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12346 had a language-level syntax for referencing vector elements. */
12347
12348 static rtx
12349 ix86_expand_vec_ext_builtin (tree exp, rtx target)
12350 {
12351 machine_mode tmode, mode0;
12352 tree arg0, arg1;
12353 int elt;
12354 rtx op0;
12355
12356 arg0 = CALL_EXPR_ARG (exp, 0);
12357 arg1 = CALL_EXPR_ARG (exp, 1);
12358
12359 op0 = expand_normal (arg0);
12360 elt = get_element_number (TREE_TYPE (arg0), arg1);
12361
12362 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12363 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12364 gcc_assert (VECTOR_MODE_P (mode0));
12365
12366 op0 = force_reg (mode0, op0);
12367
12368 if (optimize || !target || !register_operand (target, tmode))
12369 target = gen_reg_rtx (tmode);
12370
12371 ix86_expand_vector_extract (true, target, op0, elt);
12372
12373 return target;
12374 }
12375
12376 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12377 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12378 a language-level syntax for referencing vector elements. */
12379
12380 static rtx
12381 ix86_expand_vec_set_builtin (tree exp)
12382 {
12383 machine_mode tmode, mode1;
12384 tree arg0, arg1, arg2;
12385 int elt;
12386 rtx op0, op1, target;
12387
12388 arg0 = CALL_EXPR_ARG (exp, 0);
12389 arg1 = CALL_EXPR_ARG (exp, 1);
12390 arg2 = CALL_EXPR_ARG (exp, 2);
12391
12392 tmode = TYPE_MODE (TREE_TYPE (arg0));
12393 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12394 gcc_assert (VECTOR_MODE_P (tmode));
12395
12396 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12397 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12398 elt = get_element_number (TREE_TYPE (arg0), arg2);
12399
12400 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
12401 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12402
12403 op0 = force_reg (tmode, op0);
12404 op1 = force_reg (mode1, op1);
12405
12406 /* OP0 is the source of these builtin functions and shouldn't be
12407 modified. Create a copy, use it and return it as target. */
12408 target = gen_reg_rtx (tmode);
12409 emit_move_insn (target, op0);
12410 ix86_expand_vector_set (true, target, op1, elt);
12411
12412 return target;
12413 }
12414
12415 /* Return true if the ISA options required by this builtin are enabled,
12416 else false.
12417 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12418 bool
12419 ix86_check_builtin_isa_match (unsigned int fcode,
12420 HOST_WIDE_INT* pbisa,
12421 HOST_WIDE_INT* pbisa2)
12422 {
12423 HOST_WIDE_INT isa = ix86_isa_flags;
12424 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12425 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12426 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12427 /* The general case is we require all the ISAs specified in bisa{,2}
12428 to be enabled.
12429 The exceptions are:
12430 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12431 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12432 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
12433 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12434 OPTION_MASK_ISA2_AVXVNNI
12435 (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
12436 OPTION_MASK_ISA2_AVXIFMA
12437 (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
12438 OPTION_MASK_ISA2_AVXNECONVERT
12439 where for each such pair it is sufficient if either of the ISAs is
12440 enabled; if the pair is ored with other options, those others must be enabled too.
12441 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
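/* Editor's worked example (illustrative): a VNNI builtin whose descriptor
   records OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL in bisa is
   accepted either when both of those ISAs are enabled or when only
   OPTION_MASK_ISA2_AVXVNNI is, because the code below then sets all three
   bits in the local isa/isa2 copies before the final subset check
   `(bisa & isa) == bisa && (bisa2 & isa2) == bisa2'.  */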
12442 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12443 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12444 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
12445 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
12446
12447 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12448 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12449 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
12450 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
12451
12452 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12453 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12454 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
12455 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
12456
12457 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12458 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12459 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
12460 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12461 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12462 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
12463 {
12464 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
12465 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
12466 }
12467
12468 if ((((bisa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12469 == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12470 || (bisa2 & OPTION_MASK_ISA2_AVXIFMA) != 0)
12471 && (((isa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12472 == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
12473 || (isa2 & OPTION_MASK_ISA2_AVXIFMA) != 0))
12474 {
12475 isa |= OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL;
12476 isa2 |= OPTION_MASK_ISA2_AVXIFMA;
12477 }
12478
12479 if ((((bisa & OPTION_MASK_ISA_AVX512VL) != 0
12480 && (bisa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
12481 || (bisa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0)
12482 && (((isa & OPTION_MASK_ISA_AVX512VL) != 0
12483 && (isa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
12484 || (isa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0))
12485 {
12486 isa |= OPTION_MASK_ISA_AVX512VL;
12487 isa2 |= OPTION_MASK_ISA2_AVXNECONVERT | OPTION_MASK_ISA2_AVX512BF16;
12488 }
12489
12490 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12491 /* __builtin_ia32_maskmovq requires MMX registers. */
12492 && fcode != IX86_BUILTIN_MASKMOVQ)
12493 {
12494 bisa &= ~OPTION_MASK_ISA_MMX;
12495 bisa |= OPTION_MASK_ISA_SSE2;
12496 }
12497
12498 if (pbisa)
12499 *pbisa = bisa;
12500 if (pbisa2)
12501 *pbisa2 = bisa2;
12502
12503 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12504 }
12505
12506 /* Expand an expression EXP that calls a built-in function,
12507 with result going to TARGET if that's convenient
12508 (and in mode MODE if that's convenient).
12509 SUBTARGET may be used as the target for computing one of EXP's operands.
12510 IGNORE is nonzero if the value is to be ignored. */
12511
12512 rtx
12513 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12514 machine_mode mode, int ignore)
12515 {
12516 size_t i;
12517 enum insn_code icode, icode2;
12518 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12519 tree arg0, arg1, arg2, arg3, arg4;
12520 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12521 machine_mode mode0, mode1, mode2, mode3, mode4;
12522 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12523 HOST_WIDE_INT bisa, bisa2;
12524
12525 /* For CPU builtins that can be folded, fold first and expand the fold. */
12526 switch (fcode)
12527 {
12528 case IX86_BUILTIN_CPU_INIT:
12529 {
12530 /* Make it call __cpu_indicator_init in libgcc. */
12531 tree call_expr, fndecl, type;
12532 type = build_function_type_list (integer_type_node, NULL_TREE);
12533 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12534 call_expr = build_call_expr (fndecl, 0);
12535 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12536 }
12537 case IX86_BUILTIN_CPU_IS:
12538 case IX86_BUILTIN_CPU_SUPPORTS:
12539 {
12540 tree arg0 = CALL_EXPR_ARG (exp, 0);
12541 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12542 gcc_assert (fold_expr != NULL_TREE);
12543 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12544 }
12545 }
12546
12547 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
12548 {
12549 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12550 if (TARGET_ABI_X32)
12551 bisa |= OPTION_MASK_ABI_X32;
12552 else
12553 bisa |= OPTION_MASK_ABI_64;
12554 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12555 (enum fpmath_unit) 0,
12556 (enum prefer_vector_width) 0,
12557 PVW_NONE, PVW_NONE,
12558 false, add_abi_p);
12559 if (!opts)
12560 error ("%qE needs unknown isa option", fndecl);
12561 else
12562 {
12563 gcc_assert (opts != NULL);
12564 error ("%qE needs isa option %s", fndecl, opts);
12565 free (opts);
12566 }
12567 return expand_call (exp, target, ignore);
12568 }
12569
12570 switch (fcode)
12571 {
12572 case IX86_BUILTIN_MASKMOVQ:
12573 case IX86_BUILTIN_MASKMOVDQU:
12574 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12575 ? CODE_FOR_mmx_maskmovq
12576 : CODE_FOR_sse2_maskmovdqu);
12577 /* Note the arg order is different from the operand order. */
12578 arg1 = CALL_EXPR_ARG (exp, 0);
12579 arg2 = CALL_EXPR_ARG (exp, 1);
12580 arg0 = CALL_EXPR_ARG (exp, 2);
12581 op0 = expand_normal (arg0);
12582 op1 = expand_normal (arg1);
12583 op2 = expand_normal (arg2);
12584 mode0 = insn_data[icode].operand[0].mode;
12585 mode1 = insn_data[icode].operand[1].mode;
12586 mode2 = insn_data[icode].operand[2].mode;
12587
12588 op0 = ix86_zero_extend_to_Pmode (op0);
12589 op0 = gen_rtx_MEM (mode1, op0);
12590
12591 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12592 op0 = copy_to_mode_reg (mode0, op0);
12593 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12594 op1 = copy_to_mode_reg (mode1, op1);
12595 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12596 op2 = copy_to_mode_reg (mode2, op2);
12597 pat = GEN_FCN (icode) (op0, op1, op2);
12598 if (! pat)
12599 return 0;
12600 emit_insn (pat);
12601 return 0;
12602
12603 case IX86_BUILTIN_LDMXCSR:
12604 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12605 target = assign_386_stack_local (SImode, SLOT_TEMP);
12606 emit_move_insn (target, op0);
12607 emit_insn (gen_sse_ldmxcsr (target));
12608 return 0;
12609
12610 case IX86_BUILTIN_STMXCSR:
12611 target = assign_386_stack_local (SImode, SLOT_TEMP);
12612 emit_insn (gen_sse_stmxcsr (target));
12613 return copy_to_mode_reg (SImode, target);
12614
12615 case IX86_BUILTIN_CLFLUSH:
12616 arg0 = CALL_EXPR_ARG (exp, 0);
12617 op0 = expand_normal (arg0);
12618 icode = CODE_FOR_sse2_clflush;
12619 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12620 op0 = ix86_zero_extend_to_Pmode (op0);
12621
12622 emit_insn (gen_sse2_clflush (op0));
12623 return 0;
12624
12625 case IX86_BUILTIN_CLWB:
12626 arg0 = CALL_EXPR_ARG (exp, 0);
12627 op0 = expand_normal (arg0);
12628 icode = CODE_FOR_clwb;
12629 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12630 op0 = ix86_zero_extend_to_Pmode (op0);
12631
12632 emit_insn (gen_clwb (op0));
12633 return 0;
12634
12635 case IX86_BUILTIN_CLFLUSHOPT:
12636 arg0 = CALL_EXPR_ARG (exp, 0);
12637 op0 = expand_normal (arg0);
12638 icode = CODE_FOR_clflushopt;
12639 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12640 op0 = ix86_zero_extend_to_Pmode (op0);
12641
12642 emit_insn (gen_clflushopt (op0));
12643 return 0;
12644
12645 case IX86_BUILTIN_MONITOR:
12646 case IX86_BUILTIN_MONITORX:
12647 arg0 = CALL_EXPR_ARG (exp, 0);
12648 arg1 = CALL_EXPR_ARG (exp, 1);
12649 arg2 = CALL_EXPR_ARG (exp, 2);
12650 op0 = expand_normal (arg0);
12651 op1 = expand_normal (arg1);
12652 op2 = expand_normal (arg2);
12653 if (!REG_P (op0))
12654 op0 = ix86_zero_extend_to_Pmode (op0);
12655 if (!REG_P (op1))
12656 op1 = copy_to_mode_reg (SImode, op1);
12657 if (!REG_P (op2))
12658 op2 = copy_to_mode_reg (SImode, op2);
12659
12660 emit_insn (fcode == IX86_BUILTIN_MONITOR
12661 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12662 : gen_monitorx (Pmode, op0, op1, op2));
12663 return 0;
12664
12665 case IX86_BUILTIN_MWAIT:
12666 arg0 = CALL_EXPR_ARG (exp, 0);
12667 arg1 = CALL_EXPR_ARG (exp, 1);
12668 op0 = expand_normal (arg0);
12669 op1 = expand_normal (arg1);
12670 if (!REG_P (op0))
12671 op0 = copy_to_mode_reg (SImode, op0);
12672 if (!REG_P (op1))
12673 op1 = copy_to_mode_reg (SImode, op1);
12674 emit_insn (gen_sse3_mwait (op0, op1));
12675 return 0;
12676
12677 case IX86_BUILTIN_MWAITX:
12678 arg0 = CALL_EXPR_ARG (exp, 0);
12679 arg1 = CALL_EXPR_ARG (exp, 1);
12680 arg2 = CALL_EXPR_ARG (exp, 2);
12681 op0 = expand_normal (arg0);
12682 op1 = expand_normal (arg1);
12683 op2 = expand_normal (arg2);
12684 if (!REG_P (op0))
12685 op0 = copy_to_mode_reg (SImode, op0);
12686 if (!REG_P (op1))
12687 op1 = copy_to_mode_reg (SImode, op1);
12688 if (!REG_P (op2))
12689 op2 = copy_to_mode_reg (SImode, op2);
12690 emit_insn (gen_mwaitx (op0, op1, op2));
12691 return 0;
12692
12693 case IX86_BUILTIN_UMONITOR:
12694 arg0 = CALL_EXPR_ARG (exp, 0);
12695 op0 = expand_normal (arg0);
12696
12697 op0 = ix86_zero_extend_to_Pmode (op0);
12698 emit_insn (gen_umonitor (Pmode, op0));
12699 return 0;
12700
12701 case IX86_BUILTIN_UMWAIT:
12702 case IX86_BUILTIN_TPAUSE:
12703 arg0 = CALL_EXPR_ARG (exp, 0);
12704 arg1 = CALL_EXPR_ARG (exp, 1);
12705 op0 = expand_normal (arg0);
12706 op1 = expand_normal (arg1);
12707
12708 if (!REG_P (op0))
12709 op0 = copy_to_mode_reg (SImode, op0);
12710
12711 op1 = force_reg (DImode, op1);
12712
12713 if (TARGET_64BIT)
12714 {
12715 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12716 NULL, 1, OPTAB_DIRECT);
12717 switch (fcode)
12718 {
12719 case IX86_BUILTIN_UMWAIT:
12720 icode = CODE_FOR_umwait_rex64;
12721 break;
12722 case IX86_BUILTIN_TPAUSE:
12723 icode = CODE_FOR_tpause_rex64;
12724 break;
12725 default:
12726 gcc_unreachable ();
12727 }
12728
12729 op2 = gen_lowpart (SImode, op2);
12730 op1 = gen_lowpart (SImode, op1);
12731 pat = GEN_FCN (icode) (op0, op1, op2);
12732 }
12733 else
12734 {
12735 switch (fcode)
12736 {
12737 case IX86_BUILTIN_UMWAIT:
12738 icode = CODE_FOR_umwait;
12739 break;
12740 case IX86_BUILTIN_TPAUSE:
12741 icode = CODE_FOR_tpause;
12742 break;
12743 default:
12744 gcc_unreachable ();
12745 }
12746 pat = GEN_FCN (icode) (op0, op1);
12747 }
12748
12749 if (!pat)
12750 return 0;
12751
12752 emit_insn (pat);
12753
12754 if (target == 0
12755 || !register_operand (target, QImode))
12756 target = gen_reg_rtx (QImode);
12757
12758 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12759 const0_rtx);
12760 emit_insn (gen_rtx_SET (target, pat));
12761
12762 return target;
12763
12764 case IX86_BUILTIN_TESTUI:
12765 emit_insn (gen_testui ());
12766
12767 if (target == 0
12768 || !register_operand (target, QImode))
12769 target = gen_reg_rtx (QImode);
12770
12771 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12772 const0_rtx);
12773 emit_insn (gen_rtx_SET (target, pat));
12774
12775 return target;
12776
12777 case IX86_BUILTIN_CLZERO:
12778 arg0 = CALL_EXPR_ARG (exp, 0);
12779 op0 = expand_normal (arg0);
12780 if (!REG_P (op0))
12781 op0 = ix86_zero_extend_to_Pmode (op0);
12782 emit_insn (gen_clzero (Pmode, op0));
12783 return 0;
12784
12785 case IX86_BUILTIN_CLDEMOTE:
12786 arg0 = CALL_EXPR_ARG (exp, 0);
12787 op0 = expand_normal (arg0);
12788 icode = CODE_FOR_cldemote;
12789 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12790 op0 = ix86_zero_extend_to_Pmode (op0);
12791
12792 emit_insn (gen_cldemote (op0));
12793 return 0;
12794
12795 case IX86_BUILTIN_LOADIWKEY:
12796 {
12797 arg0 = CALL_EXPR_ARG (exp, 0);
12798 arg1 = CALL_EXPR_ARG (exp, 1);
12799 arg2 = CALL_EXPR_ARG (exp, 2);
12800 arg3 = CALL_EXPR_ARG (exp, 3);
12801
12802 op0 = expand_normal (arg0);
12803 op1 = expand_normal (arg1);
12804 op2 = expand_normal (arg2);
12805 op3 = expand_normal (arg3);
12806
12807 if (!REG_P (op0))
12808 op0 = copy_to_mode_reg (V2DImode, op0);
12809 if (!REG_P (op1))
12810 op1 = copy_to_mode_reg (V2DImode, op1);
12811 if (!REG_P (op2))
12812 op2 = copy_to_mode_reg (V2DImode, op2);
12813 if (!REG_P (op3))
12814 op3 = copy_to_mode_reg (SImode, op3);
12815
12816 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
12817
12818 return 0;
12819 }
12820
12821 case IX86_BUILTIN_AESDEC128KLU8:
12822 icode = CODE_FOR_aesdec128klu8;
12823 goto aesdecenc_expand;
12824
12825 case IX86_BUILTIN_AESDEC256KLU8:
12826 icode = CODE_FOR_aesdec256klu8;
12827 goto aesdecenc_expand;
12828
12829 case IX86_BUILTIN_AESENC128KLU8:
12830 icode = CODE_FOR_aesenc128klu8;
12831 goto aesdecenc_expand;
12832
12833 case IX86_BUILTIN_AESENC256KLU8:
12834 icode = CODE_FOR_aesenc256klu8;
12835
12836 aesdecenc_expand:
12837
12838 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
12839 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
12840 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12841
12842 op0 = expand_normal (arg0);
12843 op1 = expand_normal (arg1);
12844 op2 = expand_normal (arg2);
12845
12846 if (!address_operand (op0, V2DImode))
12847 {
12848 op0 = convert_memory_address (Pmode, op0);
12849 op0 = copy_addr_to_reg (op0);
12850 }
12851 op0 = gen_rtx_MEM (V2DImode, op0);
12852
12853 if (!REG_P (op1))
12854 op1 = copy_to_mode_reg (V2DImode, op1);
12855
12856 if (!address_operand (op2, VOIDmode))
12857 {
12858 op2 = convert_memory_address (Pmode, op2);
12859 op2 = copy_addr_to_reg (op2);
12860 }
12861 op2 = gen_rtx_MEM (BLKmode, op2);
12862
12863 emit_insn (GEN_FCN (icode) (op1, op1, op2));
12864
12865 if (target == 0)
12866 target = gen_reg_rtx (QImode);
12867
12868 /* NB: For the aesenc/aesdec keylocker insns, ZF is set when a runtime
12869 error occurs, and the output should then be cleared for safety. */
12870 rtx_code_label *ok_label;
12871 rtx tmp;
12872
12873 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12874 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12875 ok_label = gen_label_rtx ();
12876 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12877 true, ok_label);
12878 /* The runtime error seldom occurs, so predict the OK path as hot
12879 and lay it out as the fallthrough block. */
12880 predict_jump (REG_BR_PROB_BASE * 90 / 100);
12881
12882 emit_insn (gen_rtx_SET (op1, const0_rtx));
12883
12884 emit_label (ok_label);
12885 emit_insn (gen_rtx_SET (target, pat));
12886 emit_insn (gen_rtx_SET (op0, op1));
12887
12888 return target;
12889
12890 case IX86_BUILTIN_AESDECWIDE128KLU8:
12891 icode = CODE_FOR_aesdecwide128klu8;
12892 goto wideaesdecenc_expand;
12893
12894 case IX86_BUILTIN_AESDECWIDE256KLU8:
12895 icode = CODE_FOR_aesdecwide256klu8;
12896 goto wideaesdecenc_expand;
12897
12898 case IX86_BUILTIN_AESENCWIDE128KLU8:
12899 icode = CODE_FOR_aesencwide128klu8;
12900 goto wideaesdecenc_expand;
12901
12902 case IX86_BUILTIN_AESENCWIDE256KLU8:
12903 icode = CODE_FOR_aesencwide256klu8;
12904
12905 wideaesdecenc_expand:
12906
12907 rtx xmm_regs[8];
12908 rtx op;
12909
12910 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
12911 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
12912 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12913
12914 op0 = expand_normal (arg0);
12915 op1 = expand_normal (arg1);
12916 op2 = expand_normal (arg2);
12917
12918 if (!address_operand (op2, VOIDmode))
12919 {
12920 op2 = convert_memory_address (Pmode, op2);
12921 op2 = copy_addr_to_reg (op2);
12922 }
12923 op2 = gen_rtx_MEM (BLKmode, op2);
12924
12925 for (i = 0; i < 8; i++)
12926 {
12927 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12928
12929 op = gen_rtx_MEM (V2DImode,
12930 plus_constant (Pmode, op1, (i * 16)));
12931
12932 emit_move_insn (xmm_regs[i], op);
12933 }
12934
12935 emit_insn (GEN_FCN (icode) (op2));
12936
12937 if (target == 0)
12938 target = gen_reg_rtx (QImode);
12939
12940 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12941 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12942 ok_label = gen_label_rtx ();
12943 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12944 true, ok_label);
12945 predict_jump (REG_BR_PROB_BASE * 90 / 100);
12946
12947 for (i = 0; i < 8; i++)
12948 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
12949
12950 emit_label (ok_label);
12951 emit_insn (gen_rtx_SET (target, pat));
12952
12953 for (i = 0; i < 8; i++)
12954 {
12955 op = gen_rtx_MEM (V2DImode,
12956 plus_constant (Pmode, op0, (i * 16)));
12957 emit_move_insn (op, xmm_regs[i]);
12958 }
12959
12960 return target;
12961
12962 case IX86_BUILTIN_ENCODEKEY128U32:
12963 {
12964 rtx op, xmm_regs[7];
12965
12966 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12967 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
12968 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
12969
12970 op0 = expand_normal (arg0);
12971 op1 = expand_normal (arg1);
12972 op2 = expand_normal (arg2);
12973
12974 if (!REG_P (op0))
12975 op0 = copy_to_mode_reg (SImode, op0);
12976
12977 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12978 emit_move_insn (op, op1);
12979
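/* ENCODEKEY128 takes the input key implicitly in xmm0 and leaves the
   384-bit key handle in xmm0..xmm2, which is stored to *h below.  */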
12980 for (i = 0; i < 3; i++)
12981 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12982
12983 if (target == 0)
12984 target = gen_reg_rtx (SImode);
12985
12986 emit_insn (gen_encodekey128u32 (target, op0));
12987
12988 for (i = 0; i < 3; i++)
12989 {
12990 op = gen_rtx_MEM (V2DImode,
12991 plus_constant (Pmode, op2, (i * 16)));
12992 emit_move_insn (op, xmm_regs[i]);
12993 }
12994
12995 return target;
12996 }
12997 case IX86_BUILTIN_ENCODEKEY256U32:
12998 {
12999 rtx op, xmm_regs[7];
13000
13001 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
13002 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
13003 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
13004 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
13005
13006 op0 = expand_normal (arg0);
13007 op1 = expand_normal (arg1);
13008 op2 = expand_normal (arg2);
13009 op3 = expand_normal (arg3);
13010
13011 if (!REG_P (op0))
13012 op0 = copy_to_mode_reg (SImode, op0);
13013
13014 /* Force keylow and keyhi into xmm0 and xmm1 respectively. */
13015 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
13016 emit_move_insn (op, op1);
13017 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
13018 emit_move_insn (op, op2);
13019
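/* ENCODEKEY256 takes the input key implicitly in xmm0:xmm1 and leaves
   the 512-bit key handle in xmm0..xmm3, which is stored to *h below.  */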
13020 for (i = 0; i < 4; i++)
13021 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
13022
13023 if (target == 0)
13024 target = gen_reg_rtx (SImode);
13025
13026 emit_insn (gen_encodekey256u32 (target, op0));
13027
13028 for (i = 0; i < 4; i++)
13029 {
13030 op = gen_rtx_MEM (V2DImode,
13031 plus_constant (Pmode, op3, (i * 16)));
13032 emit_move_insn (op, xmm_regs[i]);
13033 }
13034
13035 return target;
13036 }
13037
13038 case IX86_BUILTIN_VEC_INIT_V2SI:
13039 case IX86_BUILTIN_VEC_INIT_V4HI:
13040 case IX86_BUILTIN_VEC_INIT_V8QI:
13041 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
13042
13043 case IX86_BUILTIN_VEC_EXT_V2DF:
13044 case IX86_BUILTIN_VEC_EXT_V2DI:
13045 case IX86_BUILTIN_VEC_EXT_V4SF:
13046 case IX86_BUILTIN_VEC_EXT_V4SI:
13047 case IX86_BUILTIN_VEC_EXT_V8HI:
13048 case IX86_BUILTIN_VEC_EXT_V2SI:
13049 case IX86_BUILTIN_VEC_EXT_V4HI:
13050 case IX86_BUILTIN_VEC_EXT_V16QI:
13051 return ix86_expand_vec_ext_builtin (exp, target);
13052
13053 case IX86_BUILTIN_VEC_SET_V2DI:
13054 case IX86_BUILTIN_VEC_SET_V4SF:
13055 case IX86_BUILTIN_VEC_SET_V4SI:
13056 case IX86_BUILTIN_VEC_SET_V8HI:
13057 case IX86_BUILTIN_VEC_SET_V4HI:
13058 case IX86_BUILTIN_VEC_SET_V16QI:
13059 return ix86_expand_vec_set_builtin (exp);
13060
13061 case IX86_BUILTIN_NANQ:
13062 case IX86_BUILTIN_NANSQ:
13063 return expand_call (exp, target, ignore);
13064
13065 case IX86_BUILTIN_RDPID:
13066
13067 op0 = gen_reg_rtx (word_mode);
13068
13069 if (TARGET_64BIT)
13070 {
13071 insn = gen_rdpid_rex64 (op0);
13072 op0 = convert_to_mode (SImode, op0, 1);
13073 }
13074 else
13075 insn = gen_rdpid (op0);
13076
13077 emit_insn (insn);
13078
13079 if (target == 0
13080 || !register_operand (target, SImode))
13081 target = gen_reg_rtx (SImode);
13082
13083 emit_move_insn (target, op0);
13084 return target;
13085
13086 case IX86_BUILTIN_2INTERSECTD512:
13087 case IX86_BUILTIN_2INTERSECTQ512:
13088 case IX86_BUILTIN_2INTERSECTD256:
13089 case IX86_BUILTIN_2INTERSECTQ256:
13090 case IX86_BUILTIN_2INTERSECTD128:
13091 case IX86_BUILTIN_2INTERSECTQ128:
13092 arg0 = CALL_EXPR_ARG (exp, 0);
13093 arg1 = CALL_EXPR_ARG (exp, 1);
13094 arg2 = CALL_EXPR_ARG (exp, 2);
13095 arg3 = CALL_EXPR_ARG (exp, 3);
13096 op0 = expand_normal (arg0);
13097 op1 = expand_normal (arg1);
13098 op2 = expand_normal (arg2);
13099 op3 = expand_normal (arg3);
13100
13101 if (!address_operand (op0, VOIDmode))
13102 {
13103 op0 = convert_memory_address (Pmode, op0);
13104 op0 = copy_addr_to_reg (op0);
13105 }
13106 if (!address_operand (op1, VOIDmode))
13107 {
13108 op1 = convert_memory_address (Pmode, op1);
13109 op1 = copy_addr_to_reg (op1);
13110 }
13111
13112 switch (fcode)
13113 {
13114 case IX86_BUILTIN_2INTERSECTD512:
13115 mode4 = P2HImode;
13116 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
13117 break;
13118 case IX86_BUILTIN_2INTERSECTQ512:
13119 mode4 = P2QImode;
13120 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
13121 break;
13122 case IX86_BUILTIN_2INTERSECTD256:
13123 mode4 = P2QImode;
13124 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
13125 break;
13126 case IX86_BUILTIN_2INTERSECTQ256:
13127 mode4 = P2QImode;
13128 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
13129 break;
13130 case IX86_BUILTIN_2INTERSECTD128:
13131 mode4 = P2QImode;
13132 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
13133 break;
13134 case IX86_BUILTIN_2INTERSECTQ128:
13135 mode4 = P2QImode;
13136 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
13137 break;
13138 default:
13139 gcc_unreachable ();
13140 }
13141
13142 mode2 = insn_data[icode].operand[1].mode;
13143 mode3 = insn_data[icode].operand[2].mode;
13144 if (!insn_data[icode].operand[1].predicate (op2, mode2))
13145 op2 = copy_to_mode_reg (mode2, op2);
13146 if (!insn_data[icode].operand[2].predicate (op3, mode3))
13147 op3 = copy_to_mode_reg (mode3, op3);
13148
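/* The vp2intersect patterns produce a pair of mask registers,
   modelled here as a single P2HImode/P2QImode pseudo; its low part is
   stored through arg0 and its high part through arg1.  */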
13149 op4 = gen_reg_rtx (mode4);
13150 emit_insn (GEN_FCN (icode) (op4, op2, op3));
13151 mode0 = mode4 == P2HImode ? HImode : QImode;
13152 emit_move_insn (gen_rtx_MEM (mode0, op0),
13153 gen_lowpart (mode0, op4));
13154 emit_move_insn (gen_rtx_MEM (mode0, op1),
13155 gen_highpart (mode0, op4));
13156
13157 return 0;
13158
13159 case IX86_BUILTIN_RDPMC:
13160 case IX86_BUILTIN_RDTSC:
13161 case IX86_BUILTIN_RDTSCP:
13162 case IX86_BUILTIN_XGETBV:
13163
13164 op0 = gen_reg_rtx (DImode);
13165 op1 = gen_reg_rtx (DImode);
13166
13167 if (fcode == IX86_BUILTIN_RDPMC)
13168 {
13169 arg0 = CALL_EXPR_ARG (exp, 0);
13170 op2 = expand_normal (arg0);
13171 if (!register_operand (op2, SImode))
13172 op2 = copy_to_mode_reg (SImode, op2);
13173
13174 insn = (TARGET_64BIT
13175 ? gen_rdpmc_rex64 (op0, op1, op2)
13176 : gen_rdpmc (op0, op2));
13177 emit_insn (insn);
13178 }
13179 else if (fcode == IX86_BUILTIN_XGETBV)
13180 {
13181 arg0 = CALL_EXPR_ARG (exp, 0);
13182 op2 = expand_normal (arg0);
13183 if (!register_operand (op2, SImode))
13184 op2 = copy_to_mode_reg (SImode, op2);
13185
13186 insn = (TARGET_64BIT
13187 ? gen_xgetbv_rex64 (op0, op1, op2)
13188 : gen_xgetbv (op0, op2));
13189 emit_insn (insn);
13190 }
13191 else if (fcode == IX86_BUILTIN_RDTSC)
13192 {
13193 insn = (TARGET_64BIT
13194 ? gen_rdtsc_rex64 (op0, op1)
13195 : gen_rdtsc (op0));
13196 emit_insn (insn);
13197 }
13198 else
13199 {
13200 op2 = gen_reg_rtx (SImode);
13201
13202 insn = (TARGET_64BIT
13203 ? gen_rdtscp_rex64 (op0, op1, op2)
13204 : gen_rdtscp (op0, op2));
13205 emit_insn (insn);
13206
13207 arg0 = CALL_EXPR_ARG (exp, 0);
13208 op4 = expand_normal (arg0);
13209 if (!address_operand (op4, VOIDmode))
13210 {
13211 op4 = convert_memory_address (Pmode, op4);
13212 op4 = copy_addr_to_reg (op4);
13213 }
13214 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13215 }
13216
13217 if (target == 0
13218 || !register_operand (target, DImode))
13219 target = gen_reg_rtx (DImode);
13220
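/* On 64-bit targets the result comes back as two zero-extended 32-bit
   halves; recombine them as op0 | (op1 << 32).  */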
13221 if (TARGET_64BIT)
13222 {
13223 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13224 op1, 1, OPTAB_DIRECT);
13225 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13226 op0, 1, OPTAB_DIRECT);
13227 }
13228
13229 emit_move_insn (target, op0);
13230 return target;
13231
13232 case IX86_BUILTIN_ENQCMD:
13233 case IX86_BUILTIN_ENQCMDS:
13234 case IX86_BUILTIN_MOVDIR64B:
13235
13236 arg0 = CALL_EXPR_ARG (exp, 0);
13237 arg1 = CALL_EXPR_ARG (exp, 1);
13238 op0 = expand_normal (arg0);
13239 op1 = expand_normal (arg1);
13240
13241 op0 = ix86_zero_extend_to_Pmode (op0);
13242 if (!address_operand (op1, VOIDmode))
13243 {
13244 op1 = convert_memory_address (Pmode, op1);
13245 op1 = copy_addr_to_reg (op1);
13246 }
13247 op1 = gen_rtx_MEM (XImode, op1);
13248
13249 if (fcode == IX86_BUILTIN_MOVDIR64B)
13250 {
13251 emit_insn (gen_movdir64b (Pmode, op0, op1));
13252 return 0;
13253 }
13254 else
13255 {
13256 if (target == 0
13257 || !register_operand (target, SImode))
13258 target = gen_reg_rtx (SImode);
13259
13260 emit_move_insn (target, const0_rtx);
13261 target = gen_rtx_SUBREG (QImode, target, 0);
13262
13263 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13264 ? UNSPECV_ENQCMD
13265 : UNSPECV_ENQCMDS);
13266 icode = code_for_enqcmd (unspecv, Pmode);
13267 emit_insn (GEN_FCN (icode) (op0, op1));
13268
13269 emit_insn
13270 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13271 gen_rtx_fmt_ee (EQ, QImode,
13272 gen_rtx_REG (CCZmode, FLAGS_REG),
13273 const0_rtx)));
13274 return SUBREG_REG (target);
13275 }
13276
13277 case IX86_BUILTIN_FXSAVE:
13278 case IX86_BUILTIN_FXRSTOR:
13279 case IX86_BUILTIN_FXSAVE64:
13280 case IX86_BUILTIN_FXRSTOR64:
13281 case IX86_BUILTIN_FNSTENV:
13282 case IX86_BUILTIN_FLDENV:
13283 mode0 = BLKmode;
13284 switch (fcode)
13285 {
13286 case IX86_BUILTIN_FXSAVE:
13287 icode = CODE_FOR_fxsave;
13288 break;
13289 case IX86_BUILTIN_FXRSTOR:
13290 icode = CODE_FOR_fxrstor;
13291 break;
13292 case IX86_BUILTIN_FXSAVE64:
13293 icode = CODE_FOR_fxsave64;
13294 break;
13295 case IX86_BUILTIN_FXRSTOR64:
13296 icode = CODE_FOR_fxrstor64;
13297 break;
13298 case IX86_BUILTIN_FNSTENV:
13299 icode = CODE_FOR_fnstenv;
13300 break;
13301 case IX86_BUILTIN_FLDENV:
13302 icode = CODE_FOR_fldenv;
13303 break;
13304 default:
13305 gcc_unreachable ();
13306 }
13307
13308 arg0 = CALL_EXPR_ARG (exp, 0);
13309 op0 = expand_normal (arg0);
13310
13311 if (!address_operand (op0, VOIDmode))
13312 {
13313 op0 = convert_memory_address (Pmode, op0);
13314 op0 = copy_addr_to_reg (op0);
13315 }
13316 op0 = gen_rtx_MEM (mode0, op0);
13317
13318 pat = GEN_FCN (icode) (op0);
13319 if (pat)
13320 emit_insn (pat);
13321 return 0;
13322
13323 case IX86_BUILTIN_XSETBV:
13324 arg0 = CALL_EXPR_ARG (exp, 0);
13325 arg1 = CALL_EXPR_ARG (exp, 1);
13326 op0 = expand_normal (arg0);
13327 op1 = expand_normal (arg1);
13328
13329 if (!REG_P (op0))
13330 op0 = copy_to_mode_reg (SImode, op0);
13331
13332 op1 = force_reg (DImode, op1);
13333
13334 if (TARGET_64BIT)
13335 {
13336 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13337 NULL, 1, OPTAB_DIRECT);
13338
13339 icode = CODE_FOR_xsetbv_rex64;
13340
13341 op2 = gen_lowpart (SImode, op2);
13342 op1 = gen_lowpart (SImode, op1);
13343 pat = GEN_FCN (icode) (op0, op1, op2);
13344 }
13345 else
13346 {
13347 icode = CODE_FOR_xsetbv;
13348
13349 pat = GEN_FCN (icode) (op0, op1);
13350 }
13351 if (pat)
13352 emit_insn (pat);
13353 return 0;
13354
13355 case IX86_BUILTIN_XSAVE:
13356 case IX86_BUILTIN_XRSTOR:
13357 case IX86_BUILTIN_XSAVE64:
13358 case IX86_BUILTIN_XRSTOR64:
13359 case IX86_BUILTIN_XSAVEOPT:
13360 case IX86_BUILTIN_XSAVEOPT64:
13361 case IX86_BUILTIN_XSAVES:
13362 case IX86_BUILTIN_XRSTORS:
13363 case IX86_BUILTIN_XSAVES64:
13364 case IX86_BUILTIN_XRSTORS64:
13365 case IX86_BUILTIN_XSAVEC:
13366 case IX86_BUILTIN_XSAVEC64:
13367 arg0 = CALL_EXPR_ARG (exp, 0);
13368 arg1 = CALL_EXPR_ARG (exp, 1);
13369 op0 = expand_normal (arg0);
13370 op1 = expand_normal (arg1);
13371
13372 if (!address_operand (op0, VOIDmode))
13373 {
13374 op0 = convert_memory_address (Pmode, op0);
13375 op0 = copy_addr_to_reg (op0);
13376 }
13377 op0 = gen_rtx_MEM (BLKmode, op0);
13378
13379 op1 = force_reg (DImode, op1);
13380
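/* These patterns take the 64-bit feature mask split into its low and
   high 32-bit halves (the EDX:EAX pair expected by the instructions).  */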
13381 if (TARGET_64BIT)
13382 {
13383 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13384 NULL, 1, OPTAB_DIRECT);
13385 switch (fcode)
13386 {
13387 case IX86_BUILTIN_XSAVE:
13388 icode = CODE_FOR_xsave_rex64;
13389 break;
13390 case IX86_BUILTIN_XRSTOR:
13391 icode = CODE_FOR_xrstor_rex64;
13392 break;
13393 case IX86_BUILTIN_XSAVE64:
13394 icode = CODE_FOR_xsave64;
13395 break;
13396 case IX86_BUILTIN_XRSTOR64:
13397 icode = CODE_FOR_xrstor64;
13398 break;
13399 case IX86_BUILTIN_XSAVEOPT:
13400 icode = CODE_FOR_xsaveopt_rex64;
13401 break;
13402 case IX86_BUILTIN_XSAVEOPT64:
13403 icode = CODE_FOR_xsaveopt64;
13404 break;
13405 case IX86_BUILTIN_XSAVES:
13406 icode = CODE_FOR_xsaves_rex64;
13407 break;
13408 case IX86_BUILTIN_XRSTORS:
13409 icode = CODE_FOR_xrstors_rex64;
13410 break;
13411 case IX86_BUILTIN_XSAVES64:
13412 icode = CODE_FOR_xsaves64;
13413 break;
13414 case IX86_BUILTIN_XRSTORS64:
13415 icode = CODE_FOR_xrstors64;
13416 break;
13417 case IX86_BUILTIN_XSAVEC:
13418 icode = CODE_FOR_xsavec_rex64;
13419 break;
13420 case IX86_BUILTIN_XSAVEC64:
13421 icode = CODE_FOR_xsavec64;
13422 break;
13423 default:
13424 gcc_unreachable ();
13425 }
13426
13427 op2 = gen_lowpart (SImode, op2);
13428 op1 = gen_lowpart (SImode, op1);
13429 pat = GEN_FCN (icode) (op0, op1, op2);
13430 }
13431 else
13432 {
13433 switch (fcode)
13434 {
13435 case IX86_BUILTIN_XSAVE:
13436 icode = CODE_FOR_xsave;
13437 break;
13438 case IX86_BUILTIN_XRSTOR:
13439 icode = CODE_FOR_xrstor;
13440 break;
13441 case IX86_BUILTIN_XSAVEOPT:
13442 icode = CODE_FOR_xsaveopt;
13443 break;
13444 case IX86_BUILTIN_XSAVES:
13445 icode = CODE_FOR_xsaves;
13446 break;
13447 case IX86_BUILTIN_XRSTORS:
13448 icode = CODE_FOR_xrstors;
13449 break;
13450 case IX86_BUILTIN_XSAVEC:
13451 icode = CODE_FOR_xsavec;
13452 break;
13453 default:
13454 gcc_unreachable ();
13455 }
13456 pat = GEN_FCN (icode) (op0, op1);
13457 }
13458
13459 if (pat)
13460 emit_insn (pat);
13461 return 0;
13462
13463 case IX86_BUILTIN_LLWPCB:
13464 arg0 = CALL_EXPR_ARG (exp, 0);
13465 op0 = expand_normal (arg0);
13466
13467 if (!register_operand (op0, Pmode))
13468 op0 = ix86_zero_extend_to_Pmode (op0);
13469 emit_insn (gen_lwp_llwpcb (Pmode, op0));
13470 return 0;
13471
13472 case IX86_BUILTIN_SLWPCB:
13473 if (!target
13474 || !register_operand (target, Pmode))
13475 target = gen_reg_rtx (Pmode);
13476 emit_insn (gen_lwp_slwpcb (Pmode, target));
13477 return target;
13478
13479 case IX86_BUILTIN_LWPVAL32:
13480 case IX86_BUILTIN_LWPVAL64:
13481 case IX86_BUILTIN_LWPINS32:
13482 case IX86_BUILTIN_LWPINS64:
13483 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13484 || fcode == IX86_BUILTIN_LWPINS32)
13485 ? SImode : DImode);
13486
13487 if (fcode == IX86_BUILTIN_LWPVAL32
13488 || fcode == IX86_BUILTIN_LWPVAL64)
13489 icode = code_for_lwp_lwpval (mode);
13490 else
13491 icode = code_for_lwp_lwpins (mode);
13492
13493 arg0 = CALL_EXPR_ARG (exp, 0);
13494 arg1 = CALL_EXPR_ARG (exp, 1);
13495 arg2 = CALL_EXPR_ARG (exp, 2);
13496 op0 = expand_normal (arg0);
13497 op1 = expand_normal (arg1);
13498 op2 = expand_normal (arg2);
13499 mode0 = insn_data[icode].operand[0].mode;
13500
13501 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13502 op0 = copy_to_mode_reg (mode0, op0);
13503 if (!insn_data[icode].operand[1].predicate (op1, SImode))
13504 op1 = copy_to_mode_reg (SImode, op1);
13505
13506 if (!CONST_INT_P (op2))
13507 {
13508 error ("the last argument must be a 32-bit immediate");
13509 return const0_rtx;
13510 }
13511
13512 emit_insn (GEN_FCN (icode) (op0, op1, op2));
13513
13514 if (fcode == IX86_BUILTIN_LWPINS32
13515 || fcode == IX86_BUILTIN_LWPINS64)
13516 {
13517 if (target == 0
13518 || !nonimmediate_operand (target, QImode))
13519 target = gen_reg_rtx (QImode);
13520
13521 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13522 const0_rtx);
13523 emit_insn (gen_rtx_SET (target, pat));
13524
13525 return target;
13526 }
13527 else
13528 return 0;
13529
13530 case IX86_BUILTIN_BEXTRI32:
13531 case IX86_BUILTIN_BEXTRI64:
13532 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13533
13534 arg0 = CALL_EXPR_ARG (exp, 0);
13535 arg1 = CALL_EXPR_ARG (exp, 1);
13536 op0 = expand_normal (arg0);
13537 op1 = expand_normal (arg1);
13538
13539 if (!CONST_INT_P (op1))
13540 {
13541 error ("last argument must be an immediate");
13542 return const0_rtx;
13543 }
13544 else
13545 {
13546 unsigned char lsb_index = UINTVAL (op1);
13547 unsigned char length = UINTVAL (op1) >> 8;
13548
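/* The control operand packs the start bit in bits [7:0] and the field
   length in bits [15:8]; e.g. 0x0804 extracts an 8-bit field starting
   at bit 4.  A zero length or an out-of-range start folds to a
   constant zero, and over-long fields are clamped below.  */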
13549 unsigned char bitsize = GET_MODE_BITSIZE (mode);
13550
13551 icode = code_for_tbm_bextri (mode);
13552
13553 mode1 = insn_data[icode].operand[1].mode;
13554 if (!insn_data[icode].operand[1].predicate (op0, mode1))
13555 op0 = copy_to_mode_reg (mode1, op0);
13556
13557 mode0 = insn_data[icode].operand[0].mode;
13558 if (target == 0
13559 || !register_operand (target, mode0))
13560 target = gen_reg_rtx (mode0);
13561
13562 if (length == 0 || lsb_index >= bitsize)
13563 {
13564 emit_move_insn (target, const0_rtx);
13565 return target;
13566 }
13567
13568 if (length + lsb_index > bitsize)
13569 length = bitsize - lsb_index;
13570
13571 op1 = GEN_INT (length);
13572 op2 = GEN_INT (lsb_index);
13573
13574 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13575 return target;
13576 }
13577
13578 case IX86_BUILTIN_RDRAND16_STEP:
13579 mode = HImode;
13580 goto rdrand_step;
13581
13582 case IX86_BUILTIN_RDRAND32_STEP:
13583 mode = SImode;
13584 goto rdrand_step;
13585
13586 case IX86_BUILTIN_RDRAND64_STEP:
13587 mode = DImode;
13588
13589 rdrand_step:
13590 arg0 = CALL_EXPR_ARG (exp, 0);
13591 op1 = expand_normal (arg0);
13592 if (!address_operand (op1, VOIDmode))
13593 {
13594 op1 = convert_memory_address (Pmode, op1);
13595 op1 = copy_addr_to_reg (op1);
13596 }
13597
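/* Illustrative sketch (assuming the immintrin.h spelling):

       int ok = _rdrand32_step (&val);

   rdrand sets CF on success and zeroes its destination on failure, so
   the conditional move below yields 1 on success and 0 on failure,
   while the value itself is stored through arg0.  */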
13598 op0 = gen_reg_rtx (mode);
13599 emit_insn (gen_rdrand (mode, op0));
13600
13601 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13602
13603 op1 = force_reg (SImode, const1_rtx);
13604
13605 /* Emit SImode conditional move. */
13606 if (mode == HImode)
13607 {
13608 if (TARGET_ZERO_EXTEND_WITH_AND
13609 && optimize_function_for_speed_p (cfun))
13610 {
13611 op2 = force_reg (SImode, const0_rtx);
13612
13613 emit_insn (gen_movstricthi
13614 (gen_lowpart (HImode, op2), op0));
13615 }
13616 else
13617 {
13618 op2 = gen_reg_rtx (SImode);
13619
13620 emit_insn (gen_zero_extendhisi2 (op2, op0));
13621 }
13622 }
13623 else if (mode == SImode)
13624 op2 = op0;
13625 else
13626 op2 = gen_rtx_SUBREG (SImode, op0, 0);
13627
13628 if (target == 0
13629 || !register_operand (target, SImode))
13630 target = gen_reg_rtx (SImode);
13631
13632 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
13633 const0_rtx);
13634 emit_insn (gen_rtx_SET (target,
13635 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
13636 return target;
13637
13638 case IX86_BUILTIN_RDSEED16_STEP:
13639 mode = HImode;
13640 goto rdseed_step;
13641
13642 case IX86_BUILTIN_RDSEED32_STEP:
13643 mode = SImode;
13644 goto rdseed_step;
13645
13646 case IX86_BUILTIN_RDSEED64_STEP:
13647 mode = DImode;
13648
13649 rdseed_step:
13650 arg0 = CALL_EXPR_ARG (exp, 0);
13651 op1 = expand_normal (arg0);
13652 if (!address_operand (op1, VOIDmode))
13653 {
13654 op1 = convert_memory_address (Pmode, op1);
13655 op1 = copy_addr_to_reg (op1);
13656 }
13657
13658 op0 = gen_reg_rtx (mode);
13659 emit_insn (gen_rdseed (mode, op0));
13660
13661 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13662
13663 op2 = gen_reg_rtx (QImode);
13664
13665 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13666 const0_rtx);
13667 emit_insn (gen_rtx_SET (op2, pat));
13668
13669 if (target == 0
13670 || !register_operand (target, SImode))
13671 target = gen_reg_rtx (SImode);
13672
13673 emit_insn (gen_zero_extendqisi2 (target, op2));
13674 return target;
13675
13676 case IX86_BUILTIN_SBB32:
13677 icode = CODE_FOR_subborrowsi;
13678 icode2 = CODE_FOR_subborrowsi_0;
13679 mode0 = SImode;
13680 mode1 = DImode;
13681 mode2 = CCmode;
13682 goto handlecarry;
13683
13684 case IX86_BUILTIN_SBB64:
13685 icode = CODE_FOR_subborrowdi;
13686 icode2 = CODE_FOR_subborrowdi_0;
13687 mode0 = DImode;
13688 mode1 = TImode;
13689 mode2 = CCmode;
13690 goto handlecarry;
13691
13692 case IX86_BUILTIN_ADDCARRYX32:
13693 icode = CODE_FOR_addcarrysi;
13694 icode2 = CODE_FOR_addcarrysi_0;
13695 mode0 = SImode;
13696 mode1 = DImode;
13697 mode2 = CCCmode;
13698 goto handlecarry;
13699
13700 case IX86_BUILTIN_ADDCARRYX64:
13701 icode = CODE_FOR_addcarrydi;
13702 icode2 = CODE_FOR_addcarrydi_0;
13703 mode0 = DImode;
13704 mode1 = TImode;
13705 mode2 = CCCmode;
13706
13707 handlecarry:
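/* Illustrative sketch (assuming the adxintrin.h spelling):

       unsigned char c_out = _addcarryx_u32 (c_in, a, b, &sum);

   i.e. *sum_out gets src1 + src2 + c_in (or src1 - src2 - c_in for
   the borrow variants) and the resulting carry/borrow flag is
   returned.  */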
13708 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
13709 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
13710 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
13711 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
13712
13713 op1 = expand_normal (arg0);
13714 if (!integer_zerop (arg0))
13715 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
13716
13717 op2 = expand_normal (arg1);
13718 if (!register_operand (op2, mode0))
13719 op2 = copy_to_mode_reg (mode0, op2);
13720
13721 op3 = expand_normal (arg2);
13722 if (!register_operand (op3, mode0))
13723 op3 = copy_to_mode_reg (mode0, op3);
13724
13725 op4 = expand_normal (arg3);
13726 if (!address_operand (op4, VOIDmode))
13727 {
13728 op4 = convert_memory_address (Pmode, op4);
13729 op4 = copy_addr_to_reg (op4);
13730 }
13731
13732 op0 = gen_reg_rtx (mode0);
13733 if (integer_zerop (arg0))
13734 {
13735 /* If arg0 is 0, optimize right away into an add or sub
13736 instruction that sets the CCCmode flags. */
13737 op1 = gen_rtx_REG (mode2, FLAGS_REG);
13738 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
13739 }
13740 else
13741 {
13742 /* Generate CF from input operand. */
13743 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
13744
13745 /* Generate instruction that consumes CF. */
13746 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
13747 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
13748 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
13749 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
13750 }
13751
13752 /* Return current CF value. */
13753 if (target == 0)
13754 target = gen_reg_rtx (QImode);
13755
13756 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
13757 emit_insn (gen_rtx_SET (target, pat));
13758
13759 /* Store the result. */
13760 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
13761
13762 return target;
13763
13764 case IX86_BUILTIN_READ_FLAGS:
13765 if (ignore)
13766 return const0_rtx;
13767
13768 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
13769
13770 if (optimize
13771 || target == NULL_RTX
13772 || !nonimmediate_operand (target, word_mode)
13773 || GET_MODE (target) != word_mode)
13774 target = gen_reg_rtx (word_mode);
13775
13776 emit_insn (gen_pop (target));
13777 return target;
13778
13779 case IX86_BUILTIN_WRITE_FLAGS:
13780
13781 arg0 = CALL_EXPR_ARG (exp, 0);
13782 op0 = expand_normal (arg0);
13783 if (!general_no_elim_operand (op0, word_mode))
13784 op0 = copy_to_mode_reg (word_mode, op0);
13785
13786 emit_insn (gen_push (op0));
13787 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
13788 return 0;
13789
13790 case IX86_BUILTIN_KTESTC8:
13791 icode = CODE_FOR_ktestqi;
13792 mode3 = CCCmode;
13793 goto kortest;
13794
13795 case IX86_BUILTIN_KTESTZ8:
13796 icode = CODE_FOR_ktestqi;
13797 mode3 = CCZmode;
13798 goto kortest;
13799
13800 case IX86_BUILTIN_KTESTC16:
13801 icode = CODE_FOR_ktesthi;
13802 mode3 = CCCmode;
13803 goto kortest;
13804
13805 case IX86_BUILTIN_KTESTZ16:
13806 icode = CODE_FOR_ktesthi;
13807 mode3 = CCZmode;
13808 goto kortest;
13809
13810 case IX86_BUILTIN_KTESTC32:
13811 icode = CODE_FOR_ktestsi;
13812 mode3 = CCCmode;
13813 goto kortest;
13814
13815 case IX86_BUILTIN_KTESTZ32:
13816 icode = CODE_FOR_ktestsi;
13817 mode3 = CCZmode;
13818 goto kortest;
13819
13820 case IX86_BUILTIN_KTESTC64:
13821 icode = CODE_FOR_ktestdi;
13822 mode3 = CCCmode;
13823 goto kortest;
13824
13825 case IX86_BUILTIN_KTESTZ64:
13826 icode = CODE_FOR_ktestdi;
13827 mode3 = CCZmode;
13828 goto kortest;
13829
13830 case IX86_BUILTIN_KORTESTC8:
13831 icode = CODE_FOR_kortestqi;
13832 mode3 = CCCmode;
13833 goto kortest;
13834
13835 case IX86_BUILTIN_KORTESTZ8:
13836 icode = CODE_FOR_kortestqi;
13837 mode3 = CCZmode;
13838 goto kortest;
13839
13840 case IX86_BUILTIN_KORTESTC16:
13841 icode = CODE_FOR_kortesthi;
13842 mode3 = CCCmode;
13843 goto kortest;
13844
13845 case IX86_BUILTIN_KORTESTZ16:
13846 icode = CODE_FOR_kortesthi;
13847 mode3 = CCZmode;
13848 goto kortest;
13849
13850 case IX86_BUILTIN_KORTESTC32:
13851 icode = CODE_FOR_kortestsi;
13852 mode3 = CCCmode;
13853 goto kortest;
13854
13855 case IX86_BUILTIN_KORTESTZ32:
13856 icode = CODE_FOR_kortestsi;
13857 mode3 = CCZmode;
13858 goto kortest;
13859
13860 case IX86_BUILTIN_KORTESTC64:
13861 icode = CODE_FOR_kortestdi;
13862 mode3 = CCCmode;
13863 goto kortest;
13864
13865 case IX86_BUILTIN_KORTESTZ64:
13866 icode = CODE_FOR_kortestdi;
13867 mode3 = CCZmode;
13868
13869 kortest:
13870 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
13871 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
13872 op0 = expand_normal (arg0);
13873 op1 = expand_normal (arg1);
13874
13875 mode0 = insn_data[icode].operand[0].mode;
13876 mode1 = insn_data[icode].operand[1].mode;
13877
13878 if (GET_MODE (op0) != VOIDmode)
13879 op0 = force_reg (GET_MODE (op0), op0);
13880
13881 op0 = gen_lowpart (mode0, op0);
13882
13883 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13884 op0 = copy_to_mode_reg (mode0, op0);
13885
13886 if (GET_MODE (op1) != VOIDmode)
13887 op1 = force_reg (GET_MODE (op1), op1);
13888
13889 op1 = gen_lowpart (mode1, op1);
13890
13891 if (!insn_data[icode].operand[1].predicate (op1, mode1))
13892 op1 = copy_to_mode_reg (mode1, op1);
13893
13894 target = gen_reg_rtx (QImode);
13895
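/* mode3 (CCCmode or CCZmode) selects whether the carry or the zero
   flag produced by the kortest/ktest instruction is examined, so the
   *C builtins return CF and the *Z builtins return ZF.  */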
13896 /* Emit kortest. */
13897 emit_insn (GEN_FCN (icode) (op0, op1));
13898 /* And use setcc to return result from flags. */
13899 ix86_expand_setcc (target, EQ,
13900 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
13901 return target;
13902
13903 case IX86_BUILTIN_GATHERSIV2DF:
13904 icode = CODE_FOR_avx2_gathersiv2df;
13905 goto gather_gen;
13906 case IX86_BUILTIN_GATHERSIV4DF:
13907 icode = CODE_FOR_avx2_gathersiv4df;
13908 goto gather_gen;
13909 case IX86_BUILTIN_GATHERDIV2DF:
13910 icode = CODE_FOR_avx2_gatherdiv2df;
13911 goto gather_gen;
13912 case IX86_BUILTIN_GATHERDIV4DF:
13913 icode = CODE_FOR_avx2_gatherdiv4df;
13914 goto gather_gen;
13915 case IX86_BUILTIN_GATHERSIV4SF:
13916 icode = CODE_FOR_avx2_gathersiv4sf;
13917 goto gather_gen;
13918 case IX86_BUILTIN_GATHERSIV8SF:
13919 icode = CODE_FOR_avx2_gathersiv8sf;
13920 goto gather_gen;
13921 case IX86_BUILTIN_GATHERDIV4SF:
13922 icode = CODE_FOR_avx2_gatherdiv4sf;
13923 goto gather_gen;
13924 case IX86_BUILTIN_GATHERDIV8SF:
13925 icode = CODE_FOR_avx2_gatherdiv8sf;
13926 goto gather_gen;
13927 case IX86_BUILTIN_GATHERSIV2DI:
13928 icode = CODE_FOR_avx2_gathersiv2di;
13929 goto gather_gen;
13930 case IX86_BUILTIN_GATHERSIV4DI:
13931 icode = CODE_FOR_avx2_gathersiv4di;
13932 goto gather_gen;
13933 case IX86_BUILTIN_GATHERDIV2DI:
13934 icode = CODE_FOR_avx2_gatherdiv2di;
13935 goto gather_gen;
13936 case IX86_BUILTIN_GATHERDIV4DI:
13937 icode = CODE_FOR_avx2_gatherdiv4di;
13938 goto gather_gen;
13939 case IX86_BUILTIN_GATHERSIV4SI:
13940 icode = CODE_FOR_avx2_gathersiv4si;
13941 goto gather_gen;
13942 case IX86_BUILTIN_GATHERSIV8SI:
13943 icode = CODE_FOR_avx2_gathersiv8si;
13944 goto gather_gen;
13945 case IX86_BUILTIN_GATHERDIV4SI:
13946 icode = CODE_FOR_avx2_gatherdiv4si;
13947 goto gather_gen;
13948 case IX86_BUILTIN_GATHERDIV8SI:
13949 icode = CODE_FOR_avx2_gatherdiv8si;
13950 goto gather_gen;
13951 case IX86_BUILTIN_GATHERALTSIV4DF:
13952 icode = CODE_FOR_avx2_gathersiv4df;
13953 goto gather_gen;
13954 case IX86_BUILTIN_GATHERALTDIV8SF:
13955 icode = CODE_FOR_avx2_gatherdiv8sf;
13956 goto gather_gen;
13957 case IX86_BUILTIN_GATHERALTSIV4DI:
13958 icode = CODE_FOR_avx2_gathersiv4di;
13959 goto gather_gen;
13960 case IX86_BUILTIN_GATHERALTDIV8SI:
13961 icode = CODE_FOR_avx2_gatherdiv8si;
13962 goto gather_gen;
13963 case IX86_BUILTIN_GATHER3SIV16SF:
13964 icode = CODE_FOR_avx512f_gathersiv16sf;
13965 goto gather_gen;
13966 case IX86_BUILTIN_GATHER3SIV8DF:
13967 icode = CODE_FOR_avx512f_gathersiv8df;
13968 goto gather_gen;
13969 case IX86_BUILTIN_GATHER3DIV16SF:
13970 icode = CODE_FOR_avx512f_gatherdiv16sf;
13971 goto gather_gen;
13972 case IX86_BUILTIN_GATHER3DIV8DF:
13973 icode = CODE_FOR_avx512f_gatherdiv8df;
13974 goto gather_gen;
13975 case IX86_BUILTIN_GATHER3SIV16SI:
13976 icode = CODE_FOR_avx512f_gathersiv16si;
13977 goto gather_gen;
13978 case IX86_BUILTIN_GATHER3SIV8DI:
13979 icode = CODE_FOR_avx512f_gathersiv8di;
13980 goto gather_gen;
13981 case IX86_BUILTIN_GATHER3DIV16SI:
13982 icode = CODE_FOR_avx512f_gatherdiv16si;
13983 goto gather_gen;
13984 case IX86_BUILTIN_GATHER3DIV8DI:
13985 icode = CODE_FOR_avx512f_gatherdiv8di;
13986 goto gather_gen;
13987 case IX86_BUILTIN_GATHER3ALTSIV8DF:
13988 icode = CODE_FOR_avx512f_gathersiv8df;
13989 goto gather_gen;
13990 case IX86_BUILTIN_GATHER3ALTDIV16SF:
13991 icode = CODE_FOR_avx512f_gatherdiv16sf;
13992 goto gather_gen;
13993 case IX86_BUILTIN_GATHER3ALTSIV8DI:
13994 icode = CODE_FOR_avx512f_gathersiv8di;
13995 goto gather_gen;
13996 case IX86_BUILTIN_GATHER3ALTDIV16SI:
13997 icode = CODE_FOR_avx512f_gatherdiv16si;
13998 goto gather_gen;
13999 case IX86_BUILTIN_GATHER3SIV2DF:
14000 icode = CODE_FOR_avx512vl_gathersiv2df;
14001 goto gather_gen;
14002 case IX86_BUILTIN_GATHER3SIV4DF:
14003 icode = CODE_FOR_avx512vl_gathersiv4df;
14004 goto gather_gen;
14005 case IX86_BUILTIN_GATHER3DIV2DF:
14006 icode = CODE_FOR_avx512vl_gatherdiv2df;
14007 goto gather_gen;
14008 case IX86_BUILTIN_GATHER3DIV4DF:
14009 icode = CODE_FOR_avx512vl_gatherdiv4df;
14010 goto gather_gen;
14011 case IX86_BUILTIN_GATHER3SIV4SF:
14012 icode = CODE_FOR_avx512vl_gathersiv4sf;
14013 goto gather_gen;
14014 case IX86_BUILTIN_GATHER3SIV8SF:
14015 icode = CODE_FOR_avx512vl_gathersiv8sf;
14016 goto gather_gen;
14017 case IX86_BUILTIN_GATHER3DIV4SF:
14018 icode = CODE_FOR_avx512vl_gatherdiv4sf;
14019 goto gather_gen;
14020 case IX86_BUILTIN_GATHER3DIV8SF:
14021 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14022 goto gather_gen;
14023 case IX86_BUILTIN_GATHER3SIV2DI:
14024 icode = CODE_FOR_avx512vl_gathersiv2di;
14025 goto gather_gen;
14026 case IX86_BUILTIN_GATHER3SIV4DI:
14027 icode = CODE_FOR_avx512vl_gathersiv4di;
14028 goto gather_gen;
14029 case IX86_BUILTIN_GATHER3DIV2DI:
14030 icode = CODE_FOR_avx512vl_gatherdiv2di;
14031 goto gather_gen;
14032 case IX86_BUILTIN_GATHER3DIV4DI:
14033 icode = CODE_FOR_avx512vl_gatherdiv4di;
14034 goto gather_gen;
14035 case IX86_BUILTIN_GATHER3SIV4SI:
14036 icode = CODE_FOR_avx512vl_gathersiv4si;
14037 goto gather_gen;
14038 case IX86_BUILTIN_GATHER3SIV8SI:
14039 icode = CODE_FOR_avx512vl_gathersiv8si;
14040 goto gather_gen;
14041 case IX86_BUILTIN_GATHER3DIV4SI:
14042 icode = CODE_FOR_avx512vl_gatherdiv4si;
14043 goto gather_gen;
14044 case IX86_BUILTIN_GATHER3DIV8SI:
14045 icode = CODE_FOR_avx512vl_gatherdiv8si;
14046 goto gather_gen;
14047 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14048 icode = CODE_FOR_avx512vl_gathersiv4df;
14049 goto gather_gen;
14050 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14051 icode = CODE_FOR_avx512vl_gatherdiv8sf;
14052 goto gather_gen;
14053 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14054 icode = CODE_FOR_avx512vl_gathersiv4di;
14055 goto gather_gen;
14056 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14057 icode = CODE_FOR_avx512vl_gatherdiv8si;
14058 goto gather_gen;
14059 case IX86_BUILTIN_SCATTERSIV16SF:
14060 icode = CODE_FOR_avx512f_scattersiv16sf;
14061 goto scatter_gen;
14062 case IX86_BUILTIN_SCATTERSIV8DF:
14063 icode = CODE_FOR_avx512f_scattersiv8df;
14064 goto scatter_gen;
14065 case IX86_BUILTIN_SCATTERDIV16SF:
14066 icode = CODE_FOR_avx512f_scatterdiv16sf;
14067 goto scatter_gen;
14068 case IX86_BUILTIN_SCATTERDIV8DF:
14069 icode = CODE_FOR_avx512f_scatterdiv8df;
14070 goto scatter_gen;
14071 case IX86_BUILTIN_SCATTERSIV16SI:
14072 icode = CODE_FOR_avx512f_scattersiv16si;
14073 goto scatter_gen;
14074 case IX86_BUILTIN_SCATTERSIV8DI:
14075 icode = CODE_FOR_avx512f_scattersiv8di;
14076 goto scatter_gen;
14077 case IX86_BUILTIN_SCATTERDIV16SI:
14078 icode = CODE_FOR_avx512f_scatterdiv16si;
14079 goto scatter_gen;
14080 case IX86_BUILTIN_SCATTERDIV8DI:
14081 icode = CODE_FOR_avx512f_scatterdiv8di;
14082 goto scatter_gen;
14083 case IX86_BUILTIN_SCATTERSIV8SF:
14084 icode = CODE_FOR_avx512vl_scattersiv8sf;
14085 goto scatter_gen;
14086 case IX86_BUILTIN_SCATTERSIV4SF:
14087 icode = CODE_FOR_avx512vl_scattersiv4sf;
14088 goto scatter_gen;
14089 case IX86_BUILTIN_SCATTERSIV4DF:
14090 icode = CODE_FOR_avx512vl_scattersiv4df;
14091 goto scatter_gen;
14092 case IX86_BUILTIN_SCATTERSIV2DF:
14093 icode = CODE_FOR_avx512vl_scattersiv2df;
14094 goto scatter_gen;
14095 case IX86_BUILTIN_SCATTERDIV8SF:
14096 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14097 goto scatter_gen;
14098 case IX86_BUILTIN_SCATTERDIV4SF:
14099 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14100 goto scatter_gen;
14101 case IX86_BUILTIN_SCATTERDIV4DF:
14102 icode = CODE_FOR_avx512vl_scatterdiv4df;
14103 goto scatter_gen;
14104 case IX86_BUILTIN_SCATTERDIV2DF:
14105 icode = CODE_FOR_avx512vl_scatterdiv2df;
14106 goto scatter_gen;
14107 case IX86_BUILTIN_SCATTERSIV8SI:
14108 icode = CODE_FOR_avx512vl_scattersiv8si;
14109 goto scatter_gen;
14110 case IX86_BUILTIN_SCATTERSIV4SI:
14111 icode = CODE_FOR_avx512vl_scattersiv4si;
14112 goto scatter_gen;
14113 case IX86_BUILTIN_SCATTERSIV4DI:
14114 icode = CODE_FOR_avx512vl_scattersiv4di;
14115 goto scatter_gen;
14116 case IX86_BUILTIN_SCATTERSIV2DI:
14117 icode = CODE_FOR_avx512vl_scattersiv2di;
14118 goto scatter_gen;
14119 case IX86_BUILTIN_SCATTERDIV8SI:
14120 icode = CODE_FOR_avx512vl_scatterdiv8si;
14121 goto scatter_gen;
14122 case IX86_BUILTIN_SCATTERDIV4SI:
14123 icode = CODE_FOR_avx512vl_scatterdiv4si;
14124 goto scatter_gen;
14125 case IX86_BUILTIN_SCATTERDIV4DI:
14126 icode = CODE_FOR_avx512vl_scatterdiv4di;
14127 goto scatter_gen;
14128 case IX86_BUILTIN_SCATTERDIV2DI:
14129 icode = CODE_FOR_avx512vl_scatterdiv2di;
14130 goto scatter_gen;
14131 case IX86_BUILTIN_GATHERPFDPD:
14132 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
14133 goto vec_prefetch_gen;
14134 case IX86_BUILTIN_SCATTERALTSIV8DF:
14135 icode = CODE_FOR_avx512f_scattersiv8df;
14136 goto scatter_gen;
14137 case IX86_BUILTIN_SCATTERALTDIV16SF:
14138 icode = CODE_FOR_avx512f_scatterdiv16sf;
14139 goto scatter_gen;
14140 case IX86_BUILTIN_SCATTERALTSIV8DI:
14141 icode = CODE_FOR_avx512f_scattersiv8di;
14142 goto scatter_gen;
14143 case IX86_BUILTIN_SCATTERALTDIV16SI:
14144 icode = CODE_FOR_avx512f_scatterdiv16si;
14145 goto scatter_gen;
14146 case IX86_BUILTIN_SCATTERALTSIV4DF:
14147 icode = CODE_FOR_avx512vl_scattersiv4df;
14148 goto scatter_gen;
14149 case IX86_BUILTIN_SCATTERALTDIV8SF:
14150 icode = CODE_FOR_avx512vl_scatterdiv8sf;
14151 goto scatter_gen;
14152 case IX86_BUILTIN_SCATTERALTSIV4DI:
14153 icode = CODE_FOR_avx512vl_scattersiv4di;
14154 goto scatter_gen;
14155 case IX86_BUILTIN_SCATTERALTDIV8SI:
14156 icode = CODE_FOR_avx512vl_scatterdiv8si;
14157 goto scatter_gen;
14158 case IX86_BUILTIN_SCATTERALTSIV2DF:
14159 icode = CODE_FOR_avx512vl_scattersiv2df;
14160 goto scatter_gen;
14161 case IX86_BUILTIN_SCATTERALTDIV4SF:
14162 icode = CODE_FOR_avx512vl_scatterdiv4sf;
14163 goto scatter_gen;
14164 case IX86_BUILTIN_SCATTERALTSIV2DI:
14165 icode = CODE_FOR_avx512vl_scattersiv2di;
14166 goto scatter_gen;
14167 case IX86_BUILTIN_SCATTERALTDIV4SI:
14168 icode = CODE_FOR_avx512vl_scatterdiv4si;
14169 goto scatter_gen;
14170 case IX86_BUILTIN_GATHERPFDPS:
14171 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
14172 goto vec_prefetch_gen;
14173 case IX86_BUILTIN_GATHERPFQPD:
14174 icode = CODE_FOR_avx512pf_gatherpfv8didf;
14175 goto vec_prefetch_gen;
14176 case IX86_BUILTIN_GATHERPFQPS:
14177 icode = CODE_FOR_avx512pf_gatherpfv8disf;
14178 goto vec_prefetch_gen;
14179 case IX86_BUILTIN_SCATTERPFDPD:
14180 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
14181 goto vec_prefetch_gen;
14182 case IX86_BUILTIN_SCATTERPFDPS:
14183 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
14184 goto vec_prefetch_gen;
14185 case IX86_BUILTIN_SCATTERPFQPD:
14186 icode = CODE_FOR_avx512pf_scatterpfv8didf;
14187 goto vec_prefetch_gen;
14188 case IX86_BUILTIN_SCATTERPFQPS:
14189 icode = CODE_FOR_avx512pf_scatterpfv8disf;
14190 goto vec_prefetch_gen;
14191
14192 gather_gen:
14193 rtx half;
14194 rtx (*gen) (rtx, rtx);
14195
14196 arg0 = CALL_EXPR_ARG (exp, 0);
14197 arg1 = CALL_EXPR_ARG (exp, 1);
14198 arg2 = CALL_EXPR_ARG (exp, 2);
14199 arg3 = CALL_EXPR_ARG (exp, 3);
14200 arg4 = CALL_EXPR_ARG (exp, 4);
14201 op0 = expand_normal (arg0);
14202 op1 = expand_normal (arg1);
14203 op2 = expand_normal (arg2);
14204 op3 = expand_normal (arg3);
14205 op4 = expand_normal (arg4);
14206 /* Note the arg order is different from the operand order. */
14207 mode0 = insn_data[icode].operand[1].mode;
14208 mode2 = insn_data[icode].operand[3].mode;
14209 mode3 = insn_data[icode].operand[4].mode;
14210 mode4 = insn_data[icode].operand[5].mode;
14211
14212 if (target == NULL_RTX
14213 || GET_MODE (target) != insn_data[icode].operand[0].mode
14214 || !insn_data[icode].operand[0].predicate (target,
14215 GET_MODE (target)))
14216 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
14217 else
14218 subtarget = target;
14219
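/* For the *ALT* gather variants the index vector and the
   destination/mask have different element counts; extract the low
   half of the wider operand so the operand modes match the
   underlying gather pattern.  */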
14220 switch (fcode)
14221 {
14222 case IX86_BUILTIN_GATHER3ALTSIV8DF:
14223 case IX86_BUILTIN_GATHER3ALTSIV8DI:
14224 half = gen_reg_rtx (V8SImode);
14225 if (!nonimmediate_operand (op2, V16SImode))
14226 op2 = copy_to_mode_reg (V16SImode, op2);
14227 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14228 op2 = half;
14229 break;
14230 case IX86_BUILTIN_GATHER3ALTSIV4DF:
14231 case IX86_BUILTIN_GATHER3ALTSIV4DI:
14232 case IX86_BUILTIN_GATHERALTSIV4DF:
14233 case IX86_BUILTIN_GATHERALTSIV4DI:
14234 half = gen_reg_rtx (V4SImode);
14235 if (!nonimmediate_operand (op2, V8SImode))
14236 op2 = copy_to_mode_reg (V8SImode, op2);
14237 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14238 op2 = half;
14239 break;
14240 case IX86_BUILTIN_GATHER3ALTDIV16SF:
14241 case IX86_BUILTIN_GATHER3ALTDIV16SI:
14242 half = gen_reg_rtx (mode0);
14243 if (mode0 == V8SFmode)
14244 gen = gen_vec_extract_lo_v16sf;
14245 else
14246 gen = gen_vec_extract_lo_v16si;
14247 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14248 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14249 emit_insn (gen (half, op0));
14250 op0 = half;
14251 op3 = lowpart_subreg (QImode, op3, HImode);
14252 break;
14253 case IX86_BUILTIN_GATHER3ALTDIV8SF:
14254 case IX86_BUILTIN_GATHER3ALTDIV8SI:
14255 case IX86_BUILTIN_GATHERALTDIV8SF:
14256 case IX86_BUILTIN_GATHERALTDIV8SI:
14257 half = gen_reg_rtx (mode0);
14258 if (mode0 == V4SFmode)
14259 gen = gen_vec_extract_lo_v8sf;
14260 else
14261 gen = gen_vec_extract_lo_v8si;
14262 if (!nonimmediate_operand (op0, GET_MODE (op0)))
14263 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
14264 emit_insn (gen (half, op0));
14265 op0 = half;
14266 if (VECTOR_MODE_P (GET_MODE (op3)))
14267 {
14268 half = gen_reg_rtx (mode0);
14269 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14270 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14271 emit_insn (gen (half, op3));
14272 op3 = half;
14273 }
14274 break;
14275 default:
14276 break;
14277 }
14278
14279 /* Force the memory operand to be addressed through a base register
14280 only here; we don't want to do that for the memory operands of
14281 other builtin functions. */
14282 op1 = ix86_zero_extend_to_Pmode (op1);
14283
14284 if (!insn_data[icode].operand[1].predicate (op0, mode0))
14285 op0 = copy_to_mode_reg (mode0, op0);
14286 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
14287 op1 = copy_to_mode_reg (Pmode, op1);
14288 if (!insn_data[icode].operand[3].predicate (op2, mode2))
14289 op2 = copy_to_mode_reg (mode2, op2);
14290
14291 op3 = fixup_modeless_constant (op3, mode3);
14292
14293 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
14294 {
14295 if (!insn_data[icode].operand[4].predicate (op3, mode3))
14296 op3 = copy_to_mode_reg (mode3, op3);
14297 }
14298 else
14299 {
14300 op3 = copy_to_reg (op3);
14301 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
14302 }
14303 if (!insn_data[icode].operand[5].predicate (op4, mode4))
14304 {
14305 error ("the last argument must be scale 1, 2, 4, 8");
14306 return const0_rtx;
14307 }
14308
14309 /* Optimize. If mask is known to have all high bits set,
14310 replace op0 with pc_rtx to signal that the instruction
14311 overwrites the whole destination and doesn't use its
14312 previous contents. */
14313 if (optimize)
14314 {
14315 if (TREE_CODE (arg3) == INTEGER_CST)
14316 {
14317 if (integer_all_onesp (arg3))
14318 op0 = pc_rtx;
14319 }
14320 else if (TREE_CODE (arg3) == VECTOR_CST)
14321 {
14322 unsigned int negative = 0;
14323 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
14324 {
14325 tree cst = VECTOR_CST_ELT (arg3, i);
14326 if (TREE_CODE (cst) == INTEGER_CST
14327 && tree_int_cst_sign_bit (cst))
14328 negative++;
14329 else if (TREE_CODE (cst) == REAL_CST
14330 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
14331 negative++;
14332 }
14333 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
14334 op0 = pc_rtx;
14335 }
14336 else if (TREE_CODE (arg3) == SSA_NAME
14337 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
14338 {
14339 /* Recognize also when mask is like:
14340 __v2df src = _mm_setzero_pd ();
14341 __v2df mask = _mm_cmpeq_pd (src, src);
14342 or
14343 __v8sf src = _mm256_setzero_ps ();
14344 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14345 as that is a cheaper way to load all ones into
14346 a register than having to load a constant from
14347 memory. */
14348 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
14349 if (is_gimple_call (def_stmt))
14350 {
14351 tree fndecl = gimple_call_fndecl (def_stmt);
14352 if (fndecl
14353 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
14354 switch (DECL_MD_FUNCTION_CODE (fndecl))
14355 {
14356 case IX86_BUILTIN_CMPPD:
14357 case IX86_BUILTIN_CMPPS:
14358 case IX86_BUILTIN_CMPPD256:
14359 case IX86_BUILTIN_CMPPS256:
14360 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
14361 break;
14362 /* FALLTHRU */
14363 case IX86_BUILTIN_CMPEQPD:
14364 case IX86_BUILTIN_CMPEQPS:
14365 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
14366 && initializer_zerop (gimple_call_arg (def_stmt,
14367 1)))
14368 op0 = pc_rtx;
14369 break;
14370 default:
14371 break;
14372 }
14373 }
14374 }
14375 }
14376
14377 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
14378 if (! pat)
14379 return const0_rtx;
14380 emit_insn (pat);
14381
14382 switch (fcode)
14383 {
14384 case IX86_BUILTIN_GATHER3DIV16SF:
14385 if (target == NULL_RTX)
14386 target = gen_reg_rtx (V8SFmode);
14387 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
14388 break;
14389 case IX86_BUILTIN_GATHER3DIV16SI:
14390 if (target == NULL_RTX)
14391 target = gen_reg_rtx (V8SImode);
14392 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
14393 break;
14394 case IX86_BUILTIN_GATHER3DIV8SF:
14395 case IX86_BUILTIN_GATHERDIV8SF:
14396 if (target == NULL_RTX)
14397 target = gen_reg_rtx (V4SFmode);
14398 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
14399 break;
14400 case IX86_BUILTIN_GATHER3DIV8SI:
14401 case IX86_BUILTIN_GATHERDIV8SI:
14402 if (target == NULL_RTX)
14403 target = gen_reg_rtx (V4SImode);
14404 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
14405 break;
14406 default:
14407 target = subtarget;
14408 break;
14409 }
14410 return target;
14411
14412 scatter_gen:
14413 arg0 = CALL_EXPR_ARG (exp, 0);
14414 arg1 = CALL_EXPR_ARG (exp, 1);
14415 arg2 = CALL_EXPR_ARG (exp, 2);
14416 arg3 = CALL_EXPR_ARG (exp, 3);
14417 arg4 = CALL_EXPR_ARG (exp, 4);
14418 op0 = expand_normal (arg0);
14419 op1 = expand_normal (arg1);
14420 op2 = expand_normal (arg2);
14421 op3 = expand_normal (arg3);
14422 op4 = expand_normal (arg4);
14423 mode1 = insn_data[icode].operand[1].mode;
14424 mode2 = insn_data[icode].operand[2].mode;
14425 mode3 = insn_data[icode].operand[3].mode;
14426 mode4 = insn_data[icode].operand[4].mode;
14427
14428 /* The scatter instruction stores operand op3 to memory using
14429 indices from op2 and the scale from op4 under writemask op1.
14430 If the index operand op2 has more elements than the source
14431 operand op3, only its low half is used, and vice versa. */
14432 switch (fcode)
14433 {
14434 case IX86_BUILTIN_SCATTERALTSIV8DF:
14435 case IX86_BUILTIN_SCATTERALTSIV8DI:
14436 half = gen_reg_rtx (V8SImode);
14437 if (!nonimmediate_operand (op2, V16SImode))
14438 op2 = copy_to_mode_reg (V16SImode, op2);
14439 emit_insn (gen_vec_extract_lo_v16si (half, op2));
14440 op2 = half;
14441 break;
14442 case IX86_BUILTIN_SCATTERALTDIV16SF:
14443 case IX86_BUILTIN_SCATTERALTDIV16SI:
14444 half = gen_reg_rtx (mode3);
14445 if (mode3 == V8SFmode)
14446 gen = gen_vec_extract_lo_v16sf;
14447 else
14448 gen = gen_vec_extract_lo_v16si;
14449 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14450 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14451 emit_insn (gen (half, op3));
14452 op3 = half;
14453 break;
14454 case IX86_BUILTIN_SCATTERALTSIV4DF:
14455 case IX86_BUILTIN_SCATTERALTSIV4DI:
14456 half = gen_reg_rtx (V4SImode);
14457 if (!nonimmediate_operand (op2, V8SImode))
14458 op2 = copy_to_mode_reg (V8SImode, op2);
14459 emit_insn (gen_vec_extract_lo_v8si (half, op2));
14460 op2 = half;
14461 break;
14462 case IX86_BUILTIN_SCATTERALTDIV8SF:
14463 case IX86_BUILTIN_SCATTERALTDIV8SI:
14464 half = gen_reg_rtx (mode3);
14465 if (mode3 == V4SFmode)
14466 gen = gen_vec_extract_lo_v8sf;
14467 else
14468 gen = gen_vec_extract_lo_v8si;
14469 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14470 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14471 emit_insn (gen (half, op3));
14472 op3 = half;
14473 break;
14474 case IX86_BUILTIN_SCATTERALTSIV2DF:
14475 case IX86_BUILTIN_SCATTERALTSIV2DI:
14476 if (!nonimmediate_operand (op2, V4SImode))
14477 op2 = copy_to_mode_reg (V4SImode, op2);
14478 break;
14479 case IX86_BUILTIN_SCATTERALTDIV4SF:
14480 case IX86_BUILTIN_SCATTERALTDIV4SI:
14481 if (!nonimmediate_operand (op3, GET_MODE (op3)))
14482 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
14483 break;
14484 default:
14485 break;
14486 }
14487
14488 /* Force the memory operand to be addressed through a base register
14489 only here; we don't want to do that for the memory operands of
14490 other builtin functions. */
14491 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
14492
14493 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14494 op0 = copy_to_mode_reg (Pmode, op0);
14495
14496 op1 = fixup_modeless_constant (op1, mode1);
14497
14498 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
14499 {
14500 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14501 op1 = copy_to_mode_reg (mode1, op1);
14502 }
14503 else
14504 {
14505 op1 = copy_to_reg (op1);
14506 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
14507 }
14508
14509 if (!insn_data[icode].operand[2].predicate (op2, mode2))
14510 op2 = copy_to_mode_reg (mode2, op2);
14511
14512 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14513 op3 = copy_to_mode_reg (mode3, op3);
14514
14515 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14516 {
14517 error ("the last argument must be scale 1, 2, 4, 8");
14518 return const0_rtx;
14519 }
14520
14521 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14522 if (! pat)
14523 return const0_rtx;
14524
14525 emit_insn (pat);
14526 return 0;
14527
14528 vec_prefetch_gen:
14529 arg0 = CALL_EXPR_ARG (exp, 0);
14530 arg1 = CALL_EXPR_ARG (exp, 1);
14531 arg2 = CALL_EXPR_ARG (exp, 2);
14532 arg3 = CALL_EXPR_ARG (exp, 3);
14533 arg4 = CALL_EXPR_ARG (exp, 4);
14534 op0 = expand_normal (arg0);
14535 op1 = expand_normal (arg1);
14536 op2 = expand_normal (arg2);
14537 op3 = expand_normal (arg3);
14538 op4 = expand_normal (arg4);
14539 mode0 = insn_data[icode].operand[0].mode;
14540 mode1 = insn_data[icode].operand[1].mode;
14541 mode3 = insn_data[icode].operand[3].mode;
14542 mode4 = insn_data[icode].operand[4].mode;
14543
14544 op0 = fixup_modeless_constant (op0, mode0);
14545
14546 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14547 {
14548 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14549 op0 = copy_to_mode_reg (mode0, op0);
14550 }
14551 else
14552 {
14553 op0 = copy_to_reg (op0);
14554 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14555 }
14556
14557 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14558 op1 = copy_to_mode_reg (mode1, op1);
14559
14560 /* Force the memory operand to be addressed through a base register
14561 only here; we don't want to do that for the memory operands of
14562 other builtin functions. */
14563 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14564
14565 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14566 op2 = copy_to_mode_reg (Pmode, op2);
14567
14568 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14569 {
14570 error ("the forth argument must be scale 1, 2, 4, 8");
14571 return const0_rtx;
14572 }
14573
14574 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14575 {
14576 error ("incorrect hint operand");
14577 return const0_rtx;
14578 }
14579
14580 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14581 if (! pat)
14582 return const0_rtx;
14583
14584 emit_insn (pat);
14585
14586 return 0;
14587
14588 case IX86_BUILTIN_XABORT:
14589 icode = CODE_FOR_xabort;
14590 arg0 = CALL_EXPR_ARG (exp, 0);
14591 op0 = expand_normal (arg0);
14592 mode0 = insn_data[icode].operand[0].mode;
14593 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14594 {
14595 error ("the argument to %<xabort%> intrinsic must "
14596 "be an 8-bit immediate");
14597 return const0_rtx;
14598 }
14599 emit_insn (gen_xabort (op0));
14600 return 0;
14601
14602 case IX86_BUILTIN_RDSSPD:
14603 case IX86_BUILTIN_RDSSPQ:
14604 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
14605
14606 if (target == 0
14607 || !register_operand (target, mode))
14608 target = gen_reg_rtx (mode);
14609
14610 op0 = force_reg (mode, const0_rtx);
14611
14612 emit_insn (gen_rdssp (mode, target, op0));
14613 return target;
14614
14615 case IX86_BUILTIN_INCSSPD:
14616 case IX86_BUILTIN_INCSSPQ:
14617 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
14618
14619 arg0 = CALL_EXPR_ARG (exp, 0);
14620 op0 = expand_normal (arg0);
14621
14622 op0 = force_reg (mode, op0);
14623
14624 emit_insn (gen_incssp (mode, op0));
14625 return 0;
14626
14627 case IX86_BUILTIN_HRESET:
14628 icode = CODE_FOR_hreset;
14629 arg0 = CALL_EXPR_ARG (exp, 0);
14630 op0 = expand_normal (arg0);
14631 op0 = force_reg (SImode, op0);
14632 emit_insn (gen_hreset (op0));
14633 return 0;
14634
14635 case IX86_BUILTIN_RSTORSSP:
14636 case IX86_BUILTIN_CLRSSBSY:
14637 arg0 = CALL_EXPR_ARG (exp, 0);
14638 op0 = expand_normal (arg0);
14639 icode = (fcode == IX86_BUILTIN_RSTORSSP
14640 ? CODE_FOR_rstorssp
14641 : CODE_FOR_clrssbsy);
14642
14643 if (!address_operand (op0, VOIDmode))
14644 {
14645 op0 = convert_memory_address (Pmode, op0);
14646 op0 = copy_addr_to_reg (op0);
14647 }
14648 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
14649 return 0;
14650
14651 case IX86_BUILTIN_WRSSD:
14652 case IX86_BUILTIN_WRSSQ:
14653 case IX86_BUILTIN_WRUSSD:
14654 case IX86_BUILTIN_WRUSSQ:
14655 mode = ((fcode == IX86_BUILTIN_WRSSD
14656 || fcode == IX86_BUILTIN_WRUSSD)
14657 ? SImode : DImode);
14658
14659 arg0 = CALL_EXPR_ARG (exp, 0);
14660 op0 = expand_normal (arg0);
14661 arg1 = CALL_EXPR_ARG (exp, 1);
14662 op1 = expand_normal (arg1);
14663
14664 op0 = force_reg (mode, op0);
14665
14666 if (!address_operand (op1, VOIDmode))
14667 {
14668 op1 = convert_memory_address (Pmode, op1);
14669 op1 = copy_addr_to_reg (op1);
14670 }
14671 op1 = gen_rtx_MEM (mode, op1);
14672
14673 icode = ((fcode == IX86_BUILTIN_WRSSD
14674 || fcode == IX86_BUILTIN_WRSSQ)
14675 ? code_for_wrss (mode)
14676 : code_for_wruss (mode));
14677 emit_insn (GEN_FCN (icode) (op0, op1));
14678
14679 return 0;
14680
14681 default:
14682 break;
14683 }
14684
14685 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14686 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
14687 {
14688 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
14689 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
14690 target);
14691 }
14692
14693 if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14694 && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
14695 {
14696 i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
14697 return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
14698 target);
14699 }
14700
14701 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
14702 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
14703 {
14704 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
14705 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
14706 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
14707 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
14708 int masked = 1;
14709 machine_mode mode, wide_mode, nar_mode;
14710
14711 nar_mode = V4SFmode;
14712 mode = V16SFmode;
14713 wide_mode = V64SFmode;
14714 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
14715 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
14716
14717 switch (fcode)
14718 {
14719 case IX86_BUILTIN_4FMAPS:
14720 fcn = gen_avx5124fmaddps_4fmaddps;
14721 masked = 0;
14722 goto v4fma_expand;
14723
14724 case IX86_BUILTIN_4DPWSSD:
14725 nar_mode = V4SImode;
14726 mode = V16SImode;
14727 wide_mode = V64SImode;
14728 fcn = gen_avx5124vnniw_vp4dpwssd;
14729 masked = 0;
14730 goto v4fma_expand;
14731
14732 case IX86_BUILTIN_4DPWSSDS:
14733 nar_mode = V4SImode;
14734 mode = V16SImode;
14735 wide_mode = V64SImode;
14736 fcn = gen_avx5124vnniw_vp4dpwssds;
14737 masked = 0;
14738 goto v4fma_expand;
14739
14740 case IX86_BUILTIN_4FNMAPS:
14741 fcn = gen_avx5124fmaddps_4fnmaddps;
14742 masked = 0;
14743 goto v4fma_expand;
14744
14745 case IX86_BUILTIN_4FNMAPS_MASK:
14746 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
14747 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
14748 goto v4fma_expand;
14749
14750 case IX86_BUILTIN_4DPWSSD_MASK:
14751 nar_mode = V4SImode;
14752 mode = V16SImode;
14753 wide_mode = V64SImode;
14754 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
14755 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
14756 goto v4fma_expand;
14757
14758 case IX86_BUILTIN_4DPWSSDS_MASK:
14759 nar_mode = V4SImode;
14760 mode = V16SImode;
14761 wide_mode = V64SImode;
14762 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
14763 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
14764 goto v4fma_expand;
14765
14766 case IX86_BUILTIN_4FMAPS_MASK:
14767 {
14768 tree args[4];
14769 rtx ops[4];
14770 rtx wide_reg;
14771 rtx accum;
14772 rtx addr;
14773 rtx mem;
14774
14775 v4fma_expand:
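/* Pack the four narrow source vectors into one wide pseudo at
   consecutive 64-byte subreg offsets; the 4FMAPS/4VNNIW patterns
   consume this register group as a single wide operand.  */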
14776 wide_reg = gen_reg_rtx (wide_mode);
14777 for (i = 0; i < 4; i++)
14778 {
14779 args[i] = CALL_EXPR_ARG (exp, i);
14780 ops[i] = expand_normal (args[i]);
14781
14782 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
14783 ops[i]);
14784 }
14785
14786 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14787 accum = force_reg (mode, accum);
14788
14789 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14790 addr = force_reg (Pmode, addr);
14791
14792 mem = gen_rtx_MEM (nar_mode, addr);
14793
14794 target = gen_reg_rtx (mode);
14795
14796 emit_move_insn (target, accum);
14797
14798 if (! masked)
14799 emit_insn (fcn (target, accum, wide_reg, mem));
14800 else
14801 {
14802 rtx merge, mask;
14803 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14804
14805 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14806
14807 if (CONST_INT_P (mask))
14808 mask = fixup_modeless_constant (mask, HImode);
14809
14810 mask = force_reg (HImode, mask);
14811
14812 if (GET_MODE (mask) != HImode)
14813 mask = gen_rtx_SUBREG (HImode, mask, 0);
14814
14815 /* If merge is 0 then we're about to emit the z-masked variant. */
14816 if (const0_operand (merge, mode))
14817 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14818 /* If merge is the same as accum then emit the merge-masked variant. */
14819 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14820 {
14821 merge = force_reg (mode, merge);
14822 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14823 }
14824 /* Merging with an unknown value can happen if we z-mask with -O0. */
14825 else
14826 {
14827 target = gen_reg_rtx (mode);
14828 emit_move_insn (target, merge);
14829 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14830 }
14831 }
14832 return target;
14833 }
14834
14835 case IX86_BUILTIN_4FNMASS:
14836 fcn = gen_avx5124fmaddps_4fnmaddss;
14837 masked = 0;
14838 goto s4fma_expand;
14839
14840 case IX86_BUILTIN_4FMASS:
14841 fcn = gen_avx5124fmaddps_4fmaddss;
14842 masked = 0;
14843 goto s4fma_expand;
14844
14845 case IX86_BUILTIN_4FNMASS_MASK:
14846 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
14847 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
14848 goto s4fma_expand;
14849
14850 case IX86_BUILTIN_4FMASS_MASK:
14851 {
14852 tree args[4];
14853 rtx ops[4];
14854 rtx wide_reg;
14855 rtx accum;
14856 rtx addr;
14857 rtx mem;
14858
14859 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
14860 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
14861
14862 s4fma_expand:
14863 mode = V4SFmode;
14864 wide_reg = gen_reg_rtx (V64SFmode);
14865 for (i = 0; i < 4; i++)
14866 {
14867 rtx tmp;
14868 args[i] = CALL_EXPR_ARG (exp, i);
14869 ops[i] = expand_normal (args[i]);
14870
14871 tmp = gen_reg_rtx (SFmode);
14872 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
14873
14874 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
14875 gen_rtx_SUBREG (V16SFmode, tmp, 0));
14876 }
14877
14878 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14879 accum = force_reg (V4SFmode, accum);
14880
14881 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14882 addr = force_reg (Pmode, addr);
14883
14884 mem = gen_rtx_MEM (V4SFmode, addr);
14885
14886 target = gen_reg_rtx (V4SFmode);
14887
14888 emit_move_insn (target, accum);
14889
14890 if (! masked)
14891 emit_insn (fcn (target, accum, wide_reg, mem));
14892 else
14893 {
14894 rtx merge, mask;
14895 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14896
14897 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14898
14899 if (CONST_INT_P (mask))
14900 mask = fixup_modeless_constant (mask, QImode);
14901
14902 mask = force_reg (QImode, mask);
14903
14904 if (GET_MODE (mask) != QImode)
14905 mask = gen_rtx_SUBREG (QImode, mask, 0);
14906
14907 /* If merge is 0 then we're about to emit the z-masked variant. */
14908 if (const0_operand (merge, mode))
14909 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14910 /* If merge is the same as accum then emit the merge-masked
14911 variant. */
14912 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14913 {
14914 merge = force_reg (mode, merge);
14915 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14916 }
14917 /* Merging with an unknown value can happen if we z-mask
14918 with -O0. */
14919 else
14920 {
14921 target = gen_reg_rtx (mode);
14922 emit_move_insn (target, merge);
14923 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14924 }
14925 }
14926 return target;
14927 }
14928 case IX86_BUILTIN_RDPID:
14929 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
14930 target);
14931 case IX86_BUILTIN_FABSQ:
14932 case IX86_BUILTIN_COPYSIGNQ:
14933 if (!TARGET_SSE)
14934 /* Emit a normal call if SSE isn't available. */
14935 return expand_call (exp, target, ignore);
14936 /* FALLTHRU */
14937 default:
14938 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
14939 }
14940 }
14941
14942 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
14943 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
14944 {
14945 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
14946 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
14947 }
14948
14949 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
14950 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
14951 {
14952 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
14953 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
14954 }
14955
14956 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
14957 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
14958 {
14959 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
14960 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
14961 }
14962
14963 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
14964 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
14965 {
14966 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
14967 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
14968 }
14969
14970 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
14971 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
14972 {
14973 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
14974 const struct builtin_description *d = bdesc_multi_arg + i;
14975 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
14976 (enum ix86_builtin_func_type)
14977 d->flag, d->comparison);
14978 }
14979
14980 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
14981 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
14982 {
14983 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
14984 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
14985 target);
14986 }
14987
14988 gcc_unreachable ();
14989 }
14990
14991 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
14992 fill target with val via vec_duplicate. */
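/* For instance, broadcasting an SImode value into V4SImode first emits
   (set (reg:V4SI target) (vec_duplicate:V4SI val)) and lets
   recog_memoized decide whether some pattern accepts VAL as-is; only
   when that fails is VAL forced into a register of the inner mode and
   the SET_SRC rewritten before recog is retried.  */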
14993
14994 static bool
14995 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
14996 {
14997 bool ok;
14998 rtx_insn *insn;
14999 rtx dup;
15000
15001 /* First attempt to recognize VAL as-is. */
15002 dup = gen_vec_duplicate (mode, val);
15003 insn = emit_insn (gen_rtx_SET (target, dup));
15004 if (recog_memoized (insn) < 0)
15005 {
15006 rtx_insn *seq;
15007 machine_mode innermode = GET_MODE_INNER (mode);
15008 rtx reg;
15009
15010 /* If that fails, force VAL into a register. */
15011
15012 start_sequence ();
15013 reg = force_reg (innermode, val);
15014 if (GET_MODE (reg) != innermode)
15015 reg = gen_lowpart (innermode, reg);
15016 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
15017 seq = get_insns ();
15018 end_sequence ();
15019 if (seq)
15020 emit_insn_before (seq, insn);
15021
15022 ok = recog_memoized (insn) >= 0;
15023 gcc_assert (ok);
15024 }
15025 return true;
15026 }
15027
15028 /* Get a vector mode of the same size as the original but with elements
15029 twice as wide. This is only guaranteed to apply to integral vectors. */
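/* For example, with the expected ordering V16QImode yields V8HImode and
   V8HImode yields V4SImode: the byte size is unchanged while the number
   of units is halved, which is what the asserts below verify.  */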
15030
15031 static machine_mode
15032 get_mode_wider_vector (machine_mode o)
15033 {
15034 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
15035 machine_mode n = GET_MODE_NEXT_MODE (o).require ();
15036 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
15037 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
15038 return n;
15039 }
15040
15041 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
15042 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
15043
15044 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15045 with all elements equal to VAR. Return true if successful. */
15046
15047 bool
15048 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
15049 rtx target, rtx val)
15050 {
15051 bool ok;
15052
15053 switch (mode)
15054 {
15055 case E_V2SImode:
15056 case E_V2SFmode:
15057 if (!mmx_ok)
15058 return false;
15059 /* FALLTHRU */
15060
15061 case E_V4DFmode:
15062 case E_V4DImode:
15063 case E_V8SFmode:
15064 case E_V8SImode:
15065 case E_V2DFmode:
15066 case E_V2DImode:
15067 case E_V4SFmode:
15068 case E_V4SImode:
15069 case E_V16SImode:
15070 case E_V8DImode:
15071 case E_V16SFmode:
15072 case E_V8DFmode:
15073 return ix86_vector_duplicate_value (mode, target, val);
15074
15075 case E_V4HImode:
15076 if (!mmx_ok)
15077 return false;
15078 if (TARGET_SSE || TARGET_3DNOW_A)
15079 {
15080 rtx x;
15081
15082 val = gen_lowpart (SImode, val);
15083 x = gen_rtx_TRUNCATE (HImode, val);
15084 x = gen_rtx_VEC_DUPLICATE (mode, x);
15085 emit_insn (gen_rtx_SET (target, x));
15086 return true;
15087 }
15088 goto widen;
15089
15090 case E_V2HImode:
15091 if (TARGET_SSE2)
15092 {
15093 rtx x;
15094
15095 val = gen_lowpart (SImode, val);
15096 x = gen_rtx_TRUNCATE (HImode, val);
15097 x = gen_rtx_VEC_DUPLICATE (mode, x);
15098 emit_insn (gen_rtx_SET (target, x));
15099 return true;
15100 }
15101 return false;
15102
15103 case E_V8QImode:
15104 case E_V4QImode:
15105 if (!mmx_ok)
15106 return false;
15107 goto widen;
15108
15109 case E_V8HImode:
15110 case E_V8HFmode:
15111 case E_V8BFmode:
15112 if (TARGET_AVX2)
15113 return ix86_vector_duplicate_value (mode, target, val);
15114
15115 if (TARGET_SSE2)
15116 {
15117 struct expand_vec_perm_d dperm;
15118 rtx tmp1, tmp2;
15119
15120 permute:
15121 memset (&dperm, 0, sizeof (dperm));
15122 dperm.target = target;
15123 dperm.vmode = mode;
15124 dperm.nelt = GET_MODE_NUNITS (mode);
15125 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
15126 dperm.one_operand_p = true;
15127
15128 if (mode == V8HFmode || mode == V8BFmode)
15129 {
15130 tmp1 = force_reg (GET_MODE_INNER (mode), val);
15131 tmp2 = gen_reg_rtx (mode);
15132 emit_insn (maybe_gen_vec_set_0 (mode, tmp2,
15133 CONST0_RTX (mode), tmp1));
15134 tmp1 = gen_lowpart (mode, tmp2);
15135 }
15136 else
15137 {
15138 /* Extend to SImode using a paradoxical SUBREG. */
15139 tmp1 = gen_reg_rtx (SImode);
15140 emit_move_insn (tmp1, gen_lowpart (SImode, val));
15141
15142 /* Insert the SImode value as
15143 low element of a V4SImode vector. */
15144 tmp2 = gen_reg_rtx (V4SImode);
15145 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
15146 tmp1 = gen_lowpart (mode, tmp2);
15147 }
15148
15149 emit_move_insn (dperm.op0, tmp1);
15150 ok = (expand_vec_perm_1 (&dperm)
15151 || expand_vec_perm_broadcast_1 (&dperm));
15152 gcc_assert (ok);
15153 return ok;
15154 }
15155 goto widen;
15156
15157 case E_V16QImode:
15158 if (TARGET_AVX2)
15159 return ix86_vector_duplicate_value (mode, target, val);
15160
15161 if (TARGET_SSE2)
15162 goto permute;
15163 goto widen;
15164
15165 widen:
15166 /* Replicate the value once into the next wider mode and recurse. */
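/* As a concrete example, broadcasting the QImode value 0xab into
   V16QImode this way first forms the HImode value 0xabab (via the
   insv_1 pattern or the shift/IOR fallback below), then recurses to
   build a V8HImode broadcast of 0xabab, and finally reinterprets that
   register as V16QImode through gen_lowpart.  */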
15167 {
15168 machine_mode smode, wsmode, wvmode;
15169 rtx x;
15170
15171 smode = GET_MODE_INNER (mode);
15172 wvmode = get_mode_wider_vector (mode);
15173 wsmode = GET_MODE_INNER (wvmode);
15174
15175 val = convert_modes (wsmode, smode, val, true);
15176
15177 if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
15178 emit_insn (gen_insv_1 (wsmode, val, val));
15179 else
15180 {
15181 x = expand_simple_binop (wsmode, ASHIFT, val,
15182 GEN_INT (GET_MODE_BITSIZE (smode)),
15183 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15184 val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
15185 OPTAB_LIB_WIDEN);
15186 }
15187
15188 x = gen_reg_rtx (wvmode);
15189 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
15190 gcc_assert (ok);
15191 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
15192 return ok;
15193 }
15194
15195 case E_V16HImode:
15196 case E_V16HFmode:
15197 case E_V16BFmode:
15198 case E_V32QImode:
15199 if (TARGET_AVX2)
15200 return ix86_vector_duplicate_value (mode, target, val);
15201 else
15202 {
15203 machine_mode hvmode;
15204 switch (mode)
15205 {
15206 case V16HImode:
15207 hvmode = V8HImode;
15208 break;
15209 case V16HFmode:
15210 hvmode = V8HFmode;
15211 break;
15212 case V16BFmode:
15213 hvmode = V8BFmode;
15214 break;
15215 case V32QImode:
15216 hvmode = V16QImode;
15217 break;
15218 default:
15219 gcc_unreachable ();
15220 }
15221 rtx x = gen_reg_rtx (hvmode);
15222
15223 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15224 gcc_assert (ok);
15225
15226 x = gen_rtx_VEC_CONCAT (mode, x, x);
15227 emit_insn (gen_rtx_SET (target, x));
15228 }
15229 return true;
15230
15231 case E_V32HImode:
15232 case E_V32HFmode:
15233 case E_V32BFmode:
15234 case E_V64QImode:
15235 if (TARGET_AVX512BW)
15236 return ix86_vector_duplicate_value (mode, target, val);
15237 else
15238 {
15239 machine_mode hvmode;
15240 switch (mode)
15241 {
15242 case V32HImode:
15243 hvmode = V16HImode;
15244 break;
15245 case V32HFmode:
15246 hvmode = V16HFmode;
15247 break;
15248 case V32BFmode:
15249 hvmode = V16BFmode;
15250 break;
15251 case V64QImode:
15252 hvmode = V32QImode;
15253 break;
15254 default:
15255 gcc_unreachable ();
15256 }
15257 rtx x = gen_reg_rtx (hvmode);
15258
15259 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
15260 gcc_assert (ok);
15261
15262 x = gen_rtx_VEC_CONCAT (mode, x, x);
15263 emit_insn (gen_rtx_SET (target, x));
15264 }
15265 return true;
15266
15267 default:
15268 return false;
15269 }
15270 }
15271
15272 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15273 whose ONE_VAR element is VAR, and other elements are zero. Return true
15274 if successful. */
15275
15276 static bool
15277 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15278 rtx target, rtx var, int one_var)
15279 {
15280 machine_mode vsimode;
15281 rtx new_target;
15282 rtx x, tmp;
15283 bool use_vector_set = false;
15284 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15285
15286 switch (mode)
15287 {
15288 case E_V2DImode:
15289 /* For SSE4.1, we normally use vector set. But if the second
15290 element is zero and inter-unit moves are OK, we use movq
15291 instead. */
15292 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15293 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15294 && one_var == 0));
15295 break;
15296 case E_V16QImode:
15297 case E_V4SImode:
15298 case E_V4SFmode:
15299 use_vector_set = TARGET_SSE4_1;
15300 break;
15301 case E_V8HImode:
15302 use_vector_set = TARGET_SSE2;
15303 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15304 ? gen_vec_setv8hi_0 : NULL;
15305 break;
15306 case E_V8QImode:
15307 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15308 break;
15309 case E_V4HImode:
15310 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15311 break;
15312 case E_V4QImode:
15313 use_vector_set = TARGET_SSE4_1;
15314 break;
15315 case E_V32QImode:
15316 use_vector_set = TARGET_AVX;
15317 break;
15318 case E_V16HImode:
15319 use_vector_set = TARGET_AVX;
15320 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15321 ? gen_vec_setv16hi_0 : NULL;
15322 break;
15323 case E_V8SImode:
15324 use_vector_set = TARGET_AVX;
15325 gen_vec_set_0 = gen_vec_setv8si_0;
15326 break;
15327 case E_V8SFmode:
15328 use_vector_set = TARGET_AVX;
15329 gen_vec_set_0 = gen_vec_setv8sf_0;
15330 break;
15331 case E_V4DFmode:
15332 use_vector_set = TARGET_AVX;
15333 gen_vec_set_0 = gen_vec_setv4df_0;
15334 break;
15335 case E_V4DImode:
15336 /* Use ix86_expand_vector_set in 64bit mode only. */
15337 use_vector_set = TARGET_AVX && TARGET_64BIT;
15338 gen_vec_set_0 = gen_vec_setv4di_0;
15339 break;
15340 case E_V16SImode:
15341 use_vector_set = TARGET_AVX512F && one_var == 0;
15342 gen_vec_set_0 = gen_vec_setv16si_0;
15343 break;
15344 case E_V16SFmode:
15345 use_vector_set = TARGET_AVX512F && one_var == 0;
15346 gen_vec_set_0 = gen_vec_setv16sf_0;
15347 break;
15348 case E_V8DFmode:
15349 use_vector_set = TARGET_AVX512F && one_var == 0;
15350 gen_vec_set_0 = gen_vec_setv8df_0;
15351 break;
15352 case E_V8DImode:
15353 /* Use ix86_expand_vector_set in 64bit mode only. */
15354 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15355 gen_vec_set_0 = gen_vec_setv8di_0;
15356 break;
15357 case E_V8HFmode:
15358 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15359 gen_vec_set_0 = gen_vec_setv8hf_0;
15360 break;
15361 case E_V16HFmode:
15362 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15363 gen_vec_set_0 = gen_vec_setv16hf_0;
15364 break;
15365 case E_V32HFmode:
15366 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15367 gen_vec_set_0 = gen_vec_setv32hf_0;
15368 break;
15369 case E_V8BFmode:
15370 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15371 gen_vec_set_0 = gen_vec_setv8bf_0;
15372 break;
15373 case E_V16BFmode:
15374 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15375 gen_vec_set_0 = gen_vec_setv16bf_0;
15376 break;
15377 case E_V32BFmode:
15378 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15379 gen_vec_set_0 = gen_vec_setv32bf_0;
15380 break;
15381 case E_V32HImode:
15382 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15383 gen_vec_set_0 = gen_vec_setv32hi_0;
15384 default:
15385 break;
15386 }
15387
15388 if (use_vector_set)
15389 {
15390 if (gen_vec_set_0 && one_var == 0)
15391 {
15392 var = force_reg (GET_MODE_INNER (mode), var);
15393 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15394 return true;
15395 }
15396 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15397 var = force_reg (GET_MODE_INNER (mode), var);
15398 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15399 return true;
15400 }
15401
15402 switch (mode)
15403 {
15404 case E_V2SFmode:
15405 case E_V2SImode:
15406 if (!mmx_ok)
15407 return false;
15408 /* FALLTHRU */
15409
15410 case E_V2DFmode:
15411 case E_V2DImode:
15412 if (one_var != 0)
15413 return false;
15414 var = force_reg (GET_MODE_INNER (mode), var);
15415 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15416 emit_insn (gen_rtx_SET (target, x));
15417 return true;
15418
15419 case E_V4SFmode:
15420 case E_V4SImode:
15421 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15422 new_target = gen_reg_rtx (mode);
15423 else
15424 new_target = target;
15425 var = force_reg (GET_MODE_INNER (mode), var);
15426 x = gen_rtx_VEC_DUPLICATE (mode, var);
15427 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15428 emit_insn (gen_rtx_SET (new_target, x));
15429 if (one_var != 0)
15430 {
15431 /* We need to shuffle the value to the correct position, so
15432 create a new pseudo to store the intermediate result. */
15433
15434 /* With SSE2, we can use the integer shuffle insns. */
15435 if (mode != V4SFmode && TARGET_SSE2)
15436 {
15437 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15438 const1_rtx,
15439 GEN_INT (one_var == 1 ? 0 : 1),
15440 GEN_INT (one_var == 2 ? 0 : 1),
15441 GEN_INT (one_var == 3 ? 0 : 1)));
15442 if (target != new_target)
15443 emit_move_insn (target, new_target);
15444 return true;
15445 }
15446
15447 /* Otherwise convert the intermediate result to V4SFmode and
15448 use the SSE1 shuffle instructions. */
15449 if (mode != V4SFmode)
15450 {
15451 tmp = gen_reg_rtx (V4SFmode);
15452 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15453 }
15454 else
15455 tmp = new_target;
15456
15457 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15458 const1_rtx,
15459 GEN_INT (one_var == 1 ? 0 : 1),
15460 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15461 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15462
15463 if (mode != V4SFmode)
15464 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15465 else if (tmp != target)
15466 emit_move_insn (target, tmp);
15467 }
15468 else if (target != new_target)
15469 emit_move_insn (target, new_target);
15470 return true;
15471
15472 case E_V8HImode:
15473 case E_V16QImode:
15474 vsimode = V4SImode;
15475 goto widen;
15476 case E_V4HImode:
15477 case E_V8QImode:
15478 if (!mmx_ok)
15479 return false;
15480 vsimode = V2SImode;
15481 goto widen;
15482 widen:
15483 if (one_var != 0)
15484 return false;
15485
15486 /* Zero extend the variable element to SImode and recurse. */
15487 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15488
15489 x = gen_reg_rtx (vsimode);
15490 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15491 var, one_var))
15492 gcc_unreachable ();
15493
15494 emit_move_insn (target, gen_lowpart (mode, x));
15495 return true;
15496
15497 default:
15498 return false;
15499 }
15500 }
15501
15502 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15503 consisting of the values in VALS. It is known that all elements
15504 except ONE_VAR are constants. Return true if successful. */
15505
15506 static bool
15507 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
15508 rtx target, rtx vals, int one_var)
15509 {
15510 rtx var = XVECEXP (vals, 0, one_var);
15511 machine_mode wmode;
15512 rtx const_vec, x;
15513
15514 const_vec = copy_rtx (vals);
15515 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
15516 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
15517
15518 switch (mode)
15519 {
15520 case E_V2DFmode:
15521 case E_V2DImode:
15522 case E_V2SFmode:
15523 case E_V2SImode:
15524 /* For the two element vectors, it's just as easy to use
15525 the general case. */
15526 return false;
15527
15528 case E_V4DImode:
15529 /* Use ix86_expand_vector_set in 64bit mode only. */
15530 if (!TARGET_64BIT)
15531 return false;
15532 /* FALLTHRU */
15533 case E_V8HFmode:
15534 case E_V16HFmode:
15535 case E_V8BFmode:
15536 case E_V16BFmode:
15537 case E_V4DFmode:
15538 case E_V8SFmode:
15539 case E_V8SImode:
15540 case E_V16HImode:
15541 case E_V32QImode:
15542 case E_V4SFmode:
15543 case E_V4SImode:
15544 case E_V8HImode:
15545 case E_V4HImode:
15546 break;
15547
15548 case E_V16QImode:
15549 if (TARGET_SSE4_1)
15550 break;
15551 wmode = V8HImode;
15552 goto widen;
15553 case E_V8QImode:
15554 if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
15555 break;
15556 wmode = V4HImode;
15557 goto widen;
15558 case E_V4QImode:
15559 if (TARGET_SSE4_1)
15560 break;
15561 wmode = V2HImode;
15562 widen:
15563 /* There's no way to set one QImode entry easily. Combine
15564 the variable value with its adjacent constant value, and
15565 promote to an HImode set. */
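/* For instance, with V4QImode and one_var == 3 the adjacent constant is
   element 2 (one_var ^ 1): the variable byte is zero-extended to HImode
   and shifted left by 8, IORed with that constant's low byte, and the
   combined HImode word is then stored at element one_var >> 1 == 1 of
   the V2HImode view of the vector.  */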
15566 x = XVECEXP (vals, 0, one_var ^ 1);
15567 if (one_var & 1)
15568 {
15569 var = convert_modes (HImode, QImode, var, true);
15570 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
15571 NULL_RTX, 1, OPTAB_LIB_WIDEN);
15572 x = GEN_INT (INTVAL (x) & 0xff);
15573 }
15574 else
15575 {
15576 var = convert_modes (HImode, QImode, var, true);
15577 x = gen_int_mode (UINTVAL (x) << 8, HImode);
15578 }
15579 if (x != const0_rtx)
15580 var = expand_simple_binop (HImode, IOR, var, x, var,
15581 1, OPTAB_LIB_WIDEN);
15582
15583 x = gen_reg_rtx (wmode);
15584 emit_move_insn (x, gen_lowpart (wmode, const_vec));
15585 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
15586
15587 emit_move_insn (target, gen_lowpart (mode, x));
15588 return true;
15589
15590 default:
15591 return false;
15592 }
15593
15594 emit_move_insn (target, const_vec);
15595 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15596 return true;
15597 }
15598
15599 /* A subroutine of ix86_expand_vector_init_general. Use vector
15600 concatenate to handle the most general case: all values variable,
15601 and none identical. */
15602
15603 static void
15604 ix86_expand_vector_init_concat (machine_mode mode,
15605 rtx target, rtx *ops, int n)
15606 {
15607 machine_mode half_mode = VOIDmode;
15608 rtx half[2];
15609 rtvec v;
15610 int i, j;
15611
15612 switch (n)
15613 {
15614 case 2:
15615 switch (mode)
15616 {
15617 case E_V32HFmode:
15618 half_mode = V16HFmode;
15619 break;
15620 case E_V32BFmode:
15621 half_mode = V16BFmode;
15622 break;
15623 case E_V16SImode:
15624 half_mode = V8SImode;
15625 break;
15626 case E_V16SFmode:
15627 half_mode = V8SFmode;
15628 break;
15629 case E_V8DImode:
15630 half_mode = V4DImode;
15631 break;
15632 case E_V8DFmode:
15633 half_mode = V4DFmode;
15634 break;
15635 case E_V16HFmode:
15636 half_mode = V8HFmode;
15637 break;
15638 case E_V16BFmode:
15639 half_mode = V8BFmode;
15640 break;
15641 case E_V8SImode:
15642 half_mode = V4SImode;
15643 break;
15644 case E_V8SFmode:
15645 half_mode = V4SFmode;
15646 break;
15647 case E_V4DImode:
15648 half_mode = V2DImode;
15649 break;
15650 case E_V4DFmode:
15651 half_mode = V2DFmode;
15652 break;
15653 case E_V4SImode:
15654 half_mode = V2SImode;
15655 break;
15656 case E_V4SFmode:
15657 half_mode = V2SFmode;
15658 break;
15659 case E_V2DImode:
15660 half_mode = DImode;
15661 break;
15662 case E_V2SImode:
15663 half_mode = SImode;
15664 break;
15665 case E_V2DFmode:
15666 half_mode = DFmode;
15667 break;
15668 case E_V2SFmode:
15669 half_mode = SFmode;
15670 break;
15671 default:
15672 gcc_unreachable ();
15673 }
15674
15675 if (!register_operand (ops[1], half_mode))
15676 ops[1] = force_reg (half_mode, ops[1]);
15677 if (!register_operand (ops[0], half_mode))
15678 ops[0] = force_reg (half_mode, ops[0]);
15679 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
15680 ops[1])));
15681 break;
15682
15683 case 4:
15684 switch (mode)
15685 {
15686 case E_V4DImode:
15687 half_mode = V2DImode;
15688 break;
15689 case E_V4DFmode:
15690 half_mode = V2DFmode;
15691 break;
15692 case E_V4SImode:
15693 half_mode = V2SImode;
15694 break;
15695 case E_V4SFmode:
15696 half_mode = V2SFmode;
15697 break;
15698 default:
15699 gcc_unreachable ();
15700 }
15701 goto half;
15702
15703 case 8:
15704 switch (mode)
15705 {
15706 case E_V8DImode:
15707 half_mode = V4DImode;
15708 break;
15709 case E_V8DFmode:
15710 half_mode = V4DFmode;
15711 break;
15712 case E_V8SImode:
15713 half_mode = V4SImode;
15714 break;
15715 case E_V8SFmode:
15716 half_mode = V4SFmode;
15717 break;
15718 default:
15719 gcc_unreachable ();
15720 }
15721 goto half;
15722
15723 case 16:
15724 switch (mode)
15725 {
15726 case E_V16SImode:
15727 half_mode = V8SImode;
15728 break;
15729 case E_V16SFmode:
15730 half_mode = V8SFmode;
15731 break;
15732 default:
15733 gcc_unreachable ();
15734 }
15735 goto half;
15736
15737 half:
15738 /* FIXME: We process inputs backward to help RA. PR 36222. */
15739 i = n - 1;
15740 for (j = 1; j != -1; j--)
15741 {
15742 half[j] = gen_reg_rtx (half_mode);
15743 switch (n >> 1)
15744 {
15745 case 2:
15746 v = gen_rtvec (2, ops[i-1], ops[i]);
15747 i -= 2;
15748 break;
15749 case 4:
15750 v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
15751 i -= 4;
15752 break;
15753 case 8:
15754 v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
15755 ops[i-3], ops[i-2], ops[i-1], ops[i]);
15756 i -= 8;
15757 break;
15758 default:
15759 gcc_unreachable ();
15760 }
15761 ix86_expand_vector_init (false, half[j],
15762 gen_rtx_PARALLEL (half_mode, v));
15763 }
15764
15765 ix86_expand_vector_init_concat (mode, target, half, 2);
15766 break;
15767
15768 default:
15769 gcc_unreachable ();
15770 }
15771 }
15772
15773 /* A subroutine of ix86_expand_vector_init_general. Use vector
15774 interleave to handle the most general case: all values variable,
15775 and none identical. */
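/* For V16QImode, for example, this proceeds roughly as follows: each of
   the eight (even, odd) scalar pairs is packed into the low 16 bits of
   its own vector, those eight vectors are then merged pairwise with
   interleave-low at V8HImode granularity, the four results at V4SImode
   granularity, and the last two at V2DImode granularity, leaving a
   single fully populated 128-bit vector.  */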
15776
15777 static void
15778 ix86_expand_vector_init_interleave (machine_mode mode,
15779 rtx target, rtx *ops, int n)
15780 {
15781 machine_mode first_imode, second_imode, third_imode, inner_mode;
15782 int i, j;
15783 rtx op, op0, op1;
15784 rtx (*gen_load_even) (rtx, rtx, rtx);
15785 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
15786 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
15787
15788 switch (mode)
15789 {
15790 case E_V8HFmode:
15791 gen_load_even = gen_vec_interleave_lowv8hf;
15792 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15793 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15794 inner_mode = HFmode;
15795 first_imode = V4SImode;
15796 second_imode = V2DImode;
15797 third_imode = VOIDmode;
15798 break;
15799 case E_V8BFmode:
15800 gen_load_even = gen_vec_interleave_lowv8bf;
15801 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15802 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15803 inner_mode = BFmode;
15804 first_imode = V4SImode;
15805 second_imode = V2DImode;
15806 third_imode = VOIDmode;
15807 break;
15808 case E_V8HImode:
15809 gen_load_even = gen_vec_setv8hi;
15810 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15811 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15812 inner_mode = HImode;
15813 first_imode = V4SImode;
15814 second_imode = V2DImode;
15815 third_imode = VOIDmode;
15816 break;
15817 case E_V16QImode:
15818 gen_load_even = gen_vec_setv16qi;
15819 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
15820 gen_interleave_second_low = gen_vec_interleave_lowv4si;
15821 inner_mode = QImode;
15822 first_imode = V8HImode;
15823 second_imode = V4SImode;
15824 third_imode = V2DImode;
15825 break;
15826 default:
15827 gcc_unreachable ();
15828 }
15829
15830 for (i = 0; i < n; i++)
15831 {
15832 op = ops [i + i];
15833 if (inner_mode == HFmode || inner_mode == BFmode)
15834 {
15835 rtx even, odd;
15836 /* Use vpunpcklwd to pack two HFmode or BFmode values. */
15837 machine_mode vec_mode =
15838 (inner_mode == HFmode) ? V8HFmode : V8BFmode;
15839 op0 = gen_reg_rtx (vec_mode);
15840 even = lowpart_subreg (vec_mode,
15841 force_reg (inner_mode, op), inner_mode);
15842 odd = lowpart_subreg (vec_mode,
15843 force_reg (inner_mode, ops[i + i + 1]),
15844 inner_mode);
15845 emit_insn (gen_load_even (op0, even, odd));
15846 }
15847 else
15848 {
15849 /* Extend the odd element to SImode using a paradoxical SUBREG. */
15850 op0 = gen_reg_rtx (SImode);
15851 emit_move_insn (op0, gen_lowpart (SImode, op));
15852
15853 /* Insert the SImode value as low element of V4SImode vector. */
15854 op1 = gen_reg_rtx (V4SImode);
15855 op0 = gen_rtx_VEC_MERGE (V4SImode,
15856 gen_rtx_VEC_DUPLICATE (V4SImode,
15857 op0),
15858 CONST0_RTX (V4SImode),
15859 const1_rtx);
15860 emit_insn (gen_rtx_SET (op1, op0));
15861
15862 /* Cast the V4SImode vector back to a vector in the original mode. */
15863 op0 = gen_reg_rtx (mode);
15864 emit_move_insn (op0, gen_lowpart (mode, op1));
15865
15866 /* Load even elements into the second position. */
15867 emit_insn (gen_load_even (op0,
15868 force_reg (inner_mode,
15869 ops[i + i + 1]),
15870 const1_rtx));
15871 }
15872
15873 /* Cast vector to FIRST_IMODE vector. */
15874 ops[i] = gen_reg_rtx (first_imode);
15875 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
15876 }
15877
15878 /* Interleave low FIRST_IMODE vectors. */
15879 for (i = j = 0; i < n; i += 2, j++)
15880 {
15881 op0 = gen_reg_rtx (first_imode);
15882 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
15883
15884 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
15885 ops[j] = gen_reg_rtx (second_imode);
15886 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
15887 }
15888
15889 /* Interleave low SECOND_IMODE vectors. */
15890 switch (second_imode)
15891 {
15892 case E_V4SImode:
15893 for (i = j = 0; i < n / 2; i += 2, j++)
15894 {
15895 op0 = gen_reg_rtx (second_imode);
15896 emit_insn (gen_interleave_second_low (op0, ops[i],
15897 ops[i + 1]));
15898
15899 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
15900 vector. */
15901 ops[j] = gen_reg_rtx (third_imode);
15902 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
15903 }
15904 second_imode = V2DImode;
15905 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15906 /* FALLTHRU */
15907
15908 case E_V2DImode:
15909 op0 = gen_reg_rtx (second_imode);
15910 emit_insn (gen_interleave_second_low (op0, ops[0],
15911 ops[1]));
15912
15913 /* Cast the SECOND_IMODE vector back to a vector in the original
15914 mode. */
15915 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
15916 break;
15917
15918 default:
15919 gcc_unreachable ();
15920 }
15921 }
15922
15923 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
15924 all values variable, and none identical. */
15925
15926 static void
15927 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
15928 rtx target, rtx vals)
15929 {
15930 rtx ops[64], op0, op1, op2, op3, op4, op5;
15931 machine_mode half_mode = VOIDmode;
15932 machine_mode quarter_mode = VOIDmode;
15933 int n, i;
15934
15935 switch (mode)
15936 {
15937 case E_V2SFmode:
15938 case E_V2SImode:
15939 if (!mmx_ok && !TARGET_SSE)
15940 break;
15941 /* FALLTHRU */
15942
15943 case E_V16SImode:
15944 case E_V16SFmode:
15945 case E_V8DFmode:
15946 case E_V8DImode:
15947 case E_V8SFmode:
15948 case E_V8SImode:
15949 case E_V4DFmode:
15950 case E_V4DImode:
15951 case E_V4SFmode:
15952 case E_V4SImode:
15953 case E_V2DFmode:
15954 case E_V2DImode:
15955 n = GET_MODE_NUNITS (mode);
15956 for (i = 0; i < n; i++)
15957 ops[i] = XVECEXP (vals, 0, i);
15958 ix86_expand_vector_init_concat (mode, target, ops, n);
15959 return;
15960
15961 case E_V2TImode:
15962 for (i = 0; i < 2; i++)
15963 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
15964 op0 = gen_reg_rtx (V4DImode);
15965 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
15966 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
15967 return;
15968
15969 case E_V4TImode:
15970 for (i = 0; i < 4; i++)
15971 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
15972 ops[4] = gen_reg_rtx (V4DImode);
15973 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
15974 ops[5] = gen_reg_rtx (V4DImode);
15975 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
15976 op0 = gen_reg_rtx (V8DImode);
15977 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
15978 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
15979 return;
15980
15981 case E_V32QImode:
15982 half_mode = V16QImode;
15983 goto half;
15984
15985 case E_V16HImode:
15986 half_mode = V8HImode;
15987 goto half;
15988
15989 case E_V16HFmode:
15990 half_mode = V8HFmode;
15991 goto half;
15992
15993 case E_V16BFmode:
15994 half_mode = V8BFmode;
15995 goto half;
15996
15997 half:
15998 n = GET_MODE_NUNITS (mode);
15999 for (i = 0; i < n; i++)
16000 ops[i] = XVECEXP (vals, 0, i);
16001 op0 = gen_reg_rtx (half_mode);
16002 op1 = gen_reg_rtx (half_mode);
16003 ix86_expand_vector_init_interleave (half_mode, op0, ops,
16004 n >> 2);
16005 ix86_expand_vector_init_interleave (half_mode, op1,
16006 &ops [n >> 1], n >> 2);
16007 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
16008 return;
16009
16010 case E_V64QImode:
16011 quarter_mode = V16QImode;
16012 half_mode = V32QImode;
16013 goto quarter;
16014
16015 case E_V32HImode:
16016 quarter_mode = V8HImode;
16017 half_mode = V16HImode;
16018 goto quarter;
16019
16020 case E_V32HFmode:
16021 quarter_mode = V8HFmode;
16022 half_mode = V16HFmode;
16023 goto quarter;
16024
16025 case E_V32BFmode:
16026 quarter_mode = V8BFmode;
16027 half_mode = V16BFmode;
16028 goto quarter;
16029
16030 quarter:
16031 n = GET_MODE_NUNITS (mode);
16032 for (i = 0; i < n; i++)
16033 ops[i] = XVECEXP (vals, 0, i);
16034 op0 = gen_reg_rtx (quarter_mode);
16035 op1 = gen_reg_rtx (quarter_mode);
16036 op2 = gen_reg_rtx (quarter_mode);
16037 op3 = gen_reg_rtx (quarter_mode);
16038 op4 = gen_reg_rtx (half_mode);
16039 op5 = gen_reg_rtx (half_mode);
16040 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
16041 n >> 3);
16042 ix86_expand_vector_init_interleave (quarter_mode, op1,
16043 &ops [n >> 2], n >> 3);
16044 ix86_expand_vector_init_interleave (quarter_mode, op2,
16045 &ops [n >> 1], n >> 3);
16046 ix86_expand_vector_init_interleave (quarter_mode, op3,
16047 &ops [(n >> 1) | (n >> 2)], n >> 3);
16048 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
16049 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
16050 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
16051 return;
16052
16053 case E_V16QImode:
16054 if (!TARGET_SSE4_1)
16055 break;
16056 /* FALLTHRU */
16057
16058 case E_V8HImode:
16059 if (!TARGET_SSE2)
16060 break;
16061
16062 /* Don't use ix86_expand_vector_init_interleave if we can't
16063 move from GPR to SSE register directly. */
16064 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
16065 break;
16066 /* FALLTHRU */
16067
16068 case E_V8HFmode:
16069 case E_V8BFmode:
16070
16071 n = GET_MODE_NUNITS (mode);
16072 for (i = 0; i < n; i++)
16073 ops[i] = XVECEXP (vals, 0, i);
16074 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
16075 return;
16076
16077 case E_V4HImode:
16078 case E_V8QImode:
16079
16080 case E_V2HImode:
16081 case E_V4QImode:
16082 break;
16083
16084 default:
16085 gcc_unreachable ();
16086 }
16087
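/* Fallback for the modes that break out of the switch above: pack the
   elements into word-sized integers and build the vector from those.
   For V4QImode {a, b, c, d}, for example, the loop below forms the
   single SImode word a | (b << 8) | (c << 16) | (d << 24), which is
   then moved into the target via gen_lowpart.  */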
16088 {
16089 int i, j, n_elts, n_words, n_elt_per_word;
16090 machine_mode tmp_mode, inner_mode;
16091 rtx words[4], shift;
16092
16093 tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
16094
16095 inner_mode = GET_MODE_INNER (mode);
16096 n_elts = GET_MODE_NUNITS (mode);
16097 n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
16098 n_elt_per_word = n_elts / n_words;
16099 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
16100
16101 for (i = 0; i < n_words; ++i)
16102 {
16103 rtx word = NULL_RTX;
16104
16105 for (j = 0; j < n_elt_per_word; ++j)
16106 {
16107 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
16108 elt = convert_modes (tmp_mode, inner_mode, elt, true);
16109
16110 if (j == 0)
16111 word = elt;
16112 else
16113 {
16114 word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
16115 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16116 word = expand_simple_binop (tmp_mode, IOR, word, elt,
16117 NULL_RTX, 1, OPTAB_LIB_WIDEN);
16118 }
16119 }
16120
16121 words[i] = word;
16122 }
16123
16124 if (n_words == 1)
16125 emit_move_insn (target, gen_lowpart (mode, words[0]));
16126 else if (n_words == 2)
16127 {
16128 rtx tmp = gen_reg_rtx (mode);
16129 emit_clobber (tmp);
16130 emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
16131 emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
16132 emit_move_insn (target, tmp);
16133 }
16134 else if (n_words == 4)
16135 {
16136 rtx tmp = gen_reg_rtx (V4SImode);
16137 gcc_assert (tmp_mode == SImode);
16138 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
16139 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
16140 emit_move_insn (target, gen_lowpart (mode, tmp));
16141 }
16142 else
16143 gcc_unreachable ();
16144 }
16145 }
16146
16147 /* Initialize vector TARGET via VALS. Suppress the use of MMX
16148 instructions unless MMX_OK is true. */
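/* Broadly, the strategy below is: fully constant vectors are loaded as
   a CONST_VECTOR, an all-identical vector goes through
   ix86_expand_vector_init_duplicate, a vector with exactly one variable
   element tries ix86_expand_vector_init_one_nonzero and then
   ix86_expand_vector_init_one_var, and everything else falls back to
   ix86_expand_vector_init_general.  */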
16149
16150 void
16151 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
16152 {
16153 machine_mode mode = GET_MODE (target);
16154 machine_mode inner_mode = GET_MODE_INNER (mode);
16155 int n_elts = GET_MODE_NUNITS (mode);
16156 int n_var = 0, one_var = -1;
16157 bool all_same = true, all_const_zero = true;
16158 int i;
16159 rtx x;
16160
16161 /* Handle initialization from vector elts first. */
16162 if (n_elts != XVECLEN (vals, 0))
16163 {
16164 rtx subtarget = target;
16165 x = XVECEXP (vals, 0, 0);
16166 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
16167 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
16168 {
16169 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
16170 if (inner_mode == QImode
16171 || inner_mode == HImode
16172 || inner_mode == TImode
16173 || inner_mode == HFmode
16174 || inner_mode == BFmode)
16175 {
16176 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
16177 scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
16178 n_bits /= GET_MODE_SIZE (elt_mode);
16179 mode = mode_for_vector (elt_mode, n_bits).require ();
16180 inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
16181 ops[0] = gen_lowpart (inner_mode, ops[0]);
16182 ops[1] = gen_lowpart (inner_mode, ops[1]);
16183 subtarget = gen_reg_rtx (mode);
16184 }
16185 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
16186 if (subtarget != target)
16187 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
16188 return;
16189 }
16190 gcc_unreachable ();
16191 }
16192
16193 for (i = 0; i < n_elts; ++i)
16194 {
16195 x = XVECEXP (vals, 0, i);
16196 if (!(CONST_SCALAR_INT_P (x)
16197 || CONST_DOUBLE_P (x)
16198 || CONST_FIXED_P (x)))
16199 n_var++, one_var = i;
16200 else if (x != CONST0_RTX (inner_mode))
16201 all_const_zero = false;
16202 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
16203 all_same = false;
16204 }
16205
16206 /* Constants are best loaded from the constant pool. */
16207 if (n_var == 0)
16208 {
16209 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
16210 return;
16211 }
16212
16213 /* If all values are identical, broadcast the value. */
16214 if (all_same
16215 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
16216 XVECEXP (vals, 0, 0)))
16217 return;
16218
16219 /* Values where only one field is non-constant are best loaded from
16220 the pool and overwritten via move later. */
16221 if (n_var == 1)
16222 {
16223 if (all_const_zero
16224 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
16225 XVECEXP (vals, 0, one_var),
16226 one_var))
16227 return;
16228
16229 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
16230 return;
16231 }
16232
16233 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
16234 }
16235
16236 /* Implemented as
16237 V setg (V v, int idx, T val)
16238 {
16239 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
16240 V valv = (V){val, val, val, val, val, val, val, val};
16241 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
16242 v = (v & ~mask) | (valv & mask);
16243 return v;
16244 }. */
16245 void
16246 ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
16247 {
16248 rtx vec[64];
16249 machine_mode mode = GET_MODE (target);
16250 machine_mode cmp_mode = mode;
16251 int n_elts = GET_MODE_NUNITS (mode);
16252 rtx valv,idxv,constv,idx_tmp;
16253 bool ok = false;
16254
16255 /* 512-bit vector byte/word broadcast and comparison are only available
16256 under TARGET_AVX512BW; without TARGET_AVX512BW, break the 512-bit
16257 vector into two 256-bit vectors. */
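/* For V64QImode, for example, this recurses on two V32QImode halves:
   the index for the high half is rebased by subtracting n_elts/2, and
   because each recursive call only matches indices 0 .. n_elts/2 - 1
   against its constant vector, the half whose (rebased) index is out
   of range is left unchanged.  */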
16258 if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
16259 || mode == V64QImode)
16260 && !TARGET_AVX512BW)
16261 {
16262 gcc_assert (TARGET_AVX512F);
16263 rtx vhi, vlo, idx_hi;
16264 machine_mode half_mode;
16265 rtx (*extract_hi)(rtx, rtx);
16266 rtx (*extract_lo)(rtx, rtx);
16267
16268 if (mode == V32HImode)
16269 {
16270 half_mode = V16HImode;
16271 extract_hi = gen_vec_extract_hi_v32hi;
16272 extract_lo = gen_vec_extract_lo_v32hi;
16273 }
16274 else if (mode == V32HFmode)
16275 {
16276 half_mode = V16HFmode;
16277 extract_hi = gen_vec_extract_hi_v32hf;
16278 extract_lo = gen_vec_extract_lo_v32hf;
16279 }
16280 else if (mode == V32BFmode)
16281 {
16282 half_mode = V16BFmode;
16283 extract_hi = gen_vec_extract_hi_v32bf;
16284 extract_lo = gen_vec_extract_lo_v32bf;
16285 }
16286 else
16287 {
16288 half_mode = V32QImode;
16289 extract_hi = gen_vec_extract_hi_v64qi;
16290 extract_lo = gen_vec_extract_lo_v64qi;
16291 }
16292
16293 vhi = gen_reg_rtx (half_mode);
16294 vlo = gen_reg_rtx (half_mode);
16295 idx_hi = gen_reg_rtx (GET_MODE (idx));
16296 emit_insn (extract_hi (vhi, target));
16297 emit_insn (extract_lo (vlo, target));
16298 vec[0] = idx_hi;
16299 vec[1] = idx;
16300 vec[2] = GEN_INT (n_elts/2);
16301 ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
16302 ix86_expand_vector_set_var (vhi, val, idx_hi);
16303 ix86_expand_vector_set_var (vlo, val, idx);
16304 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
16305 return;
16306 }
16307
16308 if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
16309 {
16310 switch (mode)
16311 {
16312 case E_V2DFmode:
16313 cmp_mode = V2DImode;
16314 break;
16315 case E_V4DFmode:
16316 cmp_mode = V4DImode;
16317 break;
16318 case E_V8DFmode:
16319 cmp_mode = V8DImode;
16320 break;
16321 case E_V2SFmode:
16322 cmp_mode = V2SImode;
16323 break;
16324 case E_V4SFmode:
16325 cmp_mode = V4SImode;
16326 break;
16327 case E_V8SFmode:
16328 cmp_mode = V8SImode;
16329 break;
16330 case E_V16SFmode:
16331 cmp_mode = V16SImode;
16332 break;
16333 case E_V8HFmode:
16334 cmp_mode = V8HImode;
16335 break;
16336 case E_V16HFmode:
16337 cmp_mode = V16HImode;
16338 break;
16339 case E_V32HFmode:
16340 cmp_mode = V32HImode;
16341 break;
16342 case E_V8BFmode:
16343 cmp_mode = V8HImode;
16344 break;
16345 case E_V16BFmode:
16346 cmp_mode = V16HImode;
16347 break;
16348 case E_V32BFmode:
16349 cmp_mode = V32HImode;
16350 break;
16351 default:
16352 gcc_unreachable ();
16353 }
16354 }
16355
16356 for (int i = 0; i != n_elts; i++)
16357 vec[i] = GEN_INT (i);
16358 constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
16359 valv = gen_reg_rtx (mode);
16360 idxv = gen_reg_rtx (cmp_mode);
16361 idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
16362
16363 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16364 mode, valv, val);
16365 gcc_assert (ok);
16366 ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
16367 cmp_mode, idxv, idx_tmp);
16368 gcc_assert (ok);
16369 vec[0] = target;
16370 vec[1] = valv;
16371 vec[2] = target;
16372 vec[3] = gen_rtx_EQ (mode, idxv, constv);
16373 vec[4] = idxv;
16374 vec[5] = constv;
16375 ok = ix86_expand_int_vcond (vec);
16376 gcc_assert (ok);
16377 }
16378
16379 void
16380 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
16381 {
16382 machine_mode mode = GET_MODE (target);
16383 machine_mode inner_mode = GET_MODE_INNER (mode);
16384 machine_mode half_mode;
16385 bool use_vec_merge = false;
16386 bool blendm_const = false;
16387 rtx tmp;
16388 static rtx (*gen_extract[8][2]) (rtx, rtx)
16389 = {
16390 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
16391 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
16392 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
16393 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
16394 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
16395 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
16396 { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
16397 { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
16398 };
16399 static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
16400 = {
16401 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
16402 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
16403 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
16404 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
16405 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
16406 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
16407 { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
16408 { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
16409 };
16410 int i, j, n;
16411 machine_mode mmode = VOIDmode;
16412 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
16413
16414 switch (mode)
16415 {
16416 case E_V2SImode:
16417 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16418 if (use_vec_merge)
16419 break;
16420 /* FALLTHRU */
16421
16422 case E_V2SFmode:
16423 if (mmx_ok)
16424 {
16425 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16426 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
16427 if (elt == 0)
16428 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16429 else
16430 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16431 emit_insn (gen_rtx_SET (target, tmp));
16432 return;
16433 }
16434 break;
16435
16436 case E_V2DImode:
16437 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
16438 if (use_vec_merge)
16439 break;
16440
16441 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
16442 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
16443 if (elt == 0)
16444 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
16445 else
16446 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
16447 emit_insn (gen_rtx_SET (target, tmp));
16448 return;
16449
16450 case E_V2DFmode:
16451 /* NB: For ELT == 0, use standard scalar operation patterns which
16452 preserve the rest of the vector for combiner:
16453
16454 (vec_merge:V2DF
16455 (vec_duplicate:V2DF (reg:DF))
16456 (reg:V2DF)
16457 (const_int 1))
16458 */
16459 if (elt == 0)
16460 goto do_vec_merge;
16461
16462 {
16463 rtx op0, op1;
16464
16465 /* For the two element vectors, we implement a VEC_CONCAT with
16466 the extraction of the other element. */
16467
16468 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
16469 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
16470
16471 if (elt == 0)
16472 op0 = val, op1 = tmp;
16473 else
16474 op0 = tmp, op1 = val;
16475
16476 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
16477 emit_insn (gen_rtx_SET (target, tmp));
16478 }
16479 return;
16480
16481 case E_V4SFmode:
16482 use_vec_merge = TARGET_SSE4_1;
16483 if (use_vec_merge)
16484 break;
16485
16486 switch (elt)
16487 {
16488 case 0:
16489 use_vec_merge = true;
16490 break;
16491
16492 case 1:
16493 /* tmp = target = A B C D */
16494 tmp = copy_to_reg (target);
16495 /* target = A A B B */
16496 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
16497 /* target = X A B B */
16498 ix86_expand_vector_set (false, target, val, 0);
16499 /* target = A X C D */
16500 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16501 const1_rtx, const0_rtx,
16502 GEN_INT (2+4), GEN_INT (3+4)));
16503 return;
16504
16505 case 2:
16506 /* tmp = target = A B C D */
16507 tmp = copy_to_reg (target);
16508 /* tmp = X B C D */
16509 ix86_expand_vector_set (false, tmp, val, 0);
16510 /* target = A B X D */
16511 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16512 const0_rtx, const1_rtx,
16513 GEN_INT (0+4), GEN_INT (3+4)));
16514 return;
16515
16516 case 3:
16517 /* tmp = target = A B C D */
16518 tmp = copy_to_reg (target);
16519 /* tmp = X B C D */
16520 ix86_expand_vector_set (false, tmp, val, 0);
16521 /* target = A B C X */
16522 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
16523 const0_rtx, const1_rtx,
16524 GEN_INT (2+4), GEN_INT (0+4)));
16525 return;
16526
16527 default:
16528 gcc_unreachable ();
16529 }
16530 break;
16531
16532 case E_V4SImode:
16533 use_vec_merge = TARGET_SSE4_1;
16534 if (use_vec_merge)
16535 break;
16536
16537 /* Element 0 handled by vec_merge below. */
16538 if (elt == 0)
16539 {
16540 use_vec_merge = true;
16541 break;
16542 }
16543
16544 if (TARGET_SSE2)
16545 {
16546 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16547 store into element 0, then shuffle them back. */
16548
16549 rtx order[4];
16550
16551 order[0] = GEN_INT (elt);
16552 order[1] = const1_rtx;
16553 order[2] = const2_rtx;
16554 order[3] = GEN_INT (3);
16555 order[elt] = const0_rtx;
16556
16557 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16558 order[1], order[2], order[3]));
16559
16560 ix86_expand_vector_set (false, target, val, 0);
16561
16562 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
16563 order[1], order[2], order[3]));
16564 }
16565 else
16566 {
16567 /* For SSE1, we have to reuse the V4SF code. */
16568 rtx t = gen_reg_rtx (V4SFmode);
16569 emit_move_insn (t, gen_lowpart (V4SFmode, target));
16570 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
16571 emit_move_insn (target, gen_lowpart (mode, t));
16572 }
16573 return;
16574
16575 case E_V8HImode:
16576 case E_V8HFmode:
16577 case E_V8BFmode:
16578 case E_V2HImode:
16579 use_vec_merge = TARGET_SSE2;
16580 break;
16581 case E_V4HImode:
16582 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16583 break;
16584
16585 case E_V16QImode:
16586 case E_V4QImode:
16587 use_vec_merge = TARGET_SSE4_1;
16588 break;
16589
16590 case E_V8QImode:
16591 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16592 break;
16593
16594 case E_V32QImode:
16595 half_mode = V16QImode;
16596 j = 0;
16597 n = 16;
16598 goto half;
16599
16600 case E_V16HFmode:
16601 case E_V16BFmode:
16602 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16603 if (TARGET_AVX2 && elt != 0)
16604 {
16605 mmode = SImode;
16606 gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
16607 : gen_avx2_pblendbf_1);
16608 blendm_const = true;
16609 break;
16610 }
16611 else
16612 {
16613 half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
16614 j = ((mode == E_V16HFmode) ? 6 : 7);
16615 n = 8;
16616 goto half;
16617 }
16618
16619 case E_V16HImode:
16620 half_mode = V8HImode;
16621 j = 1;
16622 n = 8;
16623 goto half;
16624
16625 case E_V8SImode:
16626 half_mode = V4SImode;
16627 j = 2;
16628 n = 4;
16629 goto half;
16630
16631 case E_V4DImode:
16632 half_mode = V2DImode;
16633 j = 3;
16634 n = 2;
16635 goto half;
16636
16637 case E_V8SFmode:
16638 half_mode = V4SFmode;
16639 j = 4;
16640 n = 4;
16641 goto half;
16642
16643 case E_V4DFmode:
16644 half_mode = V2DFmode;
16645 j = 5;
16646 n = 2;
16647 goto half;
16648
16649 half:
16650 /* Compute offset. */
16651 i = elt / n;
16652 elt %= n;
16653
16654 gcc_assert (i <= 1);
16655
16656 /* Extract the half. */
16657 tmp = gen_reg_rtx (half_mode);
16658 emit_insn (gen_extract[j][i] (tmp, target));
16659
16660 /* Put val in tmp at elt. */
16661 ix86_expand_vector_set (false, tmp, val, elt);
16662
16663 /* Put it back. */
16664 emit_insn (gen_insert[j][i] (target, target, tmp));
16665 return;
16666
16667 case E_V8DFmode:
16668 if (TARGET_AVX512F)
16669 {
16670 mmode = QImode;
16671 gen_blendm = gen_avx512f_blendmv8df;
16672 }
16673 break;
16674
16675 case E_V8DImode:
16676 if (TARGET_AVX512F)
16677 {
16678 mmode = QImode;
16679 gen_blendm = gen_avx512f_blendmv8di;
16680 }
16681 break;
16682
16683 case E_V16SFmode:
16684 if (TARGET_AVX512F)
16685 {
16686 mmode = HImode;
16687 gen_blendm = gen_avx512f_blendmv16sf;
16688 }
16689 break;
16690
16691 case E_V16SImode:
16692 if (TARGET_AVX512F)
16693 {
16694 mmode = HImode;
16695 gen_blendm = gen_avx512f_blendmv16si;
16696 }
16697 break;
16698
16699 case E_V32HFmode:
16700 if (TARGET_AVX512BW)
16701 {
16702 mmode = SImode;
16703 gen_blendm = gen_avx512bw_blendmv32hf;
16704 }
16705 break;
16706 case E_V32BFmode:
16707 if (TARGET_AVX512BW)
16708 {
16709 mmode = SImode;
16710 gen_blendm = gen_avx512bw_blendmv32bf;
16711 }
16712 break;
16713 case E_V32HImode:
16714 if (TARGET_AVX512BW)
16715 {
16716 mmode = SImode;
16717 gen_blendm = gen_avx512bw_blendmv32hi;
16718 }
16719 else if (TARGET_AVX512F)
16720 {
16721 half_mode = E_V8HImode;
16722 n = 8;
16723 goto quarter;
16724 }
16725 break;
16726
16727 case E_V64QImode:
16728 if (TARGET_AVX512BW)
16729 {
16730 mmode = DImode;
16731 gen_blendm = gen_avx512bw_blendmv64qi;
16732 }
16733 else if (TARGET_AVX512F)
16734 {
16735 half_mode = E_V16QImode;
16736 n = 16;
16737 goto quarter;
16738 }
16739 break;
16740
16741 quarter:
16742 /* Compute offset. */
16743 i = elt / n;
16744 elt %= n;
16745
16746 gcc_assert (i <= 3);
16747
16748 {
16749 /* Extract the quarter. */
16750 tmp = gen_reg_rtx (V4SImode);
16751 rtx tmp2 = gen_lowpart (V16SImode, target);
16752 rtx mask = gen_reg_rtx (QImode);
16753
16754 emit_move_insn (mask, constm1_rtx);
16755 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
16756 tmp, mask));
16757
16758 tmp2 = gen_reg_rtx (half_mode);
16759 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
16760 tmp = tmp2;
16761
16762 /* Put val in tmp at elt. */
16763 ix86_expand_vector_set (false, tmp, val, elt);
16764
16765 /* Put it back. */
16766 tmp2 = gen_reg_rtx (V16SImode);
16767 rtx tmp3 = gen_lowpart (V16SImode, target);
16768 mask = gen_reg_rtx (HImode);
16769 emit_move_insn (mask, constm1_rtx);
16770 tmp = gen_lowpart (V4SImode, tmp);
16771 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
16772 tmp3, mask));
16773 emit_move_insn (target, gen_lowpart (mode, tmp2));
16774 }
16775 return;
16776
16777 default:
16778 break;
16779 }
16780
16781 if (mmode != VOIDmode)
16782 {
16783 tmp = gen_reg_rtx (mode);
16784 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
16785 rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
16786 /* The avx512*_blendm<mode> expanders have different operand order
16787 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
16788 elements where the mask is set and second input operand otherwise,
16789 in {sse,avx}*_*blend* the first input operand is used for elements
16790 where the mask is clear and second input operand otherwise. */
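/* For example, setting element 2 of a V16SFmode vector under AVX512F
   broadcasts VAL into TMP and blends it into TARGET through
   gen_avx512f_blendmv16sf with the HImode mask 1 << 2; for the AVX2
   pblendph/pblendbf cases above the mask is instead kept as a constant
   (blendm_const is true).  */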
16791 if (!blendm_const)
16792 merge_mask = force_reg (mmode, merge_mask);
16793 emit_insn (gen_blendm (target, target, tmp, merge_mask));
16794 }
16795 else if (use_vec_merge)
16796 {
16797 do_vec_merge:
16798 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
16799 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
16800 GEN_INT (HOST_WIDE_INT_1U << elt));
16801 emit_insn (gen_rtx_SET (target, tmp));
16802 }
16803 else
16804 {
16805 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
16806
16807 emit_move_insn (mem, target);
16808
16809 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
16810 emit_move_insn (tmp, val);
16811
16812 emit_move_insn (target, mem);
16813 }
16814 }
16815
16816 void
16817 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
16818 {
16819 machine_mode mode = GET_MODE (vec);
16820 machine_mode inner_mode = GET_MODE_INNER (mode);
16821 bool use_vec_extr = false;
16822 rtx tmp;
16823
16824 switch (mode)
16825 {
16826 case E_V2SImode:
16827 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
16828 if (use_vec_extr)
16829 break;
16830 /* FALLTHRU */
16831
16832 case E_V2SFmode:
16833 if (!mmx_ok)
16834 break;
16835 /* FALLTHRU */
16836
16837 case E_V2DFmode:
16838 case E_V2DImode:
16839 case E_V2TImode:
16840 case E_V4TImode:
16841 use_vec_extr = true;
16842 break;
16843
16844 case E_V4SFmode:
16845 use_vec_extr = TARGET_SSE4_1;
16846 if (use_vec_extr)
16847 break;
16848
16849 switch (elt)
16850 {
16851 case 0:
16852 tmp = vec;
16853 break;
16854
16855 case 1:
16856 case 3:
16857 tmp = gen_reg_rtx (mode);
16858 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
16859 GEN_INT (elt), GEN_INT (elt),
16860 GEN_INT (elt+4), GEN_INT (elt+4)));
16861 break;
16862
16863 case 2:
16864 tmp = gen_reg_rtx (mode);
16865 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
16866 break;
16867
16868 default:
16869 gcc_unreachable ();
16870 }
16871 vec = tmp;
16872 use_vec_extr = true;
16873 elt = 0;
16874 break;
16875
16876 case E_V4SImode:
16877 use_vec_extr = TARGET_SSE4_1;
16878 if (use_vec_extr)
16879 break;
16880
16881 if (TARGET_SSE2)
16882 {
16883 switch (elt)
16884 {
16885 case 0:
16886 tmp = vec;
16887 break;
16888
16889 case 1:
16890 case 3:
16891 tmp = gen_reg_rtx (mode);
16892 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
16893 GEN_INT (elt), GEN_INT (elt),
16894 GEN_INT (elt), GEN_INT (elt)));
16895 break;
16896
16897 case 2:
16898 tmp = gen_reg_rtx (mode);
16899 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
16900 break;
16901
16902 default:
16903 gcc_unreachable ();
16904 }
16905 vec = tmp;
16906 use_vec_extr = true;
16907 elt = 0;
16908 }
16909 else
16910 {
16911 /* For SSE1, we have to reuse the V4SF code. */
16912 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
16913 gen_lowpart (V4SFmode, vec), elt);
16914 return;
16915 }
16916 break;
16917
16918 case E_V8HImode:
16919 case E_V8HFmode:
16920 case E_V8BFmode:
16921 case E_V2HImode:
16922 use_vec_extr = TARGET_SSE2;
16923 break;
16924 case E_V4HImode:
16925 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
16926 break;
16927
16928 case E_V16QImode:
16929 use_vec_extr = TARGET_SSE4_1;
16930 if (!use_vec_extr
16931 && TARGET_SSE2
16932 && elt == 0
16933 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
16934 {
16935 tmp = gen_reg_rtx (SImode);
16936 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
16937 0);
16938 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
16939 return;
16940 }
16941 break;
16942 case E_V4QImode:
16943 use_vec_extr = TARGET_SSE4_1;
16944 break;
16945
16946 case E_V8SFmode:
16947 if (TARGET_AVX)
16948 {
16949 tmp = gen_reg_rtx (V4SFmode);
16950 if (elt < 4)
16951 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
16952 else
16953 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
16954 ix86_expand_vector_extract (false, target, tmp, elt & 3);
16955 return;
16956 }
16957 break;
16958
16959 case E_V4DFmode:
16960 if (TARGET_AVX)
16961 {
16962 tmp = gen_reg_rtx (V2DFmode);
16963 if (elt < 2)
16964 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
16965 else
16966 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
16967 ix86_expand_vector_extract (false, target, tmp, elt & 1);
16968 return;
16969 }
16970 break;
16971
16972 case E_V32QImode:
16973 if (TARGET_AVX)
16974 {
16975 tmp = gen_reg_rtx (V16QImode);
16976 if (elt < 16)
16977 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
16978 else
16979 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
16980 ix86_expand_vector_extract (false, target, tmp, elt & 15);
16981 return;
16982 }
16983 break;
16984
16985 case E_V16HImode:
16986 if (TARGET_AVX)
16987 {
16988 tmp = gen_reg_rtx (V8HImode);
16989 if (elt < 8)
16990 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
16991 else
16992 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
16993 ix86_expand_vector_extract (false, target, tmp, elt & 7);
16994 return;
16995 }
16996 break;
16997
16998 case E_V8SImode:
16999 if (TARGET_AVX)
17000 {
17001 tmp = gen_reg_rtx (V4SImode);
17002 if (elt < 4)
17003 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
17004 else
17005 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
17006 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17007 return;
17008 }
17009 break;
17010
17011 case E_V4DImode:
17012 if (TARGET_AVX)
17013 {
17014 tmp = gen_reg_rtx (V2DImode);
17015 if (elt < 2)
17016 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
17017 else
17018 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
17019 ix86_expand_vector_extract (false, target, tmp, elt & 1);
17020 return;
17021 }
17022 break;
17023
17024 case E_V32HImode:
17025 if (TARGET_AVX512BW)
17026 {
17027 tmp = gen_reg_rtx (V16HImode);
17028 if (elt < 16)
17029 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
17030 else
17031 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
17032 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17033 return;
17034 }
17035 break;
17036
17037 case E_V64QImode:
17038 if (TARGET_AVX512BW)
17039 {
17040 tmp = gen_reg_rtx (V32QImode);
17041 if (elt < 32)
17042 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
17043 else
17044 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
17045 ix86_expand_vector_extract (false, target, tmp, elt & 31);
17046 return;
17047 }
17048 break;
17049
17050 case E_V16SFmode:
17051 tmp = gen_reg_rtx (V8SFmode);
17052 if (elt < 8)
17053 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
17054 else
17055 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
17056 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17057 return;
17058
17059 case E_V8DFmode:
17060 tmp = gen_reg_rtx (V4DFmode);
17061 if (elt < 4)
17062 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
17063 else
17064 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
17065 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17066 return;
17067
17068 case E_V16SImode:
17069 tmp = gen_reg_rtx (V8SImode);
17070 if (elt < 8)
17071 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
17072 else
17073 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
17074 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17075 return;
17076
17077 case E_V8DImode:
17078 tmp = gen_reg_rtx (V4DImode);
17079 if (elt < 4)
17080 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
17081 else
17082 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
17083 ix86_expand_vector_extract (false, target, tmp, elt & 3);
17084 return;
17085
17086 case E_V32HFmode:
17087 case E_V32BFmode:
17088 if (TARGET_AVX512BW)
17089 {
17090 tmp = (mode == E_V32HFmode
17091 ? gen_reg_rtx (V16HFmode)
17092 : gen_reg_rtx (V16BFmode));
17093 if (elt < 16)
17094 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
17095 else
17096 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
17097 ix86_expand_vector_extract (false, target, tmp, elt & 15);
17098 return;
17099 }
17100 break;
17101
17102 case E_V16HFmode:
17103 case E_V16BFmode:
17104 if (TARGET_AVX)
17105 {
17106 tmp = (mode == E_V16HFmode
17107 ? gen_reg_rtx (V8HFmode)
17108 : gen_reg_rtx (V8BFmode));
17109 if (elt < 8)
17110 emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec));
17111 else
17112 emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec));
17113 ix86_expand_vector_extract (false, target, tmp, elt & 7);
17114 return;
17115 }
17116 break;
17117
17118 case E_V8QImode:
17119 use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17120 /* ??? Could extract the appropriate HImode element and shift. */
17121 break;
17122
17123 default:
17124 break;
17125 }
17126
17127 if (use_vec_extr)
17128 {
17129 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
17130 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
17131
17132 /* Let the rtl optimizers know about the zero extension performed. */
17133 if (inner_mode == QImode || inner_mode == HImode)
17134 {
17135 rtx reg = gen_reg_rtx (SImode);
17136 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
17137 emit_move_insn (reg, tmp);
17138 tmp = gen_lowpart (inner_mode, reg);
17139 SUBREG_PROMOTED_VAR_P (tmp) = 1;
17140 SUBREG_PROMOTED_SET (tmp, 1);
17141 }
17142
17143 emit_move_insn (target, tmp);
17144 }
17145 else
17146 {
17147 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
17148
17149 emit_move_insn (mem, vec);
17150
17151 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
17152 emit_move_insn (target, tmp);
17153 }
17154 }
17155
17156 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17157 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17158 The upper bits of DEST are undefined, though they shouldn't cause
17159 exceptions (some bits from src or all zeros are ok). */
17160
17161 static void
17162 emit_reduc_half (rtx dest, rtx src, int i)
17163 {
17164 rtx tem, d = dest;
17165 switch (GET_MODE (src))
17166 {
17167 case E_V4SFmode:
17168 if (i == 128)
17169 tem = gen_sse_movhlps (dest, src, src);
17170 else
17171 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
17172 GEN_INT (1 + 4), GEN_INT (1 + 4));
17173 break;
17174 case E_V2DFmode:
17175 tem = gen_vec_interleave_highv2df (dest, src, src);
17176 break;
17177 case E_V4QImode:
17178 d = gen_reg_rtx (V1SImode);
17179 tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
17180 GEN_INT (i / 2));
17181 break;
17182 case E_V4HImode:
17183 d = gen_reg_rtx (V1DImode);
17184 tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
17185 GEN_INT (i / 2));
17186 break;
17187 case E_V16QImode:
17188 case E_V8HImode:
17189 case E_V8HFmode:
17190 case E_V4SImode:
17191 case E_V2DImode:
17192 d = gen_reg_rtx (V1TImode);
17193 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
17194 GEN_INT (i / 2));
17195 break;
17196 case E_V8SFmode:
17197 if (i == 256)
17198 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
17199 else
17200 tem = gen_avx_shufps256 (dest, src, src,
17201 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
17202 break;
17203 case E_V4DFmode:
17204 if (i == 256)
17205 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
17206 else
17207 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
17208 break;
17209 case E_V32QImode:
17210 case E_V16HImode:
17211 case E_V16HFmode:
17212 case E_V8SImode:
17213 case E_V4DImode:
17214 if (i == 256)
17215 {
17216 if (GET_MODE (dest) != V4DImode)
17217 d = gen_reg_rtx (V4DImode);
17218 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
17219 gen_lowpart (V4DImode, src),
17220 const1_rtx);
17221 }
17222 else
17223 {
17224 d = gen_reg_rtx (V2TImode);
17225 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
17226 GEN_INT (i / 2));
17227 }
17228 break;
17229 case E_V64QImode:
17230 case E_V32HImode:
17231 case E_V32HFmode:
17232 if (i < 64)
17233 {
17234 d = gen_reg_rtx (V4TImode);
17235 tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
17236 GEN_INT (i / 2));
17237 break;
17238 }
17239 /* FALLTHRU */
17240 case E_V16SImode:
17241 case E_V16SFmode:
17242 case E_V8DImode:
17243 case E_V8DFmode:
17244 if (i > 128)
17245 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
17246 gen_lowpart (V16SImode, src),
17247 gen_lowpart (V16SImode, src),
17248 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
17249 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
17250 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
17251 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
17252 GEN_INT (0xC), GEN_INT (0xD),
17253 GEN_INT (0xE), GEN_INT (0xF),
17254 GEN_INT (0x10), GEN_INT (0x11),
17255 GEN_INT (0x12), GEN_INT (0x13),
17256 GEN_INT (0x14), GEN_INT (0x15),
17257 GEN_INT (0x16), GEN_INT (0x17));
17258 else
17259 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
17260 gen_lowpart (V16SImode, src),
17261 GEN_INT (i == 128 ? 0x2 : 0x1),
17262 GEN_INT (0x3),
17263 GEN_INT (0x3),
17264 GEN_INT (0x3),
17265 GEN_INT (i == 128 ? 0x6 : 0x5),
17266 GEN_INT (0x7),
17267 GEN_INT (0x7),
17268 GEN_INT (0x7),
17269 GEN_INT (i == 128 ? 0xA : 0x9),
17270 GEN_INT (0xB),
17271 GEN_INT (0xB),
17272 GEN_INT (0xB),
17273 GEN_INT (i == 128 ? 0xE : 0xD),
17274 GEN_INT (0xF),
17275 GEN_INT (0xF),
17276 GEN_INT (0xF));
17277 break;
17278 default:
17279 gcc_unreachable ();
17280 }
17281 emit_insn (tem);
17282 if (d != dest)
17283 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
17284 }
17285
17286 /* Expand a vector reduction. FN is the binary pattern to reduce;
17287 DEST is the destination; IN is the input vector. */
17288
17289 void
17290 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17291 {
17292 rtx half, dst, vec = in;
17293 machine_mode mode = GET_MODE (in);
17294 int i;
17295
17296 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17297 if (TARGET_SSE4_1
17298 && mode == V8HImode
17299 && fn == gen_uminv8hi3)
17300 {
17301 emit_insn (gen_sse4_1_phminposuw (dest, in));
17302 return;
17303 }
17304
17305 for (i = GET_MODE_BITSIZE (mode);
17306 i > GET_MODE_UNIT_BITSIZE (mode);
17307 i >>= 1)
17308 {
17309 half = gen_reg_rtx (mode);
17310 emit_reduc_half (half, vec, i);
17311 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17312 dst = dest;
17313 else
17314 dst = gen_reg_rtx (mode);
17315 emit_insn (fn (dst, half, vec));
17316 vec = dst;
17317 }
17318 }
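/* Editorial sketch, not part of GCC: the log2(n) halving strategy the loop
   above implements, written as plain scalar C over an array.  reduce_add_ref
   is a hypothetical name and addition stands in for the FN pattern.  */
#if 0
static float
reduce_add_ref (float *v, int n)       /* n is a power of two */
{
  /* Each pass combines the upper half with the lower half, loosely mirroring
     emit_reduc_half followed by FN; after the last pass v[0] holds the
     reduction of all n elements.  */
  for (int half = n / 2; half >= 1; half /= 2)
    for (int i = 0; i < half; i++)
      v[i] = v[i] + v[i + half];
  return v[0];
}
#endif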
17319
17320 /* Output code to perform a conditional jump to LABEL, if C2 flag in
17321 FP status register is set. */
17322
17323 void
17324 ix86_emit_fp_unordered_jump (rtx label)
17325 {
17326 rtx reg = gen_reg_rtx (HImode);
17327 rtx_insn *insn;
17328 rtx temp;
17329
17330 emit_insn (gen_x86_fnstsw_1 (reg));
17331
17332 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17333 {
17334 emit_insn (gen_x86_sahf_1 (reg));
17335
17336 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17337 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17338 }
17339 else
17340 {
17341 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17342
17343 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17344 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17345 }
17346
17347 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17348 gen_rtx_LABEL_REF (VOIDmode, label),
17349 pc_rtx);
17350 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17351 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17352 JUMP_LABEL (insn) = label;
17353 }
17354
17355 /* Output code to perform a sinh XFmode calculation. */
17356
17357 void
17358 ix86_emit_i387_sinh (rtx op0, rtx op1)
17359 {
17360 rtx e1 = gen_reg_rtx (XFmode);
17361 rtx e2 = gen_reg_rtx (XFmode);
17362 rtx scratch = gen_reg_rtx (HImode);
17363 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17364 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17365 rtx cst1, tmp;
17366 rtx_code_label *jump_label = gen_label_rtx ();
17367 rtx_insn *insn;
17368
17369 /* scratch = fxam (op1) */
17370 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17371
17372 /* e1 = expm1 (|op1|) */
17373 emit_insn (gen_absxf2 (e2, op1));
17374 emit_insn (gen_expm1xf2 (e1, e2));
17375
17376 /* e2 = e1 / (e1 + 1.0) + e1 */
17377 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17378 emit_insn (gen_addxf3 (e2, e1, cst1));
17379 emit_insn (gen_divxf3 (e2, e1, e2));
17380 emit_insn (gen_addxf3 (e2, e2, e1));
17381
17382 /* flags = signbit (op1) */
17383 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17384
17385 /* if (flags) then e2 = -e2 */
17386 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17387 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17388 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17389 pc_rtx);
17390 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17391 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17392 JUMP_LABEL (insn) = jump_label;
17393
17394 emit_insn (gen_negxf2 (e2, e2));
17395
17396 emit_label (jump_label);
17397 LABEL_NUSES (jump_label) = 1;
17398
17399 /* op0 = 0.5 * e2 */
17400 half = force_reg (XFmode, half);
17401 emit_insn (gen_mulxf3 (op0, e2, half));
17402 }
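/* Editorial sketch, not part of GCC: the identity the expander above emits,
   written as standalone C99; sinh_ref is a hypothetical name.  */
#if 0
#include <math.h>

static double
sinh_ref (double x)
{
  double e1 = expm1 (fabs (x));        /* e^|x| - 1 */
  /* e1 / (e1 + 1) == 1 - e^-|x|, so e2 == e^|x| - e^-|x| == 2 * sinh (|x|).  */
  double e2 = e1 / (e1 + 1.0) + e1;
  return copysign (0.5 * e2, x);       /* restore the sign of x */
}
#endif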
17403
17404 /* Output code to perform a cosh XFmode calculation. */
17405
17406 void
17407 ix86_emit_i387_cosh (rtx op0, rtx op1)
17408 {
17409 rtx e1 = gen_reg_rtx (XFmode);
17410 rtx e2 = gen_reg_rtx (XFmode);
17411 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17412 rtx cst1;
17413
17414 /* e1 = exp (op1) */
17415 emit_insn (gen_expxf2 (e1, op1));
17416
17417 /* e2 = e1 + 1.0 / e1 */
17418 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17419 emit_insn (gen_divxf3 (e2, cst1, e1));
17420 emit_insn (gen_addxf3 (e2, e1, e2));
17421
17422 /* op0 = 0.5 * e2 */
17423 half = force_reg (XFmode, half);
17424 emit_insn (gen_mulxf3 (op0, e2, half));
17425 }
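/* Editorial sketch, not part of GCC: the identity used above,
   cosh (x) = 0.5 * (e^x + 1 / e^x); cosh_ref is a hypothetical name.  */
#if 0
#include <math.h>

static double
cosh_ref (double x)
{
  double e1 = exp (x);
  return 0.5 * (e1 + 1.0 / e1);
}
#endif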
17426
17427 /* Output code to perform a tanh XFmode calculation. */
17428
17429 void
17430 ix86_emit_i387_tanh (rtx op0, rtx op1)
17431 {
17432 rtx e1 = gen_reg_rtx (XFmode);
17433 rtx e2 = gen_reg_rtx (XFmode);
17434 rtx scratch = gen_reg_rtx (HImode);
17435 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17436 rtx cst2, tmp;
17437 rtx_code_label *jump_label = gen_label_rtx ();
17438 rtx_insn *insn;
17439
17440 /* scratch = fxam (op1) */
17441 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17442
17443 /* e1 = expm1 (-|2 * op1|) */
17444 emit_insn (gen_addxf3 (e2, op1, op1));
17445 emit_insn (gen_absxf2 (e2, e2));
17446 emit_insn (gen_negxf2 (e2, e2));
17447 emit_insn (gen_expm1xf2 (e1, e2));
17448
17449 /* e2 = e1 / (e1 + 2.0) */
17450 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
17451 emit_insn (gen_addxf3 (e2, e1, cst2));
17452 emit_insn (gen_divxf3 (e2, e1, e2));
17453
17454 /* flags = signbit (op1) */
17455 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17456
17457 /* if (!flags) then e2 = -e2 */
17458 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17459 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17460 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17461 pc_rtx);
17462 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17463 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17464 JUMP_LABEL (insn) = jump_label;
17465
17466 emit_insn (gen_negxf2 (e2, e2));
17467
17468 emit_label (jump_label);
17469 LABEL_NUSES (jump_label) = 1;
17470
17471 emit_move_insn (op0, e2);
17472 }
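/* Editorial sketch, not part of GCC: the identity the expander above emits;
   tanh_ref is a hypothetical name and signbit stands in for the fxam test.  */
#if 0
#include <math.h>

static double
tanh_ref (double x)
{
  double e1 = expm1 (-2.0 * fabs (x)); /* e^-2|x| - 1 */
  /* e1 / (e1 + 2) == (e^-2|x| - 1) / (e^-2|x| + 1) == -tanh (|x|).  */
  double e2 = e1 / (e1 + 2.0);
  return signbit (x) ? e2 : -e2;       /* restore the sign of x */
}
#endif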
17473
17474 /* Output code to perform an asinh XFmode calculation. */
17475
17476 void
17477 ix86_emit_i387_asinh (rtx op0, rtx op1)
17478 {
17479 rtx e1 = gen_reg_rtx (XFmode);
17480 rtx e2 = gen_reg_rtx (XFmode);
17481 rtx scratch = gen_reg_rtx (HImode);
17482 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17483 rtx cst1, tmp;
17484 rtx_code_label *jump_label = gen_label_rtx ();
17485 rtx_insn *insn;
17486
17487 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17488 emit_insn (gen_mulxf3 (e1, op1, op1));
17489 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17490 emit_insn (gen_addxf3 (e2, e1, cst1));
17491 emit_insn (gen_sqrtxf2 (e2, e2));
17492 emit_insn (gen_addxf3 (e2, e2, cst1));
17493
17494 /* e1 = e1 / e2 */
17495 emit_insn (gen_divxf3 (e1, e1, e2));
17496
17497 /* scratch = fxam (op1) */
17498 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17499
17500 /* e1 = e1 + |op1| */
17501 emit_insn (gen_absxf2 (e2, op1));
17502 emit_insn (gen_addxf3 (e1, e1, e2));
17503
17504 /* e2 = log1p (e1) */
17505 ix86_emit_i387_log1p (e2, e1);
17506
17507 /* flags = signbit (op1) */
17508 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17509
17510 /* if (flags) then e2 = -e2 */
17511 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17512 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17513 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17514 pc_rtx);
17515 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17516 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17517 JUMP_LABEL (insn) = jump_label;
17518
17519 emit_insn (gen_negxf2 (e2, e2));
17520
17521 emit_label (jump_label);
17522 LABEL_NUSES (jump_label) = 1;
17523
17524 emit_move_insn (op0, e2);
17525 }
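/* Editorial sketch, not part of GCC: the identity the expander above emits;
   asinh_ref is a hypothetical name.  */
#if 0
#include <math.h>

static double
asinh_ref (double x)
{
  double e1 = x * x;
  /* e1 / (sqrt (e1 + 1) + 1) == sqrt (x*x + 1) - 1, so 1 + t is
     sqrt (x*x + 1) + |x| and log1p (t) == asinh (|x|).  */
  double t = e1 / (sqrt (e1 + 1.0) + 1.0) + fabs (x);
  return copysign (log1p (t), x);
}
#endif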
17526
17527 /* Output code to perform an acosh XFmode calculation. */
17528
17529 void
17530 ix86_emit_i387_acosh (rtx op0, rtx op1)
17531 {
17532 rtx e1 = gen_reg_rtx (XFmode);
17533 rtx e2 = gen_reg_rtx (XFmode);
17534 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17535
17536 /* e2 = sqrt (op1 + 1.0) */
17537 emit_insn (gen_addxf3 (e2, op1, cst1));
17538 emit_insn (gen_sqrtxf2 (e2, e2));
17539
17540 /* e1 = sqrt (op1 - 1.0) */
17541 emit_insn (gen_subxf3 (e1, op1, cst1));
17542 emit_insn (gen_sqrtxf2 (e1, e1));
17543
17544 /* e1 = e1 * e2 */
17545 emit_insn (gen_mulxf3 (e1, e1, e2));
17546
17547 /* e1 = e1 + op1 */
17548 emit_insn (gen_addxf3 (e1, e1, op1));
17549
17550 /* op0 = log (e1) */
17551 emit_insn (gen_logxf2 (op0, e1));
17552 }
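/* Editorial sketch, not part of GCC: the identity used above,
   acosh (x) = log (x + sqrt (x - 1) * sqrt (x + 1)) for x >= 1;
   sqrt (x - 1) * sqrt (x + 1) equals sqrt (x*x - 1) mathematically, and
   forming the factors separately keeps x - 1 exact for x close to 1.
   acosh_ref is a hypothetical name.  */
#if 0
#include <math.h>

static double
acosh_ref (double x)
{
  return log (x + sqrt (x - 1.0) * sqrt (x + 1.0));
}
#endif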
17553
17554 /* Output code to perform an atanh XFmode calculation. */
17555
17556 void
17557 ix86_emit_i387_atanh (rtx op0, rtx op1)
17558 {
17559 rtx e1 = gen_reg_rtx (XFmode);
17560 rtx e2 = gen_reg_rtx (XFmode);
17561 rtx scratch = gen_reg_rtx (HImode);
17562 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17563 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17564 rtx cst1, tmp;
17565 rtx_code_label *jump_label = gen_label_rtx ();
17566 rtx_insn *insn;
17567
17568 /* scratch = fxam (op1) */
17569 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17570
17571 /* e2 = |op1| */
17572 emit_insn (gen_absxf2 (e2, op1));
17573
17574 /* e1 = -(e2 + e2) / (e2 + 1.0) */
17575 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17576 emit_insn (gen_addxf3 (e1, e2, cst1));
17577 emit_insn (gen_addxf3 (e2, e2, e2));
17578 emit_insn (gen_negxf2 (e2, e2));
17579 emit_insn (gen_divxf3 (e1, e2, e1));
17580
17581 /* e2 = log1p (e1) */
17582 ix86_emit_i387_log1p (e2, e1);
17583
17584 /* flags = signbit (op1) */
17585 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17586
17587 /* if (!flags) then e2 = -e2 */
17588 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17589 gen_rtx_NE (VOIDmode, flags, const0_rtx),
17590 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17591 pc_rtx);
17592 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17593 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17594 JUMP_LABEL (insn) = jump_label;
17595
17596 emit_insn (gen_negxf2 (e2, e2));
17597
17598 emit_label (jump_label);
17599 LABEL_NUSES (jump_label) = 1;
17600
17601 /* op0 = 0.5 * e2 */
17602 half = force_reg (XFmode, half);
17603 emit_insn (gen_mulxf3 (op0, e2, half));
17604 }
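/* Editorial sketch, not part of GCC: the identity the expander above emits;
   atanh_ref is a hypothetical name and signbit stands in for the fxam test.  */
#if 0
#include <math.h>

static double
atanh_ref (double x)
{
  double a = fabs (x);
  double e1 = -(a + a) / (a + 1.0);
  /* 1 + e1 == (1 - |x|) / (1 + |x|), so log1p (e1) == -2 * atanh (|x|).  */
  double e2 = log1p (e1);
  return signbit (x) ? 0.5 * e2 : -0.5 * e2;
}
#endif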
17605
17606 /* Output code to perform a log1p XFmode calculation. */
17607
17608 void
17609 ix86_emit_i387_log1p (rtx op0, rtx op1)
17610 {
17611 rtx_code_label *label1 = gen_label_rtx ();
17612 rtx_code_label *label2 = gen_label_rtx ();
17613
17614 rtx tmp = gen_reg_rtx (XFmode);
17615 rtx res = gen_reg_rtx (XFmode);
17616 rtx cst, cstln2, cst1;
17617 rtx_insn *insn;
17618
17619 /* The emit_jump call emits any pending stack adjust; make sure it is
17620 emitted before the conditional jump, otherwise the stack adjustment
17621 would be only conditional. */
17622 do_pending_stack_adjust ();
17623
17624 cst = const_double_from_real_value
17625 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
17626 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
17627
17628 emit_insn (gen_absxf2 (tmp, op1));
17629
17630 cst = force_reg (XFmode, cst);
17631 ix86_expand_branch (GE, tmp, cst, label1);
17632 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17633 insn = get_last_insn ();
17634 JUMP_LABEL (insn) = label1;
17635
17636 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
17637 emit_jump (label2);
17638
17639 emit_label (label1);
17640 LABEL_NUSES (label1) = 1;
17641
17642 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17643 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
17644 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
17645
17646 emit_label (label2);
17647 LABEL_NUSES (label2) = 1;
17648
17649 emit_move_insn (op0, res);
17650 }
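/* Editorial sketch, not part of GCC: the branch structure of the expansion
   above.  The 0.2928932... threshold is 1 - sqrt(2)/2, the operand limit
   below which the x87 documentation allows fyl2xp1 to be used; larger inputs
   fall back to fyl2x on x + 1.  log1p_ref is a hypothetical name and log2 is
   only an illustrative stand-in for the two x87 instructions.  */
#if 0
#include <math.h>

static double
log1p_ref (double x)
{
  const double cut = 0.29289321881345247561810596348408353;
  const double ln2 = 0.6931471805599453;  /* the fldln2 constant */
  if (fabs (x) < cut)
    return ln2 * log2 (1.0 + x);       /* models fyl2xp1 (x, ln2) */
  return ln2 * log2 (x + 1.0);         /* models fyl2x (x + 1.0, ln2) */
}
#endif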
17651
17652 /* Emit i387 code for a round calculation: OP0 = round (OP1), with halfway cases rounded away from zero. */
17653 void
17654 ix86_emit_i387_round (rtx op0, rtx op1)
17655 {
17656 machine_mode inmode = GET_MODE (op1);
17657 machine_mode outmode = GET_MODE (op0);
17658 rtx e1 = gen_reg_rtx (XFmode);
17659 rtx e2 = gen_reg_rtx (XFmode);
17660 rtx scratch = gen_reg_rtx (HImode);
17661 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
17662 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17663 rtx res = gen_reg_rtx (outmode);
17664 rtx_code_label *jump_label = gen_label_rtx ();
17665 rtx (*floor_insn) (rtx, rtx);
17666 rtx (*neg_insn) (rtx, rtx);
17667 rtx_insn *insn;
17668 rtx tmp;
17669
17670 switch (inmode)
17671 {
17672 case E_SFmode:
17673 case E_DFmode:
17674 tmp = gen_reg_rtx (XFmode);
17675
17676 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
17677 op1 = tmp;
17678 break;
17679 case E_XFmode:
17680 break;
17681 default:
17682 gcc_unreachable ();
17683 }
17684
17685 switch (outmode)
17686 {
17687 case E_SFmode:
17688 floor_insn = gen_frndintxf2_floor;
17689 neg_insn = gen_negsf2;
17690 break;
17691 case E_DFmode:
17692 floor_insn = gen_frndintxf2_floor;
17693 neg_insn = gen_negdf2;
17694 break;
17695 case E_XFmode:
17696 floor_insn = gen_frndintxf2_floor;
17697 neg_insn = gen_negxf2;
17698 break;
17699 case E_HImode:
17700 floor_insn = gen_lfloorxfhi2;
17701 neg_insn = gen_neghi2;
17702 break;
17703 case E_SImode:
17704 floor_insn = gen_lfloorxfsi2;
17705 neg_insn = gen_negsi2;
17706 break;
17707 case E_DImode:
17708 floor_insn = gen_lfloorxfdi2;
17709 neg_insn = gen_negdi2;
17710 break;
17711 default:
17712 gcc_unreachable ();
17713 }
17714
17715 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
17716
17717 /* scratch = fxam(op1) */
17718 emit_insn (gen_fxamxf2_i387 (scratch, op1));
17719
17720 /* e1 = fabs(op1) */
17721 emit_insn (gen_absxf2 (e1, op1));
17722
17723 /* e2 = e1 + 0.5 */
17724 half = force_reg (XFmode, half);
17725 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
17726
17727 /* res = floor(e2) */
17728 switch (outmode)
17729 {
17730 case E_SFmode:
17731 case E_DFmode:
17732 {
17733 tmp = gen_reg_rtx (XFmode);
17734
17735 emit_insn (floor_insn (tmp, e2));
17736 emit_insn (gen_rtx_SET (res,
17737 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
17738 UNSPEC_TRUNC_NOOP)));
17739 }
17740 break;
17741 default:
17742 emit_insn (floor_insn (res, e2));
17743 }
17744
17745 /* flags = signbit(a) */
17746 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
17747
17748 /* if (flags) then res = -res */
17749 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
17750 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
17751 gen_rtx_LABEL_REF (VOIDmode, jump_label),
17752 pc_rtx);
17753 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17754 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17755 JUMP_LABEL (insn) = jump_label;
17756
17757 emit_insn (neg_insn (res, res));
17758
17759 emit_label (jump_label);
17760 LABEL_NUSES (jump_label) = 1;
17761
17762 emit_move_insn (op0, res);
17763 }
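/* Editorial sketch, not part of GCC: the formula from the comment above,
   round (a) = sgn (a) * floor (|a| + 0.5), which sends halfway cases away
   from zero (2.5 -> 3, -2.5 -> -3).  round_ref is a hypothetical name and
   signbit stands in for the fxam test.  */
#if 0
#include <math.h>

static double
round_ref (double a)
{
  double r = floor (fabs (a) + 0.5);
  return signbit (a) ? -r : r;
}
#endif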
17764
17765 /* Output code to perform a Newton-Raphson approximation of a single precision
17766 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
17767
17768 void
17769 ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
17770 {
17771 rtx x0, x1, e0, e1;
17772
17773 x0 = gen_reg_rtx (mode);
17774 e0 = gen_reg_rtx (mode);
17775 e1 = gen_reg_rtx (mode);
17776 x1 = gen_reg_rtx (mode);
17777
17778 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
17779
17780 b = force_reg (mode, b);
17781
17782 /* x0 = rcp(b) estimate */
17783 if (mode == V16SFmode || mode == V8DFmode)
17784 {
17785 if (TARGET_AVX512ER)
17786 {
17787 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17788 UNSPEC_RCP28)));
17789 /* res = a * x0 */
17790 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
17791 return;
17792 }
17793 else
17794 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17795 UNSPEC_RCP14)));
17796 }
17797 else
17798 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17799 UNSPEC_RCP)));
17800
17801 /* e0 = x0 * b */
17802 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
17803
17804 /* e0 = x0 * e0 */
17805 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
17806
17807 /* e1 = x0 + x0 */
17808 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
17809
17810 /* x1 = e1 - e0 */
17811 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
17812
17813 /* res = a * x1 */
17814 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
17815 }
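/* Editorial sketch, not part of GCC: the single Newton-Raphson step the
   sequence above performs on the reciprocal estimate,
   x1 = x0 * (2 - b * x0) rewritten as (x0 + x0) - b * x0 * x0, which roughly
   doubles the number of correct bits.  swdiv_ref is a hypothetical name.  */
#if 0
static float
swdiv_ref (float a, float b, float x0)   /* x0 ~ 1/b, e.g. from rcpps */
{
  float e0 = x0 * b;                     /* b * x0 */
  e0 = x0 * e0;                          /* b * x0 * x0 */
  float e1 = x0 + x0;                    /* 2 * x0 */
  float x1 = e1 - e0;                    /* refined 1/b */
  return a * x1;                         /* a / b, approximately */
}
#endif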
17816
17817 /* Output code to perform a Newton-Raphson approximation of a
17818 single precision floating point [reciprocal] square root. */
17819
17820 void
17821 ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
17822 {
17823 rtx x0, e0, e1, e2, e3, mthree, mhalf;
17824 REAL_VALUE_TYPE r;
17825 int unspec;
17826
17827 x0 = gen_reg_rtx (mode);
17828 e0 = gen_reg_rtx (mode);
17829 e1 = gen_reg_rtx (mode);
17830 e2 = gen_reg_rtx (mode);
17831 e3 = gen_reg_rtx (mode);
17832
17833 if (TARGET_AVX512ER && mode == V16SFmode)
17834 {
17835 if (recip)
17836 /* res = rsqrt28(a) estimate */
17837 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17838 UNSPEC_RSQRT28)));
17839 else
17840 {
17841 /* x0 = rsqrt28(a) estimate */
17842 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17843 UNSPEC_RSQRT28)));
17844 /* res = rcp28(x0) estimate */
17845 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
17846 UNSPEC_RCP28)));
17847 }
17848 return;
17849 }
17850
17851 real_from_integer (&r, VOIDmode, -3, SIGNED);
17852 mthree = const_double_from_real_value (r, SFmode);
17853
17854 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
17855 mhalf = const_double_from_real_value (r, SFmode);
17856 unspec = UNSPEC_RSQRT;
17857
17858 if (VECTOR_MODE_P (mode))
17859 {
17860 mthree = ix86_build_const_vector (mode, true, mthree);
17861 mhalf = ix86_build_const_vector (mode, true, mhalf);
17862 /* There is no 512-bit rsqrt. There is however rsqrt14. */
17863 if (GET_MODE_SIZE (mode) == 64)
17864 unspec = UNSPEC_RSQRT14;
17865 }
17866
17867 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
17868 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
17869
17870 a = force_reg (mode, a);
17871
17872 /* x0 = rsqrt(a) estimate */
17873 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
17874 unspec)));
17875
17876 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
17877 if (!recip)
17878 {
17879 rtx zero = force_reg (mode, CONST0_RTX(mode));
17880 rtx mask;
17881
17882 /* Handle masked compare. */
17883 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
17884 {
17885 mask = gen_reg_rtx (HImode);
17886 /* Imm value 0x4 corresponds to not-equal comparison. */
17887 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
17888 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
17889 }
17890 else
17891 {
17892 mask = gen_reg_rtx (mode);
17893 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
17894 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
17895 }
17896 }
17897
17898 mthree = force_reg (mode, mthree);
17899
17900 /* e0 = x0 * a */
17901 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
17902
17903 unsigned vector_size = GET_MODE_SIZE (mode);
17904 if (TARGET_FMA
17905 || (TARGET_AVX512F && vector_size == 64)
17906 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
17907 emit_insn (gen_rtx_SET (e2,
17908 gen_rtx_FMA (mode, e0, x0, mthree)));
17909 else
17910 {
17911 /* e1 = e0 * x0 */
17912 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
17913
17914 /* e2 = e1 - 3. */
17915 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
17916 }
17917
17918 mhalf = force_reg (mode, mhalf);
17919 if (recip)
17920 /* e3 = -.5 * x0 */
17921 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
17922 else
17923 /* e3 = -.5 * e0 */
17924 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
17925 /* ret = e2 * e3 */
17926 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
17927 }
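/* Editorial sketch, not part of GCC: the Newton-Raphson step from the
   comment above, -0.5 * x0 * (a * x0 * x0 - 3), i.e. the familiar
   x0 * (1.5 - 0.5 * a * x0 * x0) with the sign folded into the constants;
   multiplying by e0 = a * x0 instead of x0 gives sqrt rather than rsqrt.
   swrsqrt_ref is a hypothetical name.  */
#if 0
static float
swrsqrt_ref (float a, float x0, int recip)  /* x0 ~ 1/sqrt(a), from rsqrtps */
{
  float e0 = x0 * a;                        /* a * x0 */
  float e2 = e0 * x0 - 3.0f;                /* a * x0 * x0 - 3 */
  float e3 = (recip ? x0 : e0) * -0.5f;
  return e2 * e3;                           /* rsqrt (a) or sqrt (a) */
}
#endif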
17928
17929 /* Expand fabs (OP0) and return a new rtx that holds the result. The
17930 mask for masking out the sign-bit is stored in *SMASK, if that is
17931 non-null. */
17932
17933 static rtx
17934 ix86_expand_sse_fabs (rtx op0, rtx *smask)
17935 {
17936 machine_mode vmode, mode = GET_MODE (op0);
17937 rtx xa, mask;
17938
17939 xa = gen_reg_rtx (mode);
17940 if (mode == SFmode)
17941 vmode = V4SFmode;
17942 else if (mode == DFmode)
17943 vmode = V2DFmode;
17944 else
17945 vmode = mode;
17946 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
17947 if (!VECTOR_MODE_P (mode))
17948 {
17949 /* We need to generate a scalar mode mask in this case. */
17950 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17951 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17952 mask = gen_reg_rtx (mode);
17953 emit_insn (gen_rtx_SET (mask, tmp));
17954 }
17955 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
17956
17957 if (smask)
17958 *smask = mask;
17959
17960 return xa;
17961 }
17962
17963 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
17964 swapping the operands if SWAP_OPERANDS is true. The expanded
17965 code is a forward jump to a newly created label in case the
17966 comparison is true. The generated label rtx is returned. */
17967 static rtx_code_label *
17968 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
17969 bool swap_operands)
17970 {
17971 bool unordered_compare = ix86_unordered_fp_compare (code);
17972 rtx_code_label *label;
17973 rtx tmp, reg;
17974
17975 if (swap_operands)
17976 std::swap (op0, op1);
17977
17978 label = gen_label_rtx ();
17979 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
17980 if (unordered_compare)
17981 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
17982 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
17983 emit_insn (gen_rtx_SET (reg, tmp));
17984 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
17985 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
17986 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
17987 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17988 JUMP_LABEL (tmp) = label;
17989
17990 return label;
17991 }
17992
17993 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
17994 using comparison code CODE. Operands are swapped for the comparison if
17995 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
17996 static rtx
17997 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
17998 bool swap_operands)
17999 {
18000 rtx (*insn)(rtx, rtx, rtx, rtx);
18001 machine_mode mode = GET_MODE (op0);
18002 rtx mask = gen_reg_rtx (mode);
18003
18004 if (swap_operands)
18005 std::swap (op0, op1);
18006
18007 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
18008
18009 emit_insn (insn (mask, op0, op1,
18010 gen_rtx_fmt_ee (code, mode, op0, op1)));
18011 return mask;
18012 }
18013
18014 /* Expand copysign from SIGN to the positive value ABS_VALUE
18015 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
18016 the sign-bit. */
18017
18018 static void
18019 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
18020 {
18021 machine_mode mode = GET_MODE (sign);
18022 rtx sgn = gen_reg_rtx (mode);
18023 if (mask == NULL_RTX)
18024 {
18025 machine_mode vmode;
18026
18027 if (mode == SFmode)
18028 vmode = V4SFmode;
18029 else if (mode == DFmode)
18030 vmode = V2DFmode;
18031 else
18032 vmode = mode;
18033
18034 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
18035 if (!VECTOR_MODE_P (mode))
18036 {
18037 /* We need to generate a scalar mode mask in this case. */
18038 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
18039 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
18040 mask = gen_reg_rtx (mode);
18041 emit_insn (gen_rtx_SET (mask, tmp));
18042 }
18043 }
18044 else
18045 mask = gen_rtx_NOT (mode, mask);
18046 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
18047 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
18048 }
18049
18050 /* Expand SSE sequence for computing lround from OP1 storing
18051 into OP0. */
18052
18053 void
18054 ix86_expand_lround (rtx op0, rtx op1)
18055 {
18056 /* C code for the stuff we're doing below:
18057 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
18058 return (long)tmp;
18059 */
18060 machine_mode mode = GET_MODE (op1);
18061 const struct real_format *fmt;
18062 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18063 rtx adj;
18064
18065 /* load nextafter (0.5, 0.0) */
18066 fmt = REAL_MODE_FORMAT (mode);
18067 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18068 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18069
18070 /* adj = copysign (0.5, op1) */
18071 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
18072 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
18073
18074 /* adj = op1 + adj */
18075 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
18076
18077 /* op0 = (imode)adj */
18078 expand_fix (op0, adj, 0);
18079 }
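/* Editorial sketch, not part of GCC: the two-line C comment above as real
   code, plus why nextafter (0.5, 0.0) is used instead of 0.5: for
   x = nextafter (0.5, 0.0), x + 0.5 would round up to 1.0 under the default
   round-to-nearest-even and truncate to 1, while x + nextafter (0.5, 0.0)
   is exactly 1 - 2^-53 and truncates to the correct 0.  lround_ref is a
   hypothetical name.  */
#if 0
#include <math.h>

static long
lround_ref (double x)
{
  double adj = copysign (nextafter (0.5, 0.0), x);
  return (long) (x + adj);               /* fix-trunc of the adjusted value */
}
#endif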
18080
18081 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
18082 storing into OPERAND0. */
18083
18084 void
18085 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
18086 {
18087 /* C code for the stuff we're doing below (for do_floor):
18088 xi = (long)op1;
18089 xi -= (double)xi > op1 ? 1 : 0;
18090 return xi;
18091 */
18092 machine_mode fmode = GET_MODE (op1);
18093 machine_mode imode = GET_MODE (op0);
18094 rtx ireg, freg, tmp;
18095 rtx_code_label *label;
18096
18097 /* reg = (long)op1 */
18098 ireg = gen_reg_rtx (imode);
18099 expand_fix (ireg, op1, 0);
18100
18101 /* freg = (double)reg */
18102 freg = gen_reg_rtx (fmode);
18103 expand_float (freg, ireg, 0);
18104
18105 /* ireg = (freg > op1) ? ireg - 1 : ireg */
18106 label = ix86_expand_sse_compare_and_jump (UNLE,
18107 freg, op1, !do_floor);
18108 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
18109 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
18110 emit_move_insn (ireg, tmp);
18111
18112 emit_label (label);
18113 LABEL_NUSES (label) = 1;
18114
18115 emit_move_insn (op0, ireg);
18116 }
18117
18118 /* Generate and return a rtx of mode MODE for 2**n where n is the number
18119 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
18120
18121 static rtx
18122 ix86_gen_TWO52 (machine_mode mode)
18123 {
18124 const struct real_format *fmt;
18125 REAL_VALUE_TYPE TWO52r;
18126 rtx TWO52;
18127
18128 fmt = REAL_MODE_FORMAT (mode);
18129 real_2expN (&TWO52r, fmt->p - 1, mode);
18130 TWO52 = const_double_from_real_value (TWO52r, mode);
18131 TWO52 = force_reg (mode, TWO52);
18132
18133 return TWO52;
18134 }
18135
18136 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
18137
18138 void
18139 ix86_expand_rint (rtx operand0, rtx operand1)
18140 {
18141 /* C code for the stuff we're doing below:
18142 xa = fabs (operand1);
18143 if (!isless (xa, 2**52))
18144 return operand1;
18145 two52 = 2**52;
18146 if (flag_rounding_math)
18147 {
18148 two52 = copysign (two52, operand1);
18149 xa = operand1;
18150 }
18151 xa = xa + two52 - two52;
18152 return copysign (xa, operand1);
18153 */
18154 machine_mode mode = GET_MODE (operand0);
18155 rtx res, xa, TWO52, mask;
18156 rtx_code_label *label;
18157
18158 TWO52 = ix86_gen_TWO52 (mode);
18159
18160 /* Temporary for holding the result, initialized to the input
18161 operand to ease control flow. */
18162 res = copy_to_reg (operand1);
18163
18164 /* xa = abs (operand1) */
18165 xa = ix86_expand_sse_fabs (res, &mask);
18166
18167 /* if (!isless (xa, TWO52)) goto label; */
18168 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18169
18170 if (flag_rounding_math)
18171 {
18172 ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
18173 xa = res;
18174 }
18175
18176 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18177 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18178
18179 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18180 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18181 xa = ix86_expand_sse_fabs (xa, NULL);
18182
18183 ix86_sse_copysign_to_positive (res, xa, res, mask);
18184
18185 emit_label (label);
18186 LABEL_NUSES (label) = 1;
18187
18188 emit_move_insn (operand0, res);
18189 }
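/* Editorial sketch, not part of GCC: the 2^52 trick used above.  Every
   double >= 2^52 is an integer, so for |x| < 2^52 the addition rounds |x|
   to an integer in the current rounding mode and the subtraction recovers
   that integer exactly; larger (or NaN) inputs take the early-return path.
   rint_ref is a hypothetical name for the default-rounding case, without
   the flag_rounding_math refinement.  */
#if 0
#include <math.h>

static double
rint_ref (double x)
{
  const double two52 = 4503599627370496.0;   /* 2^52 */
  double xa = fabs (x);
  if (!(xa < two52))                         /* NaN or already integral */
    return x;
  xa = (xa + two52) - two52;
  return copysign (xa, x);                   /* keep the sign: -0.4 -> -0.0 */
}
#endif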
18190
18191 /* Expand SSE2 sequence for computing floor or ceil
18192 from OPERAND1 storing into OPERAND0. */
18193 void
18194 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
18195 {
18196 /* C code for the stuff we expand below.
18197 double xa = fabs (x), x2;
18198 if (!isless (xa, TWO52))
18199 return x;
18200 x2 = (double)(long)x;
18201
18202 Compensate. Floor:
18203 if (x2 > x)
18204 x2 -= 1;
18205 Compensate. Ceil:
18206 if (x2 < x)
18207 x2 += 1;
18208
18209 if (HONOR_SIGNED_ZEROS (mode))
18210 return copysign (x2, x);
18211 return x2;
18212 */
18213 machine_mode mode = GET_MODE (operand0);
18214 rtx xa, xi, TWO52, tmp, one, res, mask;
18215 rtx_code_label *label;
18216
18217 TWO52 = ix86_gen_TWO52 (mode);
18218
18219 /* Temporary for holding the result, initialized to the input
18220 operand to ease control flow. */
18221 res = copy_to_reg (operand1);
18222
18223 /* xa = abs (operand1) */
18224 xa = ix86_expand_sse_fabs (res, &mask);
18225
18226 /* if (!isless (xa, TWO52)) goto label; */
18227 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18228
18229 /* xa = (double)(long)x */
18230 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18231 expand_fix (xi, res, 0);
18232 expand_float (xa, xi, 0);
18233
18234 /* generate 1.0 */
18235 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18236
18237 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18238 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18239 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18240 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18241 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18242 if (HONOR_SIGNED_ZEROS (mode))
18243 {
18244 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18245 if (do_floor && flag_rounding_math)
18246 tmp = ix86_expand_sse_fabs (tmp, NULL);
18247
18248 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18249 }
18250 emit_move_insn (res, tmp);
18251
18252 emit_label (label);
18253 LABEL_NUSES (label) = 1;
18254
18255 emit_move_insn (operand0, res);
18256 }
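/* Editorial sketch, not part of GCC: the compensation step above, shown for
   the floor case.  Conversion to an integer truncates toward zero, which is
   already floor () for x >= 0; for negative non-integral x the truncated
   value is one too large and the masked 1.0 is subtracted, e.g. x = -1.25
   truncates to -1.0 > x, so the result becomes -2.0.  floor_ref is a
   hypothetical name, valid under the |x| < 2^52 guard established above.  */
#if 0
static double
floor_ref (double x)
{
  double x2 = (double) (long long) x;  /* truncate toward zero */
  if (x2 > x)                          /* only possible for x < 0 */
    x2 -= 1.0;
  return x2;
}
#endif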
18257
18258 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18259 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18260 that is only available on 64bit targets. */
18261 void
18262 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
18263 {
18264 /* C code for the stuff we expand below.
18265 double xa = fabs (x), x2;
18266 if (!isless (xa, TWO52))
18267 return x;
18268 xa = xa + TWO52 - TWO52;
18269 x2 = copysign (xa, x);
18270
18271 Compensate. Floor:
18272 if (x2 > x)
18273 x2 -= 1;
18274 Compensate. Ceil:
18275 if (x2 < x)
18276 x2 += 1;
18277
18278 if (HONOR_SIGNED_ZEROS (mode))
18279 x2 = copysign (x2, x);
18280 return x2;
18281 */
18282 machine_mode mode = GET_MODE (operand0);
18283 rtx xa, TWO52, tmp, one, res, mask;
18284 rtx_code_label *label;
18285
18286 TWO52 = ix86_gen_TWO52 (mode);
18287
18288 /* Temporary for holding the result, initialized to the input
18289 operand to ease control flow. */
18290 res = copy_to_reg (operand1);
18291
18292 /* xa = abs (operand1) */
18293 xa = ix86_expand_sse_fabs (res, &mask);
18294
18295 /* if (!isless (xa, TWO52)) goto label; */
18296 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18297
18298 /* xa = xa + TWO52 - TWO52; */
18299 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18300 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
18301
18302 /* xa = copysign (xa, operand1) */
18303 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18304
18305 /* generate 1.0 */
18306 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18307
18308 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18309 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
18310 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18311 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
18312 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18313 if (HONOR_SIGNED_ZEROS (mode))
18314 {
18315 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18316 if (do_floor && flag_rounding_math)
18317 tmp = ix86_expand_sse_fabs (tmp, NULL);
18318
18319 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
18320 }
18321 emit_move_insn (res, tmp);
18322
18323 emit_label (label);
18324 LABEL_NUSES (label) = 1;
18325
18326 emit_move_insn (operand0, res);
18327 }
18328
18329 /* Expand SSE sequence for computing trunc
18330 from OPERAND1 storing into OPERAND0. */
18331 void
18332 ix86_expand_trunc (rtx operand0, rtx operand1)
18333 {
18334 /* C code for SSE variant we expand below.
18335 double xa = fabs (x), x2;
18336 if (!isless (xa, TWO52))
18337 return x;
18338 x2 = (double)(long)x;
18339 if (HONOR_SIGNED_ZEROS (mode))
18340 return copysign (x2, x);
18341 return x2;
18342 */
18343 machine_mode mode = GET_MODE (operand0);
18344 rtx xa, xi, TWO52, res, mask;
18345 rtx_code_label *label;
18346
18347 TWO52 = ix86_gen_TWO52 (mode);
18348
18349 /* Temporary for holding the result, initialized to the input
18350 operand to ease control flow. */
18351 res = copy_to_reg (operand1);
18352
18353 /* xa = abs (operand1) */
18354 xa = ix86_expand_sse_fabs (res, &mask);
18355
18356 /* if (!isless (xa, TWO52)) goto label; */
18357 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18358
18359 /* xa = (double)(long)x */
18360 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18361 expand_fix (xi, res, 0);
18362 expand_float (xa, xi, 0);
18363
18364 if (HONOR_SIGNED_ZEROS (mode))
18365 ix86_sse_copysign_to_positive (xa, xa, res, mask);
18366
18367 emit_move_insn (res, xa);
18368
18369 emit_label (label);
18370 LABEL_NUSES (label) = 1;
18371
18372 emit_move_insn (operand0, res);
18373 }
18374
18375 /* Expand SSE sequence for computing trunc from OPERAND1 storing
18376 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18377 that is only available on 64bit targets. */
18378 void
18379 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18380 {
18381 machine_mode mode = GET_MODE (operand0);
18382 rtx xa, xa2, TWO52, tmp, one, res, mask;
18383 rtx_code_label *label;
18384
18385 /* C code for SSE variant we expand below.
18386 double xa = fabs (x), x2;
18387 if (!isless (xa, TWO52))
18388 return x;
18389 xa2 = xa + TWO52 - TWO52;
18390 Compensate:
18391 if (xa2 > xa)
18392 xa2 -= 1.0;
18393 x2 = copysign (xa2, x);
18394 return x2;
18395 */
18396
18397 TWO52 = ix86_gen_TWO52 (mode);
18398
18399 /* Temporary for holding the result, initialized to the input
18400 operand to ease control flow. */
18401 res = copy_to_reg (operand1);
18402
18403 /* xa = abs (operand1) */
18404 xa = ix86_expand_sse_fabs (res, &mask);
18405
18406 /* if (!isless (xa, TWO52)) goto label; */
18407 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18408
18409 /* xa2 = xa + TWO52 - TWO52; */
18410 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18411 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18412
18413 /* generate 1.0 */
18414 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18415
18416 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18417 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18418 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18419 tmp = expand_simple_binop (mode, MINUS,
18420 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18421 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18422 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18423 tmp = ix86_expand_sse_fabs (tmp, NULL);
18424
18425 /* res = copysign (xa2, operand1) */
18426 ix86_sse_copysign_to_positive (res, tmp, res, mask);
18427
18428 emit_label (label);
18429 LABEL_NUSES (label) = 1;
18430
18431 emit_move_insn (operand0, res);
18432 }
18433
18434 /* Expand SSE sequence for computing round
18435 from OPERAND1 storing into OPERAND0. */
18436 void
18437 ix86_expand_round (rtx operand0, rtx operand1)
18438 {
18439 /* C code for the stuff we're doing below:
18440 double xa = fabs (x);
18441 if (!isless (xa, TWO52))
18442 return x;
18443 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18444 return copysign (xa, x);
18445 */
18446 machine_mode mode = GET_MODE (operand0);
18447 rtx res, TWO52, xa, xi, half, mask;
18448 rtx_code_label *label;
18449 const struct real_format *fmt;
18450 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18451
18452 /* Temporary for holding the result, initialized to the input
18453 operand to ease control flow. */
18454 res = copy_to_reg (operand1);
18455
18456 TWO52 = ix86_gen_TWO52 (mode);
18457 xa = ix86_expand_sse_fabs (res, &mask);
18458 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18459
18460 /* load nextafter (0.5, 0.0) */
18461 fmt = REAL_MODE_FORMAT (mode);
18462 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18463 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18464
18465 /* xa = xa + 0.5 */
18466 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
18467 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
18468
18469 /* xa = (double)(int64_t)xa */
18470 xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
18471 expand_fix (xi, xa, 0);
18472 expand_float (xa, xi, 0);
18473
18474 /* res = copysign (xa, operand1) */
18475 ix86_sse_copysign_to_positive (res, xa, res, mask);
18476
18477 emit_label (label);
18478 LABEL_NUSES (label) = 1;
18479
18480 emit_move_insn (operand0, res);
18481 }
18482
18483 /* Expand SSE sequence for computing round from OPERAND1 storing
18484 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18485 that is only available on 64bit targets. */
18486 void
18487 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
18488 {
18489 /* C code for the stuff we expand below.
18490 double xa = fabs (x), xa2, x2;
18491 if (!isless (xa, TWO52))
18492 return x;
18493 Using the absolute value and copying back sign makes
18494 -0.0 -> -0.0 correct.
18495 xa2 = xa + TWO52 - TWO52;
18496 Compensate.
18497 dxa = xa2 - xa;
18498 if (dxa <= -0.5)
18499 xa2 += 1;
18500 else if (dxa > 0.5)
18501 xa2 -= 1;
18502 x2 = copysign (xa2, x);
18503 return x2;
18504 */
18505 machine_mode mode = GET_MODE (operand0);
18506 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
18507 rtx_code_label *label;
18508
18509 TWO52 = ix86_gen_TWO52 (mode);
18510
18511 /* Temporary for holding the result, initialized to the input
18512 operand to ease control flow. */
18513 res = copy_to_reg (operand1);
18514
18515 /* xa = abs (operand1) */
18516 xa = ix86_expand_sse_fabs (res, &mask);
18517
18518 /* if (!isless (xa, TWO52)) goto label; */
18519 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18520
18521 /* xa2 = xa + TWO52 - TWO52; */
18522 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18523 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18524
18525 /* dxa = xa2 - xa; */
18526 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
18527
18528 /* generate 0.5, 1.0 and -0.5 */
18529 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
18530 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
18531 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
18532 0, OPTAB_DIRECT);
18533
18534 /* Compensate. */
18535 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18536 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
18537 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18538 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18539 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18540 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
18541 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
18542 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18543
18544 /* res = copysign (xa2, operand1) */
18545 ix86_sse_copysign_to_positive (res, xa2, res, mask);
18546
18547 emit_label (label);
18548 LABEL_NUSES (label) = 1;
18549
18550 emit_move_insn (operand0, res);
18551 }
18552
18553 /* Expand SSE sequence for computing round
18554 from OP1 storing into OP0 using sse4 round insn. */
18555 void
18556 ix86_expand_round_sse4 (rtx op0, rtx op1)
18557 {
18558 machine_mode mode = GET_MODE (op0);
18559 rtx e1, e2, res, half;
18560 const struct real_format *fmt;
18561 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
18562 rtx (*gen_copysign) (rtx, rtx, rtx);
18563 rtx (*gen_round) (rtx, rtx, rtx);
18564
18565 switch (mode)
18566 {
18567 case E_SFmode:
18568 gen_copysign = gen_copysignsf3;
18569 gen_round = gen_sse4_1_roundsf2;
18570 break;
18571 case E_DFmode:
18572 gen_copysign = gen_copysigndf3;
18573 gen_round = gen_sse4_1_rounddf2;
18574 break;
18575 default:
18576 gcc_unreachable ();
18577 }
18578
18579 /* round (a) = trunc (a + copysign (0.5, a)) */
18580
18581 /* load nextafter (0.5, 0.0) */
18582 fmt = REAL_MODE_FORMAT (mode);
18583 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
18584 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
18585 half = const_double_from_real_value (pred_half, mode);
18586
18587 /* e1 = copysign (0.5, op1) */
18588 e1 = gen_reg_rtx (mode);
18589 emit_insn (gen_copysign (e1, half, op1));
18590
18591 /* e2 = op1 + e1 */
18592 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
18593
18594 /* res = trunc (e2) */
18595 res = gen_reg_rtx (mode);
18596 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
18597
18598 emit_move_insn (op0, res);
18599 }
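/* Editorial sketch, not part of GCC: the branch-free formula above,
   round (a) = trunc (a + copysign (nextafter (0.5, 0.0), a)), where trunc
   models the single SSE4.1 round insn emitted with ROUND_TRUNC.
   round_sse4_ref is a hypothetical name.  */
#if 0
#include <math.h>

static double
round_sse4_ref (double a)
{
  double e1 = copysign (nextafter (0.5, 0.0), a);
  return trunc (a + e1);               /* models roundsd with ROUND_TRUNC */
}
#endif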
18600
18601 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18602 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18603 insn every time. */
18604
18605 static GTY(()) rtx_insn *vselect_insn;
18606
18607 /* Initialize vselect_insn. */
18608
18609 static void
18610 init_vselect_insn (void)
18611 {
18612 unsigned i;
18613 rtx x;
18614
18615 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
18616 for (i = 0; i < MAX_VECT_LEN; ++i)
18617 XVECEXP (x, 0, i) = const0_rtx;
18618 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
18619 const0_rtx), x);
18620 x = gen_rtx_SET (const0_rtx, x);
18621 start_sequence ();
18622 vselect_insn = emit_insn (x);
18623 end_sequence ();
18624 }
18625
18626 /* Construct (set target (vec_select op0 (parallel perm))) and
18627 return true if that's a valid instruction in the active ISA. */
18628
18629 static bool
18630 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
18631 unsigned nelt, bool testing_p)
18632 {
18633 unsigned int i;
18634 rtx x, save_vconcat;
18635 int icode;
18636
18637 if (vselect_insn == NULL_RTX)
18638 init_vselect_insn ();
18639
18640 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
18641 PUT_NUM_ELEM (XVEC (x, 0), nelt);
18642 for (i = 0; i < nelt; ++i)
18643 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
18644 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18645 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
18646 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
18647 SET_DEST (PATTERN (vselect_insn)) = target;
18648 icode = recog_memoized (vselect_insn);
18649
18650 if (icode >= 0 && !testing_p)
18651 emit_insn (copy_rtx (PATTERN (vselect_insn)));
18652
18653 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
18654 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
18655 INSN_CODE (vselect_insn) = -1;
18656
18657 return icode >= 0;
18658 }
18659
18660 /* Similar, but generate a vec_concat from op0 and op1 as well. */
18661
18662 static bool
18663 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
18664 const unsigned char *perm, unsigned nelt,
18665 bool testing_p)
18666 {
18667 machine_mode v2mode;
18668 rtx x;
18669 bool ok;
18670
18671 if (vselect_insn == NULL_RTX)
18672 init_vselect_insn ();
18673
18674 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
18675 return false;
18676 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
18677 PUT_MODE (x, v2mode);
18678 XEXP (x, 0) = op0;
18679 XEXP (x, 1) = op1;
18680 ok = expand_vselect (target, x, perm, nelt, testing_p);
18681 XEXP (x, 0) = const0_rtx;
18682 XEXP (x, 1) = const0_rtx;
18683 return ok;
18684 }
18685
18686 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18687 using movss or movsd. */
18688 static bool
18689 expand_vec_perm_movs (struct expand_vec_perm_d *d)
18690 {
18691 machine_mode vmode = d->vmode;
18692 unsigned i, nelt = d->nelt;
18693 rtx x;
18694
18695 if (d->one_operand_p)
18696 return false;
18697
18698 if (!(TARGET_SSE && vmode == V4SFmode)
18699 && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
18700 && !(TARGET_SSE2 && vmode == V2DFmode))
18701 return false;
18702
18703 /* Only the first element is changed. */
18704 if (d->perm[0] != nelt && d->perm[0] != 0)
18705 return false;
18706 for (i = 1; i < nelt; ++i)
18707 if (d->perm[i] != i + nelt - d->perm[0])
18708 return false;
18709
18710 if (d->testing_p)
18711 return true;
18712
18713 if (d->perm[0] == nelt)
18714 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
18715 else
18716 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
18717
18718 emit_insn (gen_rtx_SET (d->target, x));
18719
18720 return true;
18721 }
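
/* Worked example (for illustration): for V4SFmode with
   d->perm = { 4, 1, 2, 3 }, only element 0 changes and it comes from
   op1, so the checks above pass with d->perm[0] == nelt and we emit
   (set target (vec_merge:V4SF op1 op0 (const_int 1))), i.e. a movss
   that takes element 0 from op1 and elements 1..3 from op0.  */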
18722
18723 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18724 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
18725
18726 static bool
18727 expand_vec_perm_blend (struct expand_vec_perm_d *d)
18728 {
18729 machine_mode mmode, vmode = d->vmode;
18730 unsigned i, nelt = d->nelt;
18731 unsigned HOST_WIDE_INT mask;
18732 rtx target, op0, op1, maskop, x;
18733 rtx rperm[32], vperm;
18734
18735 if (d->one_operand_p)
18736 return false;
18737 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
18738 && (TARGET_AVX512BW
18739 || GET_MODE_UNIT_SIZE (vmode) >= 4))
18740 ;
18741 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
18742 ;
18743 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
18744 ;
18745 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
18746 || GET_MODE_SIZE (vmode) == 8
18747 || GET_MODE_SIZE (vmode) == 4))
18748 ;
18749 else
18750 return false;
18751
18752 /* This is a blend, not a permute. Elements must stay in their
18753 respective lanes. */
18754 for (i = 0; i < nelt; ++i)
18755 {
18756 unsigned e = d->perm[i];
18757 if (!(e == i || e == i + nelt))
18758 return false;
18759 }
18760
18761 if (d->testing_p)
18762 return true;
18763
18764 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
18765 decision should be extracted elsewhere, so that we only try that
18766 sequence once all budget==3 options have been tried. */
18767 target = d->target;
18768 op0 = d->op0;
18769 op1 = d->op1;
18770 mask = 0;
18771
18772 switch (vmode)
18773 {
18774 case E_V8DFmode:
18775 case E_V16SFmode:
18776 case E_V4DFmode:
18777 case E_V8SFmode:
18778 case E_V2DFmode:
18779 case E_V4SFmode:
18780 case E_V4HImode:
18781 case E_V8HImode:
18782 case E_V8SImode:
18783 case E_V32HImode:
18784 case E_V64QImode:
18785 case E_V16SImode:
18786 case E_V8DImode:
18787 for (i = 0; i < nelt; ++i)
18788 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
18789 break;
18790
18791 case E_V2DImode:
18792 for (i = 0; i < 2; ++i)
18793 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
18794 vmode = V8HImode;
18795 goto do_subreg;
18796
18797 case E_V2SImode:
18798 for (i = 0; i < 2; ++i)
18799 mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
18800 vmode = V4HImode;
18801 goto do_subreg;
18802
18803 case E_V4SImode:
18804 for (i = 0; i < 4; ++i)
18805 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
18806 vmode = V8HImode;
18807 goto do_subreg;
18808
18809 case E_V16QImode:
18810 /* See if bytes move in pairs so we can use pblendw with
18811 an immediate argument, rather than pblendvb with a vector
18812 argument. */
18813 for (i = 0; i < 16; i += 2)
18814 if (d->perm[i] + 1 != d->perm[i + 1])
18815 {
18816 use_pblendvb:
18817 for (i = 0; i < nelt; ++i)
18818 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
18819
18820 finish_pblendvb:
18821 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18822 vperm = force_reg (vmode, vperm);
18823
18824 if (GET_MODE_SIZE (vmode) == 4)
18825 emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
18826 else if (GET_MODE_SIZE (vmode) == 8)
18827 emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
18828 else if (GET_MODE_SIZE (vmode) == 16)
18829 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
18830 else
18831 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
18832 if (target != d->target)
18833 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18834 return true;
18835 }
18836
18837 for (i = 0; i < 8; ++i)
18838 mask |= (d->perm[i * 2] >= 16) << i;
18839 vmode = V8HImode;
18840 /* FALLTHRU */
18841
18842 do_subreg:
18843 target = gen_reg_rtx (vmode);
18844 op0 = gen_lowpart (vmode, op0);
18845 op1 = gen_lowpart (vmode, op1);
18846 break;
18847
18848 case E_V8QImode:
18849 for (i = 0; i < 8; i += 2)
18850 if (d->perm[i] + 1 != d->perm[i + 1])
18851 goto use_pblendvb;
18852
18853 for (i = 0; i < 4; ++i)
18854 mask |= (d->perm[i * 2] >= 8) << i;
18855 vmode = V4HImode;
18856 goto do_subreg;
18857
18858 case E_V4QImode:
18859 for (i = 0; i < 4; i += 2)
18860 if (d->perm[i] + 1 != d->perm[i + 1])
18861 goto use_pblendvb;
18862
18863 for (i = 0; i < 2; ++i)
18864 mask |= (d->perm[i * 2] >= 4) << i;
18865 vmode = V2HImode;
18866 goto do_subreg;
18867
18868 case E_V32QImode:
18869 /* See if bytes move in pairs. If not, vpblendvb must be used. */
18870 for (i = 0; i < 32; i += 2)
18871 if (d->perm[i] + 1 != d->perm[i + 1])
18872 goto use_pblendvb;
18873 /* See if bytes move in quadruplets. If yes, vpblendd
18874 with immediate can be used. */
18875 for (i = 0; i < 32; i += 4)
18876 if (d->perm[i] + 2 != d->perm[i + 2])
18877 break;
18878 if (i < 32)
18879 {
18880 /* See if bytes move the same in both lanes. If yes,
18881 vpblendw with immediate can be used. */
18882 for (i = 0; i < 16; i += 2)
18883 if (d->perm[i] + 16 != d->perm[i + 16])
18884 goto use_pblendvb;
18885
18886 /* Use vpblendw. */
18887 for (i = 0; i < 16; ++i)
18888 mask |= (d->perm[i * 2] >= 32) << i;
18889 vmode = V16HImode;
18890 goto do_subreg;
18891 }
18892
18893 /* Use vpblendd. */
18894 for (i = 0; i < 8; ++i)
18895 mask |= (d->perm[i * 4] >= 32) << i;
18896 vmode = V8SImode;
18897 goto do_subreg;
18898
18899 case E_V16HImode:
18900 /* See if words move in pairs. If yes, vpblendd can be used. */
18901 for (i = 0; i < 16; i += 2)
18902 if (d->perm[i] + 1 != d->perm[i + 1])
18903 break;
18904 if (i < 16)
18905 {
18906 /* See if words move the same in both lanes. If not,
18907 vpblendvb must be used. */
18908 for (i = 0; i < 8; i++)
18909 if (d->perm[i] + 8 != d->perm[i + 8])
18910 {
18911 /* Use vpblendvb. */
18912 for (i = 0; i < 32; ++i)
18913 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
18914
18915 vmode = V32QImode;
18916 nelt = 32;
18917 target = gen_reg_rtx (vmode);
18918 op0 = gen_lowpart (vmode, op0);
18919 op1 = gen_lowpart (vmode, op1);
18920 goto finish_pblendvb;
18921 }
18922
18923 /* Use vpblendw. */
18924 for (i = 0; i < 16; ++i)
18925 mask |= (d->perm[i] >= 16) << i;
18926 break;
18927 }
18928
18929 /* Use vpblendd. */
18930 for (i = 0; i < 8; ++i)
18931 mask |= (d->perm[i * 2] >= 16) << i;
18932 vmode = V8SImode;
18933 goto do_subreg;
18934
18935 case E_V4DImode:
18936 /* Use vpblendd. */
18937 for (i = 0; i < 4; ++i)
18938 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
18939 vmode = V8SImode;
18940 goto do_subreg;
18941
18942 default:
18943 gcc_unreachable ();
18944 }
18945
18946 switch (vmode)
18947 {
18948 case E_V8DFmode:
18949 case E_V8DImode:
18950 mmode = QImode;
18951 break;
18952 case E_V16SFmode:
18953 case E_V16SImode:
18954 mmode = HImode;
18955 break;
18956 case E_V32HImode:
18957 mmode = SImode;
18958 break;
18959 case E_V64QImode:
18960 mmode = DImode;
18961 break;
18962 default:
18963 mmode = VOIDmode;
18964 }
18965
18966 if (mmode != VOIDmode)
18967 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
18968 else
18969 maskop = GEN_INT (mask);
18970
18971 /* This matches five different patterns, depending on the mode. */
18972 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
18973 x = gen_rtx_SET (target, x);
18974 emit_insn (x);
18975 if (target != d->target)
18976 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18977
18978 return true;
18979 }
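
/* Worked example (for illustration): for V8SFmode with
   d->perm = { 0, 9, 2, 11, 4, 13, 6, 15 }, elements 1, 3, 5 and 7 come
   from op1, so the first switch computes mask = 0xaa and we emit
   (set target (vec_merge:V8SF op1 op0 (const_int 0xaa))).  For
   V2DImode with perm = { 0, 3 } the mask is widened to nibbles
   (0xf0) and the blend is done as a V8HImode pblendw instead.  */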
18980
18981 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18982 in terms of the variable form of vpermilps.
18983
18984 Note that we will have already failed the immediate input vpermilps,
18985 which requires that the high and low part shuffle be identical; the
18986 variable form doesn't require that. */
18987
18988 static bool
18989 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
18990 {
18991 rtx rperm[8], vperm;
18992 unsigned i;
18993
18994 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
18995 return false;
18996
18997 /* We can only permute within the 128-bit lane. */
18998 for (i = 0; i < 8; ++i)
18999 {
19000 unsigned e = d->perm[i];
19001 if (i < 4 ? e >= 4 : e < 4)
19002 return false;
19003 }
19004
19005 if (d->testing_p)
19006 return true;
19007
19008 for (i = 0; i < 8; ++i)
19009 {
19010 unsigned e = d->perm[i];
19011
19012 /* Within each 128-bit lane, the elements of op0 are numbered
19013 from 0 and the elements of op1 are numbered from 4. */
19014 if (e >= 8 + 4)
19015 e -= 8;
19016 else if (e >= 4)
19017 e -= 4;
19018
19019 rperm[i] = GEN_INT (e);
19020 }
19021
19022 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
19023 vperm = force_reg (V8SImode, vperm);
19024 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
19025
19026 return true;
19027 }
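
/* Worked example (for illustration): for a one-operand V8SFmode
   permutation { 3, 2, 1, 0, 7, 6, 5, 4 } every element stays in its
   128-bit lane, so the loop above renumbers both lanes to
   { 3, 2, 1, 0, 3, 2, 1, 0 } and a single variable vpermilps with that
   V8SImode control vector is emitted.  */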
19028
19029 /* For V*[QHS]Imode permutations, check whether the same permutation
19030 can instead be performed in a 2x, 4x or 8x wider inner mode. */
19031
19032 static bool
19033 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
19034 struct expand_vec_perm_d *nd)
19035 {
19036 int i;
19037 machine_mode mode = VOIDmode;
19038
19039 switch (d->vmode)
19040 {
19041 case E_V8QImode: mode = V4HImode; break;
19042 case E_V16QImode: mode = V8HImode; break;
19043 case E_V32QImode: mode = V16HImode; break;
19044 case E_V64QImode: mode = V32HImode; break;
19045 case E_V4HImode: mode = V2SImode; break;
19046 case E_V8HImode: mode = V4SImode; break;
19047 case E_V16HImode: mode = V8SImode; break;
19048 case E_V32HImode: mode = V16SImode; break;
19049 case E_V4SImode: mode = V2DImode; break;
19050 case E_V8SImode: mode = V4DImode; break;
19051 case E_V16SImode: mode = V8DImode; break;
19052 default: return false;
19053 }
19054 for (i = 0; i < d->nelt; i += 2)
19055 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
19056 return false;
19057 nd->vmode = mode;
19058 nd->nelt = d->nelt / 2;
19059 for (i = 0; i < nd->nelt; i++)
19060 nd->perm[i] = d->perm[2 * i] / 2;
19061 if (GET_MODE_INNER (mode) != DImode)
19062 canonicalize_vector_int_perm (nd, nd);
19063 if (nd != d)
19064 {
19065 nd->one_operand_p = d->one_operand_p;
19066 nd->testing_p = d->testing_p;
19067 if (d->op0 == d->op1)
19068 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
19069 else
19070 {
19071 nd->op0 = gen_lowpart (nd->vmode, d->op0);
19072 nd->op1 = gen_lowpart (nd->vmode, d->op1);
19073 }
19074 if (d->testing_p)
19075 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
19076 else
19077 nd->target = gen_reg_rtx (nd->vmode);
19078 }
19079 return true;
19080 }
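
/* Worked example (for illustration): a V16QImode permutation
   { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 } moves bytes
   in aligned pairs, so it is rewritten as the V8HImode permutation
   { 1, 0, 3, 2, 5, 4, 7, 6 }; the recursive attempt to widen further
   to V4SImode fails because the halfwords within each word are
   swapped.  */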
19081
19082 /* Return true if the permutation D can be performed as a VMODE permutation
19083 instead. */
19084
19085 static bool
19086 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
19087 {
19088 unsigned int i, j, chunk;
19089
19090 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
19091 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
19092 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
19093 return false;
19094
19095 if (GET_MODE_NUNITS (vmode) >= d->nelt)
19096 return true;
19097
19098 chunk = d->nelt / GET_MODE_NUNITS (vmode);
19099 for (i = 0; i < d->nelt; i += chunk)
19100 if (d->perm[i] & (chunk - 1))
19101 return false;
19102 else
19103 for (j = 1; j < chunk; ++j)
19104 if (d->perm[i] + j != d->perm[i + j])
19105 return false;
19106
19107 return true;
19108 }
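
/* Worked example (for illustration): for a V32QImode permutation
   checked against V4DImode, chunk is 8, so the permutation is valid in
   V4DImode only if every group of 8 byte indices starts at a multiple
   of 8 and runs consecutively, e.g. { 8, 9, ..., 15, 0, 1, ..., 7, ... };
   a permutation starting with { 1, 2, ... } is rejected because
   d->perm[0] & 7 is nonzero.  */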
19109
19110 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19111 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
19112
19113 static bool
19114 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
19115 {
19116 unsigned i, nelt, eltsz, mask;
19117 unsigned char perm[64];
19118 machine_mode vmode;
19119 struct expand_vec_perm_d nd;
19120 rtx rperm[64], vperm, target, op0, op1;
19121
19122 nelt = d->nelt;
19123
19124 if (!d->one_operand_p)
19125 switch (GET_MODE_SIZE (d->vmode))
19126 {
19127 case 4:
19128 if (!TARGET_XOP)
19129 return false;
19130 vmode = V4QImode;
19131 break;
19132
19133 case 8:
19134 if (!TARGET_XOP)
19135 return false;
19136 vmode = V8QImode;
19137 break;
19138
19139 case 16:
19140 if (!TARGET_XOP)
19141 return false;
19142 vmode = V16QImode;
19143 break;
19144
19145 case 32:
19146 if (!TARGET_AVX2)
19147 return false;
19148
19149 if (valid_perm_using_mode_p (V2TImode, d))
19150 {
19151 if (d->testing_p)
19152 return true;
19153
19154 /* Use vperm2i128 insn. The pattern uses
19155 V4DImode instead of V2TImode. */
19156 target = d->target;
19157 if (d->vmode != V4DImode)
19158 target = gen_reg_rtx (V4DImode);
19159 op0 = gen_lowpart (V4DImode, d->op0);
19160 op1 = gen_lowpart (V4DImode, d->op1);
19161 rperm[0]
19162 = GEN_INT ((d->perm[0] / (nelt / 2))
19163 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
19164 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
19165 if (target != d->target)
19166 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19167 return true;
19168 }
19169 /* FALLTHRU */
19170
19171 default:
19172 return false;
19173 }
19174 else
19175 switch (GET_MODE_SIZE (d->vmode))
19176 {
19177 case 4:
19178 if (!TARGET_SSSE3)
19179 return false;
19180 vmode = V4QImode;
19181 break;
19182
19183 case 8:
19184 if (!TARGET_SSSE3)
19185 return false;
19186 vmode = V8QImode;
19187 break;
19188
19189 case 16:
19190 if (!TARGET_SSSE3)
19191 return false;
19192 vmode = V16QImode;
19193 break;
19194
19195 case 32:
19196 if (!TARGET_AVX2)
19197 return false;
19198
19199 /* V4DImode should be already handled through
19200 expand_vselect by vpermq instruction. */
19201 gcc_assert (d->vmode != V4DImode);
19202
19203 vmode = V32QImode;
19204 if (d->vmode == V8SImode
19205 || d->vmode == V16HImode
19206 || d->vmode == V32QImode)
19207 {
19208 /* First see if vpermq can be used for
19209 V8SImode/V16HImode/V32QImode. */
19210 if (valid_perm_using_mode_p (V4DImode, d))
19211 {
19212 for (i = 0; i < 4; i++)
19213 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
19214 if (d->testing_p)
19215 return true;
19216 target = gen_reg_rtx (V4DImode);
19217 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
19218 perm, 4, false))
19219 {
19220 emit_move_insn (d->target,
19221 gen_lowpart (d->vmode, target));
19222 return true;
19223 }
19224 return false;
19225 }
19226
19227 /* Next see if vpermd can be used. */
19228 if (valid_perm_using_mode_p (V8SImode, d))
19229 vmode = V8SImode;
19230 }
19231 /* Or if vpermps can be used. */
19232 else if (d->vmode == V8SFmode)
19233 vmode = V8SImode;
19234
19235 if (vmode == V32QImode)
19236 {
19237 /* vpshufb only works intra-lane; it is not
19238 possible to shuffle bytes in between the lanes. */
19239 for (i = 0; i < nelt; ++i)
19240 if ((d->perm[i] ^ i) & (nelt / 2))
19241 return false;
19242 }
19243 break;
19244
19245 case 64:
19246 if (!TARGET_AVX512BW)
19247 return false;
19248
19249 /* If vpermq didn't work, vpshufb won't work either. */
19250 if (d->vmode == V8DFmode || d->vmode == V8DImode)
19251 return false;
19252
19253 vmode = V64QImode;
19254 if (d->vmode == V16SImode
19255 || d->vmode == V32HImode
19256 || d->vmode == V64QImode)
19257 {
19258 /* First see if vpermq can be used for
19259 V16SImode/V32HImode/V64QImode. */
19260 if (valid_perm_using_mode_p (V8DImode, d))
19261 {
19262 for (i = 0; i < 8; i++)
19263 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
19264 if (d->testing_p)
19265 return true;
19266 target = gen_reg_rtx (V8DImode);
19267 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
19268 perm, 8, false))
19269 {
19270 emit_move_insn (d->target,
19271 gen_lowpart (d->vmode, target));
19272 return true;
19273 }
19274 return false;
19275 }
19276
19277 /* Next see if vpermd can be used. */
19278 if (valid_perm_using_mode_p (V16SImode, d))
19279 vmode = V16SImode;
19280 }
19281 /* Or if vpermps can be used. */
19282 else if (d->vmode == V16SFmode)
19283 vmode = V16SImode;
19284
19285 if (vmode == V64QImode)
19286 {
19287 /* vpshufb only works intra-lane; it is not
19288 possible to shuffle bytes in between the lanes. */
19289 for (i = 0; i < nelt; ++i)
19290 if ((d->perm[i] ^ i) & (3 * nelt / 4))
19291 return false;
19292 }
19293 break;
19294
19295 default:
19296 return false;
19297 }
19298
19299 if (d->testing_p)
19300 return true;
19301
19302 /* Try to avoid variable permutation instruction. */
19303 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19304 {
19305 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19306 return true;
19307 }
19308
19309 if (vmode == V8SImode)
19310 for (i = 0; i < 8; ++i)
19311 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
19312 else if (vmode == V16SImode)
19313 for (i = 0; i < 16; ++i)
19314 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
19315 else
19316 {
19317 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
19318 if (!d->one_operand_p)
19319 mask = 2 * nelt - 1;
19320 else if (vmode == V64QImode)
19321 mask = nelt / 4 - 1;
19322 else if (vmode == V32QImode)
19323 mask = nelt / 2 - 1;
19324 else
19325 mask = nelt - 1;
19326
19327 for (i = 0; i < nelt; ++i)
19328 {
19329 unsigned j, e = d->perm[i] & mask;
19330 for (j = 0; j < eltsz; ++j)
19331 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
19332 }
19333 }
19334
19335 machine_mode vpmode = vmode;
19336
19337 nelt = GET_MODE_SIZE (vmode);
19338
19339 /* Emulate narrow modes with V16QI instructions. */
19340 if (nelt < 16)
19341 {
19342 rtx m128 = GEN_INT (-128);
19343
19344 /* Remap elements from the second operand, as we have to
19345 account for inactive top elements from the first operand. */
19346 if (!d->one_operand_p)
19347 {
19348 for (i = 0; i < nelt; ++i)
19349 {
19350 unsigned ival = UINTVAL (rperm[i]);
19351 if (ival >= nelt)
19352 rperm[i] = GEN_INT (ival + 16 - nelt);
19353 }
19354 }
19355
19356 /* Fill inactive elements in the top positions with zeros. */
19357 for (i = nelt; i < 16; ++i)
19358 rperm[i] = m128;
19359
19360 vpmode = V16QImode;
19361 }
19362
19363 vperm = gen_rtx_CONST_VECTOR (vpmode,
19364 gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
19365 vperm = force_reg (vpmode, vperm);
19366
19367 if (vmode == d->vmode)
19368 target = d->target;
19369 else
19370 target = gen_reg_rtx (vmode);
19371
19372 op0 = gen_lowpart (vmode, d->op0);
19373
19374 if (d->one_operand_p)
19375 {
19376 rtx (*gen) (rtx, rtx, rtx);
19377
19378 if (vmode == V4QImode)
19379 gen = gen_mmx_pshufbv4qi3;
19380 else if (vmode == V8QImode)
19381 gen = gen_mmx_pshufbv8qi3;
19382 else if (vmode == V16QImode)
19383 gen = gen_ssse3_pshufbv16qi3;
19384 else if (vmode == V32QImode)
19385 gen = gen_avx2_pshufbv32qi3;
19386 else if (vmode == V64QImode)
19387 gen = gen_avx512bw_pshufbv64qi3;
19388 else if (vmode == V8SFmode)
19389 gen = gen_avx2_permvarv8sf;
19390 else if (vmode == V8SImode)
19391 gen = gen_avx2_permvarv8si;
19392 else if (vmode == V16SFmode)
19393 gen = gen_avx512f_permvarv16sf;
19394 else if (vmode == V16SImode)
19395 gen = gen_avx512f_permvarv16si;
19396 else
19397 gcc_unreachable ();
19398
19399 emit_insn (gen (target, op0, vperm));
19400 }
19401 else
19402 {
19403 rtx (*gen) (rtx, rtx, rtx, rtx);
19404
19405 op1 = gen_lowpart (vmode, d->op1);
19406
19407 if (vmode == V4QImode)
19408 gen = gen_mmx_ppermv32;
19409 else if (vmode == V8QImode)
19410 gen = gen_mmx_ppermv64;
19411 else if (vmode == V16QImode)
19412 gen = gen_xop_pperm;
19413 else
19414 gcc_unreachable ();
19415
19416 emit_insn (gen (target, op0, op1, vperm));
19417 }
19418
19419 if (target != d->target)
19420 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19421
19422 return true;
19423 }
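
/* Worked example (for illustration) of the byte-selector construction
   above: a one-operand V8HImode permutation { 1, 0, 3, 2, 5, 4, 7, 6 }
   has eltsz = 2 and mask = 7, so the pshufb control bytes become
   { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 } and a
   single pshufb on the V16QImode view of op0 performs the
   permutation.  */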
19424
19425 /* Try to expand one-operand permutation with constant mask. */
19426
19427 static bool
19428 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19429 {
19430 machine_mode mode = GET_MODE (d->op0);
19431 machine_mode maskmode = mode;
19432 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
19433 rtx (*gen) (rtx, rtx, rtx) = NULL;
19434 rtx target, op0, mask;
19435 rtx vec[64];
19436
19437 if (!rtx_equal_p (d->op0, d->op1))
19438 return false;
19439
19440 if (!TARGET_AVX512F)
19441 return false;
19442
19443 /* Accept VNxHImode and VNxQImode now. */
19444 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19445 return false;
19446
19447 /* vpermw. */
19448 if (!TARGET_AVX512BW && inner_size == 2)
19449 return false;
19450
19451 /* vpermb. */
19452 if (!TARGET_AVX512VBMI && inner_size == 1)
19453 return false;
19454
19455 switch (mode)
19456 {
19457 case E_V16SImode:
19458 gen = gen_avx512f_permvarv16si;
19459 break;
19460 case E_V16SFmode:
19461 gen = gen_avx512f_permvarv16sf;
19462 maskmode = V16SImode;
19463 break;
19464 case E_V8DImode:
19465 gen = gen_avx512f_permvarv8di;
19466 break;
19467 case E_V8DFmode:
19468 gen = gen_avx512f_permvarv8df;
19469 maskmode = V8DImode;
19470 break;
19471 case E_V32HImode:
19472 gen = gen_avx512bw_permvarv32hi;
19473 break;
19474 case E_V16HImode:
19475 gen = gen_avx512vl_permvarv16hi;
19476 break;
19477 case E_V8HImode:
19478 gen = gen_avx512vl_permvarv8hi;
19479 break;
19480 case E_V64QImode:
19481 gen = gen_avx512bw_permvarv64qi;
19482 break;
19483 case E_V32QImode:
19484 gen = gen_avx512vl_permvarv32qi;
19485 break;
19486 case E_V16QImode:
19487 gen = gen_avx512vl_permvarv16qi;
19488 break;
19489
19490 default:
19491 return false;
19492 }
19493
19494 if (d->testing_p)
19495 return true;
19496
19497 target = d->target;
19498 op0 = d->op0;
19499 for (int i = 0; i < d->nelt; ++i)
19500 vec[i] = GEN_INT (d->perm[i]);
19501 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19502 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19503 return true;
19504 }
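
/* Illustrative example (not from the upstream comments): reversing a
   V16SFmode vector with perm = { 15, 14, ..., 1, 0 } uses maskmode
   V16SImode; the constant selector is forced into a register and a
   single variable-permute insn (avx512f_permvarv16sf) is emitted.  */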
19505
19506 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19507
19508 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
19509 in a single instruction. */
19510
19511 static bool
19512 expand_vec_perm_1 (struct expand_vec_perm_d *d)
19513 {
19514 unsigned i, nelt = d->nelt;
19515 struct expand_vec_perm_d nd;
19516
19517 /* Check plain VEC_SELECT first, because AVX has instructions that could
19518 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19519 input where SEL+CONCAT may not. */
19520 if (d->one_operand_p)
19521 {
19522 int mask = nelt - 1;
19523 bool identity_perm = true;
19524 bool broadcast_perm = true;
19525
19526 for (i = 0; i < nelt; i++)
19527 {
19528 nd.perm[i] = d->perm[i] & mask;
19529 if (nd.perm[i] != i)
19530 identity_perm = false;
19531 if (nd.perm[i])
19532 broadcast_perm = false;
19533 }
19534
19535 if (identity_perm)
19536 {
19537 if (!d->testing_p)
19538 emit_move_insn (d->target, d->op0);
19539 return true;
19540 }
19541 else if (broadcast_perm && TARGET_AVX2)
19542 {
19543 /* Use vpbroadcast{b,w,d}. */
19544 rtx (*gen) (rtx, rtx) = NULL;
19545 switch (d->vmode)
19546 {
19547 case E_V64QImode:
19548 if (TARGET_AVX512BW)
19549 gen = gen_avx512bw_vec_dupv64qi_1;
19550 break;
19551 case E_V32QImode:
19552 gen = gen_avx2_pbroadcastv32qi_1;
19553 break;
19554 case E_V32HImode:
19555 if (TARGET_AVX512BW)
19556 gen = gen_avx512bw_vec_dupv32hi_1;
19557 break;
19558 case E_V16HImode:
19559 gen = gen_avx2_pbroadcastv16hi_1;
19560 break;
19561 case E_V16SImode:
19562 if (TARGET_AVX512F)
19563 gen = gen_avx512f_vec_dupv16si_1;
19564 break;
19565 case E_V8SImode:
19566 gen = gen_avx2_pbroadcastv8si_1;
19567 break;
19568 case E_V16QImode:
19569 gen = gen_avx2_pbroadcastv16qi;
19570 break;
19571 case E_V8HImode:
19572 gen = gen_avx2_pbroadcastv8hi;
19573 break;
19574 case E_V16SFmode:
19575 if (TARGET_AVX512F)
19576 gen = gen_avx512f_vec_dupv16sf_1;
19577 break;
19578 case E_V8SFmode:
19579 gen = gen_avx2_vec_dupv8sf_1;
19580 break;
19581 case E_V8DFmode:
19582 if (TARGET_AVX512F)
19583 gen = gen_avx512f_vec_dupv8df_1;
19584 break;
19585 case E_V8DImode:
19586 if (TARGET_AVX512F)
19587 gen = gen_avx512f_vec_dupv8di_1;
19588 break;
19589 /* For other modes prefer other shuffles this function creates. */
19590 default: break;
19591 }
19592 if (gen != NULL)
19593 {
19594 if (!d->testing_p)
19595 emit_insn (gen (d->target, d->op0));
19596 return true;
19597 }
19598 }
19599
19600 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
19601 return true;
19602
19603 /* There are plenty of patterns in sse.md that are written for
19604 SEL+CONCAT and are not replicated for a single op. Perhaps
19605 that should be changed, to avoid the nastiness here. */
19606
19607 /* Recognize interleave style patterns, which means incrementing
19608 every other permutation operand. */
19609 for (i = 0; i < nelt; i += 2)
19610 {
19611 nd.perm[i] = d->perm[i] & mask;
19612 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
19613 }
19614 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19615 d->testing_p))
19616 return true;
19617
19618 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
19619 if (nelt >= 4)
19620 {
19621 for (i = 0; i < nelt; i += 4)
19622 {
19623 nd.perm[i + 0] = d->perm[i + 0] & mask;
19624 nd.perm[i + 1] = d->perm[i + 1] & mask;
19625 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
19626 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
19627 }
19628
19629 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
19630 d->testing_p))
19631 return true;
19632 }
19633 }
19634
19635 /* Try movss/movsd instructions. */
19636 if (expand_vec_perm_movs (d))
19637 return true;
19638
19639 /* Finally, try the fully general two operand permute. */
19640 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
19641 d->testing_p))
19642 return true;
19643
19644 /* Recognize interleave style patterns with reversed operands. */
19645 if (!d->one_operand_p)
19646 {
19647 for (i = 0; i < nelt; ++i)
19648 {
19649 unsigned e = d->perm[i];
19650 if (e >= nelt)
19651 e -= nelt;
19652 else
19653 e += nelt;
19654 nd.perm[i] = e;
19655 }
19656
19657 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
19658 d->testing_p))
19659 return true;
19660 }
19661
19662 /* Try the SSE4.1 blend variable merge instructions. */
19663 if (expand_vec_perm_blend (d))
19664 return true;
19665
19666 /* Try one of the AVX vpermil variable permutations. */
19667 if (expand_vec_perm_vpermil (d))
19668 return true;
19669
19670 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
19671 vpshufb, vpermd, vpermps or vpermq variable permutation. */
19672 if (expand_vec_perm_pshufb (d))
19673 return true;
19674
19675 /* Try the AVX2 vpalignr instruction. */
19676 if (expand_vec_perm_palignr (d, true))
19677 return true;
19678
19679 /* Try the AVX512F vperm{w,b,s,d} instructions. */
19680 if (ix86_expand_vec_one_operand_perm_avx512 (d))
19681 return true;
19682
19683 /* Try the AVX512F vpermt2/vpermi2 instructions. */
19684 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
19685 return true;
19686
19687 /* See if we can get the same permutation in a different vector integer
19688 mode. */
19689 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19690 {
19691 if (!d->testing_p)
19692 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19693 return true;
19694 }
19695 return false;
19696 }
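
/* Worked example (for illustration) of the shufps recognition above:
   a one-operand V4SFmode permutation { 1, 3, 0, 2 } typically has no
   plain VEC_SELECT pattern in sse.md, but after rewriting it as
   { 1, 3, 4, 6 } over (vec_concat op0 op0) it matches the two-operand
   shufps pattern, selecting the low half from the first copy and the
   high half from the second.  */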
19697
19698 /* Canonicalize the vec_perm index so that the first index
19699 always comes from the first vector. */
19700 static void
19701 ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
19702 {
19703 unsigned nelt = d->nelt;
19704 if (d->perm[0] < nelt)
19705 return;
19706
19707 for (unsigned i = 0; i != nelt; i++)
19708 d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
19709
19710 std::swap (d->op0, d->op1);
19711 return;
19712 }
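
/* Worked example (for illustration): with nelt = 4 and
   d->perm = { 5, 1, 6, 2 }, the first index comes from the second
   vector, so each index is rotated by nelt modulo 2 * nelt, giving
   { 1, 5, 2, 6 }, and op0/op1 are swapped; the permutation selects
   the same elements afterwards.  */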
19713
19714 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19715 in terms of a shufps + shufps/pshufd instruction pair. */
19716 static bool
19717 expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
19718 {
19719 unsigned char perm1[4];
19720 machine_mode vmode = d->vmode;
19721 bool ok;
19722 unsigned i, j, k, count = 0;
19723
19724 if (d->one_operand_p
19725 || (vmode != V4SImode && vmode != V4SFmode))
19726 return false;
19727
19728 if (d->testing_p)
19729 return true;
19730
19731 ix86_vec_perm_index_canon (d);
19732 for (i = 0; i < 4; ++i)
19733 count += d->perm[i] > 3 ? 1 : 0;
19734
19735 gcc_assert (count & 3);
19736
19737 rtx tmp = gen_reg_rtx (vmode);
19738 /* 2 from op0 and 2 from op1. */
19739 if (count == 2)
19740 {
19741 unsigned char perm2[4];
19742 for (i = 0, j = 0, k = 2; i < 4; ++i)
19743 if (d->perm[i] & 4)
19744 {
19745 perm1[k++] = d->perm[i];
19746 perm2[i] = k - 1;
19747 }
19748 else
19749 {
19750 perm1[j++] = d->perm[i];
19751 perm2[i] = j - 1;
19752 }
19753
19754 /* shufps. */
19755 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
19756 perm1, d->nelt, false);
19757 gcc_assert (ok);
19758 if (vmode == V4SImode && TARGET_SSE2)
19759 /* pshufd. */
19760 ok = expand_vselect (d->target, tmp,
19761 perm2, d->nelt, false);
19762 else
19763 {
19764 /* shufps. */
19765 perm2[2] += 4;
19766 perm2[3] += 4;
19767 ok = expand_vselect_vconcat (d->target, tmp, tmp,
19768 perm2, d->nelt, false);
19769 }
19770 gcc_assert (ok);
19771 }
19772 /* 3 from one op and 1 from another. */
19773 else
19774 {
19775 unsigned pair_idx = 8, lone_idx = 8, shift;
19776
19777 /* Find the lone index. */
19778 for (i = 0; i < 4; ++i)
19779 if ((d->perm[i] > 3 && count == 1)
19780 || (d->perm[i] < 4 && count == 3))
19781 lone_idx = i;
19782
19783 /* When lone_idx is not 0, it must come from the second op (count == 1). */
19784 gcc_assert (count == (lone_idx ? 1 : 3));
19785
19786 /* Find the pair index that sits in the same half as the lone index. */
19787 shift = lone_idx & 2;
19788 pair_idx = 1 - lone_idx + 2 * shift;
19789
19790 /* First permute the lone index and the pair index into the same vector as
19791 [ lone, lone, pair, pair ]. */
19792 perm1[1] = perm1[0]
19793 = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
19794 perm1[3] = perm1[2]
19795 = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
19796
19797 /* Always put the vector that contains the lone index first. */
19798 if (count == 1)
19799 std::swap (d->op0, d->op1);
19800
19801 /* shufps. */
19802 ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
19803 perm1, d->nelt, false);
19804 gcc_assert (ok);
19805
19806 /* Put the lone and pair indices back in their original order. */
19807 perm1[shift] = lone_idx << 1;
19808 perm1[shift + 1] = pair_idx << 1;
19809
19810 /* Select the remaining 2 elements from the other vector. */
19811 for (i = 2 - shift; i < 4 - shift; ++i)
19812 perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
19813
19814 /* Adjust to original selector. */
19815 if (lone_idx > 1)
19816 std::swap (tmp, d->op1);
19817
19818 /* shufps. */
19819 ok = expand_vselect_vconcat (d->target, tmp, d->op1,
19820 perm1, d->nelt, false);
19821
19822 gcc_assert (ok);
19823 }
19824
19825 return true;
19826 }
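
/* Worked example (for illustration) of the count == 2 branch above:
   for V4SFmode with d->perm = { 2, 6, 3, 5 }, perm1 becomes
   { 2, 3, 6, 5 } and perm2 becomes { 0, 2, 1, 3 }; the first shufps
   builds tmp = { op0[2], op0[3], op1[2], op1[1] } and the second
   shufps (perm2 adjusted to { 0, 2, 5, 7 }) reorders tmp into the
   requested { op0[2], op1[2], op0[3], op1[1] }.  */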
19827
19828 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19829 in terms of a pair of pshuflw + pshufhw instructions. */
19830
19831 static bool
19832 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
19833 {
19834 unsigned char perm2[MAX_VECT_LEN];
19835 unsigned i;
19836 bool ok;
19837
19838 if (d->vmode != V8HImode || !d->one_operand_p)
19839 return false;
19840
19841 /* The two permutations only operate in 64-bit lanes. */
19842 for (i = 0; i < 4; ++i)
19843 if (d->perm[i] >= 4)
19844 return false;
19845 for (i = 4; i < 8; ++i)
19846 if (d->perm[i] < 4)
19847 return false;
19848
19849 if (d->testing_p)
19850 return true;
19851
19852 /* Emit the pshuflw. */
19853 memcpy (perm2, d->perm, 4);
19854 for (i = 4; i < 8; ++i)
19855 perm2[i] = i;
19856 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
19857 gcc_assert (ok);
19858
19859 /* Emit the pshufhw. */
19860 memcpy (perm2 + 4, d->perm + 4, 4);
19861 for (i = 0; i < 4; ++i)
19862 perm2[i] = i;
19863 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
19864 gcc_assert (ok);
19865
19866 return true;
19867 }
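
/* Worked example (for illustration): for the one-operand V8HImode
   permutation { 3, 2, 1, 0, 7, 6, 5, 4 }, the pshuflw step uses
   { 3, 2, 1, 0, 4, 5, 6, 7 } and the pshufhw step then applies
   { 0, 1, 2, 3, 7, 6, 5, 4 } to the intermediate result, giving the
   requested word order in two insns.  */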
19868
19869 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
19870 the permutation using the SSSE3 palignr instruction. This succeeds
19871 when all of the elements in PERM fit within one vector and we merely
19872 need to shift them down so that a single vector permutation has a
19873 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
19874 the vpalignr instruction itself can perform the requested permutation. */
19875
19876 static bool
19877 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
19878 {
19879 unsigned i, nelt = d->nelt;
19880 unsigned min, max, minswap, maxswap;
19881 bool in_order, ok, swap = false;
19882 rtx shift, target;
19883 struct expand_vec_perm_d dcopy;
19884
19885 /* Even with AVX, palignr only operates on 128-bit vectors;
19886 with AVX2, palignr operates on both 128-bit lanes. */
19887 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
19888 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
19889 return false;
19890
19891 min = 2 * nelt;
19892 max = 0;
19893 minswap = 2 * nelt;
19894 maxswap = 0;
19895 for (i = 0; i < nelt; ++i)
19896 {
19897 unsigned e = d->perm[i];
19898 unsigned eswap = d->perm[i] ^ nelt;
19899 if (GET_MODE_SIZE (d->vmode) == 32)
19900 {
19901 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
19902 eswap = e ^ (nelt / 2);
19903 }
19904 if (e < min)
19905 min = e;
19906 if (e > max)
19907 max = e;
19908 if (eswap < minswap)
19909 minswap = eswap;
19910 if (eswap > maxswap)
19911 maxswap = eswap;
19912 }
19913 if (min == 0
19914 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
19915 {
19916 if (d->one_operand_p
19917 || minswap == 0
19918 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
19919 ? nelt / 2 : nelt))
19920 return false;
19921 swap = true;
19922 min = minswap;
19923 max = maxswap;
19924 }
19925
19926 /* Given that we have SSSE3, we know we'll be able to implement the
19927 single operand permutation after the palignr with pshufb for
19928 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
19929 first. */
19930 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
19931 return true;
19932
19933 dcopy = *d;
19934 if (swap)
19935 {
19936 dcopy.op0 = d->op1;
19937 dcopy.op1 = d->op0;
19938 for (i = 0; i < nelt; ++i)
19939 dcopy.perm[i] ^= nelt;
19940 }
19941
19942 in_order = true;
19943 for (i = 0; i < nelt; ++i)
19944 {
19945 unsigned e = dcopy.perm[i];
19946 if (GET_MODE_SIZE (d->vmode) == 32
19947 && e >= nelt
19948 && (e & (nelt / 2 - 1)) < min)
19949 e = e - min - (nelt / 2);
19950 else
19951 e = e - min;
19952 if (e != i)
19953 in_order = false;
19954 dcopy.perm[i] = e;
19955 }
19956 dcopy.one_operand_p = true;
19957
19958 if (single_insn_only_p && !in_order)
19959 return false;
19960
19961 /* For AVX2, test whether we can permute the result in one instruction. */
19962 if (d->testing_p)
19963 {
19964 if (in_order)
19965 return true;
19966 dcopy.op1 = dcopy.op0;
19967 return expand_vec_perm_1 (&dcopy);
19968 }
19969
19970 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
19971 if (GET_MODE_SIZE (d->vmode) == 16)
19972 {
19973 target = gen_reg_rtx (V1TImode);
19974 emit_insn (gen_ssse3_palignrv1ti (target,
19975 gen_lowpart (V1TImode, dcopy.op1),
19976 gen_lowpart (V1TImode, dcopy.op0),
19977 shift));
19978 }
19979 else
19980 {
19981 target = gen_reg_rtx (V2TImode);
19982 emit_insn (gen_avx2_palignrv2ti (target,
19983 gen_lowpart (V2TImode, dcopy.op1),
19984 gen_lowpart (V2TImode, dcopy.op0),
19985 shift));
19986 }
19987
19988 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
19989
19990 /* Test for the degenerate case where the alignment by itself
19991 produces the desired permutation. */
19992 if (in_order)
19993 {
19994 emit_move_insn (d->target, dcopy.op0);
19995 return true;
19996 }
19997
19998 ok = expand_vec_perm_1 (&dcopy);
19999 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
20000
20001 return ok;
20002 }
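
/* Worked example (for illustration): for a two-operand V16QImode
   permutation { 3, 4, ..., 18 } (a 3-byte rotation across op0 and
   op1), min is 3 and every remapped index ends up in order, so a
   single palignr with a 24-bit (3-byte) shift produces the result and
   no follow-up permutation is needed.  */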
20003
20004 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20005 the permutation using the SSE4_1 pblendv instruction. Potentially
20006 reduces the permutation from 2 pshufb plus an or to 1 pshufb plus a pblendv. */
20007
20008 static bool
20009 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
20010 {
20011 unsigned i, which, nelt = d->nelt;
20012 struct expand_vec_perm_d dcopy, dcopy1;
20013 machine_mode vmode = d->vmode;
20014 bool ok;
20015
20016 /* Use the same checks as in expand_vec_perm_blend. */
20017 if (d->one_operand_p)
20018 return false;
20019 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20020 ;
20021 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20022 ;
20023 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
20024 || GET_MODE_SIZE (vmode) == 8
20025 || GET_MODE_SIZE (vmode) == 16))
20026 ;
20027 else
20028 return false;
20029
20030 /* Figure out which permutation elements do not stay in their
20031 respective lanes. */
20032 for (i = 0, which = 0; i < nelt; ++i)
20033 {
20034 unsigned e = d->perm[i];
20035 if (e != i)
20036 which |= (e < nelt ? 1 : 2);
20037 }
20038 /* We can pblend the part whose elements do not stay in their
20039 respective lanes only when these elements all come from the
20040 same half of the permutation (i.e. from the same operand).
20041 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
20042 lanes, but both are >= 8.
20043 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
20044 respective lanes, and 8 is >= 8 but 2 is not. */
20045 if (which != 1 && which != 2)
20046 return false;
20047 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
20048 return true;
20049
20050 /* First we apply a one-operand permutation to the part whose
20051 elements do not stay in their respective lanes. */
20052 dcopy = *d;
20053 if (which == 2)
20054 dcopy.op0 = dcopy.op1 = d->op1;
20055 else
20056 dcopy.op0 = dcopy.op1 = d->op0;
20057 if (!d->testing_p)
20058 dcopy.target = gen_reg_rtx (vmode);
20059 dcopy.one_operand_p = true;
20060
20061 for (i = 0; i < nelt; ++i)
20062 dcopy.perm[i] = d->perm[i] & (nelt - 1);
20063
20064 ok = expand_vec_perm_1 (&dcopy);
20065 if (GET_MODE_SIZE (vmode) != 16 && !ok)
20066 return false;
20067 else
20068 gcc_assert (ok);
20069 if (d->testing_p)
20070 return true;
20071
20072 /* Next we put permuted elements into their positions. */
20073 dcopy1 = *d;
20074 if (which == 2)
20075 dcopy1.op1 = dcopy.target;
20076 else
20077 dcopy1.op0 = dcopy.target;
20078
20079 for (i = 0; i < nelt; ++i)
20080 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
20081
20082 ok = expand_vec_perm_blend (&dcopy1);
20083 gcc_assert (ok);
20084
20085 return true;
20086 }
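
/* Worked example (for illustration): for the { 0 1 8 3 4 5 9 7 }
   case mentioned above (nelt == 8), which == 2, so the displaced
   elements are first gathered from op1 with the one-operand
   permutation { 0, 1, 0, 3, 4, 5, 1, 7 } into a temporary, and the
   temporary is then blended with op0 using the blend permutation
   { 0, 1, 10, 3, 4, 5, 14, 7 }.  */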
20087
20088 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
20089
20090 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20091 a two vector permutation into a single vector permutation by using
20092 an interleave operation to merge the vectors. */
20093
20094 static bool
20095 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
20096 {
20097 struct expand_vec_perm_d dremap, dfinal;
20098 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
20099 unsigned HOST_WIDE_INT contents;
20100 unsigned char remap[2 * MAX_VECT_LEN];
20101 rtx_insn *seq;
20102 bool ok, same_halves = false;
20103
20104 if (GET_MODE_SIZE (d->vmode) == 4
20105 || GET_MODE_SIZE (d->vmode) == 8
20106 || GET_MODE_SIZE (d->vmode) == 16)
20107 {
20108 if (d->one_operand_p)
20109 return false;
20110 }
20111 else if (GET_MODE_SIZE (d->vmode) == 32)
20112 {
20113 if (!TARGET_AVX)
20114 return false;
20115 /* For 32-byte modes allow even d->one_operand_p.
20116 The lack of cross-lane shuffling in some instructions
20117 might prevent a single insn shuffle. */
20118 dfinal = *d;
20119 dfinal.testing_p = true;
20120 /* If expand_vec_perm_interleave3 can expand this into
20121 a 3-insn sequence, give up and let it be expanded that
20122 way. While that is one insn longer, it doesn't need a
20123 memory operand, and in the common case where both the
20124 interleave-low and interleave-high permutations with the
20125 same operands are adjacent, the pair needs 4 insns in
20126 total after CSE. */
20127 if (expand_vec_perm_interleave3 (&dfinal))
20128 return false;
20129 }
20130 else
20131 return false;
20132
20133 /* Examine from whence the elements come. */
20134 contents = 0;
20135 for (i = 0; i < nelt; ++i)
20136 contents |= HOST_WIDE_INT_1U << d->perm[i];
20137
20138 memset (remap, 0xff, sizeof (remap));
20139 dremap = *d;
20140
20141 if (GET_MODE_SIZE (d->vmode) == 4
20142 || GET_MODE_SIZE (d->vmode) == 8)
20143 {
20144 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20145
20146 /* Split the two input vectors into 4 halves. */
20147 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20148 h2 = h1 << nelt2;
20149 h3 = h2 << nelt2;
20150 h4 = h3 << nelt2;
20151
20152 /* If all elements come from the low halves, use interleave low;
20153 similarly for interleave high. */
20154 if ((contents & (h1 | h3)) == contents)
20155 {
20156 /* punpckl* */
20157 for (i = 0; i < nelt2; ++i)
20158 {
20159 remap[i] = i * 2;
20160 remap[i + nelt] = i * 2 + 1;
20161 dremap.perm[i * 2] = i;
20162 dremap.perm[i * 2 + 1] = i + nelt;
20163 }
20164 }
20165 else if ((contents & (h2 | h4)) == contents)
20166 {
20167 /* punpckh* */
20168 for (i = 0; i < nelt2; ++i)
20169 {
20170 remap[i + nelt2] = i * 2;
20171 remap[i + nelt + nelt2] = i * 2 + 1;
20172 dremap.perm[i * 2] = i + nelt2;
20173 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20174 }
20175 }
20176 else
20177 return false;
20178 }
20179 else if (GET_MODE_SIZE (d->vmode) == 16)
20180 {
20181 unsigned HOST_WIDE_INT h1, h2, h3, h4;
20182
20183 /* Split the two input vectors into 4 halves. */
20184 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
20185 h2 = h1 << nelt2;
20186 h3 = h2 << nelt2;
20187 h4 = h3 << nelt2;
20188
20189 /* If all elements come from the low halves, use interleave low; similarly
20190 for interleave high. If the elements are from mis-matched halves, we
20191 can use shufps for V4SF/V4SI or do a DImode shuffle. */
20192 if ((contents & (h1 | h3)) == contents)
20193 {
20194 /* punpckl* */
20195 for (i = 0; i < nelt2; ++i)
20196 {
20197 remap[i] = i * 2;
20198 remap[i + nelt] = i * 2 + 1;
20199 dremap.perm[i * 2] = i;
20200 dremap.perm[i * 2 + 1] = i + nelt;
20201 }
20202 if (!TARGET_SSE2 && d->vmode == V4SImode)
20203 dremap.vmode = V4SFmode;
20204 }
20205 else if ((contents & (h2 | h4)) == contents)
20206 {
20207 /* punpckh* */
20208 for (i = 0; i < nelt2; ++i)
20209 {
20210 remap[i + nelt2] = i * 2;
20211 remap[i + nelt + nelt2] = i * 2 + 1;
20212 dremap.perm[i * 2] = i + nelt2;
20213 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
20214 }
20215 if (!TARGET_SSE2 && d->vmode == V4SImode)
20216 dremap.vmode = V4SFmode;
20217 }
20218 else if ((contents & (h1 | h4)) == contents)
20219 {
20220 /* shufps */
20221 for (i = 0; i < nelt2; ++i)
20222 {
20223 remap[i] = i;
20224 remap[i + nelt + nelt2] = i + nelt2;
20225 dremap.perm[i] = i;
20226 dremap.perm[i + nelt2] = i + nelt + nelt2;
20227 }
20228 if (nelt != 4)
20229 {
20230 /* shufpd */
20231 dremap.vmode = V2DImode;
20232 dremap.nelt = 2;
20233 dremap.perm[0] = 0;
20234 dremap.perm[1] = 3;
20235 }
20236 }
20237 else if ((contents & (h2 | h3)) == contents)
20238 {
20239 /* shufps */
20240 for (i = 0; i < nelt2; ++i)
20241 {
20242 remap[i + nelt2] = i;
20243 remap[i + nelt] = i + nelt2;
20244 dremap.perm[i] = i + nelt2;
20245 dremap.perm[i + nelt2] = i + nelt;
20246 }
20247 if (nelt != 4)
20248 {
20249 /* shufpd */
20250 dremap.vmode = V2DImode;
20251 dremap.nelt = 2;
20252 dremap.perm[0] = 1;
20253 dremap.perm[1] = 2;
20254 }
20255 }
20256 else
20257 return false;
20258 }
20259 else
20260 {
20261 unsigned int nelt4 = nelt / 4, nzcnt = 0;
20262 unsigned HOST_WIDE_INT q[8];
20263 unsigned int nonzero_halves[4];
20264
20265 /* Split the two input vectors into 8 quarters. */
20266 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
20267 for (i = 1; i < 8; ++i)
20268 q[i] = q[0] << (nelt4 * i);
20269 for (i = 0; i < 4; ++i)
20270 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
20271 {
20272 nonzero_halves[nzcnt] = i;
20273 ++nzcnt;
20274 }
20275
20276 if (nzcnt == 1)
20277 {
20278 gcc_assert (d->one_operand_p);
20279 nonzero_halves[1] = nonzero_halves[0];
20280 same_halves = true;
20281 }
20282 else if (d->one_operand_p)
20283 {
20284 gcc_assert (nonzero_halves[0] == 0);
20285 gcc_assert (nonzero_halves[1] == 1);
20286 }
20287
20288 if (nzcnt <= 2)
20289 {
20290 if (d->perm[0] / nelt2 == nonzero_halves[1])
20291 {
20292 /* Attempt to increase the likelihood that dfinal
20293 shuffle will be intra-lane. */
20294 std::swap (nonzero_halves[0], nonzero_halves[1]);
20295 }
20296
20297 /* vperm2f128 or vperm2i128. */
20298 for (i = 0; i < nelt2; ++i)
20299 {
20300 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
20301 remap[i + nonzero_halves[0] * nelt2] = i;
20302 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
20303 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
20304 }
20305
20306 if (d->vmode != V8SFmode
20307 && d->vmode != V4DFmode
20308 && d->vmode != V8SImode)
20309 {
20310 dremap.vmode = V8SImode;
20311 dremap.nelt = 8;
20312 for (i = 0; i < 4; ++i)
20313 {
20314 dremap.perm[i] = i + nonzero_halves[0] * 4;
20315 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
20316 }
20317 }
20318 }
20319 else if (d->one_operand_p)
20320 return false;
20321 else if (TARGET_AVX2
20322 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
20323 {
20324 /* vpunpckl* */
20325 for (i = 0; i < nelt4; ++i)
20326 {
20327 remap[i] = i * 2;
20328 remap[i + nelt] = i * 2 + 1;
20329 remap[i + nelt2] = i * 2 + nelt2;
20330 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
20331 dremap.perm[i * 2] = i;
20332 dremap.perm[i * 2 + 1] = i + nelt;
20333 dremap.perm[i * 2 + nelt2] = i + nelt2;
20334 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
20335 }
20336 }
20337 else if (TARGET_AVX2
20338 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
20339 {
20340 /* vpunpckh* */
20341 for (i = 0; i < nelt4; ++i)
20342 {
20343 remap[i + nelt4] = i * 2;
20344 remap[i + nelt + nelt4] = i * 2 + 1;
20345 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
20346 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
20347 dremap.perm[i * 2] = i + nelt4;
20348 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
20349 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
20350 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
20351 }
20352 }
20353 else
20354 return false;
20355 }
20356
20357 /* Use the remapping array set up above to move the elements from their
20358 swizzled locations into their final destinations. */
20359 dfinal = *d;
20360 for (i = 0; i < nelt; ++i)
20361 {
20362 unsigned e = remap[d->perm[i]];
20363 gcc_assert (e < nelt);
20364 /* If same_halves is true, both halves of the remapped vector are the
20365 same. Avoid cross-lane accesses if possible. */
20366 if (same_halves && i >= nelt2)
20367 {
20368 gcc_assert (e < nelt2);
20369 dfinal.perm[i] = e + nelt2;
20370 }
20371 else
20372 dfinal.perm[i] = e;
20373 }
20374 if (!d->testing_p)
20375 {
20376 dremap.target = gen_reg_rtx (dremap.vmode);
20377 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20378 }
20379 dfinal.op1 = dfinal.op0;
20380 dfinal.one_operand_p = true;
20381
20382 /* Test if the final remap can be done with a single insn. For V4SFmode or
20383 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
20384 start_sequence ();
20385 ok = expand_vec_perm_1 (&dfinal);
20386 seq = get_insns ();
20387 end_sequence ();
20388
20389 if (!ok)
20390 return false;
20391
20392 if (d->testing_p)
20393 return true;
20394
20395 if (dremap.vmode != dfinal.vmode)
20396 {
20397 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
20398 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
20399 }
20400
20401 ok = expand_vec_perm_1 (&dremap);
20402 gcc_assert (ok);
20403
20404 emit_insn (seq);
20405 return true;
20406 }
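
/* Worked example (for illustration): for a two-operand V4SImode
   permutation { 1, 5, 0, 4 } all indices fall in the low halves, so
   dremap becomes the punpckldq permutation { 0, 4, 1, 5 } and the
   remap array turns the original request into the single-operand
   follow-up { 2, 3, 0, 1 } on the interleaved vector, which (with
   SSE2) a single pshufd can finish.  */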
20407
20408 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20409 a single vector cross-lane permutation into vpermq followed
20410 by any of the single insn permutations. */
20411
20412 static bool
20413 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
20414 {
20415 struct expand_vec_perm_d dremap, dfinal;
20416 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
20417 unsigned contents[2];
20418 bool ok;
20419
20420 if (!(TARGET_AVX2
20421 && (d->vmode == V32QImode || d->vmode == V16HImode)
20422 && d->one_operand_p))
20423 return false;
20424
20425 contents[0] = 0;
20426 contents[1] = 0;
20427 for (i = 0; i < nelt2; ++i)
20428 {
20429 contents[0] |= 1u << (d->perm[i] / nelt4);
20430 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
20431 }
20432
20433 for (i = 0; i < 2; ++i)
20434 {
20435 unsigned int cnt = 0;
20436 for (j = 0; j < 4; ++j)
20437 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
20438 return false;
20439 }
20440
20441 if (d->testing_p)
20442 return true;
20443
20444 dremap = *d;
20445 dremap.vmode = V4DImode;
20446 dremap.nelt = 4;
20447 dremap.target = gen_reg_rtx (V4DImode);
20448 dremap.op0 = gen_lowpart (V4DImode, d->op0);
20449 dremap.op1 = dremap.op0;
20450 dremap.one_operand_p = true;
20451 for (i = 0; i < 2; ++i)
20452 {
20453 unsigned int cnt = 0;
20454 for (j = 0; j < 4; ++j)
20455 if ((contents[i] & (1u << j)) != 0)
20456 dremap.perm[2 * i + cnt++] = j;
20457 for (; cnt < 2; ++cnt)
20458 dremap.perm[2 * i + cnt] = 0;
20459 }
20460
20461 dfinal = *d;
20462 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
20463 dfinal.op1 = dfinal.op0;
20464 dfinal.one_operand_p = true;
20465 for (i = 0, j = 0; i < nelt; ++i)
20466 {
20467 if (i == nelt2)
20468 j = 2;
20469 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
20470 if ((d->perm[i] / nelt4) == dremap.perm[j])
20471 ;
20472 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
20473 dfinal.perm[i] |= nelt4;
20474 else
20475 gcc_unreachable ();
20476 }
20477
20478 ok = expand_vec_perm_1 (&dremap);
20479 gcc_assert (ok);
20480
20481 ok = expand_vec_perm_1 (&dfinal);
20482 gcc_assert (ok);
20483
20484 return true;
20485 }
20486
20487 static bool canonicalize_perm (struct expand_vec_perm_d *d);
20488
20489 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
20490 a vector permutation using two instructions, vperm2f128 resp.
20491 vperm2i128 followed by any single in-lane permutation. */
20492
20493 static bool
20494 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
20495 {
20496 struct expand_vec_perm_d dfirst, dsecond;
20497 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
20498 bool ok;
20499
20500 if (!TARGET_AVX
20501 || GET_MODE_SIZE (d->vmode) != 32
20502 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
20503 return false;
20504
20505 dsecond = *d;
20506 dsecond.one_operand_p = false;
20507 dsecond.testing_p = true;
20508
20509 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
20510 immediate. For perm < 16 the second permutation uses
20511 d->op0 as first operand, for perm >= 16 it uses d->op1
20512 as first operand. The second operand is the result of
20513 vperm2[fi]128. */
20514 for (perm = 0; perm < 32; perm++)
20515 {
20516 /* Ignore permutations which do not move anything cross-lane. */
20517 if (perm < 16)
20518 {
20519 /* The second shuffle for e.g. V4DFmode has
20520 0123 and ABCD operands.
20521 Ignore AB23, as 23 is already in the second lane
20522 of the first operand. */
20523 if ((perm & 0xc) == (1 << 2)) continue;
20524 /* And 01CD, as 01 is in the first lane of the first
20525 operand. */
20526 if ((perm & 3) == 0) continue;
20527 /* And 4567, as then the vperm2[fi]128 doesn't change
20528 anything on the original 4567 second operand. */
20529 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
20530 }
20531 else
20532 {
20533 /* The second shuffle for e.g. V4DFmode has
20534 4567 and ABCD operands.
20535 Ignore AB67, as 67 is already in the second lane
20536 of the first operand. */
20537 if ((perm & 0xc) == (3 << 2)) continue;
20538 /* And 45CD, as 45 is in the first lane of the first
20539 operand. */
20540 if ((perm & 3) == 2) continue;
20541 /* And 0123, as then the vperm2[fi]128 doesn't change
20542 anything on the original 0123 first operand. */
20543 if ((perm & 0xf) == (1 << 2)) continue;
20544 }
20545
20546 for (i = 0; i < nelt; i++)
20547 {
20548 j = d->perm[i] / nelt2;
20549 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
20550 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
20551 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
20552 dsecond.perm[i] = d->perm[i] & (nelt - 1);
20553 else
20554 break;
20555 }
20556
20557 if (i == nelt)
20558 {
20559 start_sequence ();
20560 ok = expand_vec_perm_1 (&dsecond);
20561 end_sequence ();
20562 }
20563 else
20564 ok = false;
20565
20566 if (ok)
20567 {
20568 if (d->testing_p)
20569 return true;
20570
20571 /* Found a usable second shuffle. dfirst will be
20572 vperm2f128 on d->op0 and d->op1. */
20573 dsecond.testing_p = false;
20574 dfirst = *d;
20575 dfirst.target = gen_reg_rtx (d->vmode);
20576 for (i = 0; i < nelt; i++)
20577 dfirst.perm[i] = (i & (nelt2 - 1))
20578 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
20579
20580 canonicalize_perm (&dfirst);
20581 ok = expand_vec_perm_1 (&dfirst);
20582 gcc_assert (ok);
20583
20584 /* And dsecond is some single insn shuffle, taking
20585 d->op0 and result of vperm2f128 (if perm < 16) or
20586 d->op1 and result of vperm2f128 (otherwise). */
20587 if (perm >= 16)
20588 dsecond.op0 = dsecond.op1;
20589 dsecond.op1 = dfirst.target;
20590
20591 ok = expand_vec_perm_1 (&dsecond);
20592 gcc_assert (ok);
20593
20594 return true;
20595 }
20596
20597 /* For one operand, the only useful vperm2f128 permutation is 0x01
20598 aka lanes swap. */
20599 if (d->one_operand_p)
20600 return false;
20601 }
20602
20603 return false;
20604 }
20605
20606 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20607 a two vector permutation using 2 intra-lane interleave insns
20608 and cross-lane shuffle for 32-byte vectors. */
20609
20610 static bool
20611 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
20612 {
20613 unsigned i, nelt;
20614 rtx (*gen) (rtx, rtx, rtx);
20615
20616 if (d->one_operand_p)
20617 return false;
20618 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
20619 ;
20620 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
20621 ;
20622 else
20623 return false;
20624
20625 nelt = d->nelt;
20626 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
20627 return false;
20628 for (i = 0; i < nelt; i += 2)
20629 if (d->perm[i] != d->perm[0] + i / 2
20630 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
20631 return false;
20632
20633 if (d->testing_p)
20634 return true;
20635
20636 switch (d->vmode)
20637 {
20638 case E_V32QImode:
20639 if (d->perm[0])
20640 gen = gen_vec_interleave_highv32qi;
20641 else
20642 gen = gen_vec_interleave_lowv32qi;
20643 break;
20644 case E_V16HImode:
20645 if (d->perm[0])
20646 gen = gen_vec_interleave_highv16hi;
20647 else
20648 gen = gen_vec_interleave_lowv16hi;
20649 break;
20650 case E_V8SImode:
20651 if (d->perm[0])
20652 gen = gen_vec_interleave_highv8si;
20653 else
20654 gen = gen_vec_interleave_lowv8si;
20655 break;
20656 case E_V4DImode:
20657 if (d->perm[0])
20658 gen = gen_vec_interleave_highv4di;
20659 else
20660 gen = gen_vec_interleave_lowv4di;
20661 break;
20662 case E_V8SFmode:
20663 if (d->perm[0])
20664 gen = gen_vec_interleave_highv8sf;
20665 else
20666 gen = gen_vec_interleave_lowv8sf;
20667 break;
20668 case E_V4DFmode:
20669 if (d->perm[0])
20670 gen = gen_vec_interleave_highv4df;
20671 else
20672 gen = gen_vec_interleave_lowv4df;
20673 break;
20674 default:
20675 gcc_unreachable ();
20676 }
20677
20678 emit_insn (gen (d->target, d->op0, d->op1));
20679 return true;
20680 }
20681
20682 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20683 a single vector permutation using a single intra-lane vector
20684 permutation, vperm2f128 swapping the lanes and vblend* insn blending
20685 the non-swapped and swapped vectors together. */
20686
20687 static bool
20688 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
20689 {
20690 struct expand_vec_perm_d dfirst, dsecond;
20691 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
20692 rtx_insn *seq;
20693 bool ok;
20694 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
20695
20696 if (!TARGET_AVX
20697 || TARGET_AVX2
20698 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
20699 || !d->one_operand_p)
20700 return false;
20701
20702 dfirst = *d;
20703 for (i = 0; i < nelt; i++)
20704 dfirst.perm[i] = 0xff;
20705 for (i = 0, msk = 0; i < nelt; i++)
20706 {
20707 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
20708 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
20709 return false;
20710 dfirst.perm[j] = d->perm[i];
20711 if (j != i)
20712 msk |= (1 << i);
20713 }
20714 for (i = 0; i < nelt; i++)
20715 if (dfirst.perm[i] == 0xff)
20716 dfirst.perm[i] = i;
20717
20718 if (!d->testing_p)
20719 dfirst.target = gen_reg_rtx (dfirst.vmode);
20720
20721 start_sequence ();
20722 ok = expand_vec_perm_1 (&dfirst);
20723 seq = get_insns ();
20724 end_sequence ();
20725
20726 if (!ok)
20727 return false;
20728
20729 if (d->testing_p)
20730 return true;
20731
20732 emit_insn (seq);
20733
20734 dsecond = *d;
20735 dsecond.op0 = dfirst.target;
20736 dsecond.op1 = dfirst.target;
20737 dsecond.one_operand_p = true;
20738 dsecond.target = gen_reg_rtx (dsecond.vmode);
20739 for (i = 0; i < nelt; i++)
20740 dsecond.perm[i] = i ^ nelt2;
20741
20742 ok = expand_vec_perm_1 (&dsecond);
20743 gcc_assert (ok);
20744
20745 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
20746 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
20747 return true;
20748 }
20749
20750 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
20751 a two vector permutation using two single vector permutations and
20752 {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
20753 of dfirst or dsecond is identity permutation. */
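/* For illustration: the V4SFmode selector { 0, 5, 2, 7 } alternates the
   two inputs, so it is handled below by permuting op0 to { 0, 2, 0, 2 },
   op1 to { 1, 3, 1, 3 } and interleaving the low halves with unpcklps,
   giving { op0[0], op1[1], op0[2], op1[3] }. */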
20754
20755 static bool
20756 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
20757 {
20758 unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
20759 struct expand_vec_perm_d dfirst, dsecond, dfinal;
20760 bool ident1 = true, ident2 = true;
20761
20762 if (d->one_operand_p)
20763 return false;
20764
20765 if (GET_MODE_SIZE (d->vmode) == 16)
20766 {
20767 if (!TARGET_SSE)
20768 return false;
20769 if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
20770 return false;
20771 }
20772 else if (GET_MODE_SIZE (d->vmode) == 32)
20773 {
20774 if (!TARGET_AVX)
20775 return false;
20776 if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
20777 return false;
20778 lane = nelt2;
20779 }
20780 else
20781 return false;
20782
20783 for (i = 1; i < nelt; i++)
20784 if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
20785 return false;
20786
20787 dfirst = *d;
20788 dsecond = *d;
20789 dfinal = *d;
20790 dfirst.op1 = dfirst.op0;
20791 dfirst.one_operand_p = true;
20792 dsecond.op0 = dsecond.op1;
20793 dsecond.one_operand_p = true;
20794
20795 for (i = 0; i < nelt; i++)
20796 if (d->perm[i] >= nelt)
20797 {
20798 dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
20799 if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
20800 ident2 = false;
20801 dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
20802 = d->perm[i] - nelt;
20803 }
20804 else
20805 {
20806 dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
20807 if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
20808 ident1 = false;
20809 dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
20810 }
20811
20812 if (two_insn && !ident1 && !ident2)
20813 return false;
20814
20815 if (!d->testing_p)
20816 {
20817 if (!ident1)
20818 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
20819 if (!ident2)
20820 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
20821 if (d->perm[0] >= nelt)
20822 std::swap (dfinal.op0, dfinal.op1);
20823 }
20824
20825 bool ok;
20826 rtx_insn *seq1 = NULL, *seq2 = NULL;
20827
20828 if (!ident1)
20829 {
20830 start_sequence ();
20831 ok = expand_vec_perm_1 (&dfirst);
20832 seq1 = get_insns ();
20833 end_sequence ();
20834
20835 if (!ok)
20836 return false;
20837 }
20838
20839 if (!ident2)
20840 {
20841 start_sequence ();
20842 ok = expand_vec_perm_1 (&dsecond);
20843 seq2 = get_insns ();
20844 end_sequence ();
20845
20846 if (!ok)
20847 return false;
20848 }
20849
20850 if (d->testing_p)
20851 return true;
20852
20853 for (i = 0; i < nelt; i++)
20854 {
20855 dfinal.perm[i] = i / 2;
20856 if (i >= lane)
20857 dfinal.perm[i] += lane / 2;
20858 if ((i & 1) != 0)
20859 dfinal.perm[i] += nelt;
20860 }
20861 emit_insn (seq1);
20862 emit_insn (seq2);
20863 ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
20864 dfinal.perm, dfinal.nelt, false);
20865 gcc_assert (ok);
20866 return true;
20867 }
20868
20869 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20870 the permutation using two single vector permutations and the SSE4_1 pblendv
20871 instruction. If two_insn, succeed only if one of dfirst or dsecond is
20872 identity permutation. */
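/* For illustration: the V4SImode selector { 2, 7, 4, 1 } is handled below
   by permuting op0 to { 2, 1, 2, 1 }, op1 to { 0, 3, 0, 3 } (the 0xff
   wildcards filled with the identity) and then blending with a mask that
   takes elements 1 and 2 from the permuted op1. */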
20873
20874 static bool
20875 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
20876 {
20877 unsigned i, nelt = d->nelt;
20878 struct expand_vec_perm_d dfirst, dsecond, dfinal;
20879 machine_mode vmode = d->vmode;
20880 bool ident1 = true, ident2 = true;
20881
20882 /* Use the same checks as in expand_vec_perm_blend. */
20883 if (d->one_operand_p)
20884 return false;
20885 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
20886 ;
20887 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
20888 ;
20889 else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
20890 || GET_MODE_SIZE (vmode) == 8
20891 || GET_MODE_SIZE (vmode) == 4))
20892 ;
20893 else
20894 return false;
20895
20896 dfirst = *d;
20897 dsecond = *d;
20898 dfinal = *d;
20899 dfirst.op1 = dfirst.op0;
20900 dfirst.one_operand_p = true;
20901 dsecond.op0 = dsecond.op1;
20902 dsecond.one_operand_p = true;
20903
20904 for (i = 0; i < nelt; ++i)
20905 if (d->perm[i] >= nelt)
20906 {
20907 dfirst.perm[i] = 0xff;
20908 dsecond.perm[i] = d->perm[i] - nelt;
20909 if (d->perm[i] != i + nelt)
20910 ident2 = false;
20911 }
20912 else
20913 {
20914 dsecond.perm[i] = 0xff;
20915 dfirst.perm[i] = d->perm[i];
20916 if (d->perm[i] != i)
20917 ident1 = false;
20918 }
20919
20920 if (two_insn && !ident1 && !ident2)
20921 return false;
20922
20923 /* For now. Ideally treat 0xff as a wildcard. */
20924 for (i = 0; i < nelt; ++i)
20925 if (dfirst.perm[i] == 0xff)
20926 {
20927 if (GET_MODE_SIZE (vmode) == 32
20928 && dfirst.perm[i ^ (nelt / 2)] != 0xff)
20929 dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
20930 else
20931 dfirst.perm[i] = i;
20932 }
20933 else
20934 {
20935 if (GET_MODE_SIZE (vmode) == 32
20936 && dsecond.perm[i ^ (nelt / 2)] != 0xff)
20937 dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
20938 else
20939 dsecond.perm[i] = i;
20940 }
20941
20942 if (!d->testing_p)
20943 {
20944 if (!ident1)
20945 dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
20946 if (!ident2)
20947 dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
20948 }
20949
20950 bool ok;
20951 rtx_insn *seq1 = NULL, *seq2 = NULL;
20952
20953 if (!ident1)
20954 {
20955 start_sequence ();
20956 ok = expand_vec_perm_1 (&dfirst);
20957 seq1 = get_insns ();
20958 end_sequence ();
20959
20960 if (!ok)
20961 return false;
20962 }
20963
20964 if (!ident2)
20965 {
20966 start_sequence ();
20967 ok = expand_vec_perm_1 (&dsecond);
20968 seq2 = get_insns ();
20969 end_sequence ();
20970
20971 if (!ok)
20972 return false;
20973 }
20974
20975 if (d->testing_p)
20976 return true;
20977
20978 for (i = 0; i < nelt; ++i)
20979 dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
20980
20981 emit_insn (seq1);
20982 emit_insn (seq2);
20983 ok = expand_vec_perm_blend (&dfinal);
20984 gcc_assert (ok);
20985 return true;
20986 }
20987
20988 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
20989 permutation using two vperm2f128, followed by a vshufpd insn blending
20990 the two vectors together. */
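/* For illustration: for the selector { 3, 0, 5, 6 } the code below builds
   dfirst = { 2, 3, 4, 5 } and dsecond = { 0, 1, 6, 7 } (each a single
   vperm2f128) and then dthird = { 1, 4, 3, 6 }, which fits the vshufpd
   pattern and picks the requested element out of each lane. */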
20991
20992 static bool
20993 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
20994 {
20995 struct expand_vec_perm_d dfirst, dsecond, dthird;
20996 bool ok;
20997
20998 if (!TARGET_AVX || (d->vmode != V4DFmode))
20999 return false;
21000
21001 if (d->testing_p)
21002 return true;
21003
21004 dfirst = *d;
21005 dsecond = *d;
21006 dthird = *d;
21007
21008 dfirst.perm[0] = (d->perm[0] & ~1);
21009 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
21010 dfirst.perm[2] = (d->perm[2] & ~1);
21011 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
21012 dsecond.perm[0] = (d->perm[1] & ~1);
21013 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
21014 dsecond.perm[2] = (d->perm[3] & ~1);
21015 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
21016 dthird.perm[0] = (d->perm[0] % 2);
21017 dthird.perm[1] = (d->perm[1] % 2) + 4;
21018 dthird.perm[2] = (d->perm[2] % 2) + 2;
21019 dthird.perm[3] = (d->perm[3] % 2) + 6;
21020
21021 dfirst.target = gen_reg_rtx (dfirst.vmode);
21022 dsecond.target = gen_reg_rtx (dsecond.vmode);
21023 dthird.op0 = dfirst.target;
21024 dthird.op1 = dsecond.target;
21025 dthird.one_operand_p = false;
21026
21027 canonicalize_perm (&dfirst);
21028 canonicalize_perm (&dsecond);
21029
21030 ok = expand_vec_perm_1 (&dfirst)
21031 && expand_vec_perm_1 (&dsecond)
21032 && expand_vec_perm_1 (&dthird);
21033
21034 gcc_assert (ok);
21035
21036 return true;
21037 }
21038
21039 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
21040
21041 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21042 a two vector permutation using two intra-lane vector
21043 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21044 the non-swapped and swapped vectors together. */
21045
21046 static bool
21047 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
21048 {
21049 struct expand_vec_perm_d dfirst, dsecond, dthird;
21050 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
21051 rtx_insn *seq1, *seq2;
21052 bool ok;
21053 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
21054
21055 if (!TARGET_AVX
21056 || TARGET_AVX2
21057 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
21058 || d->one_operand_p)
21059 return false;
21060
21061 dfirst = *d;
21062 dsecond = *d;
21063 for (i = 0; i < nelt; i++)
21064 {
21065 dfirst.perm[i] = 0xff;
21066 dsecond.perm[i] = 0xff;
21067 }
21068 for (i = 0, msk = 0; i < nelt; i++)
21069 {
21070 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
21071 if (j == i)
21072 {
21073 dfirst.perm[j] = d->perm[i];
21074 which1 |= (d->perm[i] < nelt ? 1 : 2);
21075 }
21076 else
21077 {
21078 dsecond.perm[j] = d->perm[i];
21079 which2 |= (d->perm[i] < nelt ? 1 : 2);
21080 msk |= (1U << i);
21081 }
21082 }
21083 if (msk == 0 || msk == (1U << nelt) - 1)
21084 return false;
21085
21086 if (!d->testing_p)
21087 {
21088 dfirst.target = gen_reg_rtx (dfirst.vmode);
21089 dsecond.target = gen_reg_rtx (dsecond.vmode);
21090 }
21091
21092 for (i = 0; i < nelt; i++)
21093 {
21094 if (dfirst.perm[i] == 0xff)
21095 dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
21096 if (dsecond.perm[i] == 0xff)
21097 dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
21098 }
21099 canonicalize_perm (&dfirst);
21100 start_sequence ();
21101 ok = ix86_expand_vec_perm_const_1 (&dfirst);
21102 seq1 = get_insns ();
21103 end_sequence ();
21104
21105 if (!ok)
21106 return false;
21107
21108 canonicalize_perm (&dsecond);
21109 start_sequence ();
21110 ok = ix86_expand_vec_perm_const_1 (&dsecond);
21111 seq2 = get_insns ();
21112 end_sequence ();
21113
21114 if (!ok)
21115 return false;
21116
21117 if (d->testing_p)
21118 return true;
21119
21120 emit_insn (seq1);
21121 emit_insn (seq2);
21122
21123 dthird = *d;
21124 dthird.op0 = dsecond.target;
21125 dthird.op1 = dsecond.target;
21126 dthird.one_operand_p = true;
21127 dthird.target = gen_reg_rtx (dthird.vmode);
21128 for (i = 0; i < nelt; i++)
21129 dthird.perm[i] = i ^ nelt2;
21130
21131 ok = expand_vec_perm_1 (&dthird);
21132 gcc_assert (ok);
21133
21134 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
21135 emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
21136 return true;
21137 }
21138
21139 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
21140 permutation with two pshufb insns and an ior. We should have already
21141 failed all two instruction sequences. */
21142
21143 static bool
21144 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
21145 {
21146 rtx rperm[2][16], vperm, l, h, op, m128;
21147 unsigned int i, nelt, eltsz;
21148 machine_mode mode;
21149 rtx (*gen) (rtx, rtx, rtx);
21150
21151 if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
21152 && GET_MODE_SIZE (d->vmode) != 8
21153 && GET_MODE_SIZE (d->vmode) != 4))
21154 return false;
21155 gcc_assert (!d->one_operand_p);
21156
21157 if (d->testing_p)
21158 return true;
21159
21160 switch (GET_MODE_SIZE (d->vmode))
21161 {
21162 case 4:
21163 mode = V4QImode;
21164 gen = gen_mmx_pshufbv4qi3;
21165 break;
21166 case 8:
21167 mode = V8QImode;
21168 gen = gen_mmx_pshufbv8qi3;
21169 break;
21170 case 16:
21171 mode = V16QImode;
21172 gen = gen_ssse3_pshufbv16qi3;
21173 break;
21174 default:
21175 gcc_unreachable ();
21176 }
21177
21178 nelt = d->nelt;
21179 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21180
21181 /* Generate two permutation masks. If the required element is within
21182 the given vector it is shuffled into the proper lane. If the required
21183 element is in the other vector, force a zero into the lane by setting
21184 bit 7 in the permutation mask. */
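  /* For illustration, the V8HImode extract-even case ends up with masks
     equivalent to this intrinsics sketch (hypothetical variable names):

       __m128i lo = _mm_shuffle_epi8 (op0, _mm_setr_epi8 (0, 1, 4, 5, 8, 9,
				      12, 13, -128, -128, -128, -128, -128,
				      -128, -128, -128));
       __m128i hi = _mm_shuffle_epi8 (op1, _mm_setr_epi8 (-128, -128, -128,
				      -128, -128, -128, -128, -128, 0, 1,
				      4, 5, 8, 9, 12, 13));
       __m128i res = _mm_or_si128 (lo, hi);  */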
21185 m128 = GEN_INT (-128);
21186 for (i = 0; i < nelt; ++i)
21187 {
21188 unsigned j, k, e = d->perm[i];
21189 unsigned which = (e >= nelt);
21190 if (e >= nelt)
21191 e -= nelt;
21192
21193 for (j = 0; j < eltsz; ++j)
21194 {
21195 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
21196 rperm[1-which][i*eltsz + j] = m128;
21197 }
21198
21199 for (k = i*eltsz + j; k < 16; ++k)
21200 rperm[0][k] = rperm[1][k] = m128;
21201 }
21202
21203 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
21204 vperm = force_reg (V16QImode, vperm);
21205
21206 l = gen_reg_rtx (mode);
21207 op = gen_lowpart (mode, d->op0);
21208 emit_insn (gen (l, op, vperm));
21209
21210 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
21211 vperm = force_reg (V16QImode, vperm);
21212
21213 h = gen_reg_rtx (mode);
21214 op = gen_lowpart (mode, d->op1);
21215 emit_insn (gen (h, op, vperm));
21216
21217 op = d->target;
21218 if (d->vmode != mode)
21219 op = gen_reg_rtx (mode);
21220 ix86_emit_vec_binop (IOR, mode, op, l, h);
21221 if (op != d->target)
21222 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21223
21224 return true;
21225 }
21226
21227 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
21228 with two vpshufb insns, vpermq and vpor. We should have already failed
21229 all two or three instruction sequences. */
21230
21231 static bool
21232 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
21233 {
21234 rtx rperm[2][32], vperm, l, h, hp, op, m128;
21235 unsigned int i, nelt, eltsz;
21236
21237 if (!TARGET_AVX2
21238 || !d->one_operand_p
21239 || (d->vmode != V32QImode && d->vmode != V16HImode))
21240 return false;
21241
21242 if (d->testing_p)
21243 return true;
21244
21245 nelt = d->nelt;
21246 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21247
21248 /* Generate two permutation masks. If the required element is within
21249 the same lane, it is shuffled in. If the required element is from the
21250 other lane, force a zero by setting bit 7 in the permutation mask.
21251 The other mask has non-negative elements where an element is
21252 requested from the other lane, but it is also moved to the other lane,
21253 so that the result of vpshufb can have the two V2TImode halves
21254 swapped. */
21255 m128 = GEN_INT (-128);
21256 for (i = 0; i < nelt; ++i)
21257 {
21258 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21259 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
21260
21261 for (j = 0; j < eltsz; ++j)
21262 {
21263 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
21264 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
21265 }
21266 }
21267
21268 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21269 vperm = force_reg (V32QImode, vperm);
21270
21271 h = gen_reg_rtx (V32QImode);
21272 op = gen_lowpart (V32QImode, d->op0);
21273 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21274
21275 /* Swap the 128-bit lanes of h into hp. */
21276 hp = gen_reg_rtx (V4DImode);
21277 op = gen_lowpart (V4DImode, h);
21278 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
21279 const1_rtx));
21280
21281 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21282 vperm = force_reg (V32QImode, vperm);
21283
21284 l = gen_reg_rtx (V32QImode);
21285 op = gen_lowpart (V32QImode, d->op0);
21286 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21287
21288 op = d->target;
21289 if (d->vmode != V32QImode)
21290 op = gen_reg_rtx (V32QImode);
21291 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
21292 if (op != d->target)
21293 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21294
21295 return true;
21296 }
21297
21298 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21299 and extract-odd permutations of two V32QImode or V16HImode operands
21300 with two vpshufb insns, vpor and vpermq. We should have already
21301 failed all two or three instruction sequences. */
21302
21303 static bool
21304 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
21305 {
21306 rtx rperm[2][32], vperm, l, h, ior, op, m128;
21307 unsigned int i, nelt, eltsz;
21308
21309 if (!TARGET_AVX2
21310 || d->one_operand_p
21311 || (d->vmode != V32QImode && d->vmode != V16HImode))
21312 return false;
21313
21314 for (i = 0; i < d->nelt; ++i)
21315 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
21316 return false;
21317
21318 if (d->testing_p)
21319 return true;
21320
21321 nelt = d->nelt;
21322 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
21323
21324 /* Generate two permutation masks. In the first permutation mask
21325 the first quarter will contain indexes for the first half
21326 of op0, the second quarter will contain bit 7 set, the third quarter
21327 will contain indexes for the second half of op0 and the
21328 last quarter bit 7 set. In the second permutation mask
21329 the first quarter will contain bit 7 set, the second quarter
21330 indexes for the first half of op1, the third quarter bit 7 set
21331 and the last quarter indexes for the second half of op1.
21332 I.e. the first mask e.g. for V32QImode extract even will be:
21333 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21334 (all values masked with 0xf except for -128) and second mask
21335 for extract even will be
21336 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21337 m128 = GEN_INT (-128);
21338 for (i = 0; i < nelt; ++i)
21339 {
21340 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
21341 unsigned which = d->perm[i] >= nelt;
21342 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
21343
21344 for (j = 0; j < eltsz; ++j)
21345 {
21346 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
21347 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
21348 }
21349 }
21350
21351 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
21352 vperm = force_reg (V32QImode, vperm);
21353
21354 l = gen_reg_rtx (V32QImode);
21355 op = gen_lowpart (V32QImode, d->op0);
21356 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
21357
21358 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
21359 vperm = force_reg (V32QImode, vperm);
21360
21361 h = gen_reg_rtx (V32QImode);
21362 op = gen_lowpart (V32QImode, d->op1);
21363 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
21364
21365 ior = gen_reg_rtx (V32QImode);
21366 emit_insn (gen_iorv32qi3 (ior, l, h));
21367
21368 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
21369 op = gen_reg_rtx (V4DImode);
21370 ior = gen_lowpart (V4DImode, ior);
21371 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
21372 const1_rtx, GEN_INT (3)));
21373 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
21374
21375 return true;
21376 }
21377
21378 /* Implement permutation with pslldq + psrldq + por when pshufb is not
21379 available. */
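/* For illustration: the V16QImode selector { 4, 5, ..., 15, 16, 17, 18, 19 }
   (two consecutive runs, one from each operand) is handled below by shifting
   op0 right by 4 bytes, shifting op1 left by 12 bytes and or-ing the two;
   no pand/pandn is needed since neither run requires masking. */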
21380 static bool
21381 expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
21382 {
21383 unsigned i, nelt = d->nelt;
21384 unsigned start1, end1 = -1;
21385 machine_mode vmode = d->vmode, imode;
21386 int start2 = -1;
21387 bool clear_op0, clear_op1;
21388 unsigned inner_size;
21389 rtx op0, op1, dop1;
21390 rtx (*gen_vec_shr) (rtx, rtx, rtx);
21391 rtx (*gen_vec_shl) (rtx, rtx, rtx);
21392
21393 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
21394 if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
21395 return false;
21396
21397 start1 = d->perm[0];
21398 for (i = 1; i < nelt; i++)
21399 {
21400 if (d->perm[i] != d->perm[i-1] + 1
21401 || d->perm[i] == nelt)
21402 {
21403 if (start2 == -1)
21404 {
21405 start2 = d->perm[i];
21406 end1 = d->perm[i-1];
21407 }
21408 else
21409 return false;
21410 }
21411 }
21412
21413 clear_op0 = end1 != nelt - 1;
21414 clear_op1 = start2 % nelt != 0;
21415 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
21416 if (!pandn && (clear_op0 || clear_op1))
21417 return false;
21418
21419 if (d->testing_p)
21420 return true;
21421
21422 gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
21423 gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
21424 imode = GET_MODE_INNER (vmode);
21425 inner_size = GET_MODE_BITSIZE (imode);
21426 op0 = gen_reg_rtx (vmode);
21427 op1 = gen_reg_rtx (vmode);
21428
21429 if (start1)
21430 emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
21431 else
21432 emit_move_insn (op0, d->op0);
21433
21434 dop1 = d->op1;
21435 if (d->one_operand_p)
21436 dop1 = d->op0;
21437
21438 int shl_offset = end1 - start1 + 1 - start2 % nelt;
21439 if (shl_offset)
21440 emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
21441 else
21442 emit_move_insn (op1, dop1);
21443
21444 /* Clear lower/upper bits for op0/op1. */
21445 if (clear_op0 || clear_op1)
21446 {
21447 rtx vec[16];
21448 rtx const_vec;
21449 rtx clear;
21450 for (i = 0; i != nelt; i++)
21451 {
21452 if (i < (end1 - start1 + 1))
21453 vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
21454 else
21455 vec[i] = CONST0_RTX (imode);
21456 }
21457 const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
21458 const_vec = validize_mem (force_const_mem (vmode, const_vec));
21459 clear = force_reg (vmode, const_vec);
21460
21461 if (clear_op0)
21462 emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
21463 if (clear_op1)
21464 emit_move_insn (op1, gen_rtx_AND (vmode,
21465 gen_rtx_NOT (vmode, clear),
21466 op1));
21467 }
21468
21469 emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
21470 return true;
21471 }
21472
21473 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21474 and extract-odd permutations of two V4HI, V8QI, V8HI, V16QI, V16HI or V32QI
21475 operands with two "and" and "pack" or two "shift" and "pack" insns.
21476 We should have already failed all two instruction sequences. */
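/* For illustration, the V16QImode case corresponds to this intrinsics
   sketch (hypothetical variable names), assuming SSE2:

     __m128i m    = _mm_set1_epi16 (0x00ff);
     __m128i even = _mm_packus_epi16 (_mm_and_si128 (a, m),
				      _mm_and_si128 (b, m));
     __m128i odd  = _mm_packus_epi16 (_mm_srli_epi16 (a, 8),
				      _mm_srli_epi16 (b, 8));  */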
21477
21478 static bool
21479 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
21480 {
21481 rtx op, dop0, dop1, t;
21482 unsigned i, odd, c, s, nelt = d->nelt;
21483 bool end_perm = false;
21484 machine_mode half_mode;
21485 rtx (*gen_and) (rtx, rtx, rtx);
21486 rtx (*gen_pack) (rtx, rtx, rtx);
21487 rtx (*gen_shift) (rtx, rtx, rtx);
21488
21489 if (d->one_operand_p)
21490 return false;
21491
21492 switch (d->vmode)
21493 {
21494 case E_V4HImode:
21495 /* Required for "pack". */
21496 if (!TARGET_SSE4_1)
21497 return false;
21498 c = 0xffff;
21499 s = 16;
21500 half_mode = V2SImode;
21501 gen_and = gen_andv2si3;
21502 gen_pack = gen_mmx_packusdw;
21503 gen_shift = gen_lshrv2si3;
21504 break;
21505 case E_V8HImode:
21506 /* Required for "pack". */
21507 if (!TARGET_SSE4_1)
21508 return false;
21509 c = 0xffff;
21510 s = 16;
21511 half_mode = V4SImode;
21512 gen_and = gen_andv4si3;
21513 gen_pack = gen_sse4_1_packusdw;
21514 gen_shift = gen_lshrv4si3;
21515 break;
21516 case E_V8QImode:
21517 /* No check as all instructions are SSE2. */
21518 c = 0xff;
21519 s = 8;
21520 half_mode = V4HImode;
21521 gen_and = gen_andv4hi3;
21522 gen_pack = gen_mmx_packuswb;
21523 gen_shift = gen_lshrv4hi3;
21524 break;
21525 case E_V16QImode:
21526 /* No check as all instructions are SSE2. */
21527 c = 0xff;
21528 s = 8;
21529 half_mode = V8HImode;
21530 gen_and = gen_andv8hi3;
21531 gen_pack = gen_sse2_packuswb;
21532 gen_shift = gen_lshrv8hi3;
21533 break;
21534 case E_V16HImode:
21535 if (!TARGET_AVX2)
21536 return false;
21537 c = 0xffff;
21538 s = 16;
21539 half_mode = V8SImode;
21540 gen_and = gen_andv8si3;
21541 gen_pack = gen_avx2_packusdw;
21542 gen_shift = gen_lshrv8si3;
21543 end_perm = true;
21544 break;
21545 case E_V32QImode:
21546 if (!TARGET_AVX2)
21547 return false;
21548 c = 0xff;
21549 s = 8;
21550 half_mode = V16HImode;
21551 gen_and = gen_andv16hi3;
21552 gen_pack = gen_avx2_packuswb;
21553 gen_shift = gen_lshrv16hi3;
21554 end_perm = true;
21555 break;
21556 default:
21557 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
21558 are more profitable than general shuffles. */
21559 return false;
21560 }
21561
21562 /* Check that permutation is even or odd. */
21563 odd = d->perm[0];
21564 if (odd > 1)
21565 return false;
21566
21567 for (i = 1; i < nelt; ++i)
21568 if (d->perm[i] != 2 * i + odd)
21569 return false;
21570
21571 if (d->testing_p)
21572 return true;
21573
21574 dop0 = gen_reg_rtx (half_mode);
21575 dop1 = gen_reg_rtx (half_mode);
21576 if (odd == 0)
21577 {
21578 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
21579 t = force_reg (half_mode, t);
21580 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
21581 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
21582 }
21583 else
21584 {
21585 emit_insn (gen_shift (dop0,
21586 gen_lowpart (half_mode, d->op0),
21587 GEN_INT (s)));
21588 emit_insn (gen_shift (dop1,
21589 gen_lowpart (half_mode, d->op1),
21590 GEN_INT (s)));
21591 }
21592 /* In the AVX2 256-bit case we need to permute the pack result. */
21593 if (TARGET_AVX2 && end_perm)
21594 {
21595 op = gen_reg_rtx (d->vmode);
21596 t = gen_reg_rtx (V4DImode);
21597 emit_insn (gen_pack (op, dop0, dop1));
21598 emit_insn (gen_avx2_permv4di_1 (t,
21599 gen_lowpart (V4DImode, op),
21600 const0_rtx,
21601 const2_rtx,
21602 const1_rtx,
21603 GEN_INT (3)));
21604 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
21605 }
21606 else
21607 emit_insn (gen_pack (d->target, dop0, dop1));
21608
21609 return true;
21610 }
21611
21612 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21613 and extract-odd permutations of two V64QI operands
21614 with two "shift" insns, two "trunc" insns and one "concat" insn for
21615 "odd", and two "trunc" insns and one "concat" insn for "even".
21616 We should have already failed all two instruction sequences. */
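/* For illustration: for "odd" the code below shifts each V32HImode view
   right by 8 bits (vpsrlw), truncates the words to bytes (vpmovwb) and
   concatenates the two V32QImode halves; for "even" the shifts are skipped
   and the truncations alone pick the even bytes. */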
21617
21618 static bool
21619 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
21620 {
21621 rtx t1, t2, t3, t4;
21622 unsigned i, odd, nelt = d->nelt;
21623
21624 if (!TARGET_AVX512BW
21625 || d->one_operand_p
21626 || d->vmode != V64QImode)
21627 return false;
21628
21629 /* Check that permutation is even or odd. */
21630 odd = d->perm[0];
21631 if (odd > 1)
21632 return false;
21633
21634 for (i = 1; i < nelt; ++i)
21635 if (d->perm[i] != 2 * i + odd)
21636 return false;
21637
21638 if (d->testing_p)
21639 return true;
21640
21641
21642 if (odd)
21643 {
21644 t1 = gen_reg_rtx (V32HImode);
21645 t2 = gen_reg_rtx (V32HImode);
21646 emit_insn (gen_lshrv32hi3 (t1,
21647 gen_lowpart (V32HImode, d->op0),
21648 GEN_INT (8)));
21649 emit_insn (gen_lshrv32hi3 (t2,
21650 gen_lowpart (V32HImode, d->op1),
21651 GEN_INT (8)));
21652 }
21653 else
21654 {
21655 t1 = gen_lowpart (V32HImode, d->op0);
21656 t2 = gen_lowpart (V32HImode, d->op1);
21657 }
21658
21659 t3 = gen_reg_rtx (V32QImode);
21660 t4 = gen_reg_rtx (V32QImode);
21661 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
21662 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
21663 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
21664
21665 return true;
21666 }
21667
21668 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
21669 and extract-odd permutations. */
21670
21671 static bool
21672 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
21673 {
21674 rtx t1, t2, t3, t4, t5;
21675
21676 switch (d->vmode)
21677 {
21678 case E_V4DFmode:
21679 if (d->testing_p)
21680 break;
21681 t1 = gen_reg_rtx (V4DFmode);
21682 t2 = gen_reg_rtx (V4DFmode);
21683
21684 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21685 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
21686 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
21687
21688 /* Now an unpck[lh]pd will produce the result required. */
21689 if (odd)
21690 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
21691 else
21692 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
21693 emit_insn (t3);
21694 break;
21695
21696 case E_V8SFmode:
21697 {
21698 int mask = odd ? 0xdd : 0x88;
21699
21700 if (d->testing_p)
21701 break;
21702 t1 = gen_reg_rtx (V8SFmode);
21703 t2 = gen_reg_rtx (V8SFmode);
21704 t3 = gen_reg_rtx (V8SFmode);
21705
21706 /* Shuffle within the 128-bit lanes to produce:
21707 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
21708 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
21709 GEN_INT (mask)));
21710
21711 /* Shuffle the lanes around to produce:
21712 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
21713 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
21714 GEN_INT (0x3)));
21715
21716 /* Shuffle within the 128-bit lanes to produce:
21717 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
21718 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
21719
21720 /* Shuffle within the 128-bit lanes to produce:
21721 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
21722 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
21723
21724 /* Shuffle the lanes around to produce:
21725 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
21726 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
21727 GEN_INT (0x20)));
21728 }
21729 break;
21730
21731 case E_V2DFmode:
21732 case E_V4SFmode:
21733 case E_V2DImode:
21734 case E_V2SImode:
21735 case E_V4SImode:
21736 case E_V2HImode:
21737 /* These are always directly implementable by expand_vec_perm_1. */
21738 gcc_unreachable ();
21739
21740 case E_V2SFmode:
21741 gcc_assert (TARGET_MMX_WITH_SSE);
21742 /* We have no suitable instructions. */
21743 if (d->testing_p)
21744 return false;
21745 break;
21746
21747 case E_V4QImode:
21748 if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21749 return expand_vec_perm_pshufb2 (d);
21750 else
21751 {
21752 if (d->testing_p)
21753 break;
21754 /* We need 2*log2(N)-1 operations to achieve odd/even
21755 with interleave. */
21756 t1 = gen_reg_rtx (V4QImode);
21757 emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
21758 emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
21759 if (odd)
21760 t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
21761 else
21762 t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
21763 emit_insn (t2);
21764 }
21765 break;
21766
21767 case E_V4HImode:
21768 if (TARGET_SSE4_1)
21769 return expand_vec_perm_even_odd_pack (d);
21770 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21771 return expand_vec_perm_pshufb2 (d);
21772 else
21773 {
21774 if (d->testing_p)
21775 break;
21776 /* We need 2*log2(N)-1 operations to achieve odd/even
21777 with interleave. */
21778 t1 = gen_reg_rtx (V4HImode);
21779 emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
21780 emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
21781 if (odd)
21782 t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
21783 else
21784 t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
21785 emit_insn (t2);
21786 }
21787 break;
21788
21789 case E_V8HImode:
21790 if (TARGET_SSE4_1)
21791 return expand_vec_perm_even_odd_pack (d);
21792 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
21793 return expand_vec_perm_pshufb2 (d);
21794 else
21795 {
21796 if (d->testing_p)
21797 break;
21798 /* We need 2*log2(N)-1 operations to achieve odd/even
21799 with interleave. */
21800 t1 = gen_reg_rtx (V8HImode);
21801 t2 = gen_reg_rtx (V8HImode);
21802 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
21803 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
21804 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
21805 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
21806 if (odd)
21807 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
21808 else
21809 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
21810 emit_insn (t3);
21811 }
21812 break;
21813
21814 case E_V8QImode:
21815 case E_V16QImode:
21816 return expand_vec_perm_even_odd_pack (d);
21817
21818 case E_V16HImode:
21819 case E_V32QImode:
21820 return expand_vec_perm_even_odd_pack (d);
21821
21822 case E_V64QImode:
21823 return expand_vec_perm_even_odd_trunc (d);
21824
21825 case E_V4DImode:
21826 if (!TARGET_AVX2)
21827 {
21828 struct expand_vec_perm_d d_copy = *d;
21829 d_copy.vmode = V4DFmode;
21830 if (d->testing_p)
21831 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
21832 else
21833 d_copy.target = gen_reg_rtx (V4DFmode);
21834 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
21835 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
21836 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
21837 {
21838 if (!d->testing_p)
21839 emit_move_insn (d->target,
21840 gen_lowpart (V4DImode, d_copy.target));
21841 return true;
21842 }
21843 return false;
21844 }
21845
21846 if (d->testing_p)
21847 break;
21848
21849 t1 = gen_reg_rtx (V4DImode);
21850 t2 = gen_reg_rtx (V4DImode);
21851
21852 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21853 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
21854 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
21855
21856 /* Now a vpunpck[lh]qdq will produce the result required. */
21857 if (odd)
21858 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
21859 else
21860 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
21861 emit_insn (t3);
21862 break;
21863
21864 case E_V8SImode:
21865 if (!TARGET_AVX2)
21866 {
21867 struct expand_vec_perm_d d_copy = *d;
21868 d_copy.vmode = V8SFmode;
21869 if (d->testing_p)
21870 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
21871 else
21872 d_copy.target = gen_reg_rtx (V8SFmode);
21873 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
21874 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
21875 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
21876 {
21877 if (!d->testing_p)
21878 emit_move_insn (d->target,
21879 gen_lowpart (V8SImode, d_copy.target));
21880 return true;
21881 }
21882 return false;
21883 }
21884
21885 if (d->testing_p)
21886 break;
21887
21888 t1 = gen_reg_rtx (V8SImode);
21889 t2 = gen_reg_rtx (V8SImode);
21890 t3 = gen_reg_rtx (V4DImode);
21891 t4 = gen_reg_rtx (V4DImode);
21892 t5 = gen_reg_rtx (V4DImode);
21893
21894 /* Shuffle the lanes around into
21895 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
21896 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
21897 gen_lowpart (V4DImode, d->op1),
21898 GEN_INT (0x20)));
21899 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
21900 gen_lowpart (V4DImode, d->op1),
21901 GEN_INT (0x31)));
21902
21903 /* Swap the 2nd and 3rd positions in each lane into
21904 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
21905 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
21906 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21907 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
21908 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21909
21910 /* Now a vpunpck[lh]qdq will produce
21911 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
21912 if (odd)
21913 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
21914 gen_lowpart (V4DImode, t2));
21915 else
21916 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
21917 gen_lowpart (V4DImode, t2));
21918 emit_insn (t3);
21919 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
21920 break;
21921
21922 default:
21923 gcc_unreachable ();
21924 }
21925
21926 return true;
21927 }
21928
21929 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
21930 extract-even and extract-odd permutations. */
21931
21932 static bool
21933 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
21934 {
21935 unsigned i, odd, nelt = d->nelt;
21936
21937 odd = d->perm[0];
21938 if (odd != 0 && odd != 1)
21939 return false;
21940
21941 for (i = 1; i < nelt; ++i)
21942 if (d->perm[i] != 2 * i + odd)
21943 return false;
21944
21945 if (d->vmode == E_V32HImode
21946 && d->testing_p
21947 && !TARGET_AVX512BW)
21948 return false;
21949
21950 return expand_vec_perm_even_odd_1 (d, odd);
21951 }
21952
21953 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
21954 permutations. We assume that expand_vec_perm_1 has already failed. */
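/* For illustration: broadcasting element 5 of a V16QImode vector takes the
   V16QImode/V8HImode path below, i.e. a punpcklbw of the input with itself,
   then a punpckhwd, and finally a pshufd selecting the V4SImode element
   that now holds four copies of byte 5. */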
21955
21956 static bool
21957 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
21958 {
21959 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
21960 machine_mode vmode = d->vmode;
21961 rtx (*gen) (rtx, rtx, rtx);
21962 unsigned char perm2[4];
21963 rtx op0 = d->op0, dest;
21964 bool ok;
21965
21966 switch (vmode)
21967 {
21968 case E_V4DFmode:
21969 case E_V8SFmode:
21970 /* These are special-cased in sse.md so that we can optionally
21971 use the vbroadcast instruction. They expand to two insns
21972 if the input happens to be in a register. */
21973 gcc_unreachable ();
21974
21975 case E_V2DFmode:
21976 case E_V2SFmode:
21977 case E_V4SFmode:
21978 case E_V2DImode:
21979 case E_V2SImode:
21980 case E_V4SImode:
21981 case E_V2HImode:
21982 case E_V4HImode:
21983 /* These are always implementable using standard shuffle patterns. */
21984 gcc_unreachable ();
21985
21986 case E_V4QImode:
21987 /* This can be implemented via interleave and pshuflw. */
21988 if (d->testing_p)
21989 return true;
21990
21991 if (elt >= nelt2)
21992 {
21993 gen = gen_mmx_punpckhbw_low;
21994 elt -= nelt2;
21995 }
21996 else
21997 gen = gen_mmx_punpcklbw_low;
21998
21999 dest = gen_reg_rtx (vmode);
22000 emit_insn (gen (dest, op0, op0));
22001 vmode = get_mode_wider_vector (vmode);
22002 op0 = gen_lowpart (vmode, dest);
22003
22004 memset (perm2, elt, 2);
22005 dest = gen_reg_rtx (vmode);
22006 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22007 gcc_assert (ok);
22008
22009 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22010 return true;
22011
22012 case E_V8QImode:
22013 /* This can be implemented via interleave. We save one insn by
22014 stopping once we have promoted to V2SImode and then use pshufd. */
22015 if (d->testing_p)
22016 return true;
22017 do
22018 {
22019 if (elt >= nelt2)
22020 {
22021 gen = vmode == V8QImode ? gen_mmx_punpckhbw
22022 : gen_mmx_punpckhwd;
22023 elt -= nelt2;
22024 }
22025 else
22026 gen = vmode == V8QImode ? gen_mmx_punpcklbw
22027 : gen_mmx_punpcklwd;
22028 nelt2 /= 2;
22029
22030 dest = gen_reg_rtx (vmode);
22031 emit_insn (gen (dest, op0, op0));
22032 vmode = get_mode_wider_vector (vmode);
22033 op0 = gen_lowpart (vmode, dest);
22034 }
22035 while (vmode != V2SImode);
22036
22037 memset (perm2, elt, 2);
22038 dest = gen_reg_rtx (vmode);
22039 ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
22040 gcc_assert (ok);
22041
22042 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22043 return true;
22044
22045 case E_V8HImode:
22046 case E_V16QImode:
22047 /* These can be implemented via interleave. We save one insn by
22048 stopping once we have promoted to V4SImode and then use pshufd. */
22049 if (d->testing_p)
22050 return true;
22051 do
22052 {
22053 if (elt >= nelt2)
22054 {
22055 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
22056 : gen_vec_interleave_highv8hi;
22057 elt -= nelt2;
22058 }
22059 else
22060 gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
22061 : gen_vec_interleave_lowv8hi;
22062 nelt2 /= 2;
22063
22064 dest = gen_reg_rtx (vmode);
22065 emit_insn (gen (dest, op0, op0));
22066 vmode = get_mode_wider_vector (vmode);
22067 op0 = gen_lowpart (vmode, dest);
22068 }
22069 while (vmode != V4SImode);
22070
22071 memset (perm2, elt, 4);
22072 dest = gen_reg_rtx (vmode);
22073 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22074 gcc_assert (ok);
22075
22076 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22077 return true;
22078
22079 case E_V8HFmode:
22080 case E_V8BFmode:
22081 /* This can be implemented via interleave and pshufd. */
22082 if (d->testing_p)
22083 return true;
22084
22085 rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx);
22086 if (elt >= nelt2)
22087 {
22088 maybe_gen = maybe_gen_vec_interleave_high;
22089 elt -= nelt2;
22090 }
22091 else
22092 maybe_gen = maybe_gen_vec_interleave_low;
22093 nelt2 /= 2;
22094
22095 dest = gen_reg_rtx (vmode);
22096 emit_insn (maybe_gen (vmode, 1, dest, op0, op0));
22097
22098 vmode = V4SImode;
22099 op0 = gen_lowpart (vmode, dest);
22100
22101 memset (perm2, elt, 4);
22102 dest = gen_reg_rtx (vmode);
22103 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
22104 gcc_assert (ok);
22105
22106 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
22107 return true;
22108
22109 case E_V32QImode:
22110 case E_V16HImode:
22111 case E_V8SImode:
22112 case E_V4DImode:
22113 /* For AVX2 broadcasts of the first element vpbroadcast* or
22114 vpermq should be used by expand_vec_perm_1. */
22115 gcc_assert (!TARGET_AVX2 || d->perm[0]);
22116 return false;
22117
22118 case E_V64QImode:
22119 gcc_assert (!TARGET_AVX512BW || d->perm[0]);
22120 return false;
22121
22122 case E_V32HImode:
22123 gcc_assert (!TARGET_AVX512BW);
22124 return false;
22125
22126 default:
22127 gcc_unreachable ();
22128 }
22129 }
22130
22131 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22132 broadcast permutations. */
22133
22134 static bool
22135 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
22136 {
22137 unsigned i, elt, nelt = d->nelt;
22138
22139 if (!d->one_operand_p)
22140 return false;
22141
22142 elt = d->perm[0];
22143 for (i = 1; i < nelt; ++i)
22144 if (d->perm[i] != elt)
22145 return false;
22146
22147 return expand_vec_perm_broadcast_1 (d);
22148 }
22149
22150 /* Implement arbitrary permutations of two V64QImode operands
22151 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
22152 static bool
22153 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
22154 {
22155 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
22156 return false;
22157
22158 if (d->testing_p)
22159 return true;
22160
22161 struct expand_vec_perm_d ds[2];
22162 rtx rperm[128], vperm, target0, target1;
22163 unsigned int i, nelt;
22164 machine_mode vmode;
22165
22166 nelt = d->nelt;
22167 vmode = V64QImode;
22168
22169 for (i = 0; i < 2; i++)
22170 {
22171 ds[i] = *d;
22172 ds[i].vmode = V32HImode;
22173 ds[i].nelt = 32;
22174 ds[i].target = gen_reg_rtx (V32HImode);
22175 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
22176 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
22177 }
22178
22179 /* Prepare permutations such that the first one takes care of
22180 putting the even bytes into the right positions or one position
22181 higher (ds[0]) and the second one takes care of
22182 putting the odd bytes into the right positions or one position
22183 lower (ds[1]). */
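  /* For illustration: if d->perm[10] == 37, then ds[0].perm[5] == 18, so
     the first vperm[it]2w brings word 18 (bytes 36-37) to word position 5
     (bytes 10-11), and the pshufb mask byte 10 == (10 & 14) + (37 & 1) == 11
     then picks the high byte of that word. */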
22184
22185 for (i = 0; i < nelt; i++)
22186 {
22187 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
22188 if (i & 1)
22189 {
22190 rperm[i] = constm1_rtx;
22191 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22192 }
22193 else
22194 {
22195 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
22196 rperm[i + 64] = constm1_rtx;
22197 }
22198 }
22199
22200 bool ok = expand_vec_perm_1 (&ds[0]);
22201 gcc_assert (ok);
22202 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
22203
22204 ok = expand_vec_perm_1 (&ds[1]);
22205 gcc_assert (ok);
22206 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
22207
22208 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
22209 vperm = force_reg (vmode, vperm);
22210 target0 = gen_reg_rtx (V64QImode);
22211 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
22212
22213 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
22214 vperm = force_reg (vmode, vperm);
22215 target1 = gen_reg_rtx (V64QImode);
22216 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
22217
22218 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
22219 return true;
22220 }
22221
22222 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
22223 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
22224 all the shorter instruction sequences. */
22225
22226 static bool
22227 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
22228 {
22229 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
22230 unsigned int i, nelt, eltsz;
22231 bool used[4];
22232
22233 if (!TARGET_AVX2
22234 || d->one_operand_p
22235 || (d->vmode != V32QImode && d->vmode != V16HImode))
22236 return false;
22237
22238 if (d->testing_p)
22239 return true;
22240
22241 nelt = d->nelt;
22242 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
22243
22244 /* Generate 4 permutation masks. If the required element is within
22245 the same lane, it is shuffled in. If the required element is from the
22246 other lane, force a zero by setting bit 7 in the permutation mask.
22247 The cross-lane masks have non-negative elements where an element is
22248 requested from the other lane, but it is also moved to the other lane,
22249 so that the result of vpshufb can have the two V2TImode halves
22250 swapped. */
22251 m128 = GEN_INT (-128);
22252 for (i = 0; i < 32; ++i)
22253 {
22254 rperm[0][i] = m128;
22255 rperm[1][i] = m128;
22256 rperm[2][i] = m128;
22257 rperm[3][i] = m128;
22258 }
22259 used[0] = false;
22260 used[1] = false;
22261 used[2] = false;
22262 used[3] = false;
22263 for (i = 0; i < nelt; ++i)
22264 {
22265 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
22266 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
22267 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
22268
22269 for (j = 0; j < eltsz; ++j)
22270 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
22271 used[which] = true;
22272 }
22273
22274 for (i = 0; i < 2; ++i)
22275 {
22276 if (!used[2 * i + 1])
22277 {
22278 h[i] = NULL_RTX;
22279 continue;
22280 }
22281 vperm = gen_rtx_CONST_VECTOR (V32QImode,
22282 gen_rtvec_v (32, rperm[2 * i + 1]));
22283 vperm = force_reg (V32QImode, vperm);
22284 h[i] = gen_reg_rtx (V32QImode);
22285 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22286 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
22287 }
22288
22289 /* Swap the 128-bit lanes of h[X]. */
22290 for (i = 0; i < 2; ++i)
22291 {
22292 if (h[i] == NULL_RTX)
22293 continue;
22294 op = gen_reg_rtx (V4DImode);
22295 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
22296 const2_rtx, GEN_INT (3), const0_rtx,
22297 const1_rtx));
22298 h[i] = gen_lowpart (V32QImode, op);
22299 }
22300
22301 for (i = 0; i < 2; ++i)
22302 {
22303 if (!used[2 * i])
22304 {
22305 l[i] = NULL_RTX;
22306 continue;
22307 }
22308 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
22309 vperm = force_reg (V32QImode, vperm);
22310 l[i] = gen_reg_rtx (V32QImode);
22311 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
22312 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
22313 }
22314
22315 for (i = 0; i < 2; ++i)
22316 {
22317 if (h[i] && l[i])
22318 {
22319 op = gen_reg_rtx (V32QImode);
22320 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
22321 l[i] = op;
22322 }
22323 else if (h[i])
22324 l[i] = h[i];
22325 }
22326
22327 gcc_assert (l[0] && l[1]);
22328 op = d->target;
22329 if (d->vmode != V32QImode)
22330 op = gen_reg_rtx (V32QImode);
22331 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
22332 if (op != d->target)
22333 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
22334 return true;
22335 }
22336
22337 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
22338 taken care of, perform the expansion in D and return true on success. */
22339
22340 static bool
22341 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22342 {
22343 /* Try a single instruction expansion. */
22344 if (expand_vec_perm_1 (d))
22345 return true;
22346
22347 /* Try sequences of two instructions. */
22348
22349 if (expand_vec_perm_pshuflw_pshufhw (d))
22350 return true;
22351
22352 if (expand_vec_perm_palignr (d, false))
22353 return true;
22354
22355 if (expand_vec_perm_interleave2 (d))
22356 return true;
22357
22358 if (expand_vec_perm_broadcast (d))
22359 return true;
22360
22361 if (expand_vec_perm_vpermq_perm_1 (d))
22362 return true;
22363
22364 if (expand_vec_perm_vperm2f128 (d))
22365 return true;
22366
22367 if (expand_vec_perm_pblendv (d))
22368 return true;
22369
22370 if (expand_vec_perm_2perm_interleave (d, true))
22371 return true;
22372
22373 if (expand_vec_perm_2perm_pblendv (d, true))
22374 return true;
22375
22376 if (expand_vec_perm_shufps_shufps (d))
22377 return true;
22378
22379 /* Try sequences of three instructions. */
22380
22381 if (expand_vec_perm_even_odd_pack (d))
22382 return true;
22383
22384 if (expand_vec_perm_2vperm2f128_vshuf (d))
22385 return true;
22386
22387 if (expand_vec_perm_pshufb2 (d))
22388 return true;
22389
22390 if (expand_vec_perm_pslldq_psrldq_por (d, false))
22391 return true;
22392
22393 if (expand_vec_perm_interleave3 (d))
22394 return true;
22395
22396 if (expand_vec_perm_vperm2f128_vblend (d))
22397 return true;
22398
22399 if (expand_vec_perm_2perm_interleave (d, false))
22400 return true;
22401
22402 if (expand_vec_perm_2perm_pblendv (d, false))
22403 return true;
22404
22405 /* Try sequences of four instructions. */
22406
22407 if (expand_vec_perm_even_odd_trunc (d))
22408 return true;
22409 if (expand_vec_perm_vpshufb2_vpermq (d))
22410 return true;
22411
22412 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
22413 return true;
22414
22415 if (expand_vec_perm_vpermt2_vpshub2 (d))
22416 return true;
22417
22418 /* ??? Look for narrow permutations whose element orderings would
22419 allow the promotion to a wider mode. */
22420
22421 /* ??? Look for sequences of interleave or a wider permute that place
22422 the data into the correct lanes for a half-vector shuffle like
22423 pshuf[lh]w or vpermilps. */
22424
22425 /* ??? Look for sequences of interleave that produce the desired results.
22426 The combinatorics of punpck[lh] get pretty ugly... */
22427
22428 if (expand_vec_perm_even_odd (d))
22429 return true;
22430
22431 /* Generate four or five instructions. */
22432 if (expand_vec_perm_pslldq_psrldq_por (d, true))
22433 return true;
22434
22435 /* Even longer sequences. */
22436 if (expand_vec_perm_vpshufb4_vpermq2 (d))
22437 return true;
22438
22439 /* See if we can get the same permutation in different vector integer
22440 mode. */
22441 struct expand_vec_perm_d nd;
22442 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
22443 {
22444 if (!d->testing_p)
22445 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
22446 return true;
22447 }
22448
22449 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
22450 if (expand_vec_perm2_vperm2f128_vblend (d))
22451 return true;
22452
22453 return false;
22454 }
22455
22456 /* If a permutation only uses one operand, make it clear. Returns true
22457 if the permutation references both operands. */
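/* For illustration: a V4SImode selector { 4, 6, 5, 7 } references only the
   second operand, so it is folded below to { 0, 2, 1, 3 } on op1 alone and
   the function returns false. */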
22458
22459 static bool
22460 canonicalize_perm (struct expand_vec_perm_d *d)
22461 {
22462 int i, which, nelt = d->nelt;
22463
22464 for (i = which = 0; i < nelt; ++i)
22465 which |= (d->perm[i] < nelt ? 1 : 2);
22466
22467 d->one_operand_p = true;
22468 switch (which)
22469 {
22470 default:
22471 gcc_unreachable();
22472
22473 case 3:
22474 if (!rtx_equal_p (d->op0, d->op1))
22475 {
22476 d->one_operand_p = false;
22477 break;
22478 }
22479 /* The elements of PERM do not suggest that only the first operand
22480 is used, but both operands are identical. Allow easier matching
22481 of the permutation by folding the permutation into the single
22482 input vector. */
22483 /* FALLTHRU */
22484
22485 case 2:
22486 for (i = 0; i < nelt; ++i)
22487 d->perm[i] &= nelt - 1;
22488 d->op0 = d->op1;
22489 break;
22490
22491 case 1:
22492 d->op1 = d->op0;
22493 break;
22494 }
22495
22496 return (which == 3);
22497 }
22498
22499 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22500
22501 bool
22502 ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
22503 rtx target, rtx op0, rtx op1,
22504 const vec_perm_indices &sel)
22505 {
22506 if (vmode != op_mode)
22507 return false;
22508
22509 struct expand_vec_perm_d d;
22510 unsigned char perm[MAX_VECT_LEN];
22511 unsigned int i, nelt, which;
22512 bool two_args;
22513
22514 /* For an HF mode vector, convert it to HI mode using a subreg. */
22515 if (GET_MODE_INNER (vmode) == HFmode)
22516 {
22517 machine_mode orig_mode = vmode;
22518 vmode = mode_for_vector (HImode,
22519 GET_MODE_NUNITS (vmode)).require ();
22520 if (target)
22521 target = lowpart_subreg (vmode, target, orig_mode);
22522 if (op0)
22523 op0 = lowpart_subreg (vmode, op0, orig_mode);
22524 if (op1)
22525 op1 = lowpart_subreg (vmode, op1, orig_mode);
22526 }
22527
22528 d.target = target;
22529 d.op0 = op0;
22530 d.op1 = op1;
22531
22532 d.vmode = vmode;
22533 gcc_assert (VECTOR_MODE_P (d.vmode));
22534 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22535 d.testing_p = !target;
22536
22537 gcc_assert (sel.length () == nelt);
22538 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
22539
22540 /* Given sufficient ISA support we can just return true here
22541 for selected vector modes. */
22542 switch (d.vmode)
22543 {
22544 case E_V16SFmode:
22545 case E_V16SImode:
22546 case E_V8DImode:
22547 case E_V8DFmode:
22548 if (!TARGET_AVX512F)
22549 return false;
22550 /* All implementable with a single vperm[it]2 insn. */
22551 if (d.testing_p)
22552 return true;
22553 break;
22554 case E_V32HImode:
22555 if (!TARGET_AVX512F)
22556 return false;
22557 if (d.testing_p && TARGET_AVX512BW)
22558 /* All implementable with a single vperm[it]2 insn. */
22559 return true;
22560 break;
22561 case E_V64QImode:
22562 if (!TARGET_AVX512F)
22563 return false;
22564 if (d.testing_p && TARGET_AVX512BW)
22565 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
22566 return true;
22567 break;
22568 case E_V8SImode:
22569 case E_V8SFmode:
22570 case E_V4DFmode:
22571 case E_V4DImode:
22572 if (!TARGET_AVX)
22573 return false;
22574 if (d.testing_p && TARGET_AVX512VL)
22575 /* All implementable with a single vperm[it]2 insn. */
22576 return true;
22577 break;
22578 case E_V16HImode:
22579 if (!TARGET_SSE2)
22580 return false;
22581 if (d.testing_p && TARGET_AVX2)
22582 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22583 return true;
22584 break;
22585 case E_V32QImode:
22586 if (!TARGET_SSE2)
22587 return false;
22588 if (d.testing_p && TARGET_AVX2)
22589 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22590 return true;
22591 break;
22592 case E_V8HImode:
22593 case E_V16QImode:
22594 if (!TARGET_SSE2)
22595 return false;
22596 /* Fall through. */
22597 case E_V4SImode:
22598 case E_V4SFmode:
22599 if (!TARGET_SSE)
22600 return false;
22601 /* All implementable with a single vpperm insn. */
22602 if (d.testing_p && TARGET_XOP)
22603 return true;
22604 /* All implementable with 2 pshufb + 1 ior. */
22605 if (d.testing_p && TARGET_SSSE3)
22606 return true;
22607 break;
22608 case E_V2SFmode:
22609 case E_V2SImode:
22610 case E_V4HImode:
22611 case E_V8QImode:
22612 if (!TARGET_MMX_WITH_SSE)
22613 return false;
22614 break;
22615 case E_V2HImode:
22616 if (!TARGET_SSE2)
22617 return false;
22618 /* All implementable with *punpckwd. */
22619 if (d.testing_p)
22620 return true;
22621 break;
22622 case E_V4QImode:
22623 if (!TARGET_SSE2)
22624 return false;
22625 break;
22626 case E_V2DImode:
22627 case E_V2DFmode:
22628 if (!TARGET_SSE)
22629 return false;
22630 /* All implementable with shufpd or unpck[lh]pd. */
22631 if (d.testing_p)
22632 return true;
22633 break;
22634 default:
22635 return false;
22636 }
22637
22638 for (i = which = 0; i < nelt; ++i)
22639 {
22640 unsigned char e = sel[i];
22641 gcc_assert (e < 2 * nelt);
22642 d.perm[i] = e;
22643 perm[i] = e;
22644 which |= (e < nelt ? 1 : 2);
22645 }
22646
22647 if (d.testing_p)
22648 {
22649 /* If all elements come from the second vector, fold them into the first. */
22650 if (which == 2)
22651 for (i = 0; i < nelt; ++i)
22652 d.perm[i] -= nelt;
22653
22654 /* Check whether the mask can be applied to the vector type. */
22655 d.one_operand_p = (which != 3);
22656
22657 /* Implementable with shufps, pshufd or pshuflw. */
22658 if (d.one_operand_p
22659 && (d.vmode == V4SFmode || d.vmode == V2SFmode
22660 || d.vmode == V4SImode || d.vmode == V2SImode
22661 || d.vmode == V4HImode || d.vmode == V2HImode))
22662 return true;
22663
22664 /* Otherwise we have to go through the motions and see if we can
22665 figure out how to generate the requested permutation. */
22666 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
22667 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
22668 if (!d.one_operand_p)
22669 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
22670
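/* The sequence generated here only answers the testing query; it is
   discarded after end_sequence and never emitted. */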
22671 start_sequence ();
22672 bool ret = ix86_expand_vec_perm_const_1 (&d);
22673 end_sequence ();
22674
22675 return ret;
22676 }
22677
22678 two_args = canonicalize_perm (&d);
22679
22680 /* If one of the operands is a zero vector, try to match pmovzx. */
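/* E.g., a selector of the form { 0, nelt, 1, nelt, ... } applied to (x, 0)
   is a zero extension that a pmovzx insn can implement. */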
22681 if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
22682 {
22683 struct expand_vec_perm_d dzero = d;
22684 if (d.op0 == CONST0_RTX (vmode))
22685 {
22686 d.op1 = dzero.op1 = force_reg (vmode, d.op1);
22687 std::swap (dzero.op0, dzero.op1);
22688 for (i = 0; i < nelt; ++i)
22689 dzero.perm[i] ^= nelt;
22690 }
22691 else
22692 d.op0 = dzero.op0 = force_reg (vmode, d.op0);
22693
22694 if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
22695 dzero.perm, nelt, dzero.testing_p))
22696 return true;
22697 }
22698
22699 /* Force operands into registers. */
22700 rtx nop0 = force_reg (vmode, d.op0);
22701 if (d.op0 == d.op1)
22702 d.op1 = nop0;
22703 d.op0 = nop0;
22704 d.op1 = force_reg (vmode, d.op1);
22705
22706 if (ix86_expand_vec_perm_const_1 (&d))
22707 return true;
22708
22709 /* If the selector says both arguments are needed, but the operands are the
22710 same, the above tried to expand with one_operand_p and flattened selector.
22711 If that didn't work, retry without one_operand_p; we succeeded with that
22712 during testing. */
22713 if (two_args && d.one_operand_p)
22714 {
22715 d.one_operand_p = false;
22716 memcpy (d.perm, perm, sizeof (perm));
22717 return ix86_expand_vec_perm_const_1 (&d);
22718 }
22719
22720 return false;
22721 }
22722
22723 void
22724 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
22725 {
22726 struct expand_vec_perm_d d;
22727 unsigned i, nelt;
22728
22729 d.target = targ;
22730 d.op0 = op0;
22731 d.op1 = op1;
22732 d.vmode = GET_MODE (targ);
22733 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22734 d.one_operand_p = false;
22735 d.testing_p = false;
22736
22737 for (i = 0; i < nelt; ++i)
22738 d.perm[i] = i * 2 + odd;
22739
22740 /* We'll either be able to implement the permutation directly... */
22741 if (expand_vec_perm_1 (&d))
22742 return;
22743
22744 /* ... or we use the special-case patterns. */
22745 expand_vec_perm_even_odd_1 (&d, odd);
22746 }
22747
22748 static void
22749 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
22750 {
22751 struct expand_vec_perm_d d;
22752 unsigned i, nelt, base;
22753 bool ok;
22754
22755 d.target = targ;
22756 d.op0 = op0;
22757 d.op1 = op1;
22758 d.vmode = GET_MODE (targ);
22759 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22760 d.one_operand_p = false;
22761 d.testing_p = false;
22762
22763 base = high_p ? nelt / 2 : 0;
22764 for (i = 0; i < nelt / 2; ++i)
22765 {
22766 d.perm[i * 2] = i + base;
22767 d.perm[i * 2 + 1] = i + base + nelt;
22768 }
22769
22770 /* Note that for AVX this isn't one instruction. */
22771 ok = ix86_expand_vec_perm_const_1 (&d);
22772 gcc_assert (ok);
22773 }
22774
22775 /* This function is similar to ix86_expand_vecop_qihi,
22776 but optimized under AVX512BW by using vpmovwb.
22777 For example, vector MUL is generated like
22778
22779 vpmovzxbw ymm2, xmm0
22780 vpmovzxbw ymm3, xmm1
22781 vpmullw ymm4, ymm2, ymm3
22782 vpmovwb xmm0, ymm4
22783
22784 which takes fewer instructions than ix86_expand_vecop_qihi.
22785 Return true on success. */
22786
22787 static bool
22788 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
22789 {
22790 machine_mode himode, qimode = GET_MODE (dest);
22791 rtx hop1, hop2, hdest;
22792 rtx (*gen_extend)(rtx, rtx);
22793 rtx (*gen_truncate)(rtx, rtx);
22794 bool uns_p = (code == ASHIFTRT) ? false : true;
22795
22796 /* There's no V64HImode multiplication instruction. */
22797 if (qimode == E_V64QImode)
22798 return false;
22799
22800 /* vpmovwb is only available under AVX512BW. */
22801 if (!TARGET_AVX512BW)
22802 return false;
22803 if ((qimode == V8QImode || qimode == V16QImode)
22804 && !TARGET_AVX512VL)
22805 return false;
22806 /* Do not generate zmm instructions when 128/256-bit vector width is preferred. */
22807 if (qimode == V32QImode
22808 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
22809 return false;
22810
22811 switch (qimode)
22812 {
22813 case E_V8QImode:
22814 himode = V8HImode;
22815 gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
22816 gen_truncate = gen_truncv8hiv8qi2;
22817 break;
22818 case E_V16QImode:
22819 himode = V16HImode;
22820 gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
22821 gen_truncate = gen_truncv16hiv16qi2;
22822 break;
22823 case E_V32QImode:
22824 himode = V32HImode;
22825 gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
22826 gen_truncate = gen_truncv32hiv32qi2;
22827 break;
22828 default:
22829 gcc_unreachable ();
22830 }
22831
22832 hop1 = gen_reg_rtx (himode);
22833 hop2 = gen_reg_rtx (himode);
22834 hdest = gen_reg_rtx (himode);
22835 emit_insn (gen_extend (hop1, op1));
22836 emit_insn (gen_extend (hop2, op2));
22837 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
22838 hop1, hop2)));
22839 emit_insn (gen_truncate (dest, hdest));
22840 return true;
22841 }
22842
22843 /* Expand a vector shift by a constant amount for a V*QImode in terms of the
22844 same operation on V*HImode. Return true on success. */
22845 static bool
22846 ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
22847 rtx dest, rtx op1, rtx op2)
22848 {
22849 machine_mode qimode, himode;
22850 HOST_WIDE_INT and_constant, xor_constant;
22851 HOST_WIDE_INT shift_amount;
22852 rtx vec_const_and, vec_const_xor;
22853 rtx tmp, op1_subreg;
22854 rtx (*gen_shift) (rtx, rtx, rtx);
22855 rtx (*gen_and) (rtx, rtx, rtx);
22856 rtx (*gen_xor) (rtx, rtx, rtx);
22857 rtx (*gen_sub) (rtx, rtx, rtx);
22858
22859 /* Only optimize shift by constant. */
22860 if (!CONST_INT_P (op2))
22861 return false;
22862
22863 qimode = GET_MODE (dest);
22864 shift_amount = INTVAL (op2);
22865 /* Do nothing when the shift amount is greater than or equal to 8. */
22866 if (shift_amount > 7)
22867 return false;
22868
22869 gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
22870 /* Record sign bit. */
22871 xor_constant = 1 << (8 - shift_amount - 1);
22872
22873 /* Mask off the bits shifted in from the adjacent byte within each word:
the low bits for a left shift, the high bits for a right shift. */
22874 and_constant
22875 = (code == ASHIFT ? 256 - (1 << shift_amount)
22876 : (1 << (8 - shift_amount)) - 1);
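/* For example, with shift_amount == 3: xor_constant == 0x10 (bit 7 of each
   byte lands on bit 4 after the shift), and_constant == 0xf8 for ASHIFT
   (clear the low 3 bits of each byte) and 0x1f for the right shifts
   (clear the high 3 bits of each byte). */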
22877
22878 switch (qimode)
22879 {
22880 case V16QImode:
22881 himode = V8HImode;
22882 gen_shift =
22883 ((code == ASHIFT)
22884 ? gen_ashlv8hi3
22885 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
22886 gen_and = gen_andv16qi3;
22887 gen_xor = gen_xorv16qi3;
22888 gen_sub = gen_subv16qi3;
22889 break;
22890 case V32QImode:
22891 himode = V16HImode;
22892 gen_shift =
22893 ((code == ASHIFT)
22894 ? gen_ashlv16hi3
22895 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
22896 gen_and = gen_andv32qi3;
22897 gen_xor = gen_xorv32qi3;
22898 gen_sub = gen_subv32qi3;
22899 break;
22900 case V64QImode:
22901 himode = V32HImode;
22902 gen_shift =
22903 ((code == ASHIFT)
22904 ? gen_ashlv32hi3
22905 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
22906 gen_and = gen_andv64qi3;
22907 gen_xor = gen_xorv64qi3;
22908 gen_sub = gen_subv64qi3;
22909 break;
22910 default:
22911 gcc_unreachable ();
22912 }
22913
22914 tmp = gen_reg_rtx (himode);
22915 vec_const_and = gen_reg_rtx (qimode);
22916 op1_subreg = lowpart_subreg (himode, op1, qimode);
22917
22918 /* For ASHIFT and LSHIFTRT, perform the operation as
22919 vpsllw/vpsrlw $shift_amount, %op1, %dest
22920 followed by vpand %vec_const_and, %dest. */
22921 emit_insn (gen_shift (tmp, op1_subreg, op2));
22922 emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
22923 emit_move_insn (vec_const_and,
22924 ix86_build_const_vector (qimode, true,
22925 gen_int_mode (and_constant, QImode)));
22926 emit_insn (gen_and (dest, dest, vec_const_and));
22927
22928 /* For ASHIFTRT, perform the extra operations
22929 vpxor %vec_const_xor, %dest, %dest
22930 vpsubb %vec_const_xor, %dest, %dest. */
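/* This is the sign-extension identity (x ^ m) - m, with m == xor_constant
   marking the position of the shifted sign bit. */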
22931 if (code == ASHIFTRT)
22932 {
22933 vec_const_xor = gen_reg_rtx (qimode);
22934 emit_move_insn (vec_const_xor,
22935 ix86_build_const_vector (qimode, true,
22936 gen_int_mode (xor_constant, QImode)));
22937 emit_insn (gen_xor (dest, dest, vec_const_xor));
22938 emit_insn (gen_sub (dest, dest, vec_const_xor));
22939 }
22940 return true;
22941 }
22942
22943 /* Expand a vector operation CODE for a V*QImode in terms of the
22944 same operation on V*HImode. */
22945
22946 void
22947 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
22948 {
22949 machine_mode qimode = GET_MODE (dest);
22950 machine_mode himode;
22951 rtx (*gen_il) (rtx, rtx, rtx);
22952 rtx (*gen_ih) (rtx, rtx, rtx);
22953 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
22954 struct expand_vec_perm_d d;
22955 bool ok, full_interleave;
22956 bool uns_p = false;
22957 int i;
22958
22959 if (CONST_INT_P (op2)
22960 && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
22961 && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
22962 return;
22963
22964 if (TARGET_AVX512BW
22965 && VECTOR_MODE_P (GET_MODE (op2))
22966 && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
22967 return;
22968
22969 switch (qimode)
22970 {
22971 case E_V16QImode:
22972 himode = V8HImode;
22973 gen_il = gen_vec_interleave_lowv16qi;
22974 gen_ih = gen_vec_interleave_highv16qi;
22975 break;
22976 case E_V32QImode:
22977 himode = V16HImode;
22978 gen_il = gen_avx2_interleave_lowv32qi;
22979 gen_ih = gen_avx2_interleave_highv32qi;
22980 break;
22981 case E_V64QImode:
22982 himode = V32HImode;
22983 gen_il = gen_avx512bw_interleave_lowv64qi;
22984 gen_ih = gen_avx512bw_interleave_highv64qi;
22985 break;
22986 default:
22987 gcc_unreachable ();
22988 }
22989
22990 switch (code)
22991 {
22992 case MULT:
22993 /* Unpack data such that we've got a source byte in each low byte of
22994 each word. We don't care what goes into the high byte of each word.
22995 Rather than trying to get zero in there, most convenient is to let
22996 it be a copy of the low byte. */
22997 op2_l = gen_reg_rtx (qimode);
22998 op2_h = gen_reg_rtx (qimode);
22999 emit_insn (gen_il (op2_l, op2, op2));
23000 emit_insn (gen_ih (op2_h, op2, op2));
23001
23002 op1_l = gen_reg_rtx (qimode);
23003 op1_h = gen_reg_rtx (qimode);
23004 emit_insn (gen_il (op1_l, op1, op1));
23005 emit_insn (gen_ih (op1_h, op1, op1));
23006 full_interleave = qimode == V16QImode;
23007 break;
23008
23009 case ASHIFT:
23010 case LSHIFTRT:
23011 uns_p = true;
23012 /* FALLTHRU */
23013 case ASHIFTRT:
23014 op1_l = gen_reg_rtx (himode);
23015 op1_h = gen_reg_rtx (himode);
23016 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
23017 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
23018 /* vashr/vlshr/vashl */
23019 if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
23020 {
23021 rtx tmp = force_reg (qimode, op2);
23022 op2_l = gen_reg_rtx (himode);
23023 op2_h = gen_reg_rtx (himode);
23024 ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
23025 ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
23026 }
23027 else
23028 op2_l = op2_h = op2;
23029
23030 full_interleave = true;
23031 break;
23032 default:
23033 gcc_unreachable ();
23034 }
23035
23036 /* Perform vashr/vlshr/vashl. */
23037 if (code != MULT
23038 && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
23039 {
23040 res_l = gen_reg_rtx (himode);
23041 res_h = gen_reg_rtx (himode);
23042 emit_insn (gen_rtx_SET (res_l,
23043 simplify_gen_binary (code, himode,
23044 op1_l, op2_l)));
23045 emit_insn (gen_rtx_SET (res_h,
23046 simplify_gen_binary (code, himode,
23047 op1_h, op2_h)));
23048 }
23049 /* Perform mult/ashr/lshr/ashl. */
23050 else
23051 {
23052 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
23053 1, OPTAB_DIRECT);
23054 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
23055 1, OPTAB_DIRECT);
23056 }
23057
23058 gcc_assert (res_l && res_h);
23059
23060 /* Merge the data back into the right place. */
23061 d.target = dest;
23062 d.op0 = gen_lowpart (qimode, res_l);
23063 d.op1 = gen_lowpart (qimode, res_h);
23064 d.vmode = qimode;
23065 d.nelt = GET_MODE_NUNITS (qimode);
23066 d.one_operand_p = false;
23067 d.testing_p = false;
23068
23069 if (full_interleave)
23070 {
23071 /* For SSE2, we used a full interleave, so the desired
23072 results are in the even elements. */
23073 for (i = 0; i < d.nelt; ++i)
23074 d.perm[i] = i * 2;
23075 }
23076 else
23077 {
23078 /* For AVX, the interleave used above was not cross-lane. So the
23079 extraction is evens but with the second and third quarter swapped.
23080 Happily, that is even one insn shorter than even extraction.
23081 For AVX512BW we have 4 lanes. We extract evens from within a lane,
23082 always first from the first and then from the second source operand,
23083 the index bits above the low 4 bits remain the same.
23084 Thus, for d.nelt == 32 we want permutation
23085 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
23086 and for d.nelt == 64 we want permutation
23087 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
23088 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
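/* E.g., with d.nelt == 32 the formula below gives d.perm[8] == 32,
   d.perm[16] == 16 and d.perm[24] == 48, matching the first pattern above. */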
23089 for (i = 0; i < d.nelt; ++i)
23090 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
23091 }
23092
23093 ok = ix86_expand_vec_perm_const_1 (&d);
23094 gcc_assert (ok);
23095
23096 set_unique_reg_note (get_last_insn (), REG_EQUAL,
23097 gen_rtx_fmt_ee (code, qimode, op1, op2));
23098 }
23099
23100 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
23101 if op is CONST_VECTOR with all odd elements equal to their
23102 preceding element. */
23103
23104 static bool
23105 const_vector_equal_evenodd_p (rtx op)
23106 {
23107 machine_mode mode = GET_MODE (op);
23108 int i, nunits = GET_MODE_NUNITS (mode);
23109 if (GET_CODE (op) != CONST_VECTOR
23110 || nunits != CONST_VECTOR_NUNITS (op))
23111 return false;
23112 for (i = 0; i < nunits; i += 2)
23113 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
23114 return false;
23115 return true;
23116 }
23117
23118 void
23119 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
23120 bool uns_p, bool odd_p)
23121 {
23122 machine_mode mode = GET_MODE (op1);
23123 machine_mode wmode = GET_MODE (dest);
23124 rtx x;
23125 rtx orig_op1 = op1, orig_op2 = op2;
23126
23127 if (!nonimmediate_operand (op1, mode))
23128 op1 = force_reg (mode, op1);
23129 if (!nonimmediate_operand (op2, mode))
23130 op2 = force_reg (mode, op2);
23131
23132 /* We only play even/odd games with vectors of SImode. */
23133 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
23134
23135 /* If we're looking for the odd results, shift those members down to
23136 the even slots. For some cpus this is faster than a PSHUFD. */
23137 if (odd_p)
23138 {
23139 /* For XOP use vpmacsdqh, but only for smult, as it is only
23140 signed. */
23141 if (TARGET_XOP && mode == V4SImode && !uns_p)
23142 {
23143 x = force_reg (wmode, CONST0_RTX (wmode));
23144 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
23145 return;
23146 }
23147
23148 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
23149 if (!const_vector_equal_evenodd_p (orig_op1))
23150 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
23151 x, NULL, 1, OPTAB_DIRECT);
23152 if (!const_vector_equal_evenodd_p (orig_op2))
23153 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
23154 x, NULL, 1, OPTAB_DIRECT);
23155 op1 = gen_lowpart (mode, op1);
23156 op2 = gen_lowpart (mode, op2);
23157 }
23158
23159 if (mode == V16SImode)
23160 {
23161 if (uns_p)
23162 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
23163 else
23164 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
23165 }
23166 else if (mode == V8SImode)
23167 {
23168 if (uns_p)
23169 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
23170 else
23171 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
23172 }
23173 else if (uns_p)
23174 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
23175 else if (TARGET_SSE4_1)
23176 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
23177 else
23178 {
23179 rtx s1, s2, t0, t1, t2;
23180
23181 /* The easiest way to implement this without PMULDQ is to go through
23182 the motions as if we are performing a full 64-bit multiply, except
23183 that we need to do less shuffling of the elements. */
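/* That is, A*B mod 2^64 == LO(A)*LO(B) + ((LO(A)*HI(B) + HI(A)*LO(B)) << 32),
   where HI() is the 32-bit sign extension computed below as a compare mask. */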
23184
23185 /* Compute the sign-extension, aka highparts, of the two operands. */
23186 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23187 op1, pc_rtx, pc_rtx);
23188 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
23189 op2, pc_rtx, pc_rtx);
23190
23191 /* Multiply LO(A) * HI(B), and vice-versa. */
23192 t1 = gen_reg_rtx (wmode);
23193 t2 = gen_reg_rtx (wmode);
23194 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
23195 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
23196
23197 /* Multiply LO(A) * LO(B). */
23198 t0 = gen_reg_rtx (wmode);
23199 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
23200
23201 /* Combine and shift the highparts into place. */
23202 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
23203 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
23204 1, OPTAB_DIRECT);
23205
23206 /* Combine high and low parts. */
23207 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
23208 return;
23209 }
23210 emit_insn (x);
23211 }
23212
23213 void
23214 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
23215 bool uns_p, bool high_p)
23216 {
23217 machine_mode wmode = GET_MODE (dest);
23218 machine_mode mode = GET_MODE (op1);
23219 rtx t1, t2, t3, t4, mask;
23220
23221 switch (mode)
23222 {
23223 case E_V4SImode:
23224 t1 = gen_reg_rtx (mode);
23225 t2 = gen_reg_rtx (mode);
23226 if (TARGET_XOP && !uns_p)
23227 {
23228 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
23229 shuffle the elements once so that all elements are in the right
23230 place for immediate use: { A C B D }. */
23231 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
23232 const1_rtx, GEN_INT (3)));
23233 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
23234 const1_rtx, GEN_INT (3)));
23235 }
23236 else
23237 {
23238 /* Put the elements into place for the multiply. */
23239 ix86_expand_vec_interleave (t1, op1, op1, high_p);
23240 ix86_expand_vec_interleave (t2, op2, op2, high_p);
23241 high_p = false;
23242 }
23243 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
23244 break;
23245
23246 case E_V8SImode:
23247 /* Shuffle the elements between the lanes. After this we
23248 have { A B E F | C D G H } for each operand. */
23249 t1 = gen_reg_rtx (V4DImode);
23250 t2 = gen_reg_rtx (V4DImode);
23251 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
23252 const0_rtx, const2_rtx,
23253 const1_rtx, GEN_INT (3)));
23254 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
23255 const0_rtx, const2_rtx,
23256 const1_rtx, GEN_INT (3)));
23257
23258 /* Shuffle the elements within the lanes. After this we
23259 have { A A B B | C C D D } or { E E F F | G G H H }. */
23260 t3 = gen_reg_rtx (V8SImode);
23261 t4 = gen_reg_rtx (V8SImode);
23262 mask = GEN_INT (high_p
23263 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
23264 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
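/* The pshufd immediate selects elements { 2, 2, 3, 3 } within each lane for
   the high half and { 0, 0, 1, 1 } for the low half. */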
23265 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
23266 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
23267
23268 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
23269 break;
23270
23271 case E_V8HImode:
23272 case E_V16HImode:
23273 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
23274 uns_p, OPTAB_DIRECT);
23275 t2 = expand_binop (mode,
23276 uns_p ? umul_highpart_optab : smul_highpart_optab,
23277 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
23278 gcc_assert (t1 && t2);
23279
23280 t3 = gen_reg_rtx (mode);
23281 ix86_expand_vec_interleave (t3, t1, t2, high_p);
23282 emit_move_insn (dest, gen_lowpart (wmode, t3));
23283 break;
23284
23285 case E_V16QImode:
23286 case E_V32QImode:
23287 case E_V32HImode:
23288 case E_V16SImode:
23289 case E_V64QImode:
23290 t1 = gen_reg_rtx (wmode);
23291 t2 = gen_reg_rtx (wmode);
23292 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
23293 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
23294
23295 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
23296 break;
23297
23298 default:
23299 gcc_unreachable ();
23300 }
23301 }
23302
23303 void
23304 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
23305 {
23306 rtx res_1, res_2, res_3, res_4;
23307
23308 res_1 = gen_reg_rtx (V4SImode);
23309 res_2 = gen_reg_rtx (V4SImode);
23310 res_3 = gen_reg_rtx (V2DImode);
23311 res_4 = gen_reg_rtx (V2DImode);
23312 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
23313 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
23314
23315 /* Move the results in element 2 down to element 1; we don't care
23316 what goes in elements 2 and 3. Then we can merge the parts
23317 back together with an interleave.
23318
23319 Note that two other sequences were tried:
23320 (1) Use interleaves at the start instead of psrldq, which allows
23321 us to use a single shufps to merge things back at the end.
23322 (2) Use shufps here to combine the two vectors, then pshufd to
23323 put the elements in the correct order.
23324 In both cases the cost of the reformatting stall was too high
23325 and the overall sequence slower. */
23326
23327 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
23328 const0_rtx, const2_rtx,
23329 const0_rtx, const0_rtx));
23330 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
23331 const0_rtx, const2_rtx,
23332 const0_rtx, const0_rtx));
23333 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
23334
23335 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
23336 }
23337
23338 void
23339 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
23340 {
23341 machine_mode mode = GET_MODE (op0);
23342 rtx t1, t2, t3, t4, t5, t6;
23343
23344 if (TARGET_AVX512DQ && mode == V8DImode)
23345 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
23346 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
23347 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
23348 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
23349 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
23350 else if (TARGET_XOP && mode == V2DImode)
23351 {
23352 /* op1: A,B,C,D, op2: E,F,G,H */
23353 op1 = gen_lowpart (V4SImode, op1);
23354 op2 = gen_lowpart (V4SImode, op2);
23355
23356 t1 = gen_reg_rtx (V4SImode);
23357 t2 = gen_reg_rtx (V4SImode);
23358 t3 = gen_reg_rtx (V2DImode);
23359 t4 = gen_reg_rtx (V2DImode);
23360
23361 /* t1: B,A,D,C */
23362 emit_insn (gen_sse2_pshufd_1 (t1, op1,
23363 GEN_INT (1),
23364 GEN_INT (0),
23365 GEN_INT (3),
23366 GEN_INT (2)));
23367
23368 /* t2: (B*E),(A*F),(D*G),(C*H) */
23369 emit_insn (gen_mulv4si3 (t2, t1, op2));
23370
23371 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
23372 emit_insn (gen_xop_phadddq (t3, t2));
23373
23374 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
23375 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
23376
23377 /* Multiply lower parts and add all */
23378 t5 = gen_reg_rtx (V2DImode);
23379 emit_insn (gen_vec_widen_umult_even_v4si (t5,
23380 gen_lowpart (V4SImode, op1),
23381 gen_lowpart (V4SImode, op2)));
23382 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
23383 }
23384 else
23385 {
23386 machine_mode nmode;
23387 rtx (*umul) (rtx, rtx, rtx);
23388
23389 if (mode == V2DImode)
23390 {
23391 umul = gen_vec_widen_umult_even_v4si;
23392 nmode = V4SImode;
23393 }
23394 else if (mode == V4DImode)
23395 {
23396 umul = gen_vec_widen_umult_even_v8si;
23397 nmode = V8SImode;
23398 }
23399 else if (mode == V8DImode)
23400 {
23401 umul = gen_vec_widen_umult_even_v16si;
23402 nmode = V16SImode;
23403 }
23404 else
23405 gcc_unreachable ();
23406
23407
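/* This computes a*b mod 2^64 as lo(a)*lo(b) + ((hi(a)*lo(b) + hi(b)*lo(a)) << 32),
   where hi/lo are the upper/lower 32 bits of each 64-bit element. */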
23408 /* Multiply low parts. */
23409 t1 = gen_reg_rtx (mode);
23410 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
23411
23412 /* Shift input vectors right 32 bits so we can multiply high parts. */
23413 t6 = GEN_INT (32);
23414 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
23415 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
23416
23417 /* Multiply high parts by low parts. */
23418 t4 = gen_reg_rtx (mode);
23419 t5 = gen_reg_rtx (mode);
23420 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
23421 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
23422
23423 /* Combine and shift the highparts back. */
23424 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
23425 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
23426
23427 /* Combine high and low parts. */
23428 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
23429 }
23430
23431 set_unique_reg_note (get_last_insn (), REG_EQUAL,
23432 gen_rtx_MULT (mode, op1, op2));
23433 }
23434
23435 /* Return true if control transfer instruction INSN
23436 should be encoded with the notrack prefix. */
23437
23438 bool
23439 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
23440 {
23441 if (!insn || !((flag_cf_protection & CF_BRANCH)))
23442 return false;
23443
23444 if (CALL_P (insn))
23445 {
23446 rtx call = get_call_rtx_from (insn);
23447 gcc_assert (call != NULL_RTX);
23448 rtx addr = XEXP (call, 0);
23449
23450 /* Do not emit 'notrack' if it's not an indirect call. */
23451 if (MEM_P (addr)
23452 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
23453 return false;
23454 else
23455 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
23456 }
23457
23458 if (JUMP_P (insn) && !flag_cet_switch)
23459 {
23460 rtx target = JUMP_LABEL (insn);
23461 if (target == NULL_RTX || ANY_RETURN_P (target))
23462 return false;
23463
23464 /* Check whether the jump is a switch table jump. */
23465 rtx_insn *label = as_a<rtx_insn *> (target);
23466 rtx_insn *table = next_insn (label);
23467 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
23468 return false;
23469 else
23470 return true;
23471 }
23472 return false;
23473 }
23474
23475 /* Calculate integer abs() using only SSE2 instructions. */
23476
23477 void
23478 ix86_expand_sse2_abs (rtx target, rtx input)
23479 {
23480 machine_mode mode = GET_MODE (target);
23481 rtx tmp0, tmp1, x;
23482
23483 switch (mode)
23484 {
23485 case E_V2DImode:
23486 case E_V4DImode:
23487 /* For 64-bit signed integer X, with SSE4.2 use
23488 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
23489 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
23490 32, use a logical instead of an arithmetic right shift (which is
23491 unimplemented for these modes) and negate the shifted value to form the sign mask. */
23492 if (TARGET_SSE4_2)
23493 {
23494 tmp0 = gen_reg_rtx (mode);
23495 tmp1 = gen_reg_rtx (mode);
23496 emit_move_insn (tmp1, CONST0_RTX (mode));
23497 if (mode == E_V2DImode)
23498 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
23499 else
23500 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
23501 }
23502 else
23503 {
23504 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
23505 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
23506 - 1), NULL, 0, OPTAB_DIRECT);
23507 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
23508 }
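/* tmp0 now holds the per-element sign mask (0 or -1), so the xor and
   subtract below compute (X ^ mask) - mask == abs (X). */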
23509
23510 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23511 NULL, 0, OPTAB_DIRECT);
23512 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23513 target, 0, OPTAB_DIRECT);
23514 break;
23515
23516 case E_V4SImode:
23517 /* For 32-bit signed integer X, the best way to calculate the absolute
23518 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
23519 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
23520 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
23521 NULL, 0, OPTAB_DIRECT);
23522 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23523 NULL, 0, OPTAB_DIRECT);
23524 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23525 target, 0, OPTAB_DIRECT);
23526 break;
23527
23528 case E_V8HImode:
23529 /* For 16-bit signed integer X, the best way to calculate the absolute
23530 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
23531 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23532
23533 x = expand_simple_binop (mode, SMAX, tmp0, input,
23534 target, 0, OPTAB_DIRECT);
23535 break;
23536
23537 case E_V16QImode:
23538 /* For 8-bit signed integer X, the best way to calculate the absolute
23539 value of X is min ((unsigned char) X, (unsigned char) (-X)),
23540 as SSE2 provides the PMINUB insn. */
23541 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23542
23543 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
23544 target, 0, OPTAB_DIRECT);
23545 break;
23546
23547 default:
23548 gcc_unreachable ();
23549 }
23550
23551 if (x != target)
23552 emit_move_insn (target, x);
23553 }
23554
23555 /* Expand an extract from a vector register through pextr insn.
23556 Return true if successful. */
23557
23558 bool
23559 ix86_expand_pextr (rtx *operands)
23560 {
23561 rtx dst = operands[0];
23562 rtx src = operands[1];
23563
23564 unsigned int size = INTVAL (operands[2]);
23565 unsigned int pos = INTVAL (operands[3]);
23566
23567 if (SUBREG_P (dst))
23568 {
23569 /* Reject non-lowpart subregs. */
23570 if (SUBREG_BYTE (dst) > 0)
23571 return false;
23572 dst = SUBREG_REG (dst);
23573 }
23574
23575 if (SUBREG_P (src))
23576 {
23577 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
23578 src = SUBREG_REG (src);
23579 }
23580
23581 switch (GET_MODE (src))
23582 {
23583 case E_V16QImode:
23584 case E_V8HImode:
23585 case E_V4SImode:
23586 case E_V2DImode:
23587 case E_V1TImode:
23588 {
23589 machine_mode srcmode, dstmode;
23590 rtx d, pat;
23591
23592 if (!int_mode_for_size (size, 0).exists (&dstmode))
23593 return false;
23594
23595 switch (dstmode)
23596 {
23597 case E_QImode:
23598 if (!TARGET_SSE4_1)
23599 return false;
23600 srcmode = V16QImode;
23601 break;
23602
23603 case E_HImode:
23604 if (!TARGET_SSE2)
23605 return false;
23606 srcmode = V8HImode;
23607 break;
23608
23609 case E_SImode:
23610 if (!TARGET_SSE4_1)
23611 return false;
23612 srcmode = V4SImode;
23613 break;
23614
23615 case E_DImode:
23616 gcc_assert (TARGET_64BIT);
23617 if (!TARGET_SSE4_1)
23618 return false;
23619 srcmode = V2DImode;
23620 break;
23621
23622 default:
23623 return false;
23624 }
23625
23626 /* Reject extractions from misaligned positions. */
23627 if (pos & (size-1))
23628 return false;
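/* E.g., a 16-bit extraction at bit position 32 from a V8HImode source
   selects element pos / size == 2. */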
23629
23630 if (GET_MODE (dst) == dstmode)
23631 d = dst;
23632 else
23633 d = gen_reg_rtx (dstmode);
23634
23635 /* Construct insn pattern. */
23636 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
23637 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
23638
23639 /* Let the rtl optimizers know about the zero extension performed. */
23640 if (dstmode == QImode || dstmode == HImode)
23641 {
23642 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
23643 d = gen_lowpart (SImode, d);
23644 }
23645
23646 emit_insn (gen_rtx_SET (d, pat));
23647
23648 if (d != dst)
23649 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23650 return true;
23651 }
23652
23653 default:
23654 return false;
23655 }
23656 }
23657
23658 /* Expand an insert into a vector register through pinsr insn.
23659 Return true if successful. */
23660
23661 bool
23662 ix86_expand_pinsr (rtx *operands)
23663 {
23664 rtx dst = operands[0];
23665 rtx src = operands[3];
23666
23667 unsigned int size = INTVAL (operands[1]);
23668 unsigned int pos = INTVAL (operands[2]);
23669
23670 if (SUBREG_P (dst))
23671 {
23672 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
23673 dst = SUBREG_REG (dst);
23674 }
23675
23676 switch (GET_MODE (dst))
23677 {
23678 case E_V16QImode:
23679 case E_V8HImode:
23680 case E_V4SImode:
23681 case E_V2DImode:
23682 case E_V1TImode:
23683 {
23684 machine_mode srcmode, dstmode;
23685 rtx (*pinsr)(rtx, rtx, rtx, rtx);
23686 rtx d;
23687
23688 if (!int_mode_for_size (size, 0).exists (&srcmode))
23689 return false;
23690
23691 switch (srcmode)
23692 {
23693 case E_QImode:
23694 if (!TARGET_SSE4_1)
23695 return false;
23696 dstmode = V16QImode;
23697 pinsr = gen_sse4_1_pinsrb;
23698 break;
23699
23700 case E_HImode:
23701 if (!TARGET_SSE2)
23702 return false;
23703 dstmode = V8HImode;
23704 pinsr = gen_sse2_pinsrw;
23705 break;
23706
23707 case E_SImode:
23708 if (!TARGET_SSE4_1)
23709 return false;
23710 dstmode = V4SImode;
23711 pinsr = gen_sse4_1_pinsrd;
23712 break;
23713
23714 case E_DImode:
23715 gcc_assert (TARGET_64BIT);
23716 if (!TARGET_SSE4_1)
23717 return false;
23718 dstmode = V2DImode;
23719 pinsr = gen_sse4_1_pinsrq;
23720 break;
23721
23722 default:
23723 return false;
23724 }
23725
23726 /* Reject insertions to misaligned positions. */
23727 if (pos & (size-1))
23728 return false;
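/* E.g., a 16-bit insert at bit position 48 targets element pos / size == 3;
   the immediate below is the vec_merge mask 1 << 3. */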
23729
23730 if (SUBREG_P (src))
23731 {
23732 unsigned int srcpos = SUBREG_BYTE (src);
23733
23734 if (srcpos > 0)
23735 {
23736 rtx extr_ops[4];
23737
23738 extr_ops[0] = gen_reg_rtx (srcmode);
23739 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
23740 extr_ops[2] = GEN_INT (size);
23741 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
23742
23743 if (!ix86_expand_pextr (extr_ops))
23744 return false;
23745
23746 src = extr_ops[0];
23747 }
23748 else
23749 src = gen_lowpart (srcmode, SUBREG_REG (src));
23750 }
23751
23752 if (GET_MODE (dst) == dstmode)
23753 d = dst;
23754 else
23755 d = gen_reg_rtx (dstmode);
23756
23757 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
23758 gen_lowpart (srcmode, src),
23759 GEN_INT (1 << (pos / size))));
23760 if (d != dst)
23761 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23762 return true;
23763 }
23764
23765 default:
23766 return false;
23767 }
23768 }
23769
23770 /* All CPUs prefer to avoid cross-lane operations, so perform reductions of the
23771 upper against the lower halves until reaching SSE register size. */
23772
23773 machine_mode
23774 ix86_split_reduction (machine_mode mode)
23775 {
23776 /* Reduce lowpart against highpart until we reach SSE reg width to
23777 avoid cross-lane operations. */
23778 switch (mode)
23779 {
23780 case E_V8DImode:
23781 case E_V4DImode:
23782 return V2DImode;
23783 case E_V16SImode:
23784 case E_V8SImode:
23785 return V4SImode;
23786 case E_V32HImode:
23787 case E_V16HImode:
23788 return V8HImode;
23789 case E_V64QImode:
23790 case E_V32QImode:
23791 return V16QImode;
23792 case E_V16SFmode:
23793 case E_V8SFmode:
23794 return V4SFmode;
23795 case E_V8DFmode:
23796 case E_V4DFmode:
23797 return V2DFmode;
23798 default:
23799 return mode;
23800 }
23801 }
23802
23803 /* Generate call to __divmoddi4. */
23804
23805 void
23806 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
23807 rtx op0, rtx op1,
23808 rtx *quot_p, rtx *rem_p)
23809 {
23810 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
23811
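/* The library function returns the quotient and stores the remainder
   through the extra pointer argument, i.e. the stack slot allocated above. */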
23812 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
23813 mode, op0, mode, op1, mode,
23814 XEXP (rem, 0), Pmode);
23815 *quot_p = quot;
23816 *rem_p = rem;
23817 }
23818
23819 void
23820 ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
23821 enum rtx_code code, bool after,
23822 bool doubleword)
23823 {
23824 rtx old_reg, new_reg, old_mem, success;
23825 machine_mode mode = GET_MODE (target);
23826 rtx_code_label *loop_label = NULL;
23827
23828 old_reg = gen_reg_rtx (mode);
23829 new_reg = old_reg;
23830 old_mem = copy_to_reg (mem);
23831 loop_label = gen_label_rtx ();
23832 emit_label (loop_label);
23833 emit_move_insn (old_reg, old_mem);
23834
23835 /* The return value for atomic_fetch_op is the value before the operation. */
23836 if (!after)
23837 emit_move_insn (target, old_reg);
23838
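/* When CODE is NOT, compute the NAND ~(new_reg & val). */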
23839 if (code == NOT)
23840 {
23841 new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
23842 true, OPTAB_LIB_WIDEN);
23843 new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
23844 }
23845 else
23846 new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
23847 true, OPTAB_LIB_WIDEN);
23848
23849 /* The return value for atomic_op_fetch is the value after the operation. */
23850 if (after)
23851 emit_move_insn (target, new_reg);
23852
23853 success = NULL_RTX;
23854
23855 ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
23856 gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
23857 SImode),
23858 doubleword, loop_label);
23859 }
23860
23861 /* Relax the cmpxchg instruction. The parameter LOOP_LABEL indicates whether
23862 the instruction should be relaxed with a pause loop. If not,
23863 it is relaxed to an atomic load + compare, and the cmpxchg
23864 instruction is skipped when mem != exp_input. */
23865
23866 void
23867 ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
23868 rtx mem, rtx exp_input, rtx new_input,
23869 rtx mem_model, bool doubleword,
23870 rtx_code_label *loop_label)
23871 {
23872 rtx_code_label *cmp_label = NULL;
23873 rtx_code_label *done_label = NULL;
23874 rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
23875 rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
23876 rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
23877 machine_mode mode = GET_MODE (target_val), hmode = mode;
23878
23879 if (*ptarget_bool == NULL)
23880 target_bool = gen_reg_rtx (QImode);
23881 else
23882 target_bool = *ptarget_bool;
23883
23884 cmp_label = gen_label_rtx ();
23885 done_label = gen_label_rtx ();
23886
23887 new_mem = gen_reg_rtx (mode);
23888 /* Load memory first. */
23889 expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
23890
23891 switch (mode)
23892 {
23893 case E_TImode:
23894 gendw = gen_atomic_compare_and_swapti_doubleword;
23895 hmode = DImode;
23896 break;
23897 case E_DImode:
23898 if (doubleword)
23899 {
23900 gendw = gen_atomic_compare_and_swapdi_doubleword;
23901 hmode = SImode;
23902 }
23903 else
23904 gen = gen_atomic_compare_and_swapdi_1;
23905 break;
23906 case E_SImode:
23907 gen = gen_atomic_compare_and_swapsi_1;
23908 break;
23909 case E_HImode:
23910 gen = gen_atomic_compare_and_swaphi_1;
23911 break;
23912 case E_QImode:
23913 gen = gen_atomic_compare_and_swapqi_1;
23914 break;
23915 default:
23916 gcc_unreachable ();
23917 }
23918
23919 /* Compare mem value with expected value. */
23920 if (doubleword)
23921 {
23922 rtx low_new_mem = gen_lowpart (hmode, new_mem);
23923 rtx low_exp_input = gen_lowpart (hmode, exp_input);
23924 rtx high_new_mem = gen_highpart (hmode, new_mem);
23925 rtx high_exp_input = gen_highpart (hmode, exp_input);
23926 emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
23927 hmode, 1, cmp_label,
23928 profile_probability::guessed_never ());
23929 emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
23930 hmode, 1, cmp_label,
23931 profile_probability::guessed_never ());
23932 }
23933 else
23934 emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
23935 GET_MODE (exp_input), 1, cmp_label,
23936 profile_probability::guessed_never ());
23937
23938 /* Directly emit cmpxchg here. */
23939 if (doubleword)
23940 emit_insn (gendw (target_val, mem, exp_input,
23941 gen_lowpart (hmode, new_input),
23942 gen_highpart (hmode, new_input),
23943 mem_model));
23944 else
23945 emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
23946
23947 if (!loop_label)
23948 {
23949 emit_jump_insn (gen_jump (done_label));
23950 emit_barrier ();
23951 emit_label (cmp_label);
23952 emit_move_insn (target_val, new_mem);
23953 emit_label (done_label);
23954 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
23955 const0_rtx);
23956 }
23957 else
23958 {
23959 ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
23960 const0_rtx);
23961 emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
23962 GET_MODE (target_bool), 1, loop_label,
23963 profile_probability::guessed_never ());
23964 emit_jump_insn (gen_jump (done_label));
23965 emit_barrier ();
23966
23967 /* If mem does not hold the expected value, pause and loop back. */
23968 emit_label (cmp_label);
23969 emit_move_insn (target_val, new_mem);
23970 emit_insn (gen_pause ());
23971 emit_jump_insn (gen_jump (loop_label));
23972 emit_barrier ();
23973 emit_label (done_label);
23974 }
23975
23976 *ptarget_bool = target_bool;
23977 }
23978
23979 #include "gt-i386-expand.h"