/* gcc/config/i386/i386-expand.c */
1 /* Copyright (C) 1988-2019 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "params.h"
62 #include "cselib.h"
63 #include "sched-int.h"
64 #include "opts.h"
65 #include "tree-pass.h"
66 #include "context.h"
67 #include "pass_manager.h"
68 #include "target-globals.h"
69 #include "gimple-iterator.h"
70 #include "tree-vectorizer.h"
71 #include "shrink-wrap.h"
72 #include "builtins.h"
73 #include "rtl-iter.h"
74 #include "tree-iterator.h"
75 #include "dbgcnt.h"
76 #include "case-cfn-macros.h"
77 #include "dojump.h"
78 #include "fold-const-call.h"
79 #include "tree-vrp.h"
80 #include "tree-ssanames.h"
81 #include "selftest.h"
82 #include "selftest-rtl.h"
83 #include "print-rtl.h"
84 #include "intl.h"
85 #include "ifcvt.h"
86 #include "symbol-summary.h"
87 #include "ipa-prop.h"
88 #include "ipa-fnsummary.h"
89 #include "wide-int-bitmask.h"
90 #include "tree-vector-builder.h"
91 #include "debug.h"
92 #include "dwarf2out.h"
93 #include "i386-options.h"
94 #include "i386-builtins.h"
95 #include "i386-expand.h"
96
97 /* Split one or more double-mode RTL references into pairs of half-mode
98 references. The RTL can be REG, offsettable MEM, integer constant, or
99 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
100 split and "num" is its length. lo_half and hi_half are output arrays
101 that parallel "operands". */
102
103 void
104 split_double_mode (machine_mode mode, rtx operands[],
105 int num, rtx lo_half[], rtx hi_half[])
106 {
107 machine_mode half_mode;
108 unsigned int byte;
109
110 switch (mode)
111 {
112 case E_TImode:
113 half_mode = DImode;
114 break;
115 case E_DImode:
116 half_mode = SImode;
117 break;
118 default:
119 gcc_unreachable ();
120 }
121
122 byte = GET_MODE_SIZE (half_mode);
123
124 while (num--)
125 {
126 rtx op = operands[num];
127
128 /* simplify_subreg refuses to split volatile memory addresses,
129 but we still have to handle them. */
130 if (MEM_P (op))
131 {
132 lo_half[num] = adjust_address (op, half_mode, 0);
133 hi_half[num] = adjust_address (op, half_mode, byte);
134 }
135 else
136 {
137 lo_half[num] = simplify_gen_subreg (half_mode, op,
138 GET_MODE (op) == VOIDmode
139 ? mode : GET_MODE (op), 0);
140 hi_half[num] = simplify_gen_subreg (half_mode, op,
141 GET_MODE (op) == VOIDmode
142 ? mode : GET_MODE (op), byte);
143 }
144 }
145 }
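
/* For example, splitting a TImode memory operand (mem:TI X) yields
   lo_half = (mem:DI X) and hi_half = (mem:DI X+8), while a TImode
   register is split into its low and high DImode subregs.  (Illustrative
   sketch; the offsets assume the little-endian x86 layout.)  */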
146
147 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
148 for the target. */
149
150 void
151 ix86_expand_clear (rtx dest)
152 {
153 rtx tmp;
154
155 /* We play register width games, which are only valid after reload. */
156 gcc_assert (reload_completed);
157
158 /* Avoid HImode and its attendant prefix byte. */
159 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
160 dest = gen_rtx_REG (SImode, REGNO (dest));
161 tmp = gen_rtx_SET (dest, const0_rtx);
162
163 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
164 {
165 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
166 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
167 }
168
169 emit_insn (tmp);
170 }
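
/* For example, clearing %eax normally becomes "xorl %eax, %eax" (two
   bytes, clobbers the flags, hence the CLOBBER added above), while
   tunings that set TARGET_USE_MOV0 and are not optimizing for size keep
   "movl $0, %eax" (five bytes, flags preserved).  */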
171
172 void
173 ix86_expand_move (machine_mode mode, rtx operands[])
174 {
175 rtx op0, op1;
176 rtx tmp, addend = NULL_RTX;
177 enum tls_model model;
178
179 op0 = operands[0];
180 op1 = operands[1];
181
182 switch (GET_CODE (op1))
183 {
184 case CONST:
185 tmp = XEXP (op1, 0);
186
187 if (GET_CODE (tmp) != PLUS
188 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
189 break;
190
191 op1 = XEXP (tmp, 0);
192 addend = XEXP (tmp, 1);
193 /* FALLTHRU */
194
195 case SYMBOL_REF:
196 model = SYMBOL_REF_TLS_MODEL (op1);
197
198 if (model)
199 op1 = legitimize_tls_address (op1, model, true);
200 else if (ix86_force_load_from_GOT_p (op1))
201 {
202 /* Load the external function address via GOT slot to avoid PLT. */
203 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
204 (TARGET_64BIT
205 ? UNSPEC_GOTPCREL
206 : UNSPEC_GOT));
207 op1 = gen_rtx_CONST (Pmode, op1);
208 op1 = gen_const_mem (Pmode, op1);
209 set_mem_alias_set (op1, ix86_GOT_alias_set ());
210 }
211 else
212 {
213 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
214 if (tmp)
215 {
216 op1 = tmp;
217 if (!addend)
218 break;
219 }
220 else
221 {
222 op1 = operands[1];
223 break;
224 }
225 }
226
227 if (addend)
228 {
229 op1 = force_operand (op1, NULL_RTX);
230 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
231 op0, 1, OPTAB_DIRECT);
232 }
233 else
234 op1 = force_operand (op1, op0);
235
236 if (op1 == op0)
237 return;
238
239 op1 = convert_to_mode (mode, op1, 1);
240
241 default:
242 break;
243 }
244
245 if ((flag_pic || MACHOPIC_INDIRECT)
246 && symbolic_operand (op1, mode))
247 {
248 if (TARGET_MACHO && !TARGET_64BIT)
249 {
250 #if TARGET_MACHO
251 /* dynamic-no-pic */
252 if (MACHOPIC_INDIRECT)
253 {
254 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
255 ? op0 : gen_reg_rtx (Pmode);
256 op1 = machopic_indirect_data_reference (op1, temp);
257 if (MACHOPIC_PURE)
258 op1 = machopic_legitimize_pic_address (op1, mode,
259 temp == op1 ? 0 : temp);
260 }
261 if (op0 != op1 && GET_CODE (op0) != MEM)
262 {
263 rtx insn = gen_rtx_SET (op0, op1);
264 emit_insn (insn);
265 return;
266 }
267 if (GET_CODE (op0) == MEM)
268 op1 = force_reg (Pmode, op1);
269 else
270 {
271 rtx temp = op0;
272 if (GET_CODE (temp) != REG)
273 temp = gen_reg_rtx (Pmode);
274 temp = legitimize_pic_address (op1, temp);
275 if (temp == op0)
276 return;
277 op1 = temp;
278 }
279 /* dynamic-no-pic */
280 #endif
281 }
282 else
283 {
284 if (MEM_P (op0))
285 op1 = force_reg (mode, op1);
286 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
287 {
288 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
289 op1 = legitimize_pic_address (op1, reg);
290 if (op0 == op1)
291 return;
292 op1 = convert_to_mode (mode, op1, 1);
293 }
294 }
295 }
296 else
297 {
298 if (MEM_P (op0)
299 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
300 || !push_operand (op0, mode))
301 && MEM_P (op1))
302 op1 = force_reg (mode, op1);
303
304 if (push_operand (op0, mode)
305 && ! general_no_elim_operand (op1, mode))
306 op1 = copy_to_mode_reg (mode, op1);
307
308 /* Force large constants in 64bit compilation into a register
309 to get them CSEd. */
310 if (can_create_pseudo_p ()
311 && (mode == DImode) && TARGET_64BIT
312 && immediate_operand (op1, mode)
313 && !x86_64_zext_immediate_operand (op1, VOIDmode)
314 && !register_operand (op0, mode)
315 && optimize)
316 op1 = copy_to_mode_reg (mode, op1);
317
318 if (can_create_pseudo_p ()
319 && CONST_DOUBLE_P (op1))
320 {
321 /* If we are loading a floating point constant to a register,
322 force the value to memory now, since we'll get better code
323 out of the back end. */
324
325 op1 = validize_mem (force_const_mem (mode, op1));
326 if (!register_operand (op0, mode))
327 {
328 rtx temp = gen_reg_rtx (mode);
329 emit_insn (gen_rtx_SET (temp, op1));
330 emit_move_insn (op0, temp);
331 return;
332 }
333 }
334 }
335
336 emit_insn (gen_rtx_SET (op0, op1));
337 }
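
/* As a rough illustration of the GOT path above (ix86_force_load_from_GOT_p):
   the address of an external function is typically loaded as
   "movq func@GOTPCREL(%rip), %rax" on x86-64, or "movl func@GOT(%ebx), %eax"
   on ia32, avoiding a PLT entry.  */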
338
339 void
340 ix86_expand_vector_move (machine_mode mode, rtx operands[])
341 {
342 rtx op0 = operands[0], op1 = operands[1];
343 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
344 psABI, since the biggest alignment for the IA MCU psABI is 4 bytes. */
345 unsigned int align = (TARGET_IAMCU
346 ? GET_MODE_BITSIZE (mode)
347 : GET_MODE_ALIGNMENT (mode));
348
349 if (push_operand (op0, VOIDmode))
350 op0 = emit_move_resolve_push (mode, op0);
351
352 /* Force constants other than zero into memory. We do not know how
353 the instructions used to build constants modify the upper 64 bits
354 of the register; once we have that information we may be able
355 to handle some of them more efficiently. */
356 if (can_create_pseudo_p ()
357 && (CONSTANT_P (op1)
358 || (SUBREG_P (op1)
359 && CONSTANT_P (SUBREG_REG (op1))))
360 && ((register_operand (op0, mode)
361 && !standard_sse_constant_p (op1, mode))
362 /* ix86_expand_vector_move_misalign() does not like constants. */
363 || (SSE_REG_MODE_P (mode)
364 && MEM_P (op0)
365 && MEM_ALIGN (op0) < align)))
366 {
367 if (SUBREG_P (op1))
368 {
369 machine_mode imode = GET_MODE (SUBREG_REG (op1));
370 rtx r = force_const_mem (imode, SUBREG_REG (op1));
371 if (r)
372 r = validize_mem (r);
373 else
374 r = force_reg (imode, SUBREG_REG (op1));
375 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
376 }
377 else
378 op1 = validize_mem (force_const_mem (mode, op1));
379 }
380
381 /* We need to check memory alignment for SSE modes, since attributes
382 can make operands unaligned. */
383 if (can_create_pseudo_p ()
384 && SSE_REG_MODE_P (mode)
385 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
386 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
387 {
388 rtx tmp[2];
389
390 /* ix86_expand_vector_move_misalign() does not like both
391 arguments in memory. */
392 if (!register_operand (op0, mode)
393 && !register_operand (op1, mode))
394 op1 = force_reg (mode, op1);
395
396 tmp[0] = op0; tmp[1] = op1;
397 ix86_expand_vector_move_misalign (mode, tmp);
398 return;
399 }
400
401 /* If neither operand is a register, force operand 1 into one. */
402 if (can_create_pseudo_p ()
403 && !register_operand (op0, mode)
404 && !register_operand (op1, mode))
405 {
406 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
407 return;
408 }
409
410 emit_insn (gen_rtx_SET (op0, op1));
411 }
412
413 /* Split 32-byte AVX unaligned load and store if needed. */
414
415 static void
416 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
417 {
418 rtx m;
419 rtx (*extract) (rtx, rtx, rtx);
420 machine_mode mode;
421
422 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
423 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
424 {
425 emit_insn (gen_rtx_SET (op0, op1));
426 return;
427 }
428
429 rtx orig_op0 = NULL_RTX;
430 mode = GET_MODE (op0);
431 switch (GET_MODE_CLASS (mode))
432 {
433 case MODE_VECTOR_INT:
434 case MODE_INT:
435 if (mode != V32QImode)
436 {
437 if (!MEM_P (op0))
438 {
439 orig_op0 = op0;
440 op0 = gen_reg_rtx (V32QImode);
441 }
442 else
443 op0 = gen_lowpart (V32QImode, op0);
444 op1 = gen_lowpart (V32QImode, op1);
445 mode = V32QImode;
446 }
447 break;
448 case MODE_VECTOR_FLOAT:
449 break;
450 default:
451 gcc_unreachable ();
452 }
453
454 switch (mode)
455 {
456 default:
457 gcc_unreachable ();
458 case E_V32QImode:
459 extract = gen_avx_vextractf128v32qi;
460 mode = V16QImode;
461 break;
462 case E_V8SFmode:
463 extract = gen_avx_vextractf128v8sf;
464 mode = V4SFmode;
465 break;
466 case E_V4DFmode:
467 extract = gen_avx_vextractf128v4df;
468 mode = V2DFmode;
469 break;
470 }
471
472 if (MEM_P (op1))
473 {
474 rtx r = gen_reg_rtx (mode);
475 m = adjust_address (op1, mode, 0);
476 emit_move_insn (r, m);
477 m = adjust_address (op1, mode, 16);
478 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
479 emit_move_insn (op0, r);
480 }
481 else if (MEM_P (op0))
482 {
483 m = adjust_address (op0, mode, 0);
484 emit_insn (extract (m, op1, const0_rtx));
485 m = adjust_address (op0, mode, 16);
486 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
487 }
488 else
489 gcc_unreachable ();
490
491 if (orig_op0)
492 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
493 }
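
/* Roughly, an unaligned 32-byte load split above becomes a 16-byte load
   of the low half plus an insert of the high half, and an unaligned
   32-byte store becomes a 16-byte store of the low half plus a
   vextractf128 $1 of the high half, e.g.
     vmovups     (%rax), %xmm0
     vinsertf128 $1, 16(%rax), %ymm0, %ymm0
   (an illustrative sketch of the typical output).  */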
494
495 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
496 straight to ix86_expand_vector_move. */
497 /* Code generation for scalar reg-reg moves of single and double precision data:
498 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
499 movaps reg, reg
500 else
501 movss reg, reg
502 if (x86_sse_partial_reg_dependency == true)
503 movapd reg, reg
504 else
505 movsd reg, reg
506
507 Code generation for scalar loads of double precision data:
508 if (x86_sse_split_regs == true)
509 movlpd mem, reg (gas syntax)
510 else
511 movsd mem, reg
512
513 Code generation for unaligned packed loads of single precision data
514 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
515 if (x86_sse_unaligned_move_optimal)
516 movups mem, reg
517
518 if (x86_sse_partial_reg_dependency == true)
519 {
520 xorps reg, reg
521 movlps mem, reg
522 movhps mem+8, reg
523 }
524 else
525 {
526 movlps mem, reg
527 movhps mem+8, reg
528 }
529
530 Code generation for unaligned packed loads of double precision data
531 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
532 if (x86_sse_unaligned_move_optimal)
533 movupd mem, reg
534
535 if (x86_sse_split_regs == true)
536 {
537 movlpd mem, reg
538 movhpd mem+8, reg
539 }
540 else
541 {
542 movsd mem, reg
543 movhpd mem+8, reg
544 }
545 */
546
547 void
548 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
549 {
550 rtx op0, op1, m;
551
552 op0 = operands[0];
553 op1 = operands[1];
554
555 /* Use unaligned load/store for AVX512 or when optimizing for size. */
556 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
557 {
558 emit_insn (gen_rtx_SET (op0, op1));
559 return;
560 }
561
562 if (TARGET_AVX)
563 {
564 if (GET_MODE_SIZE (mode) == 32)
565 ix86_avx256_split_vector_move_misalign (op0, op1);
566 else
567 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
568 emit_insn (gen_rtx_SET (op0, op1));
569 return;
570 }
571
572 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
573 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
574 {
575 emit_insn (gen_rtx_SET (op0, op1));
576 return;
577 }
578
579 /* ??? If we have typed data, then it would appear that using
580 movdqu is the only way to get unaligned data loaded with
581 integer type. */
582 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
583 {
584 emit_insn (gen_rtx_SET (op0, op1));
585 return;
586 }
587
588 if (MEM_P (op1))
589 {
590 if (TARGET_SSE2 && mode == V2DFmode)
591 {
592 rtx zero;
593
594 /* When SSE registers are split into halves, we can avoid
595 writing to the top half twice. */
596 if (TARGET_SSE_SPLIT_REGS)
597 {
598 emit_clobber (op0);
599 zero = op0;
600 }
601 else
602 {
603 /* ??? Not sure about the best option for the Intel chips.
604 The following would seem to satisfy; the register is
605 entirely cleared, breaking the dependency chain. We
606 then store to the upper half, with a dependency depth
607 of one. A rumor has it that Intel recommends two movsd
608 followed by an unpacklpd, but this is unconfirmed. And
609 given that the dependency depth of the unpacklpd would
610 still be one, I'm not sure why this would be better. */
611 zero = CONST0_RTX (V2DFmode);
612 }
613
614 m = adjust_address (op1, DFmode, 0);
615 emit_insn (gen_sse2_loadlpd (op0, zero, m));
616 m = adjust_address (op1, DFmode, 8);
617 emit_insn (gen_sse2_loadhpd (op0, op0, m));
618 }
619 else
620 {
621 rtx t;
622
623 if (mode != V4SFmode)
624 t = gen_reg_rtx (V4SFmode);
625 else
626 t = op0;
627
628 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
629 emit_move_insn (t, CONST0_RTX (V4SFmode));
630 else
631 emit_clobber (t);
632
633 m = adjust_address (op1, V2SFmode, 0);
634 emit_insn (gen_sse_loadlps (t, t, m));
635 m = adjust_address (op1, V2SFmode, 8);
636 emit_insn (gen_sse_loadhps (t, t, m));
637 if (mode != V4SFmode)
638 emit_move_insn (op0, gen_lowpart (mode, t));
639 }
640 }
641 else if (MEM_P (op0))
642 {
643 if (TARGET_SSE2 && mode == V2DFmode)
644 {
645 m = adjust_address (op0, DFmode, 0);
646 emit_insn (gen_sse2_storelpd (m, op1));
647 m = adjust_address (op0, DFmode, 8);
648 emit_insn (gen_sse2_storehpd (m, op1));
649 }
650 else
651 {
652 if (mode != V4SFmode)
653 op1 = gen_lowpart (V4SFmode, op1);
654
655 m = adjust_address (op0, V2SFmode, 0);
656 emit_insn (gen_sse_storelps (m, op1));
657 m = adjust_address (op0, V2SFmode, 8);
658 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
659 }
660 }
661 else
662 gcc_unreachable ();
663 }
664
665 /* Move bits 64:95 to bits 32:63. */
666
667 void
668 ix86_move_vector_high_sse_to_mmx (rtx op)
669 {
670 rtx mask = gen_rtx_PARALLEL (VOIDmode,
671 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
672 GEN_INT (0), GEN_INT (0)));
673 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
674 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
675 rtx insn = gen_rtx_SET (dest, op);
676 emit_insn (insn);
677 }
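
/* The (0 2 0 0) selector above corresponds to a pshufd immediate of 0x08,
   so this typically assembles to something like
     pshufd $0x8, %xmm0, %xmm0
   copying element 2 (bits 64:95) into element 1 (bits 32:63).  */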
678
679 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
680
681 void
682 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
683 {
684 rtx op0 = operands[0];
685 rtx op1 = operands[1];
686 rtx op2 = operands[2];
687
688 machine_mode dmode = GET_MODE (op0);
689 machine_mode smode = GET_MODE (op1);
690 machine_mode inner_dmode = GET_MODE_INNER (dmode);
691 machine_mode inner_smode = GET_MODE_INNER (smode);
692
693 /* Get the corresponding SSE mode for destination. */
694 int nunits = 16 / GET_MODE_SIZE (inner_dmode);
695 machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
696 nunits).require ();
697 machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
698 nunits / 2).require ();
699
700 /* Get the corresponding SSE mode for source. */
701 nunits = 16 / GET_MODE_SIZE (inner_smode);
702 machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
703 nunits).require ();
704
705 /* Generate SSE pack with signed/unsigned saturation. */
706 rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
707 op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
708 op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
709
710 op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
711 op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
712 rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
713 op1, op2));
714 emit_insn (insn);
715
716 ix86_move_vector_high_sse_to_mmx (op0);
717 }
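
/* For example, an 8-byte MMX packsswb emulated by ix86_split_mmx_pack
   becomes roughly
     packsswb %xmm1, %xmm0
     pshufd   $0x8, %xmm0, %xmm0
   i.e. the 128-bit pack followed by the bits-64:95-to-32:63 shuffle
   emitted by ix86_move_vector_high_sse_to_mmx above (illustrative).  */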
718
719 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
720
721 void
722 ix86_split_mmx_punpck (rtx operands[], bool high_p)
723 {
724 rtx op0 = operands[0];
725 rtx op1 = operands[1];
726 rtx op2 = operands[2];
727 machine_mode mode = GET_MODE (op0);
728 rtx mask;
729 /* The corresponding SSE mode. */
730 machine_mode sse_mode, double_sse_mode;
731
732 switch (mode)
733 {
734 case E_V8QImode:
735 sse_mode = V16QImode;
736 double_sse_mode = V32QImode;
737 mask = gen_rtx_PARALLEL (VOIDmode,
738 gen_rtvec (16,
739 GEN_INT (0), GEN_INT (16),
740 GEN_INT (1), GEN_INT (17),
741 GEN_INT (2), GEN_INT (18),
742 GEN_INT (3), GEN_INT (19),
743 GEN_INT (4), GEN_INT (20),
744 GEN_INT (5), GEN_INT (21),
745 GEN_INT (6), GEN_INT (22),
746 GEN_INT (7), GEN_INT (23)));
747 break;
748
749 case E_V4HImode:
750 sse_mode = V8HImode;
751 double_sse_mode = V16HImode;
752 mask = gen_rtx_PARALLEL (VOIDmode,
753 gen_rtvec (8,
754 GEN_INT (0), GEN_INT (8),
755 GEN_INT (1), GEN_INT (9),
756 GEN_INT (2), GEN_INT (10),
757 GEN_INT (3), GEN_INT (11)));
758 break;
759
760 case E_V2SImode:
761 sse_mode = V4SImode;
762 double_sse_mode = V8SImode;
763 mask = gen_rtx_PARALLEL (VOIDmode,
764 gen_rtvec (4,
765 GEN_INT (0), GEN_INT (4),
766 GEN_INT (1), GEN_INT (5)));
767 break;
768
769 default:
770 gcc_unreachable ();
771 }
772
773 /* Generate SSE punpcklXX. */
774 rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
775 op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
776 op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
777
778 op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
779 op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
780 rtx insn = gen_rtx_SET (dest, op2);
781 emit_insn (insn);
782
783 if (high_p)
784 {
785 /* Move bits 64:127 to bits 0:63. */
786 mask = gen_rtx_PARALLEL (VOIDmode,
787 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
788 GEN_INT (0), GEN_INT (0)));
789 dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
790 op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
791 insn = gen_rtx_SET (dest, op1);
792 emit_insn (insn);
793 }
794 }
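
/* For example, an MMX punpcklbw emulated by ix86_split_mmx_punpck is
   simply
     punpcklbw %xmm1, %xmm0
   while the high variant (punpckhbw) additionally needs
     pshufd $0xe, %xmm0, %xmm0
   to move bits 64:127 down to bits 0:63 (a rough sketch of the output).  */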
795
796 /* Helper function of ix86_fixup_binary_operands to canonicalize
797 operand order. Returns true if the operands should be swapped. */
798
799 static bool
800 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
801 rtx operands[])
802 {
803 rtx dst = operands[0];
804 rtx src1 = operands[1];
805 rtx src2 = operands[2];
806
807 /* If the operation is not commutative, we can't do anything. */
808 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
809 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
810 return false;
811
812 /* Highest priority is that src1 should match dst. */
813 if (rtx_equal_p (dst, src1))
814 return false;
815 if (rtx_equal_p (dst, src2))
816 return true;
817
818 /* Next highest priority is that immediate constants come second. */
819 if (immediate_operand (src2, mode))
820 return false;
821 if (immediate_operand (src1, mode))
822 return true;
823
824 /* Lowest priority is that memory references should come second. */
825 if (MEM_P (src2))
826 return false;
827 if (MEM_P (src1))
828 return true;
829
830 return false;
831 }
832
833
834 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
835 destination to use for the operation. If different from the true
836 destination in operands[0], a copy operation will be required. */
837
838 rtx
839 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
840 rtx operands[])
841 {
842 rtx dst = operands[0];
843 rtx src1 = operands[1];
844 rtx src2 = operands[2];
845
846 /* Canonicalize operand order. */
847 if (ix86_swap_binary_operands_p (code, mode, operands))
848 {
849 /* It is invalid to swap operands of different modes. */
850 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
851
852 std::swap (src1, src2);
853 }
854
855 /* Both source operands cannot be in memory. */
856 if (MEM_P (src1) && MEM_P (src2))
857 {
858 /* Optimization: Only read from memory once. */
859 if (rtx_equal_p (src1, src2))
860 {
861 src2 = force_reg (mode, src2);
862 src1 = src2;
863 }
864 else if (rtx_equal_p (dst, src1))
865 src2 = force_reg (mode, src2);
866 else
867 src1 = force_reg (mode, src1);
868 }
869
870 /* If the destination is memory, and we do not have matching source
871 operands, do things in registers. */
872 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
873 dst = gen_reg_rtx (mode);
874
875 /* Source 1 cannot be a constant. */
876 if (CONSTANT_P (src1))
877 src1 = force_reg (mode, src1);
878
879 /* Source 1 cannot be a non-matching memory. */
880 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
881 src1 = force_reg (mode, src1);
882
883 /* Improve address combine. */
884 if (code == PLUS
885 && GET_MODE_CLASS (mode) == MODE_INT
886 && MEM_P (src2))
887 src2 = force_reg (mode, src2);
888
889 operands[1] = src1;
890 operands[2] = src2;
891 return dst;
892 }
893
894 /* Similarly, but assume that the destination has already been
895 set up properly. */
896
897 void
898 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
899 machine_mode mode, rtx operands[])
900 {
901 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
902 gcc_assert (dst == operands[0]);
903 }
904
905 /* Attempt to expand a binary operator. Make the expansion closer to the
906 actual machine than just general_operand, which would allow 3 separate
907 memory references (one output, two input) in a single insn. */
908
909 void
910 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
911 rtx operands[])
912 {
913 rtx src1, src2, dst, op, clob;
914
915 dst = ix86_fixup_binary_operands (code, mode, operands);
916 src1 = operands[1];
917 src2 = operands[2];
918
919 /* Emit the instruction. */
920
921 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
922
923 if (reload_completed
924 && code == PLUS
925 && !rtx_equal_p (dst, src1))
926 {
927 /* This is going to be an LEA; avoid splitting it later. */
928 emit_insn (op);
929 }
930 else
931 {
932 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
933 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
934 }
935
936 /* Fix up the destination if needed. */
937 if (dst != operands[0])
938 emit_move_insn (operands[0], dst);
939 }
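
/* For instance, a post-reload "c = a + b" where the destination does not
   match the first source can stay a flags-free lea such as
     leal (%rsi,%rdx), %eax
   whereas the matching-destination form is emitted with the flags clobber
   and becomes a plain "addl %edx, %eax" (illustrative sketch).  */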
940
941 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
942 the given OPERANDS. */
943
944 void
945 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
946 rtx operands[])
947 {
948 rtx op1 = NULL_RTX, op2 = NULL_RTX;
949 if (SUBREG_P (operands[1]))
950 {
951 op1 = operands[1];
952 op2 = operands[2];
953 }
954 else if (SUBREG_P (operands[2]))
955 {
956 op1 = operands[2];
957 op2 = operands[1];
958 }
959 /* Optimize (__m128i) d | (__m128i) e and similar code
960 when d and e are float vectors into float vector logical
961 insn. In C/C++ without using intrinsics there is no other way
962 to express vector logical operation on float vectors than
963 to cast them temporarily to integer vectors. */
964 if (op1
965 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
966 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
967 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
968 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
969 && SUBREG_BYTE (op1) == 0
970 && (GET_CODE (op2) == CONST_VECTOR
971 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
972 && SUBREG_BYTE (op2) == 0))
973 && can_create_pseudo_p ())
974 {
975 rtx dst;
976 switch (GET_MODE (SUBREG_REG (op1)))
977 {
978 case E_V4SFmode:
979 case E_V8SFmode:
980 case E_V16SFmode:
981 case E_V2DFmode:
982 case E_V4DFmode:
983 case E_V8DFmode:
984 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
985 if (GET_CODE (op2) == CONST_VECTOR)
986 {
987 op2 = gen_lowpart (GET_MODE (dst), op2);
988 op2 = force_reg (GET_MODE (dst), op2);
989 }
990 else
991 {
992 op1 = operands[1];
993 op2 = SUBREG_REG (operands[2]);
994 if (!vector_operand (op2, GET_MODE (dst)))
995 op2 = force_reg (GET_MODE (dst), op2);
996 }
997 op1 = SUBREG_REG (op1);
998 if (!vector_operand (op1, GET_MODE (dst)))
999 op1 = force_reg (GET_MODE (dst), op1);
1000 emit_insn (gen_rtx_SET (dst,
1001 gen_rtx_fmt_ee (code, GET_MODE (dst),
1002 op1, op2)));
1003 emit_move_insn (operands[0], gen_lowpart (mode, dst));
1004 return;
1005 default:
1006 break;
1007 }
1008 }
1009 if (!vector_operand (operands[1], mode))
1010 operands[1] = force_reg (mode, operands[1]);
1011 if (!vector_operand (operands[2], mode))
1012 operands[2] = force_reg (mode, operands[2]);
1013 ix86_fixup_binary_operands_no_copy (code, mode, operands);
1014 emit_insn (gen_rtx_SET (operands[0],
1015 gen_rtx_fmt_ee (code, mode, operands[1],
1016 operands[2])));
1017 }
1018
1019 /* Return TRUE or FALSE depending on whether the binary operator meets the
1020 appropriate constraints. */
1021
1022 bool
1023 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1024 rtx operands[3])
1025 {
1026 rtx dst = operands[0];
1027 rtx src1 = operands[1];
1028 rtx src2 = operands[2];
1029
1030 /* Both source operands cannot be in memory. */
1031 if (MEM_P (src1) && MEM_P (src2))
1032 return false;
1033
1034 /* Canonicalize operand order for commutative operators. */
1035 if (ix86_swap_binary_operands_p (code, mode, operands))
1036 std::swap (src1, src2);
1037
1038 /* If the destination is memory, we must have a matching source operand. */
1039 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1040 return false;
1041
1042 /* Source 1 cannot be a constant. */
1043 if (CONSTANT_P (src1))
1044 return false;
1045
1046 /* Source 1 cannot be a non-matching memory. */
1047 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1048 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1049 return (code == AND
1050 && (mode == HImode
1051 || mode == SImode
1052 || (TARGET_64BIT && mode == DImode))
1053 && satisfies_constraint_L (src2));
1054
1055 return true;
1056 }
1057
1058 /* Attempt to expand a unary operator. Make the expansion closer to the
1059 actual machine than just general_operand, which would allow 2 separate
1060 memory references (one output, one input) in a single insn. */
1061
1062 void
1063 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1064 rtx operands[])
1065 {
1066 bool matching_memory = false;
1067 rtx src, dst, op, clob;
1068
1069 dst = operands[0];
1070 src = operands[1];
1071
1072 /* If the destination is memory, and we do not have matching source
1073 operands, do things in registers. */
1074 if (MEM_P (dst))
1075 {
1076 if (rtx_equal_p (dst, src))
1077 matching_memory = true;
1078 else
1079 dst = gen_reg_rtx (mode);
1080 }
1081
1082 /* When source operand is memory, destination must match. */
1083 if (MEM_P (src) && !matching_memory)
1084 src = force_reg (mode, src);
1085
1086 /* Emit the instruction. */
1087
1088 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1089
1090 if (code == NOT)
1091 emit_insn (op);
1092 else
1093 {
1094 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1095 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1096 }
1097
1098 /* Fix up the destination if needed. */
1099 if (dst != operands[0])
1100 emit_move_insn (operands[0], dst);
1101 }
1102
1103 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
1104
1105 static void
1106 predict_jump (int prob)
1107 {
1108 rtx_insn *insn = get_last_insn ();
1109 gcc_assert (JUMP_P (insn));
1110 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1111 }
1112
1113 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1114 divisor are within the range [0-255]. */
1115
1116 void
1117 ix86_split_idivmod (machine_mode mode, rtx operands[],
1118 bool unsigned_p)
1119 {
1120 rtx_code_label *end_label, *qimode_label;
1121 rtx div, mod;
1122 rtx_insn *insn;
1123 rtx scratch, tmp0, tmp1, tmp2;
1124 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1125
1126 switch (mode)
1127 {
1128 case E_SImode:
1129 if (GET_MODE (operands[0]) == SImode)
1130 {
1131 if (GET_MODE (operands[1]) == SImode)
1132 gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1133 else
1134 gen_divmod4_1
1135 = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1136 }
1137 else
1138 gen_divmod4_1
1139 = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1140 break;
1141
1142 case E_DImode:
1143 gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1144 break;
1145
1146 default:
1147 gcc_unreachable ();
1148 }
1149
1150 end_label = gen_label_rtx ();
1151 qimode_label = gen_label_rtx ();
1152
1153 scratch = gen_reg_rtx (mode);
1154
1155 /* Use 8bit unsigned divmod if the dividend and divisor are within
1156 the range [0-255]. */
1157 emit_move_insn (scratch, operands[2]);
1158 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1159 scratch, 1, OPTAB_DIRECT);
1160 emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
1161 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1162 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1163 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1164 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1165 pc_rtx);
1166 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1167 predict_jump (REG_BR_PROB_BASE * 50 / 100);
1168 JUMP_LABEL (insn) = qimode_label;
1169
1170 /* Generate the original signed/unsigned divmod. */
1171 div = gen_divmod4_1 (operands[0], operands[1],
1172 operands[2], operands[3]);
1173 emit_insn (div);
1174
1175 /* Branch to the end. */
1176 emit_jump_insn (gen_jump (end_label));
1177 emit_barrier ();
1178
1179 /* Generate 8bit unsigned divide. */
1180 emit_label (qimode_label);
1181 /* Don't use operands[0] for result of 8bit divide since not all
1182 registers support QImode ZERO_EXTRACT. */
1183 tmp0 = lowpart_subreg (HImode, scratch, mode);
1184 tmp1 = lowpart_subreg (HImode, operands[2], mode);
1185 tmp2 = lowpart_subreg (QImode, operands[3], mode);
1186 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1187
1188 if (unsigned_p)
1189 {
1190 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1191 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1192 }
1193 else
1194 {
1195 div = gen_rtx_DIV (mode, operands[2], operands[3]);
1196 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1197 }
1198 if (mode == SImode)
1199 {
1200 if (GET_MODE (operands[0]) != SImode)
1201 div = gen_rtx_ZERO_EXTEND (DImode, div);
1202 if (GET_MODE (operands[1]) != SImode)
1203 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1204 }
1205
1206 /* Extract remainder from AH. */
1207 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
1208 tmp0, GEN_INT (8), GEN_INT (8));
1209 if (REG_P (operands[1]))
1210 insn = emit_move_insn (operands[1], tmp1);
1211 else
1212 {
1213 /* Need a new scratch register since the old one has result
1214 of 8bit divide. */
1215 scratch = gen_reg_rtx (GET_MODE (operands[1]));
1216 emit_move_insn (scratch, tmp1);
1217 insn = emit_move_insn (operands[1], scratch);
1218 }
1219 set_unique_reg_note (insn, REG_EQUAL, mod);
1220
1221 /* Zero extend quotient from AL. */
1222 tmp1 = gen_lowpart (QImode, tmp0);
1223 insn = emit_insn (gen_extend_insn
1224 (operands[0], tmp1,
1225 GET_MODE (operands[0]), QImode, 1));
1226 set_unique_reg_note (insn, REG_EQUAL, div);
1227
1228 emit_label (end_label);
1229 }
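
/* The code emitted above is roughly of the shape
     mov   dividend, scratch
     or    divisor, scratch
     test  $-256, scratch          # any bit above bit 7 set in either?
     je    .Lqimode
     ...full 32/64-bit div/idiv...
     jmp   .Ldone
   .Lqimode:
     ...8-bit divb, quotient in AL and remainder in AH...
   .Ldone:
   (an illustrative sketch, not the literal assembly).  */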
1230
1231 /* Emit x86 binary operator CODE in mode MODE, where the first operand
1232 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
1233
1234 void
1235 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1236 rtx dst, rtx src)
1237 {
1238 rtx op, clob;
1239
1240 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1241 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1242
1243 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1244 }
1245
1246 /* Return true if the def of REGNO1 is nearest to INSN. */
1247
1248 static bool
1249 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1250 {
1251 rtx_insn *prev = insn;
1252 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1253
1254 if (insn == start)
1255 return false;
1256 while (prev && prev != start)
1257 {
1258 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1259 {
1260 prev = PREV_INSN (prev);
1261 continue;
1262 }
1263 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1264 return true;
1265 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1266 return false;
1267 prev = PREV_INSN (prev);
1268 }
1269
1270 /* None of the regs is defined in the bb. */
1271 return false;
1272 }
1273
1274 /* Split lea instructions into a sequence of instructions
1275 which are executed on the ALU to avoid AGU stalls.
1276 It is assumed that the flags register may be clobbered
1277 at the position of the lea. */
1278
1279 void
1280 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1281 {
1282 unsigned int regno0, regno1, regno2;
1283 struct ix86_address parts;
1284 rtx target, tmp;
1285 int ok, adds;
1286
1287 ok = ix86_decompose_address (operands[1], &parts);
1288 gcc_assert (ok);
1289
1290 target = gen_lowpart (mode, operands[0]);
1291
1292 regno0 = true_regnum (target);
1293 regno1 = INVALID_REGNUM;
1294 regno2 = INVALID_REGNUM;
1295
1296 if (parts.base)
1297 {
1298 parts.base = gen_lowpart (mode, parts.base);
1299 regno1 = true_regnum (parts.base);
1300 }
1301
1302 if (parts.index)
1303 {
1304 parts.index = gen_lowpart (mode, parts.index);
1305 regno2 = true_regnum (parts.index);
1306 }
1307
1308 if (parts.disp)
1309 parts.disp = gen_lowpart (mode, parts.disp);
1310
1311 if (parts.scale > 1)
1312 {
1313 /* Case r1 = r1 + ... */
1314 if (regno1 == regno0)
1315 {
1316 /* If we have the case r1 = r1 + C * r2 then we
1317 would have to use multiplication, which is very
1318 expensive. Assume the cost model is wrong if we
1319 get such a case here. */
1320 gcc_assert (regno2 != regno0);
1321
1322 for (adds = parts.scale; adds > 0; adds--)
1323 ix86_emit_binop (PLUS, mode, target, parts.index);
1324 }
1325 else
1326 {
1327 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1328 if (regno0 != regno2)
1329 emit_insn (gen_rtx_SET (target, parts.index));
1330
1331 /* Use shift for scaling. */
1332 ix86_emit_binop (ASHIFT, mode, target,
1333 GEN_INT (exact_log2 (parts.scale)));
1334
1335 if (parts.base)
1336 ix86_emit_binop (PLUS, mode, target, parts.base);
1337
1338 if (parts.disp && parts.disp != const0_rtx)
1339 ix86_emit_binop (PLUS, mode, target, parts.disp);
1340 }
1341 }
1342 else if (!parts.base && !parts.index)
1343 {
1344 gcc_assert (parts.disp);
1345 emit_insn (gen_rtx_SET (target, parts.disp));
1346 }
1347 else
1348 {
1349 if (!parts.base)
1350 {
1351 if (regno0 != regno2)
1352 emit_insn (gen_rtx_SET (target, parts.index));
1353 }
1354 else if (!parts.index)
1355 {
1356 if (regno0 != regno1)
1357 emit_insn (gen_rtx_SET (target, parts.base));
1358 }
1359 else
1360 {
1361 if (regno0 == regno1)
1362 tmp = parts.index;
1363 else if (regno0 == regno2)
1364 tmp = parts.base;
1365 else
1366 {
1367 rtx tmp1;
1368
1369 /* Find better operand for SET instruction, depending
1370 on which definition is farther from the insn. */
1371 if (find_nearest_reg_def (insn, regno1, regno2))
1372 tmp = parts.index, tmp1 = parts.base;
1373 else
1374 tmp = parts.base, tmp1 = parts.index;
1375
1376 emit_insn (gen_rtx_SET (target, tmp));
1377
1378 if (parts.disp && parts.disp != const0_rtx)
1379 ix86_emit_binop (PLUS, mode, target, parts.disp);
1380
1381 ix86_emit_binop (PLUS, mode, target, tmp1);
1382 return;
1383 }
1384
1385 ix86_emit_binop (PLUS, mode, target, tmp);
1386 }
1387
1388 if (parts.disp && parts.disp != const0_rtx)
1389 ix86_emit_binop (PLUS, mode, target, parts.disp);
1390 }
1391 }
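
/* For example, "lea 4(%eax,%ebx,4), %ecx" may be rewritten by
   ix86_split_lea_for_addr as roughly
     movl %ebx, %ecx
     sall $2, %ecx
     addl %eax, %ecx
     addl $4, %ecx
   trading one AGU operation for a short ALU sequence (illustrative).  */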
1392
1393 /* Post-reload splitter for converting an SF or DFmode value in an
1394 SSE register into an unsigned SImode. */
1395
1396 void
1397 ix86_split_convert_uns_si_sse (rtx operands[])
1398 {
1399 machine_mode vecmode;
1400 rtx value, large, zero_or_two31, input, two31, x;
1401
1402 large = operands[1];
1403 zero_or_two31 = operands[2];
1404 input = operands[3];
1405 two31 = operands[4];
1406 vecmode = GET_MODE (large);
1407 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
1408
1409 /* Load up the value into the low element. We must ensure that the other
1410 elements are valid floats -- zero is the easiest such value. */
1411 if (MEM_P (input))
1412 {
1413 if (vecmode == V4SFmode)
1414 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1415 else
1416 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1417 }
1418 else
1419 {
1420 input = gen_rtx_REG (vecmode, REGNO (input));
1421 emit_move_insn (value, CONST0_RTX (vecmode));
1422 if (vecmode == V4SFmode)
1423 emit_insn (gen_sse_movss (value, value, input));
1424 else
1425 emit_insn (gen_sse2_movsd (value, value, input));
1426 }
1427
1428 emit_move_insn (large, two31);
1429 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
1430
1431 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
1432 emit_insn (gen_rtx_SET (large, x));
1433
1434 x = gen_rtx_AND (vecmode, zero_or_two31, large);
1435 emit_insn (gen_rtx_SET (zero_or_two31, x));
1436
1437 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1438 emit_insn (gen_rtx_SET (value, x));
1439
1440 large = gen_rtx_REG (V4SImode, REGNO (large));
1441 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1442
1443 x = gen_rtx_REG (V4SImode, REGNO (value));
1444 if (vecmode == V4SFmode)
1445 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1446 else
1447 emit_insn (gen_sse2_cvttpd2dq (x, value));
1448 value = x;
1449
1450 emit_insn (gen_xorv4si3 (value, value, large));
1451 }
1452
1453 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1454 machine_mode mode, rtx target,
1455 rtx var, int one_var);
1456
1457 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1458 Expects the 64-bit DImode to be supplied in a pair of integral
1459 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1460 -mfpmath=sse, !optimize_size only. */
1461
1462 void
1463 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1464 {
1465 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1466 rtx int_xmm, fp_xmm;
1467 rtx biases, exponents;
1468 rtx x;
1469
1470 int_xmm = gen_reg_rtx (V4SImode);
1471 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1472 emit_insn (gen_movdi_to_sse (int_xmm, input));
1473 else if (TARGET_SSE_SPLIT_REGS)
1474 {
1475 emit_clobber (int_xmm);
1476 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1477 }
1478 else
1479 {
1480 x = gen_reg_rtx (V2DImode);
1481 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1482 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1483 }
1484
1485 x = gen_rtx_CONST_VECTOR (V4SImode,
1486 gen_rtvec (4, GEN_INT (0x43300000UL),
1487 GEN_INT (0x45300000UL),
1488 const0_rtx, const0_rtx));
1489 exponents = validize_mem (force_const_mem (V4SImode, x));
1490
1491 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1492 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1493
1494 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1495 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1496 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1497 (0x1.0p84 + double(fp_value_hi_xmm)).
1498 Note these exponents differ by 32. */
1499
1500 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1501
1502 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1503 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1504 real_ldexp (&bias_lo_rvt, &dconst1, 52);
1505 real_ldexp (&bias_hi_rvt, &dconst1, 84);
1506 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1507 x = const_double_from_real_value (bias_hi_rvt, DFmode);
1508 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1509 biases = validize_mem (force_const_mem (V2DFmode, biases));
1510 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1511
1512 /* Add the upper and lower DFmode values together. */
1513 if (TARGET_SSE3)
1514 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1515 else
1516 {
1517 x = copy_to_mode_reg (V2DFmode, fp_xmm);
1518 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1519 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1520 }
1521
1522 ix86_expand_vector_extract (false, target, fp_xmm, 0);
1523 }
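
/* An illustrative, self-contained sketch of the same bias arithmetic in
   scalar C, shown only to clarify the interleave/subtract steps above
   (assumes IEEE-754 doubles, round-to-nearest, and C99 hexadecimal
   floating constants; the function name is made up for exposition):  */

static double
u64_to_double_via_bias (unsigned long long x)
{
  union { unsigned long long i; double d; } lo, hi;
  /* lo.d == 2^52 + (low 32 bits of X), exactly representable.  */
  lo.i = (0x43300000ULL << 32) | (x & 0xffffffffULL);
  /* hi.d == 2^84 + (high 32 bits of X) * 2^32, exactly representable.  */
  hi.i = (0x45300000ULL << 32) | (x >> 32);
  /* Both subtractions are exact; only the final addition rounds.  */
  return (hi.d - 0x1.0p84) + (lo.d - 0x1.0p52);
}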
1524
1525 /* Not used, but eases macroization of patterns. */
1526 void
1527 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1528 {
1529 gcc_unreachable ();
1530 }
1531
1532 /* Convert an unsigned SImode value into a DFmode. Only currently used
1533 for SSE, but applicable anywhere. */
1534
1535 void
1536 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1537 {
1538 REAL_VALUE_TYPE TWO31r;
1539 rtx x, fp;
1540
1541 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1542 NULL, 1, OPTAB_DIRECT);
1543
1544 fp = gen_reg_rtx (DFmode);
1545 emit_insn (gen_floatsidf2 (fp, x));
1546
1547 real_ldexp (&TWO31r, &dconst1, 31);
1548 x = const_double_from_real_value (TWO31r, DFmode);
1549
1550 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1551 if (x != target)
1552 emit_move_insn (target, x);
1553 }
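
/* In other words, (double) u is computed as (double) (int) (u - 2^31) + 2^31.
   E.g. u = 3000000000 wraps to 852516352 as a signed int, and
   852516352.0 + 2147483648.0 gives 3000000000.0 again (worked example
   for illustration).  */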
1554
1555 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1556 32-bit mode; otherwise we have a direct convert instruction. */
1557
1558 void
1559 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1560 {
1561 REAL_VALUE_TYPE TWO32r;
1562 rtx fp_lo, fp_hi, x;
1563
1564 fp_lo = gen_reg_rtx (DFmode);
1565 fp_hi = gen_reg_rtx (DFmode);
1566
1567 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1568
1569 real_ldexp (&TWO32r, &dconst1, 32);
1570 x = const_double_from_real_value (TWO32r, DFmode);
1571 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1572
1573 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1574
1575 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1576 0, OPTAB_DIRECT);
1577 if (x != target)
1578 emit_move_insn (target, x);
1579 }
1580
1581 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1582 For x86_32, -mfpmath=sse, !optimize_size only. */
1583 void
1584 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1585 {
1586 REAL_VALUE_TYPE ONE16r;
1587 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1588
1589 real_ldexp (&ONE16r, &dconst1, 16);
1590 x = const_double_from_real_value (ONE16r, SFmode);
1591 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1592 NULL, 0, OPTAB_DIRECT);
1593 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1594 NULL, 0, OPTAB_DIRECT);
1595 fp_hi = gen_reg_rtx (SFmode);
1596 fp_lo = gen_reg_rtx (SFmode);
1597 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1598 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1599 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1600 0, OPTAB_DIRECT);
1601 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1602 0, OPTAB_DIRECT);
1603 if (!rtx_equal_p (target, fp_hi))
1604 emit_move_insn (target, fp_hi);
1605 }
1606
1607 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1608 a vector of unsigned ints VAL to vector of floats TARGET. */
1609
1610 void
1611 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1612 {
1613 rtx tmp[8];
1614 REAL_VALUE_TYPE TWO16r;
1615 machine_mode intmode = GET_MODE (val);
1616 machine_mode fltmode = GET_MODE (target);
1617 rtx (*cvt) (rtx, rtx);
1618
1619 if (intmode == V4SImode)
1620 cvt = gen_floatv4siv4sf2;
1621 else
1622 cvt = gen_floatv8siv8sf2;
1623 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1624 tmp[0] = force_reg (intmode, tmp[0]);
1625 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1626 OPTAB_DIRECT);
1627 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1628 NULL_RTX, 1, OPTAB_DIRECT);
1629 tmp[3] = gen_reg_rtx (fltmode);
1630 emit_insn (cvt (tmp[3], tmp[1]));
1631 tmp[4] = gen_reg_rtx (fltmode);
1632 emit_insn (cvt (tmp[4], tmp[2]));
1633 real_ldexp (&TWO16r, &dconst1, 16);
1634 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1635 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1636 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
1637 OPTAB_DIRECT);
1638 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
1639 OPTAB_DIRECT);
1640 if (tmp[7] != target)
1641 emit_move_insn (target, tmp[7]);
1642 }
1643
1644 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1645 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1646 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1647 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1648
1649 rtx
1650 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
1651 {
1652 REAL_VALUE_TYPE TWO31r;
1653 rtx two31r, tmp[4];
1654 machine_mode mode = GET_MODE (val);
1655 machine_mode scalarmode = GET_MODE_INNER (mode);
1656 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
1657 rtx (*cmp) (rtx, rtx, rtx, rtx);
1658 int i;
1659
1660 for (i = 0; i < 3; i++)
1661 tmp[i] = gen_reg_rtx (mode);
1662 real_ldexp (&TWO31r, &dconst1, 31);
1663 two31r = const_double_from_real_value (TWO31r, scalarmode);
1664 two31r = ix86_build_const_vector (mode, 1, two31r);
1665 two31r = force_reg (mode, two31r);
1666 switch (mode)
1667 {
1668 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
1669 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
1670 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
1671 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
1672 default: gcc_unreachable ();
1673 }
1674 tmp[3] = gen_rtx_LE (mode, two31r, val);
1675 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
1676 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
1677 0, OPTAB_DIRECT);
1678 if (intmode == V4SImode || TARGET_AVX2)
1679 *xorp = expand_simple_binop (intmode, ASHIFT,
1680 gen_lowpart (intmode, tmp[0]),
1681 GEN_INT (31), NULL_RTX, 0,
1682 OPTAB_DIRECT);
1683 else
1684 {
1685 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
1686 two31 = ix86_build_const_vector (intmode, 1, two31);
1687 *xorp = expand_simple_binop (intmode, AND,
1688 gen_lowpart (intmode, tmp[0]),
1689 two31, NULL_RTX, 0,
1690 OPTAB_DIRECT);
1691 }
1692 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
1693 0, OPTAB_DIRECT);
1694 }
1695
1696 /* Generate code for floating point ABS or NEG. */
1697
1698 void
1699 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1700 rtx operands[])
1701 {
1702 rtx set, dst, src;
1703 bool use_sse = false;
1704 bool vector_mode = VECTOR_MODE_P (mode);
1705 machine_mode vmode = mode;
1706 rtvec par;
1707
1708 if (vector_mode)
1709 use_sse = true;
1710 else if (mode == TFmode)
1711 use_sse = true;
1712 else if (TARGET_SSE_MATH)
1713 {
1714 use_sse = SSE_FLOAT_MODE_P (mode);
1715 if (mode == SFmode)
1716 vmode = V4SFmode;
1717 else if (mode == DFmode)
1718 vmode = V2DFmode;
1719 }
1720
1721 dst = operands[0];
1722 src = operands[1];
1723
1724 set = gen_rtx_fmt_e (code, mode, src);
1725 set = gen_rtx_SET (dst, set);
1726
1727 if (use_sse)
1728 {
1729 rtx mask, use, clob;
1730
1731 /* NEG and ABS performed with SSE use bitwise mask operations.
1732 Create the appropriate mask now. */
1733 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
1734 use = gen_rtx_USE (VOIDmode, mask);
1735 if (vector_mode)
1736 par = gen_rtvec (2, set, use);
1737 else
1738 {
1739 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1740 par = gen_rtvec (3, set, use, clob);
1741 }
1742 }
1743 else
1744 {
1745 rtx clob;
1746
1747 /* Changing the sign of FP values can also be done using the integer unit. */
1748 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1749 par = gen_rtvec (2, set, clob);
1750 }
1751
1752 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1753 }
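
/* With SSE, NEG and ABS become bitwise operations on the sign bit; for
   SFmode the typical output is roughly
     xorps .LC0(%rip), %xmm0   # NEG: flip the sign bit
     andps .LC1(%rip), %xmm0   # ABS: clear the sign bit
   with the mask taken from the constant pool (illustrative sketch).  */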
1754
1755 /* Deconstruct a floating point ABS or NEG operation
1756 with integer registers into integer operations. */
1757
1758 void
1759 ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
1760 rtx operands[])
1761 {
1762 enum rtx_code absneg_op;
1763 rtx dst, set;
1764
1765 gcc_assert (operands_match_p (operands[0], operands[1]));
1766
1767 switch (mode)
1768 {
1769 case E_SFmode:
1770 dst = gen_lowpart (SImode, operands[0]);
1771
1772 if (code == ABS)
1773 {
1774 set = gen_int_mode (0x7fffffff, SImode);
1775 absneg_op = AND;
1776 }
1777 else
1778 {
1779 set = gen_int_mode (0x80000000, SImode);
1780 absneg_op = XOR;
1781 }
1782 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1783 break;
1784
1785 case E_DFmode:
1786 if (TARGET_64BIT)
1787 {
1788 dst = gen_lowpart (DImode, operands[0]);
1789 dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
1790
1791 if (code == ABS)
1792 set = const0_rtx;
1793 else
1794 set = gen_rtx_NOT (DImode, dst);
1795 }
1796 else
1797 {
1798 dst = gen_highpart (SImode, operands[0]);
1799
1800 if (code == ABS)
1801 {
1802 set = gen_int_mode (0x7fffffff, SImode);
1803 absneg_op = AND;
1804 }
1805 else
1806 {
1807 set = gen_int_mode (0x80000000, SImode);
1808 absneg_op = XOR;
1809 }
1810 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1811 }
1812 break;
1813
1814 case E_XFmode:
1815 dst = gen_rtx_REG (SImode,
1816 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
1817 if (code == ABS)
1818 {
1819 set = GEN_INT (0x7fff);
1820 absneg_op = AND;
1821 }
1822 else
1823 {
1824 set = GEN_INT (0x8000);
1825 absneg_op = XOR;
1826 }
1827 set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
1828 break;
1829
1830 default:
1831 gcc_unreachable ();
1832 }
1833
1834 set = gen_rtx_SET (dst, set);
1835
1836 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1837 rtvec par = gen_rtvec (2, set, clob);
1838
1839 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
1840 }
1841
1842 /* Expand a copysign operation. Special case operand 0 being a constant. */
1843
1844 void
1845 ix86_expand_copysign (rtx operands[])
1846 {
1847 machine_mode mode, vmode;
1848 rtx dest, op0, op1, mask;
1849
1850 dest = operands[0];
1851 op0 = operands[1];
1852 op1 = operands[2];
1853
1854 mode = GET_MODE (dest);
1855
1856 if (mode == SFmode)
1857 vmode = V4SFmode;
1858 else if (mode == DFmode)
1859 vmode = V2DFmode;
1860 else if (mode == TFmode)
1861 vmode = mode;
1862 else
1863 gcc_unreachable ();
1864
1865 mask = ix86_build_signbit_mask (vmode, 0, 0);
1866
1867 if (CONST_DOUBLE_P (op0))
1868 {
1869 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
1870 op0 = simplify_unary_operation (ABS, mode, op0, mode);
1871
1872 if (mode == SFmode || mode == DFmode)
1873 {
1874 if (op0 == CONST0_RTX (mode))
1875 op0 = CONST0_RTX (vmode);
1876 else
1877 {
1878 rtx v = ix86_build_const_vector (vmode, false, op0);
1879
1880 op0 = force_reg (vmode, v);
1881 }
1882 }
1883 else if (op0 != CONST0_RTX (mode))
1884 op0 = force_reg (mode, op0);
1885
1886 emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
1887 }
1888 else
1889 {
1890 rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
1891
1892 emit_insn (gen_copysign3_var
1893 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
1894 }
1895 }
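
/* Conceptually copysign (x, y) is expanded as
     (x & ~signbit) | (y & signbit)
   which for DFmode typically ends up as an andpd/andnpd/orpd sequence,
   or just andpd/orpd when X is a known constant (illustrative only).  */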
1896
1897 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1898 be a constant, and so has already been expanded into a vector constant. */
1899
1900 void
1901 ix86_split_copysign_const (rtx operands[])
1902 {
1903 machine_mode mode, vmode;
1904 rtx dest, op0, mask, x;
1905
1906 dest = operands[0];
1907 op0 = operands[1];
1908 mask = operands[3];
1909
1910 mode = GET_MODE (dest);
1911 vmode = GET_MODE (mask);
1912
1913 dest = lowpart_subreg (vmode, dest, mode);
1914 x = gen_rtx_AND (vmode, dest, mask);
1915 emit_insn (gen_rtx_SET (dest, x));
1916
1917 if (op0 != CONST0_RTX (vmode))
1918 {
1919 x = gen_rtx_IOR (vmode, dest, op0);
1920 emit_insn (gen_rtx_SET (dest, x));
1921 }
1922 }
1923
1924 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1925 so we have to do two masks. */
1926
1927 void
1928 ix86_split_copysign_var (rtx operands[])
1929 {
1930 machine_mode mode, vmode;
1931 rtx dest, scratch, op0, op1, mask, nmask, x;
1932
1933 dest = operands[0];
1934 scratch = operands[1];
1935 op0 = operands[2];
1936 op1 = operands[3];
1937 nmask = operands[4];
1938 mask = operands[5];
1939
1940 mode = GET_MODE (dest);
1941 vmode = GET_MODE (mask);
1942
1943 if (rtx_equal_p (op0, op1))
1944 {
1945 /* Shouldn't happen often (it's useless, obviously), but when it does
1946 we'd generate incorrect code if we continue below. */
1947 emit_move_insn (dest, op0);
1948 return;
1949 }
1950
1951 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
1952 {
1953 gcc_assert (REGNO (op1) == REGNO (scratch));
1954
1955 x = gen_rtx_AND (vmode, scratch, mask);
1956 emit_insn (gen_rtx_SET (scratch, x));
1957
1958 dest = mask;
1959 op0 = lowpart_subreg (vmode, op0, mode);
1960 x = gen_rtx_NOT (vmode, dest);
1961 x = gen_rtx_AND (vmode, x, op0);
1962 emit_insn (gen_rtx_SET (dest, x));
1963 }
1964 else
1965 {
1966 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
1967 {
1968 x = gen_rtx_AND (vmode, scratch, mask);
1969 }
1970 else /* alternative 2,4 */
1971 {
1972 gcc_assert (REGNO (mask) == REGNO (scratch));
1973 op1 = lowpart_subreg (vmode, op1, mode);
1974 x = gen_rtx_AND (vmode, scratch, op1);
1975 }
1976 emit_insn (gen_rtx_SET (scratch, x));
1977
1978 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
1979 {
1980 dest = lowpart_subreg (vmode, op0, mode);
1981 x = gen_rtx_AND (vmode, dest, nmask);
1982 }
1983 else /* alternative 3,4 */
1984 {
1985 gcc_assert (REGNO (nmask) == REGNO (dest));
1986 dest = nmask;
1987 op0 = lowpart_subreg (vmode, op0, mode);
1988 x = gen_rtx_AND (vmode, dest, op0);
1989 }
1990 emit_insn (gen_rtx_SET (dest, x));
1991 }
1992
1993 x = gen_rtx_IOR (vmode, dest, scratch);
1994 emit_insn (gen_rtx_SET (dest, x));
1995 }
1996
1997 /* Expand an xorsign operation. */
1998
1999 void
2000 ix86_expand_xorsign (rtx operands[])
2001 {
2002 machine_mode mode, vmode;
2003 rtx dest, op0, op1, mask;
2004
2005 dest = operands[0];
2006 op0 = operands[1];
2007 op1 = operands[2];
2008
2009 mode = GET_MODE (dest);
2010
2011 if (mode == SFmode)
2012 vmode = V4SFmode;
2013 else if (mode == DFmode)
2014 vmode = V2DFmode;
2015 else
2016 gcc_unreachable ();
2017
2018 mask = ix86_build_signbit_mask (vmode, 0, 0);
2019
2020 emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
2021 }
2022
2023 /* Deconstruct an xorsign operation into bit masks. */
2024
2025 void
2026 ix86_split_xorsign (rtx operands[])
2027 {
2028 machine_mode mode, vmode;
2029 rtx dest, op0, mask, x;
2030
2031 dest = operands[0];
2032 op0 = operands[1];
2033 mask = operands[3];
2034
2035 mode = GET_MODE (dest);
2036 vmode = GET_MODE (mask);
2037
2038 dest = lowpart_subreg (vmode, dest, mode);
2039 x = gen_rtx_AND (vmode, dest, mask);
2040 emit_insn (gen_rtx_SET (dest, x));
2041
2042 op0 = lowpart_subreg (vmode, op0, mode);
2043 x = gen_rtx_XOR (vmode, dest, op0);
2044 emit_insn (gen_rtx_SET (dest, x));
2045 }
2046
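/* For reference, the identity behind the xorsign expansion above
   (a sketch): xorsign (x, y) = x ^ (y & SIGN_MASK), i.e. x with its
   sign bit flipped iff y is negative.  */
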
2047 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2048
2049 void
2050 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2051 {
2052 machine_mode mode = GET_MODE (op0);
2053 rtx tmp;
2054
2055   /* Handle the special case of a vector comparison with a boolean result;
2056      transform it using the ptest instruction.  */
2057 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
2058 {
2059 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
2060 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
2061
2062 gcc_assert (code == EQ || code == NE);
2063 /* Generate XOR since we can't check that one operand is zero vector. */
2064 tmp = gen_reg_rtx (mode);
2065 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
2066 tmp = gen_lowpart (p_mode, tmp);
2067 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
2068 gen_rtx_UNSPEC (CCmode,
2069 gen_rtvec (2, tmp, tmp),
2070 UNSPEC_PTEST)));
2071 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2072 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2073 gen_rtx_LABEL_REF (VOIDmode, label),
2074 pc_rtx);
2075 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2076 return;
2077 }
2078
2079 switch (mode)
2080 {
2081 case E_SFmode:
2082 case E_DFmode:
2083 case E_XFmode:
2084 case E_QImode:
2085 case E_HImode:
2086 case E_SImode:
2087 simple:
2088 tmp = ix86_expand_compare (code, op0, op1);
2089 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2090 gen_rtx_LABEL_REF (VOIDmode, label),
2091 pc_rtx);
2092 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2093 return;
2094
2095 case E_DImode:
2096 if (TARGET_64BIT)
2097 goto simple;
2098 	  /* For a 32-bit target, a DImode comparison may be performed on
2099 	     SSE registers.  To allow this we should avoid splitting it
2100 	     into SImode, which is achieved by doing the xor in DImode
2101 	     and then comparing with zero (which is recognized by the
2102 	     STV pass).  We don't compare using xor when optimizing
2103 	     for size.  */
2104 if (!optimize_insn_for_size_p ()
2105 && TARGET_STV
2106 && (code == EQ || code == NE))
2107 {
2108 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
2109 op1 = const0_rtx;
2110 }
2111 /* FALLTHRU */
2112 case E_TImode:
2113 /* Expand DImode branch into multiple compare+branch. */
2114 {
2115 rtx lo[2], hi[2];
2116 rtx_code_label *label2;
2117 enum rtx_code code1, code2, code3;
2118 machine_mode submode;
2119
2120 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2121 {
2122 std::swap (op0, op1);
2123 code = swap_condition (code);
2124 }
2125
2126 split_double_mode (mode, &op0, 1, lo+0, hi+0);
2127 split_double_mode (mode, &op1, 1, lo+1, hi+1);
2128
2129 submode = mode == DImode ? SImode : DImode;
2130
2131 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2132 avoid two branches. This costs one extra insn, so disable when
2133 optimizing for size. */
2134
2135 if ((code == EQ || code == NE)
2136 && (!optimize_insn_for_size_p ()
2137 || hi[1] == const0_rtx || lo[1] == const0_rtx))
2138 {
2139 rtx xor0, xor1;
2140
2141 xor1 = hi[0];
2142 if (hi[1] != const0_rtx)
2143 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
2144 NULL_RTX, 0, OPTAB_WIDEN);
2145
2146 xor0 = lo[0];
2147 if (lo[1] != const0_rtx)
2148 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
2149 NULL_RTX, 0, OPTAB_WIDEN);
2150
2151 tmp = expand_binop (submode, ior_optab, xor1, xor0,
2152 NULL_RTX, 0, OPTAB_WIDEN);
2153
2154 ix86_expand_branch (code, tmp, const0_rtx, label);
2155 return;
2156 }
2157
2158 /* Otherwise, if we are doing less-than or greater-than-or-equal,
2159 op1 is a constant and the low word is zero, then we can just
2160 examine the high word.  Similarly for a low word of -1 and
2161 less-than-or-equal or greater-than.  */
2162
2163 if (CONST_INT_P (hi[1]))
2164 switch (code)
2165 {
2166 case LT: case LTU: case GE: case GEU:
2167 if (lo[1] == const0_rtx)
2168 {
2169 ix86_expand_branch (code, hi[0], hi[1], label);
2170 return;
2171 }
2172 break;
2173 case LE: case LEU: case GT: case GTU:
2174 if (lo[1] == constm1_rtx)
2175 {
2176 ix86_expand_branch (code, hi[0], hi[1], label);
2177 return;
2178 }
2179 break;
2180 default:
2181 break;
2182 }
2183
2184 /* Emulate comparisons that do not depend on Zero flag with
2185 double-word subtraction. Note that only Overflow, Sign
2186 and Carry flags are valid, so swap arguments and condition
2187 of comparisons that would otherwise test Zero flag. */
2188
2189 switch (code)
2190 {
2191 case LE: case LEU: case GT: case GTU:
2192 std::swap (lo[0], lo[1]);
2193 std::swap (hi[0], hi[1]);
2194 code = swap_condition (code);
2195 /* FALLTHRU */
2196
2197 case LT: case LTU: case GE: case GEU:
2198 {
2199 bool uns = (code == LTU || code == GEU);
2200 rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2201 = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2202
2203 if (!nonimmediate_operand (lo[0], submode))
2204 lo[0] = force_reg (submode, lo[0]);
2205 if (!x86_64_general_operand (lo[1], submode))
2206 lo[1] = force_reg (submode, lo[1]);
2207
2208 if (!register_operand (hi[0], submode))
2209 hi[0] = force_reg (submode, hi[0]);
2210 if ((uns && !nonimmediate_operand (hi[1], submode))
2211 || (!uns && !x86_64_general_operand (hi[1], submode)))
2212 hi[1] = force_reg (submode, hi[1]);
2213
2214 emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2215
2216 tmp = gen_rtx_SCRATCH (submode);
2217 emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2218
2219 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2220 ix86_expand_branch (code, tmp, const0_rtx, label);
2221 return;
2222 }
2223
2224 default:
2225 break;
2226 }
2227
2228 /* Otherwise, we need two or three jumps. */
2229
2230 label2 = gen_label_rtx ();
2231
2232 code1 = code;
2233 code2 = swap_condition (code);
2234 code3 = unsigned_condition (code);
2235
2236 switch (code)
2237 {
2238 case LT: case GT: case LTU: case GTU:
2239 break;
2240
2241 case LE: code1 = LT; code2 = GT; break;
2242 case GE: code1 = GT; code2 = LT; break;
2243 case LEU: code1 = LTU; code2 = GTU; break;
2244 case GEU: code1 = GTU; code2 = LTU; break;
2245
2246 case EQ: code1 = UNKNOWN; code2 = NE; break;
2247 case NE: code2 = UNKNOWN; break;
2248
2249 default:
2250 gcc_unreachable ();
2251 }
2252
2253 /*
2254 * a < b =>
2255 * if (hi(a) < hi(b)) goto true;
2256 * if (hi(a) > hi(b)) goto false;
2257 * if (lo(a) < lo(b)) goto true;
2258 * false:
2259 */
2260
2261 if (code1 != UNKNOWN)
2262 ix86_expand_branch (code1, hi[0], hi[1], label);
2263 if (code2 != UNKNOWN)
2264 ix86_expand_branch (code2, hi[0], hi[1], label2);
2265
2266 ix86_expand_branch (code3, lo[0], lo[1], label);
2267
2268 if (code2 != UNKNOWN)
2269 emit_label (label2);
2270 return;
2271 }
2272
2273 default:
2274 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2275 goto simple;
2276 }
2277 }
2278
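/* An illustration of the double-word expansions above, in C-like terms
   (hi()/lo() are just shorthand for the two halves here; a sketch):

     a == b   <=>   ((hi (a) ^ hi (b)) | (lo (a) ^ lo (b))) == 0

   and an ordering test such as a < b is emulated by the subtraction
   lo (a) - lo (b) followed by an sbb of the high halves, branching on
   the resulting Sign/Overflow (or Carry) flags only.  */
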
2279 /* Figure out whether to use unordered fp comparisons. */
2280
2281 static bool
2282 ix86_unordered_fp_compare (enum rtx_code code)
2283 {
2284 if (!TARGET_IEEE_FP)
2285 return false;
2286
2287 switch (code)
2288 {
2289 case LT:
2290 case LE:
2291 case GT:
2292 case GE:
2293 case LTGT:
2294 return false;
2295
2296 case EQ:
2297 case NE:
2298
2299 case UNORDERED:
2300 case ORDERED:
2301 case UNLT:
2302 case UNLE:
2303 case UNGT:
2304 case UNGE:
2305 case UNEQ:
2306 return true;
2307
2308 default:
2309 gcc_unreachable ();
2310 }
2311 }
2312
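/* For example, under TARGET_IEEE_FP an equality test must not trap on
   quiet NaNs, so EQ/NE, ORDERED/UNORDERED and the UNxx codes use the
   quiet (ucomiss-style) compare, while LT/LE/GT/GE/LTGT may use the
   signaling (comiss-style) compare.  */
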
2313 /* Return a comparison we can do that is equivalent to
2314 swap_condition (code), except possibly for orderedness.
2315 Never change orderedness if TARGET_IEEE_FP, returning
2316 UNKNOWN in that case if necessary. */
2317
2318 static enum rtx_code
2319 ix86_fp_swap_condition (enum rtx_code code)
2320 {
2321 switch (code)
2322 {
2323 case GT: /* GTU - CF=0 & ZF=0 */
2324 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2325 case GE: /* GEU - CF=0 */
2326 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2327 case UNLT: /* LTU - CF=1 */
2328 return TARGET_IEEE_FP ? UNKNOWN : GT;
2329 case UNLE: /* LEU - CF=1 | ZF=1 */
2330 return TARGET_IEEE_FP ? UNKNOWN : GE;
2331 default:
2332 return swap_condition (code);
2333 }
2334 }
2335
2336 /* Return the cost of comparison CODE using the best strategy for performance.
2337 All of the following functions use the number of instructions as the cost metric.
2338 In the future this should be tweaked to compute bytes for optimize_size and to
2339 take into account the performance of various instructions on various CPUs. */
2340
2341 static int
2342 ix86_fp_comparison_cost (enum rtx_code code)
2343 {
2344 int arith_cost;
2345
2346 /* The cost of code using bit-twiddling on %ah. */
2347 switch (code)
2348 {
2349 case UNLE:
2350 case UNLT:
2351 case LTGT:
2352 case GT:
2353 case GE:
2354 case UNORDERED:
2355 case ORDERED:
2356 case UNEQ:
2357 arith_cost = 4;
2358 break;
2359 case LT:
2360 case NE:
2361 case EQ:
2362 case UNGE:
2363 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2364 break;
2365 case LE:
2366 case UNGT:
2367 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2368 break;
2369 default:
2370 gcc_unreachable ();
2371 }
2372
2373 switch (ix86_fp_comparison_strategy (code))
2374 {
2375 case IX86_FPCMP_COMI:
2376 return arith_cost > 4 ? 3 : 2;
2377 case IX86_FPCMP_SAHF:
2378 return arith_cost > 4 ? 4 : 3;
2379 default:
2380 return arith_cost;
2381 }
2382 }
2383
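/* A worked example of the cost computation above: for CODE == LE with
   TARGET_IEEE_FP, the bit-twiddling sequence costs 6, so the function
   returns 3 for IX86_FPCMP_COMI, 4 for IX86_FPCMP_SAHF and 6 for
   IX86_FPCMP_ARITH.  */
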
2384 /* Swap, force into registers, or otherwise massage the two operands
2385 to a fp comparison. The operands are updated in place; the new
2386 comparison code is returned. */
2387
2388 static enum rtx_code
2389 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2390 {
2391 bool unordered_compare = ix86_unordered_fp_compare (code);
2392 rtx op0 = *pop0, op1 = *pop1;
2393 machine_mode op_mode = GET_MODE (op0);
2394 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
2395
2396 /* All of the unordered compare instructions only work on registers.
2397 The same is true of the fcomi compare instructions. The XFmode
2398 compare instructions require registers except when comparing
2399 against zero or when converting operand 1 from fixed point to
2400 floating point. */
2401
2402 if (!is_sse
2403 && (unordered_compare
2404 || (op_mode == XFmode
2405 && ! (standard_80387_constant_p (op0) == 1
2406 || standard_80387_constant_p (op1) == 1)
2407 && GET_CODE (op1) != FLOAT)
2408 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2409 {
2410 op0 = force_reg (op_mode, op0);
2411 op1 = force_reg (op_mode, op1);
2412 }
2413 else
2414 {
2415 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2416 things around if they appear profitable, otherwise force op0
2417 into a register. */
2418
2419 if (standard_80387_constant_p (op0) == 0
2420 || (MEM_P (op0)
2421 && ! (standard_80387_constant_p (op1) == 0
2422 || MEM_P (op1))))
2423 {
2424 enum rtx_code new_code = ix86_fp_swap_condition (code);
2425 if (new_code != UNKNOWN)
2426 {
2427 std::swap (op0, op1);
2428 code = new_code;
2429 }
2430 }
2431
2432 if (!REG_P (op0))
2433 op0 = force_reg (op_mode, op0);
2434
2435 if (CONSTANT_P (op1))
2436 {
2437 int tmp = standard_80387_constant_p (op1);
2438 if (tmp == 0)
2439 op1 = validize_mem (force_const_mem (op_mode, op1));
2440 else if (tmp == 1)
2441 {
2442 if (TARGET_CMOVE)
2443 op1 = force_reg (op_mode, op1);
2444 }
2445 else
2446 op1 = force_reg (op_mode, op1);
2447 }
2448 }
2449
2450 /* Try to rearrange the comparison to make it cheaper. */
2451 if (ix86_fp_comparison_cost (code)
2452 > ix86_fp_comparison_cost (swap_condition (code))
2453 && (REG_P (op1) || can_create_pseudo_p ()))
2454 {
2455 std::swap (op0, op1);
2456 code = swap_condition (code);
2457 if (!REG_P (op0))
2458 op0 = force_reg (op_mode, op0);
2459 }
2460
2461 *pop0 = op0;
2462 *pop1 = op1;
2463 return code;
2464 }
2465
2466 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2467
2468 static rtx
2469 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2470 {
2471 bool unordered_compare = ix86_unordered_fp_compare (code);
2472 machine_mode cmp_mode;
2473 rtx tmp, scratch;
2474
2475 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2476
2477 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2478 if (unordered_compare)
2479 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2480
2481 /* Do fcomi/sahf based test when profitable. */
2482 switch (ix86_fp_comparison_strategy (code))
2483 {
2484 case IX86_FPCMP_COMI:
2485 cmp_mode = CCFPmode;
2486 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2487 break;
2488
2489 case IX86_FPCMP_SAHF:
2490 cmp_mode = CCFPmode;
2491 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2492 scratch = gen_reg_rtx (HImode);
2493 emit_insn (gen_rtx_SET (scratch, tmp));
2494 emit_insn (gen_x86_sahf_1 (scratch));
2495 break;
2496
2497 case IX86_FPCMP_ARITH:
2498 cmp_mode = CCNOmode;
2499 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2500 scratch = gen_reg_rtx (HImode);
2501 emit_insn (gen_rtx_SET (scratch, tmp));
2502
2503 /* In the unordered case, we have to check C2 for NaNs, which
2504 doesn't happen to work out to anything nice combination-wise.
2505 So do some bit twiddling on the value we've got in AH to come
2506 up with an appropriate set of condition codes. */
2507
2508 switch (code)
2509 {
2510 case GT:
2511 case UNGT:
2512 if (code == GT || !TARGET_IEEE_FP)
2513 {
2514 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2515 code = EQ;
2516 }
2517 else
2518 {
2519 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2520 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2521 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2522 cmp_mode = CCmode;
2523 code = GEU;
2524 }
2525 break;
2526 case LT:
2527 case UNLT:
2528 if (code == LT && TARGET_IEEE_FP)
2529 {
2530 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2531 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2532 cmp_mode = CCmode;
2533 code = EQ;
2534 }
2535 else
2536 {
2537 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2538 code = NE;
2539 }
2540 break;
2541 case GE:
2542 case UNGE:
2543 if (code == GE || !TARGET_IEEE_FP)
2544 {
2545 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
2546 code = EQ;
2547 }
2548 else
2549 {
2550 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2551 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
2552 code = NE;
2553 }
2554 break;
2555 case LE:
2556 case UNLE:
2557 if (code == LE && TARGET_IEEE_FP)
2558 {
2559 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2560 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2561 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2562 cmp_mode = CCmode;
2563 code = LTU;
2564 }
2565 else
2566 {
2567 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2568 code = NE;
2569 }
2570 break;
2571 case EQ:
2572 case UNEQ:
2573 if (code == EQ && TARGET_IEEE_FP)
2574 {
2575 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2576 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
2577 cmp_mode = CCmode;
2578 code = EQ;
2579 }
2580 else
2581 {
2582 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2583 code = NE;
2584 }
2585 break;
2586 case NE:
2587 case LTGT:
2588 if (code == NE && TARGET_IEEE_FP)
2589 {
2590 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2591 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
2592 GEN_INT (0x40)));
2593 code = NE;
2594 }
2595 else
2596 {
2597 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
2598 code = EQ;
2599 }
2600 break;
2601
2602 case UNORDERED:
2603 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2604 code = NE;
2605 break;
2606 case ORDERED:
2607 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
2608 code = EQ;
2609 break;
2610
2611 default:
2612 gcc_unreachable ();
2613 }
2614 break;
2615
2616 default:
2617 gcc_unreachable ();
2618 }
2619
2620 /* Return the test that should be put into the flags user, i.e.
2621 the bcc, scc, or cmov instruction. */
2622 return gen_rtx_fmt_ee (code, VOIDmode,
2623 gen_rtx_REG (cmp_mode, FLAGS_REG),
2624 const0_rtx);
2625 }
2626
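/* A note on the magic constants used in the IX86_FPCMP_ARITH case above:
   after fnstsw, AH holds the x87 condition bits as C0 = 0x01, C2 = 0x04
   and C3 = 0x40, and sahf maps them to CF, PF and ZF respectively.
   Hence 0x45 tests C0|C2|C3, 0x44 is C2|C3, 0x05 is C0|C2 and 0x40 is
   just C3.  */
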
2627 /* Generate insn patterns to do an integer compare of OPERANDS. */
2628
2629 static rtx
2630 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2631 {
2632 machine_mode cmpmode;
2633 rtx tmp, flags;
2634
2635 cmpmode = SELECT_CC_MODE (code, op0, op1);
2636 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2637
2638 /* This is very simple, but making the interface the same as in the
2639 FP case makes the rest of the code easier. */
2640 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2641 emit_insn (gen_rtx_SET (flags, tmp));
2642
2643 /* Return the test that should be put into the flags user, i.e.
2644 the bcc, scc, or cmov instruction. */
2645 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2646 }
2647
2648 static rtx
2649 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2650 {
2651 rtx ret;
2652
2653 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2654 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2655
2656 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2657 {
2658 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2659 ret = ix86_expand_fp_compare (code, op0, op1);
2660 }
2661 else
2662 ret = ix86_expand_int_compare (code, op0, op1);
2663
2664 return ret;
2665 }
2666
2667 void
2668 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2669 {
2670 rtx ret;
2671
2672 gcc_assert (GET_MODE (dest) == QImode);
2673
2674 ret = ix86_expand_compare (code, op0, op1);
2675 PUT_MODE (ret, QImode);
2676 emit_insn (gen_rtx_SET (dest, ret));
2677 }
2678
2679 /* Expand comparison setting or clearing carry flag. Return true when
2680 successful and set pop for the operation. */
2681 static bool
2682 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
2683 {
2684 machine_mode mode
2685 = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
2686
2687 /* Do not handle double-mode compares that go through special path. */
2688 if (mode == (TARGET_64BIT ? TImode : DImode))
2689 return false;
2690
2691 if (SCALAR_FLOAT_MODE_P (mode))
2692 {
2693 rtx compare_op;
2694 rtx_insn *compare_seq;
2695
2696 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
2697
2698 /* Shortcut: the following common codes never translate
2699 into carry flag compares. */
2700 if (code == EQ || code == NE || code == UNEQ || code == LTGT
2701 || code == ORDERED || code == UNORDERED)
2702 return false;
2703
2704 /* These comparisons require zero flag; swap operands so they won't. */
2705 if ((code == GT || code == UNLE || code == LE || code == UNGT)
2706 && !TARGET_IEEE_FP)
2707 {
2708 std::swap (op0, op1);
2709 code = swap_condition (code);
2710 }
2711
2712 /* Try to expand the comparison and verify that we end up with
2713 a carry-flag-based comparison.  This fails to be true only when
2714 we decide to expand the comparison using arithmetic, which is
2715 not a common scenario. */
2716 start_sequence ();
2717 compare_op = ix86_expand_fp_compare (code, op0, op1);
2718 compare_seq = get_insns ();
2719 end_sequence ();
2720
2721 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
2722 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
2723 else
2724 code = GET_CODE (compare_op);
2725
2726 if (code != LTU && code != GEU)
2727 return false;
2728
2729 emit_insn (compare_seq);
2730 *pop = compare_op;
2731 return true;
2732 }
2733
2734 if (!INTEGRAL_MODE_P (mode))
2735 return false;
2736
2737 switch (code)
2738 {
2739 case LTU:
2740 case GEU:
2741 break;
2742
2743 /* Convert a==0 into (unsigned)a<1. */
2744 case EQ:
2745 case NE:
2746 if (op1 != const0_rtx)
2747 return false;
2748 op1 = const1_rtx;
2749 code = (code == EQ ? LTU : GEU);
2750 break;
2751
2752 /* Convert a>b into b<a or a>=b-1. */
2753 case GTU:
2754 case LEU:
2755 if (CONST_INT_P (op1))
2756 {
2757 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
2758 /* Bail out on overflow.  We could still swap the operands, but that
2759 would force loading the constant into a register. */
2760 if (op1 == const0_rtx
2761 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
2762 return false;
2763 code = (code == GTU ? GEU : LTU);
2764 }
2765 else
2766 {
2767 std::swap (op0, op1);
2768 code = (code == GTU ? LTU : GEU);
2769 }
2770 break;
2771
2772 /* Convert a>=0 into (unsigned)a<0x80000000. */
2773 case LT:
2774 case GE:
2775 if (mode == DImode || op1 != const0_rtx)
2776 return false;
2777 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2778 code = (code == LT ? GEU : LTU);
2779 break;
2780 case LE:
2781 case GT:
2782 if (mode == DImode || op1 != constm1_rtx)
2783 return false;
2784 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
2785 code = (code == LE ? GEU : LTU);
2786 break;
2787
2788 default:
2789 return false;
2790 }
2791 /* Swapping operands may cause constant to appear as first operand. */
2792 if (!nonimmediate_operand (op0, VOIDmode))
2793 {
2794 if (!can_create_pseudo_p ())
2795 return false;
2796 op0 = force_reg (mode, op0);
2797 }
2798 *pop = ix86_expand_compare (code, op0, op1);
2799 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
2800 return true;
2801 }
2802
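/* Some examples of the conversions performed above (a sketch):
   a == 0 becomes (unsigned) a < 1; a >u 41 becomes a >=u 42; and for a
   32-bit a, the signed test a >= 0 becomes (unsigned) a < 0x80000000.
   Each form is then answered by the carry flag of a single compare.  */
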
2803 /* Expand conditional increment or decrement using adc/sbb instructions.
2804 The default case using setcc followed by the conditional move can be
2805 done by generic code. */
2806 bool
2807 ix86_expand_int_addcc (rtx operands[])
2808 {
2809 enum rtx_code code = GET_CODE (operands[1]);
2810 rtx flags;
2811 rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
2812 rtx compare_op;
2813 rtx val = const0_rtx;
2814 bool fpcmp = false;
2815 machine_mode mode;
2816 rtx op0 = XEXP (operands[1], 0);
2817 rtx op1 = XEXP (operands[1], 1);
2818
2819 if (operands[3] != const1_rtx
2820 && operands[3] != constm1_rtx)
2821 return false;
2822 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2823 return false;
2824 code = GET_CODE (compare_op);
2825
2826 flags = XEXP (compare_op, 0);
2827
2828 if (GET_MODE (flags) == CCFPmode)
2829 {
2830 fpcmp = true;
2831 code = ix86_fp_compare_code_to_integer (code);
2832 }
2833
2834 if (code != LTU)
2835 {
2836 val = constm1_rtx;
2837 if (fpcmp)
2838 PUT_CODE (compare_op,
2839 reverse_condition_maybe_unordered
2840 (GET_CODE (compare_op)));
2841 else
2842 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
2843 }
2844
2845 mode = GET_MODE (operands[0]);
2846
2847 /* Construct either adc or sbb insn. */
2848 if ((code == LTU) == (operands[3] == constm1_rtx))
2849 insn = gen_sub3_carry;
2850 else
2851 insn = gen_add3_carry;
2852
2853 emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
2854
2855 return true;
2856 }
2857
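/* Roughly, this lets y += (a <u b) compile to a compare that leaves the
   condition in the carry flag followed by adc $0, y (or sbb $0, y for
   the decrement case), with no setcc or conditional move.  */
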
2858 bool
2859 ix86_expand_int_movcc (rtx operands[])
2860 {
2861 enum rtx_code code = GET_CODE (operands[1]), compare_code;
2862 rtx_insn *compare_seq;
2863 rtx compare_op;
2864 machine_mode mode = GET_MODE (operands[0]);
2865 bool sign_bit_compare_p = false;
2866 rtx op0 = XEXP (operands[1], 0);
2867 rtx op1 = XEXP (operands[1], 1);
2868
2869 if (GET_MODE (op0) == TImode
2870 || (GET_MODE (op0) == DImode
2871 && !TARGET_64BIT))
2872 return false;
2873
2874 start_sequence ();
2875 compare_op = ix86_expand_compare (code, op0, op1);
2876 compare_seq = get_insns ();
2877 end_sequence ();
2878
2879 compare_code = GET_CODE (compare_op);
2880
2881 if ((op1 == const0_rtx && (code == GE || code == LT))
2882 || (op1 == constm1_rtx && (code == GT || code == LE)))
2883 sign_bit_compare_p = true;
2884
2885 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2886 HImode insns, we'd be swallowed in word prefix ops. */
2887
2888 if ((mode != HImode || TARGET_FAST_PREFIX)
2889 && (mode != (TARGET_64BIT ? TImode : DImode))
2890 && CONST_INT_P (operands[2])
2891 && CONST_INT_P (operands[3]))
2892 {
2893 rtx out = operands[0];
2894 HOST_WIDE_INT ct = INTVAL (operands[2]);
2895 HOST_WIDE_INT cf = INTVAL (operands[3]);
2896 HOST_WIDE_INT diff;
2897
2898 diff = ct - cf;
2899 /* Sign bit compares are better done using shifts than by using
2900 sbb.  */
2901 if (sign_bit_compare_p
2902 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
2903 {
2904 /* Detect overlap between destination and compare sources. */
2905 rtx tmp = out;
2906
2907 if (!sign_bit_compare_p)
2908 {
2909 rtx flags;
2910 bool fpcmp = false;
2911
2912 compare_code = GET_CODE (compare_op);
2913
2914 flags = XEXP (compare_op, 0);
2915
2916 if (GET_MODE (flags) == CCFPmode)
2917 {
2918 fpcmp = true;
2919 compare_code
2920 = ix86_fp_compare_code_to_integer (compare_code);
2921 }
2922
2923 /* To simplify rest of code, restrict to the GEU case. */
2924 if (compare_code == LTU)
2925 {
2926 std::swap (ct, cf);
2927 compare_code = reverse_condition (compare_code);
2928 code = reverse_condition (code);
2929 }
2930 else
2931 {
2932 if (fpcmp)
2933 PUT_CODE (compare_op,
2934 reverse_condition_maybe_unordered
2935 (GET_CODE (compare_op)));
2936 else
2937 PUT_CODE (compare_op,
2938 reverse_condition (GET_CODE (compare_op)));
2939 }
2940 diff = ct - cf;
2941
2942 if (reg_overlap_mentioned_p (out, op0)
2943 || reg_overlap_mentioned_p (out, op1))
2944 tmp = gen_reg_rtx (mode);
2945
2946 if (mode == DImode)
2947 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
2948 else
2949 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
2950 flags, compare_op));
2951 }
2952 else
2953 {
2954 if (code == GT || code == GE)
2955 code = reverse_condition (code);
2956 else
2957 {
2958 std::swap (ct, cf);
2959 diff = ct - cf;
2960 }
2961 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
2962 }
2963
2964 if (diff == 1)
2965 {
2966 /*
2967 * cmpl op0,op1
2968 * sbbl dest,dest
2969 * [addl dest, ct]
2970 *
2971 * Size 5 - 8.
2972 */
2973 if (ct)
2974 tmp = expand_simple_binop (mode, PLUS,
2975 tmp, GEN_INT (ct),
2976 copy_rtx (tmp), 1, OPTAB_DIRECT);
2977 }
2978 else if (cf == -1)
2979 {
2980 /*
2981 * cmpl op0,op1
2982 * sbbl dest,dest
2983 * orl $ct, dest
2984 *
2985 * Size 8.
2986 */
2987 tmp = expand_simple_binop (mode, IOR,
2988 tmp, GEN_INT (ct),
2989 copy_rtx (tmp), 1, OPTAB_DIRECT);
2990 }
2991 else if (diff == -1 && ct)
2992 {
2993 /*
2994 * cmpl op0,op1
2995 * sbbl dest,dest
2996 * notl dest
2997 * [addl dest, cf]
2998 *
2999 * Size 8 - 11.
3000 */
3001 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3002 if (cf)
3003 tmp = expand_simple_binop (mode, PLUS,
3004 copy_rtx (tmp), GEN_INT (cf),
3005 copy_rtx (tmp), 1, OPTAB_DIRECT);
3006 }
3007 else
3008 {
3009 /*
3010 * cmpl op0,op1
3011 * sbbl dest,dest
3012 * [notl dest]
3013 * andl cf - ct, dest
3014 * [addl dest, ct]
3015 *
3016 * Size 8 - 11.
3017 */
3018
3019 if (cf == 0)
3020 {
3021 cf = ct;
3022 ct = 0;
3023 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3024 }
3025
3026 tmp = expand_simple_binop (mode, AND,
3027 copy_rtx (tmp),
3028 gen_int_mode (cf - ct, mode),
3029 copy_rtx (tmp), 1, OPTAB_DIRECT);
3030 if (ct)
3031 tmp = expand_simple_binop (mode, PLUS,
3032 copy_rtx (tmp), GEN_INT (ct),
3033 copy_rtx (tmp), 1, OPTAB_DIRECT);
3034 }
3035
3036 if (!rtx_equal_p (tmp, out))
3037 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3038
3039 return true;
3040 }
3041
3042 if (diff < 0)
3043 {
3044 machine_mode cmp_mode = GET_MODE (op0);
3045 enum rtx_code new_code;
3046
3047 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3048 {
3049 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3050
3051 /* We may be reversing an unordered compare to a normal compare, which
3052 is not valid in general (we may convert a non-trapping condition
3053 to a trapping one); however, on i386 we currently emit all
3054 comparisons unordered. */
3055 new_code = reverse_condition_maybe_unordered (code);
3056 }
3057 else
3058 new_code = ix86_reverse_condition (code, cmp_mode);
3059 if (new_code != UNKNOWN)
3060 {
3061 std::swap (ct, cf);
3062 diff = -diff;
3063 code = new_code;
3064 }
3065 }
3066
3067 compare_code = UNKNOWN;
3068 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3069 && CONST_INT_P (op1))
3070 {
3071 if (op1 == const0_rtx
3072 && (code == LT || code == GE))
3073 compare_code = code;
3074 else if (op1 == constm1_rtx)
3075 {
3076 if (code == LE)
3077 compare_code = LT;
3078 else if (code == GT)
3079 compare_code = GE;
3080 }
3081 }
3082
3083 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3084 if (compare_code != UNKNOWN
3085 && GET_MODE (op0) == GET_MODE (out)
3086 && (cf == -1 || ct == -1))
3087 {
3088 /* If lea code below could be used, only optimize
3089 if it results in a 2 insn sequence. */
3090
3091 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3092 || diff == 3 || diff == 5 || diff == 9)
3093 || (compare_code == LT && ct == -1)
3094 || (compare_code == GE && cf == -1))
3095 {
3096 /*
3097 * notl op1 (if necessary)
3098 * sarl $31, op1
3099 * orl cf, op1
3100 */
3101 if (ct != -1)
3102 {
3103 cf = ct;
3104 ct = -1;
3105 code = reverse_condition (code);
3106 }
3107
3108 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3109
3110 out = expand_simple_binop (mode, IOR,
3111 out, GEN_INT (cf),
3112 out, 1, OPTAB_DIRECT);
3113 if (out != operands[0])
3114 emit_move_insn (operands[0], out);
3115
3116 return true;
3117 }
3118 }
3119
3120
3121 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3122 || diff == 3 || diff == 5 || diff == 9)
3123 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3124 && (mode != DImode
3125 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3126 {
3127 /*
3128 * xorl dest,dest
3129 * cmpl op1,op2
3130 * setcc dest
3131 * lea cf(dest*(ct-cf)),dest
3132 *
3133 * Size 14.
3134 *
3135 * This also catches the degenerate setcc-only case.
3136 */
3137
3138 rtx tmp;
3139 int nops;
3140
3141 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3142
3143 nops = 0;
3144 /* On x86_64 the lea instruction operates on Pmode, so we need
3145 to get the arithmetic done in the proper mode to match. */
3146 if (diff == 1)
3147 tmp = copy_rtx (out);
3148 else
3149 {
3150 rtx out1;
3151 out1 = copy_rtx (out);
3152 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3153 nops++;
3154 if (diff & 1)
3155 {
3156 tmp = gen_rtx_PLUS (mode, tmp, out1);
3157 nops++;
3158 }
3159 }
3160 if (cf != 0)
3161 {
3162 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
3163 nops++;
3164 }
3165 if (!rtx_equal_p (tmp, out))
3166 {
3167 if (nops == 1)
3168 out = force_operand (tmp, copy_rtx (out));
3169 else
3170 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3171 }
3172 if (!rtx_equal_p (out, operands[0]))
3173 emit_move_insn (operands[0], copy_rtx (out));
3174
3175 return true;
3176 }
3177
3178 /*
3179 * General case: Jumpful:
3180 * xorl dest,dest cmpl op1, op2
3181 * cmpl op1, op2 movl ct, dest
3182 * setcc dest jcc 1f
3183 * decl dest movl cf, dest
3184 * andl (cf-ct),dest 1:
3185 * addl ct,dest
3186 *
3187 * Size 20. Size 14.
3188 *
3189 * This is reasonably steep, but branch mispredict costs are
3190 * high on modern cpus, so consider failing only if optimizing
3191 * for space.
3192 */
3193
3194 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3195 && BRANCH_COST (optimize_insn_for_speed_p (),
3196 false) >= 2)
3197 {
3198 if (cf == 0)
3199 {
3200 machine_mode cmp_mode = GET_MODE (op0);
3201 enum rtx_code new_code;
3202
3203 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3204 {
3205 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3206
3207 /* We may be reversing an unordered compare to a normal compare,
3208 which is not valid in general (we may convert a non-trapping
3209 condition to a trapping one); however, on i386 we currently
3210 emit all comparisons unordered. */
3211 new_code = reverse_condition_maybe_unordered (code);
3212 }
3213 else
3214 {
3215 new_code = ix86_reverse_condition (code, cmp_mode);
3216 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3217 compare_code = reverse_condition (compare_code);
3218 }
3219
3220 if (new_code != UNKNOWN)
3221 {
3222 cf = ct;
3223 ct = 0;
3224 code = new_code;
3225 }
3226 }
3227
3228 if (compare_code != UNKNOWN)
3229 {
3230 /* notl op1 (if needed)
3231 sarl $31, op1
3232 andl (cf-ct), op1
3233 addl ct, op1
3234
3235 For x < 0 (resp. x <= -1) there will be no notl,
3236 so if possible swap the constants to get rid of the
3237 complement.
3238 True/false will be -1/0 while code below (store flag
3239 followed by decrement) is 0/-1, so the constants need
3240 to be exchanged once more. */
3241
3242 if (compare_code == GE || !cf)
3243 {
3244 code = reverse_condition (code);
3245 compare_code = LT;
3246 }
3247 else
3248 std::swap (ct, cf);
3249
3250 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3251 }
3252 else
3253 {
3254 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3255
3256 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3257 constm1_rtx,
3258 copy_rtx (out), 1, OPTAB_DIRECT);
3259 }
3260
3261 out = expand_simple_binop (mode, AND, copy_rtx (out),
3262 gen_int_mode (cf - ct, mode),
3263 copy_rtx (out), 1, OPTAB_DIRECT);
3264 if (ct)
3265 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3266 copy_rtx (out), 1, OPTAB_DIRECT);
3267 if (!rtx_equal_p (out, operands[0]))
3268 emit_move_insn (operands[0], copy_rtx (out));
3269
3270 return true;
3271 }
3272 }
3273
3274 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3275 {
3276 /* Try a few things more with specific constants and a variable. */
3277
3278 optab op;
3279 rtx var, orig_out, out, tmp;
3280
3281 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3282 return false;
3283
3284 /* If one of the two operands is an interesting constant, load a
3285 constant with the above and mask it in with a logical operation. */
3286
3287 if (CONST_INT_P (operands[2]))
3288 {
3289 var = operands[3];
3290 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3291 operands[3] = constm1_rtx, op = and_optab;
3292 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3293 operands[3] = const0_rtx, op = ior_optab;
3294 else
3295 return false;
3296 }
3297 else if (CONST_INT_P (operands[3]))
3298 {
3299 var = operands[2];
3300 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3301 operands[2] = constm1_rtx, op = and_optab;
3302 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
3303 operands[2] = const0_rtx, op = ior_optab;
3304 else
3305 return false;
3306 }
3307 else
3308 return false;
3309
3310 orig_out = operands[0];
3311 tmp = gen_reg_rtx (mode);
3312 operands[0] = tmp;
3313
3314 /* Recurse to get the constant loaded. */
3315 if (!ix86_expand_int_movcc (operands))
3316 return false;
3317
3318 /* Mask in the interesting variable. */
3319 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3320 OPTAB_WIDEN);
3321 if (!rtx_equal_p (out, orig_out))
3322 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3323
3324 return true;
3325 }
3326
3327 /*
3328 * For comparison with above,
3329 *
3330 * movl cf,dest
3331 * movl ct,tmp
3332 * cmpl op1,op2
3333 * cmovcc tmp,dest
3334 *
3335 * Size 15.
3336 */
3337
3338 if (! nonimmediate_operand (operands[2], mode))
3339 operands[2] = force_reg (mode, operands[2]);
3340 if (! nonimmediate_operand (operands[3], mode))
3341 operands[3] = force_reg (mode, operands[3]);
3342
3343 if (! register_operand (operands[2], VOIDmode)
3344 && (mode == QImode
3345 || ! register_operand (operands[3], VOIDmode)))
3346 operands[2] = force_reg (mode, operands[2]);
3347
3348 if (mode == QImode
3349 && ! register_operand (operands[3], VOIDmode))
3350 operands[3] = force_reg (mode, operands[3]);
3351
3352 emit_insn (compare_seq);
3353 emit_insn (gen_rtx_SET (operands[0],
3354 gen_rtx_IF_THEN_ELSE (mode,
3355 compare_op, operands[2],
3356 operands[3])));
3357 return true;
3358 }
3359
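/* The core branchless trick used repeatedly above, as a C-level identity
   (a sketch, relying on wraparound arithmetic): with
   t = -(unsigned) (a <u b), i.e. all-ones or zero produced by cmp; sbb,

     (a <u b ? ct : cf)  ==  (t & (ct - cf)) + cf

   and special values of ct and cf let the and/add collapse further, as
   in the cases handled above.  */
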
3360 /* Detect conditional moves that exactly match min/max operational
3361 semantics. Note that this is IEEE safe, as long as we don't
3362 interchange the operands.
3363
3364 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3365 and TRUE if the operation is successful and instructions are emitted. */
3366
3367 static bool
3368 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
3369 rtx cmp_op1, rtx if_true, rtx if_false)
3370 {
3371 machine_mode mode;
3372 bool is_min;
3373 rtx tmp;
3374
3375 if (code == LT)
3376 ;
3377 else if (code == UNGE)
3378 std::swap (if_true, if_false);
3379 else
3380 return false;
3381
3382 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
3383 is_min = true;
3384 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
3385 is_min = false;
3386 else
3387 return false;
3388
3389 mode = GET_MODE (dest);
3390
3391 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3392 but MODE may be a vector mode and thus not appropriate. */
3393 if (!flag_finite_math_only || flag_signed_zeros)
3394 {
3395 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
3396 rtvec v;
3397
3398 if_true = force_reg (mode, if_true);
3399 v = gen_rtvec (2, if_true, if_false);
3400 tmp = gen_rtx_UNSPEC (mode, v, u);
3401 }
3402 else
3403 {
3404 code = is_min ? SMIN : SMAX;
3405 if (MEM_P (if_true) && MEM_P (if_false))
3406 if_true = force_reg (mode, if_true);
3407 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
3408 }
3409
3410 emit_insn (gen_rtx_SET (dest, tmp));
3411 return true;
3412 }
3413
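/* For example, d = a < b ? a : b can become a single minss/minpd-class
   SMIN when finite-math and no-signed-zeros allow it; otherwise the
   UNSPEC_IEEE_MIN/MAX form is used because the hardware min/max is not
   commutative (it returns its second operand when the inputs are
   unordered), so the operand order above must be preserved.  */
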
3414 /* Expand an SSE comparison. Return the register with the result. */
3415
3416 static rtx
3417 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
3418 rtx op_true, rtx op_false)
3419 {
3420 machine_mode mode = GET_MODE (dest);
3421 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
3422
3423 /* In the general case the result of the comparison can differ from the operands' type. */
3424 machine_mode cmp_mode;
3425
3426 /* In AVX512F the result of comparison is an integer mask. */
3427 bool maskcmp = false;
3428 rtx x;
3429
3430 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
3431 {
3432 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
3433 cmp_mode = int_mode_for_size (nbits, 0).require ();
3434 maskcmp = true;
3435 }
3436 else
3437 cmp_mode = cmp_ops_mode;
3438
3439 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
3440
3441 int (*op1_predicate)(rtx, machine_mode)
3442 = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
3443
3444 if (!op1_predicate (cmp_op1, cmp_ops_mode))
3445 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
3446
3447 if (optimize
3448 || (maskcmp && cmp_mode != mode)
3449 || (op_true && reg_overlap_mentioned_p (dest, op_true))
3450 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
3451 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
3452
3453 /* Compare patterns for int modes are unspec in AVX512F only. */
3454 if (maskcmp && (code == GT || code == EQ))
3455 {
3456 rtx (*gen)(rtx, rtx, rtx);
3457
3458 switch (cmp_ops_mode)
3459 {
3460 case E_V64QImode:
3461 gcc_assert (TARGET_AVX512BW);
3462 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
3463 break;
3464 case E_V32HImode:
3465 gcc_assert (TARGET_AVX512BW);
3466 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
3467 break;
3468 case E_V16SImode:
3469 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
3470 break;
3471 case E_V8DImode:
3472 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
3473 break;
3474 default:
3475 gen = NULL;
3476 }
3477
3478 if (gen)
3479 {
3480 emit_insn (gen (dest, cmp_op0, cmp_op1));
3481 return dest;
3482 }
3483 }
3484 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
3485
3486 if (cmp_mode != mode && !maskcmp)
3487 {
3488 x = force_reg (cmp_ops_mode, x);
3489 convert_move (dest, x, false);
3490 }
3491 else
3492 emit_insn (gen_rtx_SET (dest, x));
3493
3494 return dest;
3495 }
3496
3497 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3498 operations. This is used for both scalar and vector conditional moves. */
3499
3500 void
3501 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
3502 {
3503 machine_mode mode = GET_MODE (dest);
3504 machine_mode cmpmode = GET_MODE (cmp);
3505
3506 /* In AVX512F the result of comparison is an integer mask. */
3507 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
3508
3509 rtx t2, t3, x;
3510
3511 /* If we have an integer mask and FP value then we need
3512 to cast mask to FP mode. */
3513 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
3514 {
3515 cmp = force_reg (cmpmode, cmp);
3516 cmp = gen_rtx_SUBREG (mode, cmp, 0);
3517 }
3518
3519 if (maskcmp)
3520 {
3521 rtx (*gen) (rtx, rtx) = NULL;
3522 if ((op_true == CONST0_RTX (mode)
3523 && vector_all_ones_operand (op_false, mode))
3524 || (op_false == CONST0_RTX (mode)
3525 && vector_all_ones_operand (op_true, mode)))
3526 switch (mode)
3527 {
3528 case E_V64QImode:
3529 if (TARGET_AVX512BW)
3530 gen = gen_avx512bw_cvtmask2bv64qi;
3531 break;
3532 case E_V32QImode:
3533 if (TARGET_AVX512VL && TARGET_AVX512BW)
3534 gen = gen_avx512vl_cvtmask2bv32qi;
3535 break;
3536 case E_V16QImode:
3537 if (TARGET_AVX512VL && TARGET_AVX512BW)
3538 gen = gen_avx512vl_cvtmask2bv16qi;
3539 break;
3540 case E_V32HImode:
3541 if (TARGET_AVX512BW)
3542 gen = gen_avx512bw_cvtmask2wv32hi;
3543 break;
3544 case E_V16HImode:
3545 if (TARGET_AVX512VL && TARGET_AVX512BW)
3546 gen = gen_avx512vl_cvtmask2wv16hi;
3547 break;
3548 case E_V8HImode:
3549 if (TARGET_AVX512VL && TARGET_AVX512BW)
3550 gen = gen_avx512vl_cvtmask2wv8hi;
3551 break;
3552 case E_V16SImode:
3553 if (TARGET_AVX512DQ)
3554 gen = gen_avx512f_cvtmask2dv16si;
3555 break;
3556 case E_V8SImode:
3557 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3558 gen = gen_avx512vl_cvtmask2dv8si;
3559 break;
3560 case E_V4SImode:
3561 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3562 gen = gen_avx512vl_cvtmask2dv4si;
3563 break;
3564 case E_V8DImode:
3565 if (TARGET_AVX512DQ)
3566 gen = gen_avx512f_cvtmask2qv8di;
3567 break;
3568 case E_V4DImode:
3569 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3570 gen = gen_avx512vl_cvtmask2qv4di;
3571 break;
3572 case E_V2DImode:
3573 if (TARGET_AVX512VL && TARGET_AVX512DQ)
3574 gen = gen_avx512vl_cvtmask2qv2di;
3575 break;
3576 default:
3577 break;
3578 }
3579 if (gen && SCALAR_INT_MODE_P (cmpmode))
3580 {
3581 cmp = force_reg (cmpmode, cmp);
3582 if (op_true == CONST0_RTX (mode))
3583 {
3584 rtx (*gen_not) (rtx, rtx);
3585 switch (cmpmode)
3586 {
3587 case E_QImode: gen_not = gen_knotqi; break;
3588 case E_HImode: gen_not = gen_knothi; break;
3589 case E_SImode: gen_not = gen_knotsi; break;
3590 case E_DImode: gen_not = gen_knotdi; break;
3591 default: gcc_unreachable ();
3592 }
3593 rtx n = gen_reg_rtx (cmpmode);
3594 emit_insn (gen_not (n, cmp));
3595 cmp = n;
3596 }
3597 emit_insn (gen (dest, cmp));
3598 return;
3599 }
3600 }
3601 else if (vector_all_ones_operand (op_true, mode)
3602 && op_false == CONST0_RTX (mode))
3603 {
3604 emit_insn (gen_rtx_SET (dest, cmp));
3605 return;
3606 }
3607 else if (op_false == CONST0_RTX (mode))
3608 {
3609 op_true = force_reg (mode, op_true);
3610 x = gen_rtx_AND (mode, cmp, op_true);
3611 emit_insn (gen_rtx_SET (dest, x));
3612 return;
3613 }
3614 else if (op_true == CONST0_RTX (mode))
3615 {
3616 op_false = force_reg (mode, op_false);
3617 x = gen_rtx_NOT (mode, cmp);
3618 x = gen_rtx_AND (mode, x, op_false);
3619 emit_insn (gen_rtx_SET (dest, x));
3620 return;
3621 }
3622 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
3623 {
3624 op_false = force_reg (mode, op_false);
3625 x = gen_rtx_IOR (mode, cmp, op_false);
3626 emit_insn (gen_rtx_SET (dest, x));
3627 return;
3628 }
3629 else if (TARGET_XOP)
3630 {
3631 op_true = force_reg (mode, op_true);
3632
3633 if (!nonimmediate_operand (op_false, mode))
3634 op_false = force_reg (mode, op_false);
3635
3636 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
3637 op_true,
3638 op_false)));
3639 return;
3640 }
3641
3642 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
3643 rtx d = dest;
3644
3645 if (!vector_operand (op_true, mode))
3646 op_true = force_reg (mode, op_true);
3647
3648 op_false = force_reg (mode, op_false);
3649
3650 switch (mode)
3651 {
3652 case E_V4SFmode:
3653 if (TARGET_SSE4_1)
3654 gen = gen_sse4_1_blendvps;
3655 break;
3656 case E_V2DFmode:
3657 if (TARGET_SSE4_1)
3658 gen = gen_sse4_1_blendvpd;
3659 break;
3660 case E_SFmode:
3661 if (TARGET_SSE4_1)
3662 {
3663 gen = gen_sse4_1_blendvss;
3664 op_true = force_reg (mode, op_true);
3665 }
3666 break;
3667 case E_DFmode:
3668 if (TARGET_SSE4_1)
3669 {
3670 gen = gen_sse4_1_blendvsd;
3671 op_true = force_reg (mode, op_true);
3672 }
3673 break;
3674 case E_V16QImode:
3675 case E_V8HImode:
3676 case E_V4SImode:
3677 case E_V2DImode:
3678 if (TARGET_SSE4_1)
3679 {
3680 gen = gen_sse4_1_pblendvb;
3681 if (mode != V16QImode)
3682 d = gen_reg_rtx (V16QImode);
3683 op_false = gen_lowpart (V16QImode, op_false);
3684 op_true = gen_lowpart (V16QImode, op_true);
3685 cmp = gen_lowpart (V16QImode, cmp);
3686 }
3687 break;
3688 case E_V8SFmode:
3689 if (TARGET_AVX)
3690 gen = gen_avx_blendvps256;
3691 break;
3692 case E_V4DFmode:
3693 if (TARGET_AVX)
3694 gen = gen_avx_blendvpd256;
3695 break;
3696 case E_V32QImode:
3697 case E_V16HImode:
3698 case E_V8SImode:
3699 case E_V4DImode:
3700 if (TARGET_AVX2)
3701 {
3702 gen = gen_avx2_pblendvb;
3703 if (mode != V32QImode)
3704 d = gen_reg_rtx (V32QImode);
3705 op_false = gen_lowpart (V32QImode, op_false);
3706 op_true = gen_lowpart (V32QImode, op_true);
3707 cmp = gen_lowpart (V32QImode, cmp);
3708 }
3709 break;
3710
3711 case E_V64QImode:
3712 gen = gen_avx512bw_blendmv64qi;
3713 break;
3714 case E_V32HImode:
3715 gen = gen_avx512bw_blendmv32hi;
3716 break;
3717 case E_V16SImode:
3718 gen = gen_avx512f_blendmv16si;
3719 break;
3720 case E_V8DImode:
3721 gen = gen_avx512f_blendmv8di;
3722 break;
3723 case E_V8DFmode:
3724 gen = gen_avx512f_blendmv8df;
3725 break;
3726 case E_V16SFmode:
3727 gen = gen_avx512f_blendmv16sf;
3728 break;
3729
3730 default:
3731 break;
3732 }
3733
3734 if (gen != NULL)
3735 {
3736 emit_insn (gen (d, op_false, op_true, cmp));
3737 if (d != dest)
3738 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
3739 }
3740 else
3741 {
3742 op_true = force_reg (mode, op_true);
3743
3744 t2 = gen_reg_rtx (mode);
3745 if (optimize)
3746 t3 = gen_reg_rtx (mode);
3747 else
3748 t3 = dest;
3749
3750 x = gen_rtx_AND (mode, op_true, cmp);
3751 emit_insn (gen_rtx_SET (t2, x));
3752
3753 x = gen_rtx_NOT (mode, cmp);
3754 x = gen_rtx_AND (mode, x, op_false);
3755 emit_insn (gen_rtx_SET (t3, x));
3756
3757 x = gen_rtx_IOR (mode, t3, t2);
3758 emit_insn (gen_rtx_SET (dest, x));
3759 }
3760 }
3761
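/* The fallback sequence at the end of the function above is the classic
   mask select, dest = (cmp & op_true) | (~cmp & op_false), where cmp is
   an all-ones/all-zeros element mask; the SSE4.1/AVX blendv forms
   collapse it into one instruction keyed off the most significant bit
   of each mask element.  */
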
3762 /* Swap, force into registers, or otherwise massage the two operands
3763 to an sse comparison with a mask result. Thus we differ a bit from
3764 ix86_prepare_fp_compare_args which expects to produce a flags result.
3765
3766 The DEST operand exists to help determine whether to commute commutative
3767 operators. The POP0/POP1 operands are updated in place. The new
3768 comparison code is returned, or UNKNOWN if not implementable. */
3769
3770 static enum rtx_code
3771 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
3772 rtx *pop0, rtx *pop1)
3773 {
3774 switch (code)
3775 {
3776 case LTGT:
3777 case UNEQ:
3778 /* AVX supports all the needed comparisons. */
3779 if (TARGET_AVX)
3780 break;
3781 /* We have no LTGT as an operator. We could implement it with
3782 NE & ORDERED, but this requires an extra temporary. It's
3783 not clear that it's worth it. */
3784 return UNKNOWN;
3785
3786 case LT:
3787 case LE:
3788 case UNGT:
3789 case UNGE:
3790 /* These are supported directly. */
3791 break;
3792
3793 case EQ:
3794 case NE:
3795 case UNORDERED:
3796 case ORDERED:
3797 /* AVX has 3 operand comparisons, no need to swap anything. */
3798 if (TARGET_AVX)
3799 break;
3800 /* For commutative operators, try to canonicalize the destination
3801 operand to be first in the comparison - this helps reload to
3802 avoid extra moves. */
3803 if (!dest || !rtx_equal_p (dest, *pop1))
3804 break;
3805 /* FALLTHRU */
3806
3807 case GE:
3808 case GT:
3809 case UNLE:
3810 case UNLT:
3811 /* These are not supported directly before AVX, and furthermore
3812 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3813 comparison operands to transform into something that is
3814 supported. */
3815 std::swap (*pop0, *pop1);
3816 code = swap_condition (code);
3817 break;
3818
3819 default:
3820 gcc_unreachable ();
3821 }
3822
3823 return code;
3824 }
3825
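/* For example, without AVX a GT comparison a > b is rewritten here as
   b < a, so it can be emitted with the available cmpltps/cmpltpd
   encodings simply by swapping the operands.  */
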
3826 /* Expand a floating-point conditional move. Return true if successful. */
3827
3828 bool
3829 ix86_expand_fp_movcc (rtx operands[])
3830 {
3831 machine_mode mode = GET_MODE (operands[0]);
3832 enum rtx_code code = GET_CODE (operands[1]);
3833 rtx tmp, compare_op;
3834 rtx op0 = XEXP (operands[1], 0);
3835 rtx op1 = XEXP (operands[1], 1);
3836
3837 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
3838 {
3839 machine_mode cmode;
3840
3841 /* Since we've no cmove for sse registers, don't force bad register
3842 allocation just to gain access to it. Deny movcc when the
3843 comparison mode doesn't match the move mode. */
3844 cmode = GET_MODE (op0);
3845 if (cmode == VOIDmode)
3846 cmode = GET_MODE (op1);
3847 if (cmode != mode)
3848 return false;
3849
3850 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
3851 if (code == UNKNOWN)
3852 return false;
3853
3854 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
3855 operands[2], operands[3]))
3856 return true;
3857
3858 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
3859 operands[2], operands[3]);
3860 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
3861 return true;
3862 }
3863
3864 if (GET_MODE (op0) == TImode
3865 || (GET_MODE (op0) == DImode
3866 && !TARGET_64BIT))
3867 return false;
3868
3869 /* The floating point conditional move instructions don't directly
3870 support conditions resulting from a signed integer comparison. */
3871
3872 compare_op = ix86_expand_compare (code, op0, op1);
3873 if (!fcmov_comparison_operator (compare_op, VOIDmode))
3874 {
3875 tmp = gen_reg_rtx (QImode);
3876 ix86_expand_setcc (tmp, code, op0, op1);
3877
3878 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
3879 }
3880
3881 emit_insn (gen_rtx_SET (operands[0],
3882 gen_rtx_IF_THEN_ELSE (mode, compare_op,
3883 operands[2], operands[3])));
3884
3885 return true;
3886 }
3887
3888 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3889
3890 static int
3891 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
3892 {
3893 switch (code)
3894 {
3895 case EQ:
3896 return 0;
3897 case LT:
3898 case LTU:
3899 return 1;
3900 case LE:
3901 case LEU:
3902 return 2;
3903 case NE:
3904 return 4;
3905 case GE:
3906 case GEU:
3907 return 5;
3908 case GT:
3909 case GTU:
3910 return 6;
3911 default:
3912 gcc_unreachable ();
3913 }
3914 }
3915
3916 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3917
3918 static int
3919 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
3920 {
3921 switch (code)
3922 {
3923 case EQ:
3924 return 0x00;
3925 case NE:
3926 return 0x04;
3927 case GT:
3928 return 0x0e;
3929 case LE:
3930 return 0x02;
3931 case GE:
3932 return 0x0d;
3933 case LT:
3934 return 0x01;
3935 case UNLE:
3936 return 0x0a;
3937 case UNLT:
3938 return 0x09;
3939 case UNGE:
3940 return 0x05;
3941 case UNGT:
3942 return 0x06;
3943 case UNEQ:
3944 return 0x18;
3945 case LTGT:
3946 return 0x0c;
3947 case ORDERED:
3948 return 0x07;
3949 case UNORDERED:
3950 return 0x03;
3951 default:
3952 gcc_unreachable ();
3953 }
3954 }
3955
3956 /* Return immediate value to be used in UNSPEC_PCMP
3957 for comparison CODE in MODE. */
3958
3959 static int
3960 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
3961 {
3962 if (FLOAT_MODE_P (mode))
3963 return ix86_fp_cmp_code_to_pcmp_immediate (code);
3964 return ix86_int_cmp_code_to_pcmp_immediate (code);
3965 }
3966
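/* For reference, these immediates follow the VCMPPS/VCMPPD and VPCMP
   predicate encodings: e.g. 0x01 is LT_OS for a float compare, while for
   the integer VPCMP/VPCMPU forms 0/1/2/4/5/6 mean EQ/LT/LE/NE/NLT/NLE.  */
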
3967 /* Expand AVX-512 vector comparison. */
3968
3969 bool
3970 ix86_expand_mask_vec_cmp (rtx operands[])
3971 {
3972 machine_mode mask_mode = GET_MODE (operands[0]);
3973 machine_mode cmp_mode = GET_MODE (operands[2]);
3974 enum rtx_code code = GET_CODE (operands[1]);
3975 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
3976 int unspec_code;
3977 rtx unspec;
3978
3979 switch (code)
3980 {
3981 case LEU:
3982 case GTU:
3983 case GEU:
3984 case LTU:
3985 unspec_code = UNSPEC_UNSIGNED_PCMP;
3986 break;
3987
3988 default:
3989 unspec_code = UNSPEC_PCMP;
3990 }
3991
3992 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
3993 operands[3], imm),
3994 unspec_code);
3995 emit_insn (gen_rtx_SET (operands[0], unspec));
3996
3997 return true;
3998 }
3999
4000 /* Expand fp vector comparison. */
4001
4002 bool
4003 ix86_expand_fp_vec_cmp (rtx operands[])
4004 {
4005 enum rtx_code code = GET_CODE (operands[1]);
4006 rtx cmp;
4007
4008 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4009 &operands[2], &operands[3]);
4010 if (code == UNKNOWN)
4011 {
4012 rtx temp;
4013 switch (GET_CODE (operands[1]))
4014 {
4015 case LTGT:
4016 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4017 operands[3], NULL, NULL);
4018 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4019 operands[3], NULL, NULL);
4020 code = AND;
4021 break;
4022 case UNEQ:
4023 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4024 operands[3], NULL, NULL);
4025 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4026 operands[3], NULL, NULL);
4027 code = IOR;
4028 break;
4029 default:
4030 gcc_unreachable ();
4031 }
4032 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4033 OPTAB_DIRECT);
4034 }
4035 else
4036 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4037 operands[1], operands[2]);
4038
4039 if (operands[0] != cmp)
4040 emit_move_insn (operands[0], cmp);
4041
4042 return true;
4043 }
4044
4045 static rtx
4046 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4047 rtx op_true, rtx op_false, bool *negate)
4048 {
4049 machine_mode data_mode = GET_MODE (dest);
4050 machine_mode mode = GET_MODE (cop0);
4051 rtx x;
4052
4053 *negate = false;
4054
4055 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4056 if (TARGET_XOP
4057 && (mode == V16QImode || mode == V8HImode
4058 || mode == V4SImode || mode == V2DImode))
4059 ;
4060 else
4061 {
4062 /* Canonicalize the comparison to EQ, GT, GTU. */
4063 switch (code)
4064 {
4065 case EQ:
4066 case GT:
4067 case GTU:
4068 break;
4069
4070 case NE:
4071 case LE:
4072 case LEU:
4073 code = reverse_condition (code);
4074 *negate = true;
4075 break;
4076
4077 case GE:
4078 case GEU:
4079 code = reverse_condition (code);
4080 *negate = true;
4081 /* FALLTHRU */
4082
4083 case LT:
4084 case LTU:
4085 std::swap (cop0, cop1);
4086 code = swap_condition (code);
4087 break;
4088
4089 default:
4090 gcc_unreachable ();
4091 }
4092
4093 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4094 if (mode == V2DImode)
4095 {
4096 switch (code)
4097 {
4098 case EQ:
4099 /* SSE4.1 supports EQ. */
4100 if (!TARGET_SSE4_1)
4101 return NULL;
4102 break;
4103
4104 case GT:
4105 case GTU:
4106 /* SSE4.2 supports GT/GTU. */
4107 if (!TARGET_SSE4_2)
4108 return NULL;
4109 break;
4110
4111 default:
4112 gcc_unreachable ();
4113 }
4114 }
4115
4116 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4117 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4118 if (*negate)
4119 std::swap (optrue, opfalse);
4120
4121 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0, or simply x <= y) when
4122 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4123 min (x, y) == x). While we add one instruction (the minimum),
4124 we remove the need for two instructions in the negation, as the
4125 result is already produced in negated form.
4126 When using masks, do it for SI/DImode element types, as it is shorter
4127 than the two subtractions. */
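/* For example, x <= y is equivalent to min (x, y) == x for the
   appropriate signedness: with unsigned x = 3, y = 7 the minimum is
   3 == x, i.e. true, while with x = 9, y = 7 the minimum is 7 != 9,
   i.e. false.  */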
4128 if ((code != EQ
4129 && GET_MODE_SIZE (mode) != 64
4130 && vector_all_ones_operand (opfalse, data_mode)
4131 && optrue == CONST0_RTX (data_mode))
4132 || (code == GTU
4133 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4134 /* But don't do it if we are not using integer masks and we
4135 would already end up with the right values in the registers. */
4136 && (GET_MODE_SIZE (mode) == 64
4137 || !vector_all_ones_operand (optrue, data_mode)
4138 || opfalse != CONST0_RTX (data_mode))))
4139 {
4140 rtx (*gen) (rtx, rtx, rtx) = NULL;
4141
4142 switch (mode)
4143 {
4144 case E_V16SImode:
4145 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4146 break;
4147 case E_V8DImode:
4148 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4149 cop0 = force_reg (mode, cop0);
4150 cop1 = force_reg (mode, cop1);
4151 break;
4152 case E_V32QImode:
4153 if (TARGET_AVX2)
4154 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4155 break;
4156 case E_V16HImode:
4157 if (TARGET_AVX2)
4158 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4159 break;
4160 case E_V8SImode:
4161 if (TARGET_AVX2)
4162 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4163 break;
4164 case E_V4DImode:
4165 if (TARGET_AVX512VL)
4166 {
4167 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4168 cop0 = force_reg (mode, cop0);
4169 cop1 = force_reg (mode, cop1);
4170 }
4171 break;
4172 case E_V16QImode:
4173 if (code == GTU && TARGET_SSE2)
4174 gen = gen_uminv16qi3;
4175 else if (code == GT && TARGET_SSE4_1)
4176 gen = gen_sminv16qi3;
4177 break;
4178 case E_V8HImode:
4179 if (code == GTU && TARGET_SSE4_1)
4180 gen = gen_uminv8hi3;
4181 else if (code == GT && TARGET_SSE2)
4182 gen = gen_sminv8hi3;
4183 break;
4184 case E_V4SImode:
4185 if (TARGET_SSE4_1)
4186 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4187 break;
4188 case E_V2DImode:
4189 if (TARGET_AVX512VL)
4190 {
4191 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4192 cop0 = force_reg (mode, cop0);
4193 cop1 = force_reg (mode, cop1);
4194 }
4195 break;
4196 default:
4197 break;
4198 }
4199
4200 if (gen)
4201 {
4202 rtx tem = gen_reg_rtx (mode);
4203 if (!vector_operand (cop0, mode))
4204 cop0 = force_reg (mode, cop0);
4205 if (!vector_operand (cop1, mode))
4206 cop1 = force_reg (mode, cop1);
4207 *negate = !*negate;
4208 emit_insn (gen (tem, cop0, cop1));
4209 cop1 = tem;
4210 code = EQ;
4211 }
4212 }
4213
4214 /* Unsigned parallel compare is not supported by the hardware.
4215 Play some tricks to turn this into a signed comparison
4216 against 0. */
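/* For example, with 8-bit elements x >u y can be computed either by
   biasing both operands into the signed range (255 >u 1 becomes
   127 >s -127 after subtracting 0x80 from each), or, for the narrow
   element modes below, by testing the unsigned saturating
   subtraction: x >u y iff (x -us y) != 0, e.g. 5 -us 9 saturates
   to 0, so 5 >u 9 is false.  */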
4217 if (code == GTU)
4218 {
4219 cop0 = force_reg (mode, cop0);
4220
4221 switch (mode)
4222 {
4223 case E_V16SImode:
4224 case E_V8DImode:
4225 case E_V8SImode:
4226 case E_V4DImode:
4227 case E_V4SImode:
4228 case E_V2DImode:
4229 {
4230 rtx t1, t2, mask;
4231
4232 /* Subtract (-(INT MAX) - 1) from both operands to make
4233 them signed. */
4234 mask = ix86_build_signbit_mask (mode, true, false);
4235 t1 = gen_reg_rtx (mode);
4236 emit_insn (gen_sub3_insn (t1, cop0, mask));
4237
4238 t2 = gen_reg_rtx (mode);
4239 emit_insn (gen_sub3_insn (t2, cop1, mask));
4240
4241 cop0 = t1;
4242 cop1 = t2;
4243 code = GT;
4244 }
4245 break;
4246
4247 case E_V64QImode:
4248 case E_V32HImode:
4249 case E_V32QImode:
4250 case E_V16HImode:
4251 case E_V16QImode:
4252 case E_V8HImode:
4253 /* Perform a parallel unsigned saturating subtraction. */
4254 x = gen_reg_rtx (mode);
4255 emit_insn (gen_rtx_SET
4256 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4257 cop0 = x;
4258 cop1 = CONST0_RTX (mode);
4259 code = EQ;
4260 *negate = !*negate;
4261 break;
4262
4263 default:
4264 gcc_unreachable ();
4265 }
4266 }
4267 }
4268
4269 if (*negate)
4270 std::swap (op_true, op_false);
4271
4272 /* Allow the comparison to be done in one mode, but the movcc to
4273 happen in another mode. */
4274 if (data_mode == mode)
4275 {
4276 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4277 op_true, op_false);
4278 }
4279 else
4280 {
4281 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4282 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4283 op_true, op_false);
4284 if (GET_MODE (x) == mode)
4285 x = gen_lowpart (data_mode, x);
4286 }
4287
4288 return x;
4289 }
4290
4291 /* Expand integer vector comparison. */
4292
4293 bool
4294 ix86_expand_int_vec_cmp (rtx operands[])
4295 {
4296 rtx_code code = GET_CODE (operands[1]);
4297 bool negate = false;
4298 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4299 operands[3], NULL, NULL, &negate);
4300
4301 if (!cmp)
4302 return false;
4303
4304 if (negate)
4305 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4306 CONST0_RTX (GET_MODE (cmp)),
4307 NULL, NULL, &negate);
4308
4309 gcc_assert (!negate);
4310
4311 if (operands[0] != cmp)
4312 emit_move_insn (operands[0], cmp);
4313
4314 return true;
4315 }
4316
4317 /* Expand a floating-point vector conditional move; a vcond operation
4318 rather than a movcc operation. */
4319
4320 bool
4321 ix86_expand_fp_vcond (rtx operands[])
4322 {
4323 enum rtx_code code = GET_CODE (operands[3]);
4324 rtx cmp;
4325
4326 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4327 &operands[4], &operands[5]);
4328 if (code == UNKNOWN)
4329 {
4330 rtx temp;
4331 switch (GET_CODE (operands[3]))
4332 {
4333 case LTGT:
4334 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4335 operands[5], operands[0], operands[0]);
4336 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4337 operands[5], operands[1], operands[2]);
4338 code = AND;
4339 break;
4340 case UNEQ:
4341 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4342 operands[5], operands[0], operands[0]);
4343 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4344 operands[5], operands[1], operands[2]);
4345 code = IOR;
4346 break;
4347 default:
4348 gcc_unreachable ();
4349 }
4350 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4351 OPTAB_DIRECT);
4352 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4353 return true;
4354 }
4355
4356 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4357 operands[5], operands[1], operands[2]))
4358 return true;
4359
4360 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4361 operands[1], operands[2]);
4362 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4363 return true;
4364 }
4365
4366 /* Expand a signed/unsigned integral vector conditional move. */
4367
4368 bool
4369 ix86_expand_int_vcond (rtx operands[])
4370 {
4371 machine_mode data_mode = GET_MODE (operands[0]);
4372 machine_mode mode = GET_MODE (operands[4]);
4373 enum rtx_code code = GET_CODE (operands[3]);
4374 bool negate = false;
4375 rtx x, cop0, cop1;
4376
4377 cop0 = operands[4];
4378 cop1 = operands[5];
4379
4380 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4381 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
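/* E.g. for 32-bit elements: if x is negative, the arithmetic shift
   x >> 31 yields -1 and the logical shift yields 1; if x is
   non-negative, both yield 0.  */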
4382 if ((code == LT || code == GE)
4383 && data_mode == mode
4384 && cop1 == CONST0_RTX (mode)
4385 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
4386 && GET_MODE_UNIT_SIZE (data_mode) > 1
4387 && GET_MODE_UNIT_SIZE (data_mode) <= 8
4388 && (GET_MODE_SIZE (data_mode) == 16
4389 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
4390 {
4391 rtx negop = operands[2 - (code == LT)];
4392 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
4393 if (negop == CONST1_RTX (data_mode))
4394 {
4395 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
4396 operands[0], 1, OPTAB_DIRECT);
4397 if (res != operands[0])
4398 emit_move_insn (operands[0], res);
4399 return true;
4400 }
4401 else if (GET_MODE_INNER (data_mode) != DImode
4402 && vector_all_ones_operand (negop, data_mode))
4403 {
4404 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
4405 operands[0], 0, OPTAB_DIRECT);
4406 if (res != operands[0])
4407 emit_move_insn (operands[0], res);
4408 return true;
4409 }
4410 }
4411
4412 if (!nonimmediate_operand (cop1, mode))
4413 cop1 = force_reg (mode, cop1);
4414 if (!general_operand (operands[1], data_mode))
4415 operands[1] = force_reg (data_mode, operands[1]);
4416 if (!general_operand (operands[2], data_mode))
4417 operands[2] = force_reg (data_mode, operands[2]);
4418
4419 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
4420 operands[1], operands[2], &negate);
4421
4422 if (!x)
4423 return false;
4424
4425 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
4426 operands[2-negate]);
4427 return true;
4428 }
4429
4430 static bool
4431 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
4432 struct expand_vec_perm_d *d)
4433 {
4434 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4435 expander, so args are either in d, or in op0, op1 etc. */
4436 machine_mode mode = GET_MODE (d ? d->op0 : op0);
4437 machine_mode maskmode = mode;
4438 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4439
4440 switch (mode)
4441 {
4442 case E_V8HImode:
4443 if (TARGET_AVX512VL && TARGET_AVX512BW)
4444 gen = gen_avx512vl_vpermt2varv8hi3;
4445 break;
4446 case E_V16HImode:
4447 if (TARGET_AVX512VL && TARGET_AVX512BW)
4448 gen = gen_avx512vl_vpermt2varv16hi3;
4449 break;
4450 case E_V64QImode:
4451 if (TARGET_AVX512VBMI)
4452 gen = gen_avx512bw_vpermt2varv64qi3;
4453 break;
4454 case E_V32HImode:
4455 if (TARGET_AVX512BW)
4456 gen = gen_avx512bw_vpermt2varv32hi3;
4457 break;
4458 case E_V4SImode:
4459 if (TARGET_AVX512VL)
4460 gen = gen_avx512vl_vpermt2varv4si3;
4461 break;
4462 case E_V8SImode:
4463 if (TARGET_AVX512VL)
4464 gen = gen_avx512vl_vpermt2varv8si3;
4465 break;
4466 case E_V16SImode:
4467 if (TARGET_AVX512F)
4468 gen = gen_avx512f_vpermt2varv16si3;
4469 break;
4470 case E_V4SFmode:
4471 if (TARGET_AVX512VL)
4472 {
4473 gen = gen_avx512vl_vpermt2varv4sf3;
4474 maskmode = V4SImode;
4475 }
4476 break;
4477 case E_V8SFmode:
4478 if (TARGET_AVX512VL)
4479 {
4480 gen = gen_avx512vl_vpermt2varv8sf3;
4481 maskmode = V8SImode;
4482 }
4483 break;
4484 case E_V16SFmode:
4485 if (TARGET_AVX512F)
4486 {
4487 gen = gen_avx512f_vpermt2varv16sf3;
4488 maskmode = V16SImode;
4489 }
4490 break;
4491 case E_V2DImode:
4492 if (TARGET_AVX512VL)
4493 gen = gen_avx512vl_vpermt2varv2di3;
4494 break;
4495 case E_V4DImode:
4496 if (TARGET_AVX512VL)
4497 gen = gen_avx512vl_vpermt2varv4di3;
4498 break;
4499 case E_V8DImode:
4500 if (TARGET_AVX512F)
4501 gen = gen_avx512f_vpermt2varv8di3;
4502 break;
4503 case E_V2DFmode:
4504 if (TARGET_AVX512VL)
4505 {
4506 gen = gen_avx512vl_vpermt2varv2df3;
4507 maskmode = V2DImode;
4508 }
4509 break;
4510 case E_V4DFmode:
4511 if (TARGET_AVX512VL)
4512 {
4513 gen = gen_avx512vl_vpermt2varv4df3;
4514 maskmode = V4DImode;
4515 }
4516 break;
4517 case E_V8DFmode:
4518 if (TARGET_AVX512F)
4519 {
4520 gen = gen_avx512f_vpermt2varv8df3;
4521 maskmode = V8DImode;
4522 }
4523 break;
4524 default:
4525 break;
4526 }
4527
4528 if (gen == NULL)
4529 return false;
4530
4531 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4532 expander, so args are either in d, or in op0, op1 etc. */
4533 if (d)
4534 {
4535 rtx vec[64];
4536 target = d->target;
4537 op0 = d->op0;
4538 op1 = d->op1;
4539 for (int i = 0; i < d->nelt; ++i)
4540 vec[i] = GEN_INT (d->perm[i]);
4541 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
4542 }
4543
4544 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4545 return true;
4546 }
4547
4548 /* Expand a variable vector permutation. */
4549
4550 void
4551 ix86_expand_vec_perm (rtx operands[])
4552 {
4553 rtx target = operands[0];
4554 rtx op0 = operands[1];
4555 rtx op1 = operands[2];
4556 rtx mask = operands[3];
4557 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
4558 machine_mode mode = GET_MODE (op0);
4559 machine_mode maskmode = GET_MODE (mask);
4560 int w, e, i;
4561 bool one_operand_shuffle = rtx_equal_p (op0, op1);
4562
4563 /* Number of elements in the vector. */
4564 w = GET_MODE_NUNITS (mode);
4565 e = GET_MODE_UNIT_SIZE (mode);
4566 gcc_assert (w <= 64);
4567
4568 if (TARGET_AVX512F && one_operand_shuffle)
4569 {
4570 rtx (*gen) (rtx, rtx, rtx) = NULL;
4571 switch (mode)
4572 {
4573 case E_V16SImode:
4574 gen = gen_avx512f_permvarv16si;
4575 break;
4576 case E_V16SFmode:
4577 gen = gen_avx512f_permvarv16sf;
4578 break;
4579 case E_V8DImode:
4580 gen = gen_avx512f_permvarv8di;
4581 break;
4582 case E_V8DFmode:
4583 gen = gen_avx512f_permvarv8df;
4584 break;
4585 default:
4586 break;
4587 }
4588 if (gen != NULL)
4589 {
4590 emit_insn (gen (target, op0, mask));
4591 return;
4592 }
4593 }
4594
4595 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
4596 return;
4597
4598 if (TARGET_AVX2)
4599 {
4600 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
4601 {
4602 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4603 a constant shuffle operand. With a tiny bit of effort we can
4604 use VPERMD instead. A re-interpretation stall for V4DFmode is
4605 unfortunate but there's no avoiding it.
4606 Similarly for V16HImode we don't have instructions for variable
4607 shuffling, while for V32QImode, after preparing suitable masks,
4608 we can use vpshufb; vpshufb; vpermq; vpor. */
4609
4610 if (mode == V16HImode)
4611 {
4612 maskmode = mode = V32QImode;
4613 w = 32;
4614 e = 1;
4615 }
4616 else
4617 {
4618 maskmode = mode = V8SImode;
4619 w = 8;
4620 e = 4;
4621 }
4622 t1 = gen_reg_rtx (maskmode);
4623
4624 /* Replicate the low bits of the V4DImode mask into V8SImode:
4625 mask = { A B C D }
4626 t1 = { A A B B C C D D }. */
4627 for (i = 0; i < w / 2; ++i)
4628 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
4629 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4630 vt = force_reg (maskmode, vt);
4631 mask = gen_lowpart (maskmode, mask);
4632 if (maskmode == V8SImode)
4633 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
4634 else
4635 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
4636
4637 /* Multiply the shuffle indices by two. */
4638 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
4639 OPTAB_DIRECT);
4640
4641 /* Add one to the odd shuffle indices:
4642 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4643 for (i = 0; i < w / 2; ++i)
4644 {
4645 vec[i * 2] = const0_rtx;
4646 vec[i * 2 + 1] = const1_rtx;
4647 }
4648 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
4649 vt = validize_mem (force_const_mem (maskmode, vt));
4650 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
4651 OPTAB_DIRECT);
4652
4653 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4654 operands[3] = mask = t1;
4655 target = gen_reg_rtx (mode);
4656 op0 = gen_lowpart (mode, op0);
4657 op1 = gen_lowpart (mode, op1);
4658 }
4659
4660 switch (mode)
4661 {
4662 case E_V8SImode:
4663 /* The VPERMD and VPERMPS instructions already properly ignore
4664 the high bits of the shuffle elements. No need for us to
4665 perform an AND ourselves. */
4666 if (one_operand_shuffle)
4667 {
4668 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
4669 if (target != operands[0])
4670 emit_move_insn (operands[0],
4671 gen_lowpart (GET_MODE (operands[0]), target));
4672 }
4673 else
4674 {
4675 t1 = gen_reg_rtx (V8SImode);
4676 t2 = gen_reg_rtx (V8SImode);
4677 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
4678 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
4679 goto merge_two;
4680 }
4681 return;
4682
4683 case E_V8SFmode:
4684 mask = gen_lowpart (V8SImode, mask);
4685 if (one_operand_shuffle)
4686 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
4687 else
4688 {
4689 t1 = gen_reg_rtx (V8SFmode);
4690 t2 = gen_reg_rtx (V8SFmode);
4691 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
4692 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
4693 goto merge_two;
4694 }
4695 return;
4696
4697 case E_V4SImode:
4698 /* By combining the two 128-bit input vectors into one 256-bit
4699 input vector, we can use VPERMD and VPERMPS for the full
4700 two-operand shuffle. */
4701 t1 = gen_reg_rtx (V8SImode);
4702 t2 = gen_reg_rtx (V8SImode);
4703 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
4704 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4705 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
4706 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
4707 return;
4708
4709 case E_V4SFmode:
4710 t1 = gen_reg_rtx (V8SFmode);
4711 t2 = gen_reg_rtx (V8SImode);
4712 mask = gen_lowpart (V4SImode, mask);
4713 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
4714 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
4715 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
4716 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
4717 return;
4718
4719 case E_V32QImode:
4720 t1 = gen_reg_rtx (V32QImode);
4721 t2 = gen_reg_rtx (V32QImode);
4722 t3 = gen_reg_rtx (V32QImode);
4723 vt2 = GEN_INT (-128);
4724 vt = gen_const_vec_duplicate (V32QImode, vt2);
4725 vt = force_reg (V32QImode, vt);
4726 for (i = 0; i < 32; i++)
4727 vec[i] = i < 16 ? vt2 : const0_rtx;
4728 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
4729 vt2 = force_reg (V32QImode, vt2);
4730 /* From mask create two adjusted masks, which contain the same
4731 bits as mask in the low 7 bits of each vector element.
4732 The first mask will have the most significant bit clear
4733 if it requests element from the same 128-bit lane
4734 and MSB set if it requests element from the other 128-bit lane.
4735 The second mask will have the opposite values of the MSB,
4736 and additionally will have its 128-bit lanes swapped.
4737 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4738 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4739 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4740 stands for the other 12 bytes. */
4741 /* The bit that tells whether an element is from the same lane or the
4742 other lane is bit 4, so shift it up by 3 to the MSB position. */
4743 t5 = gen_reg_rtx (V4DImode);
4744 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
4745 GEN_INT (3)));
4746 /* Clear MSB bits from the mask just in case it had them set. */
4747 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
4748 /* After this, t1 will have the MSB set for elements from the other lane. */
4749 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
4750 /* Clear bits other than MSB. */
4751 emit_insn (gen_andv32qi3 (t1, t1, vt));
4752 /* Or in the lower bits from mask into t3. */
4753 emit_insn (gen_iorv32qi3 (t3, t1, t2));
4754 /* And invert MSB bits in t1, so MSB is set for elements from the same
4755 lane. */
4756 emit_insn (gen_xorv32qi3 (t1, t1, vt));
4757 /* Swap 128-bit lanes in t3. */
4758 t6 = gen_reg_rtx (V4DImode);
4759 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
4760 const2_rtx, GEN_INT (3),
4761 const0_rtx, const1_rtx));
4762 /* And or in the lower bits from mask into t1. */
4763 emit_insn (gen_iorv32qi3 (t1, t1, t2));
4764 if (one_operand_shuffle)
4765 {
4766 /* Each of these shuffles will put 0s in places where an
4767 element from the other 128-bit lane is needed; otherwise
4768 it will shuffle in the requested value. */
4769 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
4770 gen_lowpart (V32QImode, t6)));
4771 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
4772 /* For t3 the 128-bit lanes are swapped again. */
4773 t7 = gen_reg_rtx (V4DImode);
4774 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
4775 const2_rtx, GEN_INT (3),
4776 const0_rtx, const1_rtx));
4777 /* And oring both together leads to the result. */
4778 emit_insn (gen_iorv32qi3 (target, t1,
4779 gen_lowpart (V32QImode, t7)));
4780 if (target != operands[0])
4781 emit_move_insn (operands[0],
4782 gen_lowpart (GET_MODE (operands[0]), target));
4783 return;
4784 }
4785
4786 t4 = gen_reg_rtx (V32QImode);
4787 /* Similar to the one_operand_shuffle code above, just
4788 repeated twice, once for each operand. The merge_two:
4789 code will merge the two results together. */
4790 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
4791 gen_lowpart (V32QImode, t6)));
4792 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
4793 gen_lowpart (V32QImode, t6)));
4794 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
4795 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
4796 t7 = gen_reg_rtx (V4DImode);
4797 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
4798 const2_rtx, GEN_INT (3),
4799 const0_rtx, const1_rtx));
4800 t8 = gen_reg_rtx (V4DImode);
4801 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
4802 const2_rtx, GEN_INT (3),
4803 const0_rtx, const1_rtx));
4804 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
4805 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
4806 t1 = t4;
4807 t2 = t3;
4808 goto merge_two;
4809
4810 default:
4811 gcc_assert (GET_MODE_SIZE (mode) <= 16);
4812 break;
4813 }
4814 }
4815
4816 if (TARGET_XOP)
4817 {
4818 /* The XOP VPPERM insn supports three inputs. By ignoring the
4819 one_operand_shuffle special case, we avoid creating another
4820 set of constant vectors in memory. */
4821 one_operand_shuffle = false;
4822
4823 /* mask = mask & {2*w-1, ...} */
4824 vt = GEN_INT (2*w - 1);
4825 }
4826 else
4827 {
4828 /* mask = mask & {w-1, ...} */
4829 vt = GEN_INT (w - 1);
4830 }
4831
4832 vt = gen_const_vec_duplicate (maskmode, vt);
4833 mask = expand_simple_binop (maskmode, AND, mask, vt,
4834 NULL_RTX, 0, OPTAB_DIRECT);
4835
4836 /* For non-QImode operations, convert the word permutation control
4837 into a byte permutation control. */
4838 if (mode != V16QImode)
4839 {
4840 mask = expand_simple_binop (maskmode, ASHIFT, mask,
4841 GEN_INT (exact_log2 (e)),
4842 NULL_RTX, 0, OPTAB_DIRECT);
4843
4844 /* Convert mask to vector of chars. */
4845 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
4846
4847 /* Replicate each of the input bytes into byte positions:
4848 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4849 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4850 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4851 for (i = 0; i < 16; ++i)
4852 vec[i] = GEN_INT (i/e * e);
4853 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4854 vt = validize_mem (force_const_mem (V16QImode, vt));
4855 if (TARGET_XOP)
4856 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
4857 else
4858 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
4859
4860 /* Convert it into the byte positions by doing
4861 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4862 for (i = 0; i < 16; ++i)
4863 vec[i] = GEN_INT (i % e);
4864 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
4865 vt = validize_mem (force_const_mem (V16QImode, vt));
4866 emit_insn (gen_addv16qi3 (mask, mask, vt));
4867 }
4868
4869 /* The actual shuffle operations all operate on V16QImode. */
4870 op0 = gen_lowpart (V16QImode, op0);
4871 op1 = gen_lowpart (V16QImode, op1);
4872
4873 if (TARGET_XOP)
4874 {
4875 if (GET_MODE (target) != V16QImode)
4876 target = gen_reg_rtx (V16QImode);
4877 emit_insn (gen_xop_pperm (target, op0, op1, mask));
4878 if (target != operands[0])
4879 emit_move_insn (operands[0],
4880 gen_lowpart (GET_MODE (operands[0]), target));
4881 }
4882 else if (one_operand_shuffle)
4883 {
4884 if (GET_MODE (target) != V16QImode)
4885 target = gen_reg_rtx (V16QImode);
4886 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
4887 if (target != operands[0])
4888 emit_move_insn (operands[0],
4889 gen_lowpart (GET_MODE (operands[0]), target));
4890 }
4891 else
4892 {
4893 rtx xops[6];
4894 bool ok;
4895
4896 /* Shuffle the two input vectors independently. */
4897 t1 = gen_reg_rtx (V16QImode);
4898 t2 = gen_reg_rtx (V16QImode);
4899 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
4900 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
4901
4902 merge_two:
4903 /* Then merge them together. The key is whether any given control
4904 element contained a bit set that indicates the second word. */
4905 mask = operands[3];
4906 vt = GEN_INT (w);
4907 if (maskmode == V2DImode && !TARGET_SSE4_1)
4908 {
4909 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4910 more shuffle to convert the V2DI input mask into a V4SI
4911 input mask. At that point the masking done by
4912 ix86_expand_int_vcond will work as desired. */
4913 rtx t3 = gen_reg_rtx (V4SImode);
4914 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
4915 const0_rtx, const0_rtx,
4916 const2_rtx, const2_rtx));
4917 mask = t3;
4918 maskmode = V4SImode;
4919 e = w = 4;
4920 }
4921
4922 vt = gen_const_vec_duplicate (maskmode, vt);
4923 vt = force_reg (maskmode, vt);
4924 mask = expand_simple_binop (maskmode, AND, mask, vt,
4925 NULL_RTX, 0, OPTAB_DIRECT);
4926
4927 if (GET_MODE (target) != mode)
4928 target = gen_reg_rtx (mode);
4929 xops[0] = target;
4930 xops[1] = gen_lowpart (mode, t2);
4931 xops[2] = gen_lowpart (mode, t1);
4932 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
4933 xops[4] = mask;
4934 xops[5] = vt;
4935 ok = ix86_expand_int_vcond (xops);
4936 gcc_assert (ok);
4937 if (target != operands[0])
4938 emit_move_insn (operands[0],
4939 gen_lowpart (GET_MODE (operands[0]), target));
4940 }
4941 }
4942
4943 /* Unpack SRC into DEST, which has the next wider integer vector type. UNSIGNED_P is
4944 true if we should do zero extension, else sign extension. HIGH_P is
4945 true if we want the N/2 high elements, else the low elements. */
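/* For example, sign-extending the low half of a V16QImode SRC gives a
   V8HImode DEST in which the byte 0x80 (-128) becomes 0xff80, while
   zero-extending it gives 0x0080.  Without SSE4.1 the same effect is
   obtained by interleaving SRC with either zero or a mask of its sign
   bits.  */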
4946
4947 void
4948 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
4949 {
4950 machine_mode imode = GET_MODE (src);
4951 rtx tmp;
4952
4953 if (TARGET_SSE4_1)
4954 {
4955 rtx (*unpack)(rtx, rtx);
4956 rtx (*extract)(rtx, rtx) = NULL;
4957 machine_mode halfmode = BLKmode;
4958
4959 switch (imode)
4960 {
4961 case E_V64QImode:
4962 if (unsigned_p)
4963 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
4964 else
4965 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
4966 halfmode = V32QImode;
4967 extract
4968 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
4969 break;
4970 case E_V32QImode:
4971 if (unsigned_p)
4972 unpack = gen_avx2_zero_extendv16qiv16hi2;
4973 else
4974 unpack = gen_avx2_sign_extendv16qiv16hi2;
4975 halfmode = V16QImode;
4976 extract
4977 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
4978 break;
4979 case E_V32HImode:
4980 if (unsigned_p)
4981 unpack = gen_avx512f_zero_extendv16hiv16si2;
4982 else
4983 unpack = gen_avx512f_sign_extendv16hiv16si2;
4984 halfmode = V16HImode;
4985 extract
4986 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
4987 break;
4988 case E_V16HImode:
4989 if (unsigned_p)
4990 unpack = gen_avx2_zero_extendv8hiv8si2;
4991 else
4992 unpack = gen_avx2_sign_extendv8hiv8si2;
4993 halfmode = V8HImode;
4994 extract
4995 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
4996 break;
4997 case E_V16SImode:
4998 if (unsigned_p)
4999 unpack = gen_avx512f_zero_extendv8siv8di2;
5000 else
5001 unpack = gen_avx512f_sign_extendv8siv8di2;
5002 halfmode = V8SImode;
5003 extract
5004 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5005 break;
5006 case E_V8SImode:
5007 if (unsigned_p)
5008 unpack = gen_avx2_zero_extendv4siv4di2;
5009 else
5010 unpack = gen_avx2_sign_extendv4siv4di2;
5011 halfmode = V4SImode;
5012 extract
5013 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5014 break;
5015 case E_V16QImode:
5016 if (unsigned_p)
5017 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5018 else
5019 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5020 break;
5021 case E_V8HImode:
5022 if (unsigned_p)
5023 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5024 else
5025 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5026 break;
5027 case E_V4SImode:
5028 if (unsigned_p)
5029 unpack = gen_sse4_1_zero_extendv2siv2di2;
5030 else
5031 unpack = gen_sse4_1_sign_extendv2siv2di2;
5032 break;
5033 default:
5034 gcc_unreachable ();
5035 }
5036
5037 if (GET_MODE_SIZE (imode) >= 32)
5038 {
5039 tmp = gen_reg_rtx (halfmode);
5040 emit_insn (extract (tmp, src));
5041 }
5042 else if (high_p)
5043 {
5044 /* Shift higher 8 bytes to lower 8 bytes. */
5045 tmp = gen_reg_rtx (V1TImode);
5046 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5047 GEN_INT (64)));
5048 tmp = gen_lowpart (imode, tmp);
5049 }
5050 else
5051 tmp = src;
5052
5053 emit_insn (unpack (dest, tmp));
5054 }
5055 else
5056 {
5057 rtx (*unpack)(rtx, rtx, rtx);
5058
5059 switch (imode)
5060 {
5061 case E_V16QImode:
5062 if (high_p)
5063 unpack = gen_vec_interleave_highv16qi;
5064 else
5065 unpack = gen_vec_interleave_lowv16qi;
5066 break;
5067 case E_V8HImode:
5068 if (high_p)
5069 unpack = gen_vec_interleave_highv8hi;
5070 else
5071 unpack = gen_vec_interleave_lowv8hi;
5072 break;
5073 case E_V4SImode:
5074 if (high_p)
5075 unpack = gen_vec_interleave_highv4si;
5076 else
5077 unpack = gen_vec_interleave_lowv4si;
5078 break;
5079 default:
5080 gcc_unreachable ();
5081 }
5082
5083 if (unsigned_p)
5084 tmp = force_reg (imode, CONST0_RTX (imode));
5085 else
5086 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5087 src, pc_rtx, pc_rtx);
5088
5089 rtx tmp2 = gen_reg_rtx (imode);
5090 emit_insn (unpack (tmp2, src, tmp));
5091 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5092 }
5093 }
5094
5095 /* Split OPERAND into half-mode parts. Similar to split_double_mode,
5096 but works for floating point parameters and non-offsettable memories.
5097 For pushes, it returns just stack offsets; the values will be saved
5098 in the right order. Maximally three parts are generated. */
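/* For example, on a 32-bit target a DFmode operand is split into two
   SImode parts (low word first) and an XFmode operand into three;
   CONST_DOUBLE operands are converted to 32-bit immediates via
   real_to_target.  */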
5099
5100 static int
5101 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
5102 {
5103 int size;
5104
5105 if (!TARGET_64BIT)
5106 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
5107 else
5108 size = (GET_MODE_SIZE (mode) + 4) / 8;
5109
5110 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
5111 gcc_assert (size >= 2 && size <= 4);
5112
5113 /* Optimize constant pool references to immediates. This is used by fp
5114 moves, which force all constants to memory to allow combining. */
5115 if (MEM_P (operand) && MEM_READONLY_P (operand))
5116 operand = avoid_constant_pool_reference (operand);
5117
5118 if (MEM_P (operand) && !offsettable_memref_p (operand))
5119 {
5120 /* The only non-offsettable memories we handle are pushes. */
5121 int ok = push_operand (operand, VOIDmode);
5122
5123 gcc_assert (ok);
5124
5125 operand = copy_rtx (operand);
5126 PUT_MODE (operand, word_mode);
5127 parts[0] = parts[1] = parts[2] = parts[3] = operand;
5128 return size;
5129 }
5130
5131 if (GET_CODE (operand) == CONST_VECTOR)
5132 {
5133 scalar_int_mode imode = int_mode_for_mode (mode).require ();
5134 /* Caution: if we looked through a constant pool memory above,
5135 the operand may actually have a different mode now. That's
5136 ok, since we want to pun this all the way back to an integer. */
5137 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
5138 gcc_assert (operand != NULL);
5139 mode = imode;
5140 }
5141
5142 if (!TARGET_64BIT)
5143 {
5144 if (mode == DImode)
5145 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5146 else
5147 {
5148 int i;
5149
5150 if (REG_P (operand))
5151 {
5152 gcc_assert (reload_completed);
5153 for (i = 0; i < size; i++)
5154 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
5155 }
5156 else if (offsettable_memref_p (operand))
5157 {
5158 operand = adjust_address (operand, SImode, 0);
5159 parts[0] = operand;
5160 for (i = 1; i < size; i++)
5161 parts[i] = adjust_address (operand, SImode, 4 * i);
5162 }
5163 else if (CONST_DOUBLE_P (operand))
5164 {
5165 const REAL_VALUE_TYPE *r;
5166 long l[4];
5167
5168 r = CONST_DOUBLE_REAL_VALUE (operand);
5169 switch (mode)
5170 {
5171 case E_TFmode:
5172 real_to_target (l, r, mode);
5173 parts[3] = gen_int_mode (l[3], SImode);
5174 parts[2] = gen_int_mode (l[2], SImode);
5175 break;
5176 case E_XFmode:
5177 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5178 long double may not be 80-bit. */
5179 real_to_target (l, r, mode);
5180 parts[2] = gen_int_mode (l[2], SImode);
5181 break;
5182 case E_DFmode:
5183 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
5184 break;
5185 default:
5186 gcc_unreachable ();
5187 }
5188 parts[1] = gen_int_mode (l[1], SImode);
5189 parts[0] = gen_int_mode (l[0], SImode);
5190 }
5191 else
5192 gcc_unreachable ();
5193 }
5194 }
5195 else
5196 {
5197 if (mode == TImode)
5198 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
5199 if (mode == XFmode || mode == TFmode)
5200 {
5201 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
5202 if (REG_P (operand))
5203 {
5204 gcc_assert (reload_completed);
5205 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
5206 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
5207 }
5208 else if (offsettable_memref_p (operand))
5209 {
5210 operand = adjust_address (operand, DImode, 0);
5211 parts[0] = operand;
5212 parts[1] = adjust_address (operand, upper_mode, 8);
5213 }
5214 else if (CONST_DOUBLE_P (operand))
5215 {
5216 long l[4];
5217
5218 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
5219
5220 /* real_to_target puts 32-bit pieces in each long. */
5221 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
5222 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
5223 << 32), DImode);
5224
5225 if (upper_mode == SImode)
5226 parts[1] = gen_int_mode (l[2], SImode);
5227 else
5228 parts[1]
5229 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
5230 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
5231 << 32), DImode);
5232 }
5233 else
5234 gcc_unreachable ();
5235 }
5236 }
5237
5238 return size;
5239 }
5240
5241 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5242 All required insns are emitted here. Operands 2-5 receive the
5243 destination parts and operands 6-9 the source parts, in the
5244 correct order. */
5245
5246 void
5247 ix86_split_long_move (rtx operands[])
5248 {
5249 rtx part[2][4];
5250 int nparts, i, j;
5251 int push = 0;
5252 int collisions = 0;
5253 machine_mode mode = GET_MODE (operands[0]);
5254 bool collisionparts[4];
5255
5256 /* The DFmode expanders may ask us to move a double.
5257 For a 64-bit target this is a single move. By hiding that fact
5258 here we simplify the i386.md splitters. */
5259 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
5260 {
5261 /* Optimize constant pool references to immediates. This is used by
5262 fp moves, which force all constants to memory to allow combining. */
5263
5264 if (MEM_P (operands[1])
5265 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
5266 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
5267 operands[1] = get_pool_constant (XEXP (operands[1], 0));
5268 if (push_operand (operands[0], VOIDmode))
5269 {
5270 operands[0] = copy_rtx (operands[0]);
5271 PUT_MODE (operands[0], word_mode);
5272 }
5273 else
5274 operands[0] = gen_lowpart (DImode, operands[0]);
5275 operands[1] = gen_lowpart (DImode, operands[1]);
5276 emit_move_insn (operands[0], operands[1]);
5277 return;
5278 }
5279
5280 /* The only non-offsettable memory we handle is push. */
5281 if (push_operand (operands[0], VOIDmode))
5282 push = 1;
5283 else
5284 gcc_assert (!MEM_P (operands[0])
5285 || offsettable_memref_p (operands[0]));
5286
5287 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
5288 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
5289
5290 /* When emitting a push, be careful with source operands on the stack. */
5291 if (push && MEM_P (operands[1])
5292 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
5293 {
5294 rtx src_base = XEXP (part[1][nparts - 1], 0);
5295
5296 /* Compensate for the stack decrement by 4. */
5297 if (!TARGET_64BIT && nparts == 3
5298 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
5299 src_base = plus_constant (Pmode, src_base, 4);
5300
5301 /* src_base refers to the stack pointer and is
5302 automatically decreased by emitted push. */
5303 for (i = 0; i < nparts; i++)
5304 part[1][i] = change_address (part[1][i],
5305 GET_MODE (part[1][i]), src_base);
5306 }
5307
5308 /* We need to do the copy in the right order in case an address register
5309 of the source overlaps the destination. */
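/* For example, when loading a 64-bit value addressed through %eax into
   the %eax:%edx pair, copying the low part into %eax first would
   clobber the address still needed to read the high part.  */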
5310 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
5311 {
5312 rtx tmp;
5313
5314 for (i = 0; i < nparts; i++)
5315 {
5316 collisionparts[i]
5317 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
5318 if (collisionparts[i])
5319 collisions++;
5320 }
5321
5322 /* Collision in the middle part can be handled by reordering. */
5323 if (collisions == 1 && nparts == 3 && collisionparts [1])
5324 {
5325 std::swap (part[0][1], part[0][2]);
5326 std::swap (part[1][1], part[1][2]);
5327 }
5328 else if (collisions == 1
5329 && nparts == 4
5330 && (collisionparts [1] || collisionparts [2]))
5331 {
5332 if (collisionparts [1])
5333 {
5334 std::swap (part[0][1], part[0][2]);
5335 std::swap (part[1][1], part[1][2]);
5336 }
5337 else
5338 {
5339 std::swap (part[0][2], part[0][3]);
5340 std::swap (part[1][2], part[1][3]);
5341 }
5342 }
5343
5344 /* If there are more collisions, we can't handle them by reordering.
5345 Do an lea to the last part and use only one colliding move. */
5346 else if (collisions > 1)
5347 {
5348 rtx base, addr;
5349
5350 collisions = 1;
5351
5352 base = part[0][nparts - 1];
5353
5354 /* Handle the case when the last part isn't valid for lea.
5355 Happens in 64-bit mode storing the 12-byte XFmode. */
5356 if (GET_MODE (base) != Pmode)
5357 base = gen_rtx_REG (Pmode, REGNO (base));
5358
5359 addr = XEXP (part[1][0], 0);
5360 if (TARGET_TLS_DIRECT_SEG_REFS)
5361 {
5362 struct ix86_address parts;
5363 int ok = ix86_decompose_address (addr, &parts);
5364 gcc_assert (ok);
5365 /* It is not valid to use %gs: or %fs: in lea. */
5366 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
5367 }
5368 emit_insn (gen_rtx_SET (base, addr));
5369 part[1][0] = replace_equiv_address (part[1][0], base);
5370 for (i = 1; i < nparts; i++)
5371 {
5372 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
5373 part[1][i] = replace_equiv_address (part[1][i], tmp);
5374 }
5375 }
5376 }
5377
5378 if (push)
5379 {
5380 if (!TARGET_64BIT)
5381 {
5382 if (nparts == 3)
5383 {
5384 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
5385 emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
5386 emit_move_insn (part[0][2], part[1][2]);
5387 }
5388 else if (nparts == 4)
5389 {
5390 emit_move_insn (part[0][3], part[1][3]);
5391 emit_move_insn (part[0][2], part[1][2]);
5392 }
5393 }
5394 else
5395 {
5396 /* In 64-bit mode we don't have a 32-bit push available. If this is a
5397 register, it is OK - we will just use the larger counterpart. We also
5398 retype the memory - this comes from an attempt to avoid a REX prefix
5399 on moving the second half of a TFmode value. */
5400 if (GET_MODE (part[1][1]) == SImode)
5401 {
5402 switch (GET_CODE (part[1][1]))
5403 {
5404 case MEM:
5405 part[1][1] = adjust_address (part[1][1], DImode, 0);
5406 break;
5407
5408 case REG:
5409 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
5410 break;
5411
5412 default:
5413 gcc_unreachable ();
5414 }
5415
5416 if (GET_MODE (part[1][0]) == SImode)
5417 part[1][0] = part[1][1];
5418 }
5419 }
5420 emit_move_insn (part[0][1], part[1][1]);
5421 emit_move_insn (part[0][0], part[1][0]);
5422 return;
5423 }
5424
5425 /* Choose correct order to not overwrite the source before it is copied. */
5426 if ((REG_P (part[0][0])
5427 && REG_P (part[1][1])
5428 && (REGNO (part[0][0]) == REGNO (part[1][1])
5429 || (nparts == 3
5430 && REGNO (part[0][0]) == REGNO (part[1][2]))
5431 || (nparts == 4
5432 && REGNO (part[0][0]) == REGNO (part[1][3]))))
5433 || (collisions > 0
5434 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
5435 {
5436 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
5437 {
5438 operands[2 + i] = part[0][j];
5439 operands[6 + i] = part[1][j];
5440 }
5441 }
5442 else
5443 {
5444 for (i = 0; i < nparts; i++)
5445 {
5446 operands[2 + i] = part[0][i];
5447 operands[6 + i] = part[1][i];
5448 }
5449 }
5450
5451 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5452 if (optimize_insn_for_size_p ())
5453 {
5454 for (j = 0; j < nparts - 1; j++)
5455 if (CONST_INT_P (operands[6 + j])
5456 && operands[6 + j] != const0_rtx
5457 && REG_P (operands[2 + j]))
5458 for (i = j; i < nparts - 1; i++)
5459 if (CONST_INT_P (operands[7 + i])
5460 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
5461 operands[7 + i] = operands[2 + j];
5462 }
5463
5464 for (i = 0; i < nparts; i++)
5465 emit_move_insn (operands[2 + i], operands[6 + i]);
5466
5467 return;
5468 }
5469
5470 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5471 left shift by a constant, either using a single shift or
5472 a sequence of add instructions. */
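/* For example, x << 2 can be emitted as two "add x, x" instructions,
   each doubling x; that path is taken when COUNT additions cost no
   more than a single shift by a constant and we are not optimizing
   for size.  */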
5473
5474 static void
5475 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5476 {
5477 if (count == 1
5478 || (count * ix86_cost->add <= ix86_cost->shift_const
5479 && !optimize_insn_for_size_p ()))
5480 {
5481 while (count-- > 0)
5482 emit_insn (gen_add2_insn (operand, operand));
5483 }
5484 else
5485 {
5486 rtx (*insn)(rtx, rtx, rtx);
5487
5488 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5489 emit_insn (insn (operand, operand, GEN_INT (count)));
5490 }
5491 }
5492
5493 void
5494 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
5495 {
5496 rtx (*gen_ashl3)(rtx, rtx, rtx);
5497 rtx (*gen_shld)(rtx, rtx, rtx);
5498 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5499 machine_mode half_mode;
5500
5501 rtx low[2], high[2];
5502 int count;
5503
5504 if (CONST_INT_P (operands[2]))
5505 {
5506 split_double_mode (mode, operands, 2, low, high);
5507 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5508
5509 if (count >= half_width)
5510 {
5511 emit_move_insn (high[0], low[1]);
5512 emit_move_insn (low[0], const0_rtx);
5513
5514 if (count > half_width)
5515 ix86_expand_ashl_const (high[0], count - half_width, mode);
5516 }
5517 else
5518 {
5519 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5520
5521 if (!rtx_equal_p (operands[0], operands[1]))
5522 emit_move_insn (operands[0], operands[1]);
5523
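/* shld computes high = (high << count) | (low >> (half_width - count)),
   i.e. the bits shifted out of the low half are shifted into the
   bottom of the high half.  */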
5524 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
5525 ix86_expand_ashl_const (low[0], count, mode);
5526 }
5527 return;
5528 }
5529
5530 split_double_mode (mode, operands, 1, low, high);
5531 half_mode = mode == DImode ? SImode : DImode;
5532
5533 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
5534
5535 if (operands[1] == const1_rtx)
5536 {
5537 /* Assuming we've chosen QImode-capable registers, 1 << N
5538 can be done with two 32/64-bit shifts, no branches, no cmoves. */
5539 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
5540 {
5541 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
5542
5543 ix86_expand_clear (low[0]);
5544 ix86_expand_clear (high[0]);
5545 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
5546
5547 d = gen_lowpart (QImode, low[0]);
5548 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5549 s = gen_rtx_EQ (QImode, flags, const0_rtx);
5550 emit_insn (gen_rtx_SET (d, s));
5551
5552 d = gen_lowpart (QImode, high[0]);
5553 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
5554 s = gen_rtx_NE (QImode, flags, const0_rtx);
5555 emit_insn (gen_rtx_SET (d, s));
5556 }
5557
5558 /* Otherwise, we can get the same results by manually performing
5559 a bit extract operation on bit 5/6, and then performing the two
5560 shifts. The two methods of getting 0/1 into low/high are exactly
5561 the same size. Avoiding the shift in the bit extract case helps
5562 pentium4 a bit; no one else seems to care much either way. */
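/* Concretely, when splitting a 64-bit shift into 32-bit halves, bit 5
   of the count selects the destination half: high = (count >> 5) & 1,
   low = high ^ 1.  The final 32-bit shifts, whose count the hardware
   takes modulo 32, then place the single set bit correctly for both
   count < 32 and count >= 32.  */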
5563 else
5564 {
5565 rtx (*gen_lshr3)(rtx, rtx, rtx);
5566 rtx (*gen_and3)(rtx, rtx, rtx);
5567 rtx (*gen_xor3)(rtx, rtx, rtx);
5568 HOST_WIDE_INT bits;
5569 rtx x;
5570
5571 if (mode == DImode)
5572 {
5573 gen_lshr3 = gen_lshrsi3;
5574 gen_and3 = gen_andsi3;
5575 gen_xor3 = gen_xorsi3;
5576 bits = 5;
5577 }
5578 else
5579 {
5580 gen_lshr3 = gen_lshrdi3;
5581 gen_and3 = gen_anddi3;
5582 gen_xor3 = gen_xordi3;
5583 bits = 6;
5584 }
5585
5586 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
5587 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
5588 else
5589 x = gen_lowpart (half_mode, operands[2]);
5590 emit_insn (gen_rtx_SET (high[0], x));
5591
5592 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
5593 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
5594 emit_move_insn (low[0], high[0]);
5595 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
5596 }
5597
5598 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5599 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
5600 return;
5601 }
5602
5603 if (operands[1] == constm1_rtx)
5604 {
5605 /* For -1 << N, we can avoid the shld instruction, because we
5606 know that we're shifting 0...31/63 ones into a -1. */
5607 emit_move_insn (low[0], constm1_rtx);
5608 if (optimize_insn_for_size_p ())
5609 emit_move_insn (high[0], low[0]);
5610 else
5611 emit_move_insn (high[0], constm1_rtx);
5612 }
5613 else
5614 {
5615 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
5616
5617 if (!rtx_equal_p (operands[0], operands[1]))
5618 emit_move_insn (operands[0], operands[1]);
5619
5620 split_double_mode (mode, operands, 1, low, high);
5621 emit_insn (gen_shld (high[0], low[0], operands[2]));
5622 }
5623
5624 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
5625
5626 if (TARGET_CMOVE && scratch)
5627 {
5628 ix86_expand_clear (scratch);
5629 emit_insn (gen_x86_shift_adj_1
5630 (half_mode, high[0], low[0], operands[2], scratch));
5631 }
5632 else
5633 emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
5634 }
5635
5636 void
5637 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
5638 {
5639 rtx (*gen_ashr3)(rtx, rtx, rtx)
5640 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
5641 rtx (*gen_shrd)(rtx, rtx, rtx);
5642 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5643
5644 rtx low[2], high[2];
5645 int count;
5646
5647 if (CONST_INT_P (operands[2]))
5648 {
5649 split_double_mode (mode, operands, 2, low, high);
5650 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5651
5652 if (count == GET_MODE_BITSIZE (mode) - 1)
5653 {
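/* Shifting right arithmetically by the full width minus one leaves
   every bit equal to the original sign bit, so both result halves
   are just the high input half shifted arithmetically by
   half_width - 1.  */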
5654 emit_move_insn (high[0], high[1]);
5655 emit_insn (gen_ashr3 (high[0], high[0],
5656 GEN_INT (half_width - 1)));
5657 emit_move_insn (low[0], high[0]);
5658
5659 }
5660 else if (count >= half_width)
5661 {
5662 emit_move_insn (low[0], high[1]);
5663 emit_move_insn (high[0], low[0]);
5664 emit_insn (gen_ashr3 (high[0], high[0],
5665 GEN_INT (half_width - 1)));
5666
5667 if (count > half_width)
5668 emit_insn (gen_ashr3 (low[0], low[0],
5669 GEN_INT (count - half_width)));
5670 }
5671 else
5672 {
5673 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5674
5675 if (!rtx_equal_p (operands[0], operands[1]))
5676 emit_move_insn (operands[0], operands[1]);
5677
5678 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5679 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
5680 }
5681 }
5682 else
5683 {
5684 machine_mode half_mode;
5685
5686 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5687
5688 if (!rtx_equal_p (operands[0], operands[1]))
5689 emit_move_insn (operands[0], operands[1]);
5690
5691 split_double_mode (mode, operands, 1, low, high);
5692 half_mode = mode == DImode ? SImode : DImode;
5693
5694 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5695 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
5696
5697 if (TARGET_CMOVE && scratch)
5698 {
5699 emit_move_insn (scratch, high[0]);
5700 emit_insn (gen_ashr3 (scratch, scratch,
5701 GEN_INT (half_width - 1)));
5702 emit_insn (gen_x86_shift_adj_1
5703 (half_mode, low[0], high[0], operands[2], scratch));
5704 }
5705 else
5706 emit_insn (gen_x86_shift_adj_3
5707 (half_mode, low[0], high[0], operands[2]));
5708 }
5709 }
5710
5711 void
5712 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
5713 {
5714 rtx (*gen_lshr3)(rtx, rtx, rtx)
5715 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
5716 rtx (*gen_shrd)(rtx, rtx, rtx);
5717 int half_width = GET_MODE_BITSIZE (mode) >> 1;
5718
5719 rtx low[2], high[2];
5720 int count;
5721
5722 if (CONST_INT_P (operands[2]))
5723 {
5724 split_double_mode (mode, operands, 2, low, high);
5725 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
5726
5727 if (count >= half_width)
5728 {
5729 emit_move_insn (low[0], high[1]);
5730 ix86_expand_clear (high[0]);
5731
5732 if (count > half_width)
5733 emit_insn (gen_lshr3 (low[0], low[0],
5734 GEN_INT (count - half_width)));
5735 }
5736 else
5737 {
5738 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5739
5740 if (!rtx_equal_p (operands[0], operands[1]))
5741 emit_move_insn (operands[0], operands[1]);
5742
5743 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
5744 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
5745 }
5746 }
5747 else
5748 {
5749 machine_mode half_mode;
5750
5751 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
5752
5753 if (!rtx_equal_p (operands[0], operands[1]))
5754 emit_move_insn (operands[0], operands[1]);
5755
5756 split_double_mode (mode, operands, 1, low, high);
5757 half_mode = mode == DImode ? SImode : DImode;
5758
5759 emit_insn (gen_shrd (low[0], high[0], operands[2]));
5760 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
5761
5762 if (TARGET_CMOVE && scratch)
5763 {
5764 ix86_expand_clear (scratch);
5765 emit_insn (gen_x86_shift_adj_1
5766 (half_mode, low[0], high[0], operands[2], scratch));
5767 }
5768 else
5769 emit_insn (gen_x86_shift_adj_2
5770 (half_mode, low[0], high[0], operands[2]));
5771 }
5772 }
5773
5774 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5775 DImode for constant loop counts. */
5776
5777 static machine_mode
5778 counter_mode (rtx count_exp)
5779 {
5780 if (GET_MODE (count_exp) != VOIDmode)
5781 return GET_MODE (count_exp);
5782 if (!CONST_INT_P (count_exp))
5783 return Pmode;
5784 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
5785 return DImode;
5786 return SImode;
5787 }
5788
5789 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
5790 SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times; the overall
5791 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
5792 equivalent loop to set memory to VALUE (assumed to be in MODE).
5793
5794 The size is rounded down to a whole number of the chunk size moved at once.
5795 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
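/* Roughly, the emitted code has the following shape (illustrative
   pseudo-code; the function below builds the equivalent RTL):

     size = count & ~(piece_size - 1);
     iter = 0;
     do
       {
         copy (or store VALUE into) piece_size bytes at DESTPTR + iter,
           reading from SRCPTR + iter when copying;
         iter += piece_size;
       }
     while (iter < size);
     DESTPTR += size;  and SRCPTR += size when copying.  */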
5796
5797
5798 static void
5799 expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
5800 rtx destptr, rtx srcptr, rtx value,
5801 rtx count, machine_mode mode, int unroll,
5802 int expected_size, bool issetmem)
5803 {
5804 rtx_code_label *out_label, *top_label;
5805 rtx iter, tmp;
5806 machine_mode iter_mode = counter_mode (count);
5807 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
5808 rtx piece_size = GEN_INT (piece_size_n);
5809 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
5810 rtx size;
5811 int i;
5812
5813 top_label = gen_label_rtx ();
5814 out_label = gen_label_rtx ();
5815 iter = gen_reg_rtx (iter_mode);
5816
5817 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
5818 NULL, 1, OPTAB_DIRECT);
5819 /* Those two should combine. */
5820 if (piece_size == const1_rtx)
5821 {
5822 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
5823 true, out_label);
5824 predict_jump (REG_BR_PROB_BASE * 10 / 100);
5825 }
5826 emit_move_insn (iter, const0_rtx);
5827
5828 emit_label (top_label);
5829
5830 tmp = convert_modes (Pmode, iter_mode, iter, true);
5831
5832 /* This assert could be relaxed - in that case we'll need to compute
5833 the smallest power of two containing PIECE_SIZE_N and pass it to
5834 offset_address. */
5835 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
5836 destmem = offset_address (destmem, tmp, piece_size_n);
5837 destmem = adjust_address (destmem, mode, 0);
5838
5839 if (!issetmem)
5840 {
5841 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
5842 srcmem = adjust_address (srcmem, mode, 0);
5843
5844 /* When unrolling for chips that reorder memory reads and writes,
5845 we can save registers by using a single temporary.
5846 Also, using 4 temporaries is overkill in 32-bit mode. */
5847 if (!TARGET_64BIT && 0)
5848 {
5849 for (i = 0; i < unroll; i++)
5850 {
5851 if (i)
5852 {
5853 destmem = adjust_address (copy_rtx (destmem), mode,
5854 GET_MODE_SIZE (mode));
5855 srcmem = adjust_address (copy_rtx (srcmem), mode,
5856 GET_MODE_SIZE (mode));
5857 }
5858 emit_move_insn (destmem, srcmem);
5859 }
5860 }
5861 else
5862 {
5863 rtx tmpreg[4];
5864 gcc_assert (unroll <= 4);
5865 for (i = 0; i < unroll; i++)
5866 {
5867 tmpreg[i] = gen_reg_rtx (mode);
5868 if (i)
5869 srcmem = adjust_address (copy_rtx (srcmem), mode,
5870 GET_MODE_SIZE (mode));
5871 emit_move_insn (tmpreg[i], srcmem);
5872 }
5873 for (i = 0; i < unroll; i++)
5874 {
5875 if (i)
5876 destmem = adjust_address (copy_rtx (destmem), mode,
5877 GET_MODE_SIZE (mode));
5878 emit_move_insn (destmem, tmpreg[i]);
5879 }
5880 }
5881 }
5882 else
5883 for (i = 0; i < unroll; i++)
5884 {
5885 if (i)
5886 destmem = adjust_address (copy_rtx (destmem), mode,
5887 GET_MODE_SIZE (mode));
5888 emit_move_insn (destmem, value);
5889 }
5890
5891 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
5892 true, OPTAB_LIB_WIDEN);
5893 if (tmp != iter)
5894 emit_move_insn (iter, tmp);
5895
5896 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
5897 true, top_label);
5898 if (expected_size != -1)
5899 {
5900 expected_size /= GET_MODE_SIZE (mode) * unroll;
5901 if (expected_size == 0)
5902 predict_jump (0);
5903 else if (expected_size > REG_BR_PROB_BASE)
5904 predict_jump (REG_BR_PROB_BASE - 1);
5905 else
5906 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
5907 / expected_size);
5908 }
5909 else
5910 predict_jump (REG_BR_PROB_BASE * 80 / 100);
5911 iter = ix86_zero_extend_to_Pmode (iter);
5912 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
5913 true, OPTAB_LIB_WIDEN);
5914 if (tmp != destptr)
5915 emit_move_insn (destptr, tmp);
5916 if (!issetmem)
5917 {
5918 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
5919 true, OPTAB_LIB_WIDEN);
5920 if (tmp != srcptr)
5921 emit_move_insn (srcptr, tmp);
5922 }
5923 emit_label (out_label);
5924 }
5925
5926 /* Divide COUNTREG by SCALE. */
5927 static rtx
5928 scale_counter (rtx countreg, int scale)
5929 {
5930 rtx sc;
5931
5932 if (scale == 1)
5933 return countreg;
5934 if (CONST_INT_P (countreg))
5935 return GEN_INT (INTVAL (countreg) / scale);
5936 gcc_assert (REG_P (countreg));
5937
5938 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
5939 GEN_INT (exact_log2 (scale)),
5940 NULL, 1, OPTAB_DIRECT);
5941 return sc;
5942 }
5943
5944 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5945 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5946 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5947 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
5948 ORIG_VALUE is the original value passed to memset to fill the memory with.
5949 Other arguments have the same meaning as for the previous function. */
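/* For example, when MODE is SImode the count register holds COUNT / 4,
   and DESTEXP below is DESTPTR + (COUNTREG << 2), which the rep
   patterns use as the final value of the destination pointer.  */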
5950
5951 static void
5952 expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
5953 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
5954 rtx count,
5955 machine_mode mode, bool issetmem)
5956 {
5957 rtx destexp;
5958 rtx srcexp;
5959 rtx countreg;
5960 HOST_WIDE_INT rounded_count;
5961
5962 /* If possible, it is shorter to use rep movs.
5963 TODO: Maybe it is better to move this logic to decide_alg. */
5964 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
5965 && (!issetmem || orig_value == const0_rtx))
5966 mode = SImode;
5967
5968 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
5969 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
5970
5971 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
5972 GET_MODE_SIZE (mode)));
5973 if (mode != QImode)
5974 {
5975 destexp = gen_rtx_ASHIFT (Pmode, countreg,
5976 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
5977 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
5978 }
5979 else
5980 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
5981 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
5982 {
5983 rounded_count
5984 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
5985 destmem = shallow_copy_rtx (destmem);
5986 set_mem_size (destmem, rounded_count);
5987 }
5988 else if (MEM_SIZE_KNOWN_P (destmem))
5989 clear_mem_size (destmem);
5990
5991 if (issetmem)
5992 {
5993 value = force_reg (mode, gen_lowpart (mode, value));
5994 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
5995 }
5996 else
5997 {
5998 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
5999 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
6000 if (mode != QImode)
6001 {
6002 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
6003 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
6004 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
6005 }
6006 else
6007 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
6008 if (CONST_INT_P (count))
6009 {
6010 rounded_count
6011 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
6012 srcmem = shallow_copy_rtx (srcmem);
6013 set_mem_size (srcmem, rounded_count);
6014 }
6015 else
6016 {
6017 if (MEM_SIZE_KNOWN_P (srcmem))
6018 clear_mem_size (srcmem);
6019 }
6020 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
6021 destexp, srcexp));
6022 }
6023 }
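/* Rough sketch of the setmem case above (illustration only, not literal
   output): entered with QImode, a constant COUNT divisible by 4 and
   ORIG_VALUE == 0, the code switches to SImode and the emitted RTL
   behaves like

     ecx = COUNT / 4          (scaled counter)
     eax = 0                  (promoted value)
     rep stosd                (edi advances by 4 * ecx)

   The cpymem case is analogous, using rep movs with both SRCEXP and
   DESTEXP describing the final pointer values.  */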
6024
6025 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6026 DESTMEM.
6027 SRCMEM is passed by pointer so it can be updated on return.
6028 Return value is the updated DESTMEM. */
6029 static rtx
6030 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
6031 HOST_WIDE_INT size_to_move)
6032 {
6033 rtx dst = destmem, src = *srcmem, adjust, tempreg;
6034 enum insn_code code;
6035 machine_mode move_mode;
6036 int piece_size, i;
6037
6038 /* Find the widest mode in which we could perform moves.
6039 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and
6040 halve it until a move of that size is supported. */
6041 piece_size = 1 << floor_log2 (size_to_move);
6042 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
6043 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6044 {
6045 gcc_assert (piece_size > 1);
6046 piece_size >>= 1;
6047 }
6048
6049 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6050 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6051 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
6052 {
6053 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
6054 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
6055 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
6056 {
6057 move_mode = word_mode;
6058 piece_size = GET_MODE_SIZE (move_mode);
6059 code = optab_handler (mov_optab, move_mode);
6060 }
6061 }
6062 gcc_assert (code != CODE_FOR_nothing);
6063
6064 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6065 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
6066
6067 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6068 gcc_assert (size_to_move % piece_size == 0);
6069 adjust = GEN_INT (piece_size);
6070 for (i = 0; i < size_to_move; i += piece_size)
6071 {
6072 /* We move from memory to memory, so we'll need to do it via
6073 a temporary register. */
6074 tempreg = gen_reg_rtx (move_mode);
6075 emit_insn (GEN_FCN (code) (tempreg, src));
6076 emit_insn (GEN_FCN (code) (dst, tempreg));
6077
6078 emit_move_insn (destptr,
6079 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6080 emit_move_insn (srcptr,
6081 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
6082
6083 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6084 piece_size);
6085 src = adjust_automodify_address_nv (src, move_mode, srcptr,
6086 piece_size);
6087 }
6088
6089 /* Update DST and SRC rtx. */
6090 *srcmem = src;
6091 return dst;
6092 }
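/* Worked example (assuming x86-64 with SSE): for SIZE_TO_MOVE == 16 the
   code above picks piece_size == 16, TImode as the integer mode and then
   V2DImode as the matching vector mode, so a single iteration emits one
   16-byte load into a temporary register, one 16-byte store, and advances
   both pointers by 16.  Without a usable wide move it falls back to
   word_mode and emits SIZE_TO_MOVE / UNITS_PER_WORD such pairs.  */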
6093
6094 /* Helper function for the string operations below. Test whether the
6095 VALUE bit of VARIABLE is clear; if so, jump to the returned label. */
6096
6097 static rtx_code_label *
6098 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
6099 {
6100 rtx_code_label *label = gen_label_rtx ();
6101 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
6102 if (GET_MODE (variable) == DImode)
6103 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
6104 else
6105 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
6106 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
6107 1, label);
6108 if (epilogue)
6109 predict_jump (REG_BR_PROB_BASE * 50 / 100);
6110 else
6111 predict_jump (REG_BR_PROB_BASE * 90 / 100);
6112 return label;
6113 }
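/* Illustratively, ix86_expand_aligntest (count, 4, true) emits roughly
   "test $4, count; je label" and returns the label, so the caller can
   emit a 4-byte operation and then place the label right after it.  */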
6114
6115
6116 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6117
6118 static void
6119 expand_cpymem_epilogue (rtx destmem, rtx srcmem,
6120 rtx destptr, rtx srcptr, rtx count, int max_size)
6121 {
6122 rtx src, dest;
6123 if (CONST_INT_P (count))
6124 {
6125 HOST_WIDE_INT countval = INTVAL (count);
6126 HOST_WIDE_INT epilogue_size = countval % max_size;
6127 int i;
6128
6129 /* For now MAX_SIZE should be a power of 2. This assert could be
6130 relaxed, but it would require a somewhat more complicated epilogue
6131 expansion. */
6132 gcc_assert ((max_size & (max_size - 1)) == 0);
6133 for (i = max_size; i >= 1; i >>= 1)
6134 {
6135 if (epilogue_size & i)
6136 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6137 }
6138 return;
6139 }
6140 if (max_size > 8)
6141 {
6142 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
6143 count, 1, OPTAB_DIRECT);
6144 expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
6145 count, QImode, 1, 4, false);
6146 return;
6147 }
6148
6149 /* When single stringop instructions are available (TARGET_SINGLE_STRINGOP),
6150 we can cheaply advance the dest and src pointers. Otherwise we save
6151 code size by maintaining an offset (zero is readily available from the
6152 preceding rep operation) and using x86 addressing modes. */
6153 if (TARGET_SINGLE_STRINGOP)
6154 {
6155 if (max_size > 4)
6156 {
6157 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6158 src = change_address (srcmem, SImode, srcptr);
6159 dest = change_address (destmem, SImode, destptr);
6160 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6161 emit_label (label);
6162 LABEL_NUSES (label) = 1;
6163 }
6164 if (max_size > 2)
6165 {
6166 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6167 src = change_address (srcmem, HImode, srcptr);
6168 dest = change_address (destmem, HImode, destptr);
6169 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6170 emit_label (label);
6171 LABEL_NUSES (label) = 1;
6172 }
6173 if (max_size > 1)
6174 {
6175 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6176 src = change_address (srcmem, QImode, srcptr);
6177 dest = change_address (destmem, QImode, destptr);
6178 emit_insn (gen_strmov (destptr, dest, srcptr, src));
6179 emit_label (label);
6180 LABEL_NUSES (label) = 1;
6181 }
6182 }
6183 else
6184 {
6185 rtx offset = force_reg (Pmode, const0_rtx);
6186 rtx tmp;
6187
6188 if (max_size > 4)
6189 {
6190 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6191 src = change_address (srcmem, SImode, srcptr);
6192 dest = change_address (destmem, SImode, destptr);
6193 emit_move_insn (dest, src);
6194 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
6195 true, OPTAB_LIB_WIDEN);
6196 if (tmp != offset)
6197 emit_move_insn (offset, tmp);
6198 emit_label (label);
6199 LABEL_NUSES (label) = 1;
6200 }
6201 if (max_size > 2)
6202 {
6203 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6204 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6205 src = change_address (srcmem, HImode, tmp);
6206 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6207 dest = change_address (destmem, HImode, tmp);
6208 emit_move_insn (dest, src);
6209 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
6210 true, OPTAB_LIB_WIDEN);
6211 if (tmp != offset)
6212 emit_move_insn (offset, tmp);
6213 emit_label (label);
6214 LABEL_NUSES (label) = 1;
6215 }
6216 if (max_size > 1)
6217 {
6218 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6219 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
6220 src = change_address (srcmem, QImode, tmp);
6221 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
6222 dest = change_address (destmem, QImode, tmp);
6223 emit_move_insn (dest, src);
6224 emit_label (label);
6225 LABEL_NUSES (label) = 1;
6226 }
6227 }
6228 }
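/* Worked example of the constant-count path above: with MAX_SIZE == 16
   and COUNT % 16 == 7 the bits 4, 2 and 1 are set, so emit_memmov is
   called for 4, then 2, then 1 bytes, i.e. three moves cover the 7-byte
   tail.  Purely illustrative; the actual modes are chosen by
   emit_memmov.  */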
6229
6230 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6231 with value PROMOTED_VAL. DESTPTR is the register pointing at the current
6232 store position; it is advanced as the stores are emitted.
6233 Return value is the updated DESTMEM. */
6234 static rtx
6235 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
6236 HOST_WIDE_INT size_to_move)
6237 {
6238 rtx dst = destmem, adjust;
6239 enum insn_code code;
6240 machine_mode move_mode;
6241 int piece_size, i;
6242
6243 /* Pick the mode to store in: the mode of PROMOTED_VAL, narrowed to an
6244 integer mode of SIZE_TO_MOVE bytes when SIZE_TO_MOVE is smaller than
6245 that mode. */
6246 move_mode = GET_MODE (promoted_val);
6247 if (move_mode == VOIDmode)
6248 move_mode = QImode;
6249 if (size_to_move < GET_MODE_SIZE (move_mode))
6250 {
6251 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
6252 move_mode = int_mode_for_size (move_bits, 0).require ();
6253 promoted_val = gen_lowpart (move_mode, promoted_val);
6254 }
6255 piece_size = GET_MODE_SIZE (move_mode);
6256 code = optab_handler (mov_optab, move_mode);
6257 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
6258
6259 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
6260
6261 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
6262 gcc_assert (size_to_move % piece_size == 0);
6263 adjust = GEN_INT (piece_size);
6264 for (i = 0; i < size_to_move; i += piece_size)
6265 {
6266 if (piece_size <= GET_MODE_SIZE (word_mode))
6267 {
6268 emit_insn (gen_strset (destptr, dst, promoted_val));
6269 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6270 piece_size);
6271 continue;
6272 }
6273
6274 emit_insn (GEN_FCN (code) (dst, promoted_val));
6275
6276 emit_move_insn (destptr,
6277 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
6278
6279 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
6280 piece_size);
6281 }
6282
6283 /* Update DST rtx. */
6284 return dst;
6285 }
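/* Sketch of emit_memset (assuming 64-bit code, PROMOTED_VAL in DImode and
   SIZE_TO_MOVE == 16): piece_size is 8, so two strset-style stores are
   emitted, each storing PROMOTED_VAL and advancing DESTPTR by 8.  For a
   vector PROMOTED_VAL wider than word_mode the plain-move path with an
   explicit pointer increment is used instead.  */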
6286 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6287 static void
6288 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
6289 rtx count, int max_size)
6290 {
6291 count = expand_simple_binop (counter_mode (count), AND, count,
6292 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
6293 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
6294 gen_lowpart (QImode, value), count, QImode,
6295 1, max_size / 2, true);
6296 }
6297
6298 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
6299 static void
6300 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
6301 rtx count, int max_size)
6302 {
6303 rtx dest;
6304
6305 if (CONST_INT_P (count))
6306 {
6307 HOST_WIDE_INT countval = INTVAL (count);
6308 HOST_WIDE_INT epilogue_size = countval % max_size;
6309 int i;
6310
6311 /* For now MAX_SIZE should be a power of 2. This assert could be
6312 relaxed, but it would require a somewhat more complicated epilogue
6313 expansion. */
6314 gcc_assert ((max_size & (max_size - 1)) == 0);
6315 for (i = max_size; i >= 1; i >>= 1)
6316 {
6317 if (epilogue_size & i)
6318 {
6319 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6320 destmem = emit_memset (destmem, destptr, vec_value, i);
6321 else
6322 destmem = emit_memset (destmem, destptr, value, i);
6323 }
6324 }
6325 return;
6326 }
6327 if (max_size > 32)
6328 {
6329 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
6330 return;
6331 }
6332 if (max_size > 16)
6333 {
6334 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
6335 if (TARGET_64BIT)
6336 {
6337 dest = change_address (destmem, DImode, destptr);
6338 emit_insn (gen_strset (destptr, dest, value));
6339 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
6340 emit_insn (gen_strset (destptr, dest, value));
6341 }
6342 else
6343 {
6344 dest = change_address (destmem, SImode, destptr);
6345 emit_insn (gen_strset (destptr, dest, value));
6346 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6347 emit_insn (gen_strset (destptr, dest, value));
6348 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
6349 emit_insn (gen_strset (destptr, dest, value));
6350 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
6351 emit_insn (gen_strset (destptr, dest, value));
6352 }
6353 emit_label (label);
6354 LABEL_NUSES (label) = 1;
6355 }
6356 if (max_size > 8)
6357 {
6358 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
6359 if (TARGET_64BIT)
6360 {
6361 dest = change_address (destmem, DImode, destptr);
6362 emit_insn (gen_strset (destptr, dest, value));
6363 }
6364 else
6365 {
6366 dest = change_address (destmem, SImode, destptr);
6367 emit_insn (gen_strset (destptr, dest, value));
6368 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
6369 emit_insn (gen_strset (destptr, dest, value));
6370 }
6371 emit_label (label);
6372 LABEL_NUSES (label) = 1;
6373 }
6374 if (max_size > 4)
6375 {
6376 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
6377 dest = change_address (destmem, SImode, destptr);
6378 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
6379 emit_label (label);
6380 LABEL_NUSES (label) = 1;
6381 }
6382 if (max_size > 2)
6383 {
6384 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
6385 dest = change_address (destmem, HImode, destptr);
6386 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
6387 emit_label (label);
6388 LABEL_NUSES (label) = 1;
6389 }
6390 if (max_size > 1)
6391 {
6392 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
6393 dest = change_address (destmem, QImode, destptr);
6394 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
6395 emit_label (label);
6396 LABEL_NUSES (label) = 1;
6397 }
6398 }
6399
6400 /* Decrease COUNTREG by VALUE. */
6401 static void
6402 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
6403 {
6404 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
6405 }
6406
6407 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6408 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6409 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6410 ignored.
6411 Return value is updated DESTMEM. */
6412
6413 static rtx
6414 expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
6415 rtx destptr, rtx srcptr, rtx value,
6416 rtx vec_value, rtx count, int align,
6417 int desired_alignment, bool issetmem)
6418 {
6419 int i;
6420 for (i = 1; i < desired_alignment; i <<= 1)
6421 {
6422 if (align <= i)
6423 {
6424 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
6425 if (issetmem)
6426 {
6427 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
6428 destmem = emit_memset (destmem, destptr, vec_value, i);
6429 else
6430 destmem = emit_memset (destmem, destptr, value, i);
6431 }
6432 else
6433 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
6434 ix86_adjust_counter (count, i);
6435 emit_label (label);
6436 LABEL_NUSES (label) = 1;
6437 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
6438 }
6439 }
6440 return destmem;
6441 }
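/* Illustrative shape of the prologue above for ALIGN == 1 and
   DESIRED_ALIGN == 8 in the copy case:

     if (destptr & 1) { copy 1 byte; count -= 1; }
     if (destptr & 2) { copy 2 bytes; count -= 2; }
     if (destptr & 4) { copy 4 bytes; count -= 4; }

   Each test is emitted via ix86_expand_aligntest, so the moves are in
   fact skipped by a conditional jump rather than guarded by an if.  */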
6442
6443 /* Test if COUNT&SIZE is nonzero and if so, expand cpymem
6444 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6445 and jump to DONE_LABEL. */
6446 static void
6447 expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
6448 rtx destptr, rtx srcptr,
6449 rtx value, rtx vec_value,
6450 rtx count, int size,
6451 rtx done_label, bool issetmem)
6452 {
6453 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
6454 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
6455 rtx modesize;
6456 int n;
6457
6458 /* If we do not have vector value to copy, we must reduce size. */
6459 if (issetmem)
6460 {
6461 if (!vec_value)
6462 {
6463 if (GET_MODE (value) == VOIDmode && size > 8)
6464 mode = Pmode;
6465 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
6466 mode = GET_MODE (value);
6467 }
6468 else
6469 mode = GET_MODE (vec_value), value = vec_value;
6470 }
6471 else
6472 {
6473 /* Choose appropriate vector mode. */
6474 if (size >= 32)
6475 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
6476 else if (size >= 16)
6477 mode = TARGET_SSE ? V16QImode : DImode;
6478 srcmem = change_address (srcmem, mode, srcptr);
6479 }
6480 destmem = change_address (destmem, mode, destptr);
6481 modesize = GEN_INT (GET_MODE_SIZE (mode));
6482 gcc_assert (GET_MODE_SIZE (mode) <= size);
6483 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6484 {
6485 if (issetmem)
6486 emit_move_insn (destmem, gen_lowpart (mode, value));
6487 else
6488 {
6489 emit_move_insn (destmem, srcmem);
6490 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6491 }
6492 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6493 }
6494
6495 destmem = offset_address (destmem, count, 1);
6496 destmem = offset_address (destmem, GEN_INT (-2 * size),
6497 GET_MODE_SIZE (mode));
6498 if (!issetmem)
6499 {
6500 srcmem = offset_address (srcmem, count, 1);
6501 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
6502 GET_MODE_SIZE (mode));
6503 }
6504 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
6505 {
6506 if (issetmem)
6507 emit_move_insn (destmem, gen_lowpart (mode, value));
6508 else
6509 {
6510 emit_move_insn (destmem, srcmem);
6511 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6512 }
6513 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6514 }
6515 emit_jump_insn (gen_jump (done_label));
6516 emit_barrier ();
6517
6518 emit_label (label);
6519 LABEL_NUSES (label) = 1;
6520 }
6521
6522 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
6523 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
6524 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
6525 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
6526 DONE_LABEL is a label after the whole copying sequence. The label is created
6527 on demand if *DONE_LABEL is NULL.
6528 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
6529 bounds after the initial copies.
6530
6531 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6532 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
6533 we will dispatch to a library call for large blocks.
6534
6535 In pseudocode we do:
6536
6537 if (COUNT < SIZE)
6538 {
6539 Assume that SIZE is 4. Bigger sizes are handled analogously
6540 if (COUNT & 4)
6541 {
6542 copy 4 bytes from SRCPTR to DESTPTR
6543 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6544 goto done_label
6545 }
6546 if (!COUNT)
6547 goto done_label;
6548 copy 1 byte from SRCPTR to DESTPTR
6549 if (COUNT & 2)
6550 {
6551 copy 2 bytes from SRCPTR to DESTPTR
6552 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6553 }
6554 }
6555 else
6556 {
6557 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6558 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
6559
6560 OLD_DESTPTR = DESTPTR;
6561 Align DESTPTR up to DESIRED_ALIGN
6562 SRCPTR += DESTPTR - OLD_DESTPTR
6563 COUNT -= DESTPTR - OLD_DESTPTR
6564 if (DYNAMIC_CHECK)
6565 Round COUNT down to multiple of SIZE
6566 << optional caller supplied zero size guard is here >>
6567 << optional caller supplied dynamic check is here >>
6568 << caller supplied main copy loop is here >>
6569 }
6570 done_label:
6571 */
6572 static void
6573 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
6574 rtx *destptr, rtx *srcptr,
6575 machine_mode mode,
6576 rtx value, rtx vec_value,
6577 rtx *count,
6578 rtx_code_label **done_label,
6579 int size,
6580 int desired_align,
6581 int align,
6582 unsigned HOST_WIDE_INT *min_size,
6583 bool dynamic_check,
6584 bool issetmem)
6585 {
6586 rtx_code_label *loop_label = NULL, *label;
6587 int n;
6588 rtx modesize;
6589 int prolog_size = 0;
6590 rtx mode_value;
6591
6592 /* Choose the proper value to copy. */
6593 if (issetmem && VECTOR_MODE_P (mode))
6594 mode_value = vec_value;
6595 else
6596 mode_value = value;
6597 gcc_assert (GET_MODE_SIZE (mode) <= size);
6598
6599 /* See if block is big or small, handle small blocks. */
6600 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
6601 {
6602 int size2 = size;
6603 loop_label = gen_label_rtx ();
6604
6605 if (!*done_label)
6606 *done_label = gen_label_rtx ();
6607
6608 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
6609 1, loop_label);
6610 size2 >>= 1;
6611
6612 /* Handle sizes > 3. */
6613 for (;size2 > 2; size2 >>= 1)
6614 expand_small_cpymem_or_setmem (destmem, srcmem,
6615 *destptr, *srcptr,
6616 value, vec_value,
6617 *count,
6618 size2, *done_label, issetmem);
6619 /* Nothing to copy? Jump to DONE_LABEL if so */
6620 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
6621 1, *done_label);
6622
6623 /* Do a byte copy. */
6624 destmem = change_address (destmem, QImode, *destptr);
6625 if (issetmem)
6626 emit_move_insn (destmem, gen_lowpart (QImode, value));
6627 else
6628 {
6629 srcmem = change_address (srcmem, QImode, *srcptr);
6630 emit_move_insn (destmem, srcmem);
6631 }
6632
6633 /* Handle sizes 2 and 3. */
6634 label = ix86_expand_aligntest (*count, 2, false);
6635 destmem = change_address (destmem, HImode, *destptr);
6636 destmem = offset_address (destmem, *count, 1);
6637 destmem = offset_address (destmem, GEN_INT (-2), 2);
6638 if (issetmem)
6639 emit_move_insn (destmem, gen_lowpart (HImode, value));
6640 else
6641 {
6642 srcmem = change_address (srcmem, HImode, *srcptr);
6643 srcmem = offset_address (srcmem, *count, 1);
6644 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
6645 emit_move_insn (destmem, srcmem);
6646 }
6647
6648 emit_label (label);
6649 LABEL_NUSES (label) = 1;
6650 emit_jump_insn (gen_jump (*done_label));
6651 emit_barrier ();
6652 }
6653 else
6654 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
6655 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
6656
6657 /* Start memcpy for COUNT >= SIZE. */
6658 if (loop_label)
6659 {
6660 emit_label (loop_label);
6661 LABEL_NUSES (loop_label) = 1;
6662 }
6663
6664 /* Copy first desired_align bytes. */
6665 if (!issetmem)
6666 srcmem = change_address (srcmem, mode, *srcptr);
6667 destmem = change_address (destmem, mode, *destptr);
6668 modesize = GEN_INT (GET_MODE_SIZE (mode));
6669 for (n = 0; prolog_size < desired_align - align; n++)
6670 {
6671 if (issetmem)
6672 emit_move_insn (destmem, mode_value);
6673 else
6674 {
6675 emit_move_insn (destmem, srcmem);
6676 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
6677 }
6678 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
6679 prolog_size += GET_MODE_SIZE (mode);
6680 }
6681
6682
6683 /* Copy last SIZE bytes. */
6684 destmem = offset_address (destmem, *count, 1);
6685 destmem = offset_address (destmem,
6686 GEN_INT (-size - prolog_size),
6687 1);
6688 if (issetmem)
6689 emit_move_insn (destmem, mode_value);
6690 else
6691 {
6692 srcmem = offset_address (srcmem, *count, 1);
6693 srcmem = offset_address (srcmem,
6694 GEN_INT (-size - prolog_size),
6695 1);
6696 emit_move_insn (destmem, srcmem);
6697 }
6698 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
6699 {
6700 destmem = offset_address (destmem, modesize, 1);
6701 if (issetmem)
6702 emit_move_insn (destmem, mode_value);
6703 else
6704 {
6705 srcmem = offset_address (srcmem, modesize, 1);
6706 emit_move_insn (destmem, srcmem);
6707 }
6708 }
6709
6710 /* Align destination. */
6711 if (desired_align > 1 && desired_align > align)
6712 {
6713 rtx saveddest = *destptr;
6714
6715 gcc_assert (desired_align <= size);
6716 /* Align destptr up, place it to new register. */
6717 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
6718 GEN_INT (prolog_size),
6719 NULL_RTX, 1, OPTAB_DIRECT);
6720 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
6721 REG_POINTER (*destptr) = 1;
6722 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
6723 GEN_INT (-desired_align),
6724 *destptr, 1, OPTAB_DIRECT);
6725 /* See how many bytes we skipped. */
6726 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
6727 *destptr,
6728 saveddest, 1, OPTAB_DIRECT);
6729 /* Adjust srcptr and count. */
6730 if (!issetmem)
6731 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
6732 saveddest, *srcptr, 1, OPTAB_DIRECT);
6733 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6734 saveddest, *count, 1, OPTAB_DIRECT);
6735 /* We copied at most size + prolog_size. */
6736 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
6737 *min_size
6738 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
6739 else
6740 *min_size = 0;
6741
6742 /* Our loops always round down the block size, but for dispatch to the
6743 library we need the precise value. */
6744 if (dynamic_check)
6745 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
6746 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
6747 }
6748 else
6749 {
6750 gcc_assert (prolog_size == 0);
6751 /* Decrease count, so we won't end up copying last word twice. */
6752 if (!CONST_INT_P (*count))
6753 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
6754 constm1_rtx, *count, 1, OPTAB_DIRECT);
6755 else
6756 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
6757 (unsigned HOST_WIDE_INT)size));
6758 if (*min_size)
6759 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
6760 }
6761 }
6762
6763
6764 /* This function is like the previous one, except here we know how many bytes
6765 need to be copied. That allows us to update alignment not only of DST, which
6766 is returned, but also of SRC, which is passed as a pointer for that
6767 reason. */
6768 static rtx
6769 expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
6770 rtx srcreg, rtx value, rtx vec_value,
6771 int desired_align, int align_bytes,
6772 bool issetmem)
6773 {
6774 rtx src = NULL;
6775 rtx orig_dst = dst;
6776 rtx orig_src = NULL;
6777 int piece_size = 1;
6778 int copied_bytes = 0;
6779
6780 if (!issetmem)
6781 {
6782 gcc_assert (srcp != NULL);
6783 src = *srcp;
6784 orig_src = src;
6785 }
6786
6787 for (piece_size = 1;
6788 piece_size <= desired_align && copied_bytes < align_bytes;
6789 piece_size <<= 1)
6790 {
6791 if (align_bytes & piece_size)
6792 {
6793 if (issetmem)
6794 {
6795 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
6796 dst = emit_memset (dst, destreg, vec_value, piece_size);
6797 else
6798 dst = emit_memset (dst, destreg, value, piece_size);
6799 }
6800 else
6801 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
6802 copied_bytes += piece_size;
6803 }
6804 }
6805 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
6806 set_mem_align (dst, desired_align * BITS_PER_UNIT);
6807 if (MEM_SIZE_KNOWN_P (orig_dst))
6808 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
6809
6810 if (!issetmem)
6811 {
6812 int src_align_bytes = get_mem_align_offset (src, desired_align
6813 * BITS_PER_UNIT);
6814 if (src_align_bytes >= 0)
6815 src_align_bytes = desired_align - src_align_bytes;
6816 if (src_align_bytes >= 0)
6817 {
6818 unsigned int src_align;
6819 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
6820 {
6821 if ((src_align_bytes & (src_align - 1))
6822 == (align_bytes & (src_align - 1)))
6823 break;
6824 }
6825 if (src_align > (unsigned int) desired_align)
6826 src_align = desired_align;
6827 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
6828 set_mem_align (src, src_align * BITS_PER_UNIT);
6829 }
6830 if (MEM_SIZE_KNOWN_P (orig_src))
6831 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
6832 *srcp = src;
6833 }
6834
6835 return dst;
6836 }
6837
6838 /* Return true if ALG can be used in current context.
6839 Assume we expand memset if MEMSET is true. */
6840 static bool
6841 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
6842 {
6843 if (alg == no_stringop)
6844 return false;
6845 if (alg == vector_loop)
6846 return TARGET_SSE || TARGET_AVX;
6847 /* Algorithms using the rep prefix want at least edi and ecx;
6848 additionally, memset wants eax and memcpy wants esi. Don't
6849 consider such algorithms if the user has appropriated those
6850 registers for their own purposes, or if we have a non-default
6851 address space, since some string insns cannot override the segment. */
6852 if (alg == rep_prefix_1_byte
6853 || alg == rep_prefix_4_byte
6854 || alg == rep_prefix_8_byte)
6855 {
6856 if (have_as)
6857 return false;
6858 if (fixed_regs[CX_REG]
6859 || fixed_regs[DI_REG]
6860 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
6861 return false;
6862 }
6863 return true;
6864 }
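/* For example, compiling with -ffixed-ecx or -ffixed-edi makes all the
   rep_prefix_* algorithms unusable here, so decide_alg below falls back
   to the loop variants or a library call.  Likewise, a non-generic
   address space (e.g. __seg_fs/__seg_gs accesses) makes the caller pass
   HAVE_AS, which also rules out the rep prefixes.  */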
6865
6866 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6867 static enum stringop_alg
6868 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
6869 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
6870 bool memset, bool zero_memset, bool have_as,
6871 int *dynamic_check, bool *noalign, bool recur)
6872 {
6873 const struct stringop_algs *algs;
6874 bool optimize_for_speed;
6875 int max = 0;
6876 const struct processor_costs *cost;
6877 int i;
6878 bool any_alg_usable_p = false;
6879
6880 *noalign = false;
6881 *dynamic_check = -1;
6882
6883 /* Even if the string operation call is cold, we still might spend a lot
6884 of time processing large blocks. */
6885 if (optimize_function_for_size_p (cfun)
6886 || (optimize_insn_for_size_p ()
6887 && (max_size < 256
6888 || (expected_size != -1 && expected_size < 256))))
6889 optimize_for_speed = false;
6890 else
6891 optimize_for_speed = true;
6892
6893 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
6894 if (memset)
6895 algs = &cost->memset[TARGET_64BIT != 0];
6896 else
6897 algs = &cost->memcpy[TARGET_64BIT != 0];
6898
6899 /* Find the maximal size handled by the usable non-libcall algorithms. */
6900 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6901 {
6902 enum stringop_alg candidate = algs->size[i].alg;
6903 bool usable = alg_usable_p (candidate, memset, have_as);
6904 any_alg_usable_p |= usable;
6905
6906 if (candidate != libcall && candidate && usable)
6907 max = algs->size[i].max;
6908 }
6909
6910 /* If expected size is not known but max size is small enough
6911 so inline version is a win, set expected size into
6912 the range. */
6913 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
6914 && expected_size == -1)
6915 expected_size = min_size / 2 + max_size / 2;
6916
6917 /* If user specified the algorithm, honor it if possible. */
6918 if (ix86_stringop_alg != no_stringop
6919 && alg_usable_p (ix86_stringop_alg, memset, have_as))
6920 return ix86_stringop_alg;
6921 /* rep; movq or rep; movl is the smallest variant. */
6922 else if (!optimize_for_speed)
6923 {
6924 *noalign = true;
6925 if (!count || (count & 3) || (memset && !zero_memset))
6926 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
6927 ? rep_prefix_1_byte : loop_1_byte;
6928 else
6929 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
6930 ? rep_prefix_4_byte : loop;
6931 }
6932 /* Very tiny blocks are best handled via the loop; REP is expensive to
6933 set up. */
6934 else if (expected_size != -1 && expected_size < 4)
6935 return loop_1_byte;
6936 else if (expected_size != -1)
6937 {
6938 enum stringop_alg alg = libcall;
6939 bool alg_noalign = false;
6940 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
6941 {
6942 /* We get here if the algorithms that were not libcall-based
6943 were rep-prefix based and we are unable to use rep prefixes
6944 based on global register usage. Break out of the loop and
6945 use the heuristic below. */
6946 if (algs->size[i].max == 0)
6947 break;
6948 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
6949 {
6950 enum stringop_alg candidate = algs->size[i].alg;
6951
6952 if (candidate != libcall
6953 && alg_usable_p (candidate, memset, have_as))
6954 {
6955 alg = candidate;
6956 alg_noalign = algs->size[i].noalign;
6957 }
6958 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6959 last non-libcall inline algorithm. */
6960 if (TARGET_INLINE_ALL_STRINGOPS)
6961 {
6962 /* When the current size is best to be copied by a libcall,
6963 but we are still forced to inline, run the heuristic below
6964 that will pick code for medium sized blocks. */
6965 if (alg != libcall)
6966 {
6967 *noalign = alg_noalign;
6968 return alg;
6969 }
6970 else if (!any_alg_usable_p)
6971 break;
6972 }
6973 else if (alg_usable_p (candidate, memset, have_as))
6974 {
6975 *noalign = algs->size[i].noalign;
6976 return candidate;
6977 }
6978 }
6979 }
6980 }
6981 /* When asked to inline the call anyway, try to pick a meaningful choice.
6982 We look for the maximal size of block that is faster to copy by hand and
6983 take blocks of at most that size, guessing that the average size will
6984 be roughly half of the block.
6985
6986 If this turns out to be bad, we might simply specify the preferred
6987 choice in ix86_costs. */
6988 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
6989 && (algs->unknown_size == libcall
6990 || !alg_usable_p (algs->unknown_size, memset, have_as)))
6991 {
6992 enum stringop_alg alg;
6993 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
6994
6995 /* If there aren't any usable algorithms or if recursing already,
6996 then recursing on smaller sizes or same size isn't going to
6997 find anything. Just return the simple byte-at-a-time copy loop. */
6998 if (!any_alg_usable_p || recur)
6999 {
7000 /* Pick something reasonable. */
7001 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
7002 *dynamic_check = 128;
7003 return loop_1_byte;
7004 }
7005 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
7006 zero_memset, have_as, dynamic_check, noalign, true);
7007 gcc_assert (*dynamic_check == -1);
7008 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
7009 *dynamic_check = max;
7010 else
7011 gcc_assert (alg != libcall);
7012 return alg;
7013 }
7014 return (alg_usable_p (algs->unknown_size, memset, have_as)
7015 ? algs->unknown_size : libcall);
7016 }
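/* Illustrative reading of a stringop_algs cost-table entry (hypothetical
   numbers, not taken from any real processor_costs table):

     {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
                {-1, libcall, false}}}

   means: blocks of at most 32 bytes use the simple loop, blocks up to
   8192 bytes use rep movsl/stosl, anything larger uses the library call,
   and unknown_size (the first field) is used when the size is not known
   at all.  decide_alg above walks this table with EXPECTED_SIZE.  */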
7017
7018 /* Decide on alignment. We know that the operand is already aligned to ALIGN
7019 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7020 static int
7021 decide_alignment (int align,
7022 enum stringop_alg alg,
7023 int expected_size,
7024 machine_mode move_mode)
7025 {
7026 int desired_align = 0;
7027
7028 gcc_assert (alg != no_stringop);
7029
7030 if (alg == libcall)
7031 return 0;
7032 if (move_mode == VOIDmode)
7033 return 0;
7034
7035 desired_align = GET_MODE_SIZE (move_mode);
7036 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
7037 copying a whole cacheline at once. */
7038 if (TARGET_PENTIUMPRO
7039 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
7040 desired_align = 8;
7041
7042 if (optimize_size)
7043 desired_align = 1;
7044 if (desired_align < align)
7045 desired_align = align;
7046 if (expected_size != -1 && expected_size < 4)
7047 desired_align = align;
7048
7049 return desired_align;
7050 }
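/* E.g. for the vector_loop algorithm with a 16-byte MOVE_MODE this
   normally returns 16, so the prologue aligns the destination for the
   vector stores; with -Os or a tiny EXPECTED_SIZE it degrades to the
   incoming ALIGN instead.  */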
7051
7052
7053 /* Helper function for memset. For the QImode value 0xXY produce
7054 0xXYXYXYXY of the width specified by MODE. This is essentially
7055 a * 0x01010101, but we can do slightly better than
7056 synth_mult by unwinding the sequence by hand on CPUs with
7057 slow multiply. */
7058 static rtx
7059 promote_duplicated_reg (machine_mode mode, rtx val)
7060 {
7061 machine_mode valmode = GET_MODE (val);
7062 rtx tmp;
7063 int nops = mode == DImode ? 3 : 2;
7064
7065 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
7066 if (val == const0_rtx)
7067 return copy_to_mode_reg (mode, CONST0_RTX (mode));
7068 if (CONST_INT_P (val))
7069 {
7070 HOST_WIDE_INT v = INTVAL (val) & 255;
7071
7072 v |= v << 8;
7073 v |= v << 16;
7074 if (mode == DImode)
7075 v |= (v << 16) << 16;
7076 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
7077 }
7078
7079 if (valmode == VOIDmode)
7080 valmode = QImode;
7081 if (valmode != QImode)
7082 val = gen_lowpart (QImode, val);
7083 if (mode == QImode)
7084 return val;
7085 if (!TARGET_PARTIAL_REG_STALL)
7086 nops--;
7087 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
7088 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
7089 <= (ix86_cost->shift_const + ix86_cost->add) * nops
7090 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
7091 {
7092 rtx reg = convert_modes (mode, QImode, val, true);
7093 tmp = promote_duplicated_reg (mode, const1_rtx);
7094 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
7095 OPTAB_DIRECT);
7096 }
7097 else
7098 {
7099 rtx reg = convert_modes (mode, QImode, val, true);
7100
7101 if (!TARGET_PARTIAL_REG_STALL)
7102 if (mode == SImode)
7103 emit_insn (gen_insvsi_1 (reg, reg));
7104 else
7105 emit_insn (gen_insvdi_1 (reg, reg));
7106 else
7107 {
7108 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
7109 NULL, 1, OPTAB_DIRECT);
7110 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
7111 OPTAB_DIRECT);
7112 }
7113 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
7114 NULL, 1, OPTAB_DIRECT);
7115 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7116 if (mode == SImode)
7117 return reg;
7118 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
7119 NULL, 1, OPTAB_DIRECT);
7120 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
7121 return reg;
7122 }
7123 }
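/* Worked example: promoting the QImode constant 0xAB to SImode yields
   0xABABABAB via the CONST_INT_P path above (v |= v << 8; v |= v << 16).
   For a non-constant QImode value the same result is built either as
   val * 0x01010101 or by the shift/IOR sequence, whichever the cost
   check above prefers.  */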
7124
7125 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
7126 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
7127 alignment from ALIGN to DESIRED_ALIGN. */
7128 static rtx
7129 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
7130 int align)
7131 {
7132 rtx promoted_val;
7133
7134 if (TARGET_64BIT
7135 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
7136 promoted_val = promote_duplicated_reg (DImode, val);
7137 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
7138 promoted_val = promote_duplicated_reg (SImode, val);
7139 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
7140 promoted_val = promote_duplicated_reg (HImode, val);
7141 else
7142 promoted_val = val;
7143
7144 return promoted_val;
7145 }
7146
7147 /* Copy the address to a Pmode register. This is used for x32 to
7148 truncate DImode TLS address to a SImode register. */
7149
7150 static rtx
7151 ix86_copy_addr_to_reg (rtx addr)
7152 {
7153 rtx reg;
7154 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
7155 {
7156 reg = copy_addr_to_reg (addr);
7157 REG_POINTER (reg) = 1;
7158 return reg;
7159 }
7160 else
7161 {
7162 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
7163 reg = copy_to_mode_reg (DImode, addr);
7164 REG_POINTER (reg) = 1;
7165 return gen_rtx_SUBREG (SImode, reg, 0);
7166 }
7167 }
7168
7169 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
7170 operations when profitable. The code depends upon architecture, block size
7171 and alignment, but always has one of the following overall structures:
7172
7173 Aligned move sequence:
7174
7175 1) Prologue guard: Conditional that jumps up to epilogues for small
7176 blocks that can be handled by epilogue alone. This is faster
7177 but also needed for correctness, since the prologue assumes the block
7178 is larger than the desired alignment.
7179
7180 Optional dynamic check for size and libcall for large
7181 blocks is emitted here too, with -minline-stringops-dynamically.
7182
7183 2) Prologue: copy first few bytes in order to get destination
7184 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7185 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7186 copied. We emit either a jump tree on power of two sized
7187 blocks, or a byte loop.
7188
7189 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7190 with specified algorithm.
7191
7192 4) Epilogue: code copying tail of the block that is too small to be
7193 handled by main body (or up to size guarded by prologue guard).
7194
7195 Misaligned move sequence
7196
7197 1) Misaligned move prologue/epilogue containing:
7198 a) Prologue handling small memory blocks and jumping to done_label
7199 (skipped if blocks are known to be large enough)
7200 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
7201 needed by single possibly misaligned move
7202 (skipped if alignment is not needed)
7203 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7204
7205 2) Zero size guard dispatching to done_label, if needed
7206
7207 3) Dispatch to a library call, if needed.
7208
7209 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7210 with specified algorithm. */
7211 bool
7212 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
7213 rtx align_exp, rtx expected_align_exp,
7214 rtx expected_size_exp, rtx min_size_exp,
7215 rtx max_size_exp, rtx probable_max_size_exp,
7216 bool issetmem)
7217 {
7218 rtx destreg;
7219 rtx srcreg = NULL;
7220 rtx_code_label *label = NULL;
7221 rtx tmp;
7222 rtx_code_label *jump_around_label = NULL;
7223 HOST_WIDE_INT align = 1;
7224 unsigned HOST_WIDE_INT count = 0;
7225 HOST_WIDE_INT expected_size = -1;
7226 int size_needed = 0, epilogue_size_needed;
7227 int desired_align = 0, align_bytes = 0;
7228 enum stringop_alg alg;
7229 rtx promoted_val = NULL;
7230 rtx vec_promoted_val = NULL;
7231 bool force_loopy_epilogue = false;
7232 int dynamic_check;
7233 bool need_zero_guard = false;
7234 bool noalign;
7235 machine_mode move_mode = VOIDmode;
7236 machine_mode wider_mode;
7237 int unroll_factor = 1;
7238 /* TODO: Once value ranges are available, fill in proper data. */
7239 unsigned HOST_WIDE_INT min_size = 0;
7240 unsigned HOST_WIDE_INT max_size = -1;
7241 unsigned HOST_WIDE_INT probable_max_size = -1;
7242 bool misaligned_prologue_used = false;
7243 bool have_as;
7244
7245 if (CONST_INT_P (align_exp))
7246 align = INTVAL (align_exp);
7247 /* i386 can do misaligned access at reasonably increased cost. */
7248 if (CONST_INT_P (expected_align_exp)
7249 && INTVAL (expected_align_exp) > align)
7250 align = INTVAL (expected_align_exp);
7251 /* ALIGN is the minimum of destination and source alignment, but we care here
7252 just about destination alignment. */
7253 else if (!issetmem
7254 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
7255 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
7256
7257 if (CONST_INT_P (count_exp))
7258 {
7259 min_size = max_size = probable_max_size = count = expected_size
7260 = INTVAL (count_exp);
7261 /* When COUNT is 0, there is nothing to do. */
7262 if (!count)
7263 return true;
7264 }
7265 else
7266 {
7267 if (min_size_exp)
7268 min_size = INTVAL (min_size_exp);
7269 if (max_size_exp)
7270 max_size = INTVAL (max_size_exp);
7271 if (probable_max_size_exp)
7272 probable_max_size = INTVAL (probable_max_size_exp);
7273 if (CONST_INT_P (expected_size_exp))
7274 expected_size = INTVAL (expected_size_exp);
7275 }
7276
7277 /* Make sure we don't need to care about overflow later on. */
7278 if (count > (HOST_WIDE_INT_1U << 30))
7279 return false;
7280
7281 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
7282 if (!issetmem)
7283 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
7284
7285 /* Step 0: Decide on preferred algorithm, desired alignment and
7286 size of chunks to be copied by main loop. */
7287 alg = decide_alg (count, expected_size, min_size, probable_max_size,
7288 issetmem,
7289 issetmem && val_exp == const0_rtx, have_as,
7290 &dynamic_check, &noalign, false);
7291
7292 if (dump_file)
7293 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
7294 stringop_alg_names[alg]);
7295
7296 if (alg == libcall)
7297 return false;
7298 gcc_assert (alg != no_stringop);
7299
7300 /* For now the vector version of memset is generated only for memory zeroing,
7301 as creating the promoted vector value is very cheap in this case. */
7302 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
7303 alg = unrolled_loop;
7304
7305 if (!count)
7306 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
7307 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
7308 if (!issetmem)
7309 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
7310
7311 unroll_factor = 1;
7312 move_mode = word_mode;
7313 switch (alg)
7314 {
7315 case libcall:
7316 case no_stringop:
7317 case last_alg:
7318 gcc_unreachable ();
7319 case loop_1_byte:
7320 need_zero_guard = true;
7321 move_mode = QImode;
7322 break;
7323 case loop:
7324 need_zero_guard = true;
7325 break;
7326 case unrolled_loop:
7327 need_zero_guard = true;
7328 unroll_factor = (TARGET_64BIT ? 4 : 2);
7329 break;
7330 case vector_loop:
7331 need_zero_guard = true;
7332 unroll_factor = 4;
7333 /* Find the widest supported mode. */
7334 move_mode = word_mode;
7335 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
7336 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
7337 move_mode = wider_mode;
7338
7339 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
7340 move_mode = TImode;
7341
7342 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7343 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7344 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
7345 {
7346 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
7347 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
7348 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
7349 move_mode = word_mode;
7350 }
7351 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
7352 break;
7353 case rep_prefix_8_byte:
7354 move_mode = DImode;
7355 break;
7356 case rep_prefix_4_byte:
7357 move_mode = SImode;
7358 break;
7359 case rep_prefix_1_byte:
7360 move_mode = QImode;
7361 break;
7362 }
7363 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
7364 epilogue_size_needed = size_needed;
7365
7366 /* If we are going to emit any library calls conditionally, make sure any
7367 pending stack adjustments happen before the first conditional branch,
7368 otherwise they will be emitted before the library call only and won't
7369 happen from the other branches. */
7370 if (dynamic_check != -1)
7371 do_pending_stack_adjust ();
7372
7373 desired_align = decide_alignment (align, alg, expected_size, move_mode);
7374 if (!TARGET_ALIGN_STRINGOPS || noalign)
7375 align = desired_align;
7376
7377 /* Step 1: Prologue guard. */
7378
7379 /* Alignment code needs count to be in register. */
7380 if (CONST_INT_P (count_exp) && desired_align > align)
7381 {
7382 if (INTVAL (count_exp) > desired_align
7383 && INTVAL (count_exp) > size_needed)
7384 {
7385 align_bytes
7386 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
7387 if (align_bytes <= 0)
7388 align_bytes = 0;
7389 else
7390 align_bytes = desired_align - align_bytes;
7391 }
7392 if (align_bytes == 0)
7393 count_exp = force_reg (counter_mode (count_exp), count_exp);
7394 }
7395 gcc_assert (desired_align >= 1 && align >= 1);
7396
7397 /* Misaligned move sequences handle both prologue and epilogue at once.
7398 Default code generation results in smaller code for large alignments
7399 and also avoids redundant work when sizes are known precisely. */
7400 misaligned_prologue_used
7401 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7402 && MAX (desired_align, epilogue_size_needed) <= 32
7403 && desired_align <= epilogue_size_needed
7404 && ((desired_align > align && !align_bytes)
7405 || (!count && epilogue_size_needed > 1)));
7406
7407 /* Do the cheap promotion to allow better CSE across the
7408 main loop and epilogue (i.e. one load of the big constant in
7409 front of all the code).
7410 For now the misaligned move sequences do not have a fast path
7411 without broadcasting. */
7412 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
7413 {
7414 if (alg == vector_loop)
7415 {
7416 gcc_assert (val_exp == const0_rtx);
7417 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
7418 promoted_val = promote_duplicated_reg_to_size (val_exp,
7419 GET_MODE_SIZE (word_mode),
7420 desired_align, align);
7421 }
7422 else
7423 {
7424 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7425 desired_align, align);
7426 }
7427 }
7428 /* Misaligned move sequences handle both prologue and epilogue at once.
7429 Default code generation results in smaller code for large alignments and
7430 also avoids redundant work when sizes are known precisely. */
7431 if (misaligned_prologue_used)
7432 {
7433 /* The misaligned move prologue handles small blocks by itself. */
7434 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7435 (dst, src, &destreg, &srcreg,
7436 move_mode, promoted_val, vec_promoted_val,
7437 &count_exp,
7438 &jump_around_label,
7439 desired_align < align
7440 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
7441 desired_align, align, &min_size, dynamic_check, issetmem);
7442 if (!issetmem)
7443 src = change_address (src, BLKmode, srcreg);
7444 dst = change_address (dst, BLKmode, destreg);
7445 set_mem_align (dst, desired_align * BITS_PER_UNIT);
7446 epilogue_size_needed = 0;
7447 if (need_zero_guard
7448 && min_size < (unsigned HOST_WIDE_INT) size_needed)
7449 {
7450 /* It is possible that we copied enough so the main loop will not
7451 execute. */
7452 gcc_assert (size_needed > 1);
7453 if (jump_around_label == NULL_RTX)
7454 jump_around_label = gen_label_rtx ();
7455 emit_cmp_and_jump_insns (count_exp,
7456 GEN_INT (size_needed),
7457 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
7458 if (expected_size == -1
7459 || expected_size < (desired_align - align) / 2 + size_needed)
7460 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7461 else
7462 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7463 }
7464 }
7465 /* Ensure that alignment prologue won't copy past end of block. */
7466 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
7467 {
7468 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
7469 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7470 Make sure it is power of 2. */
7471 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
7472
7473 /* To improve performance of small blocks, we jump around the VAL
7474 promotion. This means that if the promoted VAL is not constant,
7475 we might not use it in the epilogue and have to use the byte
7476 loop variant. */
7477 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
7478 force_loopy_epilogue = true;
7479 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7480 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7481 {
7482 /* If main algorithm works on QImode, no epilogue is needed.
7483 For small sizes just don't align anything. */
7484 if (size_needed == 1)
7485 desired_align = align;
7486 else
7487 goto epilogue;
7488 }
7489 else if (!count
7490 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
7491 {
7492 label = gen_label_rtx ();
7493 emit_cmp_and_jump_insns (count_exp,
7494 GEN_INT (epilogue_size_needed),
7495 LTU, 0, counter_mode (count_exp), 1, label);
7496 if (expected_size == -1 || expected_size < epilogue_size_needed)
7497 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7498 else
7499 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7500 }
7501 }
7502
7503 /* Emit code to decide on runtime whether library call or inline should be
7504 used. */
7505 if (dynamic_check != -1)
7506 {
7507 if (!issetmem && CONST_INT_P (count_exp))
7508 {
7509 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
7510 {
7511 emit_block_copy_via_libcall (dst, src, count_exp);
7512 count_exp = const0_rtx;
7513 goto epilogue;
7514 }
7515 }
7516 else
7517 {
7518 rtx_code_label *hot_label = gen_label_rtx ();
7519 if (jump_around_label == NULL_RTX)
7520 jump_around_label = gen_label_rtx ();
7521 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
7522 LEU, 0, counter_mode (count_exp),
7523 1, hot_label);
7524 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7525 if (issetmem)
7526 set_storage_via_libcall (dst, count_exp, val_exp);
7527 else
7528 emit_block_copy_via_libcall (dst, src, count_exp);
7529 emit_jump (jump_around_label);
7530 emit_label (hot_label);
7531 }
7532 }
7533
7534 /* Step 2: Alignment prologue. */
7535 /* Do the expensive promotion once we branched off the small blocks. */
7536 if (issetmem && !promoted_val)
7537 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
7538 desired_align, align);
7539
7540 if (desired_align > align && !misaligned_prologue_used)
7541 {
7542 if (align_bytes == 0)
7543 {
7544 /* Except for the first move in the prologue, we no longer know
7545 the constant offset in aliasing info. It doesn't seem worth
7546 the pain to maintain it for the first move, so throw away
7547 the info early. */
7548 dst = change_address (dst, BLKmode, destreg);
7549 if (!issetmem)
7550 src = change_address (src, BLKmode, srcreg);
7551 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
7552 promoted_val, vec_promoted_val,
7553 count_exp, align, desired_align,
7554 issetmem);
7555 /* At most desired_align - align bytes are copied. */
7556 if (min_size < (unsigned)(desired_align - align))
7557 min_size = 0;
7558 else
7559 min_size -= desired_align - align;
7560 }
7561 else
7562 {
7563 /* If we know how many bytes need to be stored before dst is
7564 sufficiently aligned, maintain aliasing info accurately. */
7565 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
7566 srcreg,
7567 promoted_val,
7568 vec_promoted_val,
7569 desired_align,
7570 align_bytes,
7571 issetmem);
7572
7573 count_exp = plus_constant (counter_mode (count_exp),
7574 count_exp, -align_bytes);
7575 count -= align_bytes;
7576 min_size -= align_bytes;
7577 max_size -= align_bytes;
7578 }
7579 if (need_zero_guard
7580 && min_size < (unsigned HOST_WIDE_INT) size_needed
7581 && (count < (unsigned HOST_WIDE_INT) size_needed
7582 || (align_bytes == 0
7583 && count < ((unsigned HOST_WIDE_INT) size_needed
7584 + desired_align - align))))
7585 {
7586 /* It is possible that we copied enough so the main loop will not
7587 execute. */
7588 gcc_assert (size_needed > 1);
7589 if (label == NULL_RTX)
7590 label = gen_label_rtx ();
7591 emit_cmp_and_jump_insns (count_exp,
7592 GEN_INT (size_needed),
7593 LTU, 0, counter_mode (count_exp), 1, label);
7594 if (expected_size == -1
7595 || expected_size < (desired_align - align) / 2 + size_needed)
7596 predict_jump (REG_BR_PROB_BASE * 20 / 100);
7597 else
7598 predict_jump (REG_BR_PROB_BASE * 60 / 100);
7599 }
7600 }
7601 if (label && size_needed == 1)
7602 {
7603 emit_label (label);
7604 LABEL_NUSES (label) = 1;
7605 label = NULL;
7606 epilogue_size_needed = 1;
7607 if (issetmem)
7608 promoted_val = val_exp;
7609 }
7610 else if (label == NULL_RTX && !misaligned_prologue_used)
7611 epilogue_size_needed = size_needed;
7612
7613 /* Step 3: Main loop. */
7614
7615 switch (alg)
7616 {
7617 case libcall:
7618 case no_stringop:
7619 case last_alg:
7620 gcc_unreachable ();
7621 case loop_1_byte:
7622 case loop:
7623 case unrolled_loop:
7624 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
7625 count_exp, move_mode, unroll_factor,
7626 expected_size, issetmem);
7627 break;
7628 case vector_loop:
7629 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
7630 vec_promoted_val, count_exp, move_mode,
7631 unroll_factor, expected_size, issetmem);
7632 break;
7633 case rep_prefix_8_byte:
7634 case rep_prefix_4_byte:
7635 case rep_prefix_1_byte:
7636 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
7637 val_exp, count_exp, move_mode, issetmem);
7638 break;
7639 }
7640 /* Properly adjust the offset of src and dest memory for aliasing. */
7641 if (CONST_INT_P (count_exp))
7642 {
7643 if (!issetmem)
7644 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
7645 (count / size_needed) * size_needed);
7646 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
7647 (count / size_needed) * size_needed);
7648 }
7649 else
7650 {
7651 if (!issetmem)
7652 src = change_address (src, BLKmode, srcreg);
7653 dst = change_address (dst, BLKmode, destreg);
7654 }
7655
7656 /* Step 4: Epilogue to copy the remaining bytes. */
7657 epilogue:
7658 if (label)
7659 {
7660       /* When the main loop is done, COUNT_EXP might hold the original count,
7661 	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
7662 	 The epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
7663 	 bytes.  Compensate if needed.  */
7664
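      /* Illustrative example (not from the source): with COUNT == 23 and
	 SIZE_NEEDED == 16 the main loop handles the first 16 bytes, and the
	 epilogue must copy 23 & 15 == 7 bytes, so COUNT_EXP is masked down
	 here.  */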
7665 if (size_needed < epilogue_size_needed)
7666 {
7667 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
7668 GEN_INT (size_needed - 1), count_exp, 1,
7669 OPTAB_DIRECT);
7670 if (tmp != count_exp)
7671 emit_move_insn (count_exp, tmp);
7672 }
7673 emit_label (label);
7674 LABEL_NUSES (label) = 1;
7675 }
7676
7677 if (count_exp != const0_rtx && epilogue_size_needed > 1)
7678 {
7679 if (force_loopy_epilogue)
7680 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
7681 epilogue_size_needed);
7682 else
7683 {
7684 if (issetmem)
7685 expand_setmem_epilogue (dst, destreg, promoted_val,
7686 vec_promoted_val, count_exp,
7687 epilogue_size_needed);
7688 else
7689 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
7690 epilogue_size_needed);
7691 }
7692 }
7693 if (jump_around_label)
7694 emit_label (jump_around_label);
7695 return true;
7696 }
7697
7698
7699 /* Expand the appropriate insns for doing strlen if not just doing
7700 repnz; scasb
7701
7702 out = result, initialized with the start address
7703 align_rtx = alignment of the address.
7704 scratch = scratch register, initialized with the start address when
7705 not aligned, otherwise undefined
7706
7707 This is just the body. It needs the initializations mentioned above and
7708 some address computing at the end. These things are done in i386.md. */
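
/* A minimal C sketch of what the expansion below generates (illustrative
   only; the real expansion emits RTL, the final pointer fixup is branch-free,
   and index_of_first_zero_byte is a hypothetical helper, not a GCC function):

     unsigned char *p = start;		    // OUT holds the start address
     while ((uintptr_t) p & 3)		    // check 1..3 unaligned bytes
       {
	 if (*p == 0)
	   return p;
	 p++;
       }
     for (;;)				    // then 4 bytes per iteration
       {
	 unsigned int w;
	 memcpy (&w, p, 4);
	 p += 4;
	 if ((w - 0x01010101U) & ~w & 0x80808080U)  // some byte of W is zero
	   break;
       }
     return p - 4 + index_of_first_zero_byte (w);   */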
7709
7710 static void
7711 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
7712 {
7713 int align;
7714 rtx tmp;
7715 rtx_code_label *align_2_label = NULL;
7716 rtx_code_label *align_3_label = NULL;
7717 rtx_code_label *align_4_label = gen_label_rtx ();
7718 rtx_code_label *end_0_label = gen_label_rtx ();
7719 rtx mem;
7720 rtx tmpreg = gen_reg_rtx (SImode);
7721 rtx scratch = gen_reg_rtx (SImode);
7722 rtx cmp;
7723
7724 align = 0;
7725 if (CONST_INT_P (align_rtx))
7726 align = INTVAL (align_rtx);
7727
7728 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7729
7730 /* Is there a known alignment and is it less than 4? */
7731 if (align < 4)
7732 {
7733 rtx scratch1 = gen_reg_rtx (Pmode);
7734 emit_move_insn (scratch1, out);
7735 /* Is there a known alignment and is it not 2? */
7736 if (align != 2)
7737 {
7738 	  align_3_label = gen_label_rtx (); /* Label used when the address is 3 mod 4.  */
7739 	  align_2_label = gen_label_rtx (); /* Label used when the address is 2 mod 4.  */
7740
7741 	  /* Leave just the two lower bits.  */
7742 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
7743 NULL_RTX, 0, OPTAB_WIDEN);
7744
7745 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7746 Pmode, 1, align_4_label);
7747 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
7748 Pmode, 1, align_2_label);
7749 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
7750 Pmode, 1, align_3_label);
7751 }
7752 else
7753 {
7754 	  /* Since the alignment is 2, we have to check 0 or 2 bytes;
7755 	     check whether the address is 4-byte aligned.  */
7756
7757 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
7758 NULL_RTX, 0, OPTAB_WIDEN);
7759
7760 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
7761 Pmode, 1, align_4_label);
7762 }
7763
7764 mem = change_address (src, QImode, out);
7765
7766 /* Now compare the bytes. */
7767
7768       /* Compare the first few unaligned bytes one byte at a time.  */
7769 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
7770 QImode, 1, end_0_label);
7771
7772 /* Increment the address. */
7773 emit_insn (gen_add2_insn (out, const1_rtx));
7774
7775 /* Not needed with an alignment of 2 */
7776 if (align != 2)
7777 {
7778 emit_label (align_2_label);
7779
7780 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7781 end_0_label);
7782
7783 emit_insn (gen_add2_insn (out, const1_rtx));
7784
7785 emit_label (align_3_label);
7786 }
7787
7788 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
7789 end_0_label);
7790
7791 emit_insn (gen_add2_insn (out, const1_rtx));
7792 }
7793
7794   /* Generate a loop that checks 4 bytes at a time.  It is not a good idea
7795      to align this loop; that only makes the program larger and does not
7796      help to speed it up.  */
7797 emit_label (align_4_label);
7798
7799 mem = change_address (src, SImode, out);
7800 emit_move_insn (scratch, mem);
7801 emit_insn (gen_add2_insn (out, GEN_INT (4)));
7802
7803   /* This formula yields a nonzero result iff one of the bytes is zero.
7804      This saves three branches inside the loop and many cycles.  */
7805
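  /* Worked example (illustrative): for SCRATCH == 0x11003344 the sequence
     below computes (0x11003344 + 0xFEFEFEFF) & ~0x11003344 & 0x80808080
     == 0x0FFF3243 & 0xEEFFCCBB & 0x80808080 == 0x00800000, which is nonzero
     precisely because of the 0x00 byte; a word with no zero byte yields 0.  */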
7806 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
7807 emit_insn (gen_one_cmplsi2 (scratch, scratch));
7808 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
7809 emit_insn (gen_andsi3 (tmpreg, tmpreg,
7810 gen_int_mode (0x80808080, SImode)));
7811 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
7812 align_4_label);
7813
7814 if (TARGET_CMOVE)
7815 {
7816 rtx reg = gen_reg_rtx (SImode);
7817 rtx reg2 = gen_reg_rtx (Pmode);
7818 emit_move_insn (reg, tmpreg);
7819 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
7820
7821 /* If zero is not in the first two bytes, move two bytes forward. */
7822 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7823 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7824 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7825 emit_insn (gen_rtx_SET (tmpreg,
7826 gen_rtx_IF_THEN_ELSE (SImode, tmp,
7827 reg,
7828 tmpreg)));
7829 /* Emit lea manually to avoid clobbering of flags. */
7830 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
7831
7832 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7833 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
7834 emit_insn (gen_rtx_SET (out,
7835 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
7836 reg2,
7837 out)));
7838 }
7839 else
7840 {
7841 rtx_code_label *end_2_label = gen_label_rtx ();
7842 /* Is zero in the first two bytes? */
7843
7844 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
7845 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
7846 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
7847 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
7848 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
7849 pc_rtx);
7850 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
7851 JUMP_LABEL (tmp) = end_2_label;
7852
7853 /* Not in the first two. Move two bytes forward. */
7854 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
7855 emit_insn (gen_add2_insn (out, const2_rtx));
7856
7857 emit_label (end_2_label);
7858
7859 }
7860
7861   /* Avoid a branch while fixing up the final byte position.  */
7862 tmpreg = gen_lowpart (QImode, tmpreg);
7863 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
7864 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
7865 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
7866 emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
7867
7868 emit_label (end_0_label);
7869 }
7870
7871 /* Expand strlen. */
7872
7873 bool
7874 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
7875 {
7876 if (TARGET_UNROLL_STRLEN
7877 && TARGET_INLINE_ALL_STRINGOPS
7878 && eoschar == const0_rtx
7879 && optimize > 1)
7880 {
7881       /* The generic case of the strlen expander is long.  Avoid expanding
7882 	 it unless TARGET_INLINE_ALL_STRINGOPS.  */
7883 rtx addr = force_reg (Pmode, XEXP (src, 0));
7884       /* It seems that some optimizers do not combine a call like
7885 	 foo(strlen(bar), strlen(bar));
7886 	 when the move and the subtraction are done here.  They compute the
7887 	 length just once when these instructions are emitted inside
7888 	 output_strlen_unroll().  But since &bar[strlen(bar)] is often used
7889 	 and this uses one fewer register for the lifetime of
7890 	 output_strlen_unroll(), this is better.  */
7891
7892 emit_move_insn (out, addr);
7893
7894 ix86_expand_strlensi_unroll_1 (out, src, align);
7895
7896 /* strlensi_unroll_1 returns the address of the zero at the end of
7897 the string, like memchr(), so compute the length by subtracting
7898 the start address. */
7899 emit_insn (gen_sub2_insn (out, addr));
7900 return true;
7901 }
7902 else
7903 return false;
7904 }
7905
7906 /* For a given symbol (function), construct code to compute the address of its
7907    PLT entry in the large x86-64 PIC model.  */
7908
7909 static rtx
7910 construct_plt_address (rtx symbol)
7911 {
7912 rtx tmp, unspec;
7913
7914 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
7915 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
7916 gcc_assert (Pmode == DImode);
7917
7918 tmp = gen_reg_rtx (Pmode);
7919 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
7920
7921 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
7922 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
7923 return tmp;
7924 }
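
/* Illustrative sketch (the assembly shape and register choices are
   assumptions, not taken from this file): under -mcmodel=large -fpic the
   address built above roughly corresponds to

	movabs	$foo@PLTOFF, %rax
	add	<pic-reg>, %rax		# pic_offset_table_rtx

   i.e. the UNSPEC_PLTOFF constant plus the PIC base register.  */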
7925
7926 /* Additional registers that are clobbered by SysV calls made from MS ABI code (call-clobbered in the SysV ABI but call-saved in the MS ABI).  */
7927
7928 static int const x86_64_ms_sysv_extra_clobbered_registers
7929 [NUM_X86_64_MS_CLOBBERED_REGS] =
7930 {
7931 SI_REG, DI_REG,
7932 XMM6_REG, XMM7_REG,
7933 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
7934 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
7935 };
7936
7937 rtx_insn *
7938 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
7939 rtx callarg2,
7940 rtx pop, bool sibcall)
7941 {
7942 rtx vec[3];
7943 rtx use = NULL, call;
7944 unsigned int vec_len = 0;
7945 tree fndecl;
7946
7947 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7948 {
7949 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
7950 if (fndecl
7951 && (lookup_attribute ("interrupt",
7952 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
7953 error ("interrupt service routine cannot be called directly");
7954 }
7955 else
7956 fndecl = NULL_TREE;
7957
7958 if (pop == const0_rtx)
7959 pop = NULL;
7960 gcc_assert (!TARGET_64BIT || !pop);
7961
7962 if (TARGET_MACHO && !TARGET_64BIT)
7963 {
7964 #if TARGET_MACHO
7965 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
7966 fnaddr = machopic_indirect_call_target (fnaddr);
7967 #endif
7968 }
7969 else
7970 {
7971       /* Static functions and indirect calls don't need the pic register.  Also,
7972 	 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
7973 	 attribute, making it an indirect call.  */
7974 rtx addr = XEXP (fnaddr, 0);
7975 if (flag_pic
7976 && GET_CODE (addr) == SYMBOL_REF
7977 && !SYMBOL_REF_LOCAL_P (addr))
7978 {
7979 if (flag_plt
7980 && (SYMBOL_REF_DECL (addr) == NULL_TREE
7981 || !lookup_attribute ("noplt",
7982 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
7983 {
7984 if (!TARGET_64BIT
7985 || (ix86_cmodel == CM_LARGE_PIC
7986 && DEFAULT_ABI != MS_ABI))
7987 {
7988 use_reg (&use, gen_rtx_REG (Pmode,
7989 REAL_PIC_OFFSET_TABLE_REGNUM));
7990 if (ix86_use_pseudo_pic_reg ())
7991 emit_move_insn (gen_rtx_REG (Pmode,
7992 REAL_PIC_OFFSET_TABLE_REGNUM),
7993 pic_offset_table_rtx);
7994 }
7995 }
7996 else if (!TARGET_PECOFF && !TARGET_MACHO)
7997 {
7998 if (TARGET_64BIT)
7999 {
8000 fnaddr = gen_rtx_UNSPEC (Pmode,
8001 gen_rtvec (1, addr),
8002 UNSPEC_GOTPCREL);
8003 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8004 }
8005 else
8006 {
8007 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
8008 UNSPEC_GOT);
8009 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
8010 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
8011 fnaddr);
8012 }
8013 fnaddr = gen_const_mem (Pmode, fnaddr);
8014 		  /* Pmode may not be the same as word_mode for x32, which
8015 		     doesn't support an indirect branch via a 32-bit memory slot.
8016 		     Since the x32 GOT slot is 64-bit with zero upper 32 bits, an
8017 		     indirect branch via the x32 GOT slot is OK.  */
8018 if (GET_MODE (fnaddr) != word_mode)
8019 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
8020 fnaddr = gen_rtx_MEM (QImode, fnaddr);
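		  /* With -fno-plt (or the "noplt" attribute) the call goes
		     through the GOT slot directly; on x86-64 this typically
		     ends up as something like "call *foo@GOTPCREL(%rip)" and
		     on 32-bit as a call through foo@GOT(%ebx) (illustrative,
		     register names are not taken from this file).  */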
8021 }
8022 }
8023 }
8024
8025 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8026 parameters passed in vector registers. */
8027 if (TARGET_64BIT
8028 && (INTVAL (callarg2) > 0
8029 || (INTVAL (callarg2) == 0
8030 && (TARGET_SSE || !flag_skip_rax_setup))))
8031 {
8032 rtx al = gen_rtx_REG (QImode, AX_REG);
8033 emit_move_insn (al, callarg2);
8034 use_reg (&use, al);
8035 }
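  /* For SysV varargs calls %al carries (an upper bound on) the number of
     vector registers used for arguments; e.g. a call like printf ("%f", x)
     is preceded by something like "mov $1, %al" (illustrative) so a variadic
     callee knows how many XMM argument registers it may need to save.  */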
8036
8037 if (ix86_cmodel == CM_LARGE_PIC
8038 && !TARGET_PECOFF
8039 && MEM_P (fnaddr)
8040 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
8041 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
8042 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
8043 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8044 branch via x32 GOT slot is OK. */
8045 else if (!(TARGET_X32
8046 && MEM_P (fnaddr)
8047 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
8048 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
8049 && (sibcall
8050 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
8051 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
8052 {
8053 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
8054 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
8055 }
8056
8057 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
8058
8059 if (retval)
8060 call = gen_rtx_SET (retval, call);
8061 vec[vec_len++] = call;
8062
8063 if (pop)
8064 {
8065 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
8066 pop = gen_rtx_SET (stack_pointer_rtx, pop);
8067 vec[vec_len++] = pop;
8068 }
8069
8070 if (cfun->machine->no_caller_saved_registers
8071 && (!fndecl
8072 || (!TREE_THIS_VOLATILE (fndecl)
8073 && !lookup_attribute ("no_caller_saved_registers",
8074 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
8075 {
8076 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
8077 bool is_64bit_ms_abi = (TARGET_64BIT
8078 && ix86_function_abi (fndecl) == MS_ABI);
8079 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
8080
8081       /* If the current function has no caller-saved registers, add as
8082 	 clobbers all registers that are clobbered by this (returning) call.  */
8083 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
8084 if (!fixed_regs[i]
8085 && (ix86_call_used_regs[i] == 1
8086 || (ix86_call_used_regs[i] & c_mask))
8087 && !STACK_REGNO_P (i)
8088 && !MMX_REGNO_P (i))
8089 clobber_reg (&use,
8090 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
8091 }
8092 else if (TARGET_64BIT_MS_ABI
8093 && (!callarg2 || INTVAL (callarg2) != -2))
8094 {
8095 unsigned i;
8096
8097 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
8098 {
8099 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
8100 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
8101
8102 clobber_reg (&use, gen_rtx_REG (mode, regno));
8103 }
8104
8105 /* Set here, but it may get cleared later. */
8106 if (TARGET_CALL_MS2SYSV_XLOGUES)
8107 {
8108 if (!TARGET_SSE)
8109 ;
8110
8111 /* Don't break hot-patched functions. */
8112 else if (ix86_function_ms_hook_prologue (current_function_decl))
8113 ;
8114
8115 /* TODO: Cases not yet examined. */
8116 else if (flag_split_stack)
8117 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8118
8119 else
8120 {
8121 gcc_assert (!reload_completed);
8122 cfun->machine->call_ms2sysv = true;
8123 }
8124 }
8125 }
8126
8127 if (vec_len > 1)
8128 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
8129 rtx_insn *call_insn = emit_call_insn (call);
8130 if (use)
8131 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
8132
8133 return call_insn;
8134 }
8135
8136 /* Split a simple return that pops POPC bytes from the stack into an indirect
8137    branch with a stack adjustment.  */
8138
8139 void
8140 ix86_split_simple_return_pop_internal (rtx popc)
8141 {
8142 struct machine_function *m = cfun->machine;
8143 rtx ecx = gen_rtx_REG (SImode, CX_REG);
8144 rtx_insn *insn;
8145
8146   /* There is no "pascal" calling convention in any 64-bit ABI.  */
8147 gcc_assert (!TARGET_64BIT);
8148
8149 insn = emit_insn (gen_pop (ecx));
8150 m->fs.cfa_offset -= UNITS_PER_WORD;
8151 m->fs.sp_offset -= UNITS_PER_WORD;
8152
8153 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
8154 x = gen_rtx_SET (stack_pointer_rtx, x);
8155 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8156 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
8157 RTX_FRAME_RELATED_P (insn) = 1;
8158
8159 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
8160 x = gen_rtx_SET (stack_pointer_rtx, x);
8161 insn = emit_insn (x);
8162 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
8163 RTX_FRAME_RELATED_P (insn) = 1;
8164
8165 /* Now return address is in ECX. */
8166 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
8167 }
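
/* Illustrative sketch of the sequence emitted above (operand and register
   spellings are assumptions, not taken from this file):

	pop	%ecx		# fetch the return address
	add	$POPC, %esp	# drop the callee-popped argument bytes
	jmp	*%ecx		# return

   i.e. a "ret $POPC" split into an explicit stack adjustment plus an
   indirect branch.  */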
8168
8169 /* Errors in the source file can cause expand_expr to return const0_rtx
8170 where we expect a vector. To avoid crashing, use one of the vector
8171 clear instructions. */
8172
8173 static rtx
8174 safe_vector_operand (rtx x, machine_mode mode)
8175 {
8176 if (x == const0_rtx)
8177 x = CONST0_RTX (mode);
8178 return x;
8179 }
8180
8181 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8182
8183 static rtx
8184 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
8185 {
8186 rtx pat;
8187 tree arg0 = CALL_EXPR_ARG (exp, 0);
8188 tree arg1 = CALL_EXPR_ARG (exp, 1);
8189 rtx op0 = expand_normal (arg0);
8190 rtx op1 = expand_normal (arg1);
8191 machine_mode tmode = insn_data[icode].operand[0].mode;
8192 machine_mode mode0 = insn_data[icode].operand[1].mode;
8193 machine_mode mode1 = insn_data[icode].operand[2].mode;
8194
8195 if (VECTOR_MODE_P (mode0))
8196 op0 = safe_vector_operand (op0, mode0);
8197 if (VECTOR_MODE_P (mode1))
8198 op1 = safe_vector_operand (op1, mode1);
8199
8200 if (optimize || !target
8201 || GET_MODE (target) != tmode
8202 || !insn_data[icode].operand[0].predicate (target, tmode))
8203 target = gen_reg_rtx (tmode);
8204
8205 if (GET_MODE (op1) == SImode && mode1 == TImode)
8206 {
8207 rtx x = gen_reg_rtx (V4SImode);
8208 emit_insn (gen_sse2_loadd (x, op1));
8209 op1 = gen_lowpart (TImode, x);
8210 }
8211
8212 if (!insn_data[icode].operand[1].predicate (op0, mode0))
8213 op0 = copy_to_mode_reg (mode0, op0);
8214 if (!insn_data[icode].operand[2].predicate (op1, mode1))
8215 op1 = copy_to_mode_reg (mode1, op1);
8216
8217 pat = GEN_FCN (icode) (target, op0, op1);
8218 if (! pat)
8219 return 0;
8220
8221 emit_insn (pat);
8222
8223 return target;
8224 }
8225
8226 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8227
8228 static rtx
8229 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
8230 enum ix86_builtin_func_type m_type,
8231 enum rtx_code sub_code)
8232 {
8233 rtx pat;
8234 int i;
8235 int nargs;
8236 bool comparison_p = false;
8237 bool tf_p = false;
8238 bool last_arg_constant = false;
8239 int num_memory = 0;
8240 struct {
8241 rtx op;
8242 machine_mode mode;
8243 } args[4];
8244
8245 machine_mode tmode = insn_data[icode].operand[0].mode;
8246
8247 switch (m_type)
8248 {
8249 case MULTI_ARG_4_DF2_DI_I:
8250 case MULTI_ARG_4_DF2_DI_I1:
8251 case MULTI_ARG_4_SF2_SI_I:
8252 case MULTI_ARG_4_SF2_SI_I1:
8253 nargs = 4;
8254 last_arg_constant = true;
8255 break;
8256
8257 case MULTI_ARG_3_SF:
8258 case MULTI_ARG_3_DF:
8259 case MULTI_ARG_3_SF2:
8260 case MULTI_ARG_3_DF2:
8261 case MULTI_ARG_3_DI:
8262 case MULTI_ARG_3_SI:
8263 case MULTI_ARG_3_SI_DI:
8264 case MULTI_ARG_3_HI:
8265 case MULTI_ARG_3_HI_SI:
8266 case MULTI_ARG_3_QI:
8267 case MULTI_ARG_3_DI2:
8268 case MULTI_ARG_3_SI2:
8269 case MULTI_ARG_3_HI2:
8270 case MULTI_ARG_3_QI2:
8271 nargs = 3;
8272 break;
8273
8274 case MULTI_ARG_2_SF:
8275 case MULTI_ARG_2_DF:
8276 case MULTI_ARG_2_DI:
8277 case MULTI_ARG_2_SI:
8278 case MULTI_ARG_2_HI:
8279 case MULTI_ARG_2_QI:
8280 nargs = 2;
8281 break;
8282
8283 case MULTI_ARG_2_DI_IMM:
8284 case MULTI_ARG_2_SI_IMM:
8285 case MULTI_ARG_2_HI_IMM:
8286 case MULTI_ARG_2_QI_IMM:
8287 nargs = 2;
8288 last_arg_constant = true;
8289 break;
8290
8291 case MULTI_ARG_1_SF:
8292 case MULTI_ARG_1_DF:
8293 case MULTI_ARG_1_SF2:
8294 case MULTI_ARG_1_DF2:
8295 case MULTI_ARG_1_DI:
8296 case MULTI_ARG_1_SI:
8297 case MULTI_ARG_1_HI:
8298 case MULTI_ARG_1_QI:
8299 case MULTI_ARG_1_SI_DI:
8300 case MULTI_ARG_1_HI_DI:
8301 case MULTI_ARG_1_HI_SI:
8302 case MULTI_ARG_1_QI_DI:
8303 case MULTI_ARG_1_QI_SI:
8304 case MULTI_ARG_1_QI_HI:
8305 nargs = 1;
8306 break;
8307
8308 case MULTI_ARG_2_DI_CMP:
8309 case MULTI_ARG_2_SI_CMP:
8310 case MULTI_ARG_2_HI_CMP:
8311 case MULTI_ARG_2_QI_CMP:
8312 nargs = 2;
8313 comparison_p = true;
8314 break;
8315
8316 case MULTI_ARG_2_SF_TF:
8317 case MULTI_ARG_2_DF_TF:
8318 case MULTI_ARG_2_DI_TF:
8319 case MULTI_ARG_2_SI_TF:
8320 case MULTI_ARG_2_HI_TF:
8321 case MULTI_ARG_2_QI_TF:
8322 nargs = 2;
8323 tf_p = true;
8324 break;
8325
8326 default:
8327 gcc_unreachable ();
8328 }
8329
8330 if (optimize || !target
8331 || GET_MODE (target) != tmode
8332 || !insn_data[icode].operand[0].predicate (target, tmode))
8333 target = gen_reg_rtx (tmode);
8334 else if (memory_operand (target, tmode))
8335 num_memory++;
8336
8337 gcc_assert (nargs <= 4);
8338
8339 for (i = 0; i < nargs; i++)
8340 {
8341 tree arg = CALL_EXPR_ARG (exp, i);
8342 rtx op = expand_normal (arg);
8343 int adjust = (comparison_p) ? 1 : 0;
8344 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
8345
8346 if (last_arg_constant && i == nargs - 1)
8347 {
8348 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
8349 {
8350 enum insn_code new_icode = icode;
8351 switch (icode)
8352 {
8353 case CODE_FOR_xop_vpermil2v2df3:
8354 case CODE_FOR_xop_vpermil2v4sf3:
8355 case CODE_FOR_xop_vpermil2v4df3:
8356 case CODE_FOR_xop_vpermil2v8sf3:
8357 error ("the last argument must be a 2-bit immediate");
8358 return gen_reg_rtx (tmode);
8359 case CODE_FOR_xop_rotlv2di3:
8360 new_icode = CODE_FOR_rotlv2di3;
8361 goto xop_rotl;
8362 case CODE_FOR_xop_rotlv4si3:
8363 new_icode = CODE_FOR_rotlv4si3;
8364 goto xop_rotl;
8365 case CODE_FOR_xop_rotlv8hi3:
8366 new_icode = CODE_FOR_rotlv8hi3;
8367 goto xop_rotl;
8368 case CODE_FOR_xop_rotlv16qi3:
8369 new_icode = CODE_FOR_rotlv16qi3;
8370 xop_rotl:
8371 if (CONST_INT_P (op))
8372 {
8373 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
8374 op = GEN_INT (INTVAL (op) & mask);
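		      /* E.g. (illustrative) for a V4SImode rotate the unit is
			 32 bits, so a constant count of 37 is reduced to
			 37 & 31 == 5 here.  */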
8375 gcc_checking_assert
8376 (insn_data[icode].operand[i + 1].predicate (op, mode));
8377 }
8378 else
8379 {
8380 gcc_checking_assert
8381 (nargs == 2
8382 && insn_data[new_icode].operand[0].mode == tmode
8383 && insn_data[new_icode].operand[1].mode == tmode
8384 && insn_data[new_icode].operand[2].mode == mode
8385 && insn_data[new_icode].operand[0].predicate
8386 == insn_data[icode].operand[0].predicate
8387 && insn_data[new_icode].operand[1].predicate
8388 == insn_data[icode].operand[1].predicate);
8389 icode = new_icode;
8390 goto non_constant;
8391 }
8392 break;
8393 default:
8394 gcc_unreachable ();
8395 }
8396 }
8397 }
8398 else
8399 {
8400 non_constant:
8401 if (VECTOR_MODE_P (mode))
8402 op = safe_vector_operand (op, mode);
8403
8404 /* If we aren't optimizing, only allow one memory operand to be
8405 generated. */
8406 if (memory_operand (op, mode))
8407 num_memory++;
8408
8409 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
8410
8411 if (optimize
8412 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
8413 || num_memory > 1)
8414 op = force_reg (mode, op);
8415 }
8416
8417 args[i].op = op;
8418 args[i].mode = mode;
8419 }
8420
8421 switch (nargs)
8422 {
8423 case 1:
8424 pat = GEN_FCN (icode) (target, args[0].op);
8425 break;
8426
8427 case 2:
8428 if (tf_p)
8429 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
8430 GEN_INT ((int)sub_code));
8431 else if (! comparison_p)
8432 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
8433 else
8434 {
8435 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
8436 args[0].op,
8437 args[1].op);
8438
8439 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
8440 }
8441 break;
8442
8443 case 3:
8444 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
8445 break;
8446
8447 case 4:
8448 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
8449 break;
8450
8451 default:
8452 gcc_unreachable ();
8453 }
8454
8455 if (! pat)
8456 return 0;
8457
8458 emit_insn (pat);
8459 return target;
8460 }
8461
8462 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8463 insns with vec_merge. */
8464
8465 static rtx
8466 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
8467 rtx target)
8468 {
8469 rtx pat;
8470 tree arg0 = CALL_EXPR_ARG (exp, 0);
8471 rtx op1, op0 = expand_normal (arg0);
8472 machine_mode tmode = insn_data[icode].operand[0].mode;
8473 machine_mode mode0 = insn_data[icode].operand[1].mode;
8474
8475 if (optimize || !target
8476 || GET_MODE (target) != tmode
8477 || !insn_data[icode].operand[0].predicate (target, tmode))
8478 target = gen_reg_rtx (tmode);
8479
8480 if (VECTOR_MODE_P (mode0))
8481 op0 = safe_vector_operand (op0, mode0);
8482
8483 if ((optimize && !register_operand (op0, mode0))
8484 || !insn_data[icode].operand[1].predicate (op0, mode0))
8485 op0 = copy_to_mode_reg (mode0, op0);
8486
8487 op1 = op0;
8488 if (!insn_data[icode].operand[2].predicate (op1, mode0))
8489 op1 = copy_to_mode_reg (mode0, op1);
8490
8491 pat = GEN_FCN (icode) (target, op0, op1);
8492 if (! pat)
8493 return 0;
8494 emit_insn (pat);
8495 return target;
8496 }
8497
8498 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8499
8500 static rtx
8501 ix86_expand_sse_compare (const struct builtin_description *d,
8502 tree exp, rtx target, bool swap)
8503 {
8504 rtx pat;
8505 tree arg0 = CALL_EXPR_ARG (exp, 0);
8506 tree arg1 = CALL_EXPR_ARG (exp, 1);
8507 rtx op0 = expand_normal (arg0);
8508 rtx op1 = expand_normal (arg1);
8509 rtx op2;
8510 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8511 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8512 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8513 enum rtx_code comparison = d->comparison;
8514
8515 if (VECTOR_MODE_P (mode0))
8516 op0 = safe_vector_operand (op0, mode0);
8517 if (VECTOR_MODE_P (mode1))
8518 op1 = safe_vector_operand (op1, mode1);
8519
8520 /* Swap operands if we have a comparison that isn't available in
8521 hardware. */
8522 if (swap)
8523 std::swap (op0, op1);
8524
8525 if (optimize || !target
8526 || GET_MODE (target) != tmode
8527 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8528 target = gen_reg_rtx (tmode);
8529
8530 if ((optimize && !register_operand (op0, mode0))
8531 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
8532 op0 = copy_to_mode_reg (mode0, op0);
8533 if ((optimize && !register_operand (op1, mode1))
8534 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
8535 op1 = copy_to_mode_reg (mode1, op1);
8536
8537 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
8538 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8539 if (! pat)
8540 return 0;
8541 emit_insn (pat);
8542 return target;
8543 }
8544
8545 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8546
8547 static rtx
8548 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
8549 rtx target)
8550 {
8551 rtx pat;
8552 tree arg0 = CALL_EXPR_ARG (exp, 0);
8553 tree arg1 = CALL_EXPR_ARG (exp, 1);
8554 rtx op0 = expand_normal (arg0);
8555 rtx op1 = expand_normal (arg1);
8556 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8557 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8558 enum rtx_code comparison = d->comparison;
8559
8560 if (VECTOR_MODE_P (mode0))
8561 op0 = safe_vector_operand (op0, mode0);
8562 if (VECTOR_MODE_P (mode1))
8563 op1 = safe_vector_operand (op1, mode1);
8564
8565 /* Swap operands if we have a comparison that isn't available in
8566 hardware. */
8567 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
8568 std::swap (op0, op1);
8569
8570 target = gen_reg_rtx (SImode);
8571 emit_move_insn (target, const0_rtx);
8572 target = gen_rtx_SUBREG (QImode, target, 0);
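  /* The SImode result is cleared first and only its low QImode part is
     written by the STRICT_LOW_PART set below, so the value returned is the
     zero-extended 0/1 flag result (roughly "xor %eax,%eax; comiss ...;
     setCC %al" -- illustrative).  */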
8573
8574 if ((optimize && !register_operand (op0, mode0))
8575 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8576 op0 = copy_to_mode_reg (mode0, op0);
8577 if ((optimize && !register_operand (op1, mode1))
8578 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8579 op1 = copy_to_mode_reg (mode1, op1);
8580
8581 pat = GEN_FCN (d->icode) (op0, op1);
8582 if (! pat)
8583 return 0;
8584 emit_insn (pat);
8585 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8586 gen_rtx_fmt_ee (comparison, QImode,
8587 SET_DEST (pat),
8588 const0_rtx)));
8589
8590 return SUBREG_REG (target);
8591 }
8592
8593 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8594
8595 static rtx
8596 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
8597 rtx target)
8598 {
8599 rtx pat;
8600 tree arg0 = CALL_EXPR_ARG (exp, 0);
8601 rtx op1, op0 = expand_normal (arg0);
8602 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8603 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8604
8605 if (optimize || target == 0
8606 || GET_MODE (target) != tmode
8607 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8608 target = gen_reg_rtx (tmode);
8609
8610 if (VECTOR_MODE_P (mode0))
8611 op0 = safe_vector_operand (op0, mode0);
8612
8613 if ((optimize && !register_operand (op0, mode0))
8614 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8615 op0 = copy_to_mode_reg (mode0, op0);
8616
8617 op1 = GEN_INT (d->comparison);
8618
8619 pat = GEN_FCN (d->icode) (target, op0, op1);
8620 if (! pat)
8621 return 0;
8622 emit_insn (pat);
8623 return target;
8624 }
8625
8626 static rtx
8627 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
8628 tree exp, rtx target)
8629 {
8630 rtx pat;
8631 tree arg0 = CALL_EXPR_ARG (exp, 0);
8632 tree arg1 = CALL_EXPR_ARG (exp, 1);
8633 rtx op0 = expand_normal (arg0);
8634 rtx op1 = expand_normal (arg1);
8635 rtx op2;
8636 machine_mode tmode = insn_data[d->icode].operand[0].mode;
8637 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
8638 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
8639
8640 if (optimize || target == 0
8641 || GET_MODE (target) != tmode
8642 || !insn_data[d->icode].operand[0].predicate (target, tmode))
8643 target = gen_reg_rtx (tmode);
8644
8645 op0 = safe_vector_operand (op0, mode0);
8646 op1 = safe_vector_operand (op1, mode1);
8647
8648 if ((optimize && !register_operand (op0, mode0))
8649 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8650 op0 = copy_to_mode_reg (mode0, op0);
8651 if ((optimize && !register_operand (op1, mode1))
8652 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8653 op1 = copy_to_mode_reg (mode1, op1);
8654
8655 op2 = GEN_INT (d->comparison);
8656
8657 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
8658 if (! pat)
8659 return 0;
8660 emit_insn (pat);
8661 return target;
8662 }
8663
8664 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8665
8666 static rtx
8667 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
8668 rtx target)
8669 {
8670 rtx pat;
8671 tree arg0 = CALL_EXPR_ARG (exp, 0);
8672 tree arg1 = CALL_EXPR_ARG (exp, 1);
8673 rtx op0 = expand_normal (arg0);
8674 rtx op1 = expand_normal (arg1);
8675 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
8676 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
8677 enum rtx_code comparison = d->comparison;
8678
8679 if (VECTOR_MODE_P (mode0))
8680 op0 = safe_vector_operand (op0, mode0);
8681 if (VECTOR_MODE_P (mode1))
8682 op1 = safe_vector_operand (op1, mode1);
8683
8684 target = gen_reg_rtx (SImode);
8685 emit_move_insn (target, const0_rtx);
8686 target = gen_rtx_SUBREG (QImode, target, 0);
8687
8688 if ((optimize && !register_operand (op0, mode0))
8689 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
8690 op0 = copy_to_mode_reg (mode0, op0);
8691 if ((optimize && !register_operand (op1, mode1))
8692 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
8693 op1 = copy_to_mode_reg (mode1, op1);
8694
8695 pat = GEN_FCN (d->icode) (op0, op1);
8696 if (! pat)
8697 return 0;
8698 emit_insn (pat);
8699 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8700 gen_rtx_fmt_ee (comparison, QImode,
8701 SET_DEST (pat),
8702 const0_rtx)));
8703
8704 return SUBREG_REG (target);
8705 }
8706
8707 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8708
8709 static rtx
8710 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
8711 tree exp, rtx target)
8712 {
8713 rtx pat;
8714 tree arg0 = CALL_EXPR_ARG (exp, 0);
8715 tree arg1 = CALL_EXPR_ARG (exp, 1);
8716 tree arg2 = CALL_EXPR_ARG (exp, 2);
8717 tree arg3 = CALL_EXPR_ARG (exp, 3);
8718 tree arg4 = CALL_EXPR_ARG (exp, 4);
8719 rtx scratch0, scratch1;
8720 rtx op0 = expand_normal (arg0);
8721 rtx op1 = expand_normal (arg1);
8722 rtx op2 = expand_normal (arg2);
8723 rtx op3 = expand_normal (arg3);
8724 rtx op4 = expand_normal (arg4);
8725 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
8726
8727 tmode0 = insn_data[d->icode].operand[0].mode;
8728 tmode1 = insn_data[d->icode].operand[1].mode;
8729 modev2 = insn_data[d->icode].operand[2].mode;
8730 modei3 = insn_data[d->icode].operand[3].mode;
8731 modev4 = insn_data[d->icode].operand[4].mode;
8732 modei5 = insn_data[d->icode].operand[5].mode;
8733 modeimm = insn_data[d->icode].operand[6].mode;
8734
8735 if (VECTOR_MODE_P (modev2))
8736 op0 = safe_vector_operand (op0, modev2);
8737 if (VECTOR_MODE_P (modev4))
8738 op2 = safe_vector_operand (op2, modev4);
8739
8740 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8741 op0 = copy_to_mode_reg (modev2, op0);
8742 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
8743 op1 = copy_to_mode_reg (modei3, op1);
8744 if ((optimize && !register_operand (op2, modev4))
8745 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
8746 op2 = copy_to_mode_reg (modev4, op2);
8747 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
8748 op3 = copy_to_mode_reg (modei5, op3);
8749
8750 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
8751 {
8752 error ("the fifth argument must be an 8-bit immediate");
8753 return const0_rtx;
8754 }
8755
8756 if (d->code == IX86_BUILTIN_PCMPESTRI128)
8757 {
8758 if (optimize || !target
8759 || GET_MODE (target) != tmode0
8760 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8761 target = gen_reg_rtx (tmode0);
8762
8763 scratch1 = gen_reg_rtx (tmode1);
8764
8765 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
8766 }
8767 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
8768 {
8769 if (optimize || !target
8770 || GET_MODE (target) != tmode1
8771 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8772 target = gen_reg_rtx (tmode1);
8773
8774 scratch0 = gen_reg_rtx (tmode0);
8775
8776 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
8777 }
8778 else
8779 {
8780 gcc_assert (d->flag);
8781
8782 scratch0 = gen_reg_rtx (tmode0);
8783 scratch1 = gen_reg_rtx (tmode1);
8784
8785 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
8786 }
8787
8788 if (! pat)
8789 return 0;
8790
8791 emit_insn (pat);
8792
8793 if (d->flag)
8794 {
8795 target = gen_reg_rtx (SImode);
8796 emit_move_insn (target, const0_rtx);
8797 target = gen_rtx_SUBREG (QImode, target, 0);
8798
8799 emit_insn
8800 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8801 gen_rtx_fmt_ee (EQ, QImode,
8802 gen_rtx_REG ((machine_mode) d->flag,
8803 FLAGS_REG),
8804 const0_rtx)));
8805 return SUBREG_REG (target);
8806 }
8807 else
8808 return target;
8809 }
8810
8811
8812 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8813
8814 static rtx
8815 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
8816 tree exp, rtx target)
8817 {
8818 rtx pat;
8819 tree arg0 = CALL_EXPR_ARG (exp, 0);
8820 tree arg1 = CALL_EXPR_ARG (exp, 1);
8821 tree arg2 = CALL_EXPR_ARG (exp, 2);
8822 rtx scratch0, scratch1;
8823 rtx op0 = expand_normal (arg0);
8824 rtx op1 = expand_normal (arg1);
8825 rtx op2 = expand_normal (arg2);
8826 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
8827
8828 tmode0 = insn_data[d->icode].operand[0].mode;
8829 tmode1 = insn_data[d->icode].operand[1].mode;
8830 modev2 = insn_data[d->icode].operand[2].mode;
8831 modev3 = insn_data[d->icode].operand[3].mode;
8832 modeimm = insn_data[d->icode].operand[4].mode;
8833
8834 if (VECTOR_MODE_P (modev2))
8835 op0 = safe_vector_operand (op0, modev2);
8836 if (VECTOR_MODE_P (modev3))
8837 op1 = safe_vector_operand (op1, modev3);
8838
8839 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
8840 op0 = copy_to_mode_reg (modev2, op0);
8841 if ((optimize && !register_operand (op1, modev3))
8842 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
8843 op1 = copy_to_mode_reg (modev3, op1);
8844
8845 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
8846 {
8847 error ("the third argument must be an 8-bit immediate");
8848 return const0_rtx;
8849 }
8850
8851 if (d->code == IX86_BUILTIN_PCMPISTRI128)
8852 {
8853 if (optimize || !target
8854 || GET_MODE (target) != tmode0
8855 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
8856 target = gen_reg_rtx (tmode0);
8857
8858 scratch1 = gen_reg_rtx (tmode1);
8859
8860 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
8861 }
8862 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
8863 {
8864 if (optimize || !target
8865 || GET_MODE (target) != tmode1
8866 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
8867 target = gen_reg_rtx (tmode1);
8868
8869 scratch0 = gen_reg_rtx (tmode0);
8870
8871 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
8872 }
8873 else
8874 {
8875 gcc_assert (d->flag);
8876
8877 scratch0 = gen_reg_rtx (tmode0);
8878 scratch1 = gen_reg_rtx (tmode1);
8879
8880 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
8881 }
8882
8883 if (! pat)
8884 return 0;
8885
8886 emit_insn (pat);
8887
8888 if (d->flag)
8889 {
8890 target = gen_reg_rtx (SImode);
8891 emit_move_insn (target, const0_rtx);
8892 target = gen_rtx_SUBREG (QImode, target, 0);
8893
8894 emit_insn
8895 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
8896 gen_rtx_fmt_ee (EQ, QImode,
8897 gen_rtx_REG ((machine_mode) d->flag,
8898 FLAGS_REG),
8899 const0_rtx)));
8900 return SUBREG_REG (target);
8901 }
8902 else
8903 return target;
8904 }
8905
8906 /* Fix up modeless constants to fit the required mode.  */
8907
8908 static rtx
8909 fixup_modeless_constant (rtx x, machine_mode mode)
8910 {
8911 if (GET_MODE (x) == VOIDmode)
8912 x = convert_to_mode (mode, x, 1);
8913 return x;
8914 }
8915
8916 /* Subroutine of ix86_expand_builtin to take care of insns with a
8917    variable number of operands.  */
8918
8919 static rtx
8920 ix86_expand_args_builtin (const struct builtin_description *d,
8921 tree exp, rtx target)
8922 {
8923 rtx pat, real_target;
8924 unsigned int i, nargs;
8925 unsigned int nargs_constant = 0;
8926 unsigned int mask_pos = 0;
8927 int num_memory = 0;
8928 struct
8929 {
8930 rtx op;
8931 machine_mode mode;
8932 } args[6];
8933 bool second_arg_count = false;
8934 enum insn_code icode = d->icode;
8935 const struct insn_data_d *insn_p = &insn_data[icode];
8936 machine_mode tmode = insn_p->operand[0].mode;
8937 machine_mode rmode = VOIDmode;
8938 bool swap = false;
8939 enum rtx_code comparison = d->comparison;
8940
8941 switch ((enum ix86_builtin_func_type) d->flag)
8942 {
8943 case V2DF_FTYPE_V2DF_ROUND:
8944 case V4DF_FTYPE_V4DF_ROUND:
8945 case V8DF_FTYPE_V8DF_ROUND:
8946 case V4SF_FTYPE_V4SF_ROUND:
8947 case V8SF_FTYPE_V8SF_ROUND:
8948 case V16SF_FTYPE_V16SF_ROUND:
8949 case V4SI_FTYPE_V4SF_ROUND:
8950 case V8SI_FTYPE_V8SF_ROUND:
8951 case V16SI_FTYPE_V16SF_ROUND:
8952 return ix86_expand_sse_round (d, exp, target);
8953 case V4SI_FTYPE_V2DF_V2DF_ROUND:
8954 case V8SI_FTYPE_V4DF_V4DF_ROUND:
8955 case V16SI_FTYPE_V8DF_V8DF_ROUND:
8956 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
8957 case INT_FTYPE_V8SF_V8SF_PTEST:
8958 case INT_FTYPE_V4DI_V4DI_PTEST:
8959 case INT_FTYPE_V4DF_V4DF_PTEST:
8960 case INT_FTYPE_V4SF_V4SF_PTEST:
8961 case INT_FTYPE_V2DI_V2DI_PTEST:
8962 case INT_FTYPE_V2DF_V2DF_PTEST:
8963 return ix86_expand_sse_ptest (d, exp, target);
8964 case FLOAT128_FTYPE_FLOAT128:
8965 case FLOAT_FTYPE_FLOAT:
8966 case INT_FTYPE_INT:
8967 case UINT_FTYPE_UINT:
8968 case UINT16_FTYPE_UINT16:
8969 case UINT64_FTYPE_INT:
8970 case UINT64_FTYPE_UINT64:
8971 case INT64_FTYPE_INT64:
8972 case INT64_FTYPE_V4SF:
8973 case INT64_FTYPE_V2DF:
8974 case INT_FTYPE_V16QI:
8975 case INT_FTYPE_V8QI:
8976 case INT_FTYPE_V8SF:
8977 case INT_FTYPE_V4DF:
8978 case INT_FTYPE_V4SF:
8979 case INT_FTYPE_V2DF:
8980 case INT_FTYPE_V32QI:
8981 case V16QI_FTYPE_V16QI:
8982 case V8SI_FTYPE_V8SF:
8983 case V8SI_FTYPE_V4SI:
8984 case V8HI_FTYPE_V8HI:
8985 case V8HI_FTYPE_V16QI:
8986 case V8QI_FTYPE_V8QI:
8987 case V8SF_FTYPE_V8SF:
8988 case V8SF_FTYPE_V8SI:
8989 case V8SF_FTYPE_V4SF:
8990 case V8SF_FTYPE_V8HI:
8991 case V4SI_FTYPE_V4SI:
8992 case V4SI_FTYPE_V16QI:
8993 case V4SI_FTYPE_V4SF:
8994 case V4SI_FTYPE_V8SI:
8995 case V4SI_FTYPE_V8HI:
8996 case V4SI_FTYPE_V4DF:
8997 case V4SI_FTYPE_V2DF:
8998 case V4HI_FTYPE_V4HI:
8999 case V4DF_FTYPE_V4DF:
9000 case V4DF_FTYPE_V4SI:
9001 case V4DF_FTYPE_V4SF:
9002 case V4DF_FTYPE_V2DF:
9003 case V4SF_FTYPE_V4SF:
9004 case V4SF_FTYPE_V4SI:
9005 case V4SF_FTYPE_V8SF:
9006 case V4SF_FTYPE_V4DF:
9007 case V4SF_FTYPE_V8HI:
9008 case V4SF_FTYPE_V2DF:
9009 case V2DI_FTYPE_V2DI:
9010 case V2DI_FTYPE_V16QI:
9011 case V2DI_FTYPE_V8HI:
9012 case V2DI_FTYPE_V4SI:
9013 case V2DF_FTYPE_V2DF:
9014 case V2DF_FTYPE_V4SI:
9015 case V2DF_FTYPE_V4DF:
9016 case V2DF_FTYPE_V4SF:
9017 case V2DF_FTYPE_V2SI:
9018 case V2SI_FTYPE_V2SI:
9019 case V2SI_FTYPE_V4SF:
9020 case V2SI_FTYPE_V2SF:
9021 case V2SI_FTYPE_V2DF:
9022 case V2SF_FTYPE_V2SF:
9023 case V2SF_FTYPE_V2SI:
9024 case V32QI_FTYPE_V32QI:
9025 case V32QI_FTYPE_V16QI:
9026 case V16HI_FTYPE_V16HI:
9027 case V16HI_FTYPE_V8HI:
9028 case V8SI_FTYPE_V8SI:
9029 case V16HI_FTYPE_V16QI:
9030 case V8SI_FTYPE_V16QI:
9031 case V4DI_FTYPE_V16QI:
9032 case V8SI_FTYPE_V8HI:
9033 case V4DI_FTYPE_V8HI:
9034 case V4DI_FTYPE_V4SI:
9035 case V4DI_FTYPE_V2DI:
9036 case UQI_FTYPE_UQI:
9037 case UHI_FTYPE_UHI:
9038 case USI_FTYPE_USI:
9039 case USI_FTYPE_UQI:
9040 case USI_FTYPE_UHI:
9041 case UDI_FTYPE_UDI:
9042 case UHI_FTYPE_V16QI:
9043 case USI_FTYPE_V32QI:
9044 case UDI_FTYPE_V64QI:
9045 case V16QI_FTYPE_UHI:
9046 case V32QI_FTYPE_USI:
9047 case V64QI_FTYPE_UDI:
9048 case V8HI_FTYPE_UQI:
9049 case V16HI_FTYPE_UHI:
9050 case V32HI_FTYPE_USI:
9051 case V4SI_FTYPE_UQI:
9052 case V8SI_FTYPE_UQI:
9053 case V4SI_FTYPE_UHI:
9054 case V8SI_FTYPE_UHI:
9055 case UQI_FTYPE_V8HI:
9056 case UHI_FTYPE_V16HI:
9057 case USI_FTYPE_V32HI:
9058 case UQI_FTYPE_V4SI:
9059 case UQI_FTYPE_V8SI:
9060 case UHI_FTYPE_V16SI:
9061 case UQI_FTYPE_V2DI:
9062 case UQI_FTYPE_V4DI:
9063 case UQI_FTYPE_V8DI:
9064 case V16SI_FTYPE_UHI:
9065 case V2DI_FTYPE_UQI:
9066 case V4DI_FTYPE_UQI:
9067 case V16SI_FTYPE_INT:
9068 case V16SF_FTYPE_V8SF:
9069 case V16SI_FTYPE_V8SI:
9070 case V16SF_FTYPE_V4SF:
9071 case V16SI_FTYPE_V4SI:
9072 case V16SI_FTYPE_V16SF:
9073 case V16SI_FTYPE_V16SI:
9074 case V64QI_FTYPE_V64QI:
9075 case V32HI_FTYPE_V32HI:
9076 case V16SF_FTYPE_V16SF:
9077 case V8DI_FTYPE_UQI:
9078 case V8DI_FTYPE_V8DI:
9079 case V8DF_FTYPE_V4DF:
9080 case V8DF_FTYPE_V2DF:
9081 case V8DF_FTYPE_V8DF:
9082 case V4DI_FTYPE_V4DI:
9083 case V16HI_FTYPE_V16SF:
9084 case V8HI_FTYPE_V8SF:
9085 case V8HI_FTYPE_V4SF:
9086 nargs = 1;
9087 break;
9088 case V4SF_FTYPE_V4SF_VEC_MERGE:
9089 case V2DF_FTYPE_V2DF_VEC_MERGE:
9090 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
9091 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
9092 case V16QI_FTYPE_V16QI_V16QI:
9093 case V16QI_FTYPE_V8HI_V8HI:
9094 case V16SF_FTYPE_V16SF_V16SF:
9095 case V8QI_FTYPE_V8QI_V8QI:
9096 case V8QI_FTYPE_V4HI_V4HI:
9097 case V8HI_FTYPE_V8HI_V8HI:
9098 case V8HI_FTYPE_V16QI_V16QI:
9099 case V8HI_FTYPE_V4SI_V4SI:
9100 case V8SF_FTYPE_V8SF_V8SF:
9101 case V8SF_FTYPE_V8SF_V8SI:
9102 case V8DF_FTYPE_V8DF_V8DF:
9103 case V4SI_FTYPE_V4SI_V4SI:
9104 case V4SI_FTYPE_V8HI_V8HI:
9105 case V4SI_FTYPE_V2DF_V2DF:
9106 case V4HI_FTYPE_V4HI_V4HI:
9107 case V4HI_FTYPE_V8QI_V8QI:
9108 case V4HI_FTYPE_V2SI_V2SI:
9109 case V4DF_FTYPE_V4DF_V4DF:
9110 case V4DF_FTYPE_V4DF_V4DI:
9111 case V4SF_FTYPE_V4SF_V4SF:
9112 case V4SF_FTYPE_V4SF_V4SI:
9113 case V4SF_FTYPE_V4SF_V2SI:
9114 case V4SF_FTYPE_V4SF_V2DF:
9115 case V4SF_FTYPE_V4SF_UINT:
9116 case V4SF_FTYPE_V4SF_DI:
9117 case V4SF_FTYPE_V4SF_SI:
9118 case V2DI_FTYPE_V2DI_V2DI:
9119 case V2DI_FTYPE_V16QI_V16QI:
9120 case V2DI_FTYPE_V4SI_V4SI:
9121 case V2DI_FTYPE_V2DI_V16QI:
9122 case V2SI_FTYPE_V2SI_V2SI:
9123 case V2SI_FTYPE_V4HI_V4HI:
9124 case V2SI_FTYPE_V2SF_V2SF:
9125 case V2DF_FTYPE_V2DF_V2DF:
9126 case V2DF_FTYPE_V2DF_V4SF:
9127 case V2DF_FTYPE_V2DF_V2DI:
9128 case V2DF_FTYPE_V2DF_DI:
9129 case V2DF_FTYPE_V2DF_SI:
9130 case V2DF_FTYPE_V2DF_UINT:
9131 case V2SF_FTYPE_V2SF_V2SF:
9132 case V1DI_FTYPE_V1DI_V1DI:
9133 case V1DI_FTYPE_V8QI_V8QI:
9134 case V1DI_FTYPE_V2SI_V2SI:
9135 case V32QI_FTYPE_V16HI_V16HI:
9136 case V16HI_FTYPE_V8SI_V8SI:
9137 case V64QI_FTYPE_V64QI_V64QI:
9138 case V32QI_FTYPE_V32QI_V32QI:
9139 case V16HI_FTYPE_V32QI_V32QI:
9140 case V16HI_FTYPE_V16HI_V16HI:
9141 case V8SI_FTYPE_V4DF_V4DF:
9142 case V8SI_FTYPE_V8SI_V8SI:
9143 case V8SI_FTYPE_V16HI_V16HI:
9144 case V4DI_FTYPE_V4DI_V4DI:
9145 case V4DI_FTYPE_V8SI_V8SI:
9146 case V8DI_FTYPE_V64QI_V64QI:
9147 if (comparison == UNKNOWN)
9148 return ix86_expand_binop_builtin (icode, exp, target);
9149 nargs = 2;
9150 break;
9151 case V4SF_FTYPE_V4SF_V4SF_SWAP:
9152 case V2DF_FTYPE_V2DF_V2DF_SWAP:
9153 gcc_assert (comparison != UNKNOWN);
9154 nargs = 2;
9155 swap = true;
9156 break;
9157 case V16HI_FTYPE_V16HI_V8HI_COUNT:
9158 case V16HI_FTYPE_V16HI_SI_COUNT:
9159 case V8SI_FTYPE_V8SI_V4SI_COUNT:
9160 case V8SI_FTYPE_V8SI_SI_COUNT:
9161 case V4DI_FTYPE_V4DI_V2DI_COUNT:
9162 case V4DI_FTYPE_V4DI_INT_COUNT:
9163 case V8HI_FTYPE_V8HI_V8HI_COUNT:
9164 case V8HI_FTYPE_V8HI_SI_COUNT:
9165 case V4SI_FTYPE_V4SI_V4SI_COUNT:
9166 case V4SI_FTYPE_V4SI_SI_COUNT:
9167 case V4HI_FTYPE_V4HI_V4HI_COUNT:
9168 case V4HI_FTYPE_V4HI_SI_COUNT:
9169 case V2DI_FTYPE_V2DI_V2DI_COUNT:
9170 case V2DI_FTYPE_V2DI_SI_COUNT:
9171 case V2SI_FTYPE_V2SI_V2SI_COUNT:
9172 case V2SI_FTYPE_V2SI_SI_COUNT:
9173 case V1DI_FTYPE_V1DI_V1DI_COUNT:
9174 case V1DI_FTYPE_V1DI_SI_COUNT:
9175 nargs = 2;
9176 second_arg_count = true;
9177 break;
9178 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
9179 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
9180 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
9181 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
9182 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
9183 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
9184 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
9185 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
9186 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
9187 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
9188 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
9189 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
9190 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
9191 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
9192 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
9193 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
9194 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
9195 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
9196 nargs = 4;
9197 second_arg_count = true;
9198 break;
9199 case UINT64_FTYPE_UINT64_UINT64:
9200 case UINT_FTYPE_UINT_UINT:
9201 case UINT_FTYPE_UINT_USHORT:
9202 case UINT_FTYPE_UINT_UCHAR:
9203 case UINT16_FTYPE_UINT16_INT:
9204 case UINT8_FTYPE_UINT8_INT:
9205 case UQI_FTYPE_UQI_UQI:
9206 case UHI_FTYPE_UHI_UHI:
9207 case USI_FTYPE_USI_USI:
9208 case UDI_FTYPE_UDI_UDI:
9209 case V16SI_FTYPE_V8DF_V8DF:
9210 case V32HI_FTYPE_V16SF_V16SF:
9211 case V16HI_FTYPE_V8SF_V8SF:
9212 case V8HI_FTYPE_V4SF_V4SF:
9213 case V16HI_FTYPE_V16SF_UHI:
9214 case V8HI_FTYPE_V8SF_UQI:
9215 case V8HI_FTYPE_V4SF_UQI:
9216 nargs = 2;
9217 break;
9218 case V2DI_FTYPE_V2DI_INT_CONVERT:
9219 nargs = 2;
9220 rmode = V1TImode;
9221 nargs_constant = 1;
9222 break;
9223 case V4DI_FTYPE_V4DI_INT_CONVERT:
9224 nargs = 2;
9225 rmode = V2TImode;
9226 nargs_constant = 1;
9227 break;
9228 case V8DI_FTYPE_V8DI_INT_CONVERT:
9229 nargs = 2;
9230 rmode = V4TImode;
9231 nargs_constant = 1;
9232 break;
9233 case V8HI_FTYPE_V8HI_INT:
9234 case V8HI_FTYPE_V8SF_INT:
9235 case V16HI_FTYPE_V16SF_INT:
9236 case V8HI_FTYPE_V4SF_INT:
9237 case V8SF_FTYPE_V8SF_INT:
9238 case V4SF_FTYPE_V16SF_INT:
9239 case V16SF_FTYPE_V16SF_INT:
9240 case V4SI_FTYPE_V4SI_INT:
9241 case V4SI_FTYPE_V8SI_INT:
9242 case V4HI_FTYPE_V4HI_INT:
9243 case V4DF_FTYPE_V4DF_INT:
9244 case V4DF_FTYPE_V8DF_INT:
9245 case V4SF_FTYPE_V4SF_INT:
9246 case V4SF_FTYPE_V8SF_INT:
9247 case V2DI_FTYPE_V2DI_INT:
9248 case V2DF_FTYPE_V2DF_INT:
9249 case V2DF_FTYPE_V4DF_INT:
9250 case V16HI_FTYPE_V16HI_INT:
9251 case V8SI_FTYPE_V8SI_INT:
9252 case V16SI_FTYPE_V16SI_INT:
9253 case V4SI_FTYPE_V16SI_INT:
9254 case V4DI_FTYPE_V4DI_INT:
9255 case V2DI_FTYPE_V4DI_INT:
9256 case V4DI_FTYPE_V8DI_INT:
9257 case UQI_FTYPE_UQI_UQI_CONST:
9258 case UHI_FTYPE_UHI_UQI:
9259 case USI_FTYPE_USI_UQI:
9260 case UDI_FTYPE_UDI_UQI:
9261 nargs = 2;
9262 nargs_constant = 1;
9263 break;
9264 case V16QI_FTYPE_V16QI_V16QI_V16QI:
9265 case V8SF_FTYPE_V8SF_V8SF_V8SF:
9266 case V4DF_FTYPE_V4DF_V4DF_V4DF:
9267 case V4SF_FTYPE_V4SF_V4SF_V4SF:
9268 case V2DF_FTYPE_V2DF_V2DF_V2DF:
9269 case V32QI_FTYPE_V32QI_V32QI_V32QI:
9270 case UHI_FTYPE_V16SI_V16SI_UHI:
9271 case UQI_FTYPE_V8DI_V8DI_UQI:
9272 case V16HI_FTYPE_V16SI_V16HI_UHI:
9273 case V16QI_FTYPE_V16SI_V16QI_UHI:
9274 case V16QI_FTYPE_V8DI_V16QI_UQI:
9275 case V16SF_FTYPE_V16SF_V16SF_UHI:
9276 case V16SF_FTYPE_V4SF_V16SF_UHI:
9277 case V16SI_FTYPE_SI_V16SI_UHI:
9278 case V16SI_FTYPE_V16HI_V16SI_UHI:
9279 case V16SI_FTYPE_V16QI_V16SI_UHI:
9280 case V8SF_FTYPE_V4SF_V8SF_UQI:
9281 case V4DF_FTYPE_V2DF_V4DF_UQI:
9282 case V8SI_FTYPE_V4SI_V8SI_UQI:
9283 case V8SI_FTYPE_SI_V8SI_UQI:
9284 case V4SI_FTYPE_V4SI_V4SI_UQI:
9285 case V4SI_FTYPE_SI_V4SI_UQI:
9286 case V4DI_FTYPE_V2DI_V4DI_UQI:
9287 case V4DI_FTYPE_DI_V4DI_UQI:
9288 case V2DI_FTYPE_V2DI_V2DI_UQI:
9289 case V2DI_FTYPE_DI_V2DI_UQI:
9290 case V64QI_FTYPE_V64QI_V64QI_UDI:
9291 case V64QI_FTYPE_V16QI_V64QI_UDI:
9292 case V64QI_FTYPE_QI_V64QI_UDI:
9293 case V32QI_FTYPE_V32QI_V32QI_USI:
9294 case V32QI_FTYPE_V16QI_V32QI_USI:
9295 case V32QI_FTYPE_QI_V32QI_USI:
9296 case V16QI_FTYPE_V16QI_V16QI_UHI:
9297 case V16QI_FTYPE_QI_V16QI_UHI:
9298 case V32HI_FTYPE_V8HI_V32HI_USI:
9299 case V32HI_FTYPE_HI_V32HI_USI:
9300 case V16HI_FTYPE_V8HI_V16HI_UHI:
9301 case V16HI_FTYPE_HI_V16HI_UHI:
9302 case V8HI_FTYPE_V8HI_V8HI_UQI:
9303 case V8HI_FTYPE_HI_V8HI_UQI:
9304 case V8SF_FTYPE_V8HI_V8SF_UQI:
9305 case V4SF_FTYPE_V8HI_V4SF_UQI:
9306 case V8SI_FTYPE_V8SF_V8SI_UQI:
9307 case V4SI_FTYPE_V4SF_V4SI_UQI:
9308 case V4DI_FTYPE_V4SF_V4DI_UQI:
9309 case V2DI_FTYPE_V4SF_V2DI_UQI:
9310 case V4SF_FTYPE_V4DI_V4SF_UQI:
9311 case V4SF_FTYPE_V2DI_V4SF_UQI:
9312 case V4DF_FTYPE_V4DI_V4DF_UQI:
9313 case V2DF_FTYPE_V2DI_V2DF_UQI:
9314 case V16QI_FTYPE_V8HI_V16QI_UQI:
9315 case V16QI_FTYPE_V16HI_V16QI_UHI:
9316 case V16QI_FTYPE_V4SI_V16QI_UQI:
9317 case V16QI_FTYPE_V8SI_V16QI_UQI:
9318 case V8HI_FTYPE_V4SI_V8HI_UQI:
9319 case V8HI_FTYPE_V8SI_V8HI_UQI:
9320 case V16QI_FTYPE_V2DI_V16QI_UQI:
9321 case V16QI_FTYPE_V4DI_V16QI_UQI:
9322 case V8HI_FTYPE_V2DI_V8HI_UQI:
9323 case V8HI_FTYPE_V4DI_V8HI_UQI:
9324 case V4SI_FTYPE_V2DI_V4SI_UQI:
9325 case V4SI_FTYPE_V4DI_V4SI_UQI:
9326 case V32QI_FTYPE_V32HI_V32QI_USI:
9327 case UHI_FTYPE_V16QI_V16QI_UHI:
9328 case USI_FTYPE_V32QI_V32QI_USI:
9329 case UDI_FTYPE_V64QI_V64QI_UDI:
9330 case UQI_FTYPE_V8HI_V8HI_UQI:
9331 case UHI_FTYPE_V16HI_V16HI_UHI:
9332 case USI_FTYPE_V32HI_V32HI_USI:
9333 case UQI_FTYPE_V4SI_V4SI_UQI:
9334 case UQI_FTYPE_V8SI_V8SI_UQI:
9335 case UQI_FTYPE_V2DI_V2DI_UQI:
9336 case UQI_FTYPE_V4DI_V4DI_UQI:
9337 case V4SF_FTYPE_V2DF_V4SF_UQI:
9338 case V4SF_FTYPE_V4DF_V4SF_UQI:
9339 case V16SI_FTYPE_V16SI_V16SI_UHI:
9340 case V16SI_FTYPE_V4SI_V16SI_UHI:
9341 case V2DI_FTYPE_V4SI_V2DI_UQI:
9342 case V2DI_FTYPE_V8HI_V2DI_UQI:
9343 case V2DI_FTYPE_V16QI_V2DI_UQI:
9344 case V4DI_FTYPE_V4DI_V4DI_UQI:
9345 case V4DI_FTYPE_V4SI_V4DI_UQI:
9346 case V4DI_FTYPE_V8HI_V4DI_UQI:
9347 case V4DI_FTYPE_V16QI_V4DI_UQI:
9348 case V4DI_FTYPE_V4DF_V4DI_UQI:
9349 case V2DI_FTYPE_V2DF_V2DI_UQI:
9350 case V4SI_FTYPE_V4DF_V4SI_UQI:
9351 case V4SI_FTYPE_V2DF_V4SI_UQI:
9352 case V4SI_FTYPE_V8HI_V4SI_UQI:
9353 case V4SI_FTYPE_V16QI_V4SI_UQI:
9354 case V4DI_FTYPE_V4DI_V4DI_V4DI:
9355 case V8DF_FTYPE_V2DF_V8DF_UQI:
9356 case V8DF_FTYPE_V4DF_V8DF_UQI:
9357 case V8DF_FTYPE_V8DF_V8DF_UQI:
9358 case V8SF_FTYPE_V8SF_V8SF_UQI:
9359 case V8SF_FTYPE_V8SI_V8SF_UQI:
9360 case V4DF_FTYPE_V4DF_V4DF_UQI:
9361 case V4SF_FTYPE_V4SF_V4SF_UQI:
9362 case V2DF_FTYPE_V2DF_V2DF_UQI:
9363 case V2DF_FTYPE_V4SF_V2DF_UQI:
9364 case V2DF_FTYPE_V4SI_V2DF_UQI:
9365 case V4SF_FTYPE_V4SI_V4SF_UQI:
9366 case V4DF_FTYPE_V4SF_V4DF_UQI:
9367 case V4DF_FTYPE_V4SI_V4DF_UQI:
9368 case V8SI_FTYPE_V8SI_V8SI_UQI:
9369 case V8SI_FTYPE_V8HI_V8SI_UQI:
9370 case V8SI_FTYPE_V16QI_V8SI_UQI:
9371 case V8DF_FTYPE_V8SI_V8DF_UQI:
9372 case V8DI_FTYPE_DI_V8DI_UQI:
9373 case V16SF_FTYPE_V8SF_V16SF_UHI:
9374 case V16SI_FTYPE_V8SI_V16SI_UHI:
9375 case V16HI_FTYPE_V16HI_V16HI_UHI:
9376 case V8HI_FTYPE_V16QI_V8HI_UQI:
9377 case V16HI_FTYPE_V16QI_V16HI_UHI:
9378 case V32HI_FTYPE_V32HI_V32HI_USI:
9379 case V32HI_FTYPE_V32QI_V32HI_USI:
9380 case V8DI_FTYPE_V16QI_V8DI_UQI:
9381 case V8DI_FTYPE_V2DI_V8DI_UQI:
9382 case V8DI_FTYPE_V4DI_V8DI_UQI:
9383 case V8DI_FTYPE_V8DI_V8DI_UQI:
9384 case V8DI_FTYPE_V8HI_V8DI_UQI:
9385 case V8DI_FTYPE_V8SI_V8DI_UQI:
9386 case V8HI_FTYPE_V8DI_V8HI_UQI:
9387 case V8SI_FTYPE_V8DI_V8SI_UQI:
9388 case V4SI_FTYPE_V4SI_V4SI_V4SI:
9389 case V16SI_FTYPE_V16SI_V16SI_V16SI:
9390 case V8DI_FTYPE_V8DI_V8DI_V8DI:
9391 case V32HI_FTYPE_V32HI_V32HI_V32HI:
9392 case V2DI_FTYPE_V2DI_V2DI_V2DI:
9393 case V16HI_FTYPE_V16HI_V16HI_V16HI:
9394 case V8SI_FTYPE_V8SI_V8SI_V8SI:
9395 case V8HI_FTYPE_V8HI_V8HI_V8HI:
9396 case V32HI_FTYPE_V16SF_V16SF_USI:
9397 case V16HI_FTYPE_V8SF_V8SF_UHI:
9398 case V8HI_FTYPE_V4SF_V4SF_UQI:
9399 case V16HI_FTYPE_V16SF_V16HI_UHI:
9400 case V8HI_FTYPE_V8SF_V8HI_UQI:
9401 case V8HI_FTYPE_V4SF_V8HI_UQI:
9402 case V16SF_FTYPE_V16SF_V32HI_V32HI:
9403 case V8SF_FTYPE_V8SF_V16HI_V16HI:
9404 case V4SF_FTYPE_V4SF_V8HI_V8HI:
9405 nargs = 3;
9406 break;
9407 case V32QI_FTYPE_V32QI_V32QI_INT:
9408 case V16HI_FTYPE_V16HI_V16HI_INT:
9409 case V16QI_FTYPE_V16QI_V16QI_INT:
9410 case V4DI_FTYPE_V4DI_V4DI_INT:
9411 case V8HI_FTYPE_V8HI_V8HI_INT:
9412 case V8SI_FTYPE_V8SI_V8SI_INT:
9413 case V8SI_FTYPE_V8SI_V4SI_INT:
9414 case V8SF_FTYPE_V8SF_V8SF_INT:
9415 case V8SF_FTYPE_V8SF_V4SF_INT:
9416 case V4SI_FTYPE_V4SI_V4SI_INT:
9417 case V4DF_FTYPE_V4DF_V4DF_INT:
9418 case V16SF_FTYPE_V16SF_V16SF_INT:
9419 case V16SF_FTYPE_V16SF_V4SF_INT:
9420 case V16SI_FTYPE_V16SI_V4SI_INT:
9421 case V4DF_FTYPE_V4DF_V2DF_INT:
9422 case V4SF_FTYPE_V4SF_V4SF_INT:
9423 case V2DI_FTYPE_V2DI_V2DI_INT:
9424 case V4DI_FTYPE_V4DI_V2DI_INT:
9425 case V2DF_FTYPE_V2DF_V2DF_INT:
9426 case UQI_FTYPE_V8DI_V8UDI_INT:
9427 case UQI_FTYPE_V8DF_V8DF_INT:
9428 case UQI_FTYPE_V2DF_V2DF_INT:
9429 case UQI_FTYPE_V4SF_V4SF_INT:
9430 case UHI_FTYPE_V16SI_V16SI_INT:
9431 case UHI_FTYPE_V16SF_V16SF_INT:
9432 case V64QI_FTYPE_V64QI_V64QI_INT:
9433 case V32HI_FTYPE_V32HI_V32HI_INT:
9434 case V16SI_FTYPE_V16SI_V16SI_INT:
9435 case V8DI_FTYPE_V8DI_V8DI_INT:
9436 nargs = 3;
9437 nargs_constant = 1;
9438 break;
9439 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
9440 nargs = 3;
9441 rmode = V4DImode;
9442 nargs_constant = 1;
9443 break;
9444 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
9445 nargs = 3;
9446 rmode = V2DImode;
9447 nargs_constant = 1;
9448 break;
9449 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
9450 nargs = 3;
9451 rmode = DImode;
9452 nargs_constant = 1;
9453 break;
9454 case V2DI_FTYPE_V2DI_UINT_UINT:
9455 nargs = 3;
9456 nargs_constant = 2;
9457 break;
9458 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
9459 nargs = 3;
9460 rmode = V8DImode;
9461 nargs_constant = 1;
9462 break;
9463 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
9464 nargs = 5;
9465 rmode = V8DImode;
9466 mask_pos = 2;
9467 nargs_constant = 1;
9468 break;
9469 case QI_FTYPE_V8DF_INT_UQI:
9470 case QI_FTYPE_V4DF_INT_UQI:
9471 case QI_FTYPE_V2DF_INT_UQI:
9472 case HI_FTYPE_V16SF_INT_UHI:
9473 case QI_FTYPE_V8SF_INT_UQI:
9474 case QI_FTYPE_V4SF_INT_UQI:
9475 case V4SI_FTYPE_V4SI_V4SI_UHI:
9476 case V8SI_FTYPE_V8SI_V8SI_UHI:
9477 nargs = 3;
9478 mask_pos = 1;
9479 nargs_constant = 1;
9480 break;
9481 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
9482 nargs = 5;
9483 rmode = V4DImode;
9484 mask_pos = 2;
9485 nargs_constant = 1;
9486 break;
9487 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
9488 nargs = 5;
9489 rmode = V2DImode;
9490 mask_pos = 2;
9491 nargs_constant = 1;
9492 break;
9493 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
9494 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
9495 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
9496 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
9497 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
9498 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
9499 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
9500 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
9501 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
9502 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
9503 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
9504 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
9505 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
9506 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
9507 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
9508 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
9509 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
9510 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
9511 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
9512 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
9513 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
9514 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
9515 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
9516 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
9517 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
9518 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
9519 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
9520 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
9521 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
9522 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
9523 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
9524 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
9525 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
9526 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
9527 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
9528 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
9529 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
9530 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
9531 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
9532 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
9533 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
9534 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
9535 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
9536 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
9537 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
9538 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
9539 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
9540 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
9541 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
9542 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
9543 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
9544 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
9545 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
9546 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
9547 nargs = 4;
9548 break;
9549 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
9550 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
9551 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
9552 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
9553 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
9554 nargs = 4;
9555 nargs_constant = 1;
9556 break;
9557 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
9558 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
9559 case QI_FTYPE_V4DF_V4DF_INT_UQI:
9560 case QI_FTYPE_V8SF_V8SF_INT_UQI:
9561 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
9562 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
9563 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
9564 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
9565 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
9566 case USI_FTYPE_V32QI_V32QI_INT_USI:
9567 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
9568 case USI_FTYPE_V32HI_V32HI_INT_USI:
9569 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
9570 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
9571 nargs = 4;
9572 mask_pos = 1;
9573 nargs_constant = 1;
9574 break;
9575 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
9576 nargs = 4;
9577 nargs_constant = 2;
9578 break;
9579 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
9580 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
9581 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
9582 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
9583 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
9584 nargs = 4;
9585 break;
9586 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
9587 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
9588 mask_pos = 1;
9589 nargs = 4;
9590 nargs_constant = 1;
9591 break;
9592 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
9593 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
9594 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
9595 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
9596 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
9597 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
9598 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
9599 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
9600 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
9601 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
9602 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
9603 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
9604 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
9605 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
9606 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
9607 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
9608 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
9609 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
9610 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
9611 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
9612 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
9613 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
9614 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
9615 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
9616 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
9617 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
9618 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
9619 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
9620 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
9621 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
9622 nargs = 4;
9623 mask_pos = 2;
9624 nargs_constant = 1;
9625 break;
9626 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
9627 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
9628 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
9629 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
9630 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
9631 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
9632 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
9633 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
9634 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
9635 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
9636 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
9637 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
9638 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
9639 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
9640 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
9641 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
9642 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
9643 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
9644 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
9645 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
9646 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
9647 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
9648 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
9649 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
9650 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
9651 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
9652 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
9653 nargs = 5;
9654 mask_pos = 2;
9655 nargs_constant = 1;
9656 break;
9657 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
9658 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
9659 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
9660 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
9661 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
9662 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
9663 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
9664 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
9665 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
9666 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
9667 nargs = 5;
9668 mask_pos = 1;
9669 nargs_constant = 1;
9670 break;
9671 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
9672 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
9673 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
9674 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
9675 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
9676 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
9677 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
9678 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
9679 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
9680 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
9681 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
9682 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
9683 nargs = 5;
9684 mask_pos = 1;
9685 nargs_constant = 2;
9686 break;
9687
9688 default:
9689 gcc_unreachable ();
9690 }
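/* NARGS is the number of call arguments to expand, NARGS_CONSTANT and
   MASK_POS together locate the operands that must be immediates, and
   RMODE, when not VOIDmode, is the mode in which the caller views the
   TMODE result of the insn. */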
9691
9692 gcc_assert (nargs <= ARRAY_SIZE (args));
9693
9694 if (comparison != UNKNOWN)
9695 {
9696 gcc_assert (nargs == 2);
9697 return ix86_expand_sse_compare (d, exp, target, swap);
9698 }
9699
9700 if (rmode == VOIDmode || rmode == tmode)
9701 {
9702 if (optimize
9703 || target == 0
9704 || GET_MODE (target) != tmode
9705 || !insn_p->operand[0].predicate (target, tmode))
9706 target = gen_reg_rtx (tmode);
9707 else if (memory_operand (target, tmode))
9708 num_memory++;
9709 real_target = target;
9710 }
9711 else
9712 {
9713 real_target = gen_reg_rtx (tmode);
9714 target = lowpart_subreg (rmode, real_target, tmode);
9715 }
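/* REAL_TARGET is what the insn pattern actually writes (in mode TMODE);
   when RMODE differs from TMODE, TARGET is the RMODE lowpart view of
   REAL_TARGET that is returned to the caller. */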
9716
9717 for (i = 0; i < nargs; i++)
9718 {
9719 tree arg = CALL_EXPR_ARG (exp, i);
9720 rtx op = expand_normal (arg);
9721 machine_mode mode = insn_p->operand[i + 1].mode;
9722 bool match = insn_p->operand[i + 1].predicate (op, mode);
9723
9724 if (second_arg_count && i == 1)
9725 {
9726 /* SIMD shift insns take either an 8-bit immediate or
9727 a register as the count, but the builtin functions take an int
9728 as the count. If the count doesn't match, put it in a register.
9729 The instructions use a 64-bit count; if op is only
9730 32-bit, zero-extend it, since negative shift counts
9731 are undefined behavior and zero-extension is more
9732 efficient. */
9733 if (!match)
9734 {
9735 if (SCALAR_INT_MODE_P (GET_MODE (op)))
9736 op = convert_modes (mode, GET_MODE (op), op, 1);
9737 else
9738 op = lowpart_subreg (mode, op, GET_MODE (op));
9739 if (!insn_p->operand[i + 1].predicate (op, mode))
9740 op = copy_to_reg (op);
9741 }
9742 }
9743 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9744 || (!mask_pos && (nargs - i) <= nargs_constant))
9745 {
9746 if (!match)
9747 switch (icode)
9748 {
9749 case CODE_FOR_avx_vinsertf128v4di:
9750 case CODE_FOR_avx_vextractf128v4di:
9751 error ("the last argument must be an 1-bit immediate");
9752 return const0_rtx;
9753
9754 case CODE_FOR_avx512f_cmpv8di3_mask:
9755 case CODE_FOR_avx512f_cmpv16si3_mask:
9756 case CODE_FOR_avx512f_ucmpv8di3_mask:
9757 case CODE_FOR_avx512f_ucmpv16si3_mask:
9758 case CODE_FOR_avx512vl_cmpv4di3_mask:
9759 case CODE_FOR_avx512vl_cmpv8si3_mask:
9760 case CODE_FOR_avx512vl_ucmpv4di3_mask:
9761 case CODE_FOR_avx512vl_ucmpv8si3_mask:
9762 case CODE_FOR_avx512vl_cmpv2di3_mask:
9763 case CODE_FOR_avx512vl_cmpv4si3_mask:
9764 case CODE_FOR_avx512vl_ucmpv2di3_mask:
9765 case CODE_FOR_avx512vl_ucmpv4si3_mask:
9766 error ("the last argument must be a 3-bit immediate");
9767 return const0_rtx;
9768
9769 case CODE_FOR_sse4_1_roundsd:
9770 case CODE_FOR_sse4_1_roundss:
9771
9772 case CODE_FOR_sse4_1_roundpd:
9773 case CODE_FOR_sse4_1_roundps:
9774 case CODE_FOR_avx_roundpd256:
9775 case CODE_FOR_avx_roundps256:
9776
9777 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
9778 case CODE_FOR_sse4_1_roundps_sfix:
9779 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
9780 case CODE_FOR_avx_roundps_sfix256:
9781
9782 case CODE_FOR_sse4_1_blendps:
9783 case CODE_FOR_avx_blendpd256:
9784 case CODE_FOR_avx_vpermilv4df:
9785 case CODE_FOR_avx_vpermilv4df_mask:
9786 case CODE_FOR_avx512f_getmantv8df_mask:
9787 case CODE_FOR_avx512f_getmantv16sf_mask:
9788 case CODE_FOR_avx512vl_getmantv8sf_mask:
9789 case CODE_FOR_avx512vl_getmantv4df_mask:
9790 case CODE_FOR_avx512vl_getmantv4sf_mask:
9791 case CODE_FOR_avx512vl_getmantv2df_mask:
9792 case CODE_FOR_avx512dq_rangepv8df_mask_round:
9793 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
9794 case CODE_FOR_avx512dq_rangepv4df_mask:
9795 case CODE_FOR_avx512dq_rangepv8sf_mask:
9796 case CODE_FOR_avx512dq_rangepv2df_mask:
9797 case CODE_FOR_avx512dq_rangepv4sf_mask:
9798 case CODE_FOR_avx_shufpd256_mask:
9799 error ("the last argument must be a 4-bit immediate");
9800 return const0_rtx;
9801
9802 case CODE_FOR_sha1rnds4:
9803 case CODE_FOR_sse4_1_blendpd:
9804 case CODE_FOR_avx_vpermilv2df:
9805 case CODE_FOR_avx_vpermilv2df_mask:
9806 case CODE_FOR_xop_vpermil2v2df3:
9807 case CODE_FOR_xop_vpermil2v4sf3:
9808 case CODE_FOR_xop_vpermil2v4df3:
9809 case CODE_FOR_xop_vpermil2v8sf3:
9810 case CODE_FOR_avx512f_vinsertf32x4_mask:
9811 case CODE_FOR_avx512f_vinserti32x4_mask:
9812 case CODE_FOR_avx512f_vextractf32x4_mask:
9813 case CODE_FOR_avx512f_vextracti32x4_mask:
9814 case CODE_FOR_sse2_shufpd:
9815 case CODE_FOR_sse2_shufpd_mask:
9816 case CODE_FOR_avx512dq_shuf_f64x2_mask:
9817 case CODE_FOR_avx512dq_shuf_i64x2_mask:
9818 case CODE_FOR_avx512vl_shuf_i32x4_mask:
9819 case CODE_FOR_avx512vl_shuf_f32x4_mask:
9820 error ("the last argument must be a 2-bit immediate");
9821 return const0_rtx;
9822
9823 case CODE_FOR_avx_vextractf128v4df:
9824 case CODE_FOR_avx_vextractf128v8sf:
9825 case CODE_FOR_avx_vextractf128v8si:
9826 case CODE_FOR_avx_vinsertf128v4df:
9827 case CODE_FOR_avx_vinsertf128v8sf:
9828 case CODE_FOR_avx_vinsertf128v8si:
9829 case CODE_FOR_avx512f_vinsertf64x4_mask:
9830 case CODE_FOR_avx512f_vinserti64x4_mask:
9831 case CODE_FOR_avx512f_vextractf64x4_mask:
9832 case CODE_FOR_avx512f_vextracti64x4_mask:
9833 case CODE_FOR_avx512dq_vinsertf32x8_mask:
9834 case CODE_FOR_avx512dq_vinserti32x8_mask:
9835 case CODE_FOR_avx512vl_vinsertv4df:
9836 case CODE_FOR_avx512vl_vinsertv4di:
9837 case CODE_FOR_avx512vl_vinsertv8sf:
9838 case CODE_FOR_avx512vl_vinsertv8si:
9839 error ("the last argument must be a 1-bit immediate");
9840 return const0_rtx;
9841
9842 case CODE_FOR_avx_vmcmpv2df3:
9843 case CODE_FOR_avx_vmcmpv4sf3:
9844 case CODE_FOR_avx_cmpv2df3:
9845 case CODE_FOR_avx_cmpv4sf3:
9846 case CODE_FOR_avx_cmpv4df3:
9847 case CODE_FOR_avx_cmpv8sf3:
9848 case CODE_FOR_avx512f_cmpv8df3_mask:
9849 case CODE_FOR_avx512f_cmpv16sf3_mask:
9850 case CODE_FOR_avx512f_vmcmpv2df3_mask:
9851 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
9852 error ("the last argument must be a 5-bit immediate");
9853 return const0_rtx;
9854
9855 default:
9856 switch (nargs_constant)
9857 {
9858 case 2:
9859 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
9860 || (!mask_pos && (nargs - i) == nargs_constant))
9861 {
9862 error ("the next to last argument must be an 8-bit immediate");
9863 break;
9864 }
9865 /* FALLTHRU */
9866 case 1:
9867 error ("the last argument must be an 8-bit immediate");
9868 break;
9869 default:
9870 gcc_unreachable ();
9871 }
9872 return const0_rtx;
9873 }
9874 }
9875 else
9876 {
9877 if (VECTOR_MODE_P (mode))
9878 op = safe_vector_operand (op, mode);
9879
9880 /* If we aren't optimizing, only allow one memory operand to
9881 be generated. */
9882 if (memory_operand (op, mode))
9883 num_memory++;
9884
9885 op = fixup_modeless_constant (op, mode);
9886
9887 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
9888 {
9889 if (optimize || !match || num_memory > 1)
9890 op = copy_to_mode_reg (mode, op);
9891 }
9892 else
9893 {
9894 op = copy_to_reg (op);
9895 op = lowpart_subreg (mode, op, GET_MODE (op));
9896 }
9897 }
9898
9899 args[i].op = op;
9900 args[i].mode = mode;
9901 }
9902
9903 switch (nargs)
9904 {
9905 case 1:
9906 pat = GEN_FCN (icode) (real_target, args[0].op);
9907 break;
9908 case 2:
9909 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
9910 break;
9911 case 3:
9912 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9913 args[2].op);
9914 break;
9915 case 4:
9916 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9917 args[2].op, args[3].op);
9918 break;
9919 case 5:
9920 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9921 args[2].op, args[3].op, args[4].op);
9922 break;
9923 case 6:
9924 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
9925 args[2].op, args[3].op, args[4].op,
9926 args[5].op);
9927 break;
9928 default:
9929 gcc_unreachable ();
9930 }
9931
9932 if (! pat)
9933 return 0;
9934
9935 emit_insn (pat);
9936 return target;
9937 }
9938
9939 /* Transform a pattern of the following layout:
9940 (set A
9941 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
9942 into:
9943 (set A B)
9944 i.e. strip the UNSPEC_EMBEDDED_ROUNDING wrapper and keep operand B. */
9945
9946 static rtx
9947 ix86_erase_embedded_rounding (rtx pat)
9948 {
9949 if (GET_CODE (pat) == INSN)
9950 pat = PATTERN (pat);
9951
9952 gcc_assert (GET_CODE (pat) == SET);
9953 rtx src = SET_SRC (pat);
9954 gcc_assert (XVECLEN (src, 0) == 2);
9955 rtx p0 = XVECEXP (src, 0, 0);
9956 gcc_assert (GET_CODE (src) == UNSPEC
9957 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
9958 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
9959 return res;
9960 }
9961
9962 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
9963 with rounding. */
9964 static rtx
9965 ix86_expand_sse_comi_round (const struct builtin_description *d,
9966 tree exp, rtx target)
9967 {
9968 rtx pat, set_dst;
9969 tree arg0 = CALL_EXPR_ARG (exp, 0);
9970 tree arg1 = CALL_EXPR_ARG (exp, 1);
9971 tree arg2 = CALL_EXPR_ARG (exp, 2);
9972 tree arg3 = CALL_EXPR_ARG (exp, 3);
9973 rtx op0 = expand_normal (arg0);
9974 rtx op1 = expand_normal (arg1);
9975 rtx op2 = expand_normal (arg2);
9976 rtx op3 = expand_normal (arg3);
9977 enum insn_code icode = d->icode;
9978 const struct insn_data_d *insn_p = &insn_data[icode];
9979 machine_mode mode0 = insn_p->operand[0].mode;
9980 machine_mode mode1 = insn_p->operand[1].mode;
9981
9982 /* See avxintrin.h for values. */
9983 static const enum rtx_code comparisons[32] =
9984 {
9985 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9986 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
9987 EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
9988 UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
9989 };
9990 static const bool ordereds[32] =
9991 {
9992 true, true, true, false, false, false, false, true,
9993 false, false, false, true, true, true, true, false,
9994 true, true, true, false, false, false, false, true,
9995 false, false, false, true, true, true, true, false
9996 };
9997 static const bool non_signalings[32] =
9998 {
9999 true, false, false, true, true, false, false, true,
10000 true, false, false, true, true, false, false, true,
10001 false, true, true, false, false, true, true, false,
10002 false, true, true, false, false, true, true, false
10003 };
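/* The three tables above are indexed by the 5-bit comparison predicate
   passed in OP2 (0..31); for each predicate they give the rtx comparison
   code, whether the predicate is an ordered comparison, and whether it
   is quiet (non-signaling). */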
10004
10005 if (!CONST_INT_P (op2))
10006 {
10007 error ("the third argument must be comparison constant");
10008 return const0_rtx;
10009 }
10010 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
10011 {
10012 error ("incorrect comparison mode");
10013 return const0_rtx;
10014 }
10015
10016 if (!insn_p->operand[2].predicate (op3, SImode))
10017 {
10018 error ("incorrect rounding operand");
10019 return const0_rtx;
10020 }
10021
10022 if (VECTOR_MODE_P (mode0))
10023 op0 = safe_vector_operand (op0, mode0);
10024 if (VECTOR_MODE_P (mode1))
10025 op1 = safe_vector_operand (op1, mode1);
10026
10027 enum rtx_code comparison = comparisons[INTVAL (op2)];
10028 bool ordered = ordereds[INTVAL (op2)];
10029 bool non_signaling = non_signalings[INTVAL (op2)];
10030 rtx const_val = const0_rtx;
10031
10032 bool check_unordered = false;
10033 machine_mode mode = CCFPmode;
10034 switch (comparison)
10035 {
10036 case ORDERED:
10037 if (!ordered)
10038 {
10039 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10040 if (!non_signaling)
10041 ordered = true;
10042 mode = CCSmode;
10043 }
10044 else
10045 {
10046 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10047 if (non_signaling)
10048 ordered = false;
10049 mode = CCPmode;
10050 }
10051 comparison = NE;
10052 break;
10053 case UNORDERED:
10054 if (ordered)
10055 {
10056 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10057 if (non_signaling)
10058 ordered = false;
10059 mode = CCSmode;
10060 }
10061 else
10062 {
10063 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10064 if (!non_signaling)
10065 ordered = true;
10066 mode = CCPmode;
10067 }
10068 comparison = EQ;
10069 break;
10070
10071 case LE: /* -> GE */
10072 case LT: /* -> GT */
10073 case UNGE: /* -> UNLE */
10074 case UNGT: /* -> UNLT */
10075 std::swap (op0, op1);
10076 comparison = swap_condition (comparison);
10077 /* FALLTHRU */
10078 case GT:
10079 case GE:
10080 case UNEQ:
10081 case UNLT:
10082 case UNLE:
10083 case LTGT:
10084 /* These are supported by CCFPmode. NB: Use ordered/signaling
10085 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10086 with NAN operands. */
10087 if (ordered == non_signaling)
10088 ordered = !ordered;
10089 break;
10090 case EQ:
10091 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10092 _CMP_EQ_OQ/_CMP_EQ_OS. */
10093 check_unordered = true;
10094 mode = CCZmode;
10095 break;
10096 case NE:
10097 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10098 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10099 gcc_assert (!ordered);
10100 check_unordered = true;
10101 mode = CCZmode;
10102 const_val = const1_rtx;
10103 break;
10104 default:
10105 gcc_unreachable ();
10106 }
10107
10108 target = gen_reg_rtx (SImode);
10109 emit_move_insn (target, const_val);
10110 target = gen_rtx_SUBREG (QImode, target, 0);
10111
10112 if ((optimize && !register_operand (op0, mode0))
10113 || !insn_p->operand[0].predicate (op0, mode0))
10114 op0 = copy_to_mode_reg (mode0, op0);
10115 if ((optimize && !register_operand (op1, mode1))
10116 || !insn_p->operand[1].predicate (op1, mode1))
10117 op1 = copy_to_mode_reg (mode1, op1);
10118
10119 /*
10120 1. COMI: ordered and signaling.
10121 2. UCOMI: unordered and non-signaling.
10122 */
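/* The incoming icode is the signaling COMI form of the pattern; for
   quiet (non-signaling) predicates it is replaced by the corresponding
   UCOMI form below. */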
10123 if (non_signaling)
10124 icode = (icode == CODE_FOR_sse_comi_round
10125 ? CODE_FOR_sse_ucomi_round
10126 : CODE_FOR_sse2_ucomi_round);
10127
10128 pat = GEN_FCN (icode) (op0, op1, op3);
10129 if (! pat)
10130 return 0;
10131
10132 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10133 if (INTVAL (op3) == NO_ROUND)
10134 {
10135 pat = ix86_erase_embedded_rounding (pat);
10136 if (! pat)
10137 return 0;
10138
10139 set_dst = SET_DEST (pat);
10140 }
10141 else
10142 {
10143 gcc_assert (GET_CODE (pat) == SET);
10144 set_dst = SET_DEST (pat);
10145 }
10146
10147 emit_insn (pat);
10148
10149 rtx_code_label *label = NULL;
10150
10151 /* NB: For ordered EQ or unordered NE, checking ZF alone isn't
10152 sufficient with NAN operands. */
10153 if (check_unordered)
10154 {
10155 gcc_assert (comparison == EQ || comparison == NE);
10156
10157 rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
10158 label = gen_label_rtx ();
10159 rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
10160 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10161 gen_rtx_LABEL_REF (VOIDmode, label),
10162 pc_rtx);
10163 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
10164 }
10165
10166 /* NB: The comparison sets the flags in CCFPmode; check them in a
10167 different CCmode which is a subset of CCFPmode. */
10168 if (GET_MODE (set_dst) != mode)
10169 {
10170 gcc_assert (mode == CCAmode || mode == CCCmode
10171 || mode == CCOmode || mode == CCPmode
10172 || mode == CCSmode || mode == CCZmode);
10173 set_dst = gen_rtx_REG (mode, FLAGS_REG);
10174 }
10175
10176 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
10177 gen_rtx_fmt_ee (comparison, QImode,
10178 set_dst,
10179 const0_rtx)));
10180
10181 if (label)
10182 emit_label (label);
10183
10184 return SUBREG_REG (target);
10185 }
10186
10187 static rtx
10188 ix86_expand_round_builtin (const struct builtin_description *d,
10189 tree exp, rtx target)
10190 {
10191 rtx pat;
10192 unsigned int i, nargs;
10193 struct
10194 {
10195 rtx op;
10196 machine_mode mode;
10197 } args[6];
10198 enum insn_code icode = d->icode;
10199 const struct insn_data_d *insn_p = &insn_data[icode];
10200 machine_mode tmode = insn_p->operand[0].mode;
10201 unsigned int nargs_constant = 0;
10202 unsigned int redundant_embed_rnd = 0;
10203
10204 switch ((enum ix86_builtin_func_type) d->flag)
10205 {
10206 case UINT64_FTYPE_V2DF_INT:
10207 case UINT64_FTYPE_V4SF_INT:
10208 case UINT_FTYPE_V2DF_INT:
10209 case UINT_FTYPE_V4SF_INT:
10210 case INT64_FTYPE_V2DF_INT:
10211 case INT64_FTYPE_V4SF_INT:
10212 case INT_FTYPE_V2DF_INT:
10213 case INT_FTYPE_V4SF_INT:
10214 nargs = 2;
10215 break;
10216 case V4SF_FTYPE_V4SF_UINT_INT:
10217 case V4SF_FTYPE_V4SF_UINT64_INT:
10218 case V2DF_FTYPE_V2DF_UINT64_INT:
10219 case V4SF_FTYPE_V4SF_INT_INT:
10220 case V4SF_FTYPE_V4SF_INT64_INT:
10221 case V2DF_FTYPE_V2DF_INT64_INT:
10222 case V4SF_FTYPE_V4SF_V4SF_INT:
10223 case V2DF_FTYPE_V2DF_V2DF_INT:
10224 case V4SF_FTYPE_V4SF_V2DF_INT:
10225 case V2DF_FTYPE_V2DF_V4SF_INT:
10226 nargs = 3;
10227 break;
10228 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
10229 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
10230 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
10231 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
10232 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
10233 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
10234 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
10235 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
10236 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
10237 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
10238 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
10239 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
10240 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
10241 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
10242 nargs = 4;
10243 break;
10244 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
10245 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
10246 nargs_constant = 2;
10247 nargs = 4;
10248 break;
10249 case INT_FTYPE_V4SF_V4SF_INT_INT:
10250 case INT_FTYPE_V2DF_V2DF_INT_INT:
10251 return ix86_expand_sse_comi_round (d, exp, target);
10252 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
10253 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
10254 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
10255 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
10256 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
10257 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
10258 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
10259 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
10260 nargs = 5;
10261 break;
10262 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
10263 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
10264 nargs_constant = 4;
10265 nargs = 5;
10266 break;
10267 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
10268 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
10269 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
10270 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
10271 nargs_constant = 3;
10272 nargs = 5;
10273 break;
10274 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
10275 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
10276 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
10277 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
10278 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
10279 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
10280 nargs = 6;
10281 nargs_constant = 4;
10282 break;
10283 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
10284 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
10285 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
10286 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
10287 nargs = 6;
10288 nargs_constant = 3;
10289 break;
10290 default:
10291 gcc_unreachable ();
10292 }
10293 gcc_assert (nargs <= ARRAY_SIZE (args));
10294
10295 if (optimize
10296 || target == 0
10297 || GET_MODE (target) != tmode
10298 || !insn_p->operand[0].predicate (target, tmode))
10299 target = gen_reg_rtx (tmode);
10300
10301 for (i = 0; i < nargs; i++)
10302 {
10303 tree arg = CALL_EXPR_ARG (exp, i);
10304 rtx op = expand_normal (arg);
10305 machine_mode mode = insn_p->operand[i + 1].mode;
10306 bool match = insn_p->operand[i + 1].predicate (op, mode);
10307
10308 if (i == nargs - nargs_constant)
10309 {
10310 if (!match)
10311 {
10312 switch (icode)
10313 {
10314 case CODE_FOR_avx512f_getmantv8df_mask_round:
10315 case CODE_FOR_avx512f_getmantv16sf_mask_round:
10316 case CODE_FOR_avx512f_vgetmantv2df_round:
10317 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
10318 case CODE_FOR_avx512f_vgetmantv4sf_round:
10319 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
10320 error ("the immediate argument must be a 4-bit immediate");
10321 return const0_rtx;
10322 case CODE_FOR_avx512f_cmpv8df3_mask_round:
10323 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
10324 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
10325 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
10326 error ("the immediate argument must be a 5-bit immediate");
10327 return const0_rtx;
10328 default:
10329 error ("the immediate argument must be an 8-bit immediate");
10330 return const0_rtx;
10331 }
10332 }
10333 }
10334 else if (i == nargs - 1)
10335 {
10336 if (!insn_p->operand[nargs].predicate (op, SImode))
10337 {
10338 error ("incorrect rounding operand");
10339 return const0_rtx;
10340 }
10341
10342 /* If there is no rounding, use the normal version of the pattern. */
10343 if (INTVAL (op) == NO_ROUND)
10344 redundant_embed_rnd = 1;
10345 }
10346 else
10347 {
10348 if (VECTOR_MODE_P (mode))
10349 op = safe_vector_operand (op, mode);
10350
10351 op = fixup_modeless_constant (op, mode);
10352
10353 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10354 {
10355 if (optimize || !match)
10356 op = copy_to_mode_reg (mode, op);
10357 }
10358 else
10359 {
10360 op = copy_to_reg (op);
10361 op = lowpart_subreg (mode, op, GET_MODE (op));
10362 }
10363 }
10364
10365 args[i].op = op;
10366 args[i].mode = mode;
10367 }
10368
10369 switch (nargs)
10370 {
10371 case 1:
10372 pat = GEN_FCN (icode) (target, args[0].op);
10373 break;
10374 case 2:
10375 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10376 break;
10377 case 3:
10378 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10379 args[2].op);
10380 break;
10381 case 4:
10382 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10383 args[2].op, args[3].op);
10384 break;
10385 case 5:
10386 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10387 args[2].op, args[3].op, args[4].op);
10388 break;
10389 case 6:
10390 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
10391 args[2].op, args[3].op, args[4].op,
10392 args[5].op);
10393 break;
10394 default:
10395 gcc_unreachable ();
10396 }
10397
10398 if (!pat)
10399 return 0;
10400
10401 if (redundant_embed_rnd)
10402 pat = ix86_erase_embedded_rounding (pat);
10403
10404 emit_insn (pat);
10405 return target;
10406 }
10407
10408 /* Subroutine of ix86_expand_builtin to take care of special insns
10409 with variable number of operands. */
10410
10411 static rtx
10412 ix86_expand_special_args_builtin (const struct builtin_description *d,
10413 tree exp, rtx target)
10414 {
10415 tree arg;
10416 rtx pat, op;
10417 unsigned int i, nargs, arg_adjust, memory;
10418 bool aligned_mem = false;
10419 struct
10420 {
10421 rtx op;
10422 machine_mode mode;
10423 } args[3];
10424 enum insn_code icode = d->icode;
10425 bool last_arg_constant = false;
10426 const struct insn_data_d *insn_p = &insn_data[icode];
10427 machine_mode tmode = insn_p->operand[0].mode;
10428 enum { load, store } klass;
10429
10430 switch ((enum ix86_builtin_func_type) d->flag)
10431 {
10432 case VOID_FTYPE_VOID:
10433 emit_insn (GEN_FCN (icode) (target));
10434 return 0;
10435 case VOID_FTYPE_UINT64:
10436 case VOID_FTYPE_UNSIGNED:
10437 nargs = 0;
10438 klass = store;
10439 memory = 0;
10440 break;
10441
10442 case INT_FTYPE_VOID:
10443 case USHORT_FTYPE_VOID:
10444 case UINT64_FTYPE_VOID:
10445 case UINT_FTYPE_VOID:
10446 case UNSIGNED_FTYPE_VOID:
10447 nargs = 0;
10448 klass = load;
10449 memory = 0;
10450 break;
10451 case UINT64_FTYPE_PUNSIGNED:
10452 case V2DI_FTYPE_PV2DI:
10453 case V4DI_FTYPE_PV4DI:
10454 case V32QI_FTYPE_PCCHAR:
10455 case V16QI_FTYPE_PCCHAR:
10456 case V8SF_FTYPE_PCV4SF:
10457 case V8SF_FTYPE_PCFLOAT:
10458 case V4SF_FTYPE_PCFLOAT:
10459 case V4DF_FTYPE_PCV2DF:
10460 case V4DF_FTYPE_PCDOUBLE:
10461 case V2DF_FTYPE_PCDOUBLE:
10462 case VOID_FTYPE_PVOID:
10463 case V8DI_FTYPE_PV8DI:
10464 nargs = 1;
10465 klass = load;
10466 memory = 0;
10467 switch (icode)
10468 {
10469 case CODE_FOR_sse4_1_movntdqa:
10470 case CODE_FOR_avx2_movntdqa:
10471 case CODE_FOR_avx512f_movntdqa:
10472 aligned_mem = true;
10473 break;
10474 default:
10475 break;
10476 }
10477 break;
10478 case VOID_FTYPE_PV2SF_V4SF:
10479 case VOID_FTYPE_PV8DI_V8DI:
10480 case VOID_FTYPE_PV4DI_V4DI:
10481 case VOID_FTYPE_PV2DI_V2DI:
10482 case VOID_FTYPE_PCHAR_V32QI:
10483 case VOID_FTYPE_PCHAR_V16QI:
10484 case VOID_FTYPE_PFLOAT_V16SF:
10485 case VOID_FTYPE_PFLOAT_V8SF:
10486 case VOID_FTYPE_PFLOAT_V4SF:
10487 case VOID_FTYPE_PDOUBLE_V8DF:
10488 case VOID_FTYPE_PDOUBLE_V4DF:
10489 case VOID_FTYPE_PDOUBLE_V2DF:
10490 case VOID_FTYPE_PLONGLONG_LONGLONG:
10491 case VOID_FTYPE_PULONGLONG_ULONGLONG:
10492 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
10493 case VOID_FTYPE_PINT_INT:
10494 nargs = 1;
10495 klass = store;
10496 /* Reserve memory operand for target. */
10497 memory = ARRAY_SIZE (args);
10498 switch (icode)
10499 {
10500 /* These builtins and instructions require the memory
10501 to be properly aligned. */
10502 case CODE_FOR_avx_movntv4di:
10503 case CODE_FOR_sse2_movntv2di:
10504 case CODE_FOR_avx_movntv8sf:
10505 case CODE_FOR_sse_movntv4sf:
10506 case CODE_FOR_sse4a_vmmovntv4sf:
10507 case CODE_FOR_avx_movntv4df:
10508 case CODE_FOR_sse2_movntv2df:
10509 case CODE_FOR_sse4a_vmmovntv2df:
10510 case CODE_FOR_sse2_movntidi:
10511 case CODE_FOR_sse_movntq:
10512 case CODE_FOR_sse2_movntisi:
10513 case CODE_FOR_avx512f_movntv16sf:
10514 case CODE_FOR_avx512f_movntv8df:
10515 case CODE_FOR_avx512f_movntv8di:
10516 aligned_mem = true;
10517 break;
10518 default:
10519 break;
10520 }
10521 break;
10522 case VOID_FTYPE_PVOID_PCVOID:
10523 nargs = 1;
10524 klass = store;
10525 memory = 0;
10526
10527 break;
10528 case V4SF_FTYPE_V4SF_PCV2SF:
10529 case V2DF_FTYPE_V2DF_PCDOUBLE:
10530 nargs = 2;
10531 klass = load;
10532 memory = 1;
10533 break;
10534 case V8SF_FTYPE_PCV8SF_V8SI:
10535 case V4DF_FTYPE_PCV4DF_V4DI:
10536 case V4SF_FTYPE_PCV4SF_V4SI:
10537 case V2DF_FTYPE_PCV2DF_V2DI:
10538 case V8SI_FTYPE_PCV8SI_V8SI:
10539 case V4DI_FTYPE_PCV4DI_V4DI:
10540 case V4SI_FTYPE_PCV4SI_V4SI:
10541 case V2DI_FTYPE_PCV2DI_V2DI:
10542 case VOID_FTYPE_INT_INT64:
10543 nargs = 2;
10544 klass = load;
10545 memory = 0;
10546 break;
10547 case VOID_FTYPE_PV8DF_V8DF_UQI:
10548 case VOID_FTYPE_PV4DF_V4DF_UQI:
10549 case VOID_FTYPE_PV2DF_V2DF_UQI:
10550 case VOID_FTYPE_PV16SF_V16SF_UHI:
10551 case VOID_FTYPE_PV8SF_V8SF_UQI:
10552 case VOID_FTYPE_PV4SF_V4SF_UQI:
10553 case VOID_FTYPE_PV8DI_V8DI_UQI:
10554 case VOID_FTYPE_PV4DI_V4DI_UQI:
10555 case VOID_FTYPE_PV2DI_V2DI_UQI:
10556 case VOID_FTYPE_PV16SI_V16SI_UHI:
10557 case VOID_FTYPE_PV8SI_V8SI_UQI:
10558 case VOID_FTYPE_PV4SI_V4SI_UQI:
10559 case VOID_FTYPE_PV64QI_V64QI_UDI:
10560 case VOID_FTYPE_PV32HI_V32HI_USI:
10561 case VOID_FTYPE_PV32QI_V32QI_USI:
10562 case VOID_FTYPE_PV16QI_V16QI_UHI:
10563 case VOID_FTYPE_PV16HI_V16HI_UHI:
10564 case VOID_FTYPE_PV8HI_V8HI_UQI:
10565 switch (icode)
10566 {
10567 /* These builtins and instructions require the memory
10568 to be properly aligned. */
10569 case CODE_FOR_avx512f_storev16sf_mask:
10570 case CODE_FOR_avx512f_storev16si_mask:
10571 case CODE_FOR_avx512f_storev8df_mask:
10572 case CODE_FOR_avx512f_storev8di_mask:
10573 case CODE_FOR_avx512vl_storev8sf_mask:
10574 case CODE_FOR_avx512vl_storev8si_mask:
10575 case CODE_FOR_avx512vl_storev4df_mask:
10576 case CODE_FOR_avx512vl_storev4di_mask:
10577 case CODE_FOR_avx512vl_storev4sf_mask:
10578 case CODE_FOR_avx512vl_storev4si_mask:
10579 case CODE_FOR_avx512vl_storev2df_mask:
10580 case CODE_FOR_avx512vl_storev2di_mask:
10581 aligned_mem = true;
10582 break;
10583 default:
10584 break;
10585 }
10586 /* FALLTHRU */
10587 case VOID_FTYPE_PV8SF_V8SI_V8SF:
10588 case VOID_FTYPE_PV4DF_V4DI_V4DF:
10589 case VOID_FTYPE_PV4SF_V4SI_V4SF:
10590 case VOID_FTYPE_PV2DF_V2DI_V2DF:
10591 case VOID_FTYPE_PV8SI_V8SI_V8SI:
10592 case VOID_FTYPE_PV4DI_V4DI_V4DI:
10593 case VOID_FTYPE_PV4SI_V4SI_V4SI:
10594 case VOID_FTYPE_PV2DI_V2DI_V2DI:
10595 case VOID_FTYPE_PV8SI_V8DI_UQI:
10596 case VOID_FTYPE_PV8HI_V8DI_UQI:
10597 case VOID_FTYPE_PV16HI_V16SI_UHI:
10598 case VOID_FTYPE_PV16QI_V8DI_UQI:
10599 case VOID_FTYPE_PV16QI_V16SI_UHI:
10600 case VOID_FTYPE_PV4SI_V4DI_UQI:
10601 case VOID_FTYPE_PV4SI_V2DI_UQI:
10602 case VOID_FTYPE_PV8HI_V4DI_UQI:
10603 case VOID_FTYPE_PV8HI_V2DI_UQI:
10604 case VOID_FTYPE_PV8HI_V8SI_UQI:
10605 case VOID_FTYPE_PV8HI_V4SI_UQI:
10606 case VOID_FTYPE_PV16QI_V4DI_UQI:
10607 case VOID_FTYPE_PV16QI_V2DI_UQI:
10608 case VOID_FTYPE_PV16QI_V8SI_UQI:
10609 case VOID_FTYPE_PV16QI_V4SI_UQI:
10610 case VOID_FTYPE_PCHAR_V64QI_UDI:
10611 case VOID_FTYPE_PCHAR_V32QI_USI:
10612 case VOID_FTYPE_PCHAR_V16QI_UHI:
10613 case VOID_FTYPE_PSHORT_V32HI_USI:
10614 case VOID_FTYPE_PSHORT_V16HI_UHI:
10615 case VOID_FTYPE_PSHORT_V8HI_UQI:
10616 case VOID_FTYPE_PINT_V16SI_UHI:
10617 case VOID_FTYPE_PINT_V8SI_UQI:
10618 case VOID_FTYPE_PINT_V4SI_UQI:
10619 case VOID_FTYPE_PINT64_V8DI_UQI:
10620 case VOID_FTYPE_PINT64_V4DI_UQI:
10621 case VOID_FTYPE_PINT64_V2DI_UQI:
10622 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
10623 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
10624 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
10625 case VOID_FTYPE_PFLOAT_V16SF_UHI:
10626 case VOID_FTYPE_PFLOAT_V8SF_UQI:
10627 case VOID_FTYPE_PFLOAT_V4SF_UQI:
10628 case VOID_FTYPE_PV32QI_V32HI_USI:
10629 case VOID_FTYPE_PV16QI_V16HI_UHI:
10630 case VOID_FTYPE_PV8QI_V8HI_UQI:
10631 nargs = 2;
10632 klass = store;
10633 /* Reserve memory operand for target. */
10634 memory = ARRAY_SIZE (args);
10635 break;
10636 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
10637 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
10638 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
10639 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
10640 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
10641 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
10642 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
10643 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
10644 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
10645 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
10646 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
10647 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
10648 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
10649 case V32HI_FTYPE_PCV32HI_V32HI_USI:
10650 case V32QI_FTYPE_PCV32QI_V32QI_USI:
10651 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
10652 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
10653 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
10654 switch (icode)
10655 {
10656 /* These builtins and instructions require the memory
10657 to be properly aligned. */
10658 case CODE_FOR_avx512f_loadv16sf_mask:
10659 case CODE_FOR_avx512f_loadv16si_mask:
10660 case CODE_FOR_avx512f_loadv8df_mask:
10661 case CODE_FOR_avx512f_loadv8di_mask:
10662 case CODE_FOR_avx512vl_loadv8sf_mask:
10663 case CODE_FOR_avx512vl_loadv8si_mask:
10664 case CODE_FOR_avx512vl_loadv4df_mask:
10665 case CODE_FOR_avx512vl_loadv4di_mask:
10666 case CODE_FOR_avx512vl_loadv4sf_mask:
10667 case CODE_FOR_avx512vl_loadv4si_mask:
10668 case CODE_FOR_avx512vl_loadv2df_mask:
10669 case CODE_FOR_avx512vl_loadv2di_mask:
10670 case CODE_FOR_avx512bw_loadv64qi_mask:
10671 case CODE_FOR_avx512vl_loadv32qi_mask:
10672 case CODE_FOR_avx512vl_loadv16qi_mask:
10673 case CODE_FOR_avx512bw_loadv32hi_mask:
10674 case CODE_FOR_avx512vl_loadv16hi_mask:
10675 case CODE_FOR_avx512vl_loadv8hi_mask:
10676 aligned_mem = true;
10677 break;
10678 default:
10679 break;
10680 }
10681 /* FALLTHRU */
10682 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
10683 case V32QI_FTYPE_PCCHAR_V32QI_USI:
10684 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
10685 case V32HI_FTYPE_PCSHORT_V32HI_USI:
10686 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
10687 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
10688 case V16SI_FTYPE_PCINT_V16SI_UHI:
10689 case V8SI_FTYPE_PCINT_V8SI_UQI:
10690 case V4SI_FTYPE_PCINT_V4SI_UQI:
10691 case V8DI_FTYPE_PCINT64_V8DI_UQI:
10692 case V4DI_FTYPE_PCINT64_V4DI_UQI:
10693 case V2DI_FTYPE_PCINT64_V2DI_UQI:
10694 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
10695 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
10696 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
10697 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
10698 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
10699 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
10700 nargs = 3;
10701 klass = load;
10702 memory = 0;
10703 break;
10704 case VOID_FTYPE_UINT_UINT_UINT:
10705 case VOID_FTYPE_UINT64_UINT_UINT:
10706 case UCHAR_FTYPE_UINT_UINT_UINT:
10707 case UCHAR_FTYPE_UINT64_UINT_UINT:
10708 nargs = 3;
10709 klass = load;
10710 memory = ARRAY_SIZE (args);
10711 last_arg_constant = true;
10712 break;
10713 default:
10714 gcc_unreachable ();
10715 }
10716
10717 gcc_assert (nargs <= ARRAY_SIZE (args));
10718
10719 if (klass == store)
10720 {
10721 arg = CALL_EXPR_ARG (exp, 0);
10722 op = expand_normal (arg);
10723 gcc_assert (target == 0);
10724 if (memory)
10725 {
10726 op = ix86_zero_extend_to_Pmode (op);
10727 target = gen_rtx_MEM (tmode, op);
10728 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10729 on it. Try to improve it using get_pointer_alignment,
10730 and if the special builtin is one that requires strict
10731 mode alignment, also from its GET_MODE_ALIGNMENT.
10732 Failure to do so could lead to ix86_legitimate_combined_insn
10733 rejecting all changes to such insns. */
10734 unsigned int align = get_pointer_alignment (arg);
10735 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
10736 align = GET_MODE_ALIGNMENT (tmode);
10737 if (MEM_ALIGN (target) < align)
10738 set_mem_align (target, align);
10739 }
10740 else
10741 target = force_reg (tmode, op);
10742 arg_adjust = 1;
10743 }
10744 else
10745 {
10746 arg_adjust = 0;
10747 if (optimize
10748 || target == 0
10749 || !register_operand (target, tmode)
10750 || GET_MODE (target) != tmode)
10751 target = gen_reg_rtx (tmode);
10752 }
10753
10754 for (i = 0; i < nargs; i++)
10755 {
10756 machine_mode mode = insn_p->operand[i + 1].mode;
10757 bool match;
10758
10759 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
10760 op = expand_normal (arg);
10761 match = insn_p->operand[i + 1].predicate (op, mode);
10762
10763 if (last_arg_constant && (i + 1) == nargs)
10764 {
10765 if (!match)
10766 {
10767 if (icode == CODE_FOR_lwp_lwpvalsi3
10768 || icode == CODE_FOR_lwp_lwpinssi3
10769 || icode == CODE_FOR_lwp_lwpvaldi3
10770 || icode == CODE_FOR_lwp_lwpinsdi3)
10771 error ("the last argument must be a 32-bit immediate");
10772 else
10773 error ("the last argument must be an 8-bit immediate");
10774 return const0_rtx;
10775 }
10776 }
10777 else
10778 {
10779 if (i == memory)
10780 {
10781 /* This must be the memory operand. */
10782 op = ix86_zero_extend_to_Pmode (op);
10783 op = gen_rtx_MEM (mode, op);
10784 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10785 on it. Try to improve it using get_pointer_alignment,
10786 and if the special builtin is one that requires strict
10787 mode alignment, also from its GET_MODE_ALIGNMENT.
10788 Failure to do so could lead to ix86_legitimate_combined_insn
10789 rejecting all changes to such insns. */
10790 unsigned int align = get_pointer_alignment (arg);
10791 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
10792 align = GET_MODE_ALIGNMENT (mode);
10793 if (MEM_ALIGN (op) < align)
10794 set_mem_align (op, align);
10795 }
10796 else
10797 {
10798 /* This must be a register. */
10799 if (VECTOR_MODE_P (mode))
10800 op = safe_vector_operand (op, mode);
10801
10802 op = fixup_modeless_constant (op, mode);
10803
10804 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
10805 op = copy_to_mode_reg (mode, op);
10806 else
10807 {
10808 op = copy_to_reg (op);
10809 op = lowpart_subreg (mode, op, GET_MODE (op));
10810 }
10811 }
10812 }
10813
10814 args[i].op = op;
10815 args[i].mode = mode;
10816 }
10817
10818 switch (nargs)
10819 {
10820 case 0:
10821 pat = GEN_FCN (icode) (target);
10822 break;
10823 case 1:
10824 pat = GEN_FCN (icode) (target, args[0].op);
10825 break;
10826 case 2:
10827 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
10828 break;
10829 case 3:
10830 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
10831 break;
10832 default:
10833 gcc_unreachable ();
10834 }
10835
10836 if (! pat)
10837 return 0;
10838 emit_insn (pat);
10839 return klass == store ? 0 : target;
10840 }
10841
10842 /* Return the integer constant in ARG. Constrain it to be in the range
10843 of the subparts of VEC_TYPE; issue an error if not. */
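/* For a V4SF vector type, for example, the valid selector range is [0, 3]. */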
10844
10845 static int
10846 get_element_number (tree vec_type, tree arg)
10847 {
10848 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
10849
10850 if (!tree_fits_uhwi_p (arg)
10851 || (elt = tree_to_uhwi (arg), elt > max))
10852 {
10853 error ("selector must be an integer constant in the range "
10854 "[0, %wi]", max);
10855 return 0;
10856 }
10857
10858 return elt;
10859 }
10860
10861 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10862 ix86_expand_vector_init. We DO have language-level syntax for this, in
10863 the form of (type){ init-list }. Except that since we can't place emms
10864 instructions from inside the compiler, we can't allow the use of MMX
10865 registers unless the user explicitly asks for it. So we do *not* define
10866 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10867 we have builtins invoked by mmintrin.h that gives us license to emit
10868 these sorts of instructions. */
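/* For example, IX86_BUILTIN_VEC_INIT_V2SI takes two integer arguments
   and builds a V2SImode vector from them; the number of call arguments
   must match the number of vector elements. */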
10869
10870 static rtx
10871 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
10872 {
10873 machine_mode tmode = TYPE_MODE (type);
10874 machine_mode inner_mode = GET_MODE_INNER (tmode);
10875 int i, n_elt = GET_MODE_NUNITS (tmode);
10876 rtvec v = rtvec_alloc (n_elt);
10877
10878 gcc_assert (VECTOR_MODE_P (tmode));
10879 gcc_assert (call_expr_nargs (exp) == n_elt);
10880
10881 for (i = 0; i < n_elt; ++i)
10882 {
10883 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
10884 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
10885 }
10886
10887 if (!target || !register_operand (target, tmode))
10888 target = gen_reg_rtx (tmode);
10889
10890 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
10891 return target;
10892 }
10893
10894 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10895 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10896 had a language-level syntax for referencing vector elements. */
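/* For example, IX86_BUILTIN_VEC_EXT_V4SI takes a V4SImode vector and a
   constant selector in [0, 3] and returns the selected element. */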
10897
10898 static rtx
10899 ix86_expand_vec_ext_builtin (tree exp, rtx target)
10900 {
10901 machine_mode tmode, mode0;
10902 tree arg0, arg1;
10903 int elt;
10904 rtx op0;
10905
10906 arg0 = CALL_EXPR_ARG (exp, 0);
10907 arg1 = CALL_EXPR_ARG (exp, 1);
10908
10909 op0 = expand_normal (arg0);
10910 elt = get_element_number (TREE_TYPE (arg0), arg1);
10911
10912 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10913 mode0 = TYPE_MODE (TREE_TYPE (arg0));
10914 gcc_assert (VECTOR_MODE_P (mode0));
10915
10916 op0 = force_reg (mode0, op0);
10917
10918 if (optimize || !target || !register_operand (target, tmode))
10919 target = gen_reg_rtx (tmode);
10920
10921 ix86_expand_vector_extract (true, target, op0, elt);
10922
10923 return target;
10924 }
10925
10926 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10927 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10928 a language-level syntax for referencing vector elements. */
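/* For example, IX86_BUILTIN_VEC_SET_V8HI takes a V8HImode vector, a
   scalar value and a constant selector, and returns a copy of the
   vector with the selected element replaced; the input vector itself
   is not modified. */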
10929
10930 static rtx
10931 ix86_expand_vec_set_builtin (tree exp)
10932 {
10933 machine_mode tmode, mode1;
10934 tree arg0, arg1, arg2;
10935 int elt;
10936 rtx op0, op1, target;
10937
10938 arg0 = CALL_EXPR_ARG (exp, 0);
10939 arg1 = CALL_EXPR_ARG (exp, 1);
10940 arg2 = CALL_EXPR_ARG (exp, 2);
10941
10942 tmode = TYPE_MODE (TREE_TYPE (arg0));
10943 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
10944 gcc_assert (VECTOR_MODE_P (tmode));
10945
10946 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
10947 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
10948 elt = get_element_number (TREE_TYPE (arg0), arg2);
10949
10950 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
10951 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
10952
10953 op0 = force_reg (tmode, op0);
10954 op1 = force_reg (mode1, op1);
10955
10956 /* OP0 is the source of these builtin functions and shouldn't be
10957 modified. Create a copy, use it and return it as target. */
10958 target = gen_reg_rtx (tmode);
10959 emit_move_insn (target, op0);
10960 ix86_expand_vector_set (true, target, op1, elt);
10961
10962 return target;
10963 }
10964
10965 /* Expand an expression EXP that calls a built-in function,
10966 with result going to TARGET if that's convenient
10967 (and in mode MODE if that's convenient).
10968 SUBTARGET may be used as the target for computing one of EXP's operands.
10969 IGNORE is nonzero if the value is to be ignored. */
10970
10971 rtx
10972 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
10973 machine_mode mode, int ignore)
10974 {
10975 size_t i;
10976 enum insn_code icode, icode2;
10977 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
10978 tree arg0, arg1, arg2, arg3, arg4;
10979 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
10980 machine_mode mode0, mode1, mode2, mode3, mode4;
10981 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
10982
10983 /* For CPU builtins that can be folded, fold first and expand the fold. */
10984 switch (fcode)
10985 {
10986 case IX86_BUILTIN_CPU_INIT:
10987 {
10988 /* Make it call __cpu_indicator_init in libgcc. */
10989 tree call_expr, fndecl, type;
10990 type = build_function_type_list (integer_type_node, NULL_TREE);
10991 fndecl = build_fn_decl ("__cpu_indicator_init", type);
10992 call_expr = build_call_expr (fndecl, 0);
10993 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
10994 }
10995 case IX86_BUILTIN_CPU_IS:
10996 case IX86_BUILTIN_CPU_SUPPORTS:
10997 {
10998 tree arg0 = CALL_EXPR_ARG (exp, 0);
10999 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
11000 gcc_assert (fold_expr != NULL_TREE);
11001 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
11002 }
11003 }
11004
11005 HOST_WIDE_INT isa = ix86_isa_flags;
11006 HOST_WIDE_INT isa2 = ix86_isa_flags2;
11007 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
11008 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
11009 /* The general case is we require all the ISAs specified in bisa{,2}
11010 to be enabled.
11011 The exceptions are:
11012 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11013 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11014 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11015 where for each such pair it is sufficient if either of the ISAs is
11016 enabled; any other options ORed with the pair must still be enabled. */
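/* For example, a builtin whose mask is
   OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A is accepted when either
   SSE or 3DNOW_A is enabled in ix86_isa_flags. */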
11017 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11018 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
11019 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
11020 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
11021 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11022 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
11023 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
11024 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
11025 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11026 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
11027 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
11028 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
11029 /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
11030 MMX is disabled. NB: Since MMX intrinsics are marked with
11031 SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
11032 enabled. */
11033 if (TARGET_MMX || TARGET_MMX_WITH_SSE)
11034 {
11035 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
11036 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
11037 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX)) != 0)
11038 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX);
11039 if (((bisa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
11040 == (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
11041 && (isa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX)) != 0)
11042 isa |= (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX);
11043 if (((bisa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
11044 == (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
11045 && (isa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX)) != 0)
11046 isa |= (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX);
11047 }
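/* For example, with TARGET_MMX_WITH_SSE an MMX intrinsic marked
   OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX is accepted even though
   MMX itself is disabled, as long as SSE2 is enabled. */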
11048 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
11049 {
11050 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
11051 if (TARGET_ABI_X32)
11052 bisa |= OPTION_MASK_ABI_X32;
11053 else
11054 bisa |= OPTION_MASK_ABI_64;
11055 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
11056 (enum fpmath_unit) 0, false, add_abi_p);
11057 if (!opts)
11058 error ("%qE needs unknown isa option", fndecl);
11059 else
11060 {
11061 gcc_assert (opts != NULL);
11062 error ("%qE needs isa option %s", fndecl, opts);
11063 free (opts);
11064 }
11065 return expand_call (exp, target, ignore);
11066 }
11067
11068 switch (fcode)
11069 {
11070 case IX86_BUILTIN_MASKMOVQ:
11071 case IX86_BUILTIN_MASKMOVDQU:
11072 icode = (fcode == IX86_BUILTIN_MASKMOVQ
11073 ? CODE_FOR_mmx_maskmovq
11074 : CODE_FOR_sse2_maskmovdqu);
11075 /* Note the arg order is different from the operand order. */
11076 arg1 = CALL_EXPR_ARG (exp, 0);
11077 arg2 = CALL_EXPR_ARG (exp, 1);
11078 arg0 = CALL_EXPR_ARG (exp, 2);
11079 op0 = expand_normal (arg0);
11080 op1 = expand_normal (arg1);
11081 op2 = expand_normal (arg2);
11082 mode0 = insn_data[icode].operand[0].mode;
11083 mode1 = insn_data[icode].operand[1].mode;
11084 mode2 = insn_data[icode].operand[2].mode;
11085
11086 op0 = ix86_zero_extend_to_Pmode (op0);
11087 op0 = gen_rtx_MEM (mode1, op0);
11088
11089 if (!insn_data[icode].operand[0].predicate (op0, mode0))
11090 op0 = copy_to_mode_reg (mode0, op0);
11091 if (!insn_data[icode].operand[1].predicate (op1, mode1))
11092 op1 = copy_to_mode_reg (mode1, op1);
11093 if (!insn_data[icode].operand[2].predicate (op2, mode2))
11094 op2 = copy_to_mode_reg (mode2, op2);
11095 pat = GEN_FCN (icode) (op0, op1, op2);
11096 if (! pat)
11097 return 0;
11098 emit_insn (pat);
11099 return 0;
11100
11101 case IX86_BUILTIN_LDMXCSR:
11102 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
11103 target = assign_386_stack_local (SImode, SLOT_TEMP);
11104 emit_move_insn (target, op0);
11105 emit_insn (gen_sse_ldmxcsr (target));
11106 return 0;
11107
11108 case IX86_BUILTIN_STMXCSR:
11109 target = assign_386_stack_local (SImode, SLOT_TEMP);
11110 emit_insn (gen_sse_stmxcsr (target));
11111 return copy_to_mode_reg (SImode, target);
11112
11113 case IX86_BUILTIN_CLFLUSH:
11114 arg0 = CALL_EXPR_ARG (exp, 0);
11115 op0 = expand_normal (arg0);
11116 icode = CODE_FOR_sse2_clflush;
11117 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11118 op0 = ix86_zero_extend_to_Pmode (op0);
11119
11120 emit_insn (gen_sse2_clflush (op0));
11121 return 0;
11122
11123 case IX86_BUILTIN_CLWB:
11124 arg0 = CALL_EXPR_ARG (exp, 0);
11125 op0 = expand_normal (arg0);
11126 icode = CODE_FOR_clwb;
11127 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11128 op0 = ix86_zero_extend_to_Pmode (op0);
11129
11130 emit_insn (gen_clwb (op0));
11131 return 0;
11132
11133 case IX86_BUILTIN_CLFLUSHOPT:
11134 arg0 = CALL_EXPR_ARG (exp, 0);
11135 op0 = expand_normal (arg0);
11136 icode = CODE_FOR_clflushopt;
11137 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11138 op0 = ix86_zero_extend_to_Pmode (op0);
11139
11140 emit_insn (gen_clflushopt (op0));
11141 return 0;
11142
11143 case IX86_BUILTIN_MONITOR:
11144 case IX86_BUILTIN_MONITORX:
11145 arg0 = CALL_EXPR_ARG (exp, 0);
11146 arg1 = CALL_EXPR_ARG (exp, 1);
11147 arg2 = CALL_EXPR_ARG (exp, 2);
11148 op0 = expand_normal (arg0);
11149 op1 = expand_normal (arg1);
11150 op2 = expand_normal (arg2);
11151 if (!REG_P (op0))
11152 op0 = ix86_zero_extend_to_Pmode (op0);
11153 if (!REG_P (op1))
11154 op1 = copy_to_mode_reg (SImode, op1);
11155 if (!REG_P (op2))
11156 op2 = copy_to_mode_reg (SImode, op2);
11157
11158 emit_insn (fcode == IX86_BUILTIN_MONITOR
11159 ? gen_sse3_monitor (Pmode, op0, op1, op2)
11160 : gen_monitorx (Pmode, op0, op1, op2));
11161 return 0;
11162
11163 case IX86_BUILTIN_MWAIT:
11164 arg0 = CALL_EXPR_ARG (exp, 0);
11165 arg1 = CALL_EXPR_ARG (exp, 1);
11166 op0 = expand_normal (arg0);
11167 op1 = expand_normal (arg1);
11168 if (!REG_P (op0))
11169 op0 = copy_to_mode_reg (SImode, op0);
11170 if (!REG_P (op1))
11171 op1 = copy_to_mode_reg (SImode, op1);
11172 emit_insn (gen_sse3_mwait (op0, op1));
11173 return 0;
11174
11175 case IX86_BUILTIN_MWAITX:
11176 arg0 = CALL_EXPR_ARG (exp, 0);
11177 arg1 = CALL_EXPR_ARG (exp, 1);
11178 arg2 = CALL_EXPR_ARG (exp, 2);
11179 op0 = expand_normal (arg0);
11180 op1 = expand_normal (arg1);
11181 op2 = expand_normal (arg2);
11182 if (!REG_P (op0))
11183 op0 = copy_to_mode_reg (SImode, op0);
11184 if (!REG_P (op1))
11185 op1 = copy_to_mode_reg (SImode, op1);
11186 if (!REG_P (op2))
11187 op2 = copy_to_mode_reg (SImode, op2);
11188 emit_insn (gen_mwaitx (op0, op1, op2));
11189 return 0;
11190
11191 case IX86_BUILTIN_UMONITOR:
11192 arg0 = CALL_EXPR_ARG (exp, 0);
11193 op0 = expand_normal (arg0);
11194
11195 op0 = ix86_zero_extend_to_Pmode (op0);
11196 emit_insn (gen_umonitor (Pmode, op0));
11197 return 0;
11198
11199 case IX86_BUILTIN_UMWAIT:
11200 case IX86_BUILTIN_TPAUSE:
11201 arg0 = CALL_EXPR_ARG (exp, 0);
11202 arg1 = CALL_EXPR_ARG (exp, 1);
11203 op0 = expand_normal (arg0);
11204 op1 = expand_normal (arg1);
11205
11206 if (!REG_P (op0))
11207 op0 = copy_to_mode_reg (SImode, op0);
11208
11209 op1 = force_reg (DImode, op1);
11210
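      /* On 64-bit targets the DImode counter is split into two SImode
         halves for the umwait/tpause patterns; in either case the builtin's
         return value is read back from the flags set by the instruction.  */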
11211 if (TARGET_64BIT)
11212 {
11213 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11214 NULL, 1, OPTAB_DIRECT);
11215 switch (fcode)
11216 {
11217 case IX86_BUILTIN_UMWAIT:
11218 icode = CODE_FOR_umwait_rex64;
11219 break;
11220 case IX86_BUILTIN_TPAUSE:
11221 icode = CODE_FOR_tpause_rex64;
11222 break;
11223 default:
11224 gcc_unreachable ();
11225 }
11226
11227 op2 = gen_lowpart (SImode, op2);
11228 op1 = gen_lowpart (SImode, op1);
11229 pat = GEN_FCN (icode) (op0, op1, op2);
11230 }
11231 else
11232 {
11233 switch (fcode)
11234 {
11235 case IX86_BUILTIN_UMWAIT:
11236 icode = CODE_FOR_umwait;
11237 break;
11238 case IX86_BUILTIN_TPAUSE:
11239 icode = CODE_FOR_tpause;
11240 break;
11241 default:
11242 gcc_unreachable ();
11243 }
11244 pat = GEN_FCN (icode) (op0, op1);
11245 }
11246
11247 if (!pat)
11248 return 0;
11249
11250 emit_insn (pat);
11251
11252 if (target == 0
11253 || !register_operand (target, QImode))
11254 target = gen_reg_rtx (QImode);
11255
11256 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11257 const0_rtx);
11258 emit_insn (gen_rtx_SET (target, pat));
11259
11260 return target;
11261
11262 case IX86_BUILTIN_CLZERO:
11263 arg0 = CALL_EXPR_ARG (exp, 0);
11264 op0 = expand_normal (arg0);
11265 if (!REG_P (op0))
11266 op0 = ix86_zero_extend_to_Pmode (op0);
11267 emit_insn (gen_clzero (Pmode, op0));
11268 return 0;
11269
11270 case IX86_BUILTIN_CLDEMOTE:
11271 arg0 = CALL_EXPR_ARG (exp, 0);
11272 op0 = expand_normal (arg0);
11273 icode = CODE_FOR_cldemote;
11274 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11275 op0 = ix86_zero_extend_to_Pmode (op0);
11276
11277 emit_insn (gen_cldemote (op0));
11278 return 0;
11279
11280 case IX86_BUILTIN_VEC_INIT_V2SI:
11281 case IX86_BUILTIN_VEC_INIT_V4HI:
11282 case IX86_BUILTIN_VEC_INIT_V8QI:
11283 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
11284
11285 case IX86_BUILTIN_VEC_EXT_V2DF:
11286 case IX86_BUILTIN_VEC_EXT_V2DI:
11287 case IX86_BUILTIN_VEC_EXT_V4SF:
11288 case IX86_BUILTIN_VEC_EXT_V4SI:
11289 case IX86_BUILTIN_VEC_EXT_V8HI:
11290 case IX86_BUILTIN_VEC_EXT_V2SI:
11291 case IX86_BUILTIN_VEC_EXT_V4HI:
11292 case IX86_BUILTIN_VEC_EXT_V16QI:
11293 return ix86_expand_vec_ext_builtin (exp, target);
11294
11295 case IX86_BUILTIN_VEC_SET_V2DI:
11296 case IX86_BUILTIN_VEC_SET_V4SF:
11297 case IX86_BUILTIN_VEC_SET_V4SI:
11298 case IX86_BUILTIN_VEC_SET_V8HI:
11299 case IX86_BUILTIN_VEC_SET_V4HI:
11300 case IX86_BUILTIN_VEC_SET_V16QI:
11301 return ix86_expand_vec_set_builtin (exp);
11302
11303 case IX86_BUILTIN_NANQ:
11304 case IX86_BUILTIN_NANSQ:
11305 return expand_call (exp, target, ignore);
11306
11307 case IX86_BUILTIN_RDPID:
11308
11309 op0 = gen_reg_rtx (word_mode);
11310
11311 if (TARGET_64BIT)
11312 {
11313 insn = gen_rdpid_rex64 (op0);
11314 op0 = convert_to_mode (SImode, op0, 1);
11315 }
11316 else
11317 insn = gen_rdpid (op0);
11318
11319 emit_insn (insn);
11320
11321 if (target == 0
11322 || !register_operand (target, SImode))
11323 target = gen_reg_rtx (SImode);
11324
11325 emit_move_insn (target, op0);
11326 return target;
11327
11328 case IX86_BUILTIN_2INTERSECTD512:
11329 case IX86_BUILTIN_2INTERSECTQ512:
11330 case IX86_BUILTIN_2INTERSECTD256:
11331 case IX86_BUILTIN_2INTERSECTQ256:
11332 case IX86_BUILTIN_2INTERSECTD128:
11333 case IX86_BUILTIN_2INTERSECTQ128:
11334 arg0 = CALL_EXPR_ARG (exp, 0);
11335 arg1 = CALL_EXPR_ARG (exp, 1);
11336 arg2 = CALL_EXPR_ARG (exp, 2);
11337 arg3 = CALL_EXPR_ARG (exp, 3);
11338 op0 = expand_normal (arg0);
11339 op1 = expand_normal (arg1);
11340 op2 = expand_normal (arg2);
11341 op3 = expand_normal (arg3);
11342
11343 if (!address_operand (op0, VOIDmode))
11344 {
11345 op0 = convert_memory_address (Pmode, op0);
11346 op0 = copy_addr_to_reg (op0);
11347 }
11348 if (!address_operand (op1, VOIDmode))
11349 {
11350 op1 = convert_memory_address (Pmode, op1);
11351 op1 = copy_addr_to_reg (op1);
11352 }
11353
11354 switch (fcode)
11355 {
11356 case IX86_BUILTIN_2INTERSECTD512:
11357 mode4 = P2HImode;
11358 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
11359 break;
11360 case IX86_BUILTIN_2INTERSECTQ512:
11361 mode4 = P2QImode;
11362 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
11363 break;
11364 case IX86_BUILTIN_2INTERSECTD256:
11365 mode4 = P2QImode;
11366 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
11367 break;
11368 case IX86_BUILTIN_2INTERSECTQ256:
11369 mode4 = P2QImode;
11370 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
11371 break;
11372 case IX86_BUILTIN_2INTERSECTD128:
11373 mode4 = P2QImode;
11374 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
11375 break;
11376 case IX86_BUILTIN_2INTERSECTQ128:
11377 mode4 = P2QImode;
11378 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
11379 break;
11380 default:
11381 gcc_unreachable ();
11382 }
11383
11384 mode2 = insn_data[icode].operand[1].mode;
11385 mode3 = insn_data[icode].operand[2].mode;
11386 if (!insn_data[icode].operand[1].predicate (op2, mode2))
11387 op2 = copy_to_mode_reg (mode2, op2);
11388 if (!insn_data[icode].operand[2].predicate (op3, mode3))
11389 op3 = copy_to_mode_reg (mode3, op3);
11390
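      /* The vp2intersect patterns produce a pair of mask registers in a
         single P2QImode/P2HImode pseudo; its low and high parts are stored
         through the two pointer arguments.  */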
11391 op4 = gen_reg_rtx (mode4);
11392 emit_insn (GEN_FCN (icode) (op4, op2, op3));
11393 mode0 = mode4 == P2HImode ? HImode : QImode;
11394 emit_move_insn (gen_rtx_MEM (mode0, op0),
11395 gen_lowpart (mode0, op4));
11396 emit_move_insn (gen_rtx_MEM (mode0, op1),
11397 gen_highpart (mode0, op4));
11398
11399 return 0;
11400
11401 case IX86_BUILTIN_RDPMC:
11402 case IX86_BUILTIN_RDTSC:
11403 case IX86_BUILTIN_RDTSCP:
11404 case IX86_BUILTIN_XGETBV:
11405
11406 op0 = gen_reg_rtx (DImode);
11407 op1 = gen_reg_rtx (DImode);
11408
11409 if (fcode == IX86_BUILTIN_RDPMC)
11410 {
11411 arg0 = CALL_EXPR_ARG (exp, 0);
11412 op2 = expand_normal (arg0);
11413 if (!register_operand (op2, SImode))
11414 op2 = copy_to_mode_reg (SImode, op2);
11415
11416 insn = (TARGET_64BIT
11417 ? gen_rdpmc_rex64 (op0, op1, op2)
11418 : gen_rdpmc (op0, op2));
11419 emit_insn (insn);
11420 }
11421 else if (fcode == IX86_BUILTIN_XGETBV)
11422 {
11423 arg0 = CALL_EXPR_ARG (exp, 0);
11424 op2 = expand_normal (arg0);
11425 if (!register_operand (op2, SImode))
11426 op2 = copy_to_mode_reg (SImode, op2);
11427
11428 insn = (TARGET_64BIT
11429 ? gen_xgetbv_rex64 (op0, op1, op2)
11430 : gen_xgetbv (op0, op2));
11431 emit_insn (insn);
11432 }
11433 else if (fcode == IX86_BUILTIN_RDTSC)
11434 {
11435 insn = (TARGET_64BIT
11436 ? gen_rdtsc_rex64 (op0, op1)
11437 : gen_rdtsc (op0));
11438 emit_insn (insn);
11439 }
11440 else
11441 {
11442 op2 = gen_reg_rtx (SImode);
11443
11444 insn = (TARGET_64BIT
11445 ? gen_rdtscp_rex64 (op0, op1, op2)
11446 : gen_rdtscp (op0, op2));
11447 emit_insn (insn);
11448
11449 arg0 = CALL_EXPR_ARG (exp, 0);
11450 op4 = expand_normal (arg0);
11451 if (!address_operand (op4, VOIDmode))
11452 {
11453 op4 = convert_memory_address (Pmode, op4);
11454 op4 = copy_addr_to_reg (op4);
11455 }
11456 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
11457 }
11458
11459 if (target == 0
11460 || !register_operand (target, DImode))
11461 target = gen_reg_rtx (DImode);
11462
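      /* On 64-bit targets the high 32 bits arrive in a separate register;
         shift them into place and IOR them with the low half to form the
         DImode result.  */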
11463 if (TARGET_64BIT)
11464 {
11465 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
11466 op1, 1, OPTAB_DIRECT);
11467 op0 = expand_simple_binop (DImode, IOR, op0, op1,
11468 op0, 1, OPTAB_DIRECT);
11469 }
11470
11471 emit_move_insn (target, op0);
11472 return target;
11473
11474 case IX86_BUILTIN_ENQCMD:
11475 case IX86_BUILTIN_ENQCMDS:
11476 case IX86_BUILTIN_MOVDIR64B:
11477
11478 arg0 = CALL_EXPR_ARG (exp, 0);
11479 arg1 = CALL_EXPR_ARG (exp, 1);
11480 op0 = expand_normal (arg0);
11481 op1 = expand_normal (arg1);
11482
11483 op0 = ix86_zero_extend_to_Pmode (op0);
11484 if (!address_operand (op1, VOIDmode))
11485 {
11486 op1 = convert_memory_address (Pmode, op1);
11487 op1 = copy_addr_to_reg (op1);
11488 }
11489 op1 = gen_rtx_MEM (XImode, op1);
11490
11491 if (fcode == IX86_BUILTIN_MOVDIR64B)
11492 {
11493 emit_insn (gen_movdir64b (Pmode, op0, op1));
11494 return 0;
11495 }
11496 else
11497 {
11498 rtx pat;
11499
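          /* The enqcmd/enqcmds patterns set the flags register; derive the
             builtin's 0/1 result from an EQ test of that flags destination
             against zero, written into the low byte of a zeroed SImode
             pseudo.  */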
11500 target = gen_reg_rtx (SImode);
11501 emit_move_insn (target, const0_rtx);
11502 target = gen_rtx_SUBREG (QImode, target, 0);
11503
11504 if (fcode == IX86_BUILTIN_ENQCMD)
11505 pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
11506 else
11507 pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
11508
11509 emit_insn (pat);
11510
11511 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11512 gen_rtx_fmt_ee (EQ, QImode,
11513 SET_DEST (pat),
11514 const0_rtx)));
11515
11516 return SUBREG_REG (target);
11517 }
11518
11519 case IX86_BUILTIN_FXSAVE:
11520 case IX86_BUILTIN_FXRSTOR:
11521 case IX86_BUILTIN_FXSAVE64:
11522 case IX86_BUILTIN_FXRSTOR64:
11523 case IX86_BUILTIN_FNSTENV:
11524 case IX86_BUILTIN_FLDENV:
11525 mode0 = BLKmode;
11526 switch (fcode)
11527 {
11528 case IX86_BUILTIN_FXSAVE:
11529 icode = CODE_FOR_fxsave;
11530 break;
11531 case IX86_BUILTIN_FXRSTOR:
11532 icode = CODE_FOR_fxrstor;
11533 break;
11534 case IX86_BUILTIN_FXSAVE64:
11535 icode = CODE_FOR_fxsave64;
11536 break;
11537 case IX86_BUILTIN_FXRSTOR64:
11538 icode = CODE_FOR_fxrstor64;
11539 break;
11540 case IX86_BUILTIN_FNSTENV:
11541 icode = CODE_FOR_fnstenv;
11542 break;
11543 case IX86_BUILTIN_FLDENV:
11544 icode = CODE_FOR_fldenv;
11545 break;
11546 default:
11547 gcc_unreachable ();
11548 }
11549
11550 arg0 = CALL_EXPR_ARG (exp, 0);
11551 op0 = expand_normal (arg0);
11552
11553 if (!address_operand (op0, VOIDmode))
11554 {
11555 op0 = convert_memory_address (Pmode, op0);
11556 op0 = copy_addr_to_reg (op0);
11557 }
11558 op0 = gen_rtx_MEM (mode0, op0);
11559
11560 pat = GEN_FCN (icode) (op0);
11561 if (pat)
11562 emit_insn (pat);
11563 return 0;
11564
11565 case IX86_BUILTIN_XSETBV:
11566 arg0 = CALL_EXPR_ARG (exp, 0);
11567 arg1 = CALL_EXPR_ARG (exp, 1);
11568 op0 = expand_normal (arg0);
11569 op1 = expand_normal (arg1);
11570
11571 if (!REG_P (op0))
11572 op0 = copy_to_mode_reg (SImode, op0);
11573
11574 op1 = force_reg (DImode, op1);
11575
11576 if (TARGET_64BIT)
11577 {
11578 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11579 NULL, 1, OPTAB_DIRECT);
11580
11581 icode = CODE_FOR_xsetbv_rex64;
11582
11583 op2 = gen_lowpart (SImode, op2);
11584 op1 = gen_lowpart (SImode, op1);
11585 pat = GEN_FCN (icode) (op0, op1, op2);
11586 }
11587 else
11588 {
11589 icode = CODE_FOR_xsetbv;
11590
11591 pat = GEN_FCN (icode) (op0, op1);
11592 }
11593 if (pat)
11594 emit_insn (pat);
11595 return 0;
11596
11597 case IX86_BUILTIN_XSAVE:
11598 case IX86_BUILTIN_XRSTOR:
11599 case IX86_BUILTIN_XSAVE64:
11600 case IX86_BUILTIN_XRSTOR64:
11601 case IX86_BUILTIN_XSAVEOPT:
11602 case IX86_BUILTIN_XSAVEOPT64:
11603 case IX86_BUILTIN_XSAVES:
11604 case IX86_BUILTIN_XRSTORS:
11605 case IX86_BUILTIN_XSAVES64:
11606 case IX86_BUILTIN_XRSTORS64:
11607 case IX86_BUILTIN_XSAVEC:
11608 case IX86_BUILTIN_XSAVEC64:
11609 arg0 = CALL_EXPR_ARG (exp, 0);
11610 arg1 = CALL_EXPR_ARG (exp, 1);
11611 op0 = expand_normal (arg0);
11612 op1 = expand_normal (arg1);
11613
11614 if (!address_operand (op0, VOIDmode))
11615 {
11616 op0 = convert_memory_address (Pmode, op0);
11617 op0 = copy_addr_to_reg (op0);
11618 }
11619 op0 = gen_rtx_MEM (BLKmode, op0);
11620
11621 op1 = force_reg (DImode, op1);
11622
11623 if (TARGET_64BIT)
11624 {
11625 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
11626 NULL, 1, OPTAB_DIRECT);
11627 switch (fcode)
11628 {
11629 case IX86_BUILTIN_XSAVE:
11630 icode = CODE_FOR_xsave_rex64;
11631 break;
11632 case IX86_BUILTIN_XRSTOR:
11633 icode = CODE_FOR_xrstor_rex64;
11634 break;
11635 case IX86_BUILTIN_XSAVE64:
11636 icode = CODE_FOR_xsave64;
11637 break;
11638 case IX86_BUILTIN_XRSTOR64:
11639 icode = CODE_FOR_xrstor64;
11640 break;
11641 case IX86_BUILTIN_XSAVEOPT:
11642 icode = CODE_FOR_xsaveopt_rex64;
11643 break;
11644 case IX86_BUILTIN_XSAVEOPT64:
11645 icode = CODE_FOR_xsaveopt64;
11646 break;
11647 case IX86_BUILTIN_XSAVES:
11648 icode = CODE_FOR_xsaves_rex64;
11649 break;
11650 case IX86_BUILTIN_XRSTORS:
11651 icode = CODE_FOR_xrstors_rex64;
11652 break;
11653 case IX86_BUILTIN_XSAVES64:
11654 icode = CODE_FOR_xsaves64;
11655 break;
11656 case IX86_BUILTIN_XRSTORS64:
11657 icode = CODE_FOR_xrstors64;
11658 break;
11659 case IX86_BUILTIN_XSAVEC:
11660 icode = CODE_FOR_xsavec_rex64;
11661 break;
11662 case IX86_BUILTIN_XSAVEC64:
11663 icode = CODE_FOR_xsavec64;
11664 break;
11665 default:
11666 gcc_unreachable ();
11667 }
11668
11669 op2 = gen_lowpart (SImode, op2);
11670 op1 = gen_lowpart (SImode, op1);
11671 pat = GEN_FCN (icode) (op0, op1, op2);
11672 }
11673 else
11674 {
11675 switch (fcode)
11676 {
11677 case IX86_BUILTIN_XSAVE:
11678 icode = CODE_FOR_xsave;
11679 break;
11680 case IX86_BUILTIN_XRSTOR:
11681 icode = CODE_FOR_xrstor;
11682 break;
11683 case IX86_BUILTIN_XSAVEOPT:
11684 icode = CODE_FOR_xsaveopt;
11685 break;
11686 case IX86_BUILTIN_XSAVES:
11687 icode = CODE_FOR_xsaves;
11688 break;
11689 case IX86_BUILTIN_XRSTORS:
11690 icode = CODE_FOR_xrstors;
11691 break;
11692 case IX86_BUILTIN_XSAVEC:
11693 icode = CODE_FOR_xsavec;
11694 break;
11695 default:
11696 gcc_unreachable ();
11697 }
11698 pat = GEN_FCN (icode) (op0, op1);
11699 }
11700
11701 if (pat)
11702 emit_insn (pat);
11703 return 0;
11704
11705 case IX86_BUILTIN_LLWPCB:
11706 arg0 = CALL_EXPR_ARG (exp, 0);
11707 op0 = expand_normal (arg0);
11708 icode = CODE_FOR_lwp_llwpcb;
11709 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
11710 op0 = ix86_zero_extend_to_Pmode (op0);
11711 emit_insn (gen_lwp_llwpcb (op0));
11712 return 0;
11713
11714 case IX86_BUILTIN_SLWPCB:
11715 icode = CODE_FOR_lwp_slwpcb;
11716 if (!target
11717 || !insn_data[icode].operand[0].predicate (target, Pmode))
11718 target = gen_reg_rtx (Pmode);
11719 emit_insn (gen_lwp_slwpcb (target));
11720 return target;
11721
11722 case IX86_BUILTIN_BEXTRI32:
11723 case IX86_BUILTIN_BEXTRI64:
11724 arg0 = CALL_EXPR_ARG (exp, 0);
11725 arg1 = CALL_EXPR_ARG (exp, 1);
11726 op0 = expand_normal (arg0);
11727 op1 = expand_normal (arg1);
11728 icode = (fcode == IX86_BUILTIN_BEXTRI32
11729 ? CODE_FOR_tbm_bextri_si
11730 : CODE_FOR_tbm_bextri_di);
11731 if (!CONST_INT_P (op1))
11732 {
11733 error ("last argument must be an immediate");
11734 return const0_rtx;
11735 }
11736 else
11737 {
11738 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
11739 unsigned char lsb_index = INTVAL (op1) & 0xFF;
11740 op1 = GEN_INT (length);
11741 op2 = GEN_INT (lsb_index);
11742
11743 mode1 = insn_data[icode].operand[1].mode;
11744 if (!insn_data[icode].operand[1].predicate (op0, mode1))
11745 op0 = copy_to_mode_reg (mode1, op0);
11746
11747 mode0 = insn_data[icode].operand[0].mode;
11748 if (target == 0
11749 || !register_operand (target, mode0))
11750 target = gen_reg_rtx (mode0);
11751
11752 pat = GEN_FCN (icode) (target, op0, op1, op2);
11753 if (pat)
11754 emit_insn (pat);
11755 return target;
11756 }
11757
11758 case IX86_BUILTIN_RDRAND16_STEP:
11759 icode = CODE_FOR_rdrandhi_1;
11760 mode0 = HImode;
11761 goto rdrand_step;
11762
11763 case IX86_BUILTIN_RDRAND32_STEP:
11764 icode = CODE_FOR_rdrandsi_1;
11765 mode0 = SImode;
11766 goto rdrand_step;
11767
11768 case IX86_BUILTIN_RDRAND64_STEP:
11769 icode = CODE_FOR_rdranddi_1;
11770 mode0 = DImode;
11771
11772 rdrand_step:
11773 arg0 = CALL_EXPR_ARG (exp, 0);
11774 op1 = expand_normal (arg0);
11775 if (!address_operand (op1, VOIDmode))
11776 {
11777 op1 = convert_memory_address (Pmode, op1);
11778 op1 = copy_addr_to_reg (op1);
11779 }
11780
11781 op0 = gen_reg_rtx (mode0);
11782 emit_insn (GEN_FCN (icode) (op0));
11783
11784 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11785
11786 op1 = gen_reg_rtx (SImode);
11787 emit_move_insn (op1, CONST1_RTX (SImode));
11788
11789 /* Emit SImode conditional move. */
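      /* The conditional move below selects between the zero-extended
         RDRAND result and the constant 1, keyed on the carry flag: the
         hardware clears both the destination register and CF when no
         random value is available, so the builtin returns 0 on failure
         and 1 on success.  */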
11790 if (mode0 == HImode)
11791 {
11792 if (TARGET_ZERO_EXTEND_WITH_AND
11793 && optimize_function_for_speed_p (cfun))
11794 {
11795 op2 = force_reg (SImode, const0_rtx);
11796
11797 emit_insn (gen_movstricthi
11798 (gen_lowpart (HImode, op2), op0));
11799 }
11800 else
11801 {
11802 op2 = gen_reg_rtx (SImode);
11803
11804 emit_insn (gen_zero_extendhisi2 (op2, op0));
11805 }
11806 }
11807 else if (mode0 == SImode)
11808 op2 = op0;
11809 else
11810 op2 = gen_rtx_SUBREG (SImode, op0, 0);
11811
11812 if (target == 0
11813 || !register_operand (target, SImode))
11814 target = gen_reg_rtx (SImode);
11815
11816 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
11817 const0_rtx);
11818 emit_insn (gen_rtx_SET (target,
11819 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
11820 return target;
11821
11822 case IX86_BUILTIN_RDSEED16_STEP:
11823 icode = CODE_FOR_rdseedhi_1;
11824 mode0 = HImode;
11825 goto rdseed_step;
11826
11827 case IX86_BUILTIN_RDSEED32_STEP:
11828 icode = CODE_FOR_rdseedsi_1;
11829 mode0 = SImode;
11830 goto rdseed_step;
11831
11832 case IX86_BUILTIN_RDSEED64_STEP:
11833 icode = CODE_FOR_rdseeddi_1;
11834 mode0 = DImode;
11835
11836 rdseed_step:
11837 arg0 = CALL_EXPR_ARG (exp, 0);
11838 op1 = expand_normal (arg0);
11839 if (!address_operand (op1, VOIDmode))
11840 {
11841 op1 = convert_memory_address (Pmode, op1);
11842 op1 = copy_addr_to_reg (op1);
11843 }
11844
11845 op0 = gen_reg_rtx (mode0);
11846 emit_insn (GEN_FCN (icode) (op0));
11847
11848 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
11849
11850 op2 = gen_reg_rtx (QImode);
11851
11852 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
11853 const0_rtx);
11854 emit_insn (gen_rtx_SET (op2, pat));
11855
11856 if (target == 0
11857 || !register_operand (target, SImode))
11858 target = gen_reg_rtx (SImode);
11859
11860 emit_insn (gen_zero_extendqisi2 (target, op2));
11861 return target;
11862
11863 case IX86_BUILTIN_SBB32:
11864 icode = CODE_FOR_subborrowsi;
11865 icode2 = CODE_FOR_subborrowsi_0;
11866 mode0 = SImode;
11867 mode1 = DImode;
11868 mode2 = CCmode;
11869 goto handlecarry;
11870
11871 case IX86_BUILTIN_SBB64:
11872 icode = CODE_FOR_subborrowdi;
11873 icode2 = CODE_FOR_subborrowdi_0;
11874 mode0 = DImode;
11875 mode1 = TImode;
11876 mode2 = CCmode;
11877 goto handlecarry;
11878
11879 case IX86_BUILTIN_ADDCARRYX32:
11880 icode = CODE_FOR_addcarrysi;
11881 icode2 = CODE_FOR_addcarrysi_0;
11882 mode0 = SImode;
11883 mode1 = DImode;
11884 mode2 = CCCmode;
11885 goto handlecarry;
11886
11887 case IX86_BUILTIN_ADDCARRYX64:
11888 icode = CODE_FOR_addcarrydi;
11889 icode2 = CODE_FOR_addcarrydi_0;
11890 mode0 = DImode;
11891 mode1 = TImode;
11892 mode2 = CCCmode;
11893
11894 handlecarry:
11895 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
11896 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
11897 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
11898 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
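      /* Illustrative use (not taken from this file):
           unsigned int sum;
           unsigned char c_out = _addcarry_u32 (c_in, a, b, &sum);
         When c_in is not a literal zero, the expansion below first sets CF
         from c_in via addqi3_cconly_overflow, then emits the add/sub
         pattern that consumes CF, and finally returns the new CF through
         an LTU test of the flags.  */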
11899
11900 op1 = expand_normal (arg0);
11901 if (!integer_zerop (arg0))
11902 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
11903
11904 op2 = expand_normal (arg1);
11905 if (!register_operand (op2, mode0))
11906 op2 = copy_to_mode_reg (mode0, op2);
11907
11908 op3 = expand_normal (arg2);
11909 if (!register_operand (op3, mode0))
11910 op3 = copy_to_mode_reg (mode0, op3);
11911
11912 op4 = expand_normal (arg3);
11913 if (!address_operand (op4, VOIDmode))
11914 {
11915 op4 = convert_memory_address (Pmode, op4);
11916 op4 = copy_addr_to_reg (op4);
11917 }
11918
11919 op0 = gen_reg_rtx (mode0);
11920 if (integer_zerop (arg0))
11921 {
11922 	  /* If arg0 is 0, optimize right away into an add or sub
11923 	     instruction that sets the CCCmode flags.  */
11924 op1 = gen_rtx_REG (mode2, FLAGS_REG);
11925 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
11926 }
11927 else
11928 {
11929 /* Generate CF from input operand. */
11930 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
11931
11932 /* Generate instruction that consumes CF. */
11933 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
11934 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
11935 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
11936 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
11937 }
11938
11939 /* Return current CF value. */
11940 if (target == 0)
11941 target = gen_reg_rtx (QImode);
11942
11943 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
11944 emit_insn (gen_rtx_SET (target, pat));
11945
11946 /* Store the result. */
11947 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
11948
11949 return target;
11950
11951 case IX86_BUILTIN_READ_FLAGS:
11952 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
11953
11954 if (optimize
11955 || target == NULL_RTX
11956 || !nonimmediate_operand (target, word_mode)
11957 || GET_MODE (target) != word_mode)
11958 target = gen_reg_rtx (word_mode);
11959
11960 emit_insn (gen_pop (target));
11961 return target;
11962
11963 case IX86_BUILTIN_WRITE_FLAGS:
11964
11965 arg0 = CALL_EXPR_ARG (exp, 0);
11966 op0 = expand_normal (arg0);
11967 if (!general_no_elim_operand (op0, word_mode))
11968 op0 = copy_to_mode_reg (word_mode, op0);
11969
11970 emit_insn (gen_push (op0));
11971 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
11972 return 0;
11973
11974 case IX86_BUILTIN_KTESTC8:
11975 icode = CODE_FOR_ktestqi;
11976 mode3 = CCCmode;
11977 goto kortest;
11978
11979 case IX86_BUILTIN_KTESTZ8:
11980 icode = CODE_FOR_ktestqi;
11981 mode3 = CCZmode;
11982 goto kortest;
11983
11984 case IX86_BUILTIN_KTESTC16:
11985 icode = CODE_FOR_ktesthi;
11986 mode3 = CCCmode;
11987 goto kortest;
11988
11989 case IX86_BUILTIN_KTESTZ16:
11990 icode = CODE_FOR_ktesthi;
11991 mode3 = CCZmode;
11992 goto kortest;
11993
11994 case IX86_BUILTIN_KTESTC32:
11995 icode = CODE_FOR_ktestsi;
11996 mode3 = CCCmode;
11997 goto kortest;
11998
11999 case IX86_BUILTIN_KTESTZ32:
12000 icode = CODE_FOR_ktestsi;
12001 mode3 = CCZmode;
12002 goto kortest;
12003
12004 case IX86_BUILTIN_KTESTC64:
12005 icode = CODE_FOR_ktestdi;
12006 mode3 = CCCmode;
12007 goto kortest;
12008
12009 case IX86_BUILTIN_KTESTZ64:
12010 icode = CODE_FOR_ktestdi;
12011 mode3 = CCZmode;
12012 goto kortest;
12013
12014 case IX86_BUILTIN_KORTESTC8:
12015 icode = CODE_FOR_kortestqi;
12016 mode3 = CCCmode;
12017 goto kortest;
12018
12019 case IX86_BUILTIN_KORTESTZ8:
12020 icode = CODE_FOR_kortestqi;
12021 mode3 = CCZmode;
12022 goto kortest;
12023
12024 case IX86_BUILTIN_KORTESTC16:
12025 icode = CODE_FOR_kortesthi;
12026 mode3 = CCCmode;
12027 goto kortest;
12028
12029 case IX86_BUILTIN_KORTESTZ16:
12030 icode = CODE_FOR_kortesthi;
12031 mode3 = CCZmode;
12032 goto kortest;
12033
12034 case IX86_BUILTIN_KORTESTC32:
12035 icode = CODE_FOR_kortestsi;
12036 mode3 = CCCmode;
12037 goto kortest;
12038
12039 case IX86_BUILTIN_KORTESTZ32:
12040 icode = CODE_FOR_kortestsi;
12041 mode3 = CCZmode;
12042 goto kortest;
12043
12044 case IX86_BUILTIN_KORTESTC64:
12045 icode = CODE_FOR_kortestdi;
12046 mode3 = CCCmode;
12047 goto kortest;
12048
12049 case IX86_BUILTIN_KORTESTZ64:
12050 icode = CODE_FOR_kortestdi;
12051 mode3 = CCZmode;
12052
12053 kortest:
12054 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
12055 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
12056 op0 = expand_normal (arg0);
12057 op1 = expand_normal (arg1);
12058
12059 mode0 = insn_data[icode].operand[0].mode;
12060 mode1 = insn_data[icode].operand[1].mode;
12061
12062 if (GET_MODE (op0) != VOIDmode)
12063 op0 = force_reg (GET_MODE (op0), op0);
12064
12065 op0 = gen_lowpart (mode0, op0);
12066
12067 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12068 op0 = copy_to_mode_reg (mode0, op0);
12069
12070 if (GET_MODE (op1) != VOIDmode)
12071 op1 = force_reg (GET_MODE (op1), op1);
12072
12073 op1 = gen_lowpart (mode1, op1);
12074
12075 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12076 op1 = copy_to_mode_reg (mode1, op1);
12077
12078 target = gen_reg_rtx (QImode);
12079
12080 /* Emit kortest. */
12081 emit_insn (GEN_FCN (icode) (op0, op1));
12082 /* And use setcc to return result from flags. */
12083 ix86_expand_setcc (target, EQ,
12084 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
12085 return target;
12086
12087 case IX86_BUILTIN_GATHERSIV2DF:
12088 icode = CODE_FOR_avx2_gathersiv2df;
12089 goto gather_gen;
12090 case IX86_BUILTIN_GATHERSIV4DF:
12091 icode = CODE_FOR_avx2_gathersiv4df;
12092 goto gather_gen;
12093 case IX86_BUILTIN_GATHERDIV2DF:
12094 icode = CODE_FOR_avx2_gatherdiv2df;
12095 goto gather_gen;
12096 case IX86_BUILTIN_GATHERDIV4DF:
12097 icode = CODE_FOR_avx2_gatherdiv4df;
12098 goto gather_gen;
12099 case IX86_BUILTIN_GATHERSIV4SF:
12100 icode = CODE_FOR_avx2_gathersiv4sf;
12101 goto gather_gen;
12102 case IX86_BUILTIN_GATHERSIV8SF:
12103 icode = CODE_FOR_avx2_gathersiv8sf;
12104 goto gather_gen;
12105 case IX86_BUILTIN_GATHERDIV4SF:
12106 icode = CODE_FOR_avx2_gatherdiv4sf;
12107 goto gather_gen;
12108 case IX86_BUILTIN_GATHERDIV8SF:
12109 icode = CODE_FOR_avx2_gatherdiv8sf;
12110 goto gather_gen;
12111 case IX86_BUILTIN_GATHERSIV2DI:
12112 icode = CODE_FOR_avx2_gathersiv2di;
12113 goto gather_gen;
12114 case IX86_BUILTIN_GATHERSIV4DI:
12115 icode = CODE_FOR_avx2_gathersiv4di;
12116 goto gather_gen;
12117 case IX86_BUILTIN_GATHERDIV2DI:
12118 icode = CODE_FOR_avx2_gatherdiv2di;
12119 goto gather_gen;
12120 case IX86_BUILTIN_GATHERDIV4DI:
12121 icode = CODE_FOR_avx2_gatherdiv4di;
12122 goto gather_gen;
12123 case IX86_BUILTIN_GATHERSIV4SI:
12124 icode = CODE_FOR_avx2_gathersiv4si;
12125 goto gather_gen;
12126 case IX86_BUILTIN_GATHERSIV8SI:
12127 icode = CODE_FOR_avx2_gathersiv8si;
12128 goto gather_gen;
12129 case IX86_BUILTIN_GATHERDIV4SI:
12130 icode = CODE_FOR_avx2_gatherdiv4si;
12131 goto gather_gen;
12132 case IX86_BUILTIN_GATHERDIV8SI:
12133 icode = CODE_FOR_avx2_gatherdiv8si;
12134 goto gather_gen;
12135 case IX86_BUILTIN_GATHERALTSIV4DF:
12136 icode = CODE_FOR_avx2_gathersiv4df;
12137 goto gather_gen;
12138 case IX86_BUILTIN_GATHERALTDIV8SF:
12139 icode = CODE_FOR_avx2_gatherdiv8sf;
12140 goto gather_gen;
12141 case IX86_BUILTIN_GATHERALTSIV4DI:
12142 icode = CODE_FOR_avx2_gathersiv4di;
12143 goto gather_gen;
12144 case IX86_BUILTIN_GATHERALTDIV8SI:
12145 icode = CODE_FOR_avx2_gatherdiv8si;
12146 goto gather_gen;
12147 case IX86_BUILTIN_GATHER3SIV16SF:
12148 icode = CODE_FOR_avx512f_gathersiv16sf;
12149 goto gather_gen;
12150 case IX86_BUILTIN_GATHER3SIV8DF:
12151 icode = CODE_FOR_avx512f_gathersiv8df;
12152 goto gather_gen;
12153 case IX86_BUILTIN_GATHER3DIV16SF:
12154 icode = CODE_FOR_avx512f_gatherdiv16sf;
12155 goto gather_gen;
12156 case IX86_BUILTIN_GATHER3DIV8DF:
12157 icode = CODE_FOR_avx512f_gatherdiv8df;
12158 goto gather_gen;
12159 case IX86_BUILTIN_GATHER3SIV16SI:
12160 icode = CODE_FOR_avx512f_gathersiv16si;
12161 goto gather_gen;
12162 case IX86_BUILTIN_GATHER3SIV8DI:
12163 icode = CODE_FOR_avx512f_gathersiv8di;
12164 goto gather_gen;
12165 case IX86_BUILTIN_GATHER3DIV16SI:
12166 icode = CODE_FOR_avx512f_gatherdiv16si;
12167 goto gather_gen;
12168 case IX86_BUILTIN_GATHER3DIV8DI:
12169 icode = CODE_FOR_avx512f_gatherdiv8di;
12170 goto gather_gen;
12171 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12172 icode = CODE_FOR_avx512f_gathersiv8df;
12173 goto gather_gen;
12174 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12175 icode = CODE_FOR_avx512f_gatherdiv16sf;
12176 goto gather_gen;
12177 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12178 icode = CODE_FOR_avx512f_gathersiv8di;
12179 goto gather_gen;
12180 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12181 icode = CODE_FOR_avx512f_gatherdiv16si;
12182 goto gather_gen;
12183 case IX86_BUILTIN_GATHER3SIV2DF:
12184 icode = CODE_FOR_avx512vl_gathersiv2df;
12185 goto gather_gen;
12186 case IX86_BUILTIN_GATHER3SIV4DF:
12187 icode = CODE_FOR_avx512vl_gathersiv4df;
12188 goto gather_gen;
12189 case IX86_BUILTIN_GATHER3DIV2DF:
12190 icode = CODE_FOR_avx512vl_gatherdiv2df;
12191 goto gather_gen;
12192 case IX86_BUILTIN_GATHER3DIV4DF:
12193 icode = CODE_FOR_avx512vl_gatherdiv4df;
12194 goto gather_gen;
12195 case IX86_BUILTIN_GATHER3SIV4SF:
12196 icode = CODE_FOR_avx512vl_gathersiv4sf;
12197 goto gather_gen;
12198 case IX86_BUILTIN_GATHER3SIV8SF:
12199 icode = CODE_FOR_avx512vl_gathersiv8sf;
12200 goto gather_gen;
12201 case IX86_BUILTIN_GATHER3DIV4SF:
12202 icode = CODE_FOR_avx512vl_gatherdiv4sf;
12203 goto gather_gen;
12204 case IX86_BUILTIN_GATHER3DIV8SF:
12205 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12206 goto gather_gen;
12207 case IX86_BUILTIN_GATHER3SIV2DI:
12208 icode = CODE_FOR_avx512vl_gathersiv2di;
12209 goto gather_gen;
12210 case IX86_BUILTIN_GATHER3SIV4DI:
12211 icode = CODE_FOR_avx512vl_gathersiv4di;
12212 goto gather_gen;
12213 case IX86_BUILTIN_GATHER3DIV2DI:
12214 icode = CODE_FOR_avx512vl_gatherdiv2di;
12215 goto gather_gen;
12216 case IX86_BUILTIN_GATHER3DIV4DI:
12217 icode = CODE_FOR_avx512vl_gatherdiv4di;
12218 goto gather_gen;
12219 case IX86_BUILTIN_GATHER3SIV4SI:
12220 icode = CODE_FOR_avx512vl_gathersiv4si;
12221 goto gather_gen;
12222 case IX86_BUILTIN_GATHER3SIV8SI:
12223 icode = CODE_FOR_avx512vl_gathersiv8si;
12224 goto gather_gen;
12225 case IX86_BUILTIN_GATHER3DIV4SI:
12226 icode = CODE_FOR_avx512vl_gatherdiv4si;
12227 goto gather_gen;
12228 case IX86_BUILTIN_GATHER3DIV8SI:
12229 icode = CODE_FOR_avx512vl_gatherdiv8si;
12230 goto gather_gen;
12231 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12232 icode = CODE_FOR_avx512vl_gathersiv4df;
12233 goto gather_gen;
12234 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12235 icode = CODE_FOR_avx512vl_gatherdiv8sf;
12236 goto gather_gen;
12237 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12238 icode = CODE_FOR_avx512vl_gathersiv4di;
12239 goto gather_gen;
12240 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12241 icode = CODE_FOR_avx512vl_gatherdiv8si;
12242 goto gather_gen;
12243 case IX86_BUILTIN_SCATTERSIV16SF:
12244 icode = CODE_FOR_avx512f_scattersiv16sf;
12245 goto scatter_gen;
12246 case IX86_BUILTIN_SCATTERSIV8DF:
12247 icode = CODE_FOR_avx512f_scattersiv8df;
12248 goto scatter_gen;
12249 case IX86_BUILTIN_SCATTERDIV16SF:
12250 icode = CODE_FOR_avx512f_scatterdiv16sf;
12251 goto scatter_gen;
12252 case IX86_BUILTIN_SCATTERDIV8DF:
12253 icode = CODE_FOR_avx512f_scatterdiv8df;
12254 goto scatter_gen;
12255 case IX86_BUILTIN_SCATTERSIV16SI:
12256 icode = CODE_FOR_avx512f_scattersiv16si;
12257 goto scatter_gen;
12258 case IX86_BUILTIN_SCATTERSIV8DI:
12259 icode = CODE_FOR_avx512f_scattersiv8di;
12260 goto scatter_gen;
12261 case IX86_BUILTIN_SCATTERDIV16SI:
12262 icode = CODE_FOR_avx512f_scatterdiv16si;
12263 goto scatter_gen;
12264 case IX86_BUILTIN_SCATTERDIV8DI:
12265 icode = CODE_FOR_avx512f_scatterdiv8di;
12266 goto scatter_gen;
12267 case IX86_BUILTIN_SCATTERSIV8SF:
12268 icode = CODE_FOR_avx512vl_scattersiv8sf;
12269 goto scatter_gen;
12270 case IX86_BUILTIN_SCATTERSIV4SF:
12271 icode = CODE_FOR_avx512vl_scattersiv4sf;
12272 goto scatter_gen;
12273 case IX86_BUILTIN_SCATTERSIV4DF:
12274 icode = CODE_FOR_avx512vl_scattersiv4df;
12275 goto scatter_gen;
12276 case IX86_BUILTIN_SCATTERSIV2DF:
12277 icode = CODE_FOR_avx512vl_scattersiv2df;
12278 goto scatter_gen;
12279 case IX86_BUILTIN_SCATTERDIV8SF:
12280 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12281 goto scatter_gen;
12282 case IX86_BUILTIN_SCATTERDIV4SF:
12283 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12284 goto scatter_gen;
12285 case IX86_BUILTIN_SCATTERDIV4DF:
12286 icode = CODE_FOR_avx512vl_scatterdiv4df;
12287 goto scatter_gen;
12288 case IX86_BUILTIN_SCATTERDIV2DF:
12289 icode = CODE_FOR_avx512vl_scatterdiv2df;
12290 goto scatter_gen;
12291 case IX86_BUILTIN_SCATTERSIV8SI:
12292 icode = CODE_FOR_avx512vl_scattersiv8si;
12293 goto scatter_gen;
12294 case IX86_BUILTIN_SCATTERSIV4SI:
12295 icode = CODE_FOR_avx512vl_scattersiv4si;
12296 goto scatter_gen;
12297 case IX86_BUILTIN_SCATTERSIV4DI:
12298 icode = CODE_FOR_avx512vl_scattersiv4di;
12299 goto scatter_gen;
12300 case IX86_BUILTIN_SCATTERSIV2DI:
12301 icode = CODE_FOR_avx512vl_scattersiv2di;
12302 goto scatter_gen;
12303 case IX86_BUILTIN_SCATTERDIV8SI:
12304 icode = CODE_FOR_avx512vl_scatterdiv8si;
12305 goto scatter_gen;
12306 case IX86_BUILTIN_SCATTERDIV4SI:
12307 icode = CODE_FOR_avx512vl_scatterdiv4si;
12308 goto scatter_gen;
12309 case IX86_BUILTIN_SCATTERDIV4DI:
12310 icode = CODE_FOR_avx512vl_scatterdiv4di;
12311 goto scatter_gen;
12312 case IX86_BUILTIN_SCATTERDIV2DI:
12313 icode = CODE_FOR_avx512vl_scatterdiv2di;
12314 goto scatter_gen;
12315 case IX86_BUILTIN_GATHERPFDPD:
12316 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
12317 goto vec_prefetch_gen;
12318 case IX86_BUILTIN_SCATTERALTSIV8DF:
12319 icode = CODE_FOR_avx512f_scattersiv8df;
12320 goto scatter_gen;
12321 case IX86_BUILTIN_SCATTERALTDIV16SF:
12322 icode = CODE_FOR_avx512f_scatterdiv16sf;
12323 goto scatter_gen;
12324 case IX86_BUILTIN_SCATTERALTSIV8DI:
12325 icode = CODE_FOR_avx512f_scattersiv8di;
12326 goto scatter_gen;
12327 case IX86_BUILTIN_SCATTERALTDIV16SI:
12328 icode = CODE_FOR_avx512f_scatterdiv16si;
12329 goto scatter_gen;
12330 case IX86_BUILTIN_SCATTERALTSIV4DF:
12331 icode = CODE_FOR_avx512vl_scattersiv4df;
12332 goto scatter_gen;
12333 case IX86_BUILTIN_SCATTERALTDIV8SF:
12334 icode = CODE_FOR_avx512vl_scatterdiv8sf;
12335 goto scatter_gen;
12336 case IX86_BUILTIN_SCATTERALTSIV4DI:
12337 icode = CODE_FOR_avx512vl_scattersiv4di;
12338 goto scatter_gen;
12339 case IX86_BUILTIN_SCATTERALTDIV8SI:
12340 icode = CODE_FOR_avx512vl_scatterdiv8si;
12341 goto scatter_gen;
12342 case IX86_BUILTIN_SCATTERALTSIV2DF:
12343 icode = CODE_FOR_avx512vl_scattersiv2df;
12344 goto scatter_gen;
12345 case IX86_BUILTIN_SCATTERALTDIV4SF:
12346 icode = CODE_FOR_avx512vl_scatterdiv4sf;
12347 goto scatter_gen;
12348 case IX86_BUILTIN_SCATTERALTSIV2DI:
12349 icode = CODE_FOR_avx512vl_scattersiv2di;
12350 goto scatter_gen;
12351 case IX86_BUILTIN_SCATTERALTDIV4SI:
12352 icode = CODE_FOR_avx512vl_scatterdiv4si;
12353 goto scatter_gen;
12354 case IX86_BUILTIN_GATHERPFDPS:
12355 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
12356 goto vec_prefetch_gen;
12357 case IX86_BUILTIN_GATHERPFQPD:
12358 icode = CODE_FOR_avx512pf_gatherpfv8didf;
12359 goto vec_prefetch_gen;
12360 case IX86_BUILTIN_GATHERPFQPS:
12361 icode = CODE_FOR_avx512pf_gatherpfv8disf;
12362 goto vec_prefetch_gen;
12363 case IX86_BUILTIN_SCATTERPFDPD:
12364 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
12365 goto vec_prefetch_gen;
12366 case IX86_BUILTIN_SCATTERPFDPS:
12367 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
12368 goto vec_prefetch_gen;
12369 case IX86_BUILTIN_SCATTERPFQPD:
12370 icode = CODE_FOR_avx512pf_scatterpfv8didf;
12371 goto vec_prefetch_gen;
12372 case IX86_BUILTIN_SCATTERPFQPS:
12373 icode = CODE_FOR_avx512pf_scatterpfv8disf;
12374 goto vec_prefetch_gen;
12375
12376 gather_gen:
12377 rtx half;
12378 rtx (*gen) (rtx, rtx);
12379
12380 arg0 = CALL_EXPR_ARG (exp, 0);
12381 arg1 = CALL_EXPR_ARG (exp, 1);
12382 arg2 = CALL_EXPR_ARG (exp, 2);
12383 arg3 = CALL_EXPR_ARG (exp, 3);
12384 arg4 = CALL_EXPR_ARG (exp, 4);
12385 op0 = expand_normal (arg0);
12386 op1 = expand_normal (arg1);
12387 op2 = expand_normal (arg2);
12388 op3 = expand_normal (arg3);
12389 op4 = expand_normal (arg4);
12390 /* Note the arg order is different from the operand order. */
12391 mode0 = insn_data[icode].operand[1].mode;
12392 mode2 = insn_data[icode].operand[3].mode;
12393 mode3 = insn_data[icode].operand[4].mode;
12394 mode4 = insn_data[icode].operand[5].mode;
12395
12396 if (target == NULL_RTX
12397 || GET_MODE (target) != insn_data[icode].operand[0].mode
12398 || !insn_data[icode].operand[0].predicate (target,
12399 GET_MODE (target)))
12400 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
12401 else
12402 subtarget = target;
12403
12404 switch (fcode)
12405 {
12406 case IX86_BUILTIN_GATHER3ALTSIV8DF:
12407 case IX86_BUILTIN_GATHER3ALTSIV8DI:
12408 half = gen_reg_rtx (V8SImode);
12409 if (!nonimmediate_operand (op2, V16SImode))
12410 op2 = copy_to_mode_reg (V16SImode, op2);
12411 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12412 op2 = half;
12413 break;
12414 case IX86_BUILTIN_GATHER3ALTSIV4DF:
12415 case IX86_BUILTIN_GATHER3ALTSIV4DI:
12416 case IX86_BUILTIN_GATHERALTSIV4DF:
12417 case IX86_BUILTIN_GATHERALTSIV4DI:
12418 half = gen_reg_rtx (V4SImode);
12419 if (!nonimmediate_operand (op2, V8SImode))
12420 op2 = copy_to_mode_reg (V8SImode, op2);
12421 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12422 op2 = half;
12423 break;
12424 case IX86_BUILTIN_GATHER3ALTDIV16SF:
12425 case IX86_BUILTIN_GATHER3ALTDIV16SI:
12426 half = gen_reg_rtx (mode0);
12427 if (mode0 == V8SFmode)
12428 gen = gen_vec_extract_lo_v16sf;
12429 else
12430 gen = gen_vec_extract_lo_v16si;
12431 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12432 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12433 emit_insn (gen (half, op0));
12434 op0 = half;
12435 op3 = lowpart_subreg (QImode, op3, HImode);
12436 break;
12437 case IX86_BUILTIN_GATHER3ALTDIV8SF:
12438 case IX86_BUILTIN_GATHER3ALTDIV8SI:
12439 case IX86_BUILTIN_GATHERALTDIV8SF:
12440 case IX86_BUILTIN_GATHERALTDIV8SI:
12441 half = gen_reg_rtx (mode0);
12442 if (mode0 == V4SFmode)
12443 gen = gen_vec_extract_lo_v8sf;
12444 else
12445 gen = gen_vec_extract_lo_v8si;
12446 if (!nonimmediate_operand (op0, GET_MODE (op0)))
12447 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
12448 emit_insn (gen (half, op0));
12449 op0 = half;
12450 if (VECTOR_MODE_P (GET_MODE (op3)))
12451 {
12452 half = gen_reg_rtx (mode0);
12453 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12454 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12455 emit_insn (gen (half, op3));
12456 op3 = half;
12457 }
12458 break;
12459 default:
12460 break;
12461 }
12462
12463       /* Force the memory operand into a base register here.  We
12464 	 don't want to do that for the memory operands of other
12465 	 builtin functions.  */
12466 op1 = ix86_zero_extend_to_Pmode (op1);
12467
12468 if (!insn_data[icode].operand[1].predicate (op0, mode0))
12469 op0 = copy_to_mode_reg (mode0, op0);
12470 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
12471 op1 = copy_to_mode_reg (Pmode, op1);
12472 if (!insn_data[icode].operand[3].predicate (op2, mode2))
12473 op2 = copy_to_mode_reg (mode2, op2);
12474
12475 op3 = fixup_modeless_constant (op3, mode3);
12476
12477 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
12478 {
12479 if (!insn_data[icode].operand[4].predicate (op3, mode3))
12480 op3 = copy_to_mode_reg (mode3, op3);
12481 }
12482 else
12483 {
12484 op3 = copy_to_reg (op3);
12485 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
12486 }
12487 if (!insn_data[icode].operand[5].predicate (op4, mode4))
12488 {
12489 error ("the last argument must be scale 1, 2, 4, 8");
12490 return const0_rtx;
12491 }
12492
12493 /* Optimize. If mask is known to have all high bits set,
12494 replace op0 with pc_rtx to signal that the instruction
12495 overwrites the whole destination and doesn't use its
12496 previous contents. */
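      /* For instance, a literal all-ones integer mask, or a vector
	 constant whose elements all have their sign bit set, is
	 recognized below and lets the gather clobber the whole
	 destination.  */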
12497 if (optimize)
12498 {
12499 if (TREE_CODE (arg3) == INTEGER_CST)
12500 {
12501 if (integer_all_onesp (arg3))
12502 op0 = pc_rtx;
12503 }
12504 else if (TREE_CODE (arg3) == VECTOR_CST)
12505 {
12506 unsigned int negative = 0;
12507 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
12508 {
12509 tree cst = VECTOR_CST_ELT (arg3, i);
12510 if (TREE_CODE (cst) == INTEGER_CST
12511 && tree_int_cst_sign_bit (cst))
12512 negative++;
12513 else if (TREE_CODE (cst) == REAL_CST
12514 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
12515 negative++;
12516 }
12517 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
12518 op0 = pc_rtx;
12519 }
12520 else if (TREE_CODE (arg3) == SSA_NAME
12521 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
12522 {
12523 /* Recognize also when mask is like:
12524 __v2df src = _mm_setzero_pd ();
12525 __v2df mask = _mm_cmpeq_pd (src, src);
12526 or
12527 __v8sf src = _mm256_setzero_ps ();
12528 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12529 as that is a cheaper way to load all ones into
12530 a register than having to load a constant from
12531 memory. */
12532 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
12533 if (is_gimple_call (def_stmt))
12534 {
12535 tree fndecl = gimple_call_fndecl (def_stmt);
12536 if (fndecl
12537 && fndecl_built_in_p (fndecl, BUILT_IN_MD))
12538 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
12539 {
12540 case IX86_BUILTIN_CMPPD:
12541 case IX86_BUILTIN_CMPPS:
12542 case IX86_BUILTIN_CMPPD256:
12543 case IX86_BUILTIN_CMPPS256:
12544 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
12545 break;
12546 /* FALLTHRU */
12547 case IX86_BUILTIN_CMPEQPD:
12548 case IX86_BUILTIN_CMPEQPS:
12549 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
12550 && initializer_zerop (gimple_call_arg (def_stmt,
12551 1)))
12552 op0 = pc_rtx;
12553 break;
12554 default:
12555 break;
12556 }
12557 }
12558 }
12559 }
12560
12561 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
12562 if (! pat)
12563 return const0_rtx;
12564 emit_insn (pat);
12565
12566 switch (fcode)
12567 {
12568 case IX86_BUILTIN_GATHER3DIV16SF:
12569 if (target == NULL_RTX)
12570 target = gen_reg_rtx (V8SFmode);
12571 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
12572 break;
12573 case IX86_BUILTIN_GATHER3DIV16SI:
12574 if (target == NULL_RTX)
12575 target = gen_reg_rtx (V8SImode);
12576 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
12577 break;
12578 case IX86_BUILTIN_GATHER3DIV8SF:
12579 case IX86_BUILTIN_GATHERDIV8SF:
12580 if (target == NULL_RTX)
12581 target = gen_reg_rtx (V4SFmode);
12582 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
12583 break;
12584 case IX86_BUILTIN_GATHER3DIV8SI:
12585 case IX86_BUILTIN_GATHERDIV8SI:
12586 if (target == NULL_RTX)
12587 target = gen_reg_rtx (V4SImode);
12588 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
12589 break;
12590 default:
12591 target = subtarget;
12592 break;
12593 }
12594 return target;
12595
12596 scatter_gen:
12597 arg0 = CALL_EXPR_ARG (exp, 0);
12598 arg1 = CALL_EXPR_ARG (exp, 1);
12599 arg2 = CALL_EXPR_ARG (exp, 2);
12600 arg3 = CALL_EXPR_ARG (exp, 3);
12601 arg4 = CALL_EXPR_ARG (exp, 4);
12602 op0 = expand_normal (arg0);
12603 op1 = expand_normal (arg1);
12604 op2 = expand_normal (arg2);
12605 op3 = expand_normal (arg3);
12606 op4 = expand_normal (arg4);
12607 mode1 = insn_data[icode].operand[1].mode;
12608 mode2 = insn_data[icode].operand[2].mode;
12609 mode3 = insn_data[icode].operand[3].mode;
12610 mode4 = insn_data[icode].operand[4].mode;
12611
12612       /* A scatter instruction stores operand op3 to memory with
12613 	 indices from op2 and scale from op4 under writemask op1.
12614 	 If index operand op2 has more elements than source operand
12615 	 op3, only its low half needs to be used, and vice versa.  */
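      /* For example, IX86_BUILTIN_SCATTERALTSIV8DF pairs a V16SImode
	 index with only eight DFmode source elements, so the low V8SI
	 half of the index is extracted below before the scatter is
	 emitted.  */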
12616 switch (fcode)
12617 {
12618 case IX86_BUILTIN_SCATTERALTSIV8DF:
12619 case IX86_BUILTIN_SCATTERALTSIV8DI:
12620 half = gen_reg_rtx (V8SImode);
12621 if (!nonimmediate_operand (op2, V16SImode))
12622 op2 = copy_to_mode_reg (V16SImode, op2);
12623 emit_insn (gen_vec_extract_lo_v16si (half, op2));
12624 op2 = half;
12625 break;
12626 case IX86_BUILTIN_SCATTERALTDIV16SF:
12627 case IX86_BUILTIN_SCATTERALTDIV16SI:
12628 half = gen_reg_rtx (mode3);
12629 if (mode3 == V8SFmode)
12630 gen = gen_vec_extract_lo_v16sf;
12631 else
12632 gen = gen_vec_extract_lo_v16si;
12633 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12634 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12635 emit_insn (gen (half, op3));
12636 op3 = half;
12637 break;
12638 case IX86_BUILTIN_SCATTERALTSIV4DF:
12639 case IX86_BUILTIN_SCATTERALTSIV4DI:
12640 half = gen_reg_rtx (V4SImode);
12641 if (!nonimmediate_operand (op2, V8SImode))
12642 op2 = copy_to_mode_reg (V8SImode, op2);
12643 emit_insn (gen_vec_extract_lo_v8si (half, op2));
12644 op2 = half;
12645 break;
12646 case IX86_BUILTIN_SCATTERALTDIV8SF:
12647 case IX86_BUILTIN_SCATTERALTDIV8SI:
12648 half = gen_reg_rtx (mode3);
12649 if (mode3 == V4SFmode)
12650 gen = gen_vec_extract_lo_v8sf;
12651 else
12652 gen = gen_vec_extract_lo_v8si;
12653 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12654 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12655 emit_insn (gen (half, op3));
12656 op3 = half;
12657 break;
12658 case IX86_BUILTIN_SCATTERALTSIV2DF:
12659 case IX86_BUILTIN_SCATTERALTSIV2DI:
12660 if (!nonimmediate_operand (op2, V4SImode))
12661 op2 = copy_to_mode_reg (V4SImode, op2);
12662 break;
12663 case IX86_BUILTIN_SCATTERALTDIV4SF:
12664 case IX86_BUILTIN_SCATTERALTDIV4SI:
12665 if (!nonimmediate_operand (op3, GET_MODE (op3)))
12666 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
12667 break;
12668 default:
12669 break;
12670 }
12671
12672       /* Force the memory operand into a base register here.  We
12673 	 don't want to do that for the memory operands of other
12674 	 builtin functions.  */
12675 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
12676
12677 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12678 op0 = copy_to_mode_reg (Pmode, op0);
12679
12680 op1 = fixup_modeless_constant (op1, mode1);
12681
12682 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
12683 {
12684 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12685 op1 = copy_to_mode_reg (mode1, op1);
12686 }
12687 else
12688 {
12689 op1 = copy_to_reg (op1);
12690 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
12691 }
12692
12693 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12694 op2 = copy_to_mode_reg (mode2, op2);
12695
12696 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12697 op3 = copy_to_mode_reg (mode3, op3);
12698
12699 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12700 {
12701 error ("the last argument must be scale 1, 2, 4, 8");
12702 return const0_rtx;
12703 }
12704
12705 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12706 if (! pat)
12707 return const0_rtx;
12708
12709 emit_insn (pat);
12710 return 0;
12711
12712 vec_prefetch_gen:
12713 arg0 = CALL_EXPR_ARG (exp, 0);
12714 arg1 = CALL_EXPR_ARG (exp, 1);
12715 arg2 = CALL_EXPR_ARG (exp, 2);
12716 arg3 = CALL_EXPR_ARG (exp, 3);
12717 arg4 = CALL_EXPR_ARG (exp, 4);
12718 op0 = expand_normal (arg0);
12719 op1 = expand_normal (arg1);
12720 op2 = expand_normal (arg2);
12721 op3 = expand_normal (arg3);
12722 op4 = expand_normal (arg4);
12723 mode0 = insn_data[icode].operand[0].mode;
12724 mode1 = insn_data[icode].operand[1].mode;
12725 mode3 = insn_data[icode].operand[3].mode;
12726 mode4 = insn_data[icode].operand[4].mode;
12727
12728 op0 = fixup_modeless_constant (op0, mode0);
12729
12730 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
12731 {
12732 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12733 op0 = copy_to_mode_reg (mode0, op0);
12734 }
12735 else
12736 {
12737 op0 = copy_to_reg (op0);
12738 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
12739 }
12740
12741 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12742 op1 = copy_to_mode_reg (mode1, op1);
12743
12744       /* Force the memory operand into a base register here.  We
12745 	 don't want to do that for the memory operands of other
12746 	 builtin functions.  */
12747 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
12748
12749 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
12750 op2 = copy_to_mode_reg (Pmode, op2);
12751
12752 if (!insn_data[icode].operand[3].predicate (op3, mode3))
12753 {
12754 	  error ("the fourth argument must be scale 1, 2, 4, 8");
12755 return const0_rtx;
12756 }
12757
12758 if (!insn_data[icode].operand[4].predicate (op4, mode4))
12759 {
12760 error ("incorrect hint operand");
12761 return const0_rtx;
12762 }
12763
12764 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
12765 if (! pat)
12766 return const0_rtx;
12767
12768 emit_insn (pat);
12769
12770 return 0;
12771
12772 case IX86_BUILTIN_XABORT:
12773 icode = CODE_FOR_xabort;
12774 arg0 = CALL_EXPR_ARG (exp, 0);
12775 op0 = expand_normal (arg0);
12776 mode0 = insn_data[icode].operand[0].mode;
12777 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12778 {
12779 error ("the argument to %<xabort%> intrinsic must "
12780 "be an 8-bit immediate");
12781 return const0_rtx;
12782 }
12783 emit_insn (gen_xabort (op0));
12784 return 0;
12785
12786 case IX86_BUILTIN_RSTORSSP:
12787 case IX86_BUILTIN_CLRSSBSY:
12788 arg0 = CALL_EXPR_ARG (exp, 0);
12789 op0 = expand_normal (arg0);
12790 icode = (fcode == IX86_BUILTIN_RSTORSSP
12791 ? CODE_FOR_rstorssp
12792 : CODE_FOR_clrssbsy);
12793 if (!address_operand (op0, VOIDmode))
12794 {
12795 op1 = convert_memory_address (Pmode, op0);
12796 op0 = copy_addr_to_reg (op1);
12797 }
12798 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
12799 return 0;
12800
12801 case IX86_BUILTIN_WRSSD:
12802 case IX86_BUILTIN_WRSSQ:
12803 case IX86_BUILTIN_WRUSSD:
12804 case IX86_BUILTIN_WRUSSQ:
12805 arg0 = CALL_EXPR_ARG (exp, 0);
12806 op0 = expand_normal (arg0);
12807 arg1 = CALL_EXPR_ARG (exp, 1);
12808 op1 = expand_normal (arg1);
12809 switch (fcode)
12810 {
12811 case IX86_BUILTIN_WRSSD:
12812 icode = CODE_FOR_wrsssi;
12813 mode = SImode;
12814 break;
12815 case IX86_BUILTIN_WRSSQ:
12816 icode = CODE_FOR_wrssdi;
12817 mode = DImode;
12818 break;
12819 case IX86_BUILTIN_WRUSSD:
12820 icode = CODE_FOR_wrusssi;
12821 mode = SImode;
12822 break;
12823 case IX86_BUILTIN_WRUSSQ:
12824 icode = CODE_FOR_wrussdi;
12825 mode = DImode;
12826 break;
12827 }
12828 op0 = force_reg (mode, op0);
12829 if (!address_operand (op1, VOIDmode))
12830 {
12831 op2 = convert_memory_address (Pmode, op1);
12832 op1 = copy_addr_to_reg (op2);
12833 }
12834 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
12835 return 0;
12836
12837 default:
12838 break;
12839 }
12840
12841 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12842 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
12843 {
12844 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
12845 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
12846 target);
12847 }
12848
12849 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
12850 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
12851 {
12852 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
12853 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
12854 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
12855 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
12856 int masked = 1;
12857 machine_mode mode, wide_mode, nar_mode;
12858
12859 nar_mode = V4SFmode;
12860 mode = V16SFmode;
12861 wide_mode = V64SFmode;
12862 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
12863 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
12864
12865 switch (fcode)
12866 {
12867 case IX86_BUILTIN_4FMAPS:
12868 fcn = gen_avx5124fmaddps_4fmaddps;
12869 masked = 0;
12870 goto v4fma_expand;
12871
12872 case IX86_BUILTIN_4DPWSSD:
12873 nar_mode = V4SImode;
12874 mode = V16SImode;
12875 wide_mode = V64SImode;
12876 fcn = gen_avx5124vnniw_vp4dpwssd;
12877 masked = 0;
12878 goto v4fma_expand;
12879
12880 case IX86_BUILTIN_4DPWSSDS:
12881 nar_mode = V4SImode;
12882 mode = V16SImode;
12883 wide_mode = V64SImode;
12884 fcn = gen_avx5124vnniw_vp4dpwssds;
12885 masked = 0;
12886 goto v4fma_expand;
12887
12888 case IX86_BUILTIN_4FNMAPS:
12889 fcn = gen_avx5124fmaddps_4fnmaddps;
12890 masked = 0;
12891 goto v4fma_expand;
12892
12893 case IX86_BUILTIN_4FNMAPS_MASK:
12894 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
12895 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
12896 goto v4fma_expand;
12897
12898 case IX86_BUILTIN_4DPWSSD_MASK:
12899 nar_mode = V4SImode;
12900 mode = V16SImode;
12901 wide_mode = V64SImode;
12902 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
12903 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
12904 goto v4fma_expand;
12905
12906 case IX86_BUILTIN_4DPWSSDS_MASK:
12907 nar_mode = V4SImode;
12908 mode = V16SImode;
12909 wide_mode = V64SImode;
12910 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
12911 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
12912 goto v4fma_expand;
12913
12914 case IX86_BUILTIN_4FMAPS_MASK:
12915 {
12916 tree args[4];
12917 rtx ops[4];
12918 rtx wide_reg;
12919 rtx accum;
12920 rtx addr;
12921 rtx mem;
12922
12923 v4fma_expand:
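	    /* Pack the four narrow source operands into one wide pseudo at
	       64-byte offsets; the AVX512_4FMAPS/4VNNIW patterns consume
	       them as a block of four consecutive registers.  */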
12924 wide_reg = gen_reg_rtx (wide_mode);
12925 for (i = 0; i < 4; i++)
12926 {
12927 args[i] = CALL_EXPR_ARG (exp, i);
12928 ops[i] = expand_normal (args[i]);
12929
12930 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
12931 ops[i]);
12932 }
12933
12934 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
12935 accum = force_reg (mode, accum);
12936
12937 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
12938 addr = force_reg (Pmode, addr);
12939
12940 mem = gen_rtx_MEM (nar_mode, addr);
12941
12942 target = gen_reg_rtx (mode);
12943
12944 emit_move_insn (target, accum);
12945
12946 if (! masked)
12947 emit_insn (fcn (target, accum, wide_reg, mem));
12948 else
12949 {
12950 rtx merge, mask;
12951 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
12952
12953 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
12954
12955 if (CONST_INT_P (mask))
12956 mask = fixup_modeless_constant (mask, HImode);
12957
12958 mask = force_reg (HImode, mask);
12959
12960 if (GET_MODE (mask) != HImode)
12961 mask = gen_rtx_SUBREG (HImode, mask, 0);
12962
12963 /* If merge is 0 then we're about to emit z-masked variant. */
12964 if (const0_operand (merge, mode))
12965 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
12966 /* If merge is the same as accum then emit merge-masked variant. */
12967 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
12968 {
12969 merge = force_reg (mode, merge);
12970 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
12971 }
12972 	    /* Merging with something unknown can happen if we z-mask with -O0.  */
12973 else
12974 {
12975 target = gen_reg_rtx (mode);
12976 emit_move_insn (target, merge);
12977 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
12978 }
12979 }
12980 return target;
12981 }
12982
12983 case IX86_BUILTIN_4FNMASS:
12984 fcn = gen_avx5124fmaddps_4fnmaddss;
12985 masked = 0;
12986 goto s4fma_expand;
12987
12988 case IX86_BUILTIN_4FMASS:
12989 fcn = gen_avx5124fmaddps_4fmaddss;
12990 masked = 0;
12991 goto s4fma_expand;
12992
12993 case IX86_BUILTIN_4FNMASS_MASK:
12994 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
12995 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
12996 goto s4fma_expand;
12997
12998 case IX86_BUILTIN_4FMASS_MASK:
12999 {
13000 tree args[4];
13001 rtx ops[4];
13002 rtx wide_reg;
13003 rtx accum;
13004 rtx addr;
13005 rtx mem;
13006
13007 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
13008 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
13009
13010 s4fma_expand:
13011 mode = V4SFmode;
13012 wide_reg = gen_reg_rtx (V64SFmode);
13013 for (i = 0; i < 4; i++)
13014 {
13015 rtx tmp;
13016 args[i] = CALL_EXPR_ARG (exp, i);
13017 ops[i] = expand_normal (args[i]);
13018
13019 tmp = gen_reg_rtx (SFmode);
13020 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
13021
13022 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
13023 gen_rtx_SUBREG (V16SFmode, tmp, 0));
13024 }
13025
13026 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
13027 accum = force_reg (V4SFmode, accum);
13028
13029 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
13030 addr = force_reg (Pmode, addr);
13031
13032 mem = gen_rtx_MEM (V4SFmode, addr);
13033
13034 target = gen_reg_rtx (V4SFmode);
13035
13036 emit_move_insn (target, accum);
13037
13038 if (! masked)
13039 emit_insn (fcn (target, accum, wide_reg, mem));
13040 else
13041 {
13042 rtx merge, mask;
13043 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
13044
13045 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
13046
13047 if (CONST_INT_P (mask))
13048 mask = fixup_modeless_constant (mask, QImode);
13049
13050 mask = force_reg (QImode, mask);
13051
13052 if (GET_MODE (mask) != QImode)
13053 mask = gen_rtx_SUBREG (QImode, mask, 0);
13054
13055 /* If merge is 0 then we're about to emit z-masked variant. */
13056 if (const0_operand (merge, mode))
13057 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
13058 /* If merge is the same as accum then emit merge-masked
13059 variant. */
13060 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
13061 {
13062 merge = force_reg (mode, merge);
13063 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
13064 }
13065 	    /* Merging with something unknown can happen if we z-mask
13066 	       with -O0.  */
13067 else
13068 {
13069 target = gen_reg_rtx (mode);
13070 emit_move_insn (target, merge);
13071 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
13072 }
13073 }
13074 return target;
13075 }
13076 case IX86_BUILTIN_RDPID:
13077 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13078 target);
13079 case IX86_BUILTIN_FABSQ:
13080 case IX86_BUILTIN_COPYSIGNQ:
13081 if (!TARGET_SSE)
13082 /* Emit a normal call if SSE isn't available. */
13083 return expand_call (exp, target, ignore);
13084 /* FALLTHRU */
13085 default:
13086 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13087 }
13088 }
13089
13090 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13091 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13092 {
13093 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13094 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13095 }
13096
13097 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13098 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13099 {
13100 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13101 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13102 }
13103
13104 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13105 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13106 {
13107 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13108 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13109 }
13110
13111 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13112 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13113 {
13114 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13115 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13116 }
13117
13118 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13119 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13120 {
13121 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13122 const struct builtin_description *d = bdesc_multi_arg + i;
13123 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13124 (enum ix86_builtin_func_type)
13125 d->flag, d->comparison);
13126 }
13127
13128 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13129 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13130 {
13131 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13132 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13133 target);
13134 }
13135
13136 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13137 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
13138 {
13139 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
13140 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
13141 target);
13142 }
13143
13144 gcc_unreachable ();
13145 }
13146
13147 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13148 fill TARGET with VAL via vec_duplicate. */
13149
13150 static bool
13151 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13152 {
13153 bool ok;
13154 rtx_insn *insn;
13155 rtx dup;
13156
13157 /* First attempt to recognize VAL as-is. */
13158 dup = gen_vec_duplicate (mode, val);
13159 insn = emit_insn (gen_rtx_SET (target, dup));
13160 if (recog_memoized (insn) < 0)
13161 {
13162 rtx_insn *seq;
13163 machine_mode innermode = GET_MODE_INNER (mode);
13164 rtx reg;
13165
13166 /* If that fails, force VAL into a register. */
13167
13168 start_sequence ();
13169 reg = force_reg (innermode, val);
13170 if (GET_MODE (reg) != innermode)
13171 reg = gen_lowpart (innermode, reg);
13172 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13173 seq = get_insns ();
13174 end_sequence ();
13175 if (seq)
13176 emit_insn_before (seq, insn);
13177
13178 ok = recog_memoized (insn) >= 0;
13179 gcc_assert (ok);
13180 }
13181 return true;
13182 }
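/* Illustrative sketch of what the function above emits: broadcasting an
   SImode value into V8SImode produces

     (set (reg:V8SI target) (vec_duplicate:V8SI (reg:SI val)))

   directly; if that SET does not match an insn pattern as-is, VAL is
   first forced into a register of the element mode by a sequence
   emitted in front of the SET, and the duplicate is retried on that
   register.  */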
13183
13184 /* Get a vector mode of the same size as the original but with elements
13185 twice as wide. This is only guaranteed to apply to integral vectors. */
13186
13187 static machine_mode
13188 get_mode_wider_vector (machine_mode o)
13189 {
13190 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13191 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13192 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13193 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
13194 return n;
13195 }
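/* For example, relying on that same ordering, V16QImode yields V8HImode
   and V8HImode yields V4SImode: the same vector size with half as many,
   twice-as-wide elements.  */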
13196
13197 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13198 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13199
13200 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13201 with all elements equal to VAR. Return true if successful. */
13202
13203 static bool
13204 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
13205 rtx target, rtx val)
13206 {
13207 bool ok;
13208
13209 switch (mode)
13210 {
13211 case E_V2SImode:
13212 case E_V2SFmode:
13213 if (!mmx_ok)
13214 return false;
13215 /* FALLTHRU */
13216
13217 case E_V4DFmode:
13218 case E_V4DImode:
13219 case E_V8SFmode:
13220 case E_V8SImode:
13221 case E_V2DFmode:
13222 case E_V2DImode:
13223 case E_V4SFmode:
13224 case E_V4SImode:
13225 case E_V16SImode:
13226 case E_V8DImode:
13227 case E_V16SFmode:
13228 case E_V8DFmode:
13229 return ix86_vector_duplicate_value (mode, target, val);
13230
13231 case E_V4HImode:
13232 if (!mmx_ok)
13233 return false;
13234 if (TARGET_SSE || TARGET_3DNOW_A)
13235 {
13236 rtx x;
13237
13238 val = gen_lowpart (SImode, val);
13239 x = gen_rtx_TRUNCATE (HImode, val);
13240 x = gen_rtx_VEC_DUPLICATE (mode, x);
13241 emit_insn (gen_rtx_SET (target, x));
13242 return true;
13243 }
13244 goto widen;
13245
13246 case E_V8QImode:
13247 if (!mmx_ok)
13248 return false;
13249 goto widen;
13250
13251 case E_V8HImode:
13252 if (TARGET_AVX2)
13253 return ix86_vector_duplicate_value (mode, target, val);
13254
13255 if (TARGET_SSE2)
13256 {
13257 struct expand_vec_perm_d dperm;
13258 rtx tmp1, tmp2;
13259
13260 permute:
13261 memset (&dperm, 0, sizeof (dperm));
13262 dperm.target = target;
13263 dperm.vmode = mode;
13264 dperm.nelt = GET_MODE_NUNITS (mode);
13265 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
13266 dperm.one_operand_p = true;
13267
13268 /* Extend to SImode using a paradoxical SUBREG. */
13269 tmp1 = gen_reg_rtx (SImode);
13270 emit_move_insn (tmp1, gen_lowpart (SImode, val));
13271
13272 /* Insert the SImode value as low element of a V4SImode vector. */
13273 tmp2 = gen_reg_rtx (V4SImode);
13274 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
13275 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
13276
13277 ok = (expand_vec_perm_1 (&dperm)
13278 || expand_vec_perm_broadcast_1 (&dperm));
13279 gcc_assert (ok);
13280 return ok;
13281 }
13282 goto widen;
13283
13284 case E_V16QImode:
13285 if (TARGET_AVX2)
13286 return ix86_vector_duplicate_value (mode, target, val);
13287
13288 if (TARGET_SSE2)
13289 goto permute;
13290 goto widen;
13291
13292 widen:
13293 /* Replicate the value once into the next wider mode and recurse. */
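      /* Worked example, assuming QImode elements: to broadcast the QImode
	 value 0x12 into V8QImode, it is first widened to the HImode value
	 0x1212 via (val << 8) | val, that HImode value is broadcast into
	 V4HImode by the recursive call, and the result is reinterpreted
	 as V8QImode.  */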
13294 {
13295 machine_mode smode, wsmode, wvmode;
13296 rtx x;
13297
13298 smode = GET_MODE_INNER (mode);
13299 wvmode = get_mode_wider_vector (mode);
13300 wsmode = GET_MODE_INNER (wvmode);
13301
13302 val = convert_modes (wsmode, smode, val, true);
13303 x = expand_simple_binop (wsmode, ASHIFT, val,
13304 GEN_INT (GET_MODE_BITSIZE (smode)),
13305 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13306 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
13307
13308 x = gen_reg_rtx (wvmode);
13309 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
13310 gcc_assert (ok);
13311 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
13312 return ok;
13313 }
13314
13315 case E_V16HImode:
13316 case E_V32QImode:
13317 if (TARGET_AVX2)
13318 return ix86_vector_duplicate_value (mode, target, val);
13319 else
13320 {
13321 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
13322 rtx x = gen_reg_rtx (hvmode);
13323
13324 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13325 gcc_assert (ok);
13326
13327 x = gen_rtx_VEC_CONCAT (mode, x, x);
13328 emit_insn (gen_rtx_SET (target, x));
13329 }
13330 return true;
13331
13332 case E_V64QImode:
13333 case E_V32HImode:
13334 if (TARGET_AVX512BW)
13335 return ix86_vector_duplicate_value (mode, target, val);
13336 else
13337 {
13338 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
13339 rtx x = gen_reg_rtx (hvmode);
13340
13341 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
13342 gcc_assert (ok);
13343
13344 x = gen_rtx_VEC_CONCAT (mode, x, x);
13345 emit_insn (gen_rtx_SET (target, x));
13346 }
13347 return true;
13348
13349 default:
13350 return false;
13351 }
13352 }
13353
13354 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13355 whose ONE_VAR element is VAR, and other elements are zero. Return true
13356 if successful. */
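/* For instance, with MODE == V4SFmode and ONE_VAR == 0, when the SSE4.1
   vector-set path is not taken this boils down to a single

     (set (reg:V4SF target)
	  (vec_merge:V4SF (vec_duplicate:V4SF (reg:SF var))
			  (const_vector:V4SF [0 0 0 0])
			  (const_int 1)))

   with no shuffle needed.  */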
13357
13358 static bool
13359 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
13360 rtx target, rtx var, int one_var)
13361 {
13362 machine_mode vsimode;
13363 rtx new_target;
13364 rtx x, tmp;
13365 bool use_vector_set = false;
13366 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
13367
13368 switch (mode)
13369 {
13370 case E_V2DImode:
13371 /* For SSE4.1, we normally use vector set. But if the second
13372 element is zero and inter-unit moves are OK, we use movq
13373 instead. */
13374 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
13375 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13376 && one_var == 0));
13377 break;
13378 case E_V16QImode:
13379 case E_V4SImode:
13380 case E_V4SFmode:
13381 use_vector_set = TARGET_SSE4_1;
13382 break;
13383 case E_V8HImode:
13384 use_vector_set = TARGET_SSE2;
13385 break;
13386 case E_V4HImode:
13387 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
13388 break;
13389 case E_V32QImode:
13390 case E_V16HImode:
13391 use_vector_set = TARGET_AVX;
13392 break;
13393 case E_V8SImode:
13394 use_vector_set = TARGET_AVX;
13395 gen_vec_set_0 = gen_vec_setv8si_0;
13396 break;
13397 case E_V8SFmode:
13398 use_vector_set = TARGET_AVX;
13399 gen_vec_set_0 = gen_vec_setv8sf_0;
13400 break;
13401 case E_V4DFmode:
13402 use_vector_set = TARGET_AVX;
13403 gen_vec_set_0 = gen_vec_setv4df_0;
13404 break;
13405 case E_V4DImode:
13406 /* Use ix86_expand_vector_set in 64bit mode only. */
13407 use_vector_set = TARGET_AVX && TARGET_64BIT;
13408 gen_vec_set_0 = gen_vec_setv4di_0;
13409 break;
13410 case E_V16SImode:
13411 use_vector_set = TARGET_AVX512F && one_var == 0;
13412 gen_vec_set_0 = gen_vec_setv16si_0;
13413 break;
13414 case E_V16SFmode:
13415 use_vector_set = TARGET_AVX512F && one_var == 0;
13416 gen_vec_set_0 = gen_vec_setv16sf_0;
13417 break;
13418 case E_V8DFmode:
13419 use_vector_set = TARGET_AVX512F && one_var == 0;
13420 gen_vec_set_0 = gen_vec_setv8df_0;
13421 break;
13422 case E_V8DImode:
13423 /* Use ix86_expand_vector_set in 64bit mode only. */
13424 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
13425 gen_vec_set_0 = gen_vec_setv8di_0;
13426 break;
13427 default:
13428 break;
13429 }
13430
13431 if (use_vector_set)
13432 {
13433 if (gen_vec_set_0 && one_var == 0)
13434 {
13435 var = force_reg (GET_MODE_INNER (mode), var);
13436 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
13437 return true;
13438 }
13439 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
13440 var = force_reg (GET_MODE_INNER (mode), var);
13441 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13442 return true;
13443 }
13444
13445 switch (mode)
13446 {
13447 case E_V2SFmode:
13448 case E_V2SImode:
13449 if (!mmx_ok)
13450 return false;
13451 /* FALLTHRU */
13452
13453 case E_V2DFmode:
13454 case E_V2DImode:
13455 if (one_var != 0)
13456 return false;
13457 var = force_reg (GET_MODE_INNER (mode), var);
13458 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
13459 emit_insn (gen_rtx_SET (target, x));
13460 return true;
13461
13462 case E_V4SFmode:
13463 case E_V4SImode:
13464 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
13465 new_target = gen_reg_rtx (mode);
13466 else
13467 new_target = target;
13468 var = force_reg (GET_MODE_INNER (mode), var);
13469 x = gen_rtx_VEC_DUPLICATE (mode, var);
13470 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
13471 emit_insn (gen_rtx_SET (new_target, x));
13472 if (one_var != 0)
13473 {
13474 /* We need to shuffle the value to the correct position, so
13475 create a new pseudo to store the intermediate result. */
13476
13477 /* With SSE2, we can use the integer shuffle insns. */
13478 if (mode != V4SFmode && TARGET_SSE2)
13479 {
13480 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
13481 const1_rtx,
13482 GEN_INT (one_var == 1 ? 0 : 1),
13483 GEN_INT (one_var == 2 ? 0 : 1),
13484 GEN_INT (one_var == 3 ? 0 : 1)));
13485 if (target != new_target)
13486 emit_move_insn (target, new_target);
13487 return true;
13488 }
13489
13490 /* Otherwise convert the intermediate result to V4SFmode and
13491 use the SSE1 shuffle instructions. */
13492 if (mode != V4SFmode)
13493 {
13494 tmp = gen_reg_rtx (V4SFmode);
13495 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
13496 }
13497 else
13498 tmp = new_target;
13499
13500 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
13501 const1_rtx,
13502 GEN_INT (one_var == 1 ? 0 : 1),
13503 GEN_INT (one_var == 2 ? 0+4 : 1+4),
13504 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
13505
13506 if (mode != V4SFmode)
13507 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
13508 else if (tmp != target)
13509 emit_move_insn (target, tmp);
13510 }
13511 else if (target != new_target)
13512 emit_move_insn (target, new_target);
13513 return true;
13514
13515 case E_V8HImode:
13516 case E_V16QImode:
13517 vsimode = V4SImode;
13518 goto widen;
13519 case E_V4HImode:
13520 case E_V8QImode:
13521 if (!mmx_ok)
13522 return false;
13523 vsimode = V2SImode;
13524 goto widen;
13525 widen:
13526 if (one_var != 0)
13527 return false;
13528
13529 /* Zero extend the variable element to SImode and recurse. */
13530 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
13531
13532 x = gen_reg_rtx (vsimode);
13533 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
13534 var, one_var))
13535 gcc_unreachable ();
13536
13537 emit_move_insn (target, gen_lowpart (mode, x));
13538 return true;
13539
13540 default:
13541 return false;
13542 }
13543 }
13544
13545 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13546 consisting of the values in VALS. It is known that all elements
13547 except ONE_VAR are constants. Return true if successful. */
13548
13549 static bool
13550 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
13551 rtx target, rtx vals, int one_var)
13552 {
13553 rtx var = XVECEXP (vals, 0, one_var);
13554 machine_mode wmode;
13555 rtx const_vec, x;
13556
13557 const_vec = copy_rtx (vals);
13558 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
13559 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
13560
13561 switch (mode)
13562 {
13563 case E_V2DFmode:
13564 case E_V2DImode:
13565 case E_V2SFmode:
13566 case E_V2SImode:
13567 /* For the two-element vectors, it's just as easy to use
13568 the general case. */
13569 return false;
13570
13571 case E_V4DImode:
13572 /* Use ix86_expand_vector_set in 64bit mode only. */
13573 if (!TARGET_64BIT)
13574 return false;
13575 /* FALLTHRU */
13576 case E_V4DFmode:
13577 case E_V8SFmode:
13578 case E_V8SImode:
13579 case E_V16HImode:
13580 case E_V32QImode:
13581 case E_V4SFmode:
13582 case E_V4SImode:
13583 case E_V8HImode:
13584 case E_V4HImode:
13585 break;
13586
13587 case E_V16QImode:
13588 if (TARGET_SSE4_1)
13589 break;
13590 wmode = V8HImode;
13591 goto widen;
13592 case E_V8QImode:
13593 wmode = V4HImode;
13594 goto widen;
13595 widen:
13596 /* There's no way to set one QImode entry easily. Combine
13597 the variable value with its adjacent constant value, and
13598 promote to an HImode set. */
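      /* Worked example (x86 is little-endian, so the even-indexed byte is
	 the low byte of its HImode word): for an odd ONE_VAR the combined
	 value is (var << 8) | (adjacent_const & 0xff); for an even ONE_VAR
	 it is (adjacent_const << 8) | var.  Either way it is inserted at
	 HImode index ONE_VAR >> 1 below.  */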
13599 x = XVECEXP (vals, 0, one_var ^ 1);
13600 if (one_var & 1)
13601 {
13602 var = convert_modes (HImode, QImode, var, true);
13603 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
13604 NULL_RTX, 1, OPTAB_LIB_WIDEN);
13605 x = GEN_INT (INTVAL (x) & 0xff);
13606 }
13607 else
13608 {
13609 var = convert_modes (HImode, QImode, var, true);
13610 x = gen_int_mode (UINTVAL (x) << 8, HImode);
13611 }
13612 if (x != const0_rtx)
13613 var = expand_simple_binop (HImode, IOR, var, x, var,
13614 1, OPTAB_LIB_WIDEN);
13615
13616 x = gen_reg_rtx (wmode);
13617 emit_move_insn (x, gen_lowpart (wmode, const_vec));
13618 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
13619
13620 emit_move_insn (target, gen_lowpart (mode, x));
13621 return true;
13622
13623 default:
13624 return false;
13625 }
13626
13627 emit_move_insn (target, const_vec);
13628 ix86_expand_vector_set (mmx_ok, target, var, one_var);
13629 return true;
13630 }
13631
13632 /* A subroutine of ix86_expand_vector_init_general. Use vector
13633 concatenate to handle the most general case: all values variable,
13634 and none identical. */
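/* For example, a V4SImode build from four scalar operands works
   bottom-up: the operands are paired into two V2SImode registers via
   ix86_expand_vector_init, and those halves are then glued together
   with a single (vec_concat:V4SI ...).  Wider modes add one or two more
   concatenation levels in the same way.  */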
13635
13636 static void
13637 ix86_expand_vector_init_concat (machine_mode mode,
13638 rtx target, rtx *ops, int n)
13639 {
13640 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
13641 rtx first[16], second[8], third[4];
13642 rtvec v;
13643 int i, j;
13644
13645 switch (n)
13646 {
13647 case 2:
13648 switch (mode)
13649 {
13650 case E_V16SImode:
13651 cmode = V8SImode;
13652 break;
13653 case E_V16SFmode:
13654 cmode = V8SFmode;
13655 break;
13656 case E_V8DImode:
13657 cmode = V4DImode;
13658 break;
13659 case E_V8DFmode:
13660 cmode = V4DFmode;
13661 break;
13662 case E_V8SImode:
13663 cmode = V4SImode;
13664 break;
13665 case E_V8SFmode:
13666 cmode = V4SFmode;
13667 break;
13668 case E_V4DImode:
13669 cmode = V2DImode;
13670 break;
13671 case E_V4DFmode:
13672 cmode = V2DFmode;
13673 break;
13674 case E_V4SImode:
13675 cmode = V2SImode;
13676 break;
13677 case E_V4SFmode:
13678 cmode = V2SFmode;
13679 break;
13680 case E_V2DImode:
13681 cmode = DImode;
13682 break;
13683 case E_V2SImode:
13684 cmode = SImode;
13685 break;
13686 case E_V2DFmode:
13687 cmode = DFmode;
13688 break;
13689 case E_V2SFmode:
13690 cmode = SFmode;
13691 break;
13692 default:
13693 gcc_unreachable ();
13694 }
13695
13696 if (!register_operand (ops[1], cmode))
13697 ops[1] = force_reg (cmode, ops[1]);
13698 if (!register_operand (ops[0], cmode))
13699 ops[0] = force_reg (cmode, ops[0]);
13700 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
13701 ops[1])));
13702 break;
13703
13704 case 4:
13705 switch (mode)
13706 {
13707 case E_V4DImode:
13708 cmode = V2DImode;
13709 break;
13710 case E_V4DFmode:
13711 cmode = V2DFmode;
13712 break;
13713 case E_V4SImode:
13714 cmode = V2SImode;
13715 break;
13716 case E_V4SFmode:
13717 cmode = V2SFmode;
13718 break;
13719 default:
13720 gcc_unreachable ();
13721 }
13722 goto half;
13723
13724 case 8:
13725 switch (mode)
13726 {
13727 case E_V8DImode:
13728 cmode = V2DImode;
13729 hmode = V4DImode;
13730 break;
13731 case E_V8DFmode:
13732 cmode = V2DFmode;
13733 hmode = V4DFmode;
13734 break;
13735 case E_V8SImode:
13736 cmode = V2SImode;
13737 hmode = V4SImode;
13738 break;
13739 case E_V8SFmode:
13740 cmode = V2SFmode;
13741 hmode = V4SFmode;
13742 break;
13743 default:
13744 gcc_unreachable ();
13745 }
13746 goto half;
13747
13748 case 16:
13749 switch (mode)
13750 {
13751 case E_V16SImode:
13752 cmode = V2SImode;
13753 hmode = V4SImode;
13754 gmode = V8SImode;
13755 break;
13756 case E_V16SFmode:
13757 cmode = V2SFmode;
13758 hmode = V4SFmode;
13759 gmode = V8SFmode;
13760 break;
13761 default:
13762 gcc_unreachable ();
13763 }
13764 goto half;
13765
13766 half:
13767 /* FIXME: We process inputs backward to help RA. PR 36222. */
13768 i = n - 1;
13769 j = (n >> 1) - 1;
13770 for (; i > 0; i -= 2, j--)
13771 {
13772 first[j] = gen_reg_rtx (cmode);
13773 v = gen_rtvec (2, ops[i - 1], ops[i]);
13774 ix86_expand_vector_init (false, first[j],
13775 gen_rtx_PARALLEL (cmode, v));
13776 }
13777
13778 n >>= 1;
13779 if (n > 4)
13780 {
13781 gcc_assert (hmode != VOIDmode);
13782 gcc_assert (gmode != VOIDmode);
13783 for (i = j = 0; i < n; i += 2, j++)
13784 {
13785 second[j] = gen_reg_rtx (hmode);
13786 ix86_expand_vector_init_concat (hmode, second [j],
13787 &first [i], 2);
13788 }
13789 n >>= 1;
13790 for (i = j = 0; i < n; i += 2, j++)
13791 {
13792 third[j] = gen_reg_rtx (gmode);
13793 ix86_expand_vector_init_concat (gmode, third[j],
13794 &second[i], 2);
13795 }
13796 n >>= 1;
13797 ix86_expand_vector_init_concat (mode, target, third, n);
13798 }
13799 else if (n > 2)
13800 {
13801 gcc_assert (hmode != VOIDmode);
13802 for (i = j = 0; i < n; i += 2, j++)
13803 {
13804 second[j] = gen_reg_rtx (hmode);
13805 ix86_expand_vector_init_concat (hmode, second [j],
13806 &first [i], 2);
13807 }
13808 n >>= 1;
13809 ix86_expand_vector_init_concat (mode, target, second, n);
13810 }
13811 else
13812 ix86_expand_vector_init_concat (mode, target, first, n);
13813 break;
13814
13815 default:
13816 gcc_unreachable ();
13817 }
13818 }
13819
13820 /* A subroutine of ix86_expand_vector_init_general. Use vector
13821 interleave to handle the most general case: all values variable,
13822 and none identical. */
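/* Sketch of the V8HImode case: each pair of HImode operands is packed
   into the low 32 bits of an SSE register (the first of the pair via a
   scalar-to-vector vec_merge, its neighbour via GEN_LOAD_EVEN at
   position 1); the four resulting vectors are then combined pairwise
   with the V4SImode interleave-low pattern and finally with the
   V2DImode interleave-low pattern.  */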
13823
13824 static void
13825 ix86_expand_vector_init_interleave (machine_mode mode,
13826 rtx target, rtx *ops, int n)
13827 {
13828 machine_mode first_imode, second_imode, third_imode, inner_mode;
13829 int i, j;
13830 rtx op0, op1;
13831 rtx (*gen_load_even) (rtx, rtx, rtx);
13832 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
13833 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
13834
13835 switch (mode)
13836 {
13837 case E_V8HImode:
13838 gen_load_even = gen_vec_setv8hi;
13839 gen_interleave_first_low = gen_vec_interleave_lowv4si;
13840 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13841 inner_mode = HImode;
13842 first_imode = V4SImode;
13843 second_imode = V2DImode;
13844 third_imode = VOIDmode;
13845 break;
13846 case E_V16QImode:
13847 gen_load_even = gen_vec_setv16qi;
13848 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
13849 gen_interleave_second_low = gen_vec_interleave_lowv4si;
13850 inner_mode = QImode;
13851 first_imode = V8HImode;
13852 second_imode = V4SImode;
13853 third_imode = V2DImode;
13854 break;
13855 default:
13856 gcc_unreachable ();
13857 }
13858
13859 for (i = 0; i < n; i++)
13860 {
13861 /* Extend the odd element to SImode using a paradoxical SUBREG. */
13862 op0 = gen_reg_rtx (SImode);
13863 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
13864
13865 /* Insert the SImode value as low element of V4SImode vector. */
13866 op1 = gen_reg_rtx (V4SImode);
13867 op0 = gen_rtx_VEC_MERGE (V4SImode,
13868 gen_rtx_VEC_DUPLICATE (V4SImode,
13869 op0),
13870 CONST0_RTX (V4SImode),
13871 const1_rtx);
13872 emit_insn (gen_rtx_SET (op1, op0));
13873
13874 /* Cast the V4SImode vector back to a vector in the original mode. */
13875 op0 = gen_reg_rtx (mode);
13876 emit_move_insn (op0, gen_lowpart (mode, op1));
13877
13878 /* Load even elements into the second position. */
13879 emit_insn (gen_load_even (op0,
13880 force_reg (inner_mode,
13881 ops [i + i + 1]),
13882 const1_rtx));
13883
13884 /* Cast vector to FIRST_IMODE vector. */
13885 ops[i] = gen_reg_rtx (first_imode);
13886 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
13887 }
13888
13889 /* Interleave low FIRST_IMODE vectors. */
13890 for (i = j = 0; i < n; i += 2, j++)
13891 {
13892 op0 = gen_reg_rtx (first_imode);
13893 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
13894
13895 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13896 ops[j] = gen_reg_rtx (second_imode);
13897 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
13898 }
13899
13900 /* Interleave low SECOND_IMODE vectors. */
13901 switch (second_imode)
13902 {
13903 case E_V4SImode:
13904 for (i = j = 0; i < n / 2; i += 2, j++)
13905 {
13906 op0 = gen_reg_rtx (second_imode);
13907 emit_insn (gen_interleave_second_low (op0, ops[i],
13908 ops[i + 1]));
13909
13910 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13911 vector. */
13912 ops[j] = gen_reg_rtx (third_imode);
13913 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
13914 }
13915 second_imode = V2DImode;
13916 gen_interleave_second_low = gen_vec_interleave_lowv2di;
13917 /* FALLTHRU */
13918
13919 case E_V2DImode:
13920 op0 = gen_reg_rtx (second_imode);
13921 emit_insn (gen_interleave_second_low (op0, ops[0],
13922 ops[1]));
13923
13924 /* Cast the SECOND_IMODE vector back to a vector in the
13925 original mode. */
13926 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
13927 break;
13928
13929 default:
13930 gcc_unreachable ();
13931 }
13932 }
13933
13934 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
13935 all values variable, and none identical. */
13936
13937 static void
13938 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
13939 rtx target, rtx vals)
13940 {
13941 rtx ops[64], op0, op1, op2, op3, op4, op5;
13942 machine_mode half_mode = VOIDmode;
13943 machine_mode quarter_mode = VOIDmode;
13944 int n, i;
13945
13946 switch (mode)
13947 {
13948 case E_V2SFmode:
13949 case E_V2SImode:
13950 if (!mmx_ok && !TARGET_SSE)
13951 break;
13952 /* FALLTHRU */
13953
13954 case E_V16SImode:
13955 case E_V16SFmode:
13956 case E_V8DFmode:
13957 case E_V8DImode:
13958 case E_V8SFmode:
13959 case E_V8SImode:
13960 case E_V4DFmode:
13961 case E_V4DImode:
13962 case E_V4SFmode:
13963 case E_V4SImode:
13964 case E_V2DFmode:
13965 case E_V2DImode:
13966 n = GET_MODE_NUNITS (mode);
13967 for (i = 0; i < n; i++)
13968 ops[i] = XVECEXP (vals, 0, i);
13969 ix86_expand_vector_init_concat (mode, target, ops, n);
13970 return;
13971
13972 case E_V2TImode:
13973 for (i = 0; i < 2; i++)
13974 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13975 op0 = gen_reg_rtx (V4DImode);
13976 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
13977 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13978 return;
13979
13980 case E_V4TImode:
13981 for (i = 0; i < 4; i++)
13982 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
13983 ops[4] = gen_reg_rtx (V4DImode);
13984 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
13985 ops[5] = gen_reg_rtx (V4DImode);
13986 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
13987 op0 = gen_reg_rtx (V8DImode);
13988 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
13989 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
13990 return;
13991
13992 case E_V32QImode:
13993 half_mode = V16QImode;
13994 goto half;
13995
13996 case E_V16HImode:
13997 half_mode = V8HImode;
13998 goto half;
13999
14000 half:
14001 n = GET_MODE_NUNITS (mode);
14002 for (i = 0; i < n; i++)
14003 ops[i] = XVECEXP (vals, 0, i);
14004 op0 = gen_reg_rtx (half_mode);
14005 op1 = gen_reg_rtx (half_mode);
14006 ix86_expand_vector_init_interleave (half_mode, op0, ops,
14007 n >> 2);
14008 ix86_expand_vector_init_interleave (half_mode, op1,
14009 &ops [n >> 1], n >> 2);
14010 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
14011 return;
14012
14013 case E_V64QImode:
14014 quarter_mode = V16QImode;
14015 half_mode = V32QImode;
14016 goto quarter;
14017
14018 case E_V32HImode:
14019 quarter_mode = V8HImode;
14020 half_mode = V16HImode;
14021 goto quarter;
14022
14023 quarter:
14024 n = GET_MODE_NUNITS (mode);
14025 for (i = 0; i < n; i++)
14026 ops[i] = XVECEXP (vals, 0, i);
14027 op0 = gen_reg_rtx (quarter_mode);
14028 op1 = gen_reg_rtx (quarter_mode);
14029 op2 = gen_reg_rtx (quarter_mode);
14030 op3 = gen_reg_rtx (quarter_mode);
14031 op4 = gen_reg_rtx (half_mode);
14032 op5 = gen_reg_rtx (half_mode);
14033 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
14034 n >> 3);
14035 ix86_expand_vector_init_interleave (quarter_mode, op1,
14036 &ops [n >> 2], n >> 3);
14037 ix86_expand_vector_init_interleave (quarter_mode, op2,
14038 &ops [n >> 1], n >> 3);
14039 ix86_expand_vector_init_interleave (quarter_mode, op3,
14040 &ops [(n >> 1) | (n >> 2)], n >> 3);
14041 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
14042 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
14043 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
14044 return;
14045
14046 case E_V16QImode:
14047 if (!TARGET_SSE4_1)
14048 break;
14049 /* FALLTHRU */
14050
14051 case E_V8HImode:
14052 if (!TARGET_SSE2)
14053 break;
14054
14055 /* Don't use ix86_expand_vector_init_interleave if we can't
14056 move from GPR to SSE register directly. */
14057 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
14058 break;
14059
14060 n = GET_MODE_NUNITS (mode);
14061 for (i = 0; i < n; i++)
14062 ops[i] = XVECEXP (vals, 0, i);
14063 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
14064 return;
14065
14066 case E_V4HImode:
14067 case E_V8QImode:
14068 break;
14069
14070 default:
14071 gcc_unreachable ();
14072 }
14073
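  /* Otherwise fall back to building word_mode integers from the scalar
     elements and assembling the vector from those words.  Worked example
     (64-bit word_mode, V8QImode with bytes b0..b7): the inner loop below
     visits b7 first and b0 last, shifting the accumulator left by 8 bits
     before each IOR, so b0 ends up in the least significant byte of the
     word, i.e. in vector element 0 on this little-endian target.  */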
14074 {
14075 int i, j, n_elts, n_words, n_elt_per_word;
14076 machine_mode inner_mode;
14077 rtx words[4], shift;
14078
14079 inner_mode = GET_MODE_INNER (mode);
14080 n_elts = GET_MODE_NUNITS (mode);
14081 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
14082 n_elt_per_word = n_elts / n_words;
14083 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
14084
14085 for (i = 0; i < n_words; ++i)
14086 {
14087 rtx word = NULL_RTX;
14088
14089 for (j = 0; j < n_elt_per_word; ++j)
14090 {
14091 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
14092 elt = convert_modes (word_mode, inner_mode, elt, true);
14093
14094 if (j == 0)
14095 word = elt;
14096 else
14097 {
14098 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
14099 word, 1, OPTAB_LIB_WIDEN);
14100 word = expand_simple_binop (word_mode, IOR, word, elt,
14101 word, 1, OPTAB_LIB_WIDEN);
14102 }
14103 }
14104
14105 words[i] = word;
14106 }
14107
14108 if (n_words == 1)
14109 emit_move_insn (target, gen_lowpart (mode, words[0]));
14110 else if (n_words == 2)
14111 {
14112 rtx tmp = gen_reg_rtx (mode);
14113 emit_clobber (tmp);
14114 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
14115 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
14116 emit_move_insn (target, tmp);
14117 }
14118 else if (n_words == 4)
14119 {
14120 rtx tmp = gen_reg_rtx (V4SImode);
14121 gcc_assert (word_mode == SImode);
14122 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
14123 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
14124 emit_move_insn (target, gen_lowpart (mode, tmp));
14125 }
14126 else
14127 gcc_unreachable ();
14128 }
14129 }
14130
14131 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14132 instructions unless MMX_OK is true. */
14133
14134 void
14135 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
14136 {
14137 machine_mode mode = GET_MODE (target);
14138 machine_mode inner_mode = GET_MODE_INNER (mode);
14139 int n_elts = GET_MODE_NUNITS (mode);
14140 int n_var = 0, one_var = -1;
14141 bool all_same = true, all_const_zero = true;
14142 int i;
14143 rtx x;
14144
14145 /* Handle the case of initialization from vector elts first. */
14146 if (n_elts != XVECLEN (vals, 0))
14147 {
14148 rtx subtarget = target;
14149 x = XVECEXP (vals, 0, 0);
14150 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
14151 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
14152 {
14153 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
14154 if (inner_mode == QImode || inner_mode == HImode)
14155 {
14156 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
14157 mode = mode_for_vector (SImode, n_bits / 4).require ();
14158 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
14159 ops[0] = gen_lowpart (inner_mode, ops[0]);
14160 ops[1] = gen_lowpart (inner_mode, ops[1]);
14161 subtarget = gen_reg_rtx (mode);
14162 }
14163 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
14164 if (subtarget != target)
14165 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
14166 return;
14167 }
14168 gcc_unreachable ();
14169 }
14170
14171 for (i = 0; i < n_elts; ++i)
14172 {
14173 x = XVECEXP (vals, 0, i);
14174 if (!(CONST_SCALAR_INT_P (x)
14175 || CONST_DOUBLE_P (x)
14176 || CONST_FIXED_P (x)))
14177 n_var++, one_var = i;
14178 else if (x != CONST0_RTX (inner_mode))
14179 all_const_zero = false;
14180 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
14181 all_same = false;
14182 }
14183
14184 /* Constants are best loaded from the constant pool. */
14185 if (n_var == 0)
14186 {
14187 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
14188 return;
14189 }
14190
14191 /* If all values are identical, broadcast the value. */
14192 if (all_same
14193 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
14194 XVECEXP (vals, 0, 0)))
14195 return;
14196
14197 /* Values where only one field is non-constant are best loaded from
14198 the pool and overwritten via move later. */
14199 if (n_var == 1)
14200 {
14201 if (all_const_zero
14202 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
14203 XVECEXP (vals, 0, one_var),
14204 one_var))
14205 return;
14206
14207 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
14208 return;
14209 }
14210
14211 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
14212 }
14213
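/* Set the ELT'th element of vector TARGET to VAL, leaving the other
   elements unchanged.  MMX_OK is true if MMX instructions may be used.
   If no suitable insn sequence is available, fall back to spilling the
   vector to a stack temporary and storing into it.  */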
14214 void
14215 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
14216 {
14217 machine_mode mode = GET_MODE (target);
14218 machine_mode inner_mode = GET_MODE_INNER (mode);
14219 machine_mode half_mode;
14220 bool use_vec_merge = false;
14221 rtx tmp;
14222 static rtx (*gen_extract[6][2]) (rtx, rtx)
14223 = {
14224 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
14225 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
14226 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
14227 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
14228 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
14229 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
14230 };
14231 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
14232 = {
14233 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
14234 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
14235 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
14236 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
14237 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
14238 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
14239 };
14240 int i, j, n;
14241 machine_mode mmode = VOIDmode;
14242 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
14243
14244 switch (mode)
14245 {
14246 case E_V2SImode:
14247 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14248 if (use_vec_merge)
14249 break;
14250 /* FALLTHRU */
14251
14252 case E_V2SFmode:
14253 if (mmx_ok)
14254 {
14255 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14256 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
14257 if (elt == 0)
14258 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14259 else
14260 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14261 emit_insn (gen_rtx_SET (target, tmp));
14262 return;
14263 }
14264 break;
14265
14266 case E_V2DImode:
14267 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
14268 if (use_vec_merge)
14269 break;
14270
14271 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
14272 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
14273 if (elt == 0)
14274 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
14275 else
14276 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
14277 emit_insn (gen_rtx_SET (target, tmp));
14278 return;
14279
14280 case E_V2DFmode:
14281 /* NB: For ELT == 0, use standard scalar operation patterns which
14282 preserve the rest of the vector for the combiner:
14283
14284 (vec_merge:V2DF
14285 (vec_duplicate:V2DF (reg:DF))
14286 (reg:V2DF)
14287 (const_int 1))
14288 */
14289 if (elt == 0)
14290 goto do_vec_merge;
14291
14292 {
14293 rtx op0, op1;
14294
14295 /* For the two-element vectors, we implement a VEC_CONCAT with
14296 the extraction of the other element. */
14297
14298 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
14299 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
14300
14301 if (elt == 0)
14302 op0 = val, op1 = tmp;
14303 else
14304 op0 = tmp, op1 = val;
14305
14306 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
14307 emit_insn (gen_rtx_SET (target, tmp));
14308 }
14309 return;
14310
14311 case E_V4SFmode:
14312 use_vec_merge = TARGET_SSE4_1;
14313 if (use_vec_merge)
14314 break;
14315
14316 switch (elt)
14317 {
14318 case 0:
14319 use_vec_merge = true;
14320 break;
14321
14322 case 1:
14323 /* tmp = target = A B C D */
14324 tmp = copy_to_reg (target);
14325 /* target = A A B B */
14326 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
14327 /* target = X A B B */
14328 ix86_expand_vector_set (false, target, val, 0);
14329 /* target = A X C D */
14330 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14331 const1_rtx, const0_rtx,
14332 GEN_INT (2+4), GEN_INT (3+4)));
14333 return;
14334
14335 case 2:
14336 /* tmp = target = A B C D */
14337 tmp = copy_to_reg (target);
14338 /* tmp = X B C D */
14339 ix86_expand_vector_set (false, tmp, val, 0);
14340 /* target = A B X D */
14341 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14342 const0_rtx, const1_rtx,
14343 GEN_INT (0+4), GEN_INT (3+4)));
14344 return;
14345
14346 case 3:
14347 /* tmp = target = A B C D */
14348 tmp = copy_to_reg (target);
14349 /* tmp = X B C D */
14350 ix86_expand_vector_set (false, tmp, val, 0);
14351 /* target = A B C X */
14352 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
14353 const0_rtx, const1_rtx,
14354 GEN_INT (2+4), GEN_INT (0+4)));
14355 return;
14356
14357 default:
14358 gcc_unreachable ();
14359 }
14360 break;
14361
14362 case E_V4SImode:
14363 use_vec_merge = TARGET_SSE4_1;
14364 if (use_vec_merge)
14365 break;
14366
14367 /* Element 0 handled by vec_merge below. */
14368 if (elt == 0)
14369 {
14370 use_vec_merge = true;
14371 break;
14372 }
14373
14374 if (TARGET_SSE2)
14375 {
14376 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14377 store into element 0, then shuffle them back. */
14378
14379 rtx order[4];
14380
14381 order[0] = GEN_INT (elt);
14382 order[1] = const1_rtx;
14383 order[2] = const2_rtx;
14384 order[3] = GEN_INT (3);
14385 order[elt] = const0_rtx;
14386
14387 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14388 order[1], order[2], order[3]));
14389
14390 ix86_expand_vector_set (false, target, val, 0);
14391
14392 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
14393 order[1], order[2], order[3]));
14394 }
14395 else
14396 {
14397 /* For SSE1, we have to reuse the V4SF code. */
14398 rtx t = gen_reg_rtx (V4SFmode);
14399 emit_move_insn (t, gen_lowpart (V4SFmode, target));
14400 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
14401 emit_move_insn (target, gen_lowpart (mode, t));
14402 }
14403 return;
14404
14405 case E_V8HImode:
14406 use_vec_merge = TARGET_SSE2;
14407 break;
14408 case E_V4HImode:
14409 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14410 break;
14411
14412 case E_V16QImode:
14413 use_vec_merge = TARGET_SSE4_1;
14414 break;
14415
14416 case E_V8QImode:
14417 use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
14418 break;
14419
14420 case E_V32QImode:
14421 half_mode = V16QImode;
14422 j = 0;
14423 n = 16;
14424 goto half;
14425
14426 case E_V16HImode:
14427 half_mode = V8HImode;
14428 j = 1;
14429 n = 8;
14430 goto half;
14431
14432 case E_V8SImode:
14433 half_mode = V4SImode;
14434 j = 2;
14435 n = 4;
14436 goto half;
14437
14438 case E_V4DImode:
14439 half_mode = V2DImode;
14440 j = 3;
14441 n = 2;
14442 goto half;
14443
14444 case E_V8SFmode:
14445 half_mode = V4SFmode;
14446 j = 4;
14447 n = 4;
14448 goto half;
14449
14450 case E_V4DFmode:
14451 half_mode = V2DFmode;
14452 j = 5;
14453 n = 2;
14454 goto half;
14455
14456 half:
14457 /* Compute offset. */
14458 i = elt / n;
14459 elt %= n;
14460
14461 gcc_assert (i <= 1);
14462
14463 /* Extract the half. */
14464 tmp = gen_reg_rtx (half_mode);
14465 emit_insn (gen_extract[j][i] (tmp, target));
14466
14467 /* Put val in tmp at elt. */
14468 ix86_expand_vector_set (false, tmp, val, elt);
14469
14470 /* Put it back. */
14471 emit_insn (gen_insert[j][i] (target, target, tmp));
14472 return;
14473
14474 case E_V8DFmode:
14475 if (TARGET_AVX512F)
14476 {
14477 mmode = QImode;
14478 gen_blendm = gen_avx512f_blendmv8df;
14479 }
14480 break;
14481
14482 case E_V8DImode:
14483 if (TARGET_AVX512F)
14484 {
14485 mmode = QImode;
14486 gen_blendm = gen_avx512f_blendmv8di;
14487 }
14488 break;
14489
14490 case E_V16SFmode:
14491 if (TARGET_AVX512F)
14492 {
14493 mmode = HImode;
14494 gen_blendm = gen_avx512f_blendmv16sf;
14495 }
14496 break;
14497
14498 case E_V16SImode:
14499 if (TARGET_AVX512F)
14500 {
14501 mmode = HImode;
14502 gen_blendm = gen_avx512f_blendmv16si;
14503 }
14504 break;
14505
14506 case E_V32HImode:
14507 if (TARGET_AVX512BW)
14508 {
14509 mmode = SImode;
14510 gen_blendm = gen_avx512bw_blendmv32hi;
14511 }
14512 else if (TARGET_AVX512F)
14513 {
14514 half_mode = E_V8HImode;
14515 n = 8;
14516 goto quarter;
14517 }
14518 break;
14519
14520 case E_V64QImode:
14521 if (TARGET_AVX512BW)
14522 {
14523 mmode = DImode;
14524 gen_blendm = gen_avx512bw_blendmv64qi;
14525 }
14526 else if (TARGET_AVX512F)
14527 {
14528 half_mode = E_V16QImode;
14529 n = 16;
14530 goto quarter;
14531 }
14532 break;
14533
14534 quarter:
14535 /* Compute offset. */
14536 i = elt / n;
14537 elt %= n;
14538
14539 gcc_assert (i <= 3);
14540
14541 {
14542 /* Extract the quarter. */
14543 tmp = gen_reg_rtx (V4SImode);
14544 rtx tmp2 = gen_lowpart (V16SImode, target);
14545 rtx mask = gen_reg_rtx (QImode);
14546
14547 emit_move_insn (mask, constm1_rtx);
14548 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
14549 tmp, mask));
14550
14551 tmp2 = gen_reg_rtx (half_mode);
14552 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
14553 tmp = tmp2;
14554
14555 /* Put val in tmp at elt. */
14556 ix86_expand_vector_set (false, tmp, val, elt);
14557
14558 /* Put it back. */
14559 tmp2 = gen_reg_rtx (V16SImode);
14560 rtx tmp3 = gen_lowpart (V16SImode, target);
14561 mask = gen_reg_rtx (HImode);
14562 emit_move_insn (mask, constm1_rtx);
14563 tmp = gen_lowpart (V4SImode, tmp);
14564 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
14565 tmp3, mask));
14566 emit_move_insn (target, gen_lowpart (mode, tmp2));
14567 }
14568 return;
14569
14570 default:
14571 break;
14572 }
14573
14574 if (mmode != VOIDmode)
14575 {
14576 tmp = gen_reg_rtx (mode);
14577 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
14578 /* The avx512*_blendm<mode> expanders have different operand order
14579 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14580 elements where the mask is set and second input operand otherwise,
14581 in {sse,avx}*_*blend* the first input operand is used for elements
14582 where the mask is clear and second input operand otherwise. */
14583 emit_insn (gen_blendm (target, target, tmp,
14584 force_reg (mmode,
14585 gen_int_mode (HOST_WIDE_INT_1U << elt,
14586 mmode))));
14587 }
14588 else if (use_vec_merge)
14589 {
14590 do_vec_merge:
14591 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
14592 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
14593 GEN_INT (HOST_WIDE_INT_1U << elt));
14594 emit_insn (gen_rtx_SET (target, tmp));
14595 }
14596 else
14597 {
14598 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14599
14600 emit_move_insn (mem, target);
14601
14602 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
14603 emit_move_insn (tmp, val);
14604
14605 emit_move_insn (target, mem);
14606 }
14607 }
14608
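/* Extract the ELT'th element of vector VEC into TARGET.  MMX_OK is true
   if MMX instructions may be used.  As a last resort the vector is
   spilled to a stack temporary and the element is loaded from it.  */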
14609 void
14610 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
14611 {
14612 machine_mode mode = GET_MODE (vec);
14613 machine_mode inner_mode = GET_MODE_INNER (mode);
14614 bool use_vec_extr = false;
14615 rtx tmp;
14616
14617 switch (mode)
14618 {
14619 case E_V2SImode:
14620 case E_V2SFmode:
14621 if (!mmx_ok)
14622 break;
14623 /* FALLTHRU */
14624
14625 case E_V2DFmode:
14626 case E_V2DImode:
14627 case E_V2TImode:
14628 case E_V4TImode:
14629 use_vec_extr = true;
14630 break;
14631
14632 case E_V4SFmode:
14633 use_vec_extr = TARGET_SSE4_1;
14634 if (use_vec_extr)
14635 break;
14636
14637 switch (elt)
14638 {
14639 case 0:
14640 tmp = vec;
14641 break;
14642
14643 case 1:
14644 case 3:
14645 tmp = gen_reg_rtx (mode);
14646 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
14647 GEN_INT (elt), GEN_INT (elt),
14648 GEN_INT (elt+4), GEN_INT (elt+4)));
14649 break;
14650
14651 case 2:
14652 tmp = gen_reg_rtx (mode);
14653 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
14654 break;
14655
14656 default:
14657 gcc_unreachable ();
14658 }
14659 vec = tmp;
14660 use_vec_extr = true;
14661 elt = 0;
14662 break;
14663
14664 case E_V4SImode:
14665 use_vec_extr = TARGET_SSE4_1;
14666 if (use_vec_extr)
14667 break;
14668
14669 if (TARGET_SSE2)
14670 {
14671 switch (elt)
14672 {
14673 case 0:
14674 tmp = vec;
14675 break;
14676
14677 case 1:
14678 case 3:
14679 tmp = gen_reg_rtx (mode);
14680 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
14681 GEN_INT (elt), GEN_INT (elt),
14682 GEN_INT (elt), GEN_INT (elt)));
14683 break;
14684
14685 case 2:
14686 tmp = gen_reg_rtx (mode);
14687 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
14688 break;
14689
14690 default:
14691 gcc_unreachable ();
14692 }
14693 vec = tmp;
14694 use_vec_extr = true;
14695 elt = 0;
14696 }
14697 else
14698 {
14699 /* For SSE1, we have to reuse the V4SF code. */
14700 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
14701 gen_lowpart (V4SFmode, vec), elt);
14702 return;
14703 }
14704 break;
14705
14706 case E_V8HImode:
14707 use_vec_extr = TARGET_SSE2;
14708 break;
14709 case E_V4HImode:
14710 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
14711 break;
14712
14713 case E_V16QImode:
14714 use_vec_extr = TARGET_SSE4_1;
14715 if (!use_vec_extr
14716 && TARGET_SSE2
14717 && elt == 0
14718 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
14719 {
14720 tmp = gen_reg_rtx (SImode);
14721 ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
14722 0);
14723 emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
14724 return;
14725 }
14726 break;
14727
14728 case E_V8SFmode:
14729 if (TARGET_AVX)
14730 {
14731 tmp = gen_reg_rtx (V4SFmode);
14732 if (elt < 4)
14733 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
14734 else
14735 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
14736 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14737 return;
14738 }
14739 break;
14740
14741 case E_V4DFmode:
14742 if (TARGET_AVX)
14743 {
14744 tmp = gen_reg_rtx (V2DFmode);
14745 if (elt < 2)
14746 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
14747 else
14748 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
14749 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14750 return;
14751 }
14752 break;
14753
14754 case E_V32QImode:
14755 if (TARGET_AVX)
14756 {
14757 tmp = gen_reg_rtx (V16QImode);
14758 if (elt < 16)
14759 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
14760 else
14761 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
14762 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14763 return;
14764 }
14765 break;
14766
14767 case E_V16HImode:
14768 if (TARGET_AVX)
14769 {
14770 tmp = gen_reg_rtx (V8HImode);
14771 if (elt < 8)
14772 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
14773 else
14774 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
14775 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14776 return;
14777 }
14778 break;
14779
14780 case E_V8SImode:
14781 if (TARGET_AVX)
14782 {
14783 tmp = gen_reg_rtx (V4SImode);
14784 if (elt < 4)
14785 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
14786 else
14787 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
14788 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14789 return;
14790 }
14791 break;
14792
14793 case E_V4DImode:
14794 if (TARGET_AVX)
14795 {
14796 tmp = gen_reg_rtx (V2DImode);
14797 if (elt < 2)
14798 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
14799 else
14800 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
14801 ix86_expand_vector_extract (false, target, tmp, elt & 1);
14802 return;
14803 }
14804 break;
14805
14806 case E_V32HImode:
14807 if (TARGET_AVX512BW)
14808 {
14809 tmp = gen_reg_rtx (V16HImode);
14810 if (elt < 16)
14811 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
14812 else
14813 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
14814 ix86_expand_vector_extract (false, target, tmp, elt & 15);
14815 return;
14816 }
14817 break;
14818
14819 case E_V64QImode:
14820 if (TARGET_AVX512BW)
14821 {
14822 tmp = gen_reg_rtx (V32QImode);
14823 if (elt < 32)
14824 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
14825 else
14826 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
14827 ix86_expand_vector_extract (false, target, tmp, elt & 31);
14828 return;
14829 }
14830 break;
14831
14832 case E_V16SFmode:
14833 tmp = gen_reg_rtx (V8SFmode);
14834 if (elt < 8)
14835 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
14836 else
14837 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
14838 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14839 return;
14840
14841 case E_V8DFmode:
14842 tmp = gen_reg_rtx (V4DFmode);
14843 if (elt < 4)
14844 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
14845 else
14846 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
14847 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14848 return;
14849
14850 case E_V16SImode:
14851 tmp = gen_reg_rtx (V8SImode);
14852 if (elt < 8)
14853 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
14854 else
14855 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
14856 ix86_expand_vector_extract (false, target, tmp, elt & 7);
14857 return;
14858
14859 case E_V8DImode:
14860 tmp = gen_reg_rtx (V4DImode);
14861 if (elt < 4)
14862 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
14863 else
14864 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
14865 ix86_expand_vector_extract (false, target, tmp, elt & 3);
14866 return;
14867
14868 case E_V8QImode:
14869 /* ??? Could extract the appropriate HImode element and shift. */
14870 default:
14871 break;
14872 }
14873
14874 if (use_vec_extr)
14875 {
14876 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
14877 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
14878
14879 /* Let the rtl optimizers know about the zero extension performed. */
14880 if (inner_mode == QImode || inner_mode == HImode)
14881 {
14882 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
14883 target = gen_lowpart (SImode, target);
14884 }
14885
14886 emit_insn (gen_rtx_SET (target, tmp));
14887 }
14888 else
14889 {
14890 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
14891
14892 emit_move_insn (mem, vec);
14893
14894 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
14895 emit_move_insn (target, tmp);
14896 }
14897 }
14898
14899 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14900 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14901 The upper bits of DEST are undefined, though they shouldn't cause
14902 exceptions (some bits from src or all zeros are ok). */
14903
14904 static void
14905 emit_reduc_half (rtx dest, rtx src, int i)
14906 {
14907 rtx tem, d = dest;
14908 switch (GET_MODE (src))
14909 {
14910 case E_V4SFmode:
14911 if (i == 128)
14912 tem = gen_sse_movhlps (dest, src, src);
14913 else
14914 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
14915 GEN_INT (1 + 4), GEN_INT (1 + 4));
14916 break;
14917 case E_V2DFmode:
14918 tem = gen_vec_interleave_highv2df (dest, src, src);
14919 break;
14920 case E_V16QImode:
14921 case E_V8HImode:
14922 case E_V4SImode:
14923 case E_V2DImode:
14924 d = gen_reg_rtx (V1TImode);
14925 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
14926 GEN_INT (i / 2));
14927 break;
14928 case E_V8SFmode:
14929 if (i == 256)
14930 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
14931 else
14932 tem = gen_avx_shufps256 (dest, src, src,
14933 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
14934 break;
14935 case E_V4DFmode:
14936 if (i == 256)
14937 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
14938 else
14939 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
14940 break;
14941 case E_V32QImode:
14942 case E_V16HImode:
14943 case E_V8SImode:
14944 case E_V4DImode:
14945 if (i == 256)
14946 {
14947 if (GET_MODE (dest) != V4DImode)
14948 d = gen_reg_rtx (V4DImode);
14949 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
14950 gen_lowpart (V4DImode, src),
14951 const1_rtx);
14952 }
14953 else
14954 {
14955 d = gen_reg_rtx (V2TImode);
14956 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
14957 GEN_INT (i / 2));
14958 }
14959 break;
14960 case E_V64QImode:
14961 case E_V32HImode:
14962 case E_V16SImode:
14963 case E_V16SFmode:
14964 case E_V8DImode:
14965 case E_V8DFmode:
14966 if (i > 128)
14967 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
14968 gen_lowpart (V16SImode, src),
14969 gen_lowpart (V16SImode, src),
14970 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
14971 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
14972 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
14973 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
14974 GEN_INT (0xC), GEN_INT (0xD),
14975 GEN_INT (0xE), GEN_INT (0xF),
14976 GEN_INT (0x10), GEN_INT (0x11),
14977 GEN_INT (0x12), GEN_INT (0x13),
14978 GEN_INT (0x14), GEN_INT (0x15),
14979 GEN_INT (0x16), GEN_INT (0x17));
14980 else
14981 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
14982 gen_lowpart (V16SImode, src),
14983 GEN_INT (i == 128 ? 0x2 : 0x1),
14984 GEN_INT (0x3),
14985 GEN_INT (0x3),
14986 GEN_INT (0x3),
14987 GEN_INT (i == 128 ? 0x6 : 0x5),
14988 GEN_INT (0x7),
14989 GEN_INT (0x7),
14990 GEN_INT (0x7),
14991 GEN_INT (i == 128 ? 0xA : 0x9),
14992 GEN_INT (0xB),
14993 GEN_INT (0xB),
14994 GEN_INT (0xB),
14995 GEN_INT (i == 128 ? 0xE : 0xD),
14996 GEN_INT (0xF),
14997 GEN_INT (0xF),
14998 GEN_INT (0xF));
14999 break;
15000 default:
15001 gcc_unreachable ();
15002 }
15003 emit_insn (tem);
15004 if (d != dest)
15005 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
15006 }
15007
15008 /* Expand a vector reduction. FN is the binary pattern to reduce;
15009 DEST is the destination; IN is the input vector. */
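/* For example, a V4SImode maximum reduction takes two halving steps:
   the first emit_reduc_half moves elements 2 and 3 over elements 0 and 1
   and FN combines them, the second moves element 1 over element 0 and FN
   combines again, leaving the result in element 0 of DEST (the remaining
   elements are unspecified).  */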
15010
15011 void
15012 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15013 {
15014 rtx half, dst, vec = in;
15015 machine_mode mode = GET_MODE (in);
15016 int i;
15017
15018 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15019 if (TARGET_SSE4_1
15020 && mode == V8HImode
15021 && fn == gen_uminv8hi3)
15022 {
15023 emit_insn (gen_sse4_1_phminposuw (dest, in));
15024 return;
15025 }
15026
15027 for (i = GET_MODE_BITSIZE (mode);
15028 i > GET_MODE_UNIT_BITSIZE (mode);
15029 i >>= 1)
15030 {
15031 half = gen_reg_rtx (mode);
15032 emit_reduc_half (half, vec, i);
15033 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15034 dst = dest;
15035 else
15036 dst = gen_reg_rtx (mode);
15037 emit_insn (fn (dst, half, vec));
15038 vec = dst;
15039 }
15040 }
15041
15042 /* Output code to perform a conditional jump to LABEL, if C2 flag in
15043 FP status register is set. */
15044
15045 void
15046 ix86_emit_fp_unordered_jump (rtx label)
15047 {
15048 rtx reg = gen_reg_rtx (HImode);
15049 rtx_insn *insn;
15050 rtx temp;
15051
15052 emit_insn (gen_x86_fnstsw_1 (reg));
15053
15054 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15055 {
15056 emit_insn (gen_x86_sahf_1 (reg));
15057
15058 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15059 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15060 }
15061 else
15062 {
15063 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15064
15065 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15066 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15067 }
15068
15069 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15070 gen_rtx_LABEL_REF (VOIDmode, label),
15071 pc_rtx);
15072 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15073 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15074 JUMP_LABEL (insn) = label;
15075 }
15076
15077 /* Output code to perform a sinh XFmode calculation. */
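/* The identity used below, with e1 = expm1 (|x|) = e^|x| - 1:

     sinh (|x|) = (e^|x| - e^-|x|) / 2
                = (e1 + 1 - 1/(e1 + 1)) / 2
                = (e1 / (e1 + 1) + e1) / 2

   The sign of x is then reapplied from the fxam result.  Going through
   expm1 keeps the computation accurate for small |x|.  */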
15078
15079 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15080 {
15081 rtx e1 = gen_reg_rtx (XFmode);
15082 rtx e2 = gen_reg_rtx (XFmode);
15083 rtx scratch = gen_reg_rtx (HImode);
15084 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15085 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15086 rtx cst1, tmp;
15087 rtx_code_label *jump_label = gen_label_rtx ();
15088 rtx_insn *insn;
15089
15090 /* scratch = fxam (op1) */
15091 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15092
15093 /* e1 = expm1 (|op1|) */
15094 emit_insn (gen_absxf2 (e2, op1));
15095 emit_insn (gen_expm1xf2 (e1, e2));
15096
15097 /* e2 = e1 / (e1 + 1.0) + e1 */
15098 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15099 emit_insn (gen_addxf3 (e2, e1, cst1));
15100 emit_insn (gen_divxf3 (e2, e1, e2));
15101 emit_insn (gen_addxf3 (e2, e2, e1));
15102
15103 /* flags = signbit (op1) */
15104 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15105
15106 /* if (flags) then e2 = -e2 */
15107 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15108 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15109 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15110 pc_rtx);
15111 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15112 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15113 JUMP_LABEL (insn) = jump_label;
15114
15115 emit_insn (gen_negxf2 (e2, e2));
15116
15117 emit_label (jump_label);
15118 LABEL_NUSES (jump_label) = 1;
15119
15120 /* op0 = 0.5 * e2 */
15121 half = force_reg (XFmode, half);
15122 emit_insn (gen_mulxf3 (op0, e2, half));
15123 }
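
/* Editorial note on the identity used above: with e = expm1 (|x|),

     2 * sinh (|x|) = (e + 1) - 1 / (e + 1) = e / (e + 1) + e,

   so sinh (x) = copysign (0.5 * (e / (e + 1) + e), x), which is what the
   conditional negation and the final multiply by 0.5 compute.  Using
   expm1 rather than exp avoids cancellation for small |x|.  */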
15124
15125 /* Output code to perform a cosh XFmode calculation. */
15126
15127 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15128 {
15129 rtx e1 = gen_reg_rtx (XFmode);
15130 rtx e2 = gen_reg_rtx (XFmode);
15131 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15132 rtx cst1;
15133
15134 /* e1 = exp (op1) */
15135 emit_insn (gen_expxf2 (e1, op1));
15136
15137 /* e2 = e1 + 1.0 / e1 */
15138 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15139 emit_insn (gen_divxf3 (e2, cst1, e1));
15140 emit_insn (gen_addxf3 (e2, e1, e2));
15141
15142 /* op0 = 0.5 * e2 */
15143 half = force_reg (XFmode, half);
15144 emit_insn (gen_mulxf3 (op0, e2, half));
15145 }
15146
15147 /* Output code to perform a tanh XFmode calculation. */
15148
15149 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15150 {
15151 rtx e1 = gen_reg_rtx (XFmode);
15152 rtx e2 = gen_reg_rtx (XFmode);
15153 rtx scratch = gen_reg_rtx (HImode);
15154 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15155 rtx cst2, tmp;
15156 rtx_code_label *jump_label = gen_label_rtx ();
15157 rtx_insn *insn;
15158
15159 /* scratch = fxam (op1) */
15160 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15161
15162 /* e1 = expm1 (-|2 * op1|) */
15163 emit_insn (gen_addxf3 (e2, op1, op1));
15164 emit_insn (gen_absxf2 (e2, e2));
15165 emit_insn (gen_negxf2 (e2, e2));
15166 emit_insn (gen_expm1xf2 (e1, e2));
15167
15168 /* e2 = e1 / (e1 + 2.0) */
15169 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15170 emit_insn (gen_addxf3 (e2, e1, cst2));
15171 emit_insn (gen_divxf3 (e2, e1, e2));
15172
15173 /* flags = signbit (op1) */
15174 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15175
15176 /* if (!flags) then e2 = -e2 */
15177 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15178 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15179 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15180 pc_rtx);
15181 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15182 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15183 JUMP_LABEL (insn) = jump_label;
15184
15185 emit_insn (gen_negxf2 (e2, e2));
15186
15187 emit_label (jump_label);
15188 LABEL_NUSES (jump_label) = 1;
15189
15190 emit_move_insn (op0, e2);
15191 }
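
/* Editorial note on the identity used above: with e = expm1 (-|2 * x|),

     tanh (|x|) = (1 - exp (-2*|x|)) / (1 + exp (-2*|x|)) = -e / (e + 2),

   so e / (e + 2) already equals tanh (x) for negative x, and it is
   negated only when the sign bit of x is clear.  */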
15192
15193 /* Output code to perform an asinh XFmode calculation. */
15194
15195 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15196 {
15197 rtx e1 = gen_reg_rtx (XFmode);
15198 rtx e2 = gen_reg_rtx (XFmode);
15199 rtx scratch = gen_reg_rtx (HImode);
15200 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15201 rtx cst1, tmp;
15202 rtx_code_label *jump_label = gen_label_rtx ();
15203 rtx_insn *insn;
15204
15205 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15206 emit_insn (gen_mulxf3 (e1, op1, op1));
15207 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15208 emit_insn (gen_addxf3 (e2, e1, cst1));
15209 emit_insn (gen_sqrtxf2 (e2, e2));
15210 emit_insn (gen_addxf3 (e2, e2, cst1));
15211
15212 /* e1 = e1 / e2 */
15213 emit_insn (gen_divxf3 (e1, e1, e2));
15214
15215 /* scratch = fxam (op1) */
15216 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15217
15218 /* e1 = e1 + |op1| */
15219 emit_insn (gen_absxf2 (e2, op1));
15220 emit_insn (gen_addxf3 (e1, e1, e2));
15221
15222 /* e2 = log1p (e1) */
15223 ix86_emit_i387_log1p (e2, e1);
15224
15225 /* flags = signbit (op1) */
15226 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15227
15228 /* if (flags) then e2 = -e2 */
15229 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15230 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15231 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15232 pc_rtx);
15233 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15234 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15235 JUMP_LABEL (insn) = jump_label;
15236
15237 emit_insn (gen_negxf2 (e2, e2));
15238
15239 emit_label (jump_label);
15240 LABEL_NUSES (jump_label) = 1;
15241
15242 emit_move_insn (op0, e2);
15243 }
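
/* Editorial note on the identity used above: for y = |x|,

     asinh (y) = log (y + sqrt (y*y + 1))
               = log1p (y + y*y / (sqrt (y*y + 1) + 1)),

   since sqrt (y*y + 1) - 1 == y*y / (sqrt (y*y + 1) + 1).  That is the
   value built in e1 before the log1p call; the sign of x is restored by
   the conditional negation.  */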
15244
15245 /* Output code to perform an acosh XFmode calculation. */
15246
15247 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15248 {
15249 rtx e1 = gen_reg_rtx (XFmode);
15250 rtx e2 = gen_reg_rtx (XFmode);
15251 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15252
15253 /* e2 = sqrt (op1 + 1.0) */
15254 emit_insn (gen_addxf3 (e2, op1, cst1));
15255 emit_insn (gen_sqrtxf2 (e2, e2));
15256
15257 /* e1 = sqrt (op1 - 1.0) */
15258 emit_insn (gen_subxf3 (e1, op1, cst1));
15259 emit_insn (gen_sqrtxf2 (e1, e1));
15260
15261 /* e1 = e1 * e2 */
15262 emit_insn (gen_mulxf3 (e1, e1, e2));
15263
15264 /* e1 = e1 + op1 */
15265 emit_insn (gen_addxf3 (e1, e1, op1));
15266
15267 /* op0 = log (e1) */
15268 emit_insn (gen_logxf2 (op0, e1));
15269 }
15270
15271 /* Output code to perform an atanh XFmode calculation. */
15272
15273 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15274 {
15275 rtx e1 = gen_reg_rtx (XFmode);
15276 rtx e2 = gen_reg_rtx (XFmode);
15277 rtx scratch = gen_reg_rtx (HImode);
15278 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15279 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15280 rtx cst1, tmp;
15281 rtx_code_label *jump_label = gen_label_rtx ();
15282 rtx_insn *insn;
15283
15284 /* scratch = fxam (op1) */
15285 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15286
15287 /* e2 = |op1| */
15288 emit_insn (gen_absxf2 (e2, op1));
15289
15290 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15291 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15292 emit_insn (gen_addxf3 (e1, e2, cst1));
15293 emit_insn (gen_addxf3 (e2, e2, e2));
15294 emit_insn (gen_negxf2 (e2, e2));
15295 emit_insn (gen_divxf3 (e1, e2, e1));
15296
15297 /* e2 = log1p (e1) */
15298 ix86_emit_i387_log1p (e2, e1);
15299
15300 /* flags = signbit (op1) */
15301 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15302
15303 /* if (!flags) then e2 = -e2 */
15304 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15305 gen_rtx_NE (VOIDmode, flags, const0_rtx),
15306 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15307 pc_rtx);
15308 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15309 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15310 JUMP_LABEL (insn) = jump_label;
15311
15312 emit_insn (gen_negxf2 (e2, e2));
15313
15314 emit_label (jump_label);
15315 LABEL_NUSES (jump_label) = 1;
15316
15317 /* op0 = 0.5 * e2 */
15318 half = force_reg (XFmode, half);
15319 emit_insn (gen_mulxf3 (op0, e2, half));
15320 }
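
/* Editorial note on the identity used above: for y = |x|,

     log1p (-2*y / (1 + y)) = log ((1 - y) / (1 + y)) = -2 * atanh (y),

   so atanh (y) = -0.5 * log1p (-2*y / (1 + y)); the conditional negation
   together with the final multiply by 0.5 also restores the sign of x.  */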
15321
15322 /* Output code to perform a log1p XFmode calculation. */
15323
15324 void ix86_emit_i387_log1p (rtx op0, rtx op1)
15325 {
15326 rtx_code_label *label1 = gen_label_rtx ();
15327 rtx_code_label *label2 = gen_label_rtx ();
15328
15329 rtx tmp = gen_reg_rtx (XFmode);
15330 rtx res = gen_reg_rtx (XFmode);
15331 rtx cst, cstln2, cst1;
15332 rtx_insn *insn;
15333
15334 cst = const_double_from_real_value
15335 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
15336 cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
15337
15338 emit_insn (gen_absxf2 (tmp, op1));
15339
15340 cst = force_reg (XFmode, cst);
15341 ix86_expand_branch (GE, tmp, cst, label1);
15342 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15343 insn = get_last_insn ();
15344 JUMP_LABEL (insn) = label1;
15345
15346 emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
15347 emit_jump (label2);
15348
15349 emit_label (label1);
15350 LABEL_NUSES (label1) = 1;
15351
15352 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15353 emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
15354 emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
15355
15356 emit_label (label2);
15357 LABEL_NUSES (label2) = 1;
15358
15359 emit_move_insn (op0, res);
15360 }
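
/* Editorial note (hedged): fyl2xp1 computes y * log2 (x + 1) but its
   argument is only specified for |x| < 1 - sqrt (2) / 2, which is the
   0.29289... constant compared against above; larger inputs fall back to
   fyl2x on x + 1.  With y = fldln2 = log_e (2) the result becomes the
   natural logarithm, i.e. log1p (x) = ln (2) * log2 (1 + x).  */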
15361
15362 /* Emit code for round calculation. */
15363 void ix86_emit_i387_round (rtx op0, rtx op1)
15364 {
15365 machine_mode inmode = GET_MODE (op1);
15366 machine_mode outmode = GET_MODE (op0);
15367 rtx e1 = gen_reg_rtx (XFmode);
15368 rtx e2 = gen_reg_rtx (XFmode);
15369 rtx scratch = gen_reg_rtx (HImode);
15370 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15371 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15372 rtx res = gen_reg_rtx (outmode);
15373 rtx_code_label *jump_label = gen_label_rtx ();
15374 rtx (*floor_insn) (rtx, rtx);
15375 rtx (*neg_insn) (rtx, rtx);
15376 rtx_insn *insn;
15377 rtx tmp;
15378
15379 switch (inmode)
15380 {
15381 case E_SFmode:
15382 case E_DFmode:
15383 tmp = gen_reg_rtx (XFmode);
15384
15385 emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
15386 op1 = tmp;
15387 break;
15388 case E_XFmode:
15389 break;
15390 default:
15391 gcc_unreachable ();
15392 }
15393
15394 switch (outmode)
15395 {
15396 case E_SFmode:
15397 floor_insn = gen_frndintxf2_floor;
15398 neg_insn = gen_negsf2;
15399 break;
15400 case E_DFmode:
15401 floor_insn = gen_frndintxf2_floor;
15402 neg_insn = gen_negdf2;
15403 break;
15404 case E_XFmode:
15405 floor_insn = gen_frndintxf2_floor;
15406 neg_insn = gen_negxf2;
15407 break;
15408 case E_HImode:
15409 floor_insn = gen_lfloorxfhi2;
15410 neg_insn = gen_neghi2;
15411 break;
15412 case E_SImode:
15413 floor_insn = gen_lfloorxfsi2;
15414 neg_insn = gen_negsi2;
15415 break;
15416 case E_DImode:
15417 floor_insn = gen_lfloorxfdi2;
15418 neg_insn = gen_negdi2;
15419 break;
15420 default:
15421 gcc_unreachable ();
15422 }
15423
15424 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15425
15426 /* scratch = fxam(op1) */
15427 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15428
15429 /* e1 = fabs(op1) */
15430 emit_insn (gen_absxf2 (e1, op1));
15431
15432 /* e2 = e1 + 0.5 */
15433 half = force_reg (XFmode, half);
15434 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
15435
15436 /* res = floor(e2) */
15437 switch (outmode)
15438 {
15439 case E_SFmode:
15440 case E_DFmode:
15441 {
15442 tmp = gen_reg_rtx (XFmode);
15443
15444 emit_insn (floor_insn (tmp, e2));
15445 emit_insn (gen_rtx_SET (res,
15446 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
15447 UNSPEC_TRUNC_NOOP)));
15448 }
15449 break;
15450 default:
15451 emit_insn (floor_insn (res, e2));
15452 }
15453
15454 /* flags = signbit(a) */
15455 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15456
15457 /* if (flags) then res = -res */
15458 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15459 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15460 gen_rtx_LABEL_REF (VOIDmode, jump_label),
15461 pc_rtx);
15462 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15463 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15464 JUMP_LABEL (insn) = jump_label;
15465
15466 emit_insn (neg_insn (res, res));
15467
15468 emit_label (jump_label);
15469 LABEL_NUSES (jump_label) = 1;
15470
15471 emit_move_insn (op0, res);
15472 }
15473
15474 /* Output code to perform a Newton-Raphson approximation of a single precision
15475 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15476
15477 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
15478 {
15479 rtx x0, x1, e0, e1;
15480
15481 x0 = gen_reg_rtx (mode);
15482 e0 = gen_reg_rtx (mode);
15483 e1 = gen_reg_rtx (mode);
15484 x1 = gen_reg_rtx (mode);
15485
15486 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15487
15488 b = force_reg (mode, b);
15489
15490 /* x0 = rcp(b) estimate */
15491 if (mode == V16SFmode || mode == V8DFmode)
15492 {
15493 if (TARGET_AVX512ER)
15494 {
15495 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15496 UNSPEC_RCP28)));
15497 /* res = a * x0 */
15498 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
15499 return;
15500 }
15501 else
15502 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15503 UNSPEC_RCP14)));
15504 }
15505 else
15506 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
15507 UNSPEC_RCP)));
15508
15509 /* e0 = x0 * b */
15510 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
15511
15512 /* e0 = x0 * e0 */
15513 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
15514
15515 /* e1 = x0 + x0 */
15516 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
15517
15518 /* x1 = e1 - e0 */
15519 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
15520
15521 /* res = a * x1 */
15522 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
15523 }
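
/* Editorial note: the sequence above is a single Newton-Raphson
   refinement of the hardware reciprocal estimate.  With x0 = rcp (b),

     x1 = 2*x0 - b*x0*x0 = x0 * (2 - b*x0),

   so if x0 = (1 - eps) / b then x1 = (1 - eps*eps) / b: the relative
   error of the estimate is squared (roughly 12 bits of accuracy from
   rcpps become about 23 bits).  The quotient is then formed as a * x1.  */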
15524
15525 /* Output code to perform a Newton-Raphson approximation of a
15526 single precision floating point [reciprocal] square root. */
15527
15528 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
15529 {
15530 rtx x0, e0, e1, e2, e3, mthree, mhalf;
15531 REAL_VALUE_TYPE r;
15532 int unspec;
15533
15534 x0 = gen_reg_rtx (mode);
15535 e0 = gen_reg_rtx (mode);
15536 e1 = gen_reg_rtx (mode);
15537 e2 = gen_reg_rtx (mode);
15538 e3 = gen_reg_rtx (mode);
15539
15540 if (TARGET_AVX512ER && mode == V16SFmode)
15541 {
15542 if (recip)
15543 /* res = rsqrt28(a) estimate */
15544 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15545 UNSPEC_RSQRT28)));
15546 else
15547 {
15548 /* x0 = rsqrt28(a) estimate */
15549 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15550 UNSPEC_RSQRT28)));
15551 /* res = rcp28(x0) estimate */
15552 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
15553 UNSPEC_RCP28)));
15554 }
15555 return;
15556 }
15557
15558 real_from_integer (&r, VOIDmode, -3, SIGNED);
15559 mthree = const_double_from_real_value (r, SFmode);
15560
15561 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
15562 mhalf = const_double_from_real_value (r, SFmode);
15563 unspec = UNSPEC_RSQRT;
15564
15565 if (VECTOR_MODE_P (mode))
15566 {
15567 mthree = ix86_build_const_vector (mode, true, mthree);
15568 mhalf = ix86_build_const_vector (mode, true, mhalf);
15569 /* There is no 512-bit rsqrt. There is however rsqrt14. */
15570 if (GET_MODE_SIZE (mode) == 64)
15571 unspec = UNSPEC_RSQRT14;
15572 }
15573
15574 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15575 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15576
15577 a = force_reg (mode, a);
15578
15579 /* x0 = rsqrt(a) estimate */
15580 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
15581 unspec)));
15582
15583 /* If a == 0.0, filter the infinity out of the rsqrt estimate to prevent NaN for sqrt (0.0). */
15584 if (!recip)
15585 {
15586 rtx zero = force_reg (mode, CONST0_RTX(mode));
15587 rtx mask;
15588
15589 /* Handle masked compare. */
15590 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
15591 {
15592 mask = gen_reg_rtx (HImode);
15593 /* Imm value 0x4 corresponds to not-equal comparison. */
15594 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
15595 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
15596 }
15597 else
15598 {
15599 mask = gen_reg_rtx (mode);
15600 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
15601 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
15602 }
15603 }
15604
15605 /* e0 = x0 * a */
15606 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
15607 /* e1 = e0 * x0 */
15608 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
15609
15610 /* e2 = e1 - 3. */
15611 mthree = force_reg (mode, mthree);
15612 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
15613
15614 mhalf = force_reg (mode, mhalf);
15615 if (recip)
15616 /* e3 = -.5 * x0 */
15617 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
15618 else
15619 /* e3 = -.5 * e0 */
15620 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
15621 /* ret = e2 * e3 */
15622 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
15623 }
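
/* Editorial note: this is the standard Newton-Raphson step for the
   reciprocal square root, x1 = 0.5 * x0 * (3 - a*x0*x0), written with
   the constants -3.0 and -0.5:

     rsqrt (a) ~ -0.5 * x0     * (a*x0*x0 - 3)
     sqrt (a)  ~ -0.5 * (a*x0) * (a*x0*x0 - 3)

   where x0 is the rsqrt estimate; multiplying by a*x0 (e0 above) instead
   of x0 turns the refined reciprocal square root into the square root
   itself.  */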
15624
15625 /* Expand fabs (OP0) and return a new rtx that holds the result. The
15626 mask for masking out the sign-bit is stored in *SMASK, if that is
15627 non-null. */
15628
15629 static rtx
15630 ix86_expand_sse_fabs (rtx op0, rtx *smask)
15631 {
15632 machine_mode vmode, mode = GET_MODE (op0);
15633 rtx xa, mask;
15634
15635 xa = gen_reg_rtx (mode);
15636 if (mode == SFmode)
15637 vmode = V4SFmode;
15638 else if (mode == DFmode)
15639 vmode = V2DFmode;
15640 else
15641 vmode = mode;
15642 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
15643 if (!VECTOR_MODE_P (mode))
15644 {
15645 /* We need to generate a scalar mode mask in this case. */
15646 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15647 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15648 mask = gen_reg_rtx (mode);
15649 emit_insn (gen_rtx_SET (mask, tmp));
15650 }
15651 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
15652
15653 if (smask)
15654 *smask = mask;
15655
15656 return xa;
15657 }
15658
15659 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
15660 swapping the operands if SWAP_OPERANDS is true. The expanded
15661 code is a forward jump to a newly created label in case the
15662 comparison is true. The generated label rtx is returned. */
15663 static rtx_code_label *
15664 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
15665 bool swap_operands)
15666 {
15667 bool unordered_compare = ix86_unordered_fp_compare (code);
15668 rtx_code_label *label;
15669 rtx tmp, reg;
15670
15671 if (swap_operands)
15672 std::swap (op0, op1);
15673
15674 label = gen_label_rtx ();
15675 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
15676 if (unordered_compare)
15677 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
15678 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
15679 emit_insn (gen_rtx_SET (reg, tmp));
15680 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
15681 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15682 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
15683 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15684 JUMP_LABEL (tmp) = label;
15685
15686 return label;
15687 }
15688
15689 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15690 using comparison code CODE. Operands are swapped for the comparison if
15691 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15692 static rtx
15693 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
15694 bool swap_operands)
15695 {
15696 rtx (*insn)(rtx, rtx, rtx, rtx);
15697 machine_mode mode = GET_MODE (op0);
15698 rtx mask = gen_reg_rtx (mode);
15699
15700 if (swap_operands)
15701 std::swap (op0, op1);
15702
15703 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
15704
15705 emit_insn (insn (mask, op0, op1,
15706 gen_rtx_fmt_ee (code, mode, op0, op1)));
15707 return mask;
15708 }
15709
15710 /* Expand copysign from SIGN to the positive value ABS_VALUE,
15711 storing the result in RESULT. If MASK is non-null, it shall be a mask
15712 used to mask out the sign-bit. */
15713
15714 static void
15715 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
15716 {
15717 machine_mode mode = GET_MODE (sign);
15718 rtx sgn = gen_reg_rtx (mode);
15719 if (mask == NULL_RTX)
15720 {
15721 machine_mode vmode;
15722
15723 if (mode == SFmode)
15724 vmode = V4SFmode;
15725 else if (mode == DFmode)
15726 vmode = V2DFmode;
15727 else
15728 vmode = mode;
15729
15730 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
15731 if (!VECTOR_MODE_P (mode))
15732 {
15733 /* We need to generate a scalar mode mask in this case. */
15734 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
15735 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
15736 mask = gen_reg_rtx (mode);
15737 emit_insn (gen_rtx_SET (mask, tmp));
15738 }
15739 }
15740 else
15741 mask = gen_rtx_NOT (mode, mask);
15742 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
15743 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
15744 }
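
/* Editorial note: at the bit level this is the usual copysign idiom,
   e.g. for DFmode

     result = abs_value | (sign & 0x8000000000000000),

   with the constant being the sign-bit mask.  When the caller passes the
   inverted mask produced by ix86_expand_sse_fabs, it is flipped back with
   the NOT above before the AND.  */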
15745
15746 /* Expand SSE sequence for computing lround from OP1 storing
15747 into OP0. */
15748
15749 void
15750 ix86_expand_lround (rtx op0, rtx op1)
15751 {
15752 /* C code for the stuff we're doing below:
15753 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15754 return (long)tmp;
15755 */
15756 machine_mode mode = GET_MODE (op1);
15757 const struct real_format *fmt;
15758 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
15759 rtx adj;
15760
15761 /* load nextafter (0.5, 0.0) */
15762 fmt = REAL_MODE_FORMAT (mode);
15763 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
15764 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
15765
15766 /* adj = copysign (0.5, op1) */
15767 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
15768 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
15769
15770 /* adj = op1 + adj */
15771 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
15772
15773 /* op0 = (imode)adj */
15774 expand_fix (op0, adj, 0);
15775 }
15776
15777 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
15778 into OPERAND0. */
15779
15780 void
15781 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
15782 {
15783 /* C code for the stuff we're doing below (for do_floor):
15784 xi = (long)op1;
15785 xi -= (double)xi > op1 ? 1 : 0;
15786 return xi;
15787 */
15788 machine_mode fmode = GET_MODE (op1);
15789 machine_mode imode = GET_MODE (op0);
15790 rtx ireg, freg, tmp;
15791 rtx_code_label *label;
15792
15793 /* reg = (long)op1 */
15794 ireg = gen_reg_rtx (imode);
15795 expand_fix (ireg, op1, 0);
15796
15797 /* freg = (double)reg */
15798 freg = gen_reg_rtx (fmode);
15799 expand_float (freg, ireg, 0);
15800
15801 /* ireg = (freg > op1) ? ireg - 1 : ireg */
15802 label = ix86_expand_sse_compare_and_jump (UNLE,
15803 freg, op1, !do_floor);
15804 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
15805 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
15806 emit_move_insn (ireg, tmp);
15807
15808 emit_label (label);
15809 LABEL_NUSES (label) = 1;
15810
15811 emit_move_insn (op0, ireg);
15812 }
15813
15814 /* Generate and return a rtx of mode MODE for 2**n where n is the number
15815 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
15816
15817 static rtx
15818 ix86_gen_TWO52 (machine_mode mode)
15819 {
15820 REAL_VALUE_TYPE TWO52r;
15821 rtx TWO52;
15822
15823 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
15824 TWO52 = const_double_from_real_value (TWO52r, mode);
15825 TWO52 = force_reg (mode, TWO52);
15826
15827 return TWO52;
15828 }
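
/* Editorial worked example: with the default round-to-nearest mode that
   the expanders below rely on, adding and then subtracting 2**52 (2**23
   for SFmode) rounds a nonnegative value to an integer, because at
   magnitudes >= 2**52 a double has no fraction bits left.  E.g. for
   xa = 3.7, xa + 2**52 rounds to 4 + 2**52, and subtracting 2**52 again
   gives 4.0.  Inputs with |x| >= 2**52 are already integral, which is
   why each expander first branches away on !isless (xa, TWO52).  */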
15829
15830 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
15831
15832 void
15833 ix86_expand_rint (rtx operand0, rtx operand1)
15834 {
15835 /* C code for the stuff we're doing below:
15836 xa = fabs (operand1);
15837 if (!isless (xa, 2**52))
15838 return operand1;
15839 two52 = 2**52;
15840 if (flag_rounding_math)
15841 {
15842 two52 = copysign (two52, operand1);
15843 xa = operand1;
15844 }
15845 xa = xa + two52 - two52;
15846 return copysign (xa, operand1);
15847 */
15848 machine_mode mode = GET_MODE (operand0);
15849 rtx res, xa, TWO52, two52, mask;
15850 rtx_code_label *label;
15851
15852 res = gen_reg_rtx (mode);
15853 emit_move_insn (res, operand1);
15854
15855 /* xa = abs (operand1) */
15856 xa = ix86_expand_sse_fabs (res, &mask);
15857
15858 /* if (!isless (xa, TWO52)) goto label; */
15859 TWO52 = ix86_gen_TWO52 (mode);
15860 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15861
15862 two52 = TWO52;
15863 if (flag_rounding_math)
15864 {
15865 two52 = gen_reg_rtx (mode);
15866 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
15867 xa = res;
15868 }
15869
15870 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
15871 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
15872
15873 ix86_sse_copysign_to_positive (res, xa, res, mask);
15874
15875 emit_label (label);
15876 LABEL_NUSES (label) = 1;
15877
15878 emit_move_insn (operand0, res);
15879 }
15880
15881 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15882 into OPERAND0. */
15883 void
15884 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
15885 {
15886 /* C code for the stuff we expand below.
15887 double xa = fabs (x), x2;
15888 if (!isless (xa, TWO52))
15889 return x;
15890 xa = xa + TWO52 - TWO52;
15891 x2 = copysign (xa, x);
15892 Compensate. Floor:
15893 if (x2 > x)
15894 x2 -= 1;
15895 Compensate. Ceil:
15896 if (x2 < x)
15897 x2 += 1;
15898 if (HONOR_SIGNED_ZEROS (mode))
15899 x2 = copysign (x2, x);
15900 return x2;
15901 */
15902 machine_mode mode = GET_MODE (operand0);
15903 rtx xa, TWO52, tmp, one, res, mask;
15904 rtx_code_label *label;
15905
15906 TWO52 = ix86_gen_TWO52 (mode);
15907
15908 /* Temporary for holding the result, initialized to the input
15909 operand to ease control flow. */
15910 res = gen_reg_rtx (mode);
15911 emit_move_insn (res, operand1);
15912
15913 /* xa = abs (operand1) */
15914 xa = ix86_expand_sse_fabs (res, &mask);
15915
15916 /* if (!isless (xa, TWO52)) goto label; */
15917 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15918
15919 /* xa = xa + TWO52 - TWO52; */
15920 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
15921 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
15922
15923 /* xa = copysign (xa, operand1) */
15924 ix86_sse_copysign_to_positive (xa, xa, res, mask);
15925
15926 /* generate 1.0 */
15927 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15928
15929 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15930 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15931 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15932 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15933 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15934 if (!do_floor && HONOR_SIGNED_ZEROS (mode))
15935 ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
15936 emit_move_insn (res, tmp);
15937
15938 emit_label (label);
15939 LABEL_NUSES (label) = 1;
15940
15941 emit_move_insn (operand0, res);
15942 }
15943
15944 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
15945 into OPERAND0. */
15946 void
15947 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
15948 {
15949 /* C code for the stuff we expand below.
15950 double xa = fabs (x), x2;
15951 if (!isless (xa, TWO52))
15952 return x;
15953 x2 = (double)(long)x;
15954 Compensate. Floor:
15955 if (x2 > x)
15956 x2 -= 1;
15957 Compensate. Ceil:
15958 if (x2 < x)
15959 x2 += 1;
15960 if (HONOR_SIGNED_ZEROS (mode))
15961 return copysign (x2, x);
15962 return x2;
15963 */
15964 machine_mode mode = GET_MODE (operand0);
15965 rtx xa, xi, TWO52, tmp, one, res, mask;
15966 rtx_code_label *label;
15967
15968 TWO52 = ix86_gen_TWO52 (mode);
15969
15970 /* Temporary for holding the result, initialized to the input
15971 operand to ease control flow. */
15972 res = gen_reg_rtx (mode);
15973 emit_move_insn (res, operand1);
15974
15975 /* xa = abs (operand1) */
15976 xa = ix86_expand_sse_fabs (res, &mask);
15977
15978 /* if (!isless (xa, TWO52)) goto label; */
15979 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
15980
15981 /* xa = (double)(long)x */
15982 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
15983 expand_fix (xi, res, 0);
15984 expand_float (xa, xi, 0);
15985
15986 /* generate 1.0 */
15987 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
15988
15989 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
15990 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
15991 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
15992 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
15993 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
15994 emit_move_insn (res, tmp);
15995
15996 if (HONOR_SIGNED_ZEROS (mode))
15997 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
15998
15999 emit_label (label);
16000 LABEL_NUSES (label) = 1;
16001
16002 emit_move_insn (operand0, res);
16003 }
16004
16005 /* Expand SSE sequence for computing round from OPERAND1 storing
16006 into OPERAND0. This sequence works without relying on DImode truncation
16007 via cvttsd2siq, which is only available on 64-bit targets. */
16008 void
16009 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
16010 {
16011 /* C code for the stuff we expand below.
16012 double xa = fabs (x), xa2, x2;
16013 if (!isless (xa, TWO52))
16014 return x;
16015 Using the absolute value and copying back sign makes
16016 -0.0 -> -0.0 correct.
16017 xa2 = xa + TWO52 - TWO52;
16018 Compensate.
16019 dxa = xa2 - xa;
16020 if (dxa <= -0.5)
16021 xa2 += 1;
16022 else if (dxa > 0.5)
16023 xa2 -= 1;
16024 x2 = copysign (xa2, x);
16025 return x2;
16026 */
16027 machine_mode mode = GET_MODE (operand0);
16028 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
16029 rtx_code_label *label;
16030
16031 TWO52 = ix86_gen_TWO52 (mode);
16032
16033 /* Temporary for holding the result, initialized to the input
16034 operand to ease control flow. */
16035 res = gen_reg_rtx (mode);
16036 emit_move_insn (res, operand1);
16037
16038 /* xa = abs (operand1) */
16039 xa = ix86_expand_sse_fabs (res, &mask);
16040
16041 /* if (!isless (xa, TWO52)) goto label; */
16042 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16043
16044 /* xa2 = xa + TWO52 - TWO52; */
16045 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16046 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
16047
16048 /* dxa = xa2 - xa; */
16049 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
16050
16051 /* generate 0.5, 1.0 and -0.5 */
16052 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
16053 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
16054 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
16055 0, OPTAB_DIRECT);
16056
16057 /* Compensate. */
16058 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16059 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
16060 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16061 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16062 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16063 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
16064 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
16065 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
16066
16067 /* res = copysign (xa2, operand1) */
16068 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
16069
16070 emit_label (label);
16071 LABEL_NUSES (label) = 1;
16072
16073 emit_move_insn (operand0, res);
16074 }
16075
16076 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16077 into OPERAND0. */
16078 void
16079 ix86_expand_trunc (rtx operand0, rtx operand1)
16080 {
16081 /* C code for SSE variant we expand below.
16082 double xa = fabs (x), x2;
16083 if (!isless (xa, TWO52))
16084 return x;
16085 x2 = (double)(long)x;
16086 if (HONOR_SIGNED_ZEROS (mode))
16087 return copysign (x2, x);
16088 return x2;
16089 */
16090 machine_mode mode = GET_MODE (operand0);
16091 rtx xa, xi, TWO52, res, mask;
16092 rtx_code_label *label;
16093
16094 TWO52 = ix86_gen_TWO52 (mode);
16095
16096 /* Temporary for holding the result, initialized to the input
16097 operand to ease control flow. */
16098 res = gen_reg_rtx (mode);
16099 emit_move_insn (res, operand1);
16100
16101 /* xa = abs (operand1) */
16102 xa = ix86_expand_sse_fabs (res, &mask);
16103
16104 /* if (!isless (xa, TWO52)) goto label; */
16105 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16106
16107 /* x = (double)(long)x */
16108 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16109 expand_fix (xi, res, 0);
16110 expand_float (res, xi, 0);
16111
16112 if (HONOR_SIGNED_ZEROS (mode))
16113 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
16114
16115 emit_label (label);
16116 LABEL_NUSES (label) = 1;
16117
16118 emit_move_insn (operand0, res);
16119 }
16120
16121 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16122 into OPERAND0 without relying on DImode truncation, for 32-bit targets. */
16123 void
16124 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
16125 {
16126 machine_mode mode = GET_MODE (operand0);
16127 rtx xa, mask, TWO52, one, res, smask, tmp;
16128 rtx_code_label *label;
16129
16130 /* C code for SSE variant we expand below.
16131 double xa = fabs (x), x2;
16132 if (!isless (xa, TWO52))
16133 return x;
16134 xa2 = xa + TWO52 - TWO52;
16135 Compensate:
16136 if (xa2 > xa)
16137 xa2 -= 1.0;
16138 x2 = copysign (xa2, x);
16139 return x2;
16140 */
16141
16142 TWO52 = ix86_gen_TWO52 (mode);
16143
16144 /* Temporary for holding the result, initialized to the input
16145 operand to ease control flow. */
16146 res = gen_reg_rtx (mode);
16147 emit_move_insn (res, operand1);
16148
16149 /* xa = abs (operand1) */
16150 xa = ix86_expand_sse_fabs (res, &smask);
16151
16152 /* if (!isless (xa, TWO52)) goto label; */
16153 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16154
16155 /* res = xa + TWO52 - TWO52; */
16156 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
16157 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
16158 emit_move_insn (res, tmp);
16159
16160 /* generate 1.0 */
16161 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
16162
16163 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
16164 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
16165 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
16166 tmp = expand_simple_binop (mode, MINUS,
16167 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
16168 emit_move_insn (res, tmp);
16169
16170 /* res = copysign (res, operand1) */
16171 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
16172
16173 emit_label (label);
16174 LABEL_NUSES (label) = 1;
16175
16176 emit_move_insn (operand0, res);
16177 }
16178
16179 /* Expand SSE sequence for computing round from OPERAND1 storing
16180 into OPERAND0. */
16181 void
16182 ix86_expand_round (rtx operand0, rtx operand1)
16183 {
16184 /* C code for the stuff we're doing below:
16185 double xa = fabs (x);
16186 if (!isless (xa, TWO52))
16187 return x;
16188 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16189 return copysign (xa, x);
16190 */
16191 machine_mode mode = GET_MODE (operand0);
16192 rtx res, TWO52, xa, xi, half, mask;
16193 rtx_code_label *label;
16194 const struct real_format *fmt;
16195 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16196
16197 /* Temporary for holding the result, initialized to the input
16198 operand to ease control flow. */
16199 res = gen_reg_rtx (mode);
16200 emit_move_insn (res, operand1);
16201
16202 TWO52 = ix86_gen_TWO52 (mode);
16203 xa = ix86_expand_sse_fabs (res, &mask);
16204 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
16205
16206 /* load nextafter (0.5, 0.0) */
16207 fmt = REAL_MODE_FORMAT (mode);
16208 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16209 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16210
16211 /* xa = xa + nextafter (0.5, 0.0) */
16212 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
16213 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
16214
16215 /* xa = (double)(int64_t)xa */
16216 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
16217 expand_fix (xi, xa, 0);
16218 expand_float (xa, xi, 0);
16219
16220 /* res = copysign (xa, operand1) */
16221 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
16222
16223 emit_label (label);
16224 LABEL_NUSES (label) = 1;
16225
16226 emit_move_insn (operand0, res);
16227 }
16228
16229 /* Expand SSE sequence for computing round
16230 from OP1 storing into OP0 using sse4 round insn. */
16231 void
16232 ix86_expand_round_sse4 (rtx op0, rtx op1)
16233 {
16234 machine_mode mode = GET_MODE (op0);
16235 rtx e1, e2, res, half;
16236 const struct real_format *fmt;
16237 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
16238 rtx (*gen_copysign) (rtx, rtx, rtx);
16239 rtx (*gen_round) (rtx, rtx, rtx);
16240
16241 switch (mode)
16242 {
16243 case E_SFmode:
16244 gen_copysign = gen_copysignsf3;
16245 gen_round = gen_sse4_1_roundsf2;
16246 break;
16247 case E_DFmode:
16248 gen_copysign = gen_copysigndf3;
16249 gen_round = gen_sse4_1_rounddf2;
16250 break;
16251 default:
16252 gcc_unreachable ();
16253 }
16254
16255 /* round (a) = trunc (a + copysign (0.5, a)) */
16256
16257 /* load nextafter (0.5, 0.0) */
16258 fmt = REAL_MODE_FORMAT (mode);
16259 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
16260 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
16261 half = const_double_from_real_value (pred_half, mode);
16262
16263 /* e1 = copysign (0.5, op1) */
16264 e1 = gen_reg_rtx (mode);
16265 emit_insn (gen_copysign (e1, half, op1));
16266
16267 /* e2 = op1 + e1 */
16268 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
16269
16270 /* res = trunc (e2) */
16271 res = gen_reg_rtx (mode);
16272 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
16273
16274 emit_move_insn (op0, res);
16275 }
16276
16277 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16278 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16279 insn every time. */
16280
16281 static GTY(()) rtx_insn *vselect_insn;
16282
16283 /* Initialize vselect_insn. */
16284
16285 static void
16286 init_vselect_insn (void)
16287 {
16288 unsigned i;
16289 rtx x;
16290
16291 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
16292 for (i = 0; i < MAX_VECT_LEN; ++i)
16293 XVECEXP (x, 0, i) = const0_rtx;
16294 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
16295 const0_rtx), x);
16296 x = gen_rtx_SET (const0_rtx, x);
16297 start_sequence ();
16298 vselect_insn = emit_insn (x);
16299 end_sequence ();
16300 }
16301
16302 /* Construct (set target (vec_select op0 (parallel perm))) and
16303 return true if that's a valid instruction in the active ISA. */
16304
16305 static bool
16306 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
16307 unsigned nelt, bool testing_p)
16308 {
16309 unsigned int i;
16310 rtx x, save_vconcat;
16311 int icode;
16312
16313 if (vselect_insn == NULL_RTX)
16314 init_vselect_insn ();
16315
16316 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
16317 PUT_NUM_ELEM (XVEC (x, 0), nelt);
16318 for (i = 0; i < nelt; ++i)
16319 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
16320 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16321 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
16322 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
16323 SET_DEST (PATTERN (vselect_insn)) = target;
16324 icode = recog_memoized (vselect_insn);
16325
16326 if (icode >= 0 && !testing_p)
16327 emit_insn (copy_rtx (PATTERN (vselect_insn)));
16328
16329 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
16330 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
16331 INSN_CODE (vselect_insn) = -1;
16332
16333 return icode >= 0;
16334 }
16335
16336 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16337
16338 static bool
16339 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
16340 const unsigned char *perm, unsigned nelt,
16341 bool testing_p)
16342 {
16343 machine_mode v2mode;
16344 rtx x;
16345 bool ok;
16346
16347 if (vselect_insn == NULL_RTX)
16348 init_vselect_insn ();
16349
16350 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
16351 return false;
16352 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
16353 PUT_MODE (x, v2mode);
16354 XEXP (x, 0) = op0;
16355 XEXP (x, 1) = op1;
16356 ok = expand_vselect (target, x, perm, nelt, testing_p);
16357 XEXP (x, 0) = const0_rtx;
16358 XEXP (x, 1) = const0_rtx;
16359 return ok;
16360 }
16361
16362 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16363 using movss or movsd. */
16364 static bool
16365 expand_vec_perm_movs (struct expand_vec_perm_d *d)
16366 {
16367 machine_mode vmode = d->vmode;
16368 unsigned i, nelt = d->nelt;
16369 rtx x;
16370
16371 if (d->one_operand_p)
16372 return false;
16373
16374 if (!(TARGET_SSE && vmode == V4SFmode)
16375 && !(TARGET_SSE2 && vmode == V2DFmode))
16376 return false;
16377
16378 /* Only the first element is changed. */
16379 if (d->perm[0] != nelt && d->perm[0] != 0)
16380 return false;
16381 for (i = 1; i < nelt; ++i)
16382 if (d->perm[i] != i + nelt - d->perm[0])
16383 return false;
16384
16385 if (d->testing_p)
16386 return true;
16387
16388 if (d->perm[0] == nelt)
16389 x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
16390 else
16391 x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
16392
16393 emit_insn (gen_rtx_SET (d->target, x));
16394
16395 return true;
16396 }
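
/* Editorial example: for V4SFmode the permutations accepted above are
   { 4, 1, 2, 3 } (element 0 from the second operand, the rest from the
   first) and { 0, 5, 6, 7 } (the converse); both become a single
   vec_merge with mask 1, i.e. a movss-style instruction.  */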
16397
16398 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16399 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16400
16401 static bool
16402 expand_vec_perm_blend (struct expand_vec_perm_d *d)
16403 {
16404 machine_mode mmode, vmode = d->vmode;
16405 unsigned i, nelt = d->nelt;
16406 unsigned HOST_WIDE_INT mask;
16407 rtx target, op0, op1, maskop, x;
16408 rtx rperm[32], vperm;
16409
16410 if (d->one_operand_p)
16411 return false;
16412 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
16413 && (TARGET_AVX512BW
16414 || GET_MODE_UNIT_SIZE (vmode) >= 4))
16415 ;
16416 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
16417 ;
16418 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
16419 ;
16420 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
16421 ;
16422 else
16423 return false;
16424
16425 /* This is a blend, not a permute. Elements must stay in their
16426 respective lanes. */
16427 for (i = 0; i < nelt; ++i)
16428 {
16429 unsigned e = d->perm[i];
16430 if (!(e == i || e == i + nelt))
16431 return false;
16432 }
16433
16434 if (d->testing_p)
16435 return true;
16436
16437 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16438 decision should be extracted elsewhere, so that we only try that
16439 sequence once all budget==3 options have been tried. */
16440 target = d->target;
16441 op0 = d->op0;
16442 op1 = d->op1;
16443 mask = 0;
16444
16445 switch (vmode)
16446 {
16447 case E_V8DFmode:
16448 case E_V16SFmode:
16449 case E_V4DFmode:
16450 case E_V8SFmode:
16451 case E_V2DFmode:
16452 case E_V4SFmode:
16453 case E_V8HImode:
16454 case E_V8SImode:
16455 case E_V32HImode:
16456 case E_V64QImode:
16457 case E_V16SImode:
16458 case E_V8DImode:
16459 for (i = 0; i < nelt; ++i)
16460 mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
16461 break;
16462
16463 case E_V2DImode:
16464 for (i = 0; i < 2; ++i)
16465 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
16466 vmode = V8HImode;
16467 goto do_subreg;
16468
16469 case E_V4SImode:
16470 for (i = 0; i < 4; ++i)
16471 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16472 vmode = V8HImode;
16473 goto do_subreg;
16474
16475 case E_V16QImode:
16476 /* See if bytes move in pairs so we can use pblendw with
16477 an immediate argument, rather than pblendvb with a vector
16478 argument. */
16479 for (i = 0; i < 16; i += 2)
16480 if (d->perm[i] + 1 != d->perm[i + 1])
16481 {
16482 use_pblendvb:
16483 for (i = 0; i < nelt; ++i)
16484 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
16485
16486 finish_pblendvb:
16487 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16488 vperm = force_reg (vmode, vperm);
16489
16490 if (GET_MODE_SIZE (vmode) == 16)
16491 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
16492 else
16493 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
16494 if (target != d->target)
16495 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16496 return true;
16497 }
16498
16499 for (i = 0; i < 8; ++i)
16500 mask |= (d->perm[i * 2] >= 16) << i;
16501 vmode = V8HImode;
16502 /* FALLTHRU */
16503
16504 do_subreg:
16505 target = gen_reg_rtx (vmode);
16506 op0 = gen_lowpart (vmode, op0);
16507 op1 = gen_lowpart (vmode, op1);
16508 break;
16509
16510 case E_V32QImode:
16511 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16512 for (i = 0; i < 32; i += 2)
16513 if (d->perm[i] + 1 != d->perm[i + 1])
16514 goto use_pblendvb;
16515 /* See if bytes move in quadruplets. If yes, vpblendd
16516 with immediate can be used. */
16517 for (i = 0; i < 32; i += 4)
16518 if (d->perm[i] + 2 != d->perm[i + 2])
16519 break;
16520 if (i < 32)
16521 {
16522 /* See if bytes move the same in both lanes. If yes,
16523 vpblendw with immediate can be used. */
16524 for (i = 0; i < 16; i += 2)
16525 if (d->perm[i] + 16 != d->perm[i + 16])
16526 goto use_pblendvb;
16527
16528 /* Use vpblendw. */
16529 for (i = 0; i < 16; ++i)
16530 mask |= (d->perm[i * 2] >= 32) << i;
16531 vmode = V16HImode;
16532 goto do_subreg;
16533 }
16534
16535 /* Use vpblendd. */
16536 for (i = 0; i < 8; ++i)
16537 mask |= (d->perm[i * 4] >= 32) << i;
16538 vmode = V8SImode;
16539 goto do_subreg;
16540
16541 case E_V16HImode:
16542 /* See if words move in pairs. If yes, vpblendd can be used. */
16543 for (i = 0; i < 16; i += 2)
16544 if (d->perm[i] + 1 != d->perm[i + 1])
16545 break;
16546 if (i < 16)
16547 {
16548 /* See if words move the same in both lanes. If not,
16549 vpblendvb must be used. */
16550 for (i = 0; i < 8; i++)
16551 if (d->perm[i] + 8 != d->perm[i + 8])
16552 {
16553 /* Use vpblendvb. */
16554 for (i = 0; i < 32; ++i)
16555 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
16556
16557 vmode = V32QImode;
16558 nelt = 32;
16559 target = gen_reg_rtx (vmode);
16560 op0 = gen_lowpart (vmode, op0);
16561 op1 = gen_lowpart (vmode, op1);
16562 goto finish_pblendvb;
16563 }
16564
16565 /* Use vpblendw. */
16566 for (i = 0; i < 16; ++i)
16567 mask |= (d->perm[i] >= 16) << i;
16568 break;
16569 }
16570
16571 /* Use vpblendd. */
16572 for (i = 0; i < 8; ++i)
16573 mask |= (d->perm[i * 2] >= 16) << i;
16574 vmode = V8SImode;
16575 goto do_subreg;
16576
16577 case E_V4DImode:
16578 /* Use vpblendd. */
16579 for (i = 0; i < 4; ++i)
16580 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
16581 vmode = V8SImode;
16582 goto do_subreg;
16583
16584 default:
16585 gcc_unreachable ();
16586 }
16587
16588 switch (vmode)
16589 {
16590 case E_V8DFmode:
16591 case E_V8DImode:
16592 mmode = QImode;
16593 break;
16594 case E_V16SFmode:
16595 case E_V16SImode:
16596 mmode = HImode;
16597 break;
16598 case E_V32HImode:
16599 mmode = SImode;
16600 break;
16601 case E_V64QImode:
16602 mmode = DImode;
16603 break;
16604 default:
16605 mmode = VOIDmode;
16606 }
16607
16608 if (mmode != VOIDmode)
16609 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
16610 else
16611 maskop = GEN_INT (mask);
16612
16613 /* This matches five different patterns with the different modes. */
16614 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
16615 x = gen_rtx_SET (target, x);
16616 emit_insn (x);
16617 if (target != d->target)
16618 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16619
16620 return true;
16621 }
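
/* Editorial example: a two-operand V8SFmode blend with
   perm = { 0, 9, 2, 11, 4, 13, 6, 15 } takes the odd elements from the
   second operand, so the loop above produces mask = 0xaa and the
   vec_merge should be matched as a vblendps with immediate 0xaa.
   Permutations that move an element to a different position are
   rejected before this point.  */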
16622
16623 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16624 in terms of the variable form of vpermilps.
16625
16626 Note that we will have already failed the immediate input vpermilps,
16627 which requires that the high and low part shuffle be identical; the
16628 variable form doesn't require that. */
16629
16630 static bool
16631 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
16632 {
16633 rtx rperm[8], vperm;
16634 unsigned i;
16635
16636 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
16637 return false;
16638
16639 /* We can only permute within the 128-bit lane. */
16640 for (i = 0; i < 8; ++i)
16641 {
16642 unsigned e = d->perm[i];
16643 if (i < 4 ? e >= 4 : e < 4)
16644 return false;
16645 }
16646
16647 if (d->testing_p)
16648 return true;
16649
16650 for (i = 0; i < 8; ++i)
16651 {
16652 unsigned e = d->perm[i];
16653
16654 /* Within each 128-bit lane, the elements of op0 are numbered
16655 from 0 and the elements of op1 are numbered from 4. */
16656 if (e >= 8 + 4)
16657 e -= 8;
16658 else if (e >= 4)
16659 e -= 4;
16660
16661 rperm[i] = GEN_INT (e);
16662 }
16663
16664 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
16665 vperm = force_reg (V8SImode, vperm);
16666 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
16667
16668 return true;
16669 }
16670
16671 /* Return true if permutation D can be performed as VMODE permutation
16672 instead. */
16673
16674 static bool
16675 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
16676 {
16677 unsigned int i, j, chunk;
16678
16679 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
16680 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
16681 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
16682 return false;
16683
16684 if (GET_MODE_NUNITS (vmode) >= d->nelt)
16685 return true;
16686
16687 chunk = d->nelt / GET_MODE_NUNITS (vmode);
16688 for (i = 0; i < d->nelt; i += chunk)
16689 if (d->perm[i] & (chunk - 1))
16690 return false;
16691 else
16692 for (j = 1; j < chunk; ++j)
16693 if (d->perm[i] + j != d->perm[i + j])
16694 return false;
16695
16696 return true;
16697 }
16698
16699 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16700 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
16701
16702 static bool
16703 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
16704 {
16705 unsigned i, nelt, eltsz, mask;
16706 unsigned char perm[64];
16707 machine_mode vmode = V16QImode;
16708 rtx rperm[64], vperm, target, op0, op1;
16709
16710 nelt = d->nelt;
16711
16712 if (!d->one_operand_p)
16713 {
16714 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
16715 {
16716 if (TARGET_AVX2
16717 && valid_perm_using_mode_p (V2TImode, d))
16718 {
16719 if (d->testing_p)
16720 return true;
16721
16722 /* Use vperm2i128 insn. The pattern uses
16723 V4DImode instead of V2TImode. */
16724 target = d->target;
16725 if (d->vmode != V4DImode)
16726 target = gen_reg_rtx (V4DImode);
16727 op0 = gen_lowpart (V4DImode, d->op0);
16728 op1 = gen_lowpart (V4DImode, d->op1);
16729 rperm[0]
16730 = GEN_INT ((d->perm[0] / (nelt / 2))
16731 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
16732 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
16733 if (target != d->target)
16734 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16735 return true;
16736 }
16737 return false;
16738 }
16739 }
16740 else
16741 {
16742 if (GET_MODE_SIZE (d->vmode) == 16)
16743 {
16744 if (!TARGET_SSSE3)
16745 return false;
16746 }
16747 else if (GET_MODE_SIZE (d->vmode) == 32)
16748 {
16749 if (!TARGET_AVX2)
16750 return false;
16751
16752 /* V4DImode should be already handled through
16753 expand_vselect by vpermq instruction. */
16754 gcc_assert (d->vmode != V4DImode);
16755
16756 vmode = V32QImode;
16757 if (d->vmode == V8SImode
16758 || d->vmode == V16HImode
16759 || d->vmode == V32QImode)
16760 {
16761 /* First see if vpermq can be used for
16762 V8SImode/V16HImode/V32QImode. */
16763 if (valid_perm_using_mode_p (V4DImode, d))
16764 {
16765 for (i = 0; i < 4; i++)
16766 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
16767 if (d->testing_p)
16768 return true;
16769 target = gen_reg_rtx (V4DImode);
16770 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
16771 perm, 4, false))
16772 {
16773 emit_move_insn (d->target,
16774 gen_lowpart (d->vmode, target));
16775 return true;
16776 }
16777 return false;
16778 }
16779
16780 /* Next see if vpermd can be used. */
16781 if (valid_perm_using_mode_p (V8SImode, d))
16782 vmode = V8SImode;
16783 }
16784 /* Or if vpermps can be used. */
16785 else if (d->vmode == V8SFmode)
16786 vmode = V8SImode;
16787
16788 if (vmode == V32QImode)
16789 {
16790 /* vpshufb only works intra lanes; it is not
16791 possible to shuffle bytes between the lanes. */
16792 for (i = 0; i < nelt; ++i)
16793 if ((d->perm[i] ^ i) & (nelt / 2))
16794 return false;
16795 }
16796 }
16797 else if (GET_MODE_SIZE (d->vmode) == 64)
16798 {
16799 if (!TARGET_AVX512BW)
16800 return false;
16801
16802 /* If vpermq didn't work, vpshufb won't work either. */
16803 if (d->vmode == V8DFmode || d->vmode == V8DImode)
16804 return false;
16805
16806 vmode = V64QImode;
16807 if (d->vmode == V16SImode
16808 || d->vmode == V32HImode
16809 || d->vmode == V64QImode)
16810 {
16811 /* First see if vpermq can be used for
16812 V16SImode/V32HImode/V64QImode. */
16813 if (valid_perm_using_mode_p (V8DImode, d))
16814 {
16815 for (i = 0; i < 8; i++)
16816 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
16817 if (d->testing_p)
16818 return true;
16819 target = gen_reg_rtx (V8DImode);
16820 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
16821 perm, 8, false))
16822 {
16823 emit_move_insn (d->target,
16824 gen_lowpart (d->vmode, target));
16825 return true;
16826 }
16827 return false;
16828 }
16829
16830 /* Next see if vpermd can be used. */
16831 if (valid_perm_using_mode_p (V16SImode, d))
16832 vmode = V16SImode;
16833 }
16834 /* Or if vpermps can be used. */
16835 else if (d->vmode == V16SFmode)
16836 vmode = V16SImode;
16837 if (vmode == V64QImode)
16838 {
16839 /* vpshufb only works intra lanes; it is not
16840 possible to shuffle bytes between the lanes. */
16841 for (i = 0; i < nelt; ++i)
16842 if ((d->perm[i] ^ i) & (nelt / 4))
16843 return false;
16844 }
16845 }
16846 else
16847 return false;
16848 }
16849
16850 if (d->testing_p)
16851 return true;
16852
16853 if (vmode == V8SImode)
16854 for (i = 0; i < 8; ++i)
16855 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
16856 else if (vmode == V16SImode)
16857 for (i = 0; i < 16; ++i)
16858 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
16859 else
16860 {
16861 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
16862 if (!d->one_operand_p)
16863 mask = 2 * nelt - 1;
16864 else if (vmode == V16QImode)
16865 mask = nelt - 1;
16866 else if (vmode == V64QImode)
16867 mask = nelt / 4 - 1;
16868 else
16869 mask = nelt / 2 - 1;
16870
16871 for (i = 0; i < nelt; ++i)
16872 {
16873 unsigned j, e = d->perm[i] & mask;
16874 for (j = 0; j < eltsz; ++j)
16875 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
16876 }
16877 }
16878
16879 vperm = gen_rtx_CONST_VECTOR (vmode,
16880 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
16881 vperm = force_reg (vmode, vperm);
16882
16883 target = d->target;
16884 if (d->vmode != vmode)
16885 target = gen_reg_rtx (vmode);
16886 op0 = gen_lowpart (vmode, d->op0);
16887 if (d->one_operand_p)
16888 {
16889 if (vmode == V16QImode)
16890 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
16891 else if (vmode == V32QImode)
16892 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
16893 else if (vmode == V64QImode)
16894 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
16895 else if (vmode == V8SFmode)
16896 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
16897 else if (vmode == V8SImode)
16898 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
16899 else if (vmode == V16SFmode)
16900 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
16901 else if (vmode == V16SImode)
16902 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
16903 else
16904 gcc_unreachable ();
16905 }
16906 else
16907 {
16908 op1 = gen_lowpart (vmode, d->op1);
16909 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
16910 }
16911 if (target != d->target)
16912 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
16913
16914 return true;
16915 }
16916
16917 /* For V*[QHS]Imode permutations, check whether the same permutation
16918 can be performed in a 2x, 4x or 8x wider inner mode. */
16919
16920 static bool
16921 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
16922 struct expand_vec_perm_d *nd)
16923 {
16924 int i;
16925 machine_mode mode = VOIDmode;
16926
16927 switch (d->vmode)
16928 {
16929 case E_V16QImode: mode = V8HImode; break;
16930 case E_V32QImode: mode = V16HImode; break;
16931 case E_V64QImode: mode = V32HImode; break;
16932 case E_V8HImode: mode = V4SImode; break;
16933 case E_V16HImode: mode = V8SImode; break;
16934 case E_V32HImode: mode = V16SImode; break;
16935 case E_V4SImode: mode = V2DImode; break;
16936 case E_V8SImode: mode = V4DImode; break;
16937 case E_V16SImode: mode = V8DImode; break;
16938 default: return false;
16939 }
16940 for (i = 0; i < d->nelt; i += 2)
16941 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
16942 return false;
16943 nd->vmode = mode;
16944 nd->nelt = d->nelt / 2;
16945 for (i = 0; i < nd->nelt; i++)
16946 nd->perm[i] = d->perm[2 * i] / 2;
16947 if (GET_MODE_INNER (mode) != DImode)
16948 canonicalize_vector_int_perm (nd, nd);
16949 if (nd != d)
16950 {
16951 nd->one_operand_p = d->one_operand_p;
16952 nd->testing_p = d->testing_p;
16953 if (d->op0 == d->op1)
16954 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
16955 else
16956 {
16957 nd->op0 = gen_lowpart (nd->vmode, d->op0);
16958 nd->op1 = gen_lowpart (nd->vmode, d->op1);
16959 }
16960 if (d->testing_p)
16961 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
16962 else
16963 nd->target = gen_reg_rtx (nd->vmode);
16964 }
16965 return true;
16966 }
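
/* Editorial illustration, not used by the compiler: a minimal sketch of the
   pair-collapsing step above, written on plain arrays; the function and
   parameter names are invented for this sketch.  A permutation such as
   {2,3, 0,1, 6,7, 4,5} on byte elements keeps each even/odd pair together,
   so it is equivalent to {1, 0, 3, 2} on elements twice as wide.  */

static bool
example_collapse_pairs (const unsigned char *perm, int nelt,
                        unsigned char *wider_perm)
{
  for (int i = 0; i < nelt; i += 2)
    {
      /* Each pair must start at an even source index and be consecutive.  */
      if ((perm[i] & 1) || perm[i + 1] != perm[i] + 1)
        return false;
      wider_perm[i / 2] = perm[i] / 2;
    }
  return true;
}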
16967
16968 /* Try to expand a one-operand permutation with a constant mask. */
16969
16970 static bool
16971 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
16972 {
16973 machine_mode mode = GET_MODE (d->op0);
16974 machine_mode maskmode = mode;
16975 rtx (*gen) (rtx, rtx, rtx) = NULL;
16976 rtx target, op0, mask;
16977 rtx vec[64];
16978
16979 if (!rtx_equal_p (d->op0, d->op1))
16980 return false;
16981
16982 if (!TARGET_AVX512F)
16983 return false;
16984
16985 switch (mode)
16986 {
16987 case E_V16SImode:
16988 gen = gen_avx512f_permvarv16si;
16989 break;
16990 case E_V16SFmode:
16991 gen = gen_avx512f_permvarv16sf;
16992 maskmode = V16SImode;
16993 break;
16994 case E_V8DImode:
16995 gen = gen_avx512f_permvarv8di;
16996 break;
16997 case E_V8DFmode:
16998 gen = gen_avx512f_permvarv8df;
16999 maskmode = V8DImode;
17000 break;
17001 default:
17002 return false;
17003 }
17004
17005 target = d->target;
17006 op0 = d->op0;
17007 for (int i = 0; i < d->nelt; ++i)
17008 vec[i] = GEN_INT (d->perm[i]);
17009 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
17010 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
17011 return true;
17012 }
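
/* Editorial illustration, not used by the compiler: a scalar model of the
   variable-index permute (vpermd/vpermq and friends) emitted above.  Each
   destination element selects a source element by the low bits of the
   corresponding mask index; names are invented for this sketch.  */

static void
example_permvar (unsigned *dst, const unsigned *src,
                 const unsigned *idx, unsigned nelt)
{
  for (unsigned i = 0; i < nelt; i++)
    dst[i] = src[idx[i] & (nelt - 1)];
}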
17013
17014 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
17015
17016 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
17017 in a single instruction. */
17018
17019 static bool
17020 expand_vec_perm_1 (struct expand_vec_perm_d *d)
17021 {
17022 unsigned i, nelt = d->nelt;
17023 struct expand_vec_perm_d nd;
17024
17025 /* Check plain VEC_SELECT first, because AVX has instructions that could
17026 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17027 input where SEL+CONCAT may not. */
17028 if (d->one_operand_p)
17029 {
17030 int mask = nelt - 1;
17031 bool identity_perm = true;
17032 bool broadcast_perm = true;
17033
17034 for (i = 0; i < nelt; i++)
17035 {
17036 nd.perm[i] = d->perm[i] & mask;
17037 if (nd.perm[i] != i)
17038 identity_perm = false;
17039 if (nd.perm[i])
17040 broadcast_perm = false;
17041 }
17042
17043 if (identity_perm)
17044 {
17045 if (!d->testing_p)
17046 emit_move_insn (d->target, d->op0);
17047 return true;
17048 }
17049 else if (broadcast_perm && TARGET_AVX2)
17050 {
17051 /* Use vpbroadcast{b,w,d}. */
17052 rtx (*gen) (rtx, rtx) = NULL;
17053 switch (d->vmode)
17054 {
17055 case E_V64QImode:
17056 if (TARGET_AVX512BW)
17057 gen = gen_avx512bw_vec_dupv64qi_1;
17058 break;
17059 case E_V32QImode:
17060 gen = gen_avx2_pbroadcastv32qi_1;
17061 break;
17062 case E_V32HImode:
17063 if (TARGET_AVX512BW)
17064 gen = gen_avx512bw_vec_dupv32hi_1;
17065 break;
17066 case E_V16HImode:
17067 gen = gen_avx2_pbroadcastv16hi_1;
17068 break;
17069 case E_V16SImode:
17070 if (TARGET_AVX512F)
17071 gen = gen_avx512f_vec_dupv16si_1;
17072 break;
17073 case E_V8SImode:
17074 gen = gen_avx2_pbroadcastv8si_1;
17075 break;
17076 case E_V16QImode:
17077 gen = gen_avx2_pbroadcastv16qi;
17078 break;
17079 case E_V8HImode:
17080 gen = gen_avx2_pbroadcastv8hi;
17081 break;
17082 case E_V16SFmode:
17083 if (TARGET_AVX512F)
17084 gen = gen_avx512f_vec_dupv16sf_1;
17085 break;
17086 case E_V8SFmode:
17087 gen = gen_avx2_vec_dupv8sf_1;
17088 break;
17089 case E_V8DFmode:
17090 if (TARGET_AVX512F)
17091 gen = gen_avx512f_vec_dupv8df_1;
17092 break;
17093 case E_V8DImode:
17094 if (TARGET_AVX512F)
17095 gen = gen_avx512f_vec_dupv8di_1;
17096 break;
17097 /* For other modes prefer other shuffles this function creates. */
17098 default: break;
17099 }
17100 if (gen != NULL)
17101 {
17102 if (!d->testing_p)
17103 emit_insn (gen (d->target, d->op0));
17104 return true;
17105 }
17106 }
17107
17108 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
17109 return true;
17110
17111 /* There are plenty of patterns in sse.md that are written for
17112 SEL+CONCAT and are not replicated for a single op. Perhaps
17113 that should be changed, to avoid the nastiness here. */
17114
17115 /* Recognize interleave style patterns, which means incrementing
17116 every other permutation operand. */
17117 for (i = 0; i < nelt; i += 2)
17118 {
17119 nd.perm[i] = d->perm[i] & mask;
17120 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
17121 }
17122 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17123 d->testing_p))
17124 return true;
17125
17126 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17127 if (nelt >= 4)
17128 {
17129 for (i = 0; i < nelt; i += 4)
17130 {
17131 nd.perm[i + 0] = d->perm[i + 0] & mask;
17132 nd.perm[i + 1] = d->perm[i + 1] & mask;
17133 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
17134 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
17135 }
17136
17137 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
17138 d->testing_p))
17139 return true;
17140 }
17141 }
17142
17143 /* Try movss/movsd instructions. */
17144 if (expand_vec_perm_movs (d))
17145 return true;
17146
17147 /* Finally, try the fully general two operand permute. */
17148 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
17149 d->testing_p))
17150 return true;
17151
17152 /* Recognize interleave style patterns with reversed operands. */
17153 if (!d->one_operand_p)
17154 {
17155 for (i = 0; i < nelt; ++i)
17156 {
17157 unsigned e = d->perm[i];
17158 if (e >= nelt)
17159 e -= nelt;
17160 else
17161 e += nelt;
17162 nd.perm[i] = e;
17163 }
17164
17165 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
17166 d->testing_p))
17167 return true;
17168 }
17169
17170 /* Try the SSE4.1 blend variable merge instructions. */
17171 if (expand_vec_perm_blend (d))
17172 return true;
17173
17174 /* Try one of the AVX vpermil variable permutations. */
17175 if (expand_vec_perm_vpermil (d))
17176 return true;
17177
17178 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17179 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17180 if (expand_vec_perm_pshufb (d))
17181 return true;
17182
17183 /* Try the AVX2 vpalignr instruction. */
17184 if (expand_vec_perm_palignr (d, true))
17185 return true;
17186
17187 /* Try the AVX512F vperm{s,d} instructions. */
17188 if (ix86_expand_vec_one_operand_perm_avx512 (d))
17189 return true;
17190
17191 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17192 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
17193 return true;
17194
17195 /* See if we can get the same permutation in different vector integer
17196 mode. */
17197 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
17198 {
17199 if (!d->testing_p)
17200 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
17201 return true;
17202 }
17203 return false;
17204 }
17205
17206 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
17207 in terms of a pair of pshuflw + pshufhw instructions. */
17208
17209 static bool
17210 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
17211 {
17212 unsigned char perm2[MAX_VECT_LEN];
17213 unsigned i;
17214 bool ok;
17215
17216 if (d->vmode != V8HImode || !d->one_operand_p)
17217 return false;
17218
17219 /* The two permutations only operate in 64-bit lanes. */
17220 for (i = 0; i < 4; ++i)
17221 if (d->perm[i] >= 4)
17222 return false;
17223 for (i = 4; i < 8; ++i)
17224 if (d->perm[i] < 4)
17225 return false;
17226
17227 if (d->testing_p)
17228 return true;
17229
17230 /* Emit the pshuflw. */
17231 memcpy (perm2, d->perm, 4);
17232 for (i = 4; i < 8; ++i)
17233 perm2[i] = i;
17234 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
17235 gcc_assert (ok);
17236
17237 /* Emit the pshufhw. */
17238 memcpy (perm2 + 4, d->perm + 4, 4);
17239 for (i = 0; i < 4; ++i)
17240 perm2[i] = i;
17241 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
17242 gcc_assert (ok);
17243
17244 return true;
17245 }
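
/* Editorial illustration, not used by the compiler: how the two shuffles
   above pack into the pshuflw and pshufhw immediates, two bits per word
   within the relevant 64-bit half.  For perm = {2,1,0,3, 5,7,6,4} this
   yields 0xc6 for pshuflw and 0x2d for pshufhw.  Names are invented for
   this sketch.  */

static void
example_pshuflw_pshufhw_imm (const unsigned char perm[8],
                             unsigned *imm_low, unsigned *imm_high)
{
  *imm_low = *imm_high = 0;
  for (int i = 0; i < 4; i++)
    {
      *imm_low |= (perm[i] & 3) << (2 * i);            /* low 64-bit half  */
      *imm_high |= ((perm[i + 4] - 4) & 3) << (2 * i); /* high 64-bit half */
    }
}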
17246
17247 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17248 the permutation using the SSSE3 palignr instruction. This succeeds
17249 when all of the elements in PERM fit within one vector and we merely
17250 need to shift them down so that a single vector permutation has a
17251 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
17252 the vpalignr instruction itself can perform the requested permutation. */
17253
17254 static bool
17255 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
17256 {
17257 unsigned i, nelt = d->nelt;
17258 unsigned min, max, minswap, maxswap;
17259 bool in_order, ok, swap = false;
17260 rtx shift, target;
17261 struct expand_vec_perm_d dcopy;
17262
17263 /* Even with AVX, palignr only operates on 128-bit vectors;
17264 with AVX2 it operates within each of the two 128-bit lanes. */
17265 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
17266 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
17267 return false;
17268
17269 min = 2 * nelt;
17270 max = 0;
17271 minswap = 2 * nelt;
17272 maxswap = 0;
17273 for (i = 0; i < nelt; ++i)
17274 {
17275 unsigned e = d->perm[i];
17276 unsigned eswap = d->perm[i] ^ nelt;
17277 if (GET_MODE_SIZE (d->vmode) == 32)
17278 {
17279 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
17280 eswap = e ^ (nelt / 2);
17281 }
17282 if (e < min)
17283 min = e;
17284 if (e > max)
17285 max = e;
17286 if (eswap < minswap)
17287 minswap = eswap;
17288 if (eswap > maxswap)
17289 maxswap = eswap;
17290 }
17291 if (min == 0
17292 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
17293 {
17294 if (d->one_operand_p
17295 || minswap == 0
17296 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
17297 ? nelt / 2 : nelt))
17298 return false;
17299 swap = true;
17300 min = minswap;
17301 max = maxswap;
17302 }
17303
17304 /* Given that we have SSSE3, we know we'll be able to implement the
17305 single operand permutation after the palignr with pshufb for
17306 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17307 first. */
17308 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
17309 return true;
17310
17311 dcopy = *d;
17312 if (swap)
17313 {
17314 dcopy.op0 = d->op1;
17315 dcopy.op1 = d->op0;
17316 for (i = 0; i < nelt; ++i)
17317 dcopy.perm[i] ^= nelt;
17318 }
17319
17320 in_order = true;
17321 for (i = 0; i < nelt; ++i)
17322 {
17323 unsigned e = dcopy.perm[i];
17324 if (GET_MODE_SIZE (d->vmode) == 32
17325 && e >= nelt
17326 && (e & (nelt / 2 - 1)) < min)
17327 e = e - min - (nelt / 2);
17328 else
17329 e = e - min;
17330 if (e != i)
17331 in_order = false;
17332 dcopy.perm[i] = e;
17333 }
17334 dcopy.one_operand_p = true;
17335
17336 if (single_insn_only_p && !in_order)
17337 return false;
17338
17339 /* For AVX2, test whether we can permute the result in one instruction. */
17340 if (d->testing_p)
17341 {
17342 if (in_order)
17343 return true;
17344 dcopy.op1 = dcopy.op0;
17345 return expand_vec_perm_1 (&dcopy);
17346 }
17347
17348 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
17349 if (GET_MODE_SIZE (d->vmode) == 16)
17350 {
17351 target = gen_reg_rtx (TImode);
17352 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
17353 gen_lowpart (TImode, dcopy.op0), shift));
17354 }
17355 else
17356 {
17357 target = gen_reg_rtx (V2TImode);
17358 emit_insn (gen_avx2_palignrv2ti (target,
17359 gen_lowpart (V2TImode, dcopy.op1),
17360 gen_lowpart (V2TImode, dcopy.op0),
17361 shift));
17362 }
17363
17364 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
17365
17366 /* Test for the degenerate case where the alignment by itself
17367 produces the desired permutation. */
17368 if (in_order)
17369 {
17370 emit_move_insn (d->target, dcopy.op0);
17371 return true;
17372 }
17373
17374 ok = expand_vec_perm_1 (&dcopy);
17375 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
17376
17377 return ok;
17378 }
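
/* Editorial illustration, not used by the compiler: a byte-level model of
   128-bit palignr as relied on above.  The two sources are concatenated,
   shifted right by SHIFT bytes, and the low 16 bytes are kept; with both
   sources equal this is a rotation, which is why shifting by MIN elements
   leaves only a one-operand shuffle to finish.  Names are invented for
   this sketch.  */

static void
example_palignr128 (unsigned char dst[16], const unsigned char hi[16],
                    const unsigned char lo[16], unsigned shift)
{
  for (unsigned i = 0; i < 16; i++)
    {
      unsigned j = i + shift;
      dst[i] = j < 16 ? lo[j] : (j < 32 ? hi[j - 16] : 0);
    }
}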
17379
17380 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17381 the permutation using the SSE4_1 pblendv instruction. Potentially
17382 reduces the permutation from 2 pshufb insns and an ior to 1 pshufb and a pblendv. */
17383
17384 static bool
17385 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
17386 {
17387 unsigned i, which, nelt = d->nelt;
17388 struct expand_vec_perm_d dcopy, dcopy1;
17389 machine_mode vmode = d->vmode;
17390 bool ok;
17391
17392 /* Use the same checks as in expand_vec_perm_blend. */
17393 if (d->one_operand_p)
17394 return false;
17395 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
17396 ;
17397 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
17398 ;
17399 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
17400 ;
17401 else
17402 return false;
17403
17404 /* Figure out which permutation elements do not stay in their
17405 respective lanes. */
17406 for (i = 0, which = 0; i < nelt; ++i)
17407 {
17408 unsigned e = d->perm[i];
17409 if (e != i)
17410 which |= (e < nelt ? 1 : 2);
17411 }
17412 /* We can pblend the part where elements do not stay in their
17413 respective lanes only when these elements all come from one
17414 half of the permutation.
17415 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not at their respective
17416 positions, but both are >= 8.
17417 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not at their
17418 respective positions, and 8 >= 8 but 2 is not. */
17419 if (which != 1 && which != 2)
17420 return false;
17421 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
17422 return true;
17423
17424 /* First we apply one operand permutation to the part where
17425 elements stay not in their respective lanes. */
17426 dcopy = *d;
17427 if (which == 2)
17428 dcopy.op0 = dcopy.op1 = d->op1;
17429 else
17430 dcopy.op0 = dcopy.op1 = d->op0;
17431 if (!d->testing_p)
17432 dcopy.target = gen_reg_rtx (vmode);
17433 dcopy.one_operand_p = true;
17434
17435 for (i = 0; i < nelt; ++i)
17436 dcopy.perm[i] = d->perm[i] & (nelt - 1);
17437
17438 ok = expand_vec_perm_1 (&dcopy);
17439 if (GET_MODE_SIZE (vmode) != 16 && !ok)
17440 return false;
17441 else
17442 gcc_assert (ok);
17443 if (d->testing_p)
17444 return true;
17445
17446 /* Next we put permuted elements into their positions. */
17447 dcopy1 = *d;
17448 if (which == 2)
17449 dcopy1.op1 = dcopy.target;
17450 else
17451 dcopy1.op0 = dcopy.target;
17452
17453 for (i = 0; i < nelt; ++i)
17454 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
17455
17456 ok = expand_vec_perm_blend (&dcopy1);
17457 gcc_assert (ok);
17458
17459 return true;
17460 }
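
/* Editorial illustration, not used by the compiler: the two-step
   decomposition above on plain arrays, for the WHICH == 2 case where all
   displaced elements come from the second operand.  A one-operand shuffle
   first gathers those elements, then a blend keeps, per position, either
   the untouched first operand or the shuffled vector.  Names are invented
   for this sketch.  */

static void
example_shuffle_then_blend (unsigned char *dst, const unsigned char *op0,
                            const unsigned char *op1,
                            const unsigned *perm, unsigned nelt)
{
  unsigned char shuffled[32];

  /* One-operand shuffle of op1 (cf. dcopy above).  */
  for (unsigned i = 0; i < nelt; i++)
    shuffled[i] = op1[perm[i] & (nelt - 1)];

  /* Blend (cf. dcopy1 above): selectors >= nelt take the shuffled value,
     the remaining positions keep op0 in place.  */
  for (unsigned i = 0; i < nelt; i++)
    dst[i] = perm[i] >= nelt ? shuffled[i] : op0[i];
}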
17461
17462 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
17463
17464 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17465 a two vector permutation into a single vector permutation by using
17466 an interleave operation to merge the vectors. */
17467
17468 static bool
17469 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
17470 {
17471 struct expand_vec_perm_d dremap, dfinal;
17472 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
17473 unsigned HOST_WIDE_INT contents;
17474 unsigned char remap[2 * MAX_VECT_LEN];
17475 rtx_insn *seq;
17476 bool ok, same_halves = false;
17477
17478 if (GET_MODE_SIZE (d->vmode) == 16)
17479 {
17480 if (d->one_operand_p)
17481 return false;
17482 }
17483 else if (GET_MODE_SIZE (d->vmode) == 32)
17484 {
17485 if (!TARGET_AVX)
17486 return false;
17487 /* For 32-byte modes allow even d->one_operand_p.
17488 The lack of cross-lane shuffling in some instructions
17489 might prevent a single insn shuffle. */
17490 dfinal = *d;
17491 dfinal.testing_p = true;
17492 /* If expand_vec_perm_interleave3 can expand this into
17493 a 3 insn sequence, give up and let it be expanded as
17494 a 3 insn sequence. While that is one insn longer,
17495 it doesn't need a memory operand, and in the common
17496 case where the interleave low and interleave high
17497 permutations with the same operands are adjacent, the
17498 pair needs only 4 insns after CSE. */
17499 if (expand_vec_perm_interleave3 (&dfinal))
17500 return false;
17501 }
17502 else
17503 return false;
17504
17505 /* Examine from whence the elements come. */
17506 contents = 0;
17507 for (i = 0; i < nelt; ++i)
17508 contents |= HOST_WIDE_INT_1U << d->perm[i];
17509
17510 memset (remap, 0xff, sizeof (remap));
17511 dremap = *d;
17512
17513 if (GET_MODE_SIZE (d->vmode) == 16)
17514 {
17515 unsigned HOST_WIDE_INT h1, h2, h3, h4;
17516
17517 /* Split the two input vectors into 4 halves. */
17518 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
17519 h2 = h1 << nelt2;
17520 h3 = h2 << nelt2;
17521 h4 = h3 << nelt2;
17522
17523 /* If the elements come from the low halves, use interleave low; similarly
17524 for interleave high. If the elements are from mis-matched halves, we
17525 can use shufps for V4SF/V4SI or do a DImode shuffle. */
17526 if ((contents & (h1 | h3)) == contents)
17527 {
17528 /* punpckl* */
17529 for (i = 0; i < nelt2; ++i)
17530 {
17531 remap[i] = i * 2;
17532 remap[i + nelt] = i * 2 + 1;
17533 dremap.perm[i * 2] = i;
17534 dremap.perm[i * 2 + 1] = i + nelt;
17535 }
17536 if (!TARGET_SSE2 && d->vmode == V4SImode)
17537 dremap.vmode = V4SFmode;
17538 }
17539 else if ((contents & (h2 | h4)) == contents)
17540 {
17541 /* punpckh* */
17542 for (i = 0; i < nelt2; ++i)
17543 {
17544 remap[i + nelt2] = i * 2;
17545 remap[i + nelt + nelt2] = i * 2 + 1;
17546 dremap.perm[i * 2] = i + nelt2;
17547 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
17548 }
17549 if (!TARGET_SSE2 && d->vmode == V4SImode)
17550 dremap.vmode = V4SFmode;
17551 }
17552 else if ((contents & (h1 | h4)) == contents)
17553 {
17554 /* shufps */
17555 for (i = 0; i < nelt2; ++i)
17556 {
17557 remap[i] = i;
17558 remap[i + nelt + nelt2] = i + nelt2;
17559 dremap.perm[i] = i;
17560 dremap.perm[i + nelt2] = i + nelt + nelt2;
17561 }
17562 if (nelt != 4)
17563 {
17564 /* shufpd */
17565 dremap.vmode = V2DImode;
17566 dremap.nelt = 2;
17567 dremap.perm[0] = 0;
17568 dremap.perm[1] = 3;
17569 }
17570 }
17571 else if ((contents & (h2 | h3)) == contents)
17572 {
17573 /* shufps */
17574 for (i = 0; i < nelt2; ++i)
17575 {
17576 remap[i + nelt2] = i;
17577 remap[i + nelt] = i + nelt2;
17578 dremap.perm[i] = i + nelt2;
17579 dremap.perm[i + nelt2] = i + nelt;
17580 }
17581 if (nelt != 4)
17582 {
17583 /* shufpd */
17584 dremap.vmode = V2DImode;
17585 dremap.nelt = 2;
17586 dremap.perm[0] = 1;
17587 dremap.perm[1] = 2;
17588 }
17589 }
17590 else
17591 return false;
17592 }
17593 else
17594 {
17595 unsigned int nelt4 = nelt / 4, nzcnt = 0;
17596 unsigned HOST_WIDE_INT q[8];
17597 unsigned int nonzero_halves[4];
17598
17599 /* Split the two input vectors into 8 quarters. */
17600 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
17601 for (i = 1; i < 8; ++i)
17602 q[i] = q[0] << (nelt4 * i);
17603 for (i = 0; i < 4; ++i)
17604 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
17605 {
17606 nonzero_halves[nzcnt] = i;
17607 ++nzcnt;
17608 }
17609
17610 if (nzcnt == 1)
17611 {
17612 gcc_assert (d->one_operand_p);
17613 nonzero_halves[1] = nonzero_halves[0];
17614 same_halves = true;
17615 }
17616 else if (d->one_operand_p)
17617 {
17618 gcc_assert (nonzero_halves[0] == 0);
17619 gcc_assert (nonzero_halves[1] == 1);
17620 }
17621
17622 if (nzcnt <= 2)
17623 {
17624 if (d->perm[0] / nelt2 == nonzero_halves[1])
17625 {
17626 /* Attempt to increase the likelihood that dfinal
17627 shuffle will be intra-lane. */
17628 std::swap (nonzero_halves[0], nonzero_halves[1]);
17629 }
17630
17631 /* vperm2f128 or vperm2i128. */
17632 for (i = 0; i < nelt2; ++i)
17633 {
17634 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
17635 remap[i + nonzero_halves[0] * nelt2] = i;
17636 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
17637 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
17638 }
17639
17640 if (d->vmode != V8SFmode
17641 && d->vmode != V4DFmode
17642 && d->vmode != V8SImode)
17643 {
17644 dremap.vmode = V8SImode;
17645 dremap.nelt = 8;
17646 for (i = 0; i < 4; ++i)
17647 {
17648 dremap.perm[i] = i + nonzero_halves[0] * 4;
17649 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
17650 }
17651 }
17652 }
17653 else if (d->one_operand_p)
17654 return false;
17655 else if (TARGET_AVX2
17656 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
17657 {
17658 /* vpunpckl* */
17659 for (i = 0; i < nelt4; ++i)
17660 {
17661 remap[i] = i * 2;
17662 remap[i + nelt] = i * 2 + 1;
17663 remap[i + nelt2] = i * 2 + nelt2;
17664 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
17665 dremap.perm[i * 2] = i;
17666 dremap.perm[i * 2 + 1] = i + nelt;
17667 dremap.perm[i * 2 + nelt2] = i + nelt2;
17668 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
17669 }
17670 }
17671 else if (TARGET_AVX2
17672 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
17673 {
17674 /* vpunpckh* */
17675 for (i = 0; i < nelt4; ++i)
17676 {
17677 remap[i + nelt4] = i * 2;
17678 remap[i + nelt + nelt4] = i * 2 + 1;
17679 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
17680 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
17681 dremap.perm[i * 2] = i + nelt4;
17682 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
17683 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
17684 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
17685 }
17686 }
17687 else
17688 return false;
17689 }
17690
17691 /* Use the remapping array set up above to move the elements from their
17692 swizzled locations into their final destinations. */
17693 dfinal = *d;
17694 for (i = 0; i < nelt; ++i)
17695 {
17696 unsigned e = remap[d->perm[i]];
17697 gcc_assert (e < nelt);
17698 /* If same_halves is true, both halves of the remapped vector are the
17699 same. Avoid cross-lane accesses if possible. */
17700 if (same_halves && i >= nelt2)
17701 {
17702 gcc_assert (e < nelt2);
17703 dfinal.perm[i] = e + nelt2;
17704 }
17705 else
17706 dfinal.perm[i] = e;
17707 }
17708 if (!d->testing_p)
17709 {
17710 dremap.target = gen_reg_rtx (dremap.vmode);
17711 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17712 }
17713 dfinal.op1 = dfinal.op0;
17714 dfinal.one_operand_p = true;
17715
17716 /* Test if the final remap can be done with a single insn. For V4SFmode or
17717 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17718 start_sequence ();
17719 ok = expand_vec_perm_1 (&dfinal);
17720 seq = get_insns ();
17721 end_sequence ();
17722
17723 if (!ok)
17724 return false;
17725
17726 if (d->testing_p)
17727 return true;
17728
17729 if (dremap.vmode != dfinal.vmode)
17730 {
17731 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
17732 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
17733 }
17734
17735 ok = expand_vec_perm_1 (&dremap);
17736 gcc_assert (ok);
17737
17738 emit_insn (seq);
17739 return true;
17740 }
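
/* Editorial illustration, not used by the compiler: the interleave
   operations the remapping above targets.  punpckl* merges the low halves
   of two vectors element by element, punpckh* the high halves; the remap
   array records where each original element ends up so the final shuffle
   can be rewritten against the interleaved vector.  Names are invented
   for this sketch.  */

static void
example_interleave (unsigned char *dst, const unsigned char *a,
                    const unsigned char *b, unsigned nelt, int high)
{
  unsigned base = high ? nelt / 2 : 0;
  for (unsigned i = 0; i < nelt / 2; i++)
    {
      dst[2 * i] = a[base + i];
      dst[2 * i + 1] = b[base + i];
    }
}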
17741
17742 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17743 a single vector cross-lane permutation into vpermq followed
17744 by any of the single insn permutations. */
17745
17746 static bool
17747 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
17748 {
17749 struct expand_vec_perm_d dremap, dfinal;
17750 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
17751 unsigned contents[2];
17752 bool ok;
17753
17754 if (!(TARGET_AVX2
17755 && (d->vmode == V32QImode || d->vmode == V16HImode)
17756 && d->one_operand_p))
17757 return false;
17758
17759 contents[0] = 0;
17760 contents[1] = 0;
17761 for (i = 0; i < nelt2; ++i)
17762 {
17763 contents[0] |= 1u << (d->perm[i] / nelt4);
17764 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
17765 }
17766
17767 for (i = 0; i < 2; ++i)
17768 {
17769 unsigned int cnt = 0;
17770 for (j = 0; j < 4; ++j)
17771 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
17772 return false;
17773 }
17774
17775 if (d->testing_p)
17776 return true;
17777
17778 dremap = *d;
17779 dremap.vmode = V4DImode;
17780 dremap.nelt = 4;
17781 dremap.target = gen_reg_rtx (V4DImode);
17782 dremap.op0 = gen_lowpart (V4DImode, d->op0);
17783 dremap.op1 = dremap.op0;
17784 dremap.one_operand_p = true;
17785 for (i = 0; i < 2; ++i)
17786 {
17787 unsigned int cnt = 0;
17788 for (j = 0; j < 4; ++j)
17789 if ((contents[i] & (1u << j)) != 0)
17790 dremap.perm[2 * i + cnt++] = j;
17791 for (; cnt < 2; ++cnt)
17792 dremap.perm[2 * i + cnt] = 0;
17793 }
17794
17795 dfinal = *d;
17796 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
17797 dfinal.op1 = dfinal.op0;
17798 dfinal.one_operand_p = true;
17799 for (i = 0, j = 0; i < nelt; ++i)
17800 {
17801 if (i == nelt2)
17802 j = 2;
17803 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
17804 if ((d->perm[i] / nelt4) == dremap.perm[j])
17805 ;
17806 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
17807 dfinal.perm[i] |= nelt4;
17808 else
17809 gcc_unreachable ();
17810 }
17811
17812 ok = expand_vec_perm_1 (&dremap);
17813 gcc_assert (ok);
17814
17815 ok = expand_vec_perm_1 (&dfinal);
17816 gcc_assert (ok);
17817
17818 return true;
17819 }
17820
17821 static bool canonicalize_perm (struct expand_vec_perm_d *d);
17822
17823 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
17824 a vector permutation using two instructions, vperm2f128 resp.
17825 vperm2i128 followed by any single in-lane permutation. */
17826
17827 static bool
17828 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
17829 {
17830 struct expand_vec_perm_d dfirst, dsecond;
17831 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
17832 bool ok;
17833
17834 if (!TARGET_AVX
17835 || GET_MODE_SIZE (d->vmode) != 32
17836 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
17837 return false;
17838
17839 dsecond = *d;
17840 dsecond.one_operand_p = false;
17841 dsecond.testing_p = true;
17842
17843 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17844 immediate. For perm < 16 the second permutation uses
17845 d->op0 as first operand, for perm >= 16 it uses d->op1
17846 as first operand. The second operand is the result of
17847 vperm2[fi]128. */
17848 for (perm = 0; perm < 32; perm++)
17849 {
17850 /* Ignore permutations which do not move anything cross-lane. */
17851 if (perm < 16)
17852 {
17853 /* The second shuffle for e.g. V4DFmode has
17854 0123 and ABCD operands.
17855 Ignore AB23, as 23 is already in the second lane
17856 of the first operand. */
17857 if ((perm & 0xc) == (1 << 2)) continue;
17858 /* And 01CD, as 01 is in the first lane of the first
17859 operand. */
17860 if ((perm & 3) == 0) continue;
17861 /* And 4567, as then the vperm2[fi]128 doesn't change
17862 anything on the original 4567 second operand. */
17863 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
17864 }
17865 else
17866 {
17867 /* The second shuffle for e.g. V4DFmode has
17868 4567 and ABCD operands.
17869 Ignore AB67, as 67 is already in the second lane
17870 of the first operand. */
17871 if ((perm & 0xc) == (3 << 2)) continue;
17872 /* And 45CD, as 45 is in the first lane of the first
17873 operand. */
17874 if ((perm & 3) == 2) continue;
17875 /* And 0123, as then the vperm2[fi]128 doesn't change
17876 anything on the original 0123 first operand. */
17877 if ((perm & 0xf) == (1 << 2)) continue;
17878 }
17879
17880 for (i = 0; i < nelt; i++)
17881 {
17882 j = d->perm[i] / nelt2;
17883 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
17884 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
17885 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
17886 dsecond.perm[i] = d->perm[i] & (nelt - 1);
17887 else
17888 break;
17889 }
17890
17891 if (i == nelt)
17892 {
17893 start_sequence ();
17894 ok = expand_vec_perm_1 (&dsecond);
17895 end_sequence ();
17896 }
17897 else
17898 ok = false;
17899
17900 if (ok)
17901 {
17902 if (d->testing_p)
17903 return true;
17904
17905 /* Found a usable second shuffle. dfirst will be
17906 vperm2f128 on d->op0 and d->op1. */
17907 dsecond.testing_p = false;
17908 dfirst = *d;
17909 dfirst.target = gen_reg_rtx (d->vmode);
17910 for (i = 0; i < nelt; i++)
17911 dfirst.perm[i] = (i & (nelt2 - 1))
17912 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
17913
17914 canonicalize_perm (&dfirst);
17915 ok = expand_vec_perm_1 (&dfirst);
17916 gcc_assert (ok);
17917
17918 /* And dsecond is some single insn shuffle, taking
17919 d->op0 and result of vperm2f128 (if perm < 16) or
17920 d->op1 and result of vperm2f128 (otherwise). */
17921 if (perm >= 16)
17922 dsecond.op0 = dsecond.op1;
17923 dsecond.op1 = dfirst.target;
17924
17925 ok = expand_vec_perm_1 (&dsecond);
17926 gcc_assert (ok);
17927
17928 return true;
17929 }
17930
17931 /* For one operand, the only useful vperm2f128 permutation is 0x01
17932 aka lanes swap. */
17933 if (d->one_operand_p)
17934 return false;
17935 }
17936
17937 return false;
17938 }
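
/* Editorial illustration, not used by the compiler: a model of the
   vperm2[fi]128 immediate searched for above.  The low two bits of each
   nibble pick one of the four 128-bit source lanes (0 and 1 from the
   first operand, 2 and 3 from the second) for the corresponding
   destination lane; the zeroing bits of the real instruction are never
   used by this search and are omitted here.  Names are invented for
   this sketch.  */

static void
example_vperm2f128 (unsigned char dst[32], const unsigned char op0[32],
                    const unsigned char op1[32], unsigned imm)
{
  for (int lane = 0; lane < 2; lane++)
    {
      unsigned sel = (imm >> (4 * lane)) & 3;
      const unsigned char *src = sel < 2 ? op0 : op1;
      memcpy (dst + 16 * lane, src + 16 * (sel & 1), 16);
    }
}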
17939
17940 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17941 a two vector permutation using 2 intra-lane interleave insns
17942 and cross-lane shuffle for 32-byte vectors. */
17943
17944 static bool
17945 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
17946 {
17947 unsigned i, nelt;
17948 rtx (*gen) (rtx, rtx, rtx);
17949
17950 if (d->one_operand_p)
17951 return false;
17952 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
17953 ;
17954 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
17955 ;
17956 else
17957 return false;
17958
17959 nelt = d->nelt;
17960 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
17961 return false;
17962 for (i = 0; i < nelt; i += 2)
17963 if (d->perm[i] != d->perm[0] + i / 2
17964 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
17965 return false;
17966
17967 if (d->testing_p)
17968 return true;
17969
17970 switch (d->vmode)
17971 {
17972 case E_V32QImode:
17973 if (d->perm[0])
17974 gen = gen_vec_interleave_highv32qi;
17975 else
17976 gen = gen_vec_interleave_lowv32qi;
17977 break;
17978 case E_V16HImode:
17979 if (d->perm[0])
17980 gen = gen_vec_interleave_highv16hi;
17981 else
17982 gen = gen_vec_interleave_lowv16hi;
17983 break;
17984 case E_V8SImode:
17985 if (d->perm[0])
17986 gen = gen_vec_interleave_highv8si;
17987 else
17988 gen = gen_vec_interleave_lowv8si;
17989 break;
17990 case E_V4DImode:
17991 if (d->perm[0])
17992 gen = gen_vec_interleave_highv4di;
17993 else
17994 gen = gen_vec_interleave_lowv4di;
17995 break;
17996 case E_V8SFmode:
17997 if (d->perm[0])
17998 gen = gen_vec_interleave_highv8sf;
17999 else
18000 gen = gen_vec_interleave_lowv8sf;
18001 break;
18002 case E_V4DFmode:
18003 if (d->perm[0])
18004 gen = gen_vec_interleave_highv4df;
18005 else
18006 gen = gen_vec_interleave_lowv4df;
18007 break;
18008 default:
18009 gcc_unreachable ();
18010 }
18011
18012 emit_insn (gen (d->target, d->op0, d->op1));
18013 return true;
18014 }
18015
18016 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
18017 a single vector permutation using a single intra-lane vector
18018 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18019 the non-swapped and swapped vectors together. */
18020
18021 static bool
18022 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
18023 {
18024 struct expand_vec_perm_d dfirst, dsecond;
18025 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
18026 rtx_insn *seq;
18027 bool ok;
18028 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
18029
18030 if (!TARGET_AVX
18031 || TARGET_AVX2
18032 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
18033 || !d->one_operand_p)
18034 return false;
18035
18036 dfirst = *d;
18037 for (i = 0; i < nelt; i++)
18038 dfirst.perm[i] = 0xff;
18039 for (i = 0, msk = 0; i < nelt; i++)
18040 {
18041 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
18042 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
18043 return false;
18044 dfirst.perm[j] = d->perm[i];
18045 if (j != i)
18046 msk |= (1 << i);
18047 }
18048 for (i = 0; i < nelt; i++)
18049 if (dfirst.perm[i] == 0xff)
18050 dfirst.perm[i] = i;
18051
18052 if (!d->testing_p)
18053 dfirst.target = gen_reg_rtx (dfirst.vmode);
18054
18055 start_sequence ();
18056 ok = expand_vec_perm_1 (&dfirst);
18057 seq = get_insns ();
18058 end_sequence ();
18059
18060 if (!ok)
18061 return false;
18062
18063 if (d->testing_p)
18064 return true;
18065
18066 emit_insn (seq);
18067
18068 dsecond = *d;
18069 dsecond.op0 = dfirst.target;
18070 dsecond.op1 = dfirst.target;
18071 dsecond.one_operand_p = true;
18072 dsecond.target = gen_reg_rtx (dsecond.vmode);
18073 for (i = 0; i < nelt; i++)
18074 dsecond.perm[i] = i ^ nelt2;
18075
18076 ok = expand_vec_perm_1 (&dsecond);
18077 gcc_assert (ok);
18078
18079 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
18080 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
18081 return true;
18082 }
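
/* Editorial illustration, not used by the compiler: the final
   vblendps/vblendpd step above.  Bit I of the immediate (MSK) selects,
   for element I, the lane-swapped vector instead of the in-lane one.
   Names are invented for this sketch.  */

static void
example_blend_imm (float *dst, const float *in_lane,
                   const float *swapped, unsigned nelt, unsigned msk)
{
  for (unsigned i = 0; i < nelt; i++)
    dst[i] = (msk >> i) & 1 ? swapped[i] : in_lane[i];
}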
18083
18084 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
18085 permutation using two vperm2f128, followed by a vshufpd insn blending
18086 the two vectors together. */
18087
18088 static bool
18089 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
18090 {
18091 struct expand_vec_perm_d dfirst, dsecond, dthird;
18092 bool ok;
18093
18094 if (!TARGET_AVX || (d->vmode != V4DFmode))
18095 return false;
18096
18097 if (d->testing_p)
18098 return true;
18099
18100 dfirst = *d;
18101 dsecond = *d;
18102 dthird = *d;
18103
18104 dfirst.perm[0] = (d->perm[0] & ~1);
18105 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
18106 dfirst.perm[2] = (d->perm[2] & ~1);
18107 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
18108 dsecond.perm[0] = (d->perm[1] & ~1);
18109 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
18110 dsecond.perm[2] = (d->perm[3] & ~1);
18111 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
18112 dthird.perm[0] = (d->perm[0] % 2);
18113 dthird.perm[1] = (d->perm[1] % 2) + 4;
18114 dthird.perm[2] = (d->perm[2] % 2) + 2;
18115 dthird.perm[3] = (d->perm[3] % 2) + 6;
18116
18117 dfirst.target = gen_reg_rtx (dfirst.vmode);
18118 dsecond.target = gen_reg_rtx (dsecond.vmode);
18119 dthird.op0 = dfirst.target;
18120 dthird.op1 = dsecond.target;
18121 dthird.one_operand_p = false;
18122
18123 canonicalize_perm (&dfirst);
18124 canonicalize_perm (&dsecond);
18125
18126 ok = expand_vec_perm_1 (&dfirst)
18127 && expand_vec_perm_1 (&dsecond)
18128 && expand_vec_perm_1 (&dthird);
18129
18130 gcc_assert (ok);
18131
18132 return true;
18133 }
18134
18135 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18136 permutation with two pshufb insns and an ior. We should have already
18137 failed all two instruction sequences. */
18138
18139 static bool
18140 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
18141 {
18142 rtx rperm[2][16], vperm, l, h, op, m128;
18143 unsigned int i, nelt, eltsz;
18144
18145 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
18146 return false;
18147 gcc_assert (!d->one_operand_p);
18148
18149 if (d->testing_p)
18150 return true;
18151
18152 nelt = d->nelt;
18153 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18154
18155 /* Generate two permutation masks. If the required element is within
18156 the given vector it is shuffled into the proper lane. If the required
18157 element is in the other vector, force a zero into the lane by setting
18158 bit 7 in the permutation mask. */
18159 m128 = GEN_INT (-128);
18160 for (i = 0; i < nelt; ++i)
18161 {
18162 unsigned j, e = d->perm[i];
18163 unsigned which = (e >= nelt);
18164 if (e >= nelt)
18165 e -= nelt;
18166
18167 for (j = 0; j < eltsz; ++j)
18168 {
18169 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
18170 rperm[1-which][i*eltsz + j] = m128;
18171 }
18172 }
18173
18174 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
18175 vperm = force_reg (V16QImode, vperm);
18176
18177 l = gen_reg_rtx (V16QImode);
18178 op = gen_lowpart (V16QImode, d->op0);
18179 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
18180
18181 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
18182 vperm = force_reg (V16QImode, vperm);
18183
18184 h = gen_reg_rtx (V16QImode);
18185 op = gen_lowpart (V16QImode, d->op1);
18186 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
18187
18188 op = d->target;
18189 if (d->vmode != V16QImode)
18190 op = gen_reg_rtx (V16QImode);
18191 emit_insn (gen_iorv16qi3 (op, l, h));
18192 if (op != d->target)
18193 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18194
18195 return true;
18196 }
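
/* Editorial illustration, not used by the compiler: the pshufb zeroing
   rule exploited above.  A mask byte with bit 7 set forces the result
   byte to zero, so two complementary masks, one per source operand, can
   be combined with a plain OR to emulate a two-operand byte shuffle.
   Names are invented for this sketch.  */

static unsigned char
example_pshufb_byte (const unsigned char src[16], signed char mask)
{
  return mask < 0 ? 0 : src[mask & 0x0f];
}

static void
example_pshufb2_ior (unsigned char dst[16],
                     const unsigned char op0[16], const unsigned char op1[16],
                     const signed char m0[16], const signed char m1[16])
{
  for (int i = 0; i < 16; i++)
    dst[i] = example_pshufb_byte (op0, m0[i]) | example_pshufb_byte (op1, m1[i]);
}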
18197
18198 /* Implement an arbitrary permutation of a single V32QImode or V16HImode operand
18199 with two vpshufb insns, vpermq and vpor. We should have already failed
18200 all two or three instruction sequences. */
18201
18202 static bool
18203 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
18204 {
18205 rtx rperm[2][32], vperm, l, h, hp, op, m128;
18206 unsigned int i, nelt, eltsz;
18207
18208 if (!TARGET_AVX2
18209 || !d->one_operand_p
18210 || (d->vmode != V32QImode && d->vmode != V16HImode))
18211 return false;
18212
18213 if (d->testing_p)
18214 return true;
18215
18216 nelt = d->nelt;
18217 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18218
18219 /* Generate two permutation masks. If the required element is within
18220 the same lane, it is shuffled in. If the required element is from the
18221 other lane, force a zero by setting bit 7 in the permutation mask.
18222 The other mask has a non-negative element wherever an element is
18223 requested from the other lane, but that element is also moved to the
18224 other lane, so that the result of vpshufb can have its two V2TImode
18225 halves swapped. */
18226 m128 = GEN_INT (-128);
18227 for (i = 0; i < nelt; ++i)
18228 {
18229 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18230 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18231
18232 for (j = 0; j < eltsz; ++j)
18233 {
18234 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
18235 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
18236 }
18237 }
18238
18239 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18240 vperm = force_reg (V32QImode, vperm);
18241
18242 h = gen_reg_rtx (V32QImode);
18243 op = gen_lowpart (V32QImode, d->op0);
18244 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18245
18246 /* Swap the 128-bit lanes of h into hp. */
18247 hp = gen_reg_rtx (V4DImode);
18248 op = gen_lowpart (V4DImode, h);
18249 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
18250 const1_rtx));
18251
18252 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18253 vperm = force_reg (V32QImode, vperm);
18254
18255 l = gen_reg_rtx (V32QImode);
18256 op = gen_lowpart (V32QImode, d->op0);
18257 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18258
18259 op = d->target;
18260 if (d->vmode != V32QImode)
18261 op = gen_reg_rtx (V32QImode);
18262 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
18263 if (op != d->target)
18264 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18265
18266 return true;
18267 }
18268
18269 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18270 and extract-odd permutations of two V32QImode or V16HImode operands
18271 with two vpshufb insns, vpor and vpermq. We should have already
18272 failed all two or three instruction sequences. */
18273
18274 static bool
18275 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
18276 {
18277 rtx rperm[2][32], vperm, l, h, ior, op, m128;
18278 unsigned int i, nelt, eltsz;
18279
18280 if (!TARGET_AVX2
18281 || d->one_operand_p
18282 || (d->vmode != V32QImode && d->vmode != V16HImode))
18283 return false;
18284
18285 for (i = 0; i < d->nelt; ++i)
18286 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
18287 return false;
18288
18289 if (d->testing_p)
18290 return true;
18291
18292 nelt = d->nelt;
18293 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18294
18295 /* Generate two permutation masks. In the first permutation mask
18296 the first quarter will contain indexes for the first half
18297 of the op0, the second quarter will contain bit 7 set, third quarter
18298 will contain indexes for the second half of the op0 and the
18299 last quarter bit 7 set. In the second permutation mask
18300 the first quarter will contain bit 7 set, the second quarter
18301 indexes for the first half of the op1, the third quarter bit 7 set
18302 and last quarter indexes for the second half of the op1.
18303 I.e. the first mask e.g. for V32QImode extract even will be:
18304 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18305 (all values masked with 0xf except for -128) and second mask
18306 for extract even will be
18307 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18308 m128 = GEN_INT (-128);
18309 for (i = 0; i < nelt; ++i)
18310 {
18311 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18312 unsigned which = d->perm[i] >= nelt;
18313 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
18314
18315 for (j = 0; j < eltsz; ++j)
18316 {
18317 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
18318 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
18319 }
18320 }
18321
18322 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
18323 vperm = force_reg (V32QImode, vperm);
18324
18325 l = gen_reg_rtx (V32QImode);
18326 op = gen_lowpart (V32QImode, d->op0);
18327 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
18328
18329 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
18330 vperm = force_reg (V32QImode, vperm);
18331
18332 h = gen_reg_rtx (V32QImode);
18333 op = gen_lowpart (V32QImode, d->op1);
18334 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
18335
18336 ior = gen_reg_rtx (V32QImode);
18337 emit_insn (gen_iorv32qi3 (ior, l, h));
18338
18339 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18340 op = gen_reg_rtx (V4DImode);
18341 ior = gen_lowpart (V4DImode, ior);
18342 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
18343 const1_rtx, GEN_INT (3)));
18344 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
18345
18346 return true;
18347 }
18348
18349 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18350 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18351 with two "and" and "pack" or two "shift" and "pack" insns. We should
18352 have already failed all two instruction sequences. */
18353
18354 static bool
18355 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
18356 {
18357 rtx op, dop0, dop1, t;
18358 unsigned i, odd, c, s, nelt = d->nelt;
18359 bool end_perm = false;
18360 machine_mode half_mode;
18361 rtx (*gen_and) (rtx, rtx, rtx);
18362 rtx (*gen_pack) (rtx, rtx, rtx);
18363 rtx (*gen_shift) (rtx, rtx, rtx);
18364
18365 if (d->one_operand_p)
18366 return false;
18367
18368 switch (d->vmode)
18369 {
18370 case E_V8HImode:
18371 /* Required for "pack". */
18372 if (!TARGET_SSE4_1)
18373 return false;
18374 c = 0xffff;
18375 s = 16;
18376 half_mode = V4SImode;
18377 gen_and = gen_andv4si3;
18378 gen_pack = gen_sse4_1_packusdw;
18379 gen_shift = gen_lshrv4si3;
18380 break;
18381 case E_V16QImode:
18382 /* No check as all instructions are SSE2. */
18383 c = 0xff;
18384 s = 8;
18385 half_mode = V8HImode;
18386 gen_and = gen_andv8hi3;
18387 gen_pack = gen_sse2_packuswb;
18388 gen_shift = gen_lshrv8hi3;
18389 break;
18390 case E_V16HImode:
18391 if (!TARGET_AVX2)
18392 return false;
18393 c = 0xffff;
18394 s = 16;
18395 half_mode = V8SImode;
18396 gen_and = gen_andv8si3;
18397 gen_pack = gen_avx2_packusdw;
18398 gen_shift = gen_lshrv8si3;
18399 end_perm = true;
18400 break;
18401 case E_V32QImode:
18402 if (!TARGET_AVX2)
18403 return false;
18404 c = 0xff;
18405 s = 8;
18406 half_mode = V16HImode;
18407 gen_and = gen_andv16hi3;
18408 gen_pack = gen_avx2_packuswb;
18409 gen_shift = gen_lshrv16hi3;
18410 end_perm = true;
18411 break;
18412 default:
18413 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
18414 general shuffles. */
18415 return false;
18416 }
18417
18418 /* Check that permutation is even or odd. */
18419 odd = d->perm[0];
18420 if (odd > 1)
18421 return false;
18422
18423 for (i = 1; i < nelt; ++i)
18424 if (d->perm[i] != 2 * i + odd)
18425 return false;
18426
18427 if (d->testing_p)
18428 return true;
18429
18430 dop0 = gen_reg_rtx (half_mode);
18431 dop1 = gen_reg_rtx (half_mode);
18432 if (odd == 0)
18433 {
18434 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
18435 t = force_reg (half_mode, t);
18436 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
18437 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
18438 }
18439 else
18440 {
18441 emit_insn (gen_shift (dop0,
18442 gen_lowpart (half_mode, d->op0),
18443 GEN_INT (s)));
18444 emit_insn (gen_shift (dop1,
18445 gen_lowpart (half_mode, d->op1),
18446 GEN_INT (s)));
18447 }
18448 /* In the AVX2 256-bit case we need to permute the pack result. */
18449 if (TARGET_AVX2 && end_perm)
18450 {
18451 op = gen_reg_rtx (d->vmode);
18452 t = gen_reg_rtx (V4DImode);
18453 emit_insn (gen_pack (op, dop0, dop1));
18454 emit_insn (gen_avx2_permv4di_1 (t,
18455 gen_lowpart (V4DImode, op),
18456 const0_rtx,
18457 const2_rtx,
18458 const1_rtx,
18459 GEN_INT (3)));
18460 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
18461 }
18462 else
18463 emit_insn (gen_pack (d->target, dop0, dop1));
18464
18465 return true;
18466 }
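
/* Editorial illustration, not used by the compiler: the even/odd
   extraction above on plain integers.  Viewing a vector of 16-bit
   elements as 32-bit lanes, masking with 0xffff keeps the even elements
   and a logical right shift by 16 keeps the odd ones; because every
   result then fits in 16 bits, the unsigned-saturating pack degenerates
   to a simple concatenation (the 256-bit case additionally needs the
   cross-lane vpermq fix-up emitted above).  Names are invented for this
   sketch.  */

static void
example_even_odd_pack (unsigned short *dst, const unsigned *a,
                       const unsigned *b, unsigned n32, int odd)
{
  for (unsigned i = 0; i < n32; i++)
    dst[i] = (unsigned short) (odd ? a[i] >> 16 : (a[i] & 0xffff));
  for (unsigned i = 0; i < n32; i++)
    dst[n32 + i] = (unsigned short) (odd ? b[i] >> 16 : (b[i] & 0xffff));
}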
18467
18468 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18469 and extract-odd permutations of two V64QI operands
18470 with two "shifts", two "truncs" and one "concat" insn for "odd"
18471 and two "truncs" and one "concat" insn for "even".
18472 We should have already failed all two instruction sequences. */
18473
18474 static bool
18475 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
18476 {
18477 rtx t1, t2, t3, t4;
18478 unsigned i, odd, nelt = d->nelt;
18479
18480 if (!TARGET_AVX512BW
18481 || d->one_operand_p
18482 || d->vmode != V64QImode)
18483 return false;
18484
18485 /* Check that permutation is even or odd. */
18486 odd = d->perm[0];
18487 if (odd > 1)
18488 return false;
18489
18490 for (i = 1; i < nelt; ++i)
18491 if (d->perm[i] != 2 * i + odd)
18492 return false;
18493
18494 if (d->testing_p)
18495 return true;
18496
18497
18498 if (odd)
18499 {
18500 t1 = gen_reg_rtx (V32HImode);
18501 t2 = gen_reg_rtx (V32HImode);
18502 emit_insn (gen_lshrv32hi3 (t1,
18503 gen_lowpart (V32HImode, d->op0),
18504 GEN_INT (8)));
18505 emit_insn (gen_lshrv32hi3 (t2,
18506 gen_lowpart (V32HImode, d->op1),
18507 GEN_INT (8)));
18508 }
18509 else
18510 {
18511 t1 = gen_lowpart (V32HImode, d->op0);
18512 t2 = gen_lowpart (V32HImode, d->op1);
18513 }
18514
18515 t3 = gen_reg_rtx (V32QImode);
18516 t4 = gen_reg_rtx (V32QImode);
18517 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
18518 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
18519 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
18520
18521 return true;
18522 }
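
/* Editorial illustration, not used by the compiler: the truncation trick
   above on plain integers.  Viewing a V64QI operand as 16-bit words,
   vpmovwb keeps the low byte of every word, i.e. the even bytes; shifting
   each word right by 8 first makes the same truncation keep the odd
   bytes.  Names are invented for this sketch.  */

static void
example_even_odd_trunc (unsigned char *dst, const unsigned short *words,
                        unsigned nwords, int odd)
{
  for (unsigned i = 0; i < nwords; i++)
    dst[i] = (unsigned char) (odd ? words[i] >> 8 : words[i]);
}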
18523
18524 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
18525 and extract-odd permutations. */
18526
18527 static bool
18528 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
18529 {
18530 rtx t1, t2, t3, t4, t5;
18531
18532 switch (d->vmode)
18533 {
18534 case E_V4DFmode:
18535 if (d->testing_p)
18536 break;
18537 t1 = gen_reg_rtx (V4DFmode);
18538 t2 = gen_reg_rtx (V4DFmode);
18539
18540 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18541 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
18542 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
18543
18544 /* Now an unpck[lh]pd will produce the result required. */
18545 if (odd)
18546 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
18547 else
18548 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
18549 emit_insn (t3);
18550 break;
18551
18552 case E_V8SFmode:
18553 {
18554 int mask = odd ? 0xdd : 0x88;
18555
18556 if (d->testing_p)
18557 break;
18558 t1 = gen_reg_rtx (V8SFmode);
18559 t2 = gen_reg_rtx (V8SFmode);
18560 t3 = gen_reg_rtx (V8SFmode);
18561
18562 /* Shuffle within the 128-bit lanes to produce:
18563 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
18564 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
18565 GEN_INT (mask)));
18566
18567 /* Shuffle the lanes around to produce:
18568 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
18569 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
18570 GEN_INT (0x3)));
18571
18572 /* Shuffle within the 128-bit lanes to produce:
18573 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
18574 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
18575
18576 /* Shuffle within the 128-bit lanes to produce:
18577 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
18578 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
18579
18580 /* Shuffle the lanes around to produce:
18581 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
18582 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
18583 GEN_INT (0x20)));
18584 }
18585 break;
18586
18587 case E_V2DFmode:
18588 case E_V4SFmode:
18589 case E_V2DImode:
18590 case E_V4SImode:
18591 /* These are always directly implementable by expand_vec_perm_1. */
18592 gcc_unreachable ();
18593
18594 case E_V8HImode:
18595 if (TARGET_SSE4_1)
18596 return expand_vec_perm_even_odd_pack (d);
18597 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
18598 return expand_vec_perm_pshufb2 (d);
18599 else
18600 {
18601 if (d->testing_p)
18602 break;
18603 /* We need 2*log2(N)-1 operations to achieve odd/even
18604 with interleave. */
18605 t1 = gen_reg_rtx (V8HImode);
18606 t2 = gen_reg_rtx (V8HImode);
18607 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
18608 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
18609 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
18610 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
18611 if (odd)
18612 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
18613 else
18614 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
18615 emit_insn (t3);
18616 }
18617 break;
18618
18619 case E_V16QImode:
18620 return expand_vec_perm_even_odd_pack (d);
18621
18622 case E_V16HImode:
18623 case E_V32QImode:
18624 return expand_vec_perm_even_odd_pack (d);
18625
18626 case E_V64QImode:
18627 return expand_vec_perm_even_odd_trunc (d);
18628
18629 case E_V4DImode:
18630 if (!TARGET_AVX2)
18631 {
18632 struct expand_vec_perm_d d_copy = *d;
18633 d_copy.vmode = V4DFmode;
18634 if (d->testing_p)
18635 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
18636 else
18637 d_copy.target = gen_reg_rtx (V4DFmode);
18638 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
18639 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
18640 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18641 {
18642 if (!d->testing_p)
18643 emit_move_insn (d->target,
18644 gen_lowpart (V4DImode, d_copy.target));
18645 return true;
18646 }
18647 return false;
18648 }
18649
18650 if (d->testing_p)
18651 break;
18652
18653 t1 = gen_reg_rtx (V4DImode);
18654 t2 = gen_reg_rtx (V4DImode);
18655
18656 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
18657 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
18658 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
18659
18660 /* Now an vpunpck[lh]qdq will produce the result required. */
18661 if (odd)
18662 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
18663 else
18664 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
18665 emit_insn (t3);
18666 break;
18667
18668 case E_V8SImode:
18669 if (!TARGET_AVX2)
18670 {
18671 struct expand_vec_perm_d d_copy = *d;
18672 d_copy.vmode = V8SFmode;
18673 if (d->testing_p)
18674 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
18675 else
18676 d_copy.target = gen_reg_rtx (V8SFmode);
18677 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
18678 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
18679 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
18680 {
18681 if (!d->testing_p)
18682 emit_move_insn (d->target,
18683 gen_lowpart (V8SImode, d_copy.target));
18684 return true;
18685 }
18686 return false;
18687 }
18688
18689 if (d->testing_p)
18690 break;
18691
18692 t1 = gen_reg_rtx (V8SImode);
18693 t2 = gen_reg_rtx (V8SImode);
18694 t3 = gen_reg_rtx (V4DImode);
18695 t4 = gen_reg_rtx (V4DImode);
18696 t5 = gen_reg_rtx (V4DImode);
18697
18698 /* Shuffle the lanes around into
18699 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
18700 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
18701 gen_lowpart (V4DImode, d->op1),
18702 GEN_INT (0x20)));
18703 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
18704 gen_lowpart (V4DImode, d->op1),
18705 GEN_INT (0x31)));
18706
18707 /* Swap the 2nd and 3rd position in each lane into
18708 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
18709 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
18710 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18711 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
18712 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
18713
18714 /* Now an vpunpck[lh]qdq will produce
18715 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
18716 if (odd)
18717 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
18718 gen_lowpart (V4DImode, t2));
18719 else
18720 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
18721 gen_lowpart (V4DImode, t2));
18722 emit_insn (t3);
18723 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
18724 break;
18725
18726 default:
18727 gcc_unreachable ();
18728 }
18729
18730 return true;
18731 }
18732
18733 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
18734 extract-even and extract-odd permutations. */
18735
18736 static bool
18737 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
18738 {
18739 unsigned i, odd, nelt = d->nelt;
18740
18741 odd = d->perm[0];
18742 if (odd != 0 && odd != 1)
18743 return false;
18744
18745 for (i = 1; i < nelt; ++i)
18746 if (d->perm[i] != 2 * i + odd)
18747 return false;
18748
18749 return expand_vec_perm_even_odd_1 (d, odd);
18750 }
18751
18752 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
18753 permutations. We assume that expand_vec_perm_1 has already failed. */
18754
18755 static bool
18756 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
18757 {
18758 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
18759 machine_mode vmode = d->vmode;
18760 unsigned char perm2[4];
18761 rtx op0 = d->op0, dest;
18762 bool ok;
18763
18764 switch (vmode)
18765 {
18766 case E_V4DFmode:
18767 case E_V8SFmode:
18768 /* These are special-cased in sse.md so that we can optionally
18769 use the vbroadcast instruction. They expand to two insns
18770 if the input happens to be in a register. */
18771 gcc_unreachable ();
18772
18773 case E_V2DFmode:
18774 case E_V2DImode:
18775 case E_V4SFmode:
18776 case E_V4SImode:
18777 /* These are always implementable using standard shuffle patterns. */
18778 gcc_unreachable ();
18779
18780 case E_V8HImode:
18781 case E_V16QImode:
18782 /* These can be implemented via interleave. We save one insn by
18783 stopping once we have promoted to V4SImode and then using pshufd. */
18784 if (d->testing_p)
18785 return true;
18786 do
18787 {
18788 rtx dest;
18789 rtx (*gen) (rtx, rtx, rtx)
18790 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
18791 : gen_vec_interleave_lowv8hi;
18792
18793 if (elt >= nelt2)
18794 {
18795 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
18796 : gen_vec_interleave_highv8hi;
18797 elt -= nelt2;
18798 }
18799 nelt2 /= 2;
18800
18801 dest = gen_reg_rtx (vmode);
18802 emit_insn (gen (dest, op0, op0));
18803 vmode = get_mode_wider_vector (vmode);
18804 op0 = gen_lowpart (vmode, dest);
18805 }
18806 while (vmode != V4SImode);
18807
18808 memset (perm2, elt, 4);
18809 dest = gen_reg_rtx (V4SImode);
18810 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
18811 gcc_assert (ok);
18812 if (!d->testing_p)
18813 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
18814 return true;
18815
18816 case E_V64QImode:
18817 case E_V32QImode:
18818 case E_V16HImode:
18819 case E_V8SImode:
18820 case E_V4DImode:
18821 /* For AVX2 broadcasts of the first element vpbroadcast* or
18822 vpermq should be used by expand_vec_perm_1. */
18823 gcc_assert (!TARGET_AVX2 || d->perm[0]);
18824 return false;
18825
18826 default:
18827 gcc_unreachable ();
18828 }
18829 }
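
/* Editorial illustration, not used by the compiler: one round of the
   promotion loop above.  Interleaving a vector with itself duplicates
   every element of the chosen half into an adjacent pair, so each round
   doubles the width of the run containing the broadcast element until a
   four-element pshufd can finish the job.  Names are invented for this
   sketch.  */

static void
example_interleave_self (unsigned char *vec, unsigned nelt, int high)
{
  unsigned char tmp[64];
  unsigned base = high ? nelt / 2 : 0;

  for (unsigned i = 0; i < nelt / 2; i++)
    tmp[2 * i] = tmp[2 * i + 1] = vec[base + i];
  memcpy (vec, tmp, nelt);
}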
18830
18831 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
18832 broadcast permutations. */
18833
18834 static bool
18835 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
18836 {
18837 unsigned i, elt, nelt = d->nelt;
18838
18839 if (!d->one_operand_p)
18840 return false;
18841
18842 elt = d->perm[0];
18843 for (i = 1; i < nelt; ++i)
18844 if (d->perm[i] != elt)
18845 return false;
18846
18847 return expand_vec_perm_broadcast_1 (d);
18848 }
18849
18850 /* Implement arbitrary permutations of two V64QImode operands
18851 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
18852 static bool
18853 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
18854 {
18855   if (!TARGET_AVX512BW || d->vmode != V64QImode)
18856 return false;
18857
18858 if (d->testing_p)
18859 return true;
18860
18861 struct expand_vec_perm_d ds[2];
18862 rtx rperm[128], vperm, target0, target1;
18863 unsigned int i, nelt;
18864 machine_mode vmode;
18865
18866 nelt = d->nelt;
18867 vmode = V64QImode;
18868
18869 for (i = 0; i < 2; i++)
18870 {
18871 ds[i] = *d;
18872 ds[i].vmode = V32HImode;
18873 ds[i].nelt = 32;
18874 ds[i].target = gen_reg_rtx (V32HImode);
18875 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
18876 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
18877 }
18878
18879 /* Prepare permutations such that the first one takes care of
18880      putting the even bytes into the right positions or one
18881      position higher (ds[0]), and the second one takes care of
18882      putting the odd bytes into the right positions or one
18883      position lower (ds[1]).  */
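/* Worked example: suppose d->perm[0] == 17.  Then ds[0].perm[0] == 8,
   so the vperm[it]2w pass places word 8 (bytes 16/17 of op0) at word
   position 0, and rperm[0] == (0 & 14) + (17 & 1) == 1 makes the
   following vpshufb pick the high byte of that word, i.e. byte 17;
   rperm[64] == -1 zeroes the corresponding byte of the other half, so
   the final vpor keeps only the wanted value.  */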
18884
18885 for (i = 0; i < nelt; i++)
18886 {
18887 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
18888 if (i & 1)
18889 {
18890 rperm[i] = constm1_rtx;
18891 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
18892 }
18893 else
18894 {
18895 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
18896 rperm[i + 64] = constm1_rtx;
18897 }
18898 }
18899
18900 bool ok = expand_vec_perm_1 (&ds[0]);
18901 gcc_assert (ok);
18902 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
18903
18904 ok = expand_vec_perm_1 (&ds[1]);
18905 gcc_assert (ok);
18906 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
18907
18908 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
18909 vperm = force_reg (vmode, vperm);
18910 target0 = gen_reg_rtx (V64QImode);
18911 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
18912
18913 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
18914 vperm = force_reg (vmode, vperm);
18915 target1 = gen_reg_rtx (V64QImode);
18916 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
18917
18918 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
18919 return true;
18920 }
18921
18922 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
18923 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
18924 all the shorter instruction sequences. */
18925
18926 static bool
18927 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
18928 {
18929 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
18930 unsigned int i, nelt, eltsz;
18931 bool used[4];
18932
18933 if (!TARGET_AVX2
18934 || d->one_operand_p
18935 || (d->vmode != V32QImode && d->vmode != V16HImode))
18936 return false;
18937
18938 if (d->testing_p)
18939 return true;
18940
18941 nelt = d->nelt;
18942 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
18943
18944 /* Generate 4 permutation masks. If the required element is within
18945    the same lane, it is shuffled in.  If the required element is from the
18946    other lane, force a zero by setting bit 7 in the permutation mask so
18947    that vpshufb writes a zero byte there.  In the complementary mask an
18948    element is non-negative when it is requested from the other lane, but
18949    it is also moved to the other lane, so that after the two V2TImode
18950    halves of the vpshufb result are swapped it lands in the right place.  */
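/* In the mask-building loop below, WHICH indexes the four masks:
   0 - op0, same lane; 1 - op0, other lane;
   2 - op1, same lane; 3 - op1, other lane.  */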
18951 m128 = GEN_INT (-128);
18952 for (i = 0; i < 32; ++i)
18953 {
18954 rperm[0][i] = m128;
18955 rperm[1][i] = m128;
18956 rperm[2][i] = m128;
18957 rperm[3][i] = m128;
18958 }
18959 used[0] = false;
18960 used[1] = false;
18961 used[2] = false;
18962 used[3] = false;
18963 for (i = 0; i < nelt; ++i)
18964 {
18965 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
18966 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
18967 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
18968
18969 for (j = 0; j < eltsz; ++j)
18970 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
18971 used[which] = true;
18972 }
18973
18974 for (i = 0; i < 2; ++i)
18975 {
18976 if (!used[2 * i + 1])
18977 {
18978 h[i] = NULL_RTX;
18979 continue;
18980 }
18981 vperm = gen_rtx_CONST_VECTOR (V32QImode,
18982 gen_rtvec_v (32, rperm[2 * i + 1]));
18983 vperm = force_reg (V32QImode, vperm);
18984 h[i] = gen_reg_rtx (V32QImode);
18985 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
18986 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
18987 }
18988
18989   /* Swap the 128-bit lanes of h[X].  */
18990 for (i = 0; i < 2; ++i)
18991 {
18992 if (h[i] == NULL_RTX)
18993 continue;
18994 op = gen_reg_rtx (V4DImode);
18995 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
18996 const2_rtx, GEN_INT (3), const0_rtx,
18997 const1_rtx));
18998 h[i] = gen_lowpart (V32QImode, op);
18999 }
19000
19001 for (i = 0; i < 2; ++i)
19002 {
19003 if (!used[2 * i])
19004 {
19005 l[i] = NULL_RTX;
19006 continue;
19007 }
19008 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
19009 vperm = force_reg (V32QImode, vperm);
19010 l[i] = gen_reg_rtx (V32QImode);
19011 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
19012 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
19013 }
19014
19015 for (i = 0; i < 2; ++i)
19016 {
19017 if (h[i] && l[i])
19018 {
19019 op = gen_reg_rtx (V32QImode);
19020 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
19021 l[i] = op;
19022 }
19023 else if (h[i])
19024 l[i] = h[i];
19025 }
19026
19027 gcc_assert (l[0] && l[1]);
19028 op = d->target;
19029 if (d->vmode != V32QImode)
19030 op = gen_reg_rtx (V32QImode);
19031 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
19032 if (op != d->target)
19033 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
19034 return true;
19035 }
19036
19037 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19038 taken care of, perform the expansion in D and return true on success. */
19039
19040 static bool
19041 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19042 {
19043 /* Try a single instruction expansion. */
19044 if (expand_vec_perm_1 (d))
19045 return true;
19046
19047 /* Try sequences of two instructions. */
19048
19049 if (expand_vec_perm_pshuflw_pshufhw (d))
19050 return true;
19051
19052 if (expand_vec_perm_palignr (d, false))
19053 return true;
19054
19055 if (expand_vec_perm_interleave2 (d))
19056 return true;
19057
19058 if (expand_vec_perm_broadcast (d))
19059 return true;
19060
19061 if (expand_vec_perm_vpermq_perm_1 (d))
19062 return true;
19063
19064 if (expand_vec_perm_vperm2f128 (d))
19065 return true;
19066
19067 if (expand_vec_perm_pblendv (d))
19068 return true;
19069
19070 /* Try sequences of three instructions. */
19071
19072 if (expand_vec_perm_even_odd_pack (d))
19073 return true;
19074
19075 if (expand_vec_perm_2vperm2f128_vshuf (d))
19076 return true;
19077
19078 if (expand_vec_perm_pshufb2 (d))
19079 return true;
19080
19081 if (expand_vec_perm_interleave3 (d))
19082 return true;
19083
19084 if (expand_vec_perm_vperm2f128_vblend (d))
19085 return true;
19086
19087 /* Try sequences of four instructions. */
19088
19089 if (expand_vec_perm_even_odd_trunc (d))
19090 return true;
19091 if (expand_vec_perm_vpshufb2_vpermq (d))
19092 return true;
19093
19094 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
19095 return true;
19096
19097 if (expand_vec_perm_vpermt2_vpshub2 (d))
19098 return true;
19099
19100 /* ??? Look for narrow permutations whose element orderings would
19101 allow the promotion to a wider mode. */
19102
19103 /* ??? Look for sequences of interleave or a wider permute that place
19104 the data into the correct lanes for a half-vector shuffle like
19105 pshuf[lh]w or vpermilps. */
19106
19107 /* ??? Look for sequences of interleave that produce the desired results.
19108 The combinatorics of punpck[lh] get pretty ugly... */
19109
19110 if (expand_vec_perm_even_odd (d))
19111 return true;
19112
19113 /* Even longer sequences. */
19114 if (expand_vec_perm_vpshufb4_vpermq2 (d))
19115 return true;
19116
19117 /* See if we can get the same permutation in different vector integer
19118 mode. */
19119 struct expand_vec_perm_d nd;
19120 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
19121 {
19122 if (!d->testing_p)
19123 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
19124 return true;
19125 }
19126
19127 return false;
19128 }
19129
19130 /* If a permutation only uses one operand, make it clear. Returns true
19131 if the permutation references both operands. */
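/* For example, a V4SImode selector { 4, 5, 6, 7 } only references the
   second operand, so it is rewritten as { 0, 1, 2, 3 } with op0 replaced
   by op1 (case 2 below); a selector referencing both halves leaves the
   operands alone unless they are rtx_equal_p.  */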
19132
19133 static bool
19134 canonicalize_perm (struct expand_vec_perm_d *d)
19135 {
19136 int i, which, nelt = d->nelt;
19137
19138 for (i = which = 0; i < nelt; ++i)
19139 which |= (d->perm[i] < nelt ? 1 : 2);
19140
19141 d->one_operand_p = true;
19142 switch (which)
19143 {
19144 default:
19145       gcc_unreachable ();
19146
19147 case 3:
19148 if (!rtx_equal_p (d->op0, d->op1))
19149 {
19150 d->one_operand_p = false;
19151 break;
19152 }
19153 /* The elements of PERM do not suggest that only the first operand
19154 is used, but both operands are identical. Allow easier matching
19155 of the permutation by folding the permutation into the single
19156 input vector. */
19157 /* FALLTHRU */
19158
19159 case 2:
19160 for (i = 0; i < nelt; ++i)
19161 d->perm[i] &= nelt - 1;
19162 d->op0 = d->op1;
19163 break;
19164
19165 case 1:
19166 d->op1 = d->op0;
19167 break;
19168 }
19169
19170 return (which == 3);
19171 }
19172
19173 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19174
19175 bool
19176 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19177 rtx op1, const vec_perm_indices &sel)
19178 {
19179 struct expand_vec_perm_d d;
19180 unsigned char perm[MAX_VECT_LEN];
19181 unsigned int i, nelt, which;
19182 bool two_args;
19183
19184 d.target = target;
19185 d.op0 = op0;
19186 d.op1 = op1;
19187
19188 d.vmode = vmode;
19189 gcc_assert (VECTOR_MODE_P (d.vmode));
19190 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19191 d.testing_p = !target;
19192
19193 gcc_assert (sel.length () == nelt);
19194 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
19195
19196 /* Given sufficient ISA support we can just return true here
19197 for selected vector modes. */
19198 switch (d.vmode)
19199 {
19200 case E_V16SFmode:
19201 case E_V16SImode:
19202 case E_V8DImode:
19203 case E_V8DFmode:
19204 if (!TARGET_AVX512F)
19205 return false;
19206 /* All implementable with a single vperm[it]2 insn. */
19207 if (d.testing_p)
19208 return true;
19209 break;
19210 case E_V32HImode:
19211 if (!TARGET_AVX512BW)
19212 return false;
19213 if (d.testing_p)
19214 /* All implementable with a single vperm[it]2 insn. */
19215 return true;
19216 break;
19217 case E_V64QImode:
19218 if (!TARGET_AVX512BW)
19219 return false;
19220 if (d.testing_p)
19221 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19222 return true;
19223 break;
19224 case E_V8SImode:
19225 case E_V8SFmode:
19226 case E_V4DFmode:
19227 case E_V4DImode:
19228 if (!TARGET_AVX)
19229 return false;
19230 if (d.testing_p && TARGET_AVX512VL)
19231 /* All implementable with a single vperm[it]2 insn. */
19232 return true;
19233 break;
19234 case E_V16HImode:
19235 if (!TARGET_SSE2)
19236 return false;
19237 if (d.testing_p && TARGET_AVX2)
19238 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19239 return true;
19240 break;
19241 case E_V32QImode:
19242 if (!TARGET_SSE2)
19243 return false;
19244 if (d.testing_p && TARGET_AVX2)
19245 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19246 return true;
19247 break;
19248 case E_V8HImode:
19249 case E_V16QImode:
19250 if (!TARGET_SSE2)
19251 return false;
19252 /* Fall through. */
19253 case E_V4SImode:
19254 case E_V4SFmode:
19255 if (!TARGET_SSE)
19256 return false;
19257 /* All implementable with a single vpperm insn. */
19258 if (d.testing_p && TARGET_XOP)
19259 return true;
19260 /* All implementable with 2 pshufb + 1 ior. */
19261 if (d.testing_p && TARGET_SSSE3)
19262 return true;
19263 break;
19264 case E_V2DImode:
19265 case E_V2DFmode:
19266 if (!TARGET_SSE)
19267 return false;
19268 /* All implementable with shufpd or unpck[lh]pd. */
19269 if (d.testing_p)
19270 return true;
19271 break;
19272 default:
19273 return false;
19274 }
19275
19276 for (i = which = 0; i < nelt; ++i)
19277 {
19278 unsigned char e = sel[i];
19279 gcc_assert (e < 2 * nelt);
19280 d.perm[i] = e;
19281 perm[i] = e;
19282 which |= (e < nelt ? 1 : 2);
19283 }
19284
19285 if (d.testing_p)
19286 {
19287 /* For all elements from second vector, fold the elements to first. */
19288 if (which == 2)
19289 for (i = 0; i < nelt; ++i)
19290 d.perm[i] -= nelt;
19291
19292 /* Check whether the mask can be applied to the vector type. */
19293 d.one_operand_p = (which != 3);
19294
19295 /* Implementable with shufps or pshufd. */
19296 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
19297 return true;
19298
19299 /* Otherwise we have to go through the motions and see if we can
19300 figure out how to generate the requested permutation. */
19301 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
19302 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
19303 if (!d.one_operand_p)
19304 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
19305
19306 start_sequence ();
19307 bool ret = ix86_expand_vec_perm_const_1 (&d);
19308 end_sequence ();
19309
19310 return ret;
19311 }
19312
19313 two_args = canonicalize_perm (&d);
19314
19315 if (ix86_expand_vec_perm_const_1 (&d))
19316 return true;
19317
19318 /* If the selector says both arguments are needed, but the operands are the
19319 same, the above tried to expand with one_operand_p and flattened selector.
19320 If that didn't work, retry without one_operand_p; we succeeded with that
19321 during testing. */
19322 if (two_args && d.one_operand_p)
19323 {
19324 d.one_operand_p = false;
19325 memcpy (d.perm, perm, sizeof (perm));
19326 return ix86_expand_vec_perm_const_1 (&d);
19327 }
19328
19329 return false;
19330 }
19331
19332 void
19333 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
19334 {
19335 struct expand_vec_perm_d d;
19336 unsigned i, nelt;
19337
19338 d.target = targ;
19339 d.op0 = op0;
19340 d.op1 = op1;
19341 d.vmode = GET_MODE (targ);
19342 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19343 d.one_operand_p = false;
19344 d.testing_p = false;
19345
19346 for (i = 0; i < nelt; ++i)
19347 d.perm[i] = i * 2 + odd;
19348
19349 /* We'll either be able to implement the permutation directly... */
19350 if (expand_vec_perm_1 (&d))
19351 return;
19352
19353 /* ... or we use the special-case patterns. */
19354 expand_vec_perm_even_odd_1 (&d, odd);
19355 }
19356
19357 static void
19358 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
19359 {
19360 struct expand_vec_perm_d d;
19361 unsigned i, nelt, base;
19362 bool ok;
19363
19364 d.target = targ;
19365 d.op0 = op0;
19366 d.op1 = op1;
19367 d.vmode = GET_MODE (targ);
19368 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
19369 d.one_operand_p = false;
19370 d.testing_p = false;
19371
19372 base = high_p ? nelt / 2 : 0;
19373 for (i = 0; i < nelt / 2; ++i)
19374 {
19375 d.perm[i * 2] = i + base;
19376 d.perm[i * 2 + 1] = i + base + nelt;
19377 }
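/* E.g. for V4SImode this builds { 0, 4, 1, 5 } for the low interleave
   and { 2, 6, 3, 7 } for the high one.  */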
19378
19379 /* Note that for AVX this isn't one instruction. */
19380 ok = ix86_expand_vec_perm_const_1 (&d);
19381 gcc_assert (ok);
19382 }
19383
19384
19385 /* Expand a vector operation CODE for a V*QImode in terms of the
19386 same operation on V*HImode. */
19387
19388 void
19389 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
19390 {
19391 machine_mode qimode = GET_MODE (dest);
19392 machine_mode himode;
19393 rtx (*gen_il) (rtx, rtx, rtx);
19394 rtx (*gen_ih) (rtx, rtx, rtx);
19395 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
19396 struct expand_vec_perm_d d;
19397 bool ok, full_interleave;
19398 bool uns_p = false;
19399 int i;
19400
19401 switch (qimode)
19402 {
19403 case E_V16QImode:
19404 himode = V8HImode;
19405 gen_il = gen_vec_interleave_lowv16qi;
19406 gen_ih = gen_vec_interleave_highv16qi;
19407 break;
19408 case E_V32QImode:
19409 himode = V16HImode;
19410 gen_il = gen_avx2_interleave_lowv32qi;
19411 gen_ih = gen_avx2_interleave_highv32qi;
19412 break;
19413 case E_V64QImode:
19414 himode = V32HImode;
19415 gen_il = gen_avx512bw_interleave_lowv64qi;
19416 gen_ih = gen_avx512bw_interleave_highv64qi;
19417 break;
19418 default:
19419 gcc_unreachable ();
19420 }
19421
19422 op2_l = op2_h = op2;
19423 switch (code)
19424 {
19425 case MULT:
19426 /* Unpack data such that we've got a source byte in each low byte of
19427 each word. We don't care what goes into the high byte of each word.
19428          Rather than trying to get zero in there, it is most convenient to
19429          let it be a copy of the low byte.  */
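/* E.g. for V16QImode, interleaving OP2 with itself gives words of the
   form (b << 8) | b, so each word's low byte is the original byte.  */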
19430 op2_l = gen_reg_rtx (qimode);
19431 op2_h = gen_reg_rtx (qimode);
19432 emit_insn (gen_il (op2_l, op2, op2));
19433 emit_insn (gen_ih (op2_h, op2, op2));
19434
19435 op1_l = gen_reg_rtx (qimode);
19436 op1_h = gen_reg_rtx (qimode);
19437 emit_insn (gen_il (op1_l, op1, op1));
19438 emit_insn (gen_ih (op1_h, op1, op1));
19439 full_interleave = qimode == V16QImode;
19440 break;
19441
19442 case ASHIFT:
19443 case LSHIFTRT:
19444 uns_p = true;
19445 /* FALLTHRU */
19446 case ASHIFTRT:
19447 op1_l = gen_reg_rtx (himode);
19448 op1_h = gen_reg_rtx (himode);
19449 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
19450 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
19451 full_interleave = true;
19452 break;
19453 default:
19454 gcc_unreachable ();
19455 }
19456
19457 /* Perform the operation. */
19458 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
19459 1, OPTAB_DIRECT);
19460 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
19461 1, OPTAB_DIRECT);
19462 gcc_assert (res_l && res_h);
19463
19464 /* Merge the data back into the right place. */
19465 d.target = dest;
19466 d.op0 = gen_lowpart (qimode, res_l);
19467 d.op1 = gen_lowpart (qimode, res_h);
19468 d.vmode = qimode;
19469 d.nelt = GET_MODE_NUNITS (qimode);
19470 d.one_operand_p = false;
19471 d.testing_p = false;
19472
19473 if (full_interleave)
19474 {
19475         /* For SSE2, we used a full interleave, so the desired
19476 results are in the even elements. */
19477 for (i = 0; i < d.nelt; ++i)
19478 d.perm[i] = i * 2;
19479 }
19480 else
19481 {
19482 /* For AVX, the interleave used above was not cross-lane. So the
19483 extraction is evens but with the second and third quarter swapped.
19484 Happily, that is even one insn shorter than even extraction.
19485 For AVX512BW we have 4 lanes. We extract evens from within a lane,
19486            always first from the first and then from the second source operand;
19487            the index bits above the low 4 bits remain the same.
19488 Thus, for d.nelt == 32 we want permutation
19489 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
19490 and for d.nelt == 64 we want permutation
19491 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
19492 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
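/* Sanity check of the formula below for d.nelt == 32:
   i == 7  -> (14 & 14) + 0 + 0  = 14,
   i == 8  -> (16 & 14) + 32 + 0 = 32   (16 & 14 == 0),
   i == 16 -> (32 & 14) + 0 + 16 = 16,
   matching the permutation listed above.  */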
19493 for (i = 0; i < d.nelt; ++i)
19494 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
19495 }
19496
19497 ok = ix86_expand_vec_perm_const_1 (&d);
19498 gcc_assert (ok);
19499
19500 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19501 gen_rtx_fmt_ee (code, qimode, op1, op2));
19502 }
19503
19504 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
19505 if op is CONST_VECTOR with all odd elements equal to their
19506 preceding element. */
19507
19508 static bool
19509 const_vector_equal_evenodd_p (rtx op)
19510 {
19511 machine_mode mode = GET_MODE (op);
19512 int i, nunits = GET_MODE_NUNITS (mode);
19513 if (GET_CODE (op) != CONST_VECTOR
19514 || nunits != CONST_VECTOR_NUNITS (op))
19515 return false;
19516 for (i = 0; i < nunits; i += 2)
19517 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
19518 return false;
19519 return true;
19520 }
19521
19522 void
19523 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
19524 bool uns_p, bool odd_p)
19525 {
19526 machine_mode mode = GET_MODE (op1);
19527 machine_mode wmode = GET_MODE (dest);
19528 rtx x;
19529 rtx orig_op1 = op1, orig_op2 = op2;
19530
19531 if (!nonimmediate_operand (op1, mode))
19532 op1 = force_reg (mode, op1);
19533 if (!nonimmediate_operand (op2, mode))
19534 op2 = force_reg (mode, op2);
19535
19536 /* We only play even/odd games with vectors of SImode. */
19537 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
19538
19539 /* If we're looking for the odd results, shift those members down to
19540 the even slots. For some cpus this is faster than a PSHUFD. */
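/* E.g. for V4SImode viewed as V2DImode, a logical right shift of each
   64-bit element by 32 moves SImode elements 1 and 3 down into the
   even slots 0 and 2.  */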
19541 if (odd_p)
19542 {
19543 /* For XOP use vpmacsdqh, but only for smult, as it is only
19544 signed. */
19545 if (TARGET_XOP && mode == V4SImode && !uns_p)
19546 {
19547 x = force_reg (wmode, CONST0_RTX (wmode));
19548 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
19549 return;
19550 }
19551
19552 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
19553 if (!const_vector_equal_evenodd_p (orig_op1))
19554 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
19555 x, NULL, 1, OPTAB_DIRECT);
19556 if (!const_vector_equal_evenodd_p (orig_op2))
19557 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
19558 x, NULL, 1, OPTAB_DIRECT);
19559 op1 = gen_lowpart (mode, op1);
19560 op2 = gen_lowpart (mode, op2);
19561 }
19562
19563 if (mode == V16SImode)
19564 {
19565 if (uns_p)
19566 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
19567 else
19568 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
19569 }
19570 else if (mode == V8SImode)
19571 {
19572 if (uns_p)
19573 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
19574 else
19575 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
19576 }
19577 else if (uns_p)
19578 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
19579 else if (TARGET_SSE4_1)
19580 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
19581 else
19582 {
19583 rtx s1, s2, t0, t1, t2;
19584
19585 /* The easiest way to implement this without PMULDQ is to go through
19586 the motions as if we are performing a full 64-bit multiply. With
19587 the exception that we need to do less shuffling of the elements. */
19588
19589 /* Compute the sign-extension, aka highparts, of the two operands. */
19590 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19591 op1, pc_rtx, pc_rtx);
19592 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
19593 op2, pc_rtx, pc_rtx);
19594
19595 /* Multiply LO(A) * HI(B), and vice-versa. */
19596 t1 = gen_reg_rtx (wmode);
19597 t2 = gen_reg_rtx (wmode);
19598 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
19599 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
19600
19601 /* Multiply LO(A) * LO(B). */
19602 t0 = gen_reg_rtx (wmode);
19603 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
19604
19605 /* Combine and shift the highparts into place. */
19606 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
19607 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
19608 1, OPTAB_DIRECT);
19609
19610 /* Combine high and low parts. */
19611 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
19612 return;
19613 }
19614 emit_insn (x);
19615 }
19616
19617 void
19618 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
19619 bool uns_p, bool high_p)
19620 {
19621 machine_mode wmode = GET_MODE (dest);
19622 machine_mode mode = GET_MODE (op1);
19623 rtx t1, t2, t3, t4, mask;
19624
19625 switch (mode)
19626 {
19627 case E_V4SImode:
19628 t1 = gen_reg_rtx (mode);
19629 t2 = gen_reg_rtx (mode);
19630 if (TARGET_XOP && !uns_p)
19631 {
19632 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
19633 shuffle the elements once so that all elements are in the right
19634 place for immediate use: { A C B D }. */
19635 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
19636 const1_rtx, GEN_INT (3)));
19637 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
19638 const1_rtx, GEN_INT (3)));
19639 }
19640 else
19641 {
19642 /* Put the elements into place for the multiply. */
19643 ix86_expand_vec_interleave (t1, op1, op1, high_p);
19644 ix86_expand_vec_interleave (t2, op2, op2, high_p);
19645 high_p = false;
19646 }
19647 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
19648 break;
19649
19650 case E_V8SImode:
19651 /* Shuffle the elements between the lanes. After this we
19652 have { A B E F | C D G H } for each operand. */
19653 t1 = gen_reg_rtx (V4DImode);
19654 t2 = gen_reg_rtx (V4DImode);
19655 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
19656 const0_rtx, const2_rtx,
19657 const1_rtx, GEN_INT (3)));
19658 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
19659 const0_rtx, const2_rtx,
19660 const1_rtx, GEN_INT (3)));
19661
19662 /* Shuffle the elements within the lanes. After this we
19663 have { A A B B | C C D D } or { E E F F | G G H H }. */
19664 t3 = gen_reg_rtx (V8SImode);
19665 t4 = gen_reg_rtx (V8SImode);
19666 mask = GEN_INT (high_p
19667 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
19668 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
19669 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
19670 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
19671
19672 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
19673 break;
19674
19675 case E_V8HImode:
19676 case E_V16HImode:
19677 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
19678 uns_p, OPTAB_DIRECT);
19679 t2 = expand_binop (mode,
19680 uns_p ? umul_highpart_optab : smul_highpart_optab,
19681 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
19682 gcc_assert (t1 && t2);
19683
19684 t3 = gen_reg_rtx (mode);
19685 ix86_expand_vec_interleave (t3, t1, t2, high_p);
19686 emit_move_insn (dest, gen_lowpart (wmode, t3));
19687 break;
19688
19689 case E_V16QImode:
19690 case E_V32QImode:
19691 case E_V32HImode:
19692 case E_V16SImode:
19693 case E_V64QImode:
19694 t1 = gen_reg_rtx (wmode);
19695 t2 = gen_reg_rtx (wmode);
19696 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
19697 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
19698
19699 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
19700 break;
19701
19702 default:
19703 gcc_unreachable ();
19704 }
19705 }
19706
19707 void
19708 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
19709 {
19710 rtx res_1, res_2, res_3, res_4;
19711
19712 res_1 = gen_reg_rtx (V4SImode);
19713 res_2 = gen_reg_rtx (V4SImode);
19714 res_3 = gen_reg_rtx (V2DImode);
19715 res_4 = gen_reg_rtx (V2DImode);
19716 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
19717 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
19718
19719 /* Move the results in element 2 down to element 1; we don't care
19720 what goes in elements 2 and 3. Then we can merge the parts
19721 back together with an interleave.
19722
19723 Note that two other sequences were tried:
19724 (1) Use interleaves at the start instead of psrldq, which allows
19725 us to use a single shufps to merge things back at the end.
19726 (2) Use shufps here to combine the two vectors, then pshufd to
19727 put the elements in the correct order.
19728 In both cases the cost of the reformatting stall was too high
19729 and the overall sequence slower. */
19730
19731 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
19732 const0_rtx, const2_rtx,
19733 const0_rtx, const0_rtx));
19734 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
19735 const0_rtx, const2_rtx,
19736 const0_rtx, const0_rtx));
19737 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
19738
19739 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
19740 }
19741
19742 void
19743 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
19744 {
19745 machine_mode mode = GET_MODE (op0);
19746 rtx t1, t2, t3, t4, t5, t6;
19747
19748 if (TARGET_AVX512DQ && mode == V8DImode)
19749 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
19750 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
19751 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
19752 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
19753 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
19754 else if (TARGET_XOP && mode == V2DImode)
19755 {
19756 /* op1: A,B,C,D, op2: E,F,G,H */
19757 op1 = gen_lowpart (V4SImode, op1);
19758 op2 = gen_lowpart (V4SImode, op2);
19759
19760 t1 = gen_reg_rtx (V4SImode);
19761 t2 = gen_reg_rtx (V4SImode);
19762 t3 = gen_reg_rtx (V2DImode);
19763 t4 = gen_reg_rtx (V2DImode);
19764
19765 /* t1: B,A,D,C */
19766 emit_insn (gen_sse2_pshufd_1 (t1, op1,
19767 GEN_INT (1),
19768 GEN_INT (0),
19769 GEN_INT (3),
19770 GEN_INT (2)));
19771
19772 /* t2: (B*E),(A*F),(D*G),(C*H) */
19773 emit_insn (gen_mulv4si3 (t2, t1, op2));
19774
19775 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
19776 emit_insn (gen_xop_phadddq (t3, t2));
19777
19778 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
19779 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
19780
19781       /* Multiply the low parts and add everything together.  */
19782 t5 = gen_reg_rtx (V2DImode);
19783 emit_insn (gen_vec_widen_umult_even_v4si (t5,
19784 gen_lowpart (V4SImode, op1),
19785 gen_lowpart (V4SImode, op2)));
19786 force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
19787 }
19788 else
19789 {
19790 machine_mode nmode;
19791 rtx (*umul) (rtx, rtx, rtx);
19792
19793 if (mode == V2DImode)
19794 {
19795 umul = gen_vec_widen_umult_even_v4si;
19796 nmode = V4SImode;
19797 }
19798 else if (mode == V4DImode)
19799 {
19800 umul = gen_vec_widen_umult_even_v8si;
19801 nmode = V8SImode;
19802 }
19803 else if (mode == V8DImode)
19804 {
19805 umul = gen_vec_widen_umult_even_v16si;
19806 nmode = V16SImode;
19807 }
19808 else
19809 gcc_unreachable ();
19810
19811
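      /* Algebraically: with op1 = 2^32*h1 + l1 and op2 = 2^32*h2 + l2,
	 op1 * op2 == l1*l2 + 2^32 * (h1*l2 + h2*l1)  (mod 2^64);
	 T1 below computes l1*l2 and T4/T5 the two cross products.  */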
19812 /* Multiply low parts. */
19813 t1 = gen_reg_rtx (mode);
19814 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
19815
19816 /* Shift input vectors right 32 bits so we can multiply high parts. */
19817 t6 = GEN_INT (32);
19818 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
19819 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
19820
19821 /* Multiply high parts by low parts. */
19822 t4 = gen_reg_rtx (mode);
19823 t5 = gen_reg_rtx (mode);
19824 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
19825 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
19826
19827 /* Combine and shift the highparts back. */
19828 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
19829 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
19830
19831 /* Combine high and low parts. */
19832 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
19833 }
19834
19835 set_unique_reg_note (get_last_insn (), REG_EQUAL,
19836 gen_rtx_MULT (mode, op1, op2));
19837 }
19838
19839 /* Return true if control transfer instruction INSN
19840    should be encoded with the notrack prefix.  */
19841
19842 bool
19843 ix86_notrack_prefixed_insn_p (rtx insn)
19844 {
19845   if (!insn || !(flag_cf_protection & CF_BRANCH))
19846 return false;
19847
19848 if (CALL_P (insn))
19849 {
19850 rtx call = get_call_rtx_from (insn);
19851 gcc_assert (call != NULL_RTX);
19852 rtx addr = XEXP (call, 0);
19853
19854 /* Do not emit 'notrack' if it's not an indirect call. */
19855 if (MEM_P (addr)
19856 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
19857 return false;
19858 else
19859 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
19860 }
19861
19862 if (JUMP_P (insn) && !flag_cet_switch)
19863 {
19864 rtx target = JUMP_LABEL (insn);
19865 if (target == NULL_RTX || ANY_RETURN_P (target))
19866 return false;
19867
19868       /* Check whether the jump is a jump-table jump.  */
19869 rtx_insn *label = as_a<rtx_insn *> (target);
19870 rtx_insn *table = next_insn (label);
19871 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
19872 return false;
19873 else
19874 return true;
19875 }
19876 return false;
19877 }
19878
19879 /* Calculate integer vector abs(), using SSE2 instructions (plus SSE4.2/AVX2 compares for 64-bit element modes when available).  */
19880
19881 void
19882 ix86_expand_sse2_abs (rtx target, rtx input)
19883 {
19884 machine_mode mode = GET_MODE (target);
19885 rtx tmp0, tmp1, x;
19886
19887 switch (mode)
19888 {
19889 case E_V2DImode:
19890 case E_V4DImode:
19891 /* For 64-bit signed integer X, with SSE4.2 use
19892 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
19893    Otherwise handle it similarly to V4SImode, except use 64 as W instead of
19894    32 and, since the arithmetic right shift is unimplemented for these
19895    modes, build the sign mask with a logical right shift by W-1 followed
   by a negation.  */
19896 if (TARGET_SSE4_2)
19897 {
19898 tmp0 = gen_reg_rtx (mode);
19899 tmp1 = gen_reg_rtx (mode);
19900 emit_move_insn (tmp1, CONST0_RTX (mode));
19901 if (mode == E_V2DImode)
19902 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
19903 else
19904 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
19905 }
19906 else
19907 {
19908 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
19909 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
19910 - 1), NULL, 0, OPTAB_DIRECT);
19911 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
19912 }
19913
19914 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
19915 NULL, 0, OPTAB_DIRECT);
19916 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
19917 target, 0, OPTAB_DIRECT);
19918 break;
19919
19920 case E_V4SImode:
19921 /* For 32-bit signed integer X, the best way to calculate the absolute
19922 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
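/* E.g. X == -5: X >> 31 == -1, (-1 ^ -5) == 4, and 4 - (-1) == 5.  */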
19923 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
19924 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
19925 NULL, 0, OPTAB_DIRECT);
19926 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
19927 NULL, 0, OPTAB_DIRECT);
19928 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
19929 target, 0, OPTAB_DIRECT);
19930 break;
19931
19932 case E_V8HImode:
19933 /* For 16-bit signed integer X, the best way to calculate the absolute
19934 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
19935 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
19936
19937 x = expand_simple_binop (mode, SMAX, tmp0, input,
19938 target, 0, OPTAB_DIRECT);
19939 break;
19940
19941 case E_V16QImode:
19942 /* For 8-bit signed integer X, the best way to calculate the absolute
19943 value of X is min ((unsigned char) X, (unsigned char) (-X)),
19944 as SSE2 provides the PMINUB insn. */
19945 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
19946
19947 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
19948 target, 0, OPTAB_DIRECT);
19949 break;
19950
19951 default:
19952 gcc_unreachable ();
19953 }
19954
19955 if (x != target)
19956 emit_move_insn (target, x);
19957 }
19958
19959 /* Expand an extract from a vector register through pextr insn.
19960 Return true if successful. */
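/* OPERANDS are dst, src, size (in bits) and pos (bit position); e.g.
   extracting 32 bits at bit position 32 of a TImode source becomes a
   pextrd of V4SImode element 1 (pos / size).  */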
19961
19962 bool
19963 ix86_expand_pextr (rtx *operands)
19964 {
19965 rtx dst = operands[0];
19966 rtx src = operands[1];
19967
19968 unsigned int size = INTVAL (operands[2]);
19969 unsigned int pos = INTVAL (operands[3]);
19970
19971 if (SUBREG_P (dst))
19972 {
19973 /* Reject non-lowpart subregs. */
19974 if (SUBREG_BYTE (dst) > 0)
19975 return false;
19976 dst = SUBREG_REG (dst);
19977 }
19978
19979 if (SUBREG_P (src))
19980 {
19981 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
19982 src = SUBREG_REG (src);
19983 }
19984
19985 switch (GET_MODE (src))
19986 {
19987 case E_V16QImode:
19988 case E_V8HImode:
19989 case E_V4SImode:
19990 case E_V2DImode:
19991 case E_V1TImode:
19992 case E_TImode:
19993 {
19994 machine_mode srcmode, dstmode;
19995 rtx d, pat;
19996
19997 if (!int_mode_for_size (size, 0).exists (&dstmode))
19998 return false;
19999
20000 switch (dstmode)
20001 {
20002 case E_QImode:
20003 if (!TARGET_SSE4_1)
20004 return false;
20005 srcmode = V16QImode;
20006 break;
20007
20008 case E_HImode:
20009 if (!TARGET_SSE2)
20010 return false;
20011 srcmode = V8HImode;
20012 break;
20013
20014 case E_SImode:
20015 if (!TARGET_SSE4_1)
20016 return false;
20017 srcmode = V4SImode;
20018 break;
20019
20020 case E_DImode:
20021 gcc_assert (TARGET_64BIT);
20022 if (!TARGET_SSE4_1)
20023 return false;
20024 srcmode = V2DImode;
20025 break;
20026
20027 default:
20028 return false;
20029 }
20030
20031 /* Reject extractions from misaligned positions. */
20032 if (pos & (size-1))
20033 return false;
20034
20035 if (GET_MODE (dst) == dstmode)
20036 d = dst;
20037 else
20038 d = gen_reg_rtx (dstmode);
20039
20040 /* Construct insn pattern. */
20041 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
20042 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
20043
20044 /* Let the rtl optimizers know about the zero extension performed. */
20045 if (dstmode == QImode || dstmode == HImode)
20046 {
20047 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
20048 d = gen_lowpart (SImode, d);
20049 }
20050
20051 emit_insn (gen_rtx_SET (d, pat));
20052
20053 if (d != dst)
20054 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20055 return true;
20056 }
20057
20058 default:
20059 return false;
20060 }
20061 }
20062
20063 /* Expand an insert into a vector register through pinsr insn.
20064 Return true if successful. */
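/* OPERANDS are dst, size (in bits), pos (bit position) and src; e.g.
   inserting 16 bits at bit position 48 of a V8HImode destination becomes
   a pinsrw with selector 1 << (48 / 16) == 8.  */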
20065
20066 bool
20067 ix86_expand_pinsr (rtx *operands)
20068 {
20069 rtx dst = operands[0];
20070 rtx src = operands[3];
20071
20072 unsigned int size = INTVAL (operands[1]);
20073 unsigned int pos = INTVAL (operands[2]);
20074
20075 if (SUBREG_P (dst))
20076 {
20077 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
20078 dst = SUBREG_REG (dst);
20079 }
20080
20081 switch (GET_MODE (dst))
20082 {
20083 case E_V16QImode:
20084 case E_V8HImode:
20085 case E_V4SImode:
20086 case E_V2DImode:
20087 case E_V1TImode:
20088 case E_TImode:
20089 {
20090 machine_mode srcmode, dstmode;
20091 rtx (*pinsr)(rtx, rtx, rtx, rtx);
20092 rtx d;
20093
20094 if (!int_mode_for_size (size, 0).exists (&srcmode))
20095 return false;
20096
20097 switch (srcmode)
20098 {
20099 case E_QImode:
20100 if (!TARGET_SSE4_1)
20101 return false;
20102 dstmode = V16QImode;
20103 pinsr = gen_sse4_1_pinsrb;
20104 break;
20105
20106 case E_HImode:
20107 if (!TARGET_SSE2)
20108 return false;
20109 dstmode = V8HImode;
20110 pinsr = gen_sse2_pinsrw;
20111 break;
20112
20113 case E_SImode:
20114 if (!TARGET_SSE4_1)
20115 return false;
20116 dstmode = V4SImode;
20117 pinsr = gen_sse4_1_pinsrd;
20118 break;
20119
20120 case E_DImode:
20121 gcc_assert (TARGET_64BIT);
20122 if (!TARGET_SSE4_1)
20123 return false;
20124 dstmode = V2DImode;
20125 pinsr = gen_sse4_1_pinsrq;
20126 break;
20127
20128 default:
20129 return false;
20130 }
20131
20132 /* Reject insertions to misaligned positions. */
20133 if (pos & (size-1))
20134 return false;
20135
20136 if (SUBREG_P (src))
20137 {
20138 unsigned int srcpos = SUBREG_BYTE (src);
20139
20140 if (srcpos > 0)
20141 {
20142 rtx extr_ops[4];
20143
20144 extr_ops[0] = gen_reg_rtx (srcmode);
20145 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
20146 extr_ops[2] = GEN_INT (size);
20147 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
20148
20149 if (!ix86_expand_pextr (extr_ops))
20150 return false;
20151
20152 src = extr_ops[0];
20153 }
20154 else
20155 src = gen_lowpart (srcmode, SUBREG_REG (src));
20156 }
20157
20158 if (GET_MODE (dst) == dstmode)
20159 d = dst;
20160 else
20161 d = gen_reg_rtx (dstmode);
20162
20163 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
20164 gen_lowpart (srcmode, src),
20165 GEN_INT (1 << (pos / size))));
20166 if (d != dst)
20167 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
20168 return true;
20169 }
20170
20171 default:
20172 return false;
20173 }
20174 }
20175
20176 /* All CPUs prefer to avoid cross-lane operations, so reductions combine
20177    the upper half against the lower half, down to SSE register width.  */
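/* E.g. for a V8SFmode reduction this returns V4SFmode; the (generic)
   caller then combines the two V4SFmode halves element-wise before
   finishing the reduction at SSE width.  */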
20178
20179 machine_mode
20180 ix86_split_reduction (machine_mode mode)
20181 {
20182 /* Reduce lowpart against highpart until we reach SSE reg width to
20183 avoid cross-lane operations. */
20184 switch (mode)
20185 {
20186 case E_V8DImode:
20187 case E_V4DImode:
20188 return V2DImode;
20189 case E_V16SImode:
20190 case E_V8SImode:
20191 return V4SImode;
20192 case E_V32HImode:
20193 case E_V16HImode:
20194 return V8HImode;
20195 case E_V64QImode:
20196 case E_V32QImode:
20197 return V16QImode;
20198 case E_V16SFmode:
20199 case E_V8SFmode:
20200 return V4SFmode;
20201 case E_V8DFmode:
20202 case E_V4DFmode:
20203 return V2DFmode;
20204 default:
20205 return mode;
20206 }
20207 }
20208
20209 /* Generate call to __divmoddi4. */
20210
20211 void
20212 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
20213 rtx op0, rtx op1,
20214 rtx *quot_p, rtx *rem_p)
20215 {
20216 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
20217
20218 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
20219 mode, op0, mode, op1, mode,
20220 XEXP (rem, 0), Pmode);
20221 *quot_p = quot;
20222 *rem_p = rem;
20223 }
20224
20225 #include "gt-i386-expand.h"