1 /* Copyright (C) 1988-2021 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
19 #define IN_TARGET_CODE 1
23 #include "coretypes.h"
33 #include "stringpool.h"
40 #include "diagnostic.h"
43 #include "fold-const.h"
46 #include "stor-layout.h"
49 #include "insn-attr.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
60 #include "tm-constrs.h"
62 #include "sched-int.h"
64 #include "tree-pass.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
73 #include "tree-iterator.h"
75 #include "case-cfn-macros.h"
77 #include "fold-const-call.h"
79 #include "tree-ssanames.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
85 #include "symbol-summary.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
103 split_double_mode (machine_mode mode
, rtx operands
[],
104 int num
, rtx lo_half
[], rtx hi_half
[])
106 machine_mode half_mode
;
108 rtx mem_op
= NULL_RTX
;
129 byte
= GET_MODE_SIZE (half_mode
);
133 rtx op
= operands
[num
];
135 /* simplify_subreg refuse to split volatile memory addresses,
136 but we still have to handle it. */
139 if (mem_op
&& rtx_equal_p (op
, mem_op
))
141 lo_half
[num
] = lo_half
[mem_num
];
142 hi_half
[num
] = hi_half
[mem_num
];
148 lo_half
[num
] = adjust_address (op
, half_mode
, 0);
149 hi_half
[num
] = adjust_address (op
, half_mode
, byte
);
154 lo_half
[num
] = simplify_gen_subreg (half_mode
, op
,
155 GET_MODE (op
) == VOIDmode
156 ? mode
: GET_MODE (op
), 0);
157 hi_half
[num
] = simplify_gen_subreg (half_mode
, op
,
158 GET_MODE (op
) == VOIDmode
159 ? mode
: GET_MODE (op
), byte
);
164 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
168 ix86_expand_clear (rtx dest
)
172 /* We play register width games, which are only valid after reload. */
173 gcc_assert (reload_completed
);
175 /* Avoid HImode and its attendant prefix byte. */
176 if (GET_MODE_SIZE (GET_MODE (dest
)) < 4)
177 dest
= gen_rtx_REG (SImode
, REGNO (dest
));
178 tmp
= gen_rtx_SET (dest
, const0_rtx
);
180 if (!TARGET_USE_MOV0
|| optimize_insn_for_size_p ())
182 rtx clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
183 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, tmp
, clob
));
190 ix86_expand_move (machine_mode mode
, rtx operands
[])
193 rtx tmp
, addend
= NULL_RTX
;
194 enum tls_model model
;
199 /* Avoid complex sets of likely spilled hard registers before reload. */
200 if (!ix86_hardreg_mov_ok (op0
, op1
))
202 tmp
= gen_reg_rtx (mode
);
204 ix86_expand_move (mode
, operands
);
210 switch (GET_CODE (op1
))
215 if (GET_CODE (tmp
) != PLUS
216 || GET_CODE (XEXP (tmp
, 0)) != SYMBOL_REF
)
220 addend
= XEXP (tmp
, 1);
224 model
= SYMBOL_REF_TLS_MODEL (op1
);
227 op1
= legitimize_tls_address (op1
, model
, true);
228 else if (ix86_force_load_from_GOT_p (op1
))
230 /* Load the external function address via GOT slot to avoid PLT. */
231 op1
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, op1
),
235 op1
= gen_rtx_CONST (Pmode
, op1
);
236 op1
= gen_const_mem (Pmode
, op1
);
237 set_mem_alias_set (op1
, ix86_GOT_alias_set ());
241 tmp
= legitimize_pe_coff_symbol (op1
, addend
!= NULL_RTX
);
257 op1
= force_operand (op1
, NULL_RTX
);
258 op1
= expand_simple_binop (Pmode
, PLUS
, op1
, addend
,
259 op0
, 1, OPTAB_DIRECT
);
262 op1
= force_operand (op1
, op0
);
267 op1
= convert_to_mode (mode
, op1
, 1);
273 if ((flag_pic
|| MACHOPIC_INDIRECT
)
274 && symbolic_operand (op1
, mode
))
276 if (TARGET_MACHO
&& !TARGET_64BIT
)
280 if (MACHOPIC_INDIRECT
)
282 rtx temp
= (op0
&& REG_P (op0
) && mode
== Pmode
)
283 ? op0
: gen_reg_rtx (Pmode
);
284 op1
= machopic_indirect_data_reference (op1
, temp
);
286 op1
= machopic_legitimize_pic_address (op1
, mode
,
287 temp
== op1
? 0 : temp
);
289 if (op0
!= op1
&& GET_CODE (op0
) != MEM
)
291 rtx insn
= gen_rtx_SET (op0
, op1
);
295 if (GET_CODE (op0
) == MEM
)
296 op1
= force_reg (Pmode
, op1
);
300 if (GET_CODE (temp
) != REG
)
301 temp
= gen_reg_rtx (Pmode
);
302 temp
= legitimize_pic_address (op1
, temp
);
313 op1
= force_reg (mode
, op1
);
314 else if (!(TARGET_64BIT
&& x86_64_movabs_operand (op1
, DImode
)))
316 rtx reg
= can_create_pseudo_p () ? NULL_RTX
: op0
;
317 op1
= legitimize_pic_address (op1
, reg
);
320 op1
= convert_to_mode (mode
, op1
, 1);
327 && (PUSH_ROUNDING (GET_MODE_SIZE (mode
)) != GET_MODE_SIZE (mode
)
328 || !push_operand (op0
, mode
))
330 op1
= force_reg (mode
, op1
);
332 if (push_operand (op0
, mode
)
333 && ! general_no_elim_operand (op1
, mode
))
334 op1
= copy_to_mode_reg (mode
, op1
);
336 /* Force large constants in 64bit compilation into register
337 to get them CSEed. */
338 if (can_create_pseudo_p ()
339 && (mode
== DImode
) && TARGET_64BIT
340 && immediate_operand (op1
, mode
)
341 && !x86_64_zext_immediate_operand (op1
, VOIDmode
)
342 && !register_operand (op0
, mode
)
344 op1
= copy_to_mode_reg (mode
, op1
);
346 if (can_create_pseudo_p ()
347 && CONST_DOUBLE_P (op1
))
349 /* If we are loading a floating point constant to a register,
350 force the value to memory now, since we'll get better code
353 op1
= validize_mem (force_const_mem (mode
, op1
));
354 if (!register_operand (op0
, mode
))
356 rtx temp
= gen_reg_rtx (mode
);
357 emit_insn (gen_rtx_SET (temp
, op1
));
358 emit_move_insn (op0
, temp
);
364 emit_insn (gen_rtx_SET (op0
, op1
));
368 ix86_expand_vector_move (machine_mode mode
, rtx operands
[])
370 rtx op0
= operands
[0], op1
= operands
[1];
371 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
372 psABI since the biggest alignment is 4 byte for IA MCU psABI. */
373 unsigned int align
= (TARGET_IAMCU
374 ? GET_MODE_BITSIZE (mode
)
375 : GET_MODE_ALIGNMENT (mode
));
377 if (push_operand (op0
, VOIDmode
))
378 op0
= emit_move_resolve_push (mode
, op0
);
380 /* Force constants other than zero into memory. We do not know how
381 the instructions used to build constants modify the upper 64 bits
382 of the register, once we have that information we may be able
383 to handle some of them more efficiently. */
384 if (can_create_pseudo_p ()
387 && CONSTANT_P (SUBREG_REG (op1
))))
388 && ((register_operand (op0
, mode
)
389 && !standard_sse_constant_p (op1
, mode
))
390 /* ix86_expand_vector_move_misalign() does not like constants. */
391 || (SSE_REG_MODE_P (mode
)
393 && MEM_ALIGN (op0
) < align
)))
397 machine_mode imode
= GET_MODE (SUBREG_REG (op1
));
398 rtx r
= force_const_mem (imode
, SUBREG_REG (op1
));
400 r
= validize_mem (r
);
402 r
= force_reg (imode
, SUBREG_REG (op1
));
403 op1
= simplify_gen_subreg (mode
, r
, imode
, SUBREG_BYTE (op1
));
406 op1
= validize_mem (force_const_mem (mode
, op1
));
409 /* We need to check memory alignment for SSE mode since attribute
410 can make operands unaligned. */
411 if (can_create_pseudo_p ()
412 && SSE_REG_MODE_P (mode
)
413 && ((MEM_P (op0
) && (MEM_ALIGN (op0
) < align
))
414 || (MEM_P (op1
) && (MEM_ALIGN (op1
) < align
))))
418 /* ix86_expand_vector_move_misalign() does not like both
419 arguments in memory. */
420 if (!register_operand (op0
, mode
)
421 && !register_operand (op1
, mode
))
422 op1
= force_reg (mode
, op1
);
424 tmp
[0] = op0
; tmp
[1] = op1
;
425 ix86_expand_vector_move_misalign (mode
, tmp
);
429 /* Make operand1 a register if it isn't already. */
430 if (can_create_pseudo_p ()
431 && !register_operand (op0
, mode
)
432 && !register_operand (op1
, mode
))
434 emit_move_insn (op0
, force_reg (GET_MODE (op0
), op1
));
438 emit_insn (gen_rtx_SET (op0
, op1
));
441 /* Split 32-byte AVX unaligned load and store if needed. */
444 ix86_avx256_split_vector_move_misalign (rtx op0
, rtx op1
)
447 rtx (*extract
) (rtx
, rtx
, rtx
);
450 if ((MEM_P (op1
) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD
)
451 || (MEM_P (op0
) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE
))
453 emit_insn (gen_rtx_SET (op0
, op1
));
457 rtx orig_op0
= NULL_RTX
;
458 mode
= GET_MODE (op0
);
459 switch (GET_MODE_CLASS (mode
))
461 case MODE_VECTOR_INT
:
463 if (mode
!= V32QImode
)
468 op0
= gen_reg_rtx (V32QImode
);
471 op0
= gen_lowpart (V32QImode
, op0
);
472 op1
= gen_lowpart (V32QImode
, op1
);
476 case MODE_VECTOR_FLOAT
:
487 extract
= gen_avx_vextractf128v32qi
;
491 extract
= gen_avx_vextractf128v8sf
;
495 extract
= gen_avx_vextractf128v4df
;
502 rtx r
= gen_reg_rtx (mode
);
503 m
= adjust_address (op1
, mode
, 0);
504 emit_move_insn (r
, m
);
505 m
= adjust_address (op1
, mode
, 16);
506 r
= gen_rtx_VEC_CONCAT (GET_MODE (op0
), r
, m
);
507 emit_move_insn (op0
, r
);
509 else if (MEM_P (op0
))
511 m
= adjust_address (op0
, mode
, 0);
512 emit_insn (extract (m
, op1
, const0_rtx
));
513 m
= adjust_address (op0
, mode
, 16);
514 emit_insn (extract (m
, copy_rtx (op1
), const1_rtx
));
520 emit_move_insn (orig_op0
, gen_lowpart (GET_MODE (orig_op0
), op0
));
523 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
524 straight to ix86_expand_vector_move. */
525 /* Code generation for scalar reg-reg moves of single and double precision data:
526 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
530 if (x86_sse_partial_reg_dependency == true)
535 Code generation for scalar loads of double precision data:
536 if (x86_sse_split_regs == true)
537 movlpd mem, reg (gas syntax)
541 Code generation for unaligned packed loads of single precision data
542 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
543 if (x86_sse_unaligned_move_optimal)
546 if (x86_sse_partial_reg_dependency == true)
558 Code generation for unaligned packed loads of double precision data
559 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
560 if (x86_sse_unaligned_move_optimal)
563 if (x86_sse_split_regs == true)
576 ix86_expand_vector_move_misalign (machine_mode mode
, rtx operands
[])
583 /* Use unaligned load/store for AVX512 or when optimizing for size. */
584 if (GET_MODE_SIZE (mode
) == 64 || optimize_insn_for_size_p ())
586 emit_insn (gen_rtx_SET (op0
, op1
));
592 if (GET_MODE_SIZE (mode
) == 32)
593 ix86_avx256_split_vector_move_misalign (op0
, op1
);
595 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
596 emit_insn (gen_rtx_SET (op0
, op1
));
600 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
601 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
)
603 emit_insn (gen_rtx_SET (op0
, op1
));
607 /* ??? If we have typed data, then it would appear that using
608 movdqu is the only way to get unaligned data loaded with
610 if (TARGET_SSE2
&& GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
)
612 emit_insn (gen_rtx_SET (op0
, op1
));
618 if (TARGET_SSE2
&& mode
== V2DFmode
)
622 /* When SSE registers are split into halves, we can avoid
623 writing to the top half twice. */
624 if (TARGET_SSE_SPLIT_REGS
)
631 /* ??? Not sure about the best option for the Intel chips.
632 The following would seem to satisfy; the register is
633 entirely cleared, breaking the dependency chain. We
634 then store to the upper half, with a dependency depth
635 of one. A rumor has it that Intel recommends two movsd
636 followed by an unpacklpd, but this is unconfirmed. And
637 given that the dependency depth of the unpacklpd would
638 still be one, I'm not sure why this would be better. */
639 zero
= CONST0_RTX (V2DFmode
);
642 m
= adjust_address (op1
, DFmode
, 0);
643 emit_insn (gen_sse2_loadlpd (op0
, zero
, m
));
644 m
= adjust_address (op1
, DFmode
, 8);
645 emit_insn (gen_sse2_loadhpd (op0
, op0
, m
));
651 if (mode
!= V4SFmode
)
652 t
= gen_reg_rtx (V4SFmode
);
656 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY
)
657 emit_move_insn (t
, CONST0_RTX (V4SFmode
));
661 m
= adjust_address (op1
, V2SFmode
, 0);
662 emit_insn (gen_sse_loadlps (t
, t
, m
));
663 m
= adjust_address (op1
, V2SFmode
, 8);
664 emit_insn (gen_sse_loadhps (t
, t
, m
));
665 if (mode
!= V4SFmode
)
666 emit_move_insn (op0
, gen_lowpart (mode
, t
));
669 else if (MEM_P (op0
))
671 if (TARGET_SSE2
&& mode
== V2DFmode
)
673 m
= adjust_address (op0
, DFmode
, 0);
674 emit_insn (gen_sse2_storelpd (m
, op1
));
675 m
= adjust_address (op0
, DFmode
, 8);
676 emit_insn (gen_sse2_storehpd (m
, op1
));
680 if (mode
!= V4SFmode
)
681 op1
= gen_lowpart (V4SFmode
, op1
);
683 m
= adjust_address (op0
, V2SFmode
, 0);
684 emit_insn (gen_sse_storelps (m
, op1
));
685 m
= adjust_address (op0
, V2SFmode
, 8);
686 emit_insn (gen_sse_storehps (m
, copy_rtx (op1
)));
693 /* Move bits 64:95 to bits 32:63. */
696 ix86_move_vector_high_sse_to_mmx (rtx op
)
698 rtx mask
= gen_rtx_PARALLEL (VOIDmode
,
699 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
700 GEN_INT (0), GEN_INT (0)));
701 rtx dest
= lowpart_subreg (V4SImode
, op
, GET_MODE (op
));
702 op
= gen_rtx_VEC_SELECT (V4SImode
, dest
, mask
);
703 rtx insn
= gen_rtx_SET (dest
, op
);
707 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
710 ix86_split_mmx_pack (rtx operands
[], enum rtx_code code
)
712 rtx op0
= operands
[0];
713 rtx op1
= operands
[1];
714 rtx op2
= operands
[2];
716 machine_mode dmode
= GET_MODE (op0
);
717 machine_mode smode
= GET_MODE (op1
);
718 machine_mode inner_dmode
= GET_MODE_INNER (dmode
);
719 machine_mode inner_smode
= GET_MODE_INNER (smode
);
721 /* Get the corresponding SSE mode for destination. */
722 int nunits
= 16 / GET_MODE_SIZE (inner_dmode
);
723 machine_mode sse_dmode
= mode_for_vector (GET_MODE_INNER (dmode
),
725 machine_mode sse_half_dmode
= mode_for_vector (GET_MODE_INNER (dmode
),
726 nunits
/ 2).require ();
728 /* Get the corresponding SSE mode for source. */
729 nunits
= 16 / GET_MODE_SIZE (inner_smode
);
730 machine_mode sse_smode
= mode_for_vector (GET_MODE_INNER (smode
),
733 /* Generate SSE pack with signed/unsigned saturation. */
734 rtx dest
= lowpart_subreg (sse_dmode
, op0
, GET_MODE (op0
));
735 op1
= lowpart_subreg (sse_smode
, op1
, GET_MODE (op1
));
736 op2
= lowpart_subreg (sse_smode
, op2
, GET_MODE (op2
));
738 op1
= gen_rtx_fmt_e (code
, sse_half_dmode
, op1
);
739 op2
= gen_rtx_fmt_e (code
, sse_half_dmode
, op2
);
740 rtx insn
= gen_rtx_SET (dest
, gen_rtx_VEC_CONCAT (sse_dmode
,
744 ix86_move_vector_high_sse_to_mmx (op0
);
747 /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
750 ix86_split_mmx_punpck (rtx operands
[], bool high_p
)
752 rtx op0
= operands
[0];
753 rtx op1
= operands
[1];
754 rtx op2
= operands
[2];
755 machine_mode mode
= GET_MODE (op0
);
757 /* The corresponding SSE mode. */
758 machine_mode sse_mode
, double_sse_mode
;
763 sse_mode
= V16QImode
;
764 double_sse_mode
= V32QImode
;
765 mask
= gen_rtx_PARALLEL (VOIDmode
,
767 GEN_INT (0), GEN_INT (16),
768 GEN_INT (1), GEN_INT (17),
769 GEN_INT (2), GEN_INT (18),
770 GEN_INT (3), GEN_INT (19),
771 GEN_INT (4), GEN_INT (20),
772 GEN_INT (5), GEN_INT (21),
773 GEN_INT (6), GEN_INT (22),
774 GEN_INT (7), GEN_INT (23)));
779 double_sse_mode
= V16HImode
;
780 mask
= gen_rtx_PARALLEL (VOIDmode
,
782 GEN_INT (0), GEN_INT (8),
783 GEN_INT (1), GEN_INT (9),
784 GEN_INT (2), GEN_INT (10),
785 GEN_INT (3), GEN_INT (11)));
790 double_sse_mode
= V8SImode
;
791 mask
= gen_rtx_PARALLEL (VOIDmode
,
793 GEN_INT (0), GEN_INT (4),
794 GEN_INT (1), GEN_INT (5)));
801 /* Generate SSE punpcklXX. */
802 rtx dest
= lowpart_subreg (sse_mode
, op0
, GET_MODE (op0
));
803 op1
= lowpart_subreg (sse_mode
, op1
, GET_MODE (op1
));
804 op2
= lowpart_subreg (sse_mode
, op2
, GET_MODE (op2
));
806 op1
= gen_rtx_VEC_CONCAT (double_sse_mode
, op1
, op2
);
807 op2
= gen_rtx_VEC_SELECT (sse_mode
, op1
, mask
);
808 rtx insn
= gen_rtx_SET (dest
, op2
);
813 /* Move bits 64:127 to bits 0:63. */
814 mask
= gen_rtx_PARALLEL (VOIDmode
,
815 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
816 GEN_INT (0), GEN_INT (0)));
817 dest
= lowpart_subreg (V4SImode
, dest
, GET_MODE (dest
));
818 op1
= gen_rtx_VEC_SELECT (V4SImode
, dest
, mask
);
819 insn
= gen_rtx_SET (dest
, op1
);
824 /* Helper function of ix86_fixup_binary_operands to canonicalize
825 operand order. Returns true if the operands should be swapped. */
828 ix86_swap_binary_operands_p (enum rtx_code code
, machine_mode mode
,
831 rtx dst
= operands
[0];
832 rtx src1
= operands
[1];
833 rtx src2
= operands
[2];
835 /* If the operation is not commutative, we can't do anything. */
836 if (GET_RTX_CLASS (code
) != RTX_COMM_ARITH
837 && GET_RTX_CLASS (code
) != RTX_COMM_COMPARE
)
840 /* Highest priority is that src1 should match dst. */
841 if (rtx_equal_p (dst
, src1
))
843 if (rtx_equal_p (dst
, src2
))
846 /* Next highest priority is that immediate constants come second. */
847 if (immediate_operand (src2
, mode
))
849 if (immediate_operand (src1
, mode
))
852 /* Lowest priority is that memory references should come second. */
862 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
863 destination to use for the operation. If different from the true
864 destination in operands[0], a copy operation will be required. */
867 ix86_fixup_binary_operands (enum rtx_code code
, machine_mode mode
,
870 rtx dst
= operands
[0];
871 rtx src1
= operands
[1];
872 rtx src2
= operands
[2];
874 /* Canonicalize operand order. */
875 if (ix86_swap_binary_operands_p (code
, mode
, operands
))
877 /* It is invalid to swap operands of different modes. */
878 gcc_assert (GET_MODE (src1
) == GET_MODE (src2
));
880 std::swap (src1
, src2
);
883 /* Both source operands cannot be in memory. */
884 if (MEM_P (src1
) && MEM_P (src2
))
886 /* Optimization: Only read from memory once. */
887 if (rtx_equal_p (src1
, src2
))
889 src2
= force_reg (mode
, src2
);
892 else if (rtx_equal_p (dst
, src1
))
893 src2
= force_reg (mode
, src2
);
895 src1
= force_reg (mode
, src1
);
898 /* If the destination is memory, and we do not have matching source
899 operands, do things in registers. */
900 if (MEM_P (dst
) && !rtx_equal_p (dst
, src1
))
901 dst
= gen_reg_rtx (mode
);
903 /* Source 1 cannot be a constant. */
904 if (CONSTANT_P (src1
))
905 src1
= force_reg (mode
, src1
);
907 /* Source 1 cannot be a non-matching memory. */
908 if (MEM_P (src1
) && !rtx_equal_p (dst
, src1
))
909 src1
= force_reg (mode
, src1
);
911 /* Improve address combine. */
913 && GET_MODE_CLASS (mode
) == MODE_INT
915 src2
= force_reg (mode
, src2
);
922 /* Similarly, but assume that the destination has already been
926 ix86_fixup_binary_operands_no_copy (enum rtx_code code
,
927 machine_mode mode
, rtx operands
[])
929 rtx dst
= ix86_fixup_binary_operands (code
, mode
, operands
);
930 gcc_assert (dst
== operands
[0]);
933 /* Attempt to expand a binary operator. Make the expansion closer to the
934 actual machine, then just general_operand, which will allow 3 separate
935 memory references (one output, two input) in a single insn. */
938 ix86_expand_binary_operator (enum rtx_code code
, machine_mode mode
,
941 rtx src1
, src2
, dst
, op
, clob
;
943 dst
= ix86_fixup_binary_operands (code
, mode
, operands
);
947 /* Emit the instruction. */
949 op
= gen_rtx_SET (dst
, gen_rtx_fmt_ee (code
, mode
, src1
, src2
));
953 && !rtx_equal_p (dst
, src1
))
955 /* This is going to be an LEA; avoid splitting it later. */
960 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
961 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
964 /* Fix up the destination if needed. */
965 if (dst
!= operands
[0])
966 emit_move_insn (operands
[0], dst
);
969 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
970 the given OPERANDS. */
973 ix86_expand_vector_logical_operator (enum rtx_code code
, machine_mode mode
,
976 rtx op1
= NULL_RTX
, op2
= NULL_RTX
;
977 if (SUBREG_P (operands
[1]))
982 else if (SUBREG_P (operands
[2]))
987 /* Optimize (__m128i) d | (__m128i) e and similar code
988 when d and e are float vectors into float vector logical
989 insn. In C/C++ without using intrinsics there is no other way
990 to express vector logical operation on float vectors than
991 to cast them temporarily to integer vectors. */
993 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
994 && (SUBREG_P (op2
) || GET_CODE (op2
) == CONST_VECTOR
)
995 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1
))) == MODE_VECTOR_FLOAT
996 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1
))) == GET_MODE_SIZE (mode
)
997 && SUBREG_BYTE (op1
) == 0
998 && (GET_CODE (op2
) == CONST_VECTOR
999 || (GET_MODE (SUBREG_REG (op1
)) == GET_MODE (SUBREG_REG (op2
))
1000 && SUBREG_BYTE (op2
) == 0))
1001 && can_create_pseudo_p ())
1004 switch (GET_MODE (SUBREG_REG (op1
)))
1012 dst
= gen_reg_rtx (GET_MODE (SUBREG_REG (op1
)));
1013 if (GET_CODE (op2
) == CONST_VECTOR
)
1015 op2
= gen_lowpart (GET_MODE (dst
), op2
);
1016 op2
= force_reg (GET_MODE (dst
), op2
);
1021 op2
= SUBREG_REG (operands
[2]);
1022 if (!vector_operand (op2
, GET_MODE (dst
)))
1023 op2
= force_reg (GET_MODE (dst
), op2
);
1025 op1
= SUBREG_REG (op1
);
1026 if (!vector_operand (op1
, GET_MODE (dst
)))
1027 op1
= force_reg (GET_MODE (dst
), op1
);
1028 emit_insn (gen_rtx_SET (dst
,
1029 gen_rtx_fmt_ee (code
, GET_MODE (dst
),
1031 emit_move_insn (operands
[0], gen_lowpart (mode
, dst
));
1037 if (!vector_operand (operands
[1], mode
))
1038 operands
[1] = force_reg (mode
, operands
[1]);
1039 if (!vector_operand (operands
[2], mode
))
1040 operands
[2] = force_reg (mode
, operands
[2]);
1041 ix86_fixup_binary_operands_no_copy (code
, mode
, operands
);
1042 emit_insn (gen_rtx_SET (operands
[0],
1043 gen_rtx_fmt_ee (code
, mode
, operands
[1],
1047 /* Return TRUE or FALSE depending on whether the binary operator meets the
1048 appropriate constraints. */
1051 ix86_binary_operator_ok (enum rtx_code code
, machine_mode mode
,
1054 rtx dst
= operands
[0];
1055 rtx src1
= operands
[1];
1056 rtx src2
= operands
[2];
1058 /* Both source operands cannot be in memory. */
1059 if ((MEM_P (src1
) || bcst_mem_operand (src1
, mode
))
1060 && (MEM_P (src2
) || bcst_mem_operand (src2
, mode
)))
1063 /* Canonicalize operand order for commutative operators. */
1064 if (ix86_swap_binary_operands_p (code
, mode
, operands
))
1065 std::swap (src1
, src2
);
1067 /* If the destination is memory, we must have a matching source operand. */
1068 if (MEM_P (dst
) && !rtx_equal_p (dst
, src1
))
1071 /* Source 1 cannot be a constant. */
1072 if (CONSTANT_P (src1
))
1075 /* Source 1 cannot be a non-matching memory. */
1076 if (MEM_P (src1
) && !rtx_equal_p (dst
, src1
))
1077 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1081 || (TARGET_64BIT
&& mode
== DImode
))
1082 && satisfies_constraint_L (src2
));
1087 /* Attempt to expand a unary operator. Make the expansion closer to the
1088 actual machine, then just general_operand, which will allow 2 separate
1089 memory references (one output, one input) in a single insn. */
1092 ix86_expand_unary_operator (enum rtx_code code
, machine_mode mode
,
1095 bool matching_memory
= false;
1096 rtx src
, dst
, op
, clob
;
1101 /* If the destination is memory, and we do not have matching source
1102 operands, do things in registers. */
1105 if (rtx_equal_p (dst
, src
))
1106 matching_memory
= true;
1108 dst
= gen_reg_rtx (mode
);
1111 /* When source operand is memory, destination must match. */
1112 if (MEM_P (src
) && !matching_memory
)
1113 src
= force_reg (mode
, src
);
1115 /* Emit the instruction. */
1117 op
= gen_rtx_SET (dst
, gen_rtx_fmt_e (code
, mode
, src
));
1123 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1124 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
1127 /* Fix up the destination if needed. */
1128 if (dst
!= operands
[0])
1129 emit_move_insn (operands
[0], dst
);
1132 /* Predict just emitted jump instruction to be taken with probability PROB. */
1135 predict_jump (int prob
)
1137 rtx_insn
*insn
= get_last_insn ();
1138 gcc_assert (JUMP_P (insn
));
1139 add_reg_br_prob_note (insn
, profile_probability::from_reg_br_prob_base (prob
));
1142 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1143 divisor are within the range [0-255]. */
1146 ix86_split_idivmod (machine_mode mode
, rtx operands
[],
1149 rtx_code_label
*end_label
, *qimode_label
;
1152 rtx scratch
, tmp0
, tmp1
, tmp2
;
1153 rtx (*gen_divmod4_1
) (rtx
, rtx
, rtx
, rtx
);
1158 if (GET_MODE (operands
[0]) == SImode
)
1160 if (GET_MODE (operands
[1]) == SImode
)
1161 gen_divmod4_1
= unsigned_p
? gen_udivmodsi4_1
: gen_divmodsi4_1
;
1164 = unsigned_p
? gen_udivmodsi4_zext_2
: gen_divmodsi4_zext_2
;
1168 = unsigned_p
? gen_udivmodsi4_zext_1
: gen_divmodsi4_zext_1
;
1172 gen_divmod4_1
= unsigned_p
? gen_udivmoddi4_1
: gen_divmoddi4_1
;
1179 end_label
= gen_label_rtx ();
1180 qimode_label
= gen_label_rtx ();
1182 scratch
= gen_reg_rtx (mode
);
1184 /* Use 8bit unsigned divimod if dividend and divisor are within
1185 the range [0-255]. */
1186 emit_move_insn (scratch
, operands
[2]);
1187 scratch
= expand_simple_binop (mode
, IOR
, scratch
, operands
[3],
1188 scratch
, 1, OPTAB_DIRECT
);
1189 emit_insn (gen_test_ccno_1 (mode
, scratch
, GEN_INT (-0x100)));
1190 tmp0
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
1191 tmp0
= gen_rtx_EQ (VOIDmode
, tmp0
, const0_rtx
);
1192 tmp0
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp0
,
1193 gen_rtx_LABEL_REF (VOIDmode
, qimode_label
),
1195 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp0
));
1196 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
1197 JUMP_LABEL (insn
) = qimode_label
;
1199 /* Generate original signed/unsigned divimod. */
1200 emit_insn (gen_divmod4_1 (operands
[0], operands
[1],
1201 operands
[2], operands
[3]));
1203 /* Branch to the end. */
1204 emit_jump_insn (gen_jump (end_label
));
1207 /* Generate 8bit unsigned divide. */
1208 emit_label (qimode_label
);
1209 /* Don't use operands[0] for result of 8bit divide since not all
1210 registers support QImode ZERO_EXTRACT. */
1211 tmp0
= lowpart_subreg (HImode
, scratch
, mode
);
1212 tmp1
= lowpart_subreg (HImode
, operands
[2], mode
);
1213 tmp2
= lowpart_subreg (QImode
, operands
[3], mode
);
1214 emit_insn (gen_udivmodhiqi3 (tmp0
, tmp1
, tmp2
));
1218 div
= gen_rtx_UDIV (mode
, operands
[2], operands
[3]);
1219 mod
= gen_rtx_UMOD (mode
, operands
[2], operands
[3]);
1223 div
= gen_rtx_DIV (mode
, operands
[2], operands
[3]);
1224 mod
= gen_rtx_MOD (mode
, operands
[2], operands
[3]);
1228 if (GET_MODE (operands
[0]) != SImode
)
1229 div
= gen_rtx_ZERO_EXTEND (DImode
, div
);
1230 if (GET_MODE (operands
[1]) != SImode
)
1231 mod
= gen_rtx_ZERO_EXTEND (DImode
, mod
);
1234 /* Extract remainder from AH. */
1235 scratch
= gen_lowpart (GET_MODE (operands
[1]), scratch
);
1236 tmp1
= gen_rtx_ZERO_EXTRACT (GET_MODE (operands
[1]), scratch
,
1237 GEN_INT (8), GEN_INT (8));
1238 insn
= emit_move_insn (operands
[1], tmp1
);
1239 set_unique_reg_note (insn
, REG_EQUAL
, mod
);
1241 /* Zero extend quotient from AL. */
1242 tmp1
= gen_lowpart (QImode
, tmp0
);
1243 insn
= emit_insn (gen_extend_insn
1245 GET_MODE (operands
[0]), QImode
, 1));
1246 set_unique_reg_note (insn
, REG_EQUAL
, div
);
1248 emit_label (end_label
);
1251 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1252 matches destination. RTX includes clobber of FLAGS_REG. */
1255 ix86_emit_binop (enum rtx_code code
, machine_mode mode
,
1260 op
= gen_rtx_SET (dst
, gen_rtx_fmt_ee (code
, mode
, dst
, src
));
1261 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1263 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
1266 /* Return true if regno1 def is nearest to the insn. */
1269 find_nearest_reg_def (rtx_insn
*insn
, int regno1
, int regno2
)
1271 rtx_insn
*prev
= insn
;
1272 rtx_insn
*start
= BB_HEAD (BLOCK_FOR_INSN (insn
));
1276 while (prev
&& prev
!= start
)
1278 if (!INSN_P (prev
) || !NONDEBUG_INSN_P (prev
))
1280 prev
= PREV_INSN (prev
);
1283 if (insn_defines_reg (regno1
, INVALID_REGNUM
, prev
))
1285 else if (insn_defines_reg (regno2
, INVALID_REGNUM
, prev
))
1287 prev
= PREV_INSN (prev
);
1290 /* None of the regs is defined in the bb. */
1294 /* Split lea instructions into a sequence of instructions
1295 which are executed on ALU to avoid AGU stalls.
1296 It is assumed that it is allowed to clobber flags register
1300 ix86_split_lea_for_addr (rtx_insn
*insn
, rtx operands
[], machine_mode mode
)
1302 unsigned int regno0
, regno1
, regno2
;
1303 struct ix86_address parts
;
1307 ok
= ix86_decompose_address (operands
[1], &parts
);
1310 target
= gen_lowpart (mode
, operands
[0]);
1312 regno0
= true_regnum (target
);
1313 regno1
= INVALID_REGNUM
;
1314 regno2
= INVALID_REGNUM
;
1318 parts
.base
= gen_lowpart (mode
, parts
.base
);
1319 regno1
= true_regnum (parts
.base
);
1324 parts
.index
= gen_lowpart (mode
, parts
.index
);
1325 regno2
= true_regnum (parts
.index
);
1329 parts
.disp
= gen_lowpart (mode
, parts
.disp
);
1331 if (parts
.scale
> 1)
1333 /* Case r1 = r1 + ... */
1334 if (regno1
== regno0
)
1336 /* If we have a case r1 = r1 + C * r2 then we
1337 should use multiplication which is very
1338 expensive. Assume cost model is wrong if we
1339 have such case here. */
1340 gcc_assert (regno2
!= regno0
);
1342 for (adds
= parts
.scale
; adds
> 0; adds
--)
1343 ix86_emit_binop (PLUS
, mode
, target
, parts
.index
);
1347 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1348 if (regno0
!= regno2
)
1349 emit_insn (gen_rtx_SET (target
, parts
.index
));
1351 /* Use shift for scaling, but emit it as MULT instead
1352 to avoid it being immediately peephole2 optimized back
1354 ix86_emit_binop (MULT
, mode
, target
, GEN_INT (parts
.scale
));
1357 ix86_emit_binop (PLUS
, mode
, target
, parts
.base
);
1359 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1360 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1363 else if (!parts
.base
&& !parts
.index
)
1365 gcc_assert(parts
.disp
);
1366 emit_insn (gen_rtx_SET (target
, parts
.disp
));
1372 if (regno0
!= regno2
)
1373 emit_insn (gen_rtx_SET (target
, parts
.index
));
1375 else if (!parts
.index
)
1377 if (regno0
!= regno1
)
1378 emit_insn (gen_rtx_SET (target
, parts
.base
));
1382 if (regno0
== regno1
)
1384 else if (regno0
== regno2
)
1390 /* Find better operand for SET instruction, depending
1391 on which definition is farther from the insn. */
1392 if (find_nearest_reg_def (insn
, regno1
, regno2
))
1393 tmp
= parts
.index
, tmp1
= parts
.base
;
1395 tmp
= parts
.base
, tmp1
= parts
.index
;
1397 emit_insn (gen_rtx_SET (target
, tmp
));
1399 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1400 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1402 ix86_emit_binop (PLUS
, mode
, target
, tmp1
);
1406 ix86_emit_binop (PLUS
, mode
, target
, tmp
);
1409 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1410 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1414 /* Post-reload splitter for converting an SF or DFmode value in an
1415 SSE register into an unsigned SImode. */
1418 ix86_split_convert_uns_si_sse (rtx operands
[])
1420 machine_mode vecmode
;
1421 rtx value
, large
, zero_or_two31
, input
, two31
, x
;
1423 large
= operands
[1];
1424 zero_or_two31
= operands
[2];
1425 input
= operands
[3];
1426 two31
= operands
[4];
1427 vecmode
= GET_MODE (large
);
1428 value
= gen_rtx_REG (vecmode
, REGNO (operands
[0]));
1430 /* Load up the value into the low element. We must ensure that the other
1431 elements are valid floats -- zero is the easiest such value. */
1434 if (vecmode
== V4SFmode
)
1435 emit_insn (gen_vec_setv4sf_0 (value
, CONST0_RTX (V4SFmode
), input
));
1437 emit_insn (gen_sse2_loadlpd (value
, CONST0_RTX (V2DFmode
), input
));
1441 input
= gen_rtx_REG (vecmode
, REGNO (input
));
1442 emit_move_insn (value
, CONST0_RTX (vecmode
));
1443 if (vecmode
== V4SFmode
)
1444 emit_insn (gen_sse_movss (value
, value
, input
));
1446 emit_insn (gen_sse2_movsd (value
, value
, input
));
1449 emit_move_insn (large
, two31
);
1450 emit_move_insn (zero_or_two31
, MEM_P (two31
) ? large
: two31
);
1452 x
= gen_rtx_fmt_ee (LE
, vecmode
, large
, value
);
1453 emit_insn (gen_rtx_SET (large
, x
));
1455 x
= gen_rtx_AND (vecmode
, zero_or_two31
, large
);
1456 emit_insn (gen_rtx_SET (zero_or_two31
, x
));
1458 x
= gen_rtx_MINUS (vecmode
, value
, zero_or_two31
);
1459 emit_insn (gen_rtx_SET (value
, x
));
1461 large
= gen_rtx_REG (V4SImode
, REGNO (large
));
1462 emit_insn (gen_ashlv4si3 (large
, large
, GEN_INT (31)));
1464 x
= gen_rtx_REG (V4SImode
, REGNO (value
));
1465 if (vecmode
== V4SFmode
)
1466 emit_insn (gen_fix_truncv4sfv4si2 (x
, value
));
1468 emit_insn (gen_sse2_cvttpd2dq (x
, value
));
1471 emit_insn (gen_xorv4si3 (value
, value
, large
));
1474 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok
,
1475 machine_mode mode
, rtx target
,
1476 rtx var
, int one_var
);
1478 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1479 Expects the 64-bit DImode to be supplied in a pair of integral
1480 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1481 -mfpmath=sse, !optimize_size only. */
1484 ix86_expand_convert_uns_didf_sse (rtx target
, rtx input
)
1486 REAL_VALUE_TYPE bias_lo_rvt
, bias_hi_rvt
;
1487 rtx int_xmm
, fp_xmm
;
1488 rtx biases
, exponents
;
1491 int_xmm
= gen_reg_rtx (V4SImode
);
1492 if (TARGET_INTER_UNIT_MOVES_TO_VEC
)
1493 emit_insn (gen_movdi_to_sse (int_xmm
, input
));
1494 else if (TARGET_SSE_SPLIT_REGS
)
1496 emit_clobber (int_xmm
);
1497 emit_move_insn (gen_lowpart (DImode
, int_xmm
), input
);
1501 x
= gen_reg_rtx (V2DImode
);
1502 ix86_expand_vector_init_one_nonzero (false, V2DImode
, x
, input
, 0);
1503 emit_move_insn (int_xmm
, gen_lowpart (V4SImode
, x
));
1506 x
= gen_rtx_CONST_VECTOR (V4SImode
,
1507 gen_rtvec (4, GEN_INT (0x43300000UL
),
1508 GEN_INT (0x45300000UL
),
1509 const0_rtx
, const0_rtx
));
1510 exponents
= validize_mem (force_const_mem (V4SImode
, x
));
1512 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1513 emit_insn (gen_vec_interleave_lowv4si (int_xmm
, int_xmm
, exponents
));
1515 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1516 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1517 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1518 (0x1.0p84 + double(fp_value_hi_xmm)).
1519 Note these exponents differ by 32. */
1521 fp_xmm
= copy_to_mode_reg (V2DFmode
, gen_lowpart (V2DFmode
, int_xmm
));
1523 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1524 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1525 real_ldexp (&bias_lo_rvt
, &dconst1
, 52);
1526 real_ldexp (&bias_hi_rvt
, &dconst1
, 84);
1527 biases
= const_double_from_real_value (bias_lo_rvt
, DFmode
);
1528 x
= const_double_from_real_value (bias_hi_rvt
, DFmode
);
1529 biases
= gen_rtx_CONST_VECTOR (V2DFmode
, gen_rtvec (2, biases
, x
));
1530 biases
= validize_mem (force_const_mem (V2DFmode
, biases
));
1531 emit_insn (gen_subv2df3 (fp_xmm
, fp_xmm
, biases
));
1533 /* Add the upper and lower DFmode values together. */
1535 emit_insn (gen_sse3_haddv2df3 (fp_xmm
, fp_xmm
, fp_xmm
));
1538 x
= copy_to_mode_reg (V2DFmode
, fp_xmm
);
1539 emit_insn (gen_vec_interleave_highv2df (fp_xmm
, fp_xmm
, fp_xmm
));
1540 emit_insn (gen_addv2df3 (fp_xmm
, fp_xmm
, x
));
1543 ix86_expand_vector_extract (false, target
, fp_xmm
, 0);
1546 /* Not used, but eases macroization of patterns. */
1548 ix86_expand_convert_uns_sixf_sse (rtx
, rtx
)
1553 /* Convert an unsigned SImode value into a DFmode. Only currently used
1554 for SSE, but applicable anywhere. */
1557 ix86_expand_convert_uns_sidf_sse (rtx target
, rtx input
)
1559 REAL_VALUE_TYPE TWO31r
;
1562 x
= expand_simple_binop (SImode
, PLUS
, input
, GEN_INT (-2147483647 - 1),
1563 NULL
, 1, OPTAB_DIRECT
);
1565 fp
= gen_reg_rtx (DFmode
);
1566 emit_insn (gen_floatsidf2 (fp
, x
));
1568 real_ldexp (&TWO31r
, &dconst1
, 31);
1569 x
= const_double_from_real_value (TWO31r
, DFmode
);
1571 x
= expand_simple_binop (DFmode
, PLUS
, fp
, x
, target
, 0, OPTAB_DIRECT
);
1573 emit_move_insn (target
, x
);
1576 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1577 32-bit mode; otherwise we have a direct convert instruction. */
1580 ix86_expand_convert_sign_didf_sse (rtx target
, rtx input
)
1582 REAL_VALUE_TYPE TWO32r
;
1583 rtx fp_lo
, fp_hi
, x
;
1585 fp_lo
= gen_reg_rtx (DFmode
);
1586 fp_hi
= gen_reg_rtx (DFmode
);
1588 emit_insn (gen_floatsidf2 (fp_hi
, gen_highpart (SImode
, input
)));
1590 real_ldexp (&TWO32r
, &dconst1
, 32);
1591 x
= const_double_from_real_value (TWO32r
, DFmode
);
1592 fp_hi
= expand_simple_binop (DFmode
, MULT
, fp_hi
, x
, fp_hi
, 0, OPTAB_DIRECT
);
1594 ix86_expand_convert_uns_sidf_sse (fp_lo
, gen_lowpart (SImode
, input
));
1596 x
= expand_simple_binop (DFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1599 emit_move_insn (target
, x
);
1602 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1603 For x86_32, -mfpmath=sse, !optimize_size only. */
1605 ix86_expand_convert_uns_sisf_sse (rtx target
, rtx input
)
1607 REAL_VALUE_TYPE ONE16r
;
1608 rtx fp_hi
, fp_lo
, int_hi
, int_lo
, x
;
1610 real_ldexp (&ONE16r
, &dconst1
, 16);
1611 x
= const_double_from_real_value (ONE16r
, SFmode
);
1612 int_lo
= expand_simple_binop (SImode
, AND
, input
, GEN_INT(0xffff),
1613 NULL
, 0, OPTAB_DIRECT
);
1614 int_hi
= expand_simple_binop (SImode
, LSHIFTRT
, input
, GEN_INT(16),
1615 NULL
, 0, OPTAB_DIRECT
);
1616 fp_hi
= gen_reg_rtx (SFmode
);
1617 fp_lo
= gen_reg_rtx (SFmode
);
1618 emit_insn (gen_floatsisf2 (fp_hi
, int_hi
));
1619 emit_insn (gen_floatsisf2 (fp_lo
, int_lo
));
1620 fp_hi
= expand_simple_binop (SFmode
, MULT
, fp_hi
, x
, fp_hi
,
1622 fp_hi
= expand_simple_binop (SFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1624 if (!rtx_equal_p (target
, fp_hi
))
1625 emit_move_insn (target
, fp_hi
);
1628 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1629 a vector of unsigned ints VAL to vector of floats TARGET. */
1632 ix86_expand_vector_convert_uns_vsivsf (rtx target
, rtx val
)
1635 REAL_VALUE_TYPE TWO16r
;
1636 machine_mode intmode
= GET_MODE (val
);
1637 machine_mode fltmode
= GET_MODE (target
);
1638 rtx (*cvt
) (rtx
, rtx
);
1640 if (intmode
== V4SImode
)
1641 cvt
= gen_floatv4siv4sf2
;
1643 cvt
= gen_floatv8siv8sf2
;
1644 tmp
[0] = ix86_build_const_vector (intmode
, 1, GEN_INT (0xffff));
1645 tmp
[0] = force_reg (intmode
, tmp
[0]);
1646 tmp
[1] = expand_simple_binop (intmode
, AND
, val
, tmp
[0], NULL_RTX
, 1,
1648 tmp
[2] = expand_simple_binop (intmode
, LSHIFTRT
, val
, GEN_INT (16),
1649 NULL_RTX
, 1, OPTAB_DIRECT
);
1650 tmp
[3] = gen_reg_rtx (fltmode
);
1651 emit_insn (cvt (tmp
[3], tmp
[1]));
1652 tmp
[4] = gen_reg_rtx (fltmode
);
1653 emit_insn (cvt (tmp
[4], tmp
[2]));
1654 real_ldexp (&TWO16r
, &dconst1
, 16);
1655 tmp
[5] = const_double_from_real_value (TWO16r
, SFmode
);
1656 tmp
[5] = force_reg (fltmode
, ix86_build_const_vector (fltmode
, 1, tmp
[5]));
1657 tmp
[6] = expand_simple_binop (fltmode
, MULT
, tmp
[4], tmp
[5], NULL_RTX
, 1,
1659 tmp
[7] = expand_simple_binop (fltmode
, PLUS
, tmp
[3], tmp
[6], target
, 1,
1661 if (tmp
[7] != target
)
1662 emit_move_insn (target
, tmp
[7]);
1665 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1666 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1667 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1668 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1671 ix86_expand_adjust_ufix_to_sfix_si (rtx val
, rtx
*xorp
)
1673 REAL_VALUE_TYPE TWO31r
;
1675 machine_mode mode
= GET_MODE (val
);
1676 machine_mode scalarmode
= GET_MODE_INNER (mode
);
1677 machine_mode intmode
= GET_MODE_SIZE (mode
) == 32 ? V8SImode
: V4SImode
;
1678 rtx (*cmp
) (rtx
, rtx
, rtx
, rtx
);
1681 for (i
= 0; i
< 3; i
++)
1682 tmp
[i
] = gen_reg_rtx (mode
);
1683 real_ldexp (&TWO31r
, &dconst1
, 31);
1684 two31r
= const_double_from_real_value (TWO31r
, scalarmode
);
1685 two31r
= ix86_build_const_vector (mode
, 1, two31r
);
1686 two31r
= force_reg (mode
, two31r
);
1689 case E_V8SFmode
: cmp
= gen_avx_maskcmpv8sf3
; break;
1690 case E_V4SFmode
: cmp
= gen_sse_maskcmpv4sf3
; break;
1691 case E_V4DFmode
: cmp
= gen_avx_maskcmpv4df3
; break;
1692 case E_V2DFmode
: cmp
= gen_sse2_maskcmpv2df3
; break;
1693 default: gcc_unreachable ();
1695 tmp
[3] = gen_rtx_LE (mode
, two31r
, val
);
1696 emit_insn (cmp (tmp
[0], two31r
, val
, tmp
[3]));
1697 tmp
[1] = expand_simple_binop (mode
, AND
, tmp
[0], two31r
, tmp
[1],
1699 if (intmode
== V4SImode
|| TARGET_AVX2
)
1700 *xorp
= expand_simple_binop (intmode
, ASHIFT
,
1701 gen_lowpart (intmode
, tmp
[0]),
1702 GEN_INT (31), NULL_RTX
, 0,
1706 rtx two31
= gen_int_mode (HOST_WIDE_INT_1U
<< 31, SImode
);
1707 two31
= ix86_build_const_vector (intmode
, 1, two31
);
1708 *xorp
= expand_simple_binop (intmode
, AND
,
1709 gen_lowpart (intmode
, tmp
[0]),
1713 return expand_simple_binop (mode
, MINUS
, val
, tmp
[1], tmp
[2],
1717 /* Generate code for floating point ABS or NEG. */
1720 ix86_expand_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
1724 bool use_sse
= false;
1725 bool vector_mode
= VECTOR_MODE_P (mode
);
1726 machine_mode vmode
= mode
;
1729 if (vector_mode
|| mode
== TFmode
)
1731 else if (TARGET_SSE_MATH
)
1733 use_sse
= SSE_FLOAT_MODE_P (mode
);
1736 else if (mode
== DFmode
)
1743 set
= gen_rtx_fmt_e (code
, mode
, src
);
1744 set
= gen_rtx_SET (dst
, set
);
1748 rtx mask
, use
, clob
;
1750 /* NEG and ABS performed with SSE use bitwise mask operations.
1751 Create the appropriate mask now. */
1752 mask
= ix86_build_signbit_mask (vmode
, vector_mode
, code
== ABS
);
1753 use
= gen_rtx_USE (VOIDmode
, mask
);
1754 if (vector_mode
|| mode
== TFmode
)
1755 par
= gen_rtvec (2, set
, use
);
1758 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1759 par
= gen_rtvec (3, set
, use
, clob
);
1766 /* Changing of sign for FP values is doable using integer unit too. */
1767 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1768 par
= gen_rtvec (2, set
, clob
);
1771 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
1774 /* Deconstruct a floating point ABS or NEG operation
1775 with integer registers into integer operations. */
1778 ix86_split_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
1781 enum rtx_code absneg_op
;
1784 gcc_assert (operands_match_p (operands
[0], operands
[1]));
1789 dst
= gen_lowpart (SImode
, operands
[0]);
1793 set
= gen_int_mode (0x7fffffff, SImode
);
1798 set
= gen_int_mode (0x80000000, SImode
);
1801 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1807 dst
= gen_lowpart (DImode
, operands
[0]);
1808 dst
= gen_rtx_ZERO_EXTRACT (DImode
, dst
, const1_rtx
, GEN_INT (63));
1813 set
= gen_rtx_NOT (DImode
, dst
);
1817 dst
= gen_highpart (SImode
, operands
[0]);
1821 set
= gen_int_mode (0x7fffffff, SImode
);
1826 set
= gen_int_mode (0x80000000, SImode
);
1829 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1834 dst
= gen_rtx_REG (SImode
,
1835 REGNO (operands
[0]) + (TARGET_64BIT
? 1 : 2));
1838 set
= GEN_INT (0x7fff);
1843 set
= GEN_INT (0x8000);
1846 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1853 set
= gen_rtx_SET (dst
, set
);
1855 rtx clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1856 rtvec par
= gen_rtvec (2, set
, clob
);
1858 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
1861 /* Expand a copysign operation. Special case operand 0 being a constant. */
1864 ix86_expand_copysign (rtx operands
[])
1866 machine_mode mode
, vmode
;
1867 rtx dest
, op0
, op1
, mask
;
1873 mode
= GET_MODE (dest
);
1877 else if (mode
== DFmode
)
1879 else if (mode
== TFmode
)
1884 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
1886 if (CONST_DOUBLE_P (op0
))
1888 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0
)))
1889 op0
= simplify_unary_operation (ABS
, mode
, op0
, mode
);
1891 if (mode
== SFmode
|| mode
== DFmode
)
1893 if (op0
== CONST0_RTX (mode
))
1894 op0
= CONST0_RTX (vmode
);
1897 rtx v
= ix86_build_const_vector (vmode
, false, op0
);
1899 op0
= force_reg (vmode
, v
);
1902 else if (op0
!= CONST0_RTX (mode
))
1903 op0
= force_reg (mode
, op0
);
1905 emit_insn (gen_copysign3_const (mode
, dest
, op0
, op1
, mask
));
1909 rtx nmask
= ix86_build_signbit_mask (vmode
, 0, 1);
1911 emit_insn (gen_copysign3_var
1912 (mode
, dest
, NULL_RTX
, op0
, op1
, nmask
, mask
));
1916 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1917 be a constant, and so has already been expanded into a vector constant. */
1920 ix86_split_copysign_const (rtx operands
[])
1922 machine_mode mode
, vmode
;
1923 rtx dest
, op0
, mask
, x
;
1929 mode
= GET_MODE (dest
);
1930 vmode
= GET_MODE (mask
);
1932 dest
= lowpart_subreg (vmode
, dest
, mode
);
1933 x
= gen_rtx_AND (vmode
, dest
, mask
);
1934 emit_insn (gen_rtx_SET (dest
, x
));
1936 if (op0
!= CONST0_RTX (vmode
))
1938 x
= gen_rtx_IOR (vmode
, dest
, op0
);
1939 emit_insn (gen_rtx_SET (dest
, x
));
1943 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1944 so we have to do two masks. */
1947 ix86_split_copysign_var (rtx operands
[])
1949 machine_mode mode
, vmode
;
1950 rtx dest
, scratch
, op0
, op1
, mask
, nmask
, x
;
1953 scratch
= operands
[1];
1956 nmask
= operands
[4];
1959 mode
= GET_MODE (dest
);
1960 vmode
= GET_MODE (mask
);
1962 if (rtx_equal_p (op0
, op1
))
1964 /* Shouldn't happen often (it's useless, obviously), but when it does
1965 we'd generate incorrect code if we continue below. */
1966 emit_move_insn (dest
, op0
);
1970 if (REG_P (mask
) && REGNO (dest
) == REGNO (mask
)) /* alternative 0 */
1972 gcc_assert (REGNO (op1
) == REGNO (scratch
));
1974 x
= gen_rtx_AND (vmode
, scratch
, mask
);
1975 emit_insn (gen_rtx_SET (scratch
, x
));
1978 op0
= lowpart_subreg (vmode
, op0
, mode
);
1979 x
= gen_rtx_NOT (vmode
, dest
);
1980 x
= gen_rtx_AND (vmode
, x
, op0
);
1981 emit_insn (gen_rtx_SET (dest
, x
));
1985 if (REGNO (op1
) == REGNO (scratch
)) /* alternative 1,3 */
1987 x
= gen_rtx_AND (vmode
, scratch
, mask
);
1989 else /* alternative 2,4 */
1991 gcc_assert (REGNO (mask
) == REGNO (scratch
));
1992 op1
= lowpart_subreg (vmode
, op1
, mode
);
1993 x
= gen_rtx_AND (vmode
, scratch
, op1
);
1995 emit_insn (gen_rtx_SET (scratch
, x
));
1997 if (REGNO (op0
) == REGNO (dest
)) /* alternative 1,2 */
1999 dest
= lowpart_subreg (vmode
, op0
, mode
);
2000 x
= gen_rtx_AND (vmode
, dest
, nmask
);
2002 else /* alternative 3,4 */
2004 gcc_assert (REGNO (nmask
) == REGNO (dest
));
2006 op0
= lowpart_subreg (vmode
, op0
, mode
);
2007 x
= gen_rtx_AND (vmode
, dest
, op0
);
2009 emit_insn (gen_rtx_SET (dest
, x
));
2012 x
= gen_rtx_IOR (vmode
, dest
, scratch
);
2013 emit_insn (gen_rtx_SET (dest
, x
));
2016 /* Expand an xorsign operation. */
2019 ix86_expand_xorsign (rtx operands
[])
2021 machine_mode mode
, vmode
;
2022 rtx dest
, op0
, op1
, mask
;
2028 mode
= GET_MODE (dest
);
2032 else if (mode
== DFmode
)
2037 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
2039 emit_insn (gen_xorsign3_1 (mode
, dest
, op0
, op1
, mask
));
2042 /* Deconstruct an xorsign operation into bit masks. */
2045 ix86_split_xorsign (rtx operands
[])
2047 machine_mode mode
, vmode
;
2048 rtx dest
, op0
, mask
, x
;
2054 mode
= GET_MODE (dest
);
2055 vmode
= GET_MODE (mask
);
2057 dest
= lowpart_subreg (vmode
, dest
, mode
);
2058 x
= gen_rtx_AND (vmode
, dest
, mask
);
2059 emit_insn (gen_rtx_SET (dest
, x
));
2061 op0
= lowpart_subreg (vmode
, op0
, mode
);
2062 x
= gen_rtx_XOR (vmode
, dest
, op0
);
2063 emit_insn (gen_rtx_SET (dest
, x
));
2066 static rtx
ix86_expand_compare (enum rtx_code code
, rtx op0
, rtx op1
);
2069 ix86_expand_branch (enum rtx_code code
, rtx op0
, rtx op1
, rtx label
)
2071 machine_mode mode
= GET_MODE (op0
);
2074 /* Handle special case - vector comparsion with boolean result, transform
2075 it using ptest instruction. */
2076 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
)
2078 rtx flag
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
2079 machine_mode p_mode
= GET_MODE_SIZE (mode
) == 32 ? V4DImode
: V2DImode
;
2081 gcc_assert (code
== EQ
|| code
== NE
);
2082 /* Generate XOR since we can't check that one operand is zero vector. */
2083 tmp
= gen_reg_rtx (mode
);
2084 emit_insn (gen_rtx_SET (tmp
, gen_rtx_XOR (mode
, op0
, op1
)));
2085 tmp
= gen_lowpart (p_mode
, tmp
);
2086 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode
, FLAGS_REG
),
2087 gen_rtx_UNSPEC (CCmode
,
2088 gen_rtvec (2, tmp
, tmp
),
2090 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, flag
, const0_rtx
);
2091 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2092 gen_rtx_LABEL_REF (VOIDmode
, label
),
2094 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
2107 tmp
= ix86_expand_compare (code
, op0
, op1
);
2108 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2109 gen_rtx_LABEL_REF (VOIDmode
, label
),
2111 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
2117 /* For 32-bit target DI comparison may be performed on
2118 SSE registers. To allow this we should avoid split
2119 to SI mode which is achieved by doing xor in DI mode
2120 and then comparing with zero (which is recognized by
2121 STV pass). We don't compare using xor when optimizing
2123 if (!optimize_insn_for_size_p ()
2125 && (code
== EQ
|| code
== NE
))
2127 op0
= force_reg (mode
, gen_rtx_XOR (mode
, op0
, op1
));
2132 /* Expand DImode branch into multiple compare+branch. */
2135 rtx_code_label
*label2
;
2136 enum rtx_code code1
, code2
, code3
;
2137 machine_mode submode
;
2139 if (CONSTANT_P (op0
) && !CONSTANT_P (op1
))
2141 std::swap (op0
, op1
);
2142 code
= swap_condition (code
);
2145 split_double_mode (mode
, &op0
, 1, lo
+0, hi
+0);
2146 split_double_mode (mode
, &op1
, 1, lo
+1, hi
+1);
2148 submode
= mode
== DImode
? SImode
: DImode
;
2150 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2151 avoid two branches. This costs one extra insn, so disable when
2152 optimizing for size. */
2154 if ((code
== EQ
|| code
== NE
)
2155 && (!optimize_insn_for_size_p ()
2156 || hi
[1] == const0_rtx
|| lo
[1] == const0_rtx
))
2161 if (hi
[1] != const0_rtx
)
2162 xor1
= expand_binop (submode
, xor_optab
, xor1
, hi
[1],
2163 NULL_RTX
, 0, OPTAB_WIDEN
);
2166 if (lo
[1] != const0_rtx
)
2167 xor0
= expand_binop (submode
, xor_optab
, xor0
, lo
[1],
2168 NULL_RTX
, 0, OPTAB_WIDEN
);
2170 tmp
= expand_binop (submode
, ior_optab
, xor1
, xor0
,
2171 NULL_RTX
, 0, OPTAB_WIDEN
);
2173 ix86_expand_branch (code
, tmp
, const0_rtx
, label
);
2177 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2178 op1 is a constant and the low word is zero, then we can just
2179 examine the high word. Similarly for low word -1 and
2180 less-or-equal-than or greater-than. */
2182 if (CONST_INT_P (hi
[1]))
2185 case LT
: case LTU
: case GE
: case GEU
:
2186 if (lo
[1] == const0_rtx
)
2188 ix86_expand_branch (code
, hi
[0], hi
[1], label
);
2192 case LE
: case LEU
: case GT
: case GTU
:
2193 if (lo
[1] == constm1_rtx
)
2195 ix86_expand_branch (code
, hi
[0], hi
[1], label
);
2203 /* Emulate comparisons that do not depend on Zero flag with
2204 double-word subtraction. Note that only Overflow, Sign
2205 and Carry flags are valid, so swap arguments and condition
2206 of comparisons that would otherwise test Zero flag. */
2210 case LE
: case LEU
: case GT
: case GTU
:
2211 std::swap (lo
[0], lo
[1]);
2212 std::swap (hi
[0], hi
[1]);
2213 code
= swap_condition (code
);
2216 case LT
: case LTU
: case GE
: case GEU
:
2218 bool uns
= (code
== LTU
|| code
== GEU
);
2219 rtx (*sbb_insn
) (machine_mode
, rtx
, rtx
, rtx
)
2220 = uns
? gen_sub3_carry_ccc
: gen_sub3_carry_ccgz
;
2222 if (!nonimmediate_operand (lo
[0], submode
))
2223 lo
[0] = force_reg (submode
, lo
[0]);
2224 if (!x86_64_general_operand (lo
[1], submode
))
2225 lo
[1] = force_reg (submode
, lo
[1]);
2227 if (!register_operand (hi
[0], submode
))
2228 hi
[0] = force_reg (submode
, hi
[0]);
2229 if ((uns
&& !nonimmediate_operand (hi
[1], submode
))
2230 || (!uns
&& !x86_64_general_operand (hi
[1], submode
)))
2231 hi
[1] = force_reg (submode
, hi
[1]);
2233 emit_insn (gen_cmp_1 (submode
, lo
[0], lo
[1]));
2235 tmp
= gen_rtx_SCRATCH (submode
);
2236 emit_insn (sbb_insn (submode
, tmp
, hi
[0], hi
[1]));
2238 tmp
= gen_rtx_REG (uns
? CCCmode
: CCGZmode
, FLAGS_REG
);
2239 ix86_expand_branch (code
, tmp
, const0_rtx
, label
);
2247 /* Otherwise, we need two or three jumps. */
2249 label2
= gen_label_rtx ();
2252 code2
= swap_condition (code
);
2253 code3
= unsigned_condition (code
);
2257 case LT
: case GT
: case LTU
: case GTU
:
2260 case LE
: code1
= LT
; code2
= GT
; break;
2261 case GE
: code1
= GT
; code2
= LT
; break;
2262 case LEU
: code1
= LTU
; code2
= GTU
; break;
2263 case GEU
: code1
= GTU
; code2
= LTU
; break;
2265 case EQ
: code1
= UNKNOWN
; code2
= NE
; break;
2266 case NE
: code2
= UNKNOWN
; break;
2274 * if (hi(a) < hi(b)) goto true;
2275 * if (hi(a) > hi(b)) goto false;
2276 * if (lo(a) < lo(b)) goto true;
2280 if (code1
!= UNKNOWN
)
2281 ix86_expand_branch (code1
, hi
[0], hi
[1], label
);
2282 if (code2
!= UNKNOWN
)
2283 ix86_expand_branch (code2
, hi
[0], hi
[1], label2
);
2285 ix86_expand_branch (code3
, lo
[0], lo
[1], label
);
2287 if (code2
!= UNKNOWN
)
2288 emit_label (label2
);
2293 gcc_assert (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_CC
);
2298 /* Figure out whether to use unordered fp comparisons. */
2301 ix86_unordered_fp_compare (enum rtx_code code
)
2303 if (!TARGET_IEEE_FP
)
2332 /* Return a comparison we can do and that it is equivalent to
2333 swap_condition (code) apart possibly from orderedness.
2334 But, never change orderedness if TARGET_IEEE_FP, returning
2335 UNKNOWN in that case if necessary. */
2337 static enum rtx_code
2338 ix86_fp_swap_condition (enum rtx_code code
)
2342 case GT
: /* GTU - CF=0 & ZF=0 */
2343 return TARGET_IEEE_FP
? UNKNOWN
: UNLT
;
2344 case GE
: /* GEU - CF=0 */
2345 return TARGET_IEEE_FP
? UNKNOWN
: UNLE
;
2346 case UNLT
: /* LTU - CF=1 */
2347 return TARGET_IEEE_FP
? UNKNOWN
: GT
;
2348 case UNLE
: /* LEU - CF=1 | ZF=1 */
2349 return TARGET_IEEE_FP
? UNKNOWN
: GE
;
2351 return swap_condition (code
);
2355 /* Return cost of comparison CODE using the best strategy for performance.
2356 All following functions do use number of instructions as a cost metrics.
2357 In future this should be tweaked to compute bytes for optimize_size and
2358 take into account performance of various instructions on various CPUs. */
2361 ix86_fp_comparison_cost (enum rtx_code code
)
2365 /* The cost of code using bit-twiddling on %ah. */
2382 arith_cost
= TARGET_IEEE_FP
? 5 : 4;
2386 arith_cost
= TARGET_IEEE_FP
? 6 : 4;
2392 switch (ix86_fp_comparison_strategy (code
))
2394 case IX86_FPCMP_COMI
:
2395 return arith_cost
> 4 ? 3 : 2;
2396 case IX86_FPCMP_SAHF
:
2397 return arith_cost
> 4 ? 4 : 3;
2403 /* Swap, force into registers, or otherwise massage the two operands
2404 to a fp comparison. The operands are updated in place; the new
2405 comparison code is returned. */
2407 static enum rtx_code
2408 ix86_prepare_fp_compare_args (enum rtx_code code
, rtx
*pop0
, rtx
*pop1
)
2410 bool unordered_compare
= ix86_unordered_fp_compare (code
);
2411 rtx op0
= *pop0
, op1
= *pop1
;
2412 machine_mode op_mode
= GET_MODE (op0
);
2413 bool is_sse
= TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (op_mode
);
2415 /* All of the unordered compare instructions only work on registers.
2416 The same is true of the fcomi compare instructions. The XFmode
2417 compare instructions require registers except when comparing
2418 against zero or when converting operand 1 from fixed point to
2422 && (unordered_compare
2423 || (op_mode
== XFmode
2424 && ! (standard_80387_constant_p (op0
) == 1
2425 || standard_80387_constant_p (op1
) == 1)
2426 && GET_CODE (op1
) != FLOAT
)
2427 || ix86_fp_comparison_strategy (code
) == IX86_FPCMP_COMI
))
2429 op0
= force_reg (op_mode
, op0
);
2430 op1
= force_reg (op_mode
, op1
);
2434 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2435 things around if they appear profitable, otherwise force op0
2438 if (standard_80387_constant_p (op0
) == 0
2440 && ! (standard_80387_constant_p (op1
) == 0
2443 enum rtx_code new_code
= ix86_fp_swap_condition (code
);
2444 if (new_code
!= UNKNOWN
)
2446 std::swap (op0
, op1
);
2452 op0
= force_reg (op_mode
, op0
);
2454 if (CONSTANT_P (op1
))
2456 int tmp
= standard_80387_constant_p (op1
);
2458 op1
= validize_mem (force_const_mem (op_mode
, op1
));
2462 op1
= force_reg (op_mode
, op1
);
2465 op1
= force_reg (op_mode
, op1
);
2469 /* Try to rearrange the comparison to make it cheaper. */
2470 if (ix86_fp_comparison_cost (code
)
2471 > ix86_fp_comparison_cost (swap_condition (code
))
2472 && (REG_P (op1
) || can_create_pseudo_p ()))
2474 std::swap (op0
, op1
);
2475 code
= swap_condition (code
);
2477 op0
= force_reg (op_mode
, op0
);
2485 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2488 ix86_expand_fp_compare (enum rtx_code code
, rtx op0
, rtx op1
)
2490 bool unordered_compare
= ix86_unordered_fp_compare (code
);
2491 machine_mode cmp_mode
;
2494 code
= ix86_prepare_fp_compare_args (code
, &op0
, &op1
);
2496 tmp
= gen_rtx_COMPARE (CCFPmode
, op0
, op1
);
2497 if (unordered_compare
)
2498 tmp
= gen_rtx_UNSPEC (CCFPmode
, gen_rtvec (1, tmp
), UNSPEC_NOTRAP
);
2500 /* Do fcomi/sahf based test when profitable. */
2501 switch (ix86_fp_comparison_strategy (code
))
2503 case IX86_FPCMP_COMI
:
2504 cmp_mode
= CCFPmode
;
2505 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode
, FLAGS_REG
), tmp
));
2508 case IX86_FPCMP_SAHF
:
2509 cmp_mode
= CCFPmode
;
2510 tmp
= gen_rtx_UNSPEC (HImode
, gen_rtvec (1, tmp
), UNSPEC_FNSTSW
);
2511 scratch
= gen_reg_rtx (HImode
);
2512 emit_insn (gen_rtx_SET (scratch
, tmp
));
2513 emit_insn (gen_x86_sahf_1 (scratch
));
2516 case IX86_FPCMP_ARITH
:
2517 cmp_mode
= CCNOmode
;
2518 tmp
= gen_rtx_UNSPEC (HImode
, gen_rtvec (1, tmp
), UNSPEC_FNSTSW
);
2519 scratch
= gen_reg_rtx (HImode
);
2520 emit_insn (gen_rtx_SET (scratch
, tmp
));
2522 /* In the unordered case, we have to check C2 for NaN's, which
2523 doesn't happen to work out to anything nice combination-wise.
2524 So do some bit twiddling on the value we've got in AH to come
2525 up with an appropriate set of condition codes. */
2531 if (code
== GT
|| !TARGET_IEEE_FP
)
2533 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x45)));
2538 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2539 emit_insn (gen_addqi_ext_1 (scratch
, scratch
, constm1_rtx
));
2540 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x44)));
2547 if (code
== LT
&& TARGET_IEEE_FP
)
2549 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2550 emit_insn (gen_cmpqi_ext_3 (scratch
, const1_rtx
));
2556 emit_insn (gen_testqi_ext_1_ccno (scratch
, const1_rtx
));
2562 if (code
== GE
|| !TARGET_IEEE_FP
)
2564 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x05)));
2569 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2570 emit_insn (gen_xorqi_ext_1_cc (scratch
, scratch
, const1_rtx
));
2576 if (code
== LE
&& TARGET_IEEE_FP
)
2578 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2579 emit_insn (gen_addqi_ext_1 (scratch
, scratch
, constm1_rtx
));
2580 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x40)));
2586 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x45)));
2592 if (code
== EQ
&& TARGET_IEEE_FP
)
2594 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2595 emit_insn (gen_cmpqi_ext_3 (scratch
, GEN_INT (0x40)));
2601 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x40)));
2607 if (code
== NE
&& TARGET_IEEE_FP
)
2609 emit_insn (gen_andqi_ext_1 (scratch
, scratch
, GEN_INT (0x45)));
2610 emit_insn (gen_xorqi_ext_1_cc (scratch
, scratch
,
2616 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x40)));
2622 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x04)));
2626 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x04)));
2639 /* Return the test that should be put into the flags user, i.e.
2640 the bcc, scc, or cmov instruction. */
2641 return gen_rtx_fmt_ee (code
, VOIDmode
,
2642 gen_rtx_REG (cmp_mode
, FLAGS_REG
),
2646 /* Generate insn patterns to do an integer compare of OPERANDS. */
2649 ix86_expand_int_compare (enum rtx_code code
, rtx op0
, rtx op1
)
2651 machine_mode cmpmode
;
2654 cmpmode
= SELECT_CC_MODE (code
, op0
, op1
);
2655 flags
= gen_rtx_REG (cmpmode
, FLAGS_REG
);
2657 /* This is very simple, but making the interface the same as in the
2658 FP case makes the rest of the code easier. */
2659 tmp
= gen_rtx_COMPARE (cmpmode
, op0
, op1
);
2660 emit_insn (gen_rtx_SET (flags
, tmp
));
2662 /* Return the test that should be put into the flags user, i.e.
2663 the bcc, scc, or cmov instruction. */
2664 return gen_rtx_fmt_ee (code
, VOIDmode
, flags
, const0_rtx
);
2668 ix86_expand_compare (enum rtx_code code
, rtx op0
, rtx op1
)
2672 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_CC
)
2673 ret
= gen_rtx_fmt_ee (code
, VOIDmode
, op0
, op1
);
2675 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0
)))
2677 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0
)));
2678 ret
= ix86_expand_fp_compare (code
, op0
, op1
);
2681 ret
= ix86_expand_int_compare (code
, op0
, op1
);
2687 ix86_expand_setcc (rtx dest
, enum rtx_code code
, rtx op0
, rtx op1
)
2691 gcc_assert (GET_MODE (dest
) == QImode
);
2693 ret
= ix86_expand_compare (code
, op0
, op1
);
2694 PUT_MODE (ret
, QImode
);
2695 emit_insn (gen_rtx_SET (dest
, ret
));
2698 /* Expand comparison setting or clearing carry flag. Return true when
2699 successful and set pop for the operation. */
2701 ix86_expand_carry_flag_compare (enum rtx_code code
, rtx op0
, rtx op1
, rtx
*pop
)
2704 = GET_MODE (op0
) != VOIDmode
? GET_MODE (op0
) : GET_MODE (op1
);
2706 /* Do not handle double-mode compares that go through special path. */
2707 if (mode
== (TARGET_64BIT
? TImode
: DImode
))
2710 if (SCALAR_FLOAT_MODE_P (mode
))
2713 rtx_insn
*compare_seq
;
2715 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode
));
2717 /* Shortcut: following common codes never translate
2718 into carry flag compares. */
2719 if (code
== EQ
|| code
== NE
|| code
== UNEQ
|| code
== LTGT
2720 || code
== ORDERED
|| code
== UNORDERED
)
2723 /* These comparisons require zero flag; swap operands so they won't. */
2724 if ((code
== GT
|| code
== UNLE
|| code
== LE
|| code
== UNGT
)
2727 std::swap (op0
, op1
);
2728 code
= swap_condition (code
);
2731 /* Try to expand the comparison and verify that we end up with
2732 carry flag based comparison. This fails to be true only when
2733 we decide to expand comparison using arithmetic that is not
2734 too common scenario. */
2736 compare_op
= ix86_expand_fp_compare (code
, op0
, op1
);
2737 compare_seq
= get_insns ();
2740 if (GET_MODE (XEXP (compare_op
, 0)) == CCFPmode
)
2741 code
= ix86_fp_compare_code_to_integer (GET_CODE (compare_op
));
2743 code
= GET_CODE (compare_op
);
2745 if (code
!= LTU
&& code
!= GEU
)
2748 emit_insn (compare_seq
);
2753 if (!INTEGRAL_MODE_P (mode
))
2762 /* Convert a==0 into (unsigned)a<1. */
2765 if (op1
!= const0_rtx
)
2768 code
= (code
== EQ
? LTU
: GEU
);
2771 /* Convert a>b into b<a or a>=b-1. */
2774 if (CONST_INT_P (op1
))
2776 op1
= gen_int_mode (INTVAL (op1
) + 1, GET_MODE (op0
));
2777 /* Bail out on overflow. We still can swap operands but that
2778 would force loading of the constant into register. */
2779 if (op1
== const0_rtx
2780 || !x86_64_immediate_operand (op1
, GET_MODE (op1
)))
2782 code
= (code
== GTU
? GEU
: LTU
);
2786 std::swap (op0
, op1
);
2787 code
= (code
== GTU
? LTU
: GEU
);
2791 /* Convert a>=0 into (unsigned)a<0x80000000. */
2794 if (mode
== DImode
|| op1
!= const0_rtx
)
2796 op1
= gen_int_mode (1 << (GET_MODE_BITSIZE (mode
) - 1), mode
);
2797 code
= (code
== LT
? GEU
: LTU
);
2801 if (mode
== DImode
|| op1
!= constm1_rtx
)
2803 op1
= gen_int_mode (1 << (GET_MODE_BITSIZE (mode
) - 1), mode
);
2804 code
= (code
== LE
? GEU
: LTU
);
2810 /* Swapping operands may cause constant to appear as first operand. */
2811 if (!nonimmediate_operand (op0
, VOIDmode
))
2813 if (!can_create_pseudo_p ())
2815 op0
= force_reg (mode
, op0
);
2817 *pop
= ix86_expand_compare (code
, op0
, op1
);
2818 gcc_assert (GET_CODE (*pop
) == LTU
|| GET_CODE (*pop
) == GEU
);
2822 /* Expand conditional increment or decrement using adb/sbb instructions.
2823 The default case using setcc followed by the conditional move can be
2824 done by generic code. */
2826 ix86_expand_int_addcc (rtx operands
[])
2828 enum rtx_code code
= GET_CODE (operands
[1]);
2830 rtx (*insn
) (machine_mode
, rtx
, rtx
, rtx
, rtx
, rtx
);
2832 rtx val
= const0_rtx
;
2835 rtx op0
= XEXP (operands
[1], 0);
2836 rtx op1
= XEXP (operands
[1], 1);
2838 if (operands
[3] != const1_rtx
2839 && operands
[3] != constm1_rtx
)
2841 if (!ix86_expand_carry_flag_compare (code
, op0
, op1
, &compare_op
))
2843 code
= GET_CODE (compare_op
);
2845 flags
= XEXP (compare_op
, 0);
2847 if (GET_MODE (flags
) == CCFPmode
)
2850 code
= ix86_fp_compare_code_to_integer (code
);
2857 PUT_CODE (compare_op
,
2858 reverse_condition_maybe_unordered
2859 (GET_CODE (compare_op
)));
2861 PUT_CODE (compare_op
, reverse_condition (GET_CODE (compare_op
)));
2864 mode
= GET_MODE (operands
[0]);
2866 /* Construct either adc or sbb insn. */
2867 if ((code
== LTU
) == (operands
[3] == constm1_rtx
))
2868 insn
= gen_sub3_carry
;
2870 insn
= gen_add3_carry
;
2872 emit_insn (insn (mode
, operands
[0], operands
[2], val
, flags
, compare_op
));
2878 ix86_expand_int_movcc (rtx operands
[])
2880 enum rtx_code code
= GET_CODE (operands
[1]), compare_code
;
2881 rtx_insn
*compare_seq
;
2883 machine_mode mode
= GET_MODE (operands
[0]);
2884 bool sign_bit_compare_p
= false;
2885 rtx op0
= XEXP (operands
[1], 0);
2886 rtx op1
= XEXP (operands
[1], 1);
2888 if (GET_MODE (op0
) == TImode
2889 || (GET_MODE (op0
) == DImode
2894 compare_op
= ix86_expand_compare (code
, op0
, op1
);
2895 compare_seq
= get_insns ();
2898 compare_code
= GET_CODE (compare_op
);
2900 if ((op1
== const0_rtx
&& (code
== GE
|| code
== LT
))
2901 || (op1
== constm1_rtx
&& (code
== GT
|| code
== LE
)))
2902 sign_bit_compare_p
= true;
2904 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
2905 HImode insns, we'd be swallowed in word prefix ops. */
2907 if ((mode
!= HImode
|| TARGET_FAST_PREFIX
)
2908 && (mode
!= (TARGET_64BIT
? TImode
: DImode
))
2909 && CONST_INT_P (operands
[2])
2910 && CONST_INT_P (operands
[3]))
2912 rtx out
= operands
[0];
2913 HOST_WIDE_INT ct
= INTVAL (operands
[2]);
2914 HOST_WIDE_INT cf
= INTVAL (operands
[3]);
2918 /* Sign bit compares are better done using shifts than we do by using
2920 if (sign_bit_compare_p
2921 || ix86_expand_carry_flag_compare (code
, op0
, op1
, &compare_op
))
2923 /* Detect overlap between destination and compare sources. */
2926 if (!sign_bit_compare_p
)
2931 compare_code
= GET_CODE (compare_op
);
2933 flags
= XEXP (compare_op
, 0);
2935 if (GET_MODE (flags
) == CCFPmode
)
2939 = ix86_fp_compare_code_to_integer (compare_code
);
2942 /* To simplify rest of code, restrict to the GEU case. */
2943 if (compare_code
== LTU
)
2946 compare_code
= reverse_condition (compare_code
);
2947 code
= reverse_condition (code
);
2952 PUT_CODE (compare_op
,
2953 reverse_condition_maybe_unordered
2954 (GET_CODE (compare_op
)));
2956 PUT_CODE (compare_op
,
2957 reverse_condition (GET_CODE (compare_op
)));
2961 if (reg_overlap_mentioned_p (out
, op0
)
2962 || reg_overlap_mentioned_p (out
, op1
))
2963 tmp
= gen_reg_rtx (mode
);
2966 emit_insn (gen_x86_movdicc_0_m1 (tmp
, flags
, compare_op
));
2968 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode
, tmp
),
2969 flags
, compare_op
));
2973 if (code
== GT
|| code
== GE
)
2974 code
= reverse_condition (code
);
2980 tmp
= emit_store_flag (tmp
, code
, op0
, op1
, VOIDmode
, 0, -1);
2993 tmp
= expand_simple_binop (mode
, PLUS
,
2995 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3006 tmp
= expand_simple_binop (mode
, IOR
,
3008 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3010 else if (diff
== -1 && ct
)
3020 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3022 tmp
= expand_simple_binop (mode
, PLUS
,
3023 copy_rtx (tmp
), GEN_INT (cf
),
3024 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3032 * andl cf - ct, dest
3042 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3045 tmp
= expand_simple_binop (mode
, AND
,
3047 gen_int_mode (cf
- ct
, mode
),
3048 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3050 tmp
= expand_simple_binop (mode
, PLUS
,
3051 copy_rtx (tmp
), GEN_INT (ct
),
3052 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3055 if (!rtx_equal_p (tmp
, out
))
3056 emit_move_insn (copy_rtx (out
), copy_rtx (tmp
));
3063 machine_mode cmp_mode
= GET_MODE (op0
);
3064 enum rtx_code new_code
;
3066 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3068 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3070 /* We may be reversing a non-trapping
3071 comparison to a trapping comparison. */
3072 if (HONOR_NANS (cmp_mode
) && flag_trapping_math
3073 && code
!= EQ
&& code
!= NE
3074 && code
!= ORDERED
&& code
!= UNORDERED
)
3077 new_code
= reverse_condition_maybe_unordered (code
);
3080 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3081 if (new_code
!= UNKNOWN
)
3089 compare_code
= UNKNOWN
;
3090 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
3091 && CONST_INT_P (op1
))
3093 if (op1
== const0_rtx
3094 && (code
== LT
|| code
== GE
))
3095 compare_code
= code
;
3096 else if (op1
== constm1_rtx
)
3100 else if (code
== GT
)
3105 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3106 if (compare_code
!= UNKNOWN
3107 && GET_MODE (op0
) == GET_MODE (out
)
3108 && (cf
== -1 || ct
== -1))
3110 /* If lea code below could be used, only optimize
3111 if it results in a 2 insn sequence. */
3113 if (! (diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3114 || diff
== 3 || diff
== 5 || diff
== 9)
3115 || (compare_code
== LT
&& ct
== -1)
3116 || (compare_code
== GE
&& cf
== -1))
3119 * notl op1 (if necessary)
3127 code
= reverse_condition (code
);
3130 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3132 out
= expand_simple_binop (mode
, IOR
,
3134 out
, 1, OPTAB_DIRECT
);
3135 if (out
!= operands
[0])
3136 emit_move_insn (operands
[0], out
);
3143 if ((diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3144 || diff
== 3 || diff
== 5 || diff
== 9)
3145 && ((mode
!= QImode
&& mode
!= HImode
) || !TARGET_PARTIAL_REG_STALL
)
3147 || x86_64_immediate_operand (GEN_INT (cf
), VOIDmode
)))
3153 * lea cf(dest*(ct-cf)),dest
3157 * This also catches the degenerate setcc-only case.
3163 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3166 /* On x86_64 the lea instruction operates on Pmode, so we need
3167 to get arithmetics done in proper mode to match. */
3169 tmp
= copy_rtx (out
);
3173 out1
= copy_rtx (out
);
3174 tmp
= gen_rtx_MULT (mode
, out1
, GEN_INT (diff
& ~1));
3178 tmp
= gen_rtx_PLUS (mode
, tmp
, out1
);
3184 tmp
= plus_constant (mode
, tmp
, cf
);
3187 if (!rtx_equal_p (tmp
, out
))
3190 out
= force_operand (tmp
, copy_rtx (out
));
3192 emit_insn (gen_rtx_SET (copy_rtx (out
), copy_rtx (tmp
)));
3194 if (!rtx_equal_p (out
, operands
[0]))
3195 emit_move_insn (operands
[0], copy_rtx (out
));
3201 * General case: Jumpful:
3202 * xorl dest,dest cmpl op1, op2
3203 * cmpl op1, op2 movl ct, dest
3205 * decl dest movl cf, dest
3206 * andl (cf-ct),dest 1:
3211 * This is reasonably steep, but branch mispredict costs are
3212 * high on modern cpus, so consider failing only if optimizing
3216 if ((!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3217 && BRANCH_COST (optimize_insn_for_speed_p (),
3222 machine_mode cmp_mode
= GET_MODE (op0
);
3223 enum rtx_code new_code
;
3225 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3227 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3229 /* We may be reversing a non-trapping
3230 comparison to a trapping comparison. */
3231 if (HONOR_NANS (cmp_mode
) && flag_trapping_math
3232 && code
!= EQ
&& code
!= NE
3233 && code
!= ORDERED
&& code
!= UNORDERED
)
3236 new_code
= reverse_condition_maybe_unordered (code
);
3241 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3242 if (compare_code
!= UNKNOWN
&& new_code
!= UNKNOWN
)
3243 compare_code
= reverse_condition (compare_code
);
3246 if (new_code
!= UNKNOWN
)
3254 if (compare_code
!= UNKNOWN
)
3256 /* notl op1 (if needed)
3261 For x < 0 (resp. x <= -1) there will be no notl,
3262 so if possible swap the constants to get rid of the
3264 True/false will be -1/0 while code below (store flag
3265 followed by decrement) is 0/-1, so the constants need
3266 to be exchanged once more. */
3268 if (compare_code
== GE
|| !cf
)
3270 code
= reverse_condition (code
);
3276 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3280 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3282 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
),
3284 copy_rtx (out
), 1, OPTAB_DIRECT
);
3287 out
= expand_simple_binop (mode
, AND
, copy_rtx (out
),
3288 gen_int_mode (cf
- ct
, mode
),
3289 copy_rtx (out
), 1, OPTAB_DIRECT
);
3291 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
), GEN_INT (ct
),
3292 copy_rtx (out
), 1, OPTAB_DIRECT
);
3293 if (!rtx_equal_p (out
, operands
[0]))
3294 emit_move_insn (operands
[0], copy_rtx (out
));
3300 if (!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3302 /* Try a few things more with specific constants and a variable. */
3305 rtx var
, orig_out
, out
, tmp
;
3307 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3310 /* If one of the two operands is an interesting constant, load a
3311 constant with the above and mask it in with a logical operation. */
3313 if (CONST_INT_P (operands
[2]))
3316 if (INTVAL (operands
[2]) == 0 && operands
[3] != constm1_rtx
)
3317 operands
[3] = constm1_rtx
, op
= and_optab
;
3318 else if (INTVAL (operands
[2]) == -1 && operands
[3] != const0_rtx
)
3319 operands
[3] = const0_rtx
, op
= ior_optab
;
3323 else if (CONST_INT_P (operands
[3]))
3326 if (INTVAL (operands
[3]) == 0 && operands
[2] != constm1_rtx
)
3328 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3329 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3330 if (code
== LE
&& op1
== const0_rtx
&& rtx_equal_p (op0
, var
))
3331 operands
[1] = simplify_gen_relational (LT
, VOIDmode
,
3335 operands
[2] = constm1_rtx
;
3338 else if (INTVAL (operands
[3]) == -1 && operands
[3] != const0_rtx
)
3339 operands
[2] = const0_rtx
, op
= ior_optab
;
3346 orig_out
= operands
[0];
3347 tmp
= gen_reg_rtx (mode
);
3350 /* Recurse to get the constant loaded. */
3351 if (!ix86_expand_int_movcc (operands
))
3354 /* Mask in the interesting variable. */
3355 out
= expand_binop (mode
, op
, var
, tmp
, orig_out
, 0,
3357 if (!rtx_equal_p (out
, orig_out
))
3358 emit_move_insn (copy_rtx (orig_out
), copy_rtx (out
));
3364 * For comparison with above,
3374 if (! nonimmediate_operand (operands
[2], mode
))
3375 operands
[2] = force_reg (mode
, operands
[2]);
3376 if (! nonimmediate_operand (operands
[3], mode
))
3377 operands
[3] = force_reg (mode
, operands
[3]);
3379 if (! register_operand (operands
[2], VOIDmode
)
3381 || ! register_operand (operands
[3], VOIDmode
)))
3382 operands
[2] = force_reg (mode
, operands
[2]);
3385 && ! register_operand (operands
[3], VOIDmode
))
3386 operands
[3] = force_reg (mode
, operands
[3]);
3388 emit_insn (compare_seq
);
3389 emit_insn (gen_rtx_SET (operands
[0],
3390 gen_rtx_IF_THEN_ELSE (mode
,
3391 compare_op
, operands
[2],
3396 /* Detect conditional moves that exactly match min/max operational
3397 semantics. Note that this is IEEE safe, as long as we don't
3398 interchange the operands.
3400 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3401 and TRUE if the operation is successful and instructions are emitted. */
3404 ix86_expand_sse_fp_minmax (rtx dest
, enum rtx_code code
, rtx cmp_op0
,
3405 rtx cmp_op1
, rtx if_true
, rtx if_false
)
3413 else if (code
== UNGE
)
3414 std::swap (if_true
, if_false
);
3418 if (rtx_equal_p (cmp_op0
, if_true
) && rtx_equal_p (cmp_op1
, if_false
))
3420 else if (rtx_equal_p (cmp_op1
, if_true
) && rtx_equal_p (cmp_op0
, if_false
))
3425 mode
= GET_MODE (dest
);
3427 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
3428 but MODE may be a vector mode and thus not appropriate. */
3429 if (!flag_finite_math_only
|| flag_signed_zeros
)
3431 int u
= is_min
? UNSPEC_IEEE_MIN
: UNSPEC_IEEE_MAX
;
3434 if_true
= force_reg (mode
, if_true
);
3435 v
= gen_rtvec (2, if_true
, if_false
);
3436 tmp
= gen_rtx_UNSPEC (mode
, v
, u
);
3440 code
= is_min
? SMIN
: SMAX
;
3441 if (MEM_P (if_true
) && MEM_P (if_false
))
3442 if_true
= force_reg (mode
, if_true
);
3443 tmp
= gen_rtx_fmt_ee (code
, mode
, if_true
, if_false
);
3446 emit_insn (gen_rtx_SET (dest
, tmp
));
3450 /* Return true if MODE is valid for vector compare to mask register,
3451 Same result for conditionl vector move with mask register. */
3453 ix86_valid_mask_cmp_mode (machine_mode mode
)
3455 /* XOP has its own vector conditional movement. */
3456 if (TARGET_XOP
&& !TARGET_AVX512F
)
3459 /* AVX512F is needed for mask operation. */
3460 if (!(TARGET_AVX512F
&& VECTOR_MODE_P (mode
)))
3463 /* AVX512BW is needed for vector QI/HImode,
3464 AVX512VL is needed for 128/256-bit vector. */
3465 machine_mode inner_mode
= GET_MODE_INNER (mode
);
3466 int vector_size
= GET_MODE_SIZE (mode
);
3467 if ((inner_mode
== QImode
|| inner_mode
== HImode
) && !TARGET_AVX512BW
)
3470 return vector_size
== 64 || TARGET_AVX512VL
;
3473 /* Return true if integer mask comparison should be used. */
3475 ix86_use_mask_cmp_p (machine_mode mode
, machine_mode cmp_mode
,
3476 rtx op_true
, rtx op_false
)
3478 if (GET_MODE_SIZE (mode
) == 64)
3481 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3482 gcc_assert (!op_true
== !op_false
);
3484 /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
3485 vector dest is required. */
3486 if (!op_true
|| !ix86_valid_mask_cmp_mode (cmp_mode
))
3489 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3490 if (op_false
== CONST0_RTX (mode
)
3491 || op_true
== CONST0_RTX (mode
)
3492 || (INTEGRAL_MODE_P (mode
)
3493 && (op_true
== CONSTM1_RTX (mode
)
3494 || op_false
== CONSTM1_RTX (mode
))))
3500 /* Expand an SSE comparison. Return the register with the result. */
3503 ix86_expand_sse_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
,
3504 rtx op_true
, rtx op_false
)
3506 machine_mode mode
= GET_MODE (dest
);
3507 machine_mode cmp_ops_mode
= GET_MODE (cmp_op0
);
3509 /* In general case result of comparison can differ from operands' type. */
3510 machine_mode cmp_mode
;
3512 /* In AVX512F the result of comparison is an integer mask. */
3513 bool maskcmp
= false;
3516 if (ix86_use_mask_cmp_p (mode
, cmp_ops_mode
, op_true
, op_false
))
3518 unsigned int nbits
= GET_MODE_NUNITS (cmp_ops_mode
);
3520 cmp_mode
= nbits
> 8 ? int_mode_for_size (nbits
, 0).require () : E_QImode
;
3523 cmp_mode
= cmp_ops_mode
;
3525 cmp_op0
= force_reg (cmp_ops_mode
, cmp_op0
);
3527 int (*op1_predicate
)(rtx
, machine_mode
)
3528 = VECTOR_MODE_P (cmp_ops_mode
) ? vector_operand
: nonimmediate_operand
;
3530 if (!op1_predicate (cmp_op1
, cmp_ops_mode
))
3531 cmp_op1
= force_reg (cmp_ops_mode
, cmp_op1
);
3534 || (maskcmp
&& cmp_mode
!= mode
)
3535 || (op_true
&& reg_overlap_mentioned_p (dest
, op_true
))
3536 || (op_false
&& reg_overlap_mentioned_p (dest
, op_false
)))
3537 dest
= gen_reg_rtx (maskcmp
? cmp_mode
: mode
);
3541 bool ok
= ix86_expand_mask_vec_cmp (dest
, code
, cmp_op0
, cmp_op1
);
3546 x
= gen_rtx_fmt_ee (code
, cmp_mode
, cmp_op0
, cmp_op1
);
3548 if (cmp_mode
!= mode
)
3550 x
= force_reg (cmp_ops_mode
, x
);
3551 convert_move (dest
, x
, false);
3554 emit_insn (gen_rtx_SET (dest
, x
));
3559 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3560 operations. This is used for both scalar and vector conditional moves. */
3563 ix86_expand_sse_movcc (rtx dest
, rtx cmp
, rtx op_true
, rtx op_false
)
3565 machine_mode mode
= GET_MODE (dest
);
3566 machine_mode cmpmode
= GET_MODE (cmp
);
3568 /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
3569 if (rtx_equal_p (op_true
, op_false
))
3571 emit_move_insn (dest
, op_true
);
3577 /* If we have an integer mask and FP value then we need
3578 to cast mask to FP mode. */
3579 if (mode
!= cmpmode
&& VECTOR_MODE_P (cmpmode
))
3581 cmp
= force_reg (cmpmode
, cmp
);
3582 cmp
= gen_rtx_SUBREG (mode
, cmp
, 0);
3585 /* In AVX512F the result of comparison is an integer mask. */
3587 && GET_MODE_CLASS (cmpmode
) == MODE_INT
)
3589 gcc_assert (ix86_valid_mask_cmp_mode (mode
));
3590 /* Using vector move with mask register. */
3591 cmp
= force_reg (cmpmode
, cmp
);
3592 /* Optimize for mask zero. */
3593 op_true
= (op_true
!= CONST0_RTX (mode
)
3594 ? force_reg (mode
, op_true
) : op_true
);
3595 op_false
= (op_false
!= CONST0_RTX (mode
)
3596 ? force_reg (mode
, op_false
) : op_false
);
3597 if (op_true
== CONST0_RTX (mode
))
3599 rtx n
= gen_reg_rtx (cmpmode
);
3600 if (cmpmode
== E_DImode
&& !TARGET_64BIT
)
3601 emit_insn (gen_knotdi (n
, cmp
));
3603 emit_insn (gen_rtx_SET (n
, gen_rtx_fmt_e (NOT
, cmpmode
, cmp
)));
3605 /* Reverse op_true op_false. */
3606 std::swap (op_true
, op_false
);
3609 rtx vec_merge
= gen_rtx_VEC_MERGE (mode
, op_true
, op_false
, cmp
);
3610 emit_insn (gen_rtx_SET (dest
, vec_merge
));
3613 else if (vector_all_ones_operand (op_true
, mode
)
3614 && op_false
== CONST0_RTX (mode
))
3616 emit_insn (gen_rtx_SET (dest
, cmp
));
3619 else if (op_false
== CONST0_RTX (mode
))
3621 op_true
= force_reg (mode
, op_true
);
3622 x
= gen_rtx_AND (mode
, cmp
, op_true
);
3623 emit_insn (gen_rtx_SET (dest
, x
));
3626 else if (op_true
== CONST0_RTX (mode
))
3628 op_false
= force_reg (mode
, op_false
);
3629 x
= gen_rtx_NOT (mode
, cmp
);
3630 x
= gen_rtx_AND (mode
, x
, op_false
);
3631 emit_insn (gen_rtx_SET (dest
, x
));
3634 else if (INTEGRAL_MODE_P (mode
) && op_true
== CONSTM1_RTX (mode
))
3636 op_false
= force_reg (mode
, op_false
);
3637 x
= gen_rtx_IOR (mode
, cmp
, op_false
);
3638 emit_insn (gen_rtx_SET (dest
, x
));
3641 else if (TARGET_XOP
)
3643 op_true
= force_reg (mode
, op_true
);
3645 if (!nonimmediate_operand (op_false
, mode
))
3646 op_false
= force_reg (mode
, op_false
);
3648 emit_insn (gen_rtx_SET (dest
, gen_rtx_IF_THEN_ELSE (mode
, cmp
,
3654 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
3657 if (!vector_operand (op_true
, mode
))
3658 op_true
= force_reg (mode
, op_true
);
3660 op_false
= force_reg (mode
, op_false
);
3666 gen
= gen_sse4_1_blendvps
;
3670 gen
= gen_sse4_1_blendvpd
;
3675 gen
= gen_sse4_1_blendvss
;
3676 op_true
= force_reg (mode
, op_true
);
3682 gen
= gen_sse4_1_blendvsd
;
3683 op_true
= force_reg (mode
, op_true
);
3692 gen
= gen_sse4_1_pblendvb
;
3693 if (mode
!= V16QImode
)
3694 d
= gen_reg_rtx (V16QImode
);
3695 op_false
= gen_lowpart (V16QImode
, op_false
);
3696 op_true
= gen_lowpart (V16QImode
, op_true
);
3697 cmp
= gen_lowpart (V16QImode
, cmp
);
3702 gen
= gen_avx_blendvps256
;
3706 gen
= gen_avx_blendvpd256
;
3714 gen
= gen_avx2_pblendvb
;
3715 if (mode
!= V32QImode
)
3716 d
= gen_reg_rtx (V32QImode
);
3717 op_false
= gen_lowpart (V32QImode
, op_false
);
3718 op_true
= gen_lowpart (V32QImode
, op_true
);
3719 cmp
= gen_lowpart (V32QImode
, cmp
);
3724 gen
= gen_avx512bw_blendmv64qi
;
3727 gen
= gen_avx512bw_blendmv32hi
;
3730 gen
= gen_avx512f_blendmv16si
;
3733 gen
= gen_avx512f_blendmv8di
;
3736 gen
= gen_avx512f_blendmv8df
;
3739 gen
= gen_avx512f_blendmv16sf
;
3748 emit_insn (gen (d
, op_false
, op_true
, cmp
));
3750 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
3754 op_true
= force_reg (mode
, op_true
);
3756 t2
= gen_reg_rtx (mode
);
3758 t3
= gen_reg_rtx (mode
);
3762 x
= gen_rtx_AND (mode
, op_true
, cmp
);
3763 emit_insn (gen_rtx_SET (t2
, x
));
3765 x
= gen_rtx_NOT (mode
, cmp
);
3766 x
= gen_rtx_AND (mode
, x
, op_false
);
3767 emit_insn (gen_rtx_SET (t3
, x
));
3769 x
= gen_rtx_IOR (mode
, t3
, t2
);
3770 emit_insn (gen_rtx_SET (dest
, x
));
3774 /* Swap, force into registers, or otherwise massage the two operands
3775 to an sse comparison with a mask result. Thus we differ a bit from
3776 ix86_prepare_fp_compare_args which expects to produce a flags result.
3778 The DEST operand exists to help determine whether to commute commutative
3779 operators. The POP0/POP1 operands are updated in place. The new
3780 comparison code is returned, or UNKNOWN if not implementable. */
3782 static enum rtx_code
3783 ix86_prepare_sse_fp_compare_args (rtx dest
, enum rtx_code code
,
3784 rtx
*pop0
, rtx
*pop1
)
3790 /* AVX supports all the needed comparisons. */
3793 /* We have no LTGT as an operator. We could implement it with
3794 NE & ORDERED, but this requires an extra temporary. It's
3795 not clear that it's worth it. */
3802 /* These are supported directly. */
3809 /* AVX has 3 operand comparisons, no need to swap anything. */
3812 /* For commutative operators, try to canonicalize the destination
3813 operand to be first in the comparison - this helps reload to
3814 avoid extra moves. */
3815 if (!dest
|| !rtx_equal_p (dest
, *pop1
))
3823 /* These are not supported directly before AVX, and furthermore
3824 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
3825 comparison operands to transform into something that is
3827 std::swap (*pop0
, *pop1
);
3828 code
= swap_condition (code
);
3838 /* Expand a floating-point conditional move. Return true if successful. */
3841 ix86_expand_fp_movcc (rtx operands
[])
3843 machine_mode mode
= GET_MODE (operands
[0]);
3844 enum rtx_code code
= GET_CODE (operands
[1]);
3845 rtx tmp
, compare_op
;
3846 rtx op0
= XEXP (operands
[1], 0);
3847 rtx op1
= XEXP (operands
[1], 1);
3849 if (TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (mode
))
3853 /* Since we've no cmove for sse registers, don't force bad register
3854 allocation just to gain access to it. Deny movcc when the
3855 comparison mode doesn't match the move mode. */
3856 cmode
= GET_MODE (op0
);
3857 if (cmode
== VOIDmode
)
3858 cmode
= GET_MODE (op1
);
3862 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
, &op0
, &op1
);
3863 if (code
== UNKNOWN
)
3866 if (ix86_expand_sse_fp_minmax (operands
[0], code
, op0
, op1
,
3867 operands
[2], operands
[3]))
3870 tmp
= ix86_expand_sse_cmp (operands
[0], code
, op0
, op1
,
3871 operands
[2], operands
[3]);
3872 ix86_expand_sse_movcc (operands
[0], tmp
, operands
[2], operands
[3]);
3876 if (GET_MODE (op0
) == TImode
3877 || (GET_MODE (op0
) == DImode
3881 /* The floating point conditional move instructions don't directly
3882 support conditions resulting from a signed integer comparison. */
3884 compare_op
= ix86_expand_compare (code
, op0
, op1
);
3885 if (!fcmov_comparison_operator (compare_op
, VOIDmode
))
3887 tmp
= gen_reg_rtx (QImode
);
3888 ix86_expand_setcc (tmp
, code
, op0
, op1
);
3890 compare_op
= ix86_expand_compare (NE
, tmp
, const0_rtx
);
3893 emit_insn (gen_rtx_SET (operands
[0],
3894 gen_rtx_IF_THEN_ELSE (mode
, compare_op
,
3895 operands
[2], operands
[3])));
3900 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3903 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code
)
3928 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3931 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code
)
3968 /* Return immediate value to be used in UNSPEC_PCMP
3969 for comparison CODE in MODE. */
3972 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code
, machine_mode mode
)
3974 if (FLOAT_MODE_P (mode
))
3975 return ix86_fp_cmp_code_to_pcmp_immediate (code
);
3976 return ix86_int_cmp_code_to_pcmp_immediate (code
);
3979 /* Expand AVX-512 vector comparison. */
3982 ix86_expand_mask_vec_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
)
3984 machine_mode mask_mode
= GET_MODE (dest
);
3985 machine_mode cmp_mode
= GET_MODE (cmp_op0
);
3986 rtx imm
= GEN_INT (ix86_cmp_code_to_pcmp_immediate (code
, cmp_mode
));
3996 unspec_code
= UNSPEC_UNSIGNED_PCMP
;
4000 unspec_code
= UNSPEC_PCMP
;
4003 unspec
= gen_rtx_UNSPEC (mask_mode
, gen_rtvec (3, cmp_op0
, cmp_op1
, imm
),
4005 emit_insn (gen_rtx_SET (dest
, unspec
));
4010 /* Expand fp vector comparison. */
4013 ix86_expand_fp_vec_cmp (rtx operands
[])
4015 enum rtx_code code
= GET_CODE (operands
[1]);
4018 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4019 &operands
[2], &operands
[3]);
4020 if (code
== UNKNOWN
)
4023 switch (GET_CODE (operands
[1]))
4026 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[2],
4027 operands
[3], NULL
, NULL
);
4028 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[2],
4029 operands
[3], NULL
, NULL
);
4033 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[2],
4034 operands
[3], NULL
, NULL
);
4035 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[2],
4036 operands
[3], NULL
, NULL
);
4042 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4046 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[2], operands
[3],
4049 if (operands
[0] != cmp
)
4050 emit_move_insn (operands
[0], cmp
);
4056 ix86_expand_int_sse_cmp (rtx dest
, enum rtx_code code
, rtx cop0
, rtx cop1
,
4057 rtx op_true
, rtx op_false
, bool *negate
)
4059 machine_mode data_mode
= GET_MODE (dest
);
4060 machine_mode mode
= GET_MODE (cop0
);
4065 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4067 && (mode
== V16QImode
|| mode
== V8HImode
4068 || mode
== V4SImode
|| mode
== V2DImode
))
4070 /* AVX512F supports all of the comparsions
4071 on all 128/256/512-bit vector int types. */
4072 else if (ix86_use_mask_cmp_p (data_mode
, mode
, op_true
, op_false
))
4076 /* Canonicalize the comparison to EQ, GT, GTU. */
4087 code
= reverse_condition (code
);
4093 code
= reverse_condition (code
);
4099 std::swap (cop0
, cop1
);
4100 code
= swap_condition (code
);
4107 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4108 if (mode
== V2DImode
)
4113 /* SSE4.1 supports EQ. */
4120 /* SSE4.2 supports GT/GTU. */
4130 rtx optrue
= op_true
? op_true
: CONSTM1_RTX (data_mode
);
4131 rtx opfalse
= op_false
? op_false
: CONST0_RTX (data_mode
);
4133 std::swap (optrue
, opfalse
);
4135 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4136 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4137 min (x, y) == x). While we add one instruction (the minimum),
4138 we remove the need for two instructions in the negation, as the
4139 result is done this way.
4140 When using masks, do it for SI/DImode element types, as it is shorter
4141 than the two subtractions. */
4143 && GET_MODE_SIZE (mode
) != 64
4144 && vector_all_ones_operand (opfalse
, data_mode
)
4145 && optrue
== CONST0_RTX (data_mode
))
4147 && GET_MODE_SIZE (GET_MODE_INNER (mode
)) >= 4
4148 /* Don't do it if not using integer masks and we'd end up with
4149 the right values in the registers though. */
4150 && (GET_MODE_SIZE (mode
) == 64
4151 || !vector_all_ones_operand (optrue
, data_mode
)
4152 || opfalse
!= CONST0_RTX (data_mode
))))
4154 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4159 gen
= (code
== GTU
) ? gen_uminv16si3
: gen_sminv16si3
;
4162 gen
= (code
== GTU
) ? gen_uminv8di3
: gen_sminv8di3
;
4163 cop0
= force_reg (mode
, cop0
);
4164 cop1
= force_reg (mode
, cop1
);
4168 gen
= (code
== GTU
) ? gen_uminv32qi3
: gen_sminv32qi3
;
4172 gen
= (code
== GTU
) ? gen_uminv16hi3
: gen_sminv16hi3
;
4176 gen
= (code
== GTU
) ? gen_uminv8si3
: gen_sminv8si3
;
4179 if (TARGET_AVX512VL
)
4181 gen
= (code
== GTU
) ? gen_uminv4di3
: gen_sminv4di3
;
4182 cop0
= force_reg (mode
, cop0
);
4183 cop1
= force_reg (mode
, cop1
);
4187 if (code
== GTU
&& TARGET_SSE2
)
4188 gen
= gen_uminv16qi3
;
4189 else if (code
== GT
&& TARGET_SSE4_1
)
4190 gen
= gen_sminv16qi3
;
4193 if (code
== GTU
&& TARGET_SSE4_1
)
4194 gen
= gen_uminv8hi3
;
4195 else if (code
== GT
&& TARGET_SSE2
)
4196 gen
= gen_sminv8hi3
;
4200 gen
= (code
== GTU
) ? gen_uminv4si3
: gen_sminv4si3
;
4203 if (TARGET_AVX512VL
)
4205 gen
= (code
== GTU
) ? gen_uminv2di3
: gen_sminv2di3
;
4206 cop0
= force_reg (mode
, cop0
);
4207 cop1
= force_reg (mode
, cop1
);
4216 rtx tem
= gen_reg_rtx (mode
);
4217 if (!vector_operand (cop0
, mode
))
4218 cop0
= force_reg (mode
, cop0
);
4219 if (!vector_operand (cop1
, mode
))
4220 cop1
= force_reg (mode
, cop1
);
4222 emit_insn (gen (tem
, cop0
, cop1
));
4228 /* Unsigned parallel compare is not supported by the hardware.
4229 Play some tricks to turn this into a signed comparison
4233 cop0
= force_reg (mode
, cop0
);
4246 /* Subtract (-(INT MAX) - 1) from both operands to make
4248 mask
= ix86_build_signbit_mask (mode
, true, false);
4249 t1
= gen_reg_rtx (mode
);
4250 emit_insn (gen_sub3_insn (t1
, cop0
, mask
));
4252 t2
= gen_reg_rtx (mode
);
4253 emit_insn (gen_sub3_insn (t2
, cop1
, mask
));
4267 /* Perform a parallel unsigned saturating subtraction. */
4268 x
= gen_reg_rtx (mode
);
4269 emit_insn (gen_rtx_SET
4270 (x
, gen_rtx_US_MINUS (mode
, cop0
, cop1
)));
4272 cop1
= CONST0_RTX (mode
);
4284 std::swap (op_true
, op_false
);
4286 /* Allow the comparison to be done in one mode, but the movcc to
4287 happen in another mode. */
4288 if (data_mode
== mode
)
4290 x
= ix86_expand_sse_cmp (dest
, code
, cop0
, cop1
,
4295 gcc_assert (GET_MODE_SIZE (data_mode
) == GET_MODE_SIZE (mode
));
4296 x
= ix86_expand_sse_cmp (gen_reg_rtx (mode
), code
, cop0
, cop1
,
4298 if (GET_MODE (x
) == mode
)
4299 x
= gen_lowpart (data_mode
, x
);
4305 /* Expand integer vector comparison. */
4308 ix86_expand_int_vec_cmp (rtx operands
[])
4310 rtx_code code
= GET_CODE (operands
[1]);
4311 bool negate
= false;
4312 rtx cmp
= ix86_expand_int_sse_cmp (operands
[0], code
, operands
[2],
4313 operands
[3], NULL
, NULL
, &negate
);
4319 cmp
= ix86_expand_int_sse_cmp (operands
[0], EQ
, cmp
,
4320 CONST0_RTX (GET_MODE (cmp
)),
4321 NULL
, NULL
, &negate
);
4323 gcc_assert (!negate
);
4325 if (operands
[0] != cmp
)
4326 emit_move_insn (operands
[0], cmp
);
4331 /* Expand a floating-point vector conditional move; a vcond operation
4332 rather than a movcc operation. */
4335 ix86_expand_fp_vcond (rtx operands
[])
4337 enum rtx_code code
= GET_CODE (operands
[3]);
4340 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4341 &operands
[4], &operands
[5]);
4342 if (code
== UNKNOWN
)
4345 switch (GET_CODE (operands
[3]))
4348 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[4],
4349 operands
[5], operands
[0], operands
[0]);
4350 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[4],
4351 operands
[5], operands
[1], operands
[2]);
4355 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[4],
4356 operands
[5], operands
[0], operands
[0]);
4357 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[4],
4358 operands
[5], operands
[1], operands
[2]);
4364 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4366 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4370 if (ix86_expand_sse_fp_minmax (operands
[0], code
, operands
[4],
4371 operands
[5], operands
[1], operands
[2]))
4374 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[4], operands
[5],
4375 operands
[1], operands
[2]);
4376 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4380 /* Expand a signed/unsigned integral vector conditional move. */
4383 ix86_expand_int_vcond (rtx operands
[])
4385 machine_mode data_mode
= GET_MODE (operands
[0]);
4386 machine_mode mode
= GET_MODE (operands
[4]);
4387 enum rtx_code code
= GET_CODE (operands
[3]);
4388 bool negate
= false;
4394 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4395 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4396 if ((code
== LT
|| code
== GE
)
4397 && data_mode
== mode
4398 && cop1
== CONST0_RTX (mode
)
4399 && operands
[1 + (code
== LT
)] == CONST0_RTX (data_mode
)
4400 && GET_MODE_UNIT_SIZE (data_mode
) > 1
4401 && GET_MODE_UNIT_SIZE (data_mode
) <= 8
4402 && (GET_MODE_SIZE (data_mode
) == 16
4403 || (TARGET_AVX2
&& GET_MODE_SIZE (data_mode
) == 32)))
4405 rtx negop
= operands
[2 - (code
== LT
)];
4406 int shift
= GET_MODE_UNIT_BITSIZE (data_mode
) - 1;
4407 if (negop
== CONST1_RTX (data_mode
))
4409 rtx res
= expand_simple_binop (mode
, LSHIFTRT
, cop0
, GEN_INT (shift
),
4410 operands
[0], 1, OPTAB_DIRECT
);
4411 if (res
!= operands
[0])
4412 emit_move_insn (operands
[0], res
);
4415 else if (GET_MODE_INNER (data_mode
) != DImode
4416 && vector_all_ones_operand (negop
, data_mode
))
4418 rtx res
= expand_simple_binop (mode
, ASHIFTRT
, cop0
, GEN_INT (shift
),
4419 operands
[0], 0, OPTAB_DIRECT
);
4420 if (res
!= operands
[0])
4421 emit_move_insn (operands
[0], res
);
4426 if (!nonimmediate_operand (cop1
, mode
))
4427 cop1
= force_reg (mode
, cop1
);
4428 if (!general_operand (operands
[1], data_mode
))
4429 operands
[1] = force_reg (data_mode
, operands
[1]);
4430 if (!general_operand (operands
[2], data_mode
))
4431 operands
[2] = force_reg (data_mode
, operands
[2]);
4433 x
= ix86_expand_int_sse_cmp (operands
[0], code
, cop0
, cop1
,
4434 operands
[1], operands
[2], &negate
);
4439 ix86_expand_sse_movcc (operands
[0], x
, operands
[1+negate
],
4440 operands
[2-negate
]);
4445 ix86_expand_vec_perm_vpermt2 (rtx target
, rtx mask
, rtx op0
, rtx op1
,
4446 struct expand_vec_perm_d
*d
)
4448 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4449 expander, so args are either in d, or in op0, op1 etc. */
4450 machine_mode mode
= GET_MODE (d
? d
->op0
: op0
);
4451 machine_mode maskmode
= mode
;
4452 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
4457 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4458 gen
= gen_avx512vl_vpermt2varv8hi3
;
4461 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4462 gen
= gen_avx512vl_vpermt2varv16hi3
;
4465 if (TARGET_AVX512VBMI
)
4466 gen
= gen_avx512bw_vpermt2varv64qi3
;
4469 if (TARGET_AVX512BW
)
4470 gen
= gen_avx512bw_vpermt2varv32hi3
;
4473 if (TARGET_AVX512VL
)
4474 gen
= gen_avx512vl_vpermt2varv4si3
;
4477 if (TARGET_AVX512VL
)
4478 gen
= gen_avx512vl_vpermt2varv8si3
;
4482 gen
= gen_avx512f_vpermt2varv16si3
;
4485 if (TARGET_AVX512VL
)
4487 gen
= gen_avx512vl_vpermt2varv4sf3
;
4488 maskmode
= V4SImode
;
4492 if (TARGET_AVX512VL
)
4494 gen
= gen_avx512vl_vpermt2varv8sf3
;
4495 maskmode
= V8SImode
;
4501 gen
= gen_avx512f_vpermt2varv16sf3
;
4502 maskmode
= V16SImode
;
4506 if (TARGET_AVX512VL
)
4507 gen
= gen_avx512vl_vpermt2varv2di3
;
4510 if (TARGET_AVX512VL
)
4511 gen
= gen_avx512vl_vpermt2varv4di3
;
4515 gen
= gen_avx512f_vpermt2varv8di3
;
4518 if (TARGET_AVX512VL
)
4520 gen
= gen_avx512vl_vpermt2varv2df3
;
4521 maskmode
= V2DImode
;
4525 if (TARGET_AVX512VL
)
4527 gen
= gen_avx512vl_vpermt2varv4df3
;
4528 maskmode
= V4DImode
;
4534 gen
= gen_avx512f_vpermt2varv8df3
;
4535 maskmode
= V8DImode
;
4545 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4546 expander, so args are either in d, or in op0, op1 etc. */
4553 for (int i
= 0; i
< d
->nelt
; ++i
)
4554 vec
[i
] = GEN_INT (d
->perm
[i
]);
4555 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
4558 emit_insn (gen (target
, force_reg (maskmode
, mask
), op0
, op1
));
4562 /* Expand a variable vector permutation. */
4565 ix86_expand_vec_perm (rtx operands
[])
4567 rtx target
= operands
[0];
4568 rtx op0
= operands
[1];
4569 rtx op1
= operands
[2];
4570 rtx mask
= operands
[3];
4571 rtx t1
, t2
, t3
, t4
, t5
, t6
, t7
, t8
, vt
, vt2
, vec
[32];
4572 machine_mode mode
= GET_MODE (op0
);
4573 machine_mode maskmode
= GET_MODE (mask
);
4575 bool one_operand_shuffle
= rtx_equal_p (op0
, op1
);
4577 /* Number of elements in the vector. */
4578 w
= GET_MODE_NUNITS (mode
);
4579 e
= GET_MODE_UNIT_SIZE (mode
);
4580 gcc_assert (w
<= 64);
4582 if (TARGET_AVX512F
&& one_operand_shuffle
)
4584 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4588 gen
=gen_avx512f_permvarv16si
;
4591 gen
= gen_avx512f_permvarv16sf
;
4594 gen
= gen_avx512f_permvarv8di
;
4597 gen
= gen_avx512f_permvarv8df
;
4604 emit_insn (gen (target
, op0
, mask
));
4609 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
4614 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
4616 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4617 an constant shuffle operand. With a tiny bit of effort we can
4618 use VPERMD instead. A re-interpretation stall for V4DFmode is
4619 unfortunate but there's no avoiding it.
4620 Similarly for V16HImode we don't have instructions for variable
4621 shuffling, while for V32QImode we can use after preparing suitable
4622 masks vpshufb; vpshufb; vpermq; vpor. */
4624 if (mode
== V16HImode
)
4626 maskmode
= mode
= V32QImode
;
4632 maskmode
= mode
= V8SImode
;
4636 t1
= gen_reg_rtx (maskmode
);
4638 /* Replicate the low bits of the V4DImode mask into V8SImode:
4640 t1 = { A A B B C C D D }. */
4641 for (i
= 0; i
< w
/ 2; ++i
)
4642 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
4643 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4644 vt
= force_reg (maskmode
, vt
);
4645 mask
= gen_lowpart (maskmode
, mask
);
4646 if (maskmode
== V8SImode
)
4647 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
4649 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
4651 /* Multiply the shuffle indicies by two. */
4652 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
4655 /* Add one to the odd shuffle indicies:
4656 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4657 for (i
= 0; i
< w
/ 2; ++i
)
4659 vec
[i
* 2] = const0_rtx
;
4660 vec
[i
* 2 + 1] = const1_rtx
;
4662 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4663 vt
= validize_mem (force_const_mem (maskmode
, vt
));
4664 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
4667 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4668 operands
[3] = mask
= t1
;
4669 target
= gen_reg_rtx (mode
);
4670 op0
= gen_lowpart (mode
, op0
);
4671 op1
= gen_lowpart (mode
, op1
);
4677 /* The VPERMD and VPERMPS instructions already properly ignore
4678 the high bits of the shuffle elements. No need for us to
4679 perform an AND ourselves. */
4680 if (one_operand_shuffle
)
4682 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
4683 if (target
!= operands
[0])
4684 emit_move_insn (operands
[0],
4685 gen_lowpart (GET_MODE (operands
[0]), target
));
4689 t1
= gen_reg_rtx (V8SImode
);
4690 t2
= gen_reg_rtx (V8SImode
);
4691 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
4692 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
4698 mask
= gen_lowpart (V8SImode
, mask
);
4699 if (one_operand_shuffle
)
4700 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
4703 t1
= gen_reg_rtx (V8SFmode
);
4704 t2
= gen_reg_rtx (V8SFmode
);
4705 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
4706 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
4712 /* By combining the two 128-bit input vectors into one 256-bit
4713 input vector, we can use VPERMD and VPERMPS for the full
4714 two-operand shuffle. */
4715 t1
= gen_reg_rtx (V8SImode
);
4716 t2
= gen_reg_rtx (V8SImode
);
4717 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
4718 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4719 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
4720 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
4724 t1
= gen_reg_rtx (V8SFmode
);
4725 t2
= gen_reg_rtx (V8SImode
);
4726 mask
= gen_lowpart (V4SImode
, mask
);
4727 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
4728 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4729 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
4730 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
4734 t1
= gen_reg_rtx (V32QImode
);
4735 t2
= gen_reg_rtx (V32QImode
);
4736 t3
= gen_reg_rtx (V32QImode
);
4737 vt2
= GEN_INT (-128);
4738 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
4739 vt
= force_reg (V32QImode
, vt
);
4740 for (i
= 0; i
< 32; i
++)
4741 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
4742 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
4743 vt2
= force_reg (V32QImode
, vt2
);
4744 /* From mask create two adjusted masks, which contain the same
4745 bits as mask in the low 7 bits of each vector element.
4746 The first mask will have the most significant bit clear
4747 if it requests element from the same 128-bit lane
4748 and MSB set if it requests element from the other 128-bit lane.
4749 The second mask will have the opposite values of the MSB,
4750 and additionally will have its 128-bit lanes swapped.
4751 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4752 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4753 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4754 stands for other 12 bytes. */
4755 /* The bit whether element is from the same lane or the other
4756 lane is bit 4, so shift it up by 3 to the MSB position. */
4757 t5
= gen_reg_rtx (V4DImode
);
4758 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
4760 /* Clear MSB bits from the mask just in case it had them set. */
4761 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
4762 /* After this t1 will have MSB set for elements from other lane. */
4763 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
4764 /* Clear bits other than MSB. */
4765 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
4766 /* Or in the lower bits from mask into t3. */
4767 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
4768 /* And invert MSB bits in t1, so MSB is set for elements from the same
4770 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
4771 /* Swap 128-bit lanes in t3. */
4772 t6
= gen_reg_rtx (V4DImode
);
4773 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
4774 const2_rtx
, GEN_INT (3),
4775 const0_rtx
, const1_rtx
));
4776 /* And or in the lower bits from mask into t1. */
4777 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
4778 if (one_operand_shuffle
)
4780 /* Each of these shuffles will put 0s in places where
4781 element from the other 128-bit lane is needed, otherwise
4782 will shuffle in the requested value. */
4783 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
4784 gen_lowpart (V32QImode
, t6
)));
4785 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
4786 /* For t3 the 128-bit lanes are swapped again. */
4787 t7
= gen_reg_rtx (V4DImode
);
4788 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
4789 const2_rtx
, GEN_INT (3),
4790 const0_rtx
, const1_rtx
));
4791 /* And oring both together leads to the result. */
4792 emit_insn (gen_iorv32qi3 (target
, t1
,
4793 gen_lowpart (V32QImode
, t7
)));
4794 if (target
!= operands
[0])
4795 emit_move_insn (operands
[0],
4796 gen_lowpart (GET_MODE (operands
[0]), target
));
4800 t4
= gen_reg_rtx (V32QImode
);
4801 /* Similarly to the above one_operand_shuffle code,
4802 just for repeated twice for each operand. merge_two:
4803 code will merge the two results together. */
4804 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
4805 gen_lowpart (V32QImode
, t6
)));
4806 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
4807 gen_lowpart (V32QImode
, t6
)));
4808 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
4809 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
4810 t7
= gen_reg_rtx (V4DImode
);
4811 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
4812 const2_rtx
, GEN_INT (3),
4813 const0_rtx
, const1_rtx
));
4814 t8
= gen_reg_rtx (V4DImode
);
4815 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
4816 const2_rtx
, GEN_INT (3),
4817 const0_rtx
, const1_rtx
));
4818 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
4819 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
4825 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
4832 /* The XOP VPPERM insn supports three inputs. By ignoring the
4833 one_operand_shuffle special case, we avoid creating another
4834 set of constant vectors in memory. */
4835 one_operand_shuffle
= false;
4837 /* mask = mask & {2*w-1, ...} */
4838 vt
= GEN_INT (2*w
- 1);
4842 /* mask = mask & {w-1, ...} */
4843 vt
= GEN_INT (w
- 1);
4846 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4847 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4848 NULL_RTX
, 0, OPTAB_DIRECT
);
4850 /* For non-QImode operations, convert the word permutation control
4851 into a byte permutation control. */
4852 if (mode
!= V16QImode
)
4854 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
4855 GEN_INT (exact_log2 (e
)),
4856 NULL_RTX
, 0, OPTAB_DIRECT
);
4858 /* Convert mask to vector of chars. */
4859 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
4861 /* Replicate each of the input bytes into byte positions:
4862 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4863 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4864 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4865 for (i
= 0; i
< 16; ++i
)
4866 vec
[i
] = GEN_INT (i
/e
* e
);
4867 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4868 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4870 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
4872 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
4874 /* Convert it into the byte positions by doing
4875 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4876 for (i
= 0; i
< 16; ++i
)
4877 vec
[i
] = GEN_INT (i
% e
);
4878 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4879 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4880 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
4883 /* The actual shuffle operations all operate on V16QImode. */
4884 op0
= gen_lowpart (V16QImode
, op0
);
4885 op1
= gen_lowpart (V16QImode
, op1
);
4889 if (GET_MODE (target
) != V16QImode
)
4890 target
= gen_reg_rtx (V16QImode
);
4891 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
4892 if (target
!= operands
[0])
4893 emit_move_insn (operands
[0],
4894 gen_lowpart (GET_MODE (operands
[0]), target
));
4896 else if (one_operand_shuffle
)
4898 if (GET_MODE (target
) != V16QImode
)
4899 target
= gen_reg_rtx (V16QImode
);
4900 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
4901 if (target
!= operands
[0])
4902 emit_move_insn (operands
[0],
4903 gen_lowpart (GET_MODE (operands
[0]), target
));
4910 /* Shuffle the two input vectors independently. */
4911 t1
= gen_reg_rtx (V16QImode
);
4912 t2
= gen_reg_rtx (V16QImode
);
4913 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
4914 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
4917 /* Then merge them together. The key is whether any given control
4918 element contained a bit set that indicates the second word. */
4921 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
4923 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4924 more shuffle to convert the V2DI input mask into a V4SI
4925 input mask. At which point the masking that expand_int_vcond
4926 will work as desired. */
4927 rtx t3
= gen_reg_rtx (V4SImode
);
4928 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
4929 const0_rtx
, const0_rtx
,
4930 const2_rtx
, const2_rtx
));
4932 maskmode
= V4SImode
;
4936 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4937 vt
= force_reg (maskmode
, vt
);
4938 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4939 NULL_RTX
, 0, OPTAB_DIRECT
);
4941 if (GET_MODE (target
) != mode
)
4942 target
= gen_reg_rtx (mode
);
4944 xops
[1] = gen_lowpart (mode
, t2
);
4945 xops
[2] = gen_lowpart (mode
, t1
);
4946 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
4949 ok
= ix86_expand_int_vcond (xops
);
4951 if (target
!= operands
[0])
4952 emit_move_insn (operands
[0],
4953 gen_lowpart (GET_MODE (operands
[0]), target
));
4957 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
4958 true if we should do zero extension, else sign extension. HIGH_P is
4959 true if we want the N/2 high elements, else the low elements. */
4962 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
4964 machine_mode imode
= GET_MODE (src
);
4969 rtx (*unpack
)(rtx
, rtx
);
4970 rtx (*extract
)(rtx
, rtx
) = NULL
;
4971 machine_mode halfmode
= BLKmode
;
4977 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
4979 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
4980 halfmode
= V32QImode
;
4982 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
4986 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
4988 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
4989 halfmode
= V16QImode
;
4991 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
4995 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
4997 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
4998 halfmode
= V16HImode
;
5000 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
5004 unpack
= gen_avx2_zero_extendv8hiv8si2
;
5006 unpack
= gen_avx2_sign_extendv8hiv8si2
;
5007 halfmode
= V8HImode
;
5009 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
5013 unpack
= gen_avx512f_zero_extendv8siv8di2
;
5015 unpack
= gen_avx512f_sign_extendv8siv8di2
;
5016 halfmode
= V8SImode
;
5018 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
5022 unpack
= gen_avx2_zero_extendv4siv4di2
;
5024 unpack
= gen_avx2_sign_extendv4siv4di2
;
5025 halfmode
= V4SImode
;
5027 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
5031 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
5033 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
5037 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
5039 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5043 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5045 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5051 if (GET_MODE_SIZE (imode
) >= 32)
5053 tmp
= gen_reg_rtx (halfmode
);
5054 emit_insn (extract (tmp
, src
));
5058 /* Shift higher 8 bytes to lower 8 bytes. */
5059 tmp
= gen_reg_rtx (V1TImode
);
5060 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5062 tmp
= gen_lowpart (imode
, tmp
);
5067 emit_insn (unpack (dest
, tmp
));
5071 rtx (*unpack
)(rtx
, rtx
, rtx
);
5077 unpack
= gen_vec_interleave_highv16qi
;
5079 unpack
= gen_vec_interleave_lowv16qi
;
5083 unpack
= gen_vec_interleave_highv8hi
;
5085 unpack
= gen_vec_interleave_lowv8hi
;
5089 unpack
= gen_vec_interleave_highv4si
;
5091 unpack
= gen_vec_interleave_lowv4si
;
5098 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5100 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5101 src
, pc_rtx
, pc_rtx
);
5103 rtx tmp2
= gen_reg_rtx (imode
);
5104 emit_insn (unpack (tmp2
, src
, tmp
));
5105 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
5109 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5110 but works for floating pointer parameters and nonoffsetable memories.
5111 For pushes, it returns just stack offsets; the values will be saved
5112 in the right order. Maximally three parts are generated. */
5115 ix86_split_to_parts (rtx operand
, rtx
*parts
, machine_mode mode
)
5120 size
= mode
==XFmode
? 3 : GET_MODE_SIZE (mode
) / 4;
5122 size
= (GET_MODE_SIZE (mode
) + 4) / 8;
5124 gcc_assert (!REG_P (operand
) || !MMX_REGNO_P (REGNO (operand
)));
5125 gcc_assert (size
>= 2 && size
<= 4);
5127 /* Optimize constant pool reference to immediates. This is used by fp
5128 moves, that force all constants to memory to allow combining. */
5129 if (MEM_P (operand
) && MEM_READONLY_P (operand
))
5130 operand
= avoid_constant_pool_reference (operand
);
5132 if (MEM_P (operand
) && !offsettable_memref_p (operand
))
5134 /* The only non-offsetable memories we handle are pushes. */
5135 int ok
= push_operand (operand
, VOIDmode
);
5139 operand
= copy_rtx (operand
);
5140 PUT_MODE (operand
, word_mode
);
5141 parts
[0] = parts
[1] = parts
[2] = parts
[3] = operand
;
5145 if (GET_CODE (operand
) == CONST_VECTOR
)
5147 scalar_int_mode imode
= int_mode_for_mode (mode
).require ();
5148 /* Caution: if we looked through a constant pool memory above,
5149 the operand may actually have a different mode now. That's
5150 ok, since we want to pun this all the way back to an integer. */
5151 operand
= simplify_subreg (imode
, operand
, GET_MODE (operand
), 0);
5152 gcc_assert (operand
!= NULL
);
5159 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5164 if (REG_P (operand
))
5166 gcc_assert (reload_completed
);
5167 for (i
= 0; i
< size
; i
++)
5168 parts
[i
] = gen_rtx_REG (SImode
, REGNO (operand
) + i
);
5170 else if (offsettable_memref_p (operand
))
5172 operand
= adjust_address (operand
, SImode
, 0);
5174 for (i
= 1; i
< size
; i
++)
5175 parts
[i
] = adjust_address (operand
, SImode
, 4 * i
);
5177 else if (CONST_DOUBLE_P (operand
))
5179 const REAL_VALUE_TYPE
*r
;
5182 r
= CONST_DOUBLE_REAL_VALUE (operand
);
5186 real_to_target (l
, r
, mode
);
5187 parts
[3] = gen_int_mode (l
[3], SImode
);
5188 parts
[2] = gen_int_mode (l
[2], SImode
);
5191 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5192 long double may not be 80-bit. */
5193 real_to_target (l
, r
, mode
);
5194 parts
[2] = gen_int_mode (l
[2], SImode
);
5197 REAL_VALUE_TO_TARGET_DOUBLE (*r
, l
);
5202 parts
[1] = gen_int_mode (l
[1], SImode
);
5203 parts
[0] = gen_int_mode (l
[0], SImode
);
5212 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5213 if (mode
== XFmode
|| mode
== TFmode
)
5215 machine_mode upper_mode
= mode
==XFmode
? SImode
: DImode
;
5216 if (REG_P (operand
))
5218 gcc_assert (reload_completed
);
5219 parts
[0] = gen_rtx_REG (DImode
, REGNO (operand
) + 0);
5220 parts
[1] = gen_rtx_REG (upper_mode
, REGNO (operand
) + 1);
5222 else if (offsettable_memref_p (operand
))
5224 operand
= adjust_address (operand
, DImode
, 0);
5226 parts
[1] = adjust_address (operand
, upper_mode
, 8);
5228 else if (CONST_DOUBLE_P (operand
))
5232 real_to_target (l
, CONST_DOUBLE_REAL_VALUE (operand
), mode
);
5234 /* real_to_target puts 32-bit pieces in each long. */
5235 parts
[0] = gen_int_mode ((l
[0] & HOST_WIDE_INT_C (0xffffffff))
5236 | ((l
[1] & HOST_WIDE_INT_C (0xffffffff))
5239 if (upper_mode
== SImode
)
5240 parts
[1] = gen_int_mode (l
[2], SImode
);
5243 = gen_int_mode ((l
[2] & HOST_WIDE_INT_C (0xffffffff))
5244 | ((l
[3] & HOST_WIDE_INT_C (0xffffffff))
5255 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
5256 Return false when normal moves are needed; true when all required
5257 insns have been emitted. Operands 2-4 contain the input values
5258 int the correct order; operands 5-7 contain the output values. */
5261 ix86_split_long_move (rtx operands
[])
5267 machine_mode mode
= GET_MODE (operands
[0]);
5268 bool collisionparts
[4];
5270 /* The DFmode expanders may ask us to move double.
5271 For 64bit target this is single move. By hiding the fact
5272 here we simplify i386.md splitters. */
5273 if (TARGET_64BIT
&& GET_MODE_SIZE (GET_MODE (operands
[0])) == 8)
5275 /* Optimize constant pool reference to immediates. This is used by
5276 fp moves, that force all constants to memory to allow combining. */
5278 if (MEM_P (operands
[1])
5279 && GET_CODE (XEXP (operands
[1], 0)) == SYMBOL_REF
5280 && CONSTANT_POOL_ADDRESS_P (XEXP (operands
[1], 0)))
5281 operands
[1] = get_pool_constant (XEXP (operands
[1], 0));
5282 if (push_operand (operands
[0], VOIDmode
))
5284 operands
[0] = copy_rtx (operands
[0]);
5285 PUT_MODE (operands
[0], word_mode
);
5288 operands
[0] = gen_lowpart (DImode
, operands
[0]);
5289 operands
[1] = gen_lowpart (DImode
, operands
[1]);
5290 emit_move_insn (operands
[0], operands
[1]);
5294 /* The only non-offsettable memory we handle is push. */
5295 if (push_operand (operands
[0], VOIDmode
))
5298 gcc_assert (!MEM_P (operands
[0])
5299 || offsettable_memref_p (operands
[0]));
5301 nparts
= ix86_split_to_parts (operands
[1], part
[1], GET_MODE (operands
[0]));
5302 ix86_split_to_parts (operands
[0], part
[0], GET_MODE (operands
[0]));
5304 /* When emitting push, take care for source operands on the stack. */
5305 if (push
&& MEM_P (operands
[1])
5306 && reg_overlap_mentioned_p (stack_pointer_rtx
, operands
[1]))
5308 rtx src_base
= XEXP (part
[1][nparts
- 1], 0);
5310 /* Compensate for the stack decrement by 4. */
5311 if (!TARGET_64BIT
&& nparts
== 3
5312 && mode
== XFmode
&& TARGET_128BIT_LONG_DOUBLE
)
5313 src_base
= plus_constant (Pmode
, src_base
, 4);
5315 /* src_base refers to the stack pointer and is
5316 automatically decreased by emitted push. */
5317 for (i
= 0; i
< nparts
; i
++)
5318 part
[1][i
] = change_address (part
[1][i
],
5319 GET_MODE (part
[1][i
]), src_base
);
5322 /* We need to do copy in the right order in case an address register
5323 of the source overlaps the destination. */
5324 if (REG_P (part
[0][0]) && MEM_P (part
[1][0]))
5328 for (i
= 0; i
< nparts
; i
++)
5331 = reg_overlap_mentioned_p (part
[0][i
], XEXP (part
[1][0], 0));
5332 if (collisionparts
[i
])
5336 /* Collision in the middle part can be handled by reordering. */
5337 if (collisions
== 1 && nparts
== 3 && collisionparts
[1])
5339 std::swap (part
[0][1], part
[0][2]);
5340 std::swap (part
[1][1], part
[1][2]);
5342 else if (collisions
== 1
5344 && (collisionparts
[1] || collisionparts
[2]))
5346 if (collisionparts
[1])
5348 std::swap (part
[0][1], part
[0][2]);
5349 std::swap (part
[1][1], part
[1][2]);
5353 std::swap (part
[0][2], part
[0][3]);
5354 std::swap (part
[1][2], part
[1][3]);
5358 /* If there are more collisions, we can't handle it by reordering.
5359 Do an lea to the last part and use only one colliding move. */
5360 else if (collisions
> 1)
5366 base
= part
[0][nparts
- 1];
5368 /* Handle the case when the last part isn't valid for lea.
5369 Happens in 64-bit mode storing the 12-byte XFmode. */
5370 if (GET_MODE (base
) != Pmode
)
5371 base
= gen_rtx_REG (Pmode
, REGNO (base
));
5373 addr
= XEXP (part
[1][0], 0);
5374 if (TARGET_TLS_DIRECT_SEG_REFS
)
5376 struct ix86_address parts
;
5377 int ok
= ix86_decompose_address (addr
, &parts
);
5379 /* It is not valid to use %gs: or %fs: in lea. */
5380 gcc_assert (parts
.seg
== ADDR_SPACE_GENERIC
);
5382 emit_insn (gen_rtx_SET (base
, addr
));
5383 part
[1][0] = replace_equiv_address (part
[1][0], base
);
5384 for (i
= 1; i
< nparts
; i
++)
5386 tmp
= plus_constant (Pmode
, base
, UNITS_PER_WORD
* i
);
5387 part
[1][i
] = replace_equiv_address (part
[1][i
], tmp
);
5398 if (TARGET_128BIT_LONG_DOUBLE
&& mode
== XFmode
)
5399 emit_insn (gen_add2_insn (stack_pointer_rtx
, GEN_INT (-4)));
5400 emit_move_insn (part
[0][2], part
[1][2]);
5402 else if (nparts
== 4)
5404 emit_move_insn (part
[0][3], part
[1][3]);
5405 emit_move_insn (part
[0][2], part
[1][2]);
5410 /* In 64bit mode we don't have 32bit push available. In case this is
5411 register, it is OK - we will just use larger counterpart. We also
5412 retype memory - these comes from attempt to avoid REX prefix on
5413 moving of second half of TFmode value. */
5414 if (GET_MODE (part
[1][1]) == SImode
)
5416 switch (GET_CODE (part
[1][1]))
5419 part
[1][1] = adjust_address (part
[1][1], DImode
, 0);
5423 part
[1][1] = gen_rtx_REG (DImode
, REGNO (part
[1][1]));
5430 if (GET_MODE (part
[1][0]) == SImode
)
5431 part
[1][0] = part
[1][1];
5434 emit_move_insn (part
[0][1], part
[1][1]);
5435 emit_move_insn (part
[0][0], part
[1][0]);
5439 /* Choose correct order to not overwrite the source before it is copied. */
5440 if ((REG_P (part
[0][0])
5441 && REG_P (part
[1][1])
5442 && (REGNO (part
[0][0]) == REGNO (part
[1][1])
5444 && REGNO (part
[0][0]) == REGNO (part
[1][2]))
5446 && REGNO (part
[0][0]) == REGNO (part
[1][3]))))
5448 && reg_overlap_mentioned_p (part
[0][0], XEXP (part
[1][0], 0))))
5450 for (i
= 0, j
= nparts
- 1; i
< nparts
; i
++, j
--)
5452 operands
[2 + i
] = part
[0][j
];
5453 operands
[6 + i
] = part
[1][j
];
5458 for (i
= 0; i
< nparts
; i
++)
5460 operands
[2 + i
] = part
[0][i
];
5461 operands
[6 + i
] = part
[1][i
];
5465 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5466 if (optimize_insn_for_size_p ())
5468 for (j
= 0; j
< nparts
- 1; j
++)
5469 if (CONST_INT_P (operands
[6 + j
])
5470 && operands
[6 + j
] != const0_rtx
5471 && REG_P (operands
[2 + j
]))
5472 for (i
= j
; i
< nparts
- 1; i
++)
5473 if (CONST_INT_P (operands
[7 + i
])
5474 && INTVAL (operands
[7 + i
]) == INTVAL (operands
[6 + j
]))
5475 operands
[7 + i
] = operands
[2 + j
];
5478 for (i
= 0; i
< nparts
; i
++)
5479 emit_move_insn (operands
[2 + i
], operands
[6 + i
]);
5484 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5485 left shift by a constant, either using a single shift or
5486 a sequence of add instructions. */
5489 ix86_expand_ashl_const (rtx operand
, int count
, machine_mode mode
)
5492 || (count
* ix86_cost
->add
<= ix86_cost
->shift_const
5493 && !optimize_insn_for_size_p ()))
5496 emit_insn (gen_add2_insn (operand
, operand
));
5500 rtx (*insn
)(rtx
, rtx
, rtx
);
5502 insn
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
5503 emit_insn (insn (operand
, operand
, GEN_INT (count
)));
5508 ix86_split_ashl (rtx
*operands
, rtx scratch
, machine_mode mode
)
5510 rtx (*gen_ashl3
)(rtx
, rtx
, rtx
);
5511 rtx (*gen_shld
)(rtx
, rtx
, rtx
);
5512 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
5513 machine_mode half_mode
;
5515 rtx low
[2], high
[2];
5518 if (CONST_INT_P (operands
[2]))
5520 split_double_mode (mode
, operands
, 2, low
, high
);
5521 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
5523 if (count
>= half_width
)
5525 emit_move_insn (high
[0], low
[1]);
5526 emit_move_insn (low
[0], const0_rtx
);
5528 if (count
> half_width
)
5529 ix86_expand_ashl_const (high
[0], count
- half_width
, mode
);
5533 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
5535 if (!rtx_equal_p (operands
[0], operands
[1]))
5536 emit_move_insn (operands
[0], operands
[1]);
5538 emit_insn (gen_shld (high
[0], low
[0], GEN_INT (count
)));
5539 ix86_expand_ashl_const (low
[0], count
, mode
);
5544 split_double_mode (mode
, operands
, 1, low
, high
);
5545 half_mode
= mode
== DImode
? SImode
: DImode
;
5547 gen_ashl3
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
5549 if (operands
[1] == const1_rtx
)
5551 /* Assuming we've chosen a QImode capable registers, then 1 << N
5552 can be done with two 32/64-bit shifts, no branches, no cmoves. */
5553 if (ANY_QI_REG_P (low
[0]) && ANY_QI_REG_P (high
[0]))
5555 rtx s
, d
, flags
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
5557 ix86_expand_clear (low
[0]);
5558 ix86_expand_clear (high
[0]);
5559 emit_insn (gen_testqi_ccz_1 (operands
[2], GEN_INT (half_width
)));
5561 d
= gen_lowpart (QImode
, low
[0]);
5562 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
5563 s
= gen_rtx_EQ (QImode
, flags
, const0_rtx
);
5564 emit_insn (gen_rtx_SET (d
, s
));
5566 d
= gen_lowpart (QImode
, high
[0]);
5567 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
5568 s
= gen_rtx_NE (QImode
, flags
, const0_rtx
);
5569 emit_insn (gen_rtx_SET (d
, s
));
5572 /* Otherwise, we can get the same results by manually performing
5573 a bit extract operation on bit 5/6, and then performing the two
5574 shifts. The two methods of getting 0/1 into low/high are exactly
5575 the same size. Avoiding the shift in the bit extract case helps
5576 pentium4 a bit; no one else seems to care much either way. */
5579 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
);
5580 rtx (*gen_and3
)(rtx
, rtx
, rtx
);
5581 rtx (*gen_xor3
)(rtx
, rtx
, rtx
);
5587 gen_lshr3
= gen_lshrsi3
;
5588 gen_and3
= gen_andsi3
;
5589 gen_xor3
= gen_xorsi3
;
5594 gen_lshr3
= gen_lshrdi3
;
5595 gen_and3
= gen_anddi3
;
5596 gen_xor3
= gen_xordi3
;
5600 if (TARGET_PARTIAL_REG_STALL
&& !optimize_insn_for_size_p ())
5601 x
= gen_rtx_ZERO_EXTEND (half_mode
, operands
[2]);
5603 x
= gen_lowpart (half_mode
, operands
[2]);
5604 emit_insn (gen_rtx_SET (high
[0], x
));
5606 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (bits
)));
5607 emit_insn (gen_and3 (high
[0], high
[0], const1_rtx
));
5608 emit_move_insn (low
[0], high
[0]);
5609 emit_insn (gen_xor3 (low
[0], low
[0], const1_rtx
));
5612 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
5613 emit_insn (gen_ashl3 (high
[0], high
[0], operands
[2]));
5617 if (operands
[1] == constm1_rtx
)
5619 /* For -1 << N, we can avoid the shld instruction, because we
5620 know that we're shifting 0...31/63 ones into a -1. */
5621 emit_move_insn (low
[0], constm1_rtx
);
5622 if (optimize_insn_for_size_p ())
5623 emit_move_insn (high
[0], low
[0]);
5625 emit_move_insn (high
[0], constm1_rtx
);
5629 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
5631 if (!rtx_equal_p (operands
[0], operands
[1]))
5632 emit_move_insn (operands
[0], operands
[1]);
5634 split_double_mode (mode
, operands
, 1, low
, high
);
5635 emit_insn (gen_shld (high
[0], low
[0], operands
[2]));
5638 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
5640 if (TARGET_CMOVE
&& scratch
)
5642 ix86_expand_clear (scratch
);
5643 emit_insn (gen_x86_shift_adj_1
5644 (half_mode
, high
[0], low
[0], operands
[2], scratch
));
5647 emit_insn (gen_x86_shift_adj_2 (half_mode
, high
[0], low
[0], operands
[2]));
5651 ix86_split_ashr (rtx
*operands
, rtx scratch
, machine_mode mode
)
5653 rtx (*gen_ashr3
)(rtx
, rtx
, rtx
)
5654 = mode
== DImode
? gen_ashrsi3
: gen_ashrdi3
;
5655 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
5656 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
5658 rtx low
[2], high
[2];
5661 if (CONST_INT_P (operands
[2]))
5663 split_double_mode (mode
, operands
, 2, low
, high
);
5664 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
5666 if (count
== GET_MODE_BITSIZE (mode
) - 1)
5668 emit_move_insn (high
[0], high
[1]);
5669 emit_insn (gen_ashr3 (high
[0], high
[0],
5670 GEN_INT (half_width
- 1)));
5671 emit_move_insn (low
[0], high
[0]);
5674 else if (count
>= half_width
)
5676 emit_move_insn (low
[0], high
[1]);
5677 emit_move_insn (high
[0], low
[0]);
5678 emit_insn (gen_ashr3 (high
[0], high
[0],
5679 GEN_INT (half_width
- 1)));
5681 if (count
> half_width
)
5682 emit_insn (gen_ashr3 (low
[0], low
[0],
5683 GEN_INT (count
- half_width
)));
5687 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5689 if (!rtx_equal_p (operands
[0], operands
[1]))
5690 emit_move_insn (operands
[0], operands
[1]);
5692 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
5693 emit_insn (gen_ashr3 (high
[0], high
[0], GEN_INT (count
)));
5698 machine_mode half_mode
;
5700 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5702 if (!rtx_equal_p (operands
[0], operands
[1]))
5703 emit_move_insn (operands
[0], operands
[1]);
5705 split_double_mode (mode
, operands
, 1, low
, high
);
5706 half_mode
= mode
== DImode
? SImode
: DImode
;
5708 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
5709 emit_insn (gen_ashr3 (high
[0], high
[0], operands
[2]));
5711 if (TARGET_CMOVE
&& scratch
)
5713 emit_move_insn (scratch
, high
[0]);
5714 emit_insn (gen_ashr3 (scratch
, scratch
,
5715 GEN_INT (half_width
- 1)));
5716 emit_insn (gen_x86_shift_adj_1
5717 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
5720 emit_insn (gen_x86_shift_adj_3
5721 (half_mode
, low
[0], high
[0], operands
[2]));
5726 ix86_split_lshr (rtx
*operands
, rtx scratch
, machine_mode mode
)
5728 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
)
5729 = mode
== DImode
? gen_lshrsi3
: gen_lshrdi3
;
5730 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
5731 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
5733 rtx low
[2], high
[2];
5736 if (CONST_INT_P (operands
[2]))
5738 split_double_mode (mode
, operands
, 2, low
, high
);
5739 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
5741 if (count
>= half_width
)
5743 emit_move_insn (low
[0], high
[1]);
5744 ix86_expand_clear (high
[0]);
5746 if (count
> half_width
)
5747 emit_insn (gen_lshr3 (low
[0], low
[0],
5748 GEN_INT (count
- half_width
)));
5752 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5754 if (!rtx_equal_p (operands
[0], operands
[1]))
5755 emit_move_insn (operands
[0], operands
[1]);
5757 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
5758 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (count
)));
5763 machine_mode half_mode
;
5765 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5767 if (!rtx_equal_p (operands
[0], operands
[1]))
5768 emit_move_insn (operands
[0], operands
[1]);
5770 split_double_mode (mode
, operands
, 1, low
, high
);
5771 half_mode
= mode
== DImode
? SImode
: DImode
;
5773 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
5774 emit_insn (gen_lshr3 (high
[0], high
[0], operands
[2]));
5776 if (TARGET_CMOVE
&& scratch
)
5778 ix86_expand_clear (scratch
);
5779 emit_insn (gen_x86_shift_adj_1
5780 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
5783 emit_insn (gen_x86_shift_adj_2
5784 (half_mode
, low
[0], high
[0], operands
[2]));
5788 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
5789 DImode for constant loop counts. */
5792 counter_mode (rtx count_exp
)
5794 if (GET_MODE (count_exp
) != VOIDmode
)
5795 return GET_MODE (count_exp
);
5796 if (!CONST_INT_P (count_exp
))
5798 if (TARGET_64BIT
&& (INTVAL (count_exp
) & ~0xffffffff))
5803 /* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
5804 to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
5805 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
5806 memory by VALUE (supposed to be in MODE).
5808 The size is rounded down to whole number of chunk size moved at once.
5809 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
5813 expand_set_or_cpymem_via_loop (rtx destmem
, rtx srcmem
,
5814 rtx destptr
, rtx srcptr
, rtx value
,
5815 rtx count
, machine_mode mode
, int unroll
,
5816 int expected_size
, bool issetmem
)
5818 rtx_code_label
*out_label
, *top_label
;
5820 machine_mode iter_mode
= counter_mode (count
);
5821 int piece_size_n
= GET_MODE_SIZE (mode
) * unroll
;
5822 rtx piece_size
= GEN_INT (piece_size_n
);
5823 rtx piece_size_mask
= GEN_INT (~((GET_MODE_SIZE (mode
) * unroll
) - 1));
5827 top_label
= gen_label_rtx ();
5828 out_label
= gen_label_rtx ();
5829 iter
= gen_reg_rtx (iter_mode
);
5831 size
= expand_simple_binop (iter_mode
, AND
, count
, piece_size_mask
,
5832 NULL
, 1, OPTAB_DIRECT
);
5833 /* Those two should combine. */
5834 if (piece_size
== const1_rtx
)
5836 emit_cmp_and_jump_insns (size
, const0_rtx
, EQ
, NULL_RTX
, iter_mode
,
5838 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
5840 emit_move_insn (iter
, const0_rtx
);
5842 emit_label (top_label
);
5844 tmp
= convert_modes (Pmode
, iter_mode
, iter
, true);
5846 /* This assert could be relaxed - in this case we'll need to compute
5847 smallest power of two, containing in PIECE_SIZE_N and pass it to
5849 gcc_assert ((piece_size_n
& (piece_size_n
- 1)) == 0);
5850 destmem
= offset_address (destmem
, tmp
, piece_size_n
);
5851 destmem
= adjust_address (destmem
, mode
, 0);
5855 srcmem
= offset_address (srcmem
, copy_rtx (tmp
), piece_size_n
);
5856 srcmem
= adjust_address (srcmem
, mode
, 0);
5858 /* When unrolling for chips that reorder memory reads and writes,
5859 we can save registers by using single temporary.
5860 Also using 4 temporaries is overkill in 32bit mode. */
5861 if (!TARGET_64BIT
&& 0)
5863 for (i
= 0; i
< unroll
; i
++)
5867 destmem
= adjust_address (copy_rtx (destmem
), mode
,
5868 GET_MODE_SIZE (mode
));
5869 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
5870 GET_MODE_SIZE (mode
));
5872 emit_move_insn (destmem
, srcmem
);
5878 gcc_assert (unroll
<= 4);
5879 for (i
= 0; i
< unroll
; i
++)
5881 tmpreg
[i
] = gen_reg_rtx (mode
);
5883 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
5884 GET_MODE_SIZE (mode
));
5885 emit_move_insn (tmpreg
[i
], srcmem
);
5887 for (i
= 0; i
< unroll
; i
++)
5890 destmem
= adjust_address (copy_rtx (destmem
), mode
,
5891 GET_MODE_SIZE (mode
));
5892 emit_move_insn (destmem
, tmpreg
[i
]);
5897 for (i
= 0; i
< unroll
; i
++)
5900 destmem
= adjust_address (copy_rtx (destmem
), mode
,
5901 GET_MODE_SIZE (mode
));
5902 emit_move_insn (destmem
, value
);
5905 tmp
= expand_simple_binop (iter_mode
, PLUS
, iter
, piece_size
, iter
,
5906 true, OPTAB_LIB_WIDEN
);
5908 emit_move_insn (iter
, tmp
);
5910 emit_cmp_and_jump_insns (iter
, size
, LT
, NULL_RTX
, iter_mode
,
5912 if (expected_size
!= -1)
5914 expected_size
/= GET_MODE_SIZE (mode
) * unroll
;
5915 if (expected_size
== 0)
5917 else if (expected_size
> REG_BR_PROB_BASE
)
5918 predict_jump (REG_BR_PROB_BASE
- 1);
5920 predict_jump (REG_BR_PROB_BASE
- (REG_BR_PROB_BASE
+ expected_size
/ 2)
5924 predict_jump (REG_BR_PROB_BASE
* 80 / 100);
5925 iter
= ix86_zero_extend_to_Pmode (iter
);
5926 tmp
= expand_simple_binop (Pmode
, PLUS
, destptr
, iter
, destptr
,
5927 true, OPTAB_LIB_WIDEN
);
5929 emit_move_insn (destptr
, tmp
);
5932 tmp
= expand_simple_binop (Pmode
, PLUS
, srcptr
, iter
, srcptr
,
5933 true, OPTAB_LIB_WIDEN
);
5935 emit_move_insn (srcptr
, tmp
);
5937 emit_label (out_label
);
5940 /* Divide COUNTREG by SCALE. */
5942 scale_counter (rtx countreg
, int scale
)
5948 if (CONST_INT_P (countreg
))
5949 return GEN_INT (INTVAL (countreg
) / scale
);
5950 gcc_assert (REG_P (countreg
));
5952 sc
= expand_simple_binop (GET_MODE (countreg
), LSHIFTRT
, countreg
,
5953 GEN_INT (exact_log2 (scale
)),
5954 NULL
, 1, OPTAB_DIRECT
);
5958 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
5959 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
5960 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
5961 For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
5962 ORIG_VALUE is the original value passed to memset to fill the memory with.
5963 Other arguments have same meaning as for previous function. */
5966 expand_set_or_cpymem_via_rep (rtx destmem
, rtx srcmem
,
5967 rtx destptr
, rtx srcptr
, rtx value
, rtx orig_value
,
5969 machine_mode mode
, bool issetmem
)
5974 HOST_WIDE_INT rounded_count
;
5976 /* If possible, it is shorter to use rep movs.
5977 TODO: Maybe it is better to move this logic to decide_alg. */
5978 if (mode
== QImode
&& CONST_INT_P (count
) && !(INTVAL (count
) & 3)
5979 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
5980 && (!issetmem
|| orig_value
== const0_rtx
))
5983 if (destptr
!= XEXP (destmem
, 0) || GET_MODE (destmem
) != BLKmode
)
5984 destmem
= adjust_automodify_address_nv (destmem
, BLKmode
, destptr
, 0);
5986 countreg
= ix86_zero_extend_to_Pmode (scale_counter (count
,
5987 GET_MODE_SIZE (mode
)));
5990 destexp
= gen_rtx_ASHIFT (Pmode
, countreg
,
5991 GEN_INT (exact_log2 (GET_MODE_SIZE (mode
))));
5992 destexp
= gen_rtx_PLUS (Pmode
, destexp
, destptr
);
5995 destexp
= gen_rtx_PLUS (Pmode
, destptr
, countreg
);
5996 if ((!issetmem
|| orig_value
== const0_rtx
) && CONST_INT_P (count
))
5999 = ROUND_DOWN (INTVAL (count
), (HOST_WIDE_INT
) GET_MODE_SIZE (mode
));
6000 destmem
= shallow_copy_rtx (destmem
);
6001 set_mem_size (destmem
, rounded_count
);
6003 else if (MEM_SIZE_KNOWN_P (destmem
))
6004 clear_mem_size (destmem
);
6008 value
= force_reg (mode
, gen_lowpart (mode
, value
));
6009 emit_insn (gen_rep_stos (destptr
, countreg
, destmem
, value
, destexp
));
6013 if (srcptr
!= XEXP (srcmem
, 0) || GET_MODE (srcmem
) != BLKmode
)
6014 srcmem
= adjust_automodify_address_nv (srcmem
, BLKmode
, srcptr
, 0);
6017 srcexp
= gen_rtx_ASHIFT (Pmode
, countreg
,
6018 GEN_INT (exact_log2 (GET_MODE_SIZE (mode
))));
6019 srcexp
= gen_rtx_PLUS (Pmode
, srcexp
, srcptr
);
6022 srcexp
= gen_rtx_PLUS (Pmode
, srcptr
, countreg
);
6023 if (CONST_INT_P (count
))
6026 = ROUND_DOWN (INTVAL (count
), (HOST_WIDE_INT
) GET_MODE_SIZE (mode
));
6027 srcmem
= shallow_copy_rtx (srcmem
);
6028 set_mem_size (srcmem
, rounded_count
);
6032 if (MEM_SIZE_KNOWN_P (srcmem
))
6033 clear_mem_size (srcmem
);
6035 emit_insn (gen_rep_mov (destptr
, destmem
, srcptr
, srcmem
, countreg
,
6040 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
6042 SRC is passed by pointer to be updated on return.
6043 Return value is updated DST. */
6045 emit_memmov (rtx destmem
, rtx
*srcmem
, rtx destptr
, rtx srcptr
,
6046 HOST_WIDE_INT size_to_move
)
6048 rtx dst
= destmem
, src
= *srcmem
, tempreg
;
6049 enum insn_code code
;
6050 machine_mode move_mode
;
6053 /* Find the widest mode in which we could perform moves.
6054 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
6055 it until move of such size is supported. */
6056 piece_size
= 1 << floor_log2 (size_to_move
);
6057 while (!int_mode_for_size (piece_size
* BITS_PER_UNIT
, 0).exists (&move_mode
)
6058 || (code
= optab_handler (mov_optab
, move_mode
)) == CODE_FOR_nothing
)
6060 gcc_assert (piece_size
> 1);
6064 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6065 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6066 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
6068 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
6069 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
6070 || (code
= optab_handler (mov_optab
, move_mode
)) == CODE_FOR_nothing
)
6072 move_mode
= word_mode
;
6073 piece_size
= GET_MODE_SIZE (move_mode
);
6074 code
= optab_handler (mov_optab
, move_mode
);
6077 gcc_assert (code
!= CODE_FOR_nothing
);
6079 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
, 0);
6080 src
= adjust_automodify_address_nv (src
, move_mode
, srcptr
, 0);
6082 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
6083 gcc_assert (size_to_move
% piece_size
== 0);
6085 for (i
= 0; i
< size_to_move
; i
+= piece_size
)
6087 /* We move from memory to memory, so we'll need to do it via
6088 a temporary register. */
6089 tempreg
= gen_reg_rtx (move_mode
);
6090 emit_insn (GEN_FCN (code
) (tempreg
, src
));
6091 emit_insn (GEN_FCN (code
) (dst
, tempreg
));
6093 emit_move_insn (destptr
,
6094 plus_constant (Pmode
, copy_rtx (destptr
), piece_size
));
6095 emit_move_insn (srcptr
,
6096 plus_constant (Pmode
, copy_rtx (srcptr
), piece_size
));
6098 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
6100 src
= adjust_automodify_address_nv (src
, move_mode
, srcptr
,
6104 /* Update DST and SRC rtx. */
6109 /* Helper function for the string operations below. Dest VARIABLE whether
6110 it is aligned to VALUE bytes. If true, jump to the label. */
6112 static rtx_code_label
*
6113 ix86_expand_aligntest (rtx variable
, int value
, bool epilogue
)
6115 rtx_code_label
*label
= gen_label_rtx ();
6116 rtx tmpcount
= gen_reg_rtx (GET_MODE (variable
));
6117 if (GET_MODE (variable
) == DImode
)
6118 emit_insn (gen_anddi3 (tmpcount
, variable
, GEN_INT (value
)));
6120 emit_insn (gen_andsi3 (tmpcount
, variable
, GEN_INT (value
)));
6121 emit_cmp_and_jump_insns (tmpcount
, const0_rtx
, EQ
, 0, GET_MODE (variable
),
6124 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
6126 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
6131 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
6134 expand_cpymem_epilogue (rtx destmem
, rtx srcmem
,
6135 rtx destptr
, rtx srcptr
, rtx count
, int max_size
)
6138 if (CONST_INT_P (count
))
6140 HOST_WIDE_INT countval
= INTVAL (count
);
6141 HOST_WIDE_INT epilogue_size
= countval
% max_size
;
6144 /* For now MAX_SIZE should be a power of 2. This assert could be
6145 relaxed, but it'll require a bit more complicated epilogue
6147 gcc_assert ((max_size
& (max_size
- 1)) == 0);
6148 for (i
= max_size
; i
>= 1; i
>>= 1)
6150 if (epilogue_size
& i
)
6151 destmem
= emit_memmov (destmem
, &srcmem
, destptr
, srcptr
, i
);
6157 count
= expand_simple_binop (GET_MODE (count
), AND
, count
, GEN_INT (max_size
- 1),
6158 count
, 1, OPTAB_DIRECT
);
6159 expand_set_or_cpymem_via_loop (destmem
, srcmem
, destptr
, srcptr
, NULL
,
6160 count
, QImode
, 1, 4, false);
6164 /* When there are stringops, we can cheaply increase dest and src pointers.
6165 Otherwise we save code size by maintaining offset (zero is readily
6166 available from preceding rep operation) and using x86 addressing modes.
6168 if (TARGET_SINGLE_STRINGOP
)
6172 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
6173 src
= change_address (srcmem
, SImode
, srcptr
);
6174 dest
= change_address (destmem
, SImode
, destptr
);
6175 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
6177 LABEL_NUSES (label
) = 1;
6181 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
6182 src
= change_address (srcmem
, HImode
, srcptr
);
6183 dest
= change_address (destmem
, HImode
, destptr
);
6184 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
6186 LABEL_NUSES (label
) = 1;
6190 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
6191 src
= change_address (srcmem
, QImode
, srcptr
);
6192 dest
= change_address (destmem
, QImode
, destptr
);
6193 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
6195 LABEL_NUSES (label
) = 1;
6200 rtx offset
= force_reg (Pmode
, const0_rtx
);
6205 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
6206 src
= change_address (srcmem
, SImode
, srcptr
);
6207 dest
= change_address (destmem
, SImode
, destptr
);
6208 emit_move_insn (dest
, src
);
6209 tmp
= expand_simple_binop (Pmode
, PLUS
, offset
, GEN_INT (4), NULL
,
6210 true, OPTAB_LIB_WIDEN
);
6212 emit_move_insn (offset
, tmp
);
6214 LABEL_NUSES (label
) = 1;
6218 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
6219 tmp
= gen_rtx_PLUS (Pmode
, srcptr
, offset
);
6220 src
= change_address (srcmem
, HImode
, tmp
);
6221 tmp
= gen_rtx_PLUS (Pmode
, destptr
, offset
);
6222 dest
= change_address (destmem
, HImode
, tmp
);
6223 emit_move_insn (dest
, src
);
6224 tmp
= expand_simple_binop (Pmode
, PLUS
, offset
, GEN_INT (2), tmp
,
6225 true, OPTAB_LIB_WIDEN
);
6227 emit_move_insn (offset
, tmp
);
6229 LABEL_NUSES (label
) = 1;
6233 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
6234 tmp
= gen_rtx_PLUS (Pmode
, srcptr
, offset
);
6235 src
= change_address (srcmem
, QImode
, tmp
);
6236 tmp
= gen_rtx_PLUS (Pmode
, destptr
, offset
);
6237 dest
= change_address (destmem
, QImode
, tmp
);
6238 emit_move_insn (dest
, src
);
6240 LABEL_NUSES (label
) = 1;
6245 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
6246 with value PROMOTED_VAL.
6247 SRC is passed by pointer to be updated on return.
6248 Return value is updated DST. */
6250 emit_memset (rtx destmem
, rtx destptr
, rtx promoted_val
,
6251 HOST_WIDE_INT size_to_move
)
6254 enum insn_code code
;
6255 machine_mode move_mode
;
6258 /* Find the widest mode in which we could perform moves.
6259 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
6260 it until move of such size is supported. */
6261 move_mode
= GET_MODE (promoted_val
);
6262 if (move_mode
== VOIDmode
)
6264 if (size_to_move
< GET_MODE_SIZE (move_mode
))
6266 unsigned int move_bits
= size_to_move
* BITS_PER_UNIT
;
6267 move_mode
= int_mode_for_size (move_bits
, 0).require ();
6268 promoted_val
= gen_lowpart (move_mode
, promoted_val
);
6270 piece_size
= GET_MODE_SIZE (move_mode
);
6271 code
= optab_handler (mov_optab
, move_mode
);
6272 gcc_assert (code
!= CODE_FOR_nothing
&& promoted_val
!= NULL_RTX
);
6274 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
, 0);
6276 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
6277 gcc_assert (size_to_move
% piece_size
== 0);
6279 for (i
= 0; i
< size_to_move
; i
+= piece_size
)
6281 if (piece_size
<= GET_MODE_SIZE (word_mode
))
6283 emit_insn (gen_strset (destptr
, dst
, promoted_val
));
6284 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
6289 emit_insn (GEN_FCN (code
) (dst
, promoted_val
));
6291 emit_move_insn (destptr
,
6292 plus_constant (Pmode
, copy_rtx (destptr
), piece_size
));
6294 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
6298 /* Update DST rtx. */
6301 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
6303 expand_setmem_epilogue_via_loop (rtx destmem
, rtx destptr
, rtx value
,
6304 rtx count
, int max_size
)
6306 count
= expand_simple_binop (counter_mode (count
), AND
, count
,
6307 GEN_INT (max_size
- 1), count
, 1, OPTAB_DIRECT
);
6308 expand_set_or_cpymem_via_loop (destmem
, NULL
, destptr
, NULL
,
6309 gen_lowpart (QImode
, value
), count
, QImode
,
6310 1, max_size
/ 2, true);
6313 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
6315 expand_setmem_epilogue (rtx destmem
, rtx destptr
, rtx value
, rtx vec_value
,
6316 rtx count
, int max_size
)
6320 if (CONST_INT_P (count
))
6322 HOST_WIDE_INT countval
= INTVAL (count
);
6323 HOST_WIDE_INT epilogue_size
= countval
% max_size
;
6326 /* For now MAX_SIZE should be a power of 2. This assert could be
6327 relaxed, but it'll require a bit more complicated epilogue
6329 gcc_assert ((max_size
& (max_size
- 1)) == 0);
6330 for (i
= max_size
; i
>= 1; i
>>= 1)
6332 if (epilogue_size
& i
)
6334 if (vec_value
&& i
> GET_MODE_SIZE (GET_MODE (value
)))
6335 destmem
= emit_memset (destmem
, destptr
, vec_value
, i
);
6337 destmem
= emit_memset (destmem
, destptr
, value
, i
);
6344 expand_setmem_epilogue_via_loop (destmem
, destptr
, value
, count
, max_size
);
6349 rtx_code_label
*label
= ix86_expand_aligntest (count
, 16, true);
6352 dest
= change_address (destmem
, DImode
, destptr
);
6353 emit_insn (gen_strset (destptr
, dest
, value
));
6354 dest
= adjust_automodify_address_nv (dest
, DImode
, destptr
, 8);
6355 emit_insn (gen_strset (destptr
, dest
, value
));
6359 dest
= change_address (destmem
, SImode
, destptr
);
6360 emit_insn (gen_strset (destptr
, dest
, value
));
6361 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 4);
6362 emit_insn (gen_strset (destptr
, dest
, value
));
6363 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 8);
6364 emit_insn (gen_strset (destptr
, dest
, value
));
6365 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 12);
6366 emit_insn (gen_strset (destptr
, dest
, value
));
6369 LABEL_NUSES (label
) = 1;
6373 rtx_code_label
*label
= ix86_expand_aligntest (count
, 8, true);
6376 dest
= change_address (destmem
, DImode
, destptr
);
6377 emit_insn (gen_strset (destptr
, dest
, value
));
6381 dest
= change_address (destmem
, SImode
, destptr
);
6382 emit_insn (gen_strset (destptr
, dest
, value
));
6383 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 4);
6384 emit_insn (gen_strset (destptr
, dest
, value
));
6387 LABEL_NUSES (label
) = 1;
6391 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
6392 dest
= change_address (destmem
, SImode
, destptr
);
6393 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (SImode
, value
)));
6395 LABEL_NUSES (label
) = 1;
6399 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
6400 dest
= change_address (destmem
, HImode
, destptr
);
6401 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (HImode
, value
)));
6403 LABEL_NUSES (label
) = 1;
6407 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
6408 dest
= change_address (destmem
, QImode
, destptr
);
6409 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (QImode
, value
)));
6411 LABEL_NUSES (label
) = 1;
6415 /* Adjust COUNTER by the VALUE. */
6417 ix86_adjust_counter (rtx countreg
, HOST_WIDE_INT value
)
6419 emit_insn (gen_add2_insn (countreg
, GEN_INT (-value
)));
6422 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
6423 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
6424 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
6426 Return value is updated DESTMEM. */
6429 expand_set_or_cpymem_prologue (rtx destmem
, rtx srcmem
,
6430 rtx destptr
, rtx srcptr
, rtx value
,
6431 rtx vec_value
, rtx count
, int align
,
6432 int desired_alignment
, bool issetmem
)
6435 for (i
= 1; i
< desired_alignment
; i
<<= 1)
6439 rtx_code_label
*label
= ix86_expand_aligntest (destptr
, i
, false);
6442 if (vec_value
&& i
> GET_MODE_SIZE (GET_MODE (value
)))
6443 destmem
= emit_memset (destmem
, destptr
, vec_value
, i
);
6445 destmem
= emit_memset (destmem
, destptr
, value
, i
);
6448 destmem
= emit_memmov (destmem
, &srcmem
, destptr
, srcptr
, i
);
6449 ix86_adjust_counter (count
, i
);
6451 LABEL_NUSES (label
) = 1;
6452 set_mem_align (destmem
, i
* 2 * BITS_PER_UNIT
);
6458 /* Test if COUNT&SIZE is nonzero and if so, expand movme
6459 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
6460 and jump to DONE_LABEL. */
6462 expand_small_cpymem_or_setmem (rtx destmem
, rtx srcmem
,
6463 rtx destptr
, rtx srcptr
,
6464 rtx value
, rtx vec_value
,
6465 rtx count
, int size
,
6466 rtx done_label
, bool issetmem
)
6468 rtx_code_label
*label
= ix86_expand_aligntest (count
, size
, false);
6469 machine_mode mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 1).else_blk ();
6473 /* If we do not have vector value to copy, we must reduce size. */
6478 if (GET_MODE (value
) == VOIDmode
&& size
> 8)
6480 else if (GET_MODE_SIZE (mode
) > GET_MODE_SIZE (GET_MODE (value
)))
6481 mode
= GET_MODE (value
);
6484 mode
= GET_MODE (vec_value
), value
= vec_value
;
6488 /* Choose appropriate vector mode. */
6490 mode
= TARGET_AVX
? V32QImode
: TARGET_SSE
? V16QImode
: DImode
;
6491 else if (size
>= 16)
6492 mode
= TARGET_SSE
? V16QImode
: DImode
;
6493 srcmem
= change_address (srcmem
, mode
, srcptr
);
6495 destmem
= change_address (destmem
, mode
, destptr
);
6496 modesize
= GEN_INT (GET_MODE_SIZE (mode
));
6497 gcc_assert (GET_MODE_SIZE (mode
) <= size
);
6498 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
6501 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
6504 emit_move_insn (destmem
, srcmem
);
6505 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
6507 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
6510 destmem
= offset_address (destmem
, count
, 1);
6511 destmem
= offset_address (destmem
, GEN_INT (-2 * size
),
6512 GET_MODE_SIZE (mode
));
6515 srcmem
= offset_address (srcmem
, count
, 1);
6516 srcmem
= offset_address (srcmem
, GEN_INT (-2 * size
),
6517 GET_MODE_SIZE (mode
));
6519 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
6522 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
6525 emit_move_insn (destmem
, srcmem
);
6526 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
6528 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
6530 emit_jump_insn (gen_jump (done_label
));
6534 LABEL_NUSES (label
) = 1;
6537 /* Handle small memcpy (up to SIZE that is supposed to be small power of 2.
6538 and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN
6539 bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can
6540 proceed with an loop copying SIZE bytes at once. Do moves in MODE.
6541 DONE_LABEL is a label after the whole copying sequence. The label is created
6542 on demand if *DONE_LABEL is NULL.
6543 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
6544 bounds after the initial copies.
6546 DESTMEM/SRCMEM are memory expressions pointing to the copies block,
6547 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
6548 we will dispatch to a library call for large blocks.
6550 In pseudocode we do:
6554 Assume that SIZE is 4. Bigger sizes are handled analogously
6557 copy 4 bytes from SRCPTR to DESTPTR
6558 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6563 copy 1 byte from SRCPTR to DESTPTR
6566 copy 2 bytes from SRCPTR to DESTPTR
6567 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6572 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6573 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
6575 OLD_DESPTR = DESTPTR;
6576 Align DESTPTR up to DESIRED_ALIGN
6577 SRCPTR += DESTPTR - OLD_DESTPTR
6578 COUNT -= DEST_PTR - OLD_DESTPTR
6580 Round COUNT down to multiple of SIZE
6581 << optional caller supplied zero size guard is here >>
6582 << optional caller supplied dynamic check is here >>
6583 << caller supplied main copy loop is here >>
6588 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem
, rtx srcmem
,
6589 rtx
*destptr
, rtx
*srcptr
,
6591 rtx value
, rtx vec_value
,
6593 rtx_code_label
**done_label
,
6597 unsigned HOST_WIDE_INT
*min_size
,
6601 rtx_code_label
*loop_label
= NULL
, *label
;
6604 int prolog_size
= 0;
6607 /* Chose proper value to copy. */
6608 if (issetmem
&& VECTOR_MODE_P (mode
))
6609 mode_value
= vec_value
;
6612 gcc_assert (GET_MODE_SIZE (mode
) <= size
);
6614 /* See if block is big or small, handle small blocks. */
6615 if (!CONST_INT_P (*count
) && *min_size
< (unsigned HOST_WIDE_INT
)size
)
6618 loop_label
= gen_label_rtx ();
6621 *done_label
= gen_label_rtx ();
6623 emit_cmp_and_jump_insns (*count
, GEN_INT (size2
), GE
, 0, GET_MODE (*count
),
6627 /* Handle sizes > 3. */
6628 for (;size2
> 2; size2
>>= 1)
6629 expand_small_cpymem_or_setmem (destmem
, srcmem
,
6633 size2
, *done_label
, issetmem
);
6634 /* Nothing to copy? Jump to DONE_LABEL if so */
6635 emit_cmp_and_jump_insns (*count
, const0_rtx
, EQ
, 0, GET_MODE (*count
),
6638 /* Do a byte copy. */
6639 destmem
= change_address (destmem
, QImode
, *destptr
);
6641 emit_move_insn (destmem
, gen_lowpart (QImode
, value
));
6644 srcmem
= change_address (srcmem
, QImode
, *srcptr
);
6645 emit_move_insn (destmem
, srcmem
);
6648 /* Handle sizes 2 and 3. */
6649 label
= ix86_expand_aligntest (*count
, 2, false);
6650 destmem
= change_address (destmem
, HImode
, *destptr
);
6651 destmem
= offset_address (destmem
, *count
, 1);
6652 destmem
= offset_address (destmem
, GEN_INT (-2), 2);
6654 emit_move_insn (destmem
, gen_lowpart (HImode
, value
));
6657 srcmem
= change_address (srcmem
, HImode
, *srcptr
);
6658 srcmem
= offset_address (srcmem
, *count
, 1);
6659 srcmem
= offset_address (srcmem
, GEN_INT (-2), 2);
6660 emit_move_insn (destmem
, srcmem
);
6664 LABEL_NUSES (label
) = 1;
6665 emit_jump_insn (gen_jump (*done_label
));
6669 gcc_assert (*min_size
>= (unsigned HOST_WIDE_INT
)size
6670 || UINTVAL (*count
) >= (unsigned HOST_WIDE_INT
)size
);
6672 /* Start memcpy for COUNT >= SIZE. */
6675 emit_label (loop_label
);
6676 LABEL_NUSES (loop_label
) = 1;
6679 /* Copy first desired_align bytes. */
6681 srcmem
= change_address (srcmem
, mode
, *srcptr
);
6682 destmem
= change_address (destmem
, mode
, *destptr
);
6683 modesize
= GEN_INT (GET_MODE_SIZE (mode
));
6684 for (n
= 0; prolog_size
< desired_align
- align
; n
++)
6687 emit_move_insn (destmem
, mode_value
);
6690 emit_move_insn (destmem
, srcmem
);
6691 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
6693 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
6694 prolog_size
+= GET_MODE_SIZE (mode
);
6698 /* Copy last SIZE bytes. */
6699 destmem
= offset_address (destmem
, *count
, 1);
6700 destmem
= offset_address (destmem
,
6701 GEN_INT (-size
- prolog_size
),
6704 emit_move_insn (destmem
, mode_value
);
6707 srcmem
= offset_address (srcmem
, *count
, 1);
6708 srcmem
= offset_address (srcmem
,
6709 GEN_INT (-size
- prolog_size
),
6711 emit_move_insn (destmem
, srcmem
);
6713 for (n
= 1; n
* GET_MODE_SIZE (mode
) < size
; n
++)
6715 destmem
= offset_address (destmem
, modesize
, 1);
6717 emit_move_insn (destmem
, mode_value
);
6720 srcmem
= offset_address (srcmem
, modesize
, 1);
6721 emit_move_insn (destmem
, srcmem
);
6725 /* Align destination. */
6726 if (desired_align
> 1 && desired_align
> align
)
6728 rtx saveddest
= *destptr
;
6730 gcc_assert (desired_align
<= size
);
6731 /* Align destptr up, place it to new register. */
6732 *destptr
= expand_simple_binop (GET_MODE (*destptr
), PLUS
, *destptr
,
6733 GEN_INT (prolog_size
),
6734 NULL_RTX
, 1, OPTAB_DIRECT
);
6735 if (REG_P (*destptr
) && REG_P (saveddest
) && REG_POINTER (saveddest
))
6736 REG_POINTER (*destptr
) = 1;
6737 *destptr
= expand_simple_binop (GET_MODE (*destptr
), AND
, *destptr
,
6738 GEN_INT (-desired_align
),
6739 *destptr
, 1, OPTAB_DIRECT
);
6740 /* See how many bytes we skipped. */
6741 saveddest
= expand_simple_binop (GET_MODE (*destptr
), MINUS
, saveddest
,
6743 saveddest
, 1, OPTAB_DIRECT
);
6744 /* Adjust srcptr and count. */
6746 *srcptr
= expand_simple_binop (GET_MODE (*srcptr
), MINUS
, *srcptr
,
6747 saveddest
, *srcptr
, 1, OPTAB_DIRECT
);
6748 *count
= expand_simple_binop (GET_MODE (*count
), PLUS
, *count
,
6749 saveddest
, *count
, 1, OPTAB_DIRECT
);
6750 /* We copied at most size + prolog_size. */
6751 if (*min_size
> (unsigned HOST_WIDE_INT
)(size
+ prolog_size
))
6753 = ROUND_DOWN (*min_size
- size
, (unsigned HOST_WIDE_INT
)size
);
6757 /* Our loops always round down the block size, but for dispatch to
6758 library we need precise value. */
6760 *count
= expand_simple_binop (GET_MODE (*count
), AND
, *count
,
6761 GEN_INT (-size
), *count
, 1, OPTAB_DIRECT
);
6765 gcc_assert (prolog_size
== 0);
6766 /* Decrease count, so we won't end up copying last word twice. */
6767 if (!CONST_INT_P (*count
))
6768 *count
= expand_simple_binop (GET_MODE (*count
), PLUS
, *count
,
6769 constm1_rtx
, *count
, 1, OPTAB_DIRECT
);
6771 *count
= GEN_INT (ROUND_DOWN (UINTVAL (*count
) - 1,
6772 (unsigned HOST_WIDE_INT
)size
));
6774 *min_size
= ROUND_DOWN (*min_size
- 1, (unsigned HOST_WIDE_INT
)size
);
6779 /* This function is like the previous one, except here we know how many bytes
6780 need to be copied. That allows us to update alignment not only of DST, which
6781 is returned, but also of SRC, which is passed as a pointer for that
6784 expand_set_or_cpymem_constant_prologue (rtx dst
, rtx
*srcp
, rtx destreg
,
6785 rtx srcreg
, rtx value
, rtx vec_value
,
6786 int desired_align
, int align_bytes
,
6791 rtx orig_src
= NULL
;
6793 int copied_bytes
= 0;
6797 gcc_assert (srcp
!= NULL
);
6802 for (piece_size
= 1;
6803 piece_size
<= desired_align
&& copied_bytes
< align_bytes
;
6806 if (align_bytes
& piece_size
)
6810 if (vec_value
&& piece_size
> GET_MODE_SIZE (GET_MODE (value
)))
6811 dst
= emit_memset (dst
, destreg
, vec_value
, piece_size
);
6813 dst
= emit_memset (dst
, destreg
, value
, piece_size
);
6816 dst
= emit_memmov (dst
, &src
, destreg
, srcreg
, piece_size
);
6817 copied_bytes
+= piece_size
;
6820 if (MEM_ALIGN (dst
) < (unsigned int) desired_align
* BITS_PER_UNIT
)
6821 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
6822 if (MEM_SIZE_KNOWN_P (orig_dst
))
6823 set_mem_size (dst
, MEM_SIZE (orig_dst
) - align_bytes
);
6827 int src_align_bytes
= get_mem_align_offset (src
, desired_align
6829 if (src_align_bytes
>= 0)
6830 src_align_bytes
= desired_align
- src_align_bytes
;
6831 if (src_align_bytes
>= 0)
6833 unsigned int src_align
;
6834 for (src_align
= desired_align
; src_align
>= 2; src_align
>>= 1)
6836 if ((src_align_bytes
& (src_align
- 1))
6837 == (align_bytes
& (src_align
- 1)))
6840 if (src_align
> (unsigned int) desired_align
)
6841 src_align
= desired_align
;
6842 if (MEM_ALIGN (src
) < src_align
* BITS_PER_UNIT
)
6843 set_mem_align (src
, src_align
* BITS_PER_UNIT
);
6845 if (MEM_SIZE_KNOWN_P (orig_src
))
6846 set_mem_size (src
, MEM_SIZE (orig_src
) - align_bytes
);
6853 /* Return true if ALG can be used in current context.
6854 Assume we expand memset if MEMSET is true. */
6856 alg_usable_p (enum stringop_alg alg
, bool memset
, bool have_as
)
6858 if (alg
== no_stringop
)
6860 if (alg
== vector_loop
)
6861 return TARGET_SSE
|| TARGET_AVX
;
6862 /* Algorithms using the rep prefix want at least edi and ecx;
6863 additionally, memset wants eax and memcpy wants esi. Don't
6864 consider such algorithms if the user has appropriated those
6865 registers for their own purposes, or if we have a non-default
6866 address space, since some string insns cannot override the segment. */
6867 if (alg
== rep_prefix_1_byte
6868 || alg
== rep_prefix_4_byte
6869 || alg
== rep_prefix_8_byte
)
6873 if (fixed_regs
[CX_REG
]
6874 || fixed_regs
[DI_REG
]
6875 || (memset
? fixed_regs
[AX_REG
] : fixed_regs
[SI_REG
]))
6881 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6882 static enum stringop_alg
6883 decide_alg (HOST_WIDE_INT count
, HOST_WIDE_INT expected_size
,
6884 unsigned HOST_WIDE_INT min_size
, unsigned HOST_WIDE_INT max_size
,
6885 bool memset
, bool zero_memset
, bool have_as
,
6886 int *dynamic_check
, bool *noalign
, bool recur
)
6888 const struct stringop_algs
*algs
;
6889 bool optimize_for_speed
;
6891 const struct processor_costs
*cost
;
6893 bool any_alg_usable_p
= false;
6896 *dynamic_check
= -1;
6898 /* Even if the string operation call is cold, we still might spend a lot
6899 of time processing large blocks. */
6900 if (optimize_function_for_size_p (cfun
)
6901 || (optimize_insn_for_size_p ()
6903 || (expected_size
!= -1 && expected_size
< 256))))
6904 optimize_for_speed
= false;
6906 optimize_for_speed
= true;
6908 cost
= optimize_for_speed
? ix86_cost
: &ix86_size_cost
;
6910 algs
= &cost
->memset
[TARGET_64BIT
!= 0];
6912 algs
= &cost
->memcpy
[TARGET_64BIT
!= 0];
6914 /* See maximal size for user defined algorithm. */
6915 for (i
= 0; i
< MAX_STRINGOP_ALGS
; i
++)
6917 enum stringop_alg candidate
= algs
->size
[i
].alg
;
6918 bool usable
= alg_usable_p (candidate
, memset
, have_as
);
6919 any_alg_usable_p
|= usable
;
6921 if (candidate
!= libcall
&& candidate
&& usable
)
6922 max
= algs
->size
[i
].max
;
6925 /* If expected size is not known but max size is small enough
6926 so inline version is a win, set expected size into
6928 if (((max
> 1 && (unsigned HOST_WIDE_INT
) max
>= max_size
) || max
== -1)
6929 && expected_size
== -1)
6930 expected_size
= min_size
/ 2 + max_size
/ 2;
6932 /* If user specified the algorithm, honor it if possible. */
6933 if (ix86_stringop_alg
!= no_stringop
6934 && alg_usable_p (ix86_stringop_alg
, memset
, have_as
))
6935 return ix86_stringop_alg
;
6936 /* rep; movq or rep; movl is the smallest variant. */
6937 else if (!optimize_for_speed
)
6940 if (!count
|| (count
& 3) || (memset
&& !zero_memset
))
6941 return alg_usable_p (rep_prefix_1_byte
, memset
, have_as
)
6942 ? rep_prefix_1_byte
: loop_1_byte
;
6944 return alg_usable_p (rep_prefix_4_byte
, memset
, have_as
)
6945 ? rep_prefix_4_byte
: loop
;
6947 /* Very tiny blocks are best handled via the loop, REP is expensive to
6949 else if (expected_size
!= -1 && expected_size
< 4)
6951 else if (expected_size
!= -1)
6953 enum stringop_alg alg
= libcall
;
6954 bool alg_noalign
= false;
6955 for (i
= 0; i
< MAX_STRINGOP_ALGS
; i
++)
6957 /* We get here if the algorithms that were not libcall-based
6958 were rep-prefix based and we are unable to use rep prefixes
6959 based on global register usage. Break out of the loop and
6960 use the heuristic below. */
6961 if (algs
->size
[i
].max
== 0)
6963 if (algs
->size
[i
].max
>= expected_size
|| algs
->size
[i
].max
== -1)
6965 enum stringop_alg candidate
= algs
->size
[i
].alg
;
6967 if (candidate
!= libcall
6968 && alg_usable_p (candidate
, memset
, have_as
))
6971 alg_noalign
= algs
->size
[i
].noalign
;
6973 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6974 last non-libcall inline algorithm. */
6975 if (TARGET_INLINE_ALL_STRINGOPS
)
6977 /* When the current size is best to be copied by a libcall,
6978 but we are still forced to inline, run the heuristic below
6979 that will pick code for medium sized blocks. */
6982 *noalign
= alg_noalign
;
6985 else if (!any_alg_usable_p
)
6988 else if (alg_usable_p (candidate
, memset
, have_as
)
6989 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
6990 && candidate
== rep_prefix_1_byte
6991 /* NB: If min_size != max_size, size is
6993 && min_size
!= max_size
))
6995 *noalign
= algs
->size
[i
].noalign
;
7001 /* When asked to inline the call anyway, try to pick meaningful choice.
7002 We look for maximal size of block that is faster to copy by hand and
7003 take blocks of at most of that size guessing that average size will
7004 be roughly half of the block.
7006 If this turns out to be bad, we might simply specify the preferred
7007 choice in ix86_costs. */
7008 if ((TARGET_INLINE_ALL_STRINGOPS
|| TARGET_INLINE_STRINGOPS_DYNAMICALLY
)
7009 && (algs
->unknown_size
== libcall
7010 || !alg_usable_p (algs
->unknown_size
, memset
, have_as
)))
7012 enum stringop_alg alg
;
7013 HOST_WIDE_INT new_expected_size
= (max
> 0 ? max
: 4096) / 2;
7015 /* If there aren't any usable algorithms or if recursing already,
7016 then recursing on smaller sizes or same size isn't going to
7017 find anything. Just return the simple byte-at-a-time copy loop. */
7018 if (!any_alg_usable_p
|| recur
)
7020 /* Pick something reasonable. */
7021 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY
&& !recur
)
7022 *dynamic_check
= 128;
7025 alg
= decide_alg (count
, new_expected_size
, min_size
, max_size
, memset
,
7026 zero_memset
, have_as
, dynamic_check
, noalign
, true);
7027 gcc_assert (*dynamic_check
== -1);
7028 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY
)
7029 *dynamic_check
= max
;
7031 gcc_assert (alg
!= libcall
);
7034 return (alg_usable_p (algs
->unknown_size
, memset
, have_as
)
7035 ? algs
->unknown_size
: libcall
);
7038 /* Decide on alignment. We know that the operand is already aligned to ALIGN
7039 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
7041 decide_alignment (int align
,
7042 enum stringop_alg alg
,
7044 machine_mode move_mode
)
7046 int desired_align
= 0;
7048 gcc_assert (alg
!= no_stringop
);
7052 if (move_mode
== VOIDmode
)
7055 desired_align
= GET_MODE_SIZE (move_mode
);
7056 /* PentiumPro has special logic triggering for 8 byte aligned blocks.
7057 copying whole cacheline at once. */
7058 if (TARGET_PENTIUMPRO
7059 && (alg
== rep_prefix_4_byte
|| alg
== rep_prefix_1_byte
))
7064 if (desired_align
< align
)
7065 desired_align
= align
;
7066 if (expected_size
!= -1 && expected_size
< 4)
7067 desired_align
= align
;
7069 return desired_align
;
7073 /* Helper function for memcpy. For QImode value 0xXY produce
7074 0xXYXYXYXY of wide specified by MODE. This is essentially
7075 a * 0x10101010, but we can do slightly better than
7076 synth_mult by unwinding the sequence by hand on CPUs with
7079 promote_duplicated_reg (machine_mode mode
, rtx val
)
7081 machine_mode valmode
= GET_MODE (val
);
7083 int nops
= mode
== DImode
? 3 : 2;
7085 gcc_assert (mode
== SImode
|| mode
== DImode
|| val
== const0_rtx
);
7086 if (val
== const0_rtx
)
7087 return copy_to_mode_reg (mode
, CONST0_RTX (mode
));
7088 if (CONST_INT_P (val
))
7090 HOST_WIDE_INT v
= INTVAL (val
) & 255;
7095 v
|= (v
<< 16) << 16;
7096 return copy_to_mode_reg (mode
, gen_int_mode (v
, mode
));
7099 if (valmode
== VOIDmode
)
7101 if (valmode
!= QImode
)
7102 val
= gen_lowpart (QImode
, val
);
7105 if (!TARGET_PARTIAL_REG_STALL
)
7107 if (ix86_cost
->mult_init
[mode
== DImode
? 3 : 2]
7108 + ix86_cost
->mult_bit
* (mode
== DImode
? 8 : 4)
7109 <= (ix86_cost
->shift_const
+ ix86_cost
->add
) * nops
7110 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL
== 0)))
7112 rtx reg
= convert_modes (mode
, QImode
, val
, true);
7113 tmp
= promote_duplicated_reg (mode
, const1_rtx
);
7114 return expand_simple_binop (mode
, MULT
, reg
, tmp
, NULL
, 1,
7119 rtx reg
= convert_modes (mode
, QImode
, val
, true);
7121 if (!TARGET_PARTIAL_REG_STALL
)
7122 emit_insn (gen_insv_1 (mode
, reg
, reg
));
7125 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (8),
7126 NULL
, 1, OPTAB_DIRECT
);
7127 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1,
7130 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (16),
7131 NULL
, 1, OPTAB_DIRECT
);
7132 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1, OPTAB_DIRECT
);
7135 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (32),
7136 NULL
, 1, OPTAB_DIRECT
);
7137 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1, OPTAB_DIRECT
);
7142 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
7143 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
7144 alignment from ALIGN to DESIRED_ALIGN. */
7146 promote_duplicated_reg_to_size (rtx val
, int size_needed
, int desired_align
,
7152 && (size_needed
> 4 || (desired_align
> align
&& desired_align
> 4)))
7153 promoted_val
= promote_duplicated_reg (DImode
, val
);
7154 else if (size_needed
> 2 || (desired_align
> align
&& desired_align
> 2))
7155 promoted_val
= promote_duplicated_reg (SImode
, val
);
7156 else if (size_needed
> 1 || (desired_align
> align
&& desired_align
> 1))
7157 promoted_val
= promote_duplicated_reg (HImode
, val
);
7161 return promoted_val
;
7164 /* Copy the address to a Pmode register. This is used for x32 to
7165 truncate DImode TLS address to a SImode register. */
7168 ix86_copy_addr_to_reg (rtx addr
)
7171 if (GET_MODE (addr
) == Pmode
|| GET_MODE (addr
) == VOIDmode
)
7173 reg
= copy_addr_to_reg (addr
);
7174 REG_POINTER (reg
) = 1;
7179 gcc_assert (GET_MODE (addr
) == DImode
&& Pmode
== SImode
);
7180 reg
= copy_to_mode_reg (DImode
, addr
);
7181 REG_POINTER (reg
) = 1;
7182 return gen_rtx_SUBREG (SImode
, reg
, 0);
7186 /* Expand string move (memcpy) ot store (memset) operation. Use i386 string
7187 operations when profitable. The code depends upon architecture, block size
7188 and alignment, but always has one of the following overall structures:
7190 Aligned move sequence:
7192 1) Prologue guard: Conditional that jumps up to epilogues for small
7193 blocks that can be handled by epilogue alone. This is faster
7194 but also needed for correctness, since prologue assume the block
7195 is larger than the desired alignment.
7197 Optional dynamic check for size and libcall for large
7198 blocks is emitted here too, with -minline-stringops-dynamically.
7200 2) Prologue: copy first few bytes in order to get destination
7201 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
7202 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
7203 copied. We emit either a jump tree on power of two sized
7204 blocks, or a byte loop.
7206 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7207 with specified algorithm.
7209 4) Epilogue: code copying tail of the block that is too small to be
7210 handled by main body (or up to size guarded by prologue guard).
7212 Misaligned move sequence
7214 1) missaligned move prologue/epilogue containing:
7215 a) Prologue handling small memory blocks and jumping to done_label
7216 (skipped if blocks are known to be large enough)
7217 b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
7218 needed by single possibly misaligned move
7219 (skipped if alignment is not needed)
7220 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
7222 2) Zero size guard dispatching to done_label, if needed
7224 3) dispatch to library call, if needed,
7226 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
7227 with specified algorithm. */
7229 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
7230 rtx align_exp
, rtx expected_align_exp
,
7231 rtx expected_size_exp
, rtx min_size_exp
,
7232 rtx max_size_exp
, rtx probable_max_size_exp
,
7237 rtx_code_label
*label
= NULL
;
7239 rtx_code_label
*jump_around_label
= NULL
;
7240 HOST_WIDE_INT align
= 1;
7241 unsigned HOST_WIDE_INT count
= 0;
7242 HOST_WIDE_INT expected_size
= -1;
7243 int size_needed
= 0, epilogue_size_needed
;
7244 int desired_align
= 0, align_bytes
= 0;
7245 enum stringop_alg alg
;
7246 rtx promoted_val
= NULL
;
7247 rtx vec_promoted_val
= NULL
;
7248 bool force_loopy_epilogue
= false;
7250 bool need_zero_guard
= false;
7252 machine_mode move_mode
= VOIDmode
;
7253 machine_mode wider_mode
;
7254 int unroll_factor
= 1;
7255 /* TODO: Once value ranges are available, fill in proper data. */
7256 unsigned HOST_WIDE_INT min_size
= 0;
7257 unsigned HOST_WIDE_INT max_size
= -1;
7258 unsigned HOST_WIDE_INT probable_max_size
= -1;
7259 bool misaligned_prologue_used
= false;
7262 if (CONST_INT_P (align_exp
))
7263 align
= INTVAL (align_exp
);
7264 /* i386 can do misaligned access on reasonably increased cost. */
7265 if (CONST_INT_P (expected_align_exp
)
7266 && INTVAL (expected_align_exp
) > align
)
7267 align
= INTVAL (expected_align_exp
);
7268 /* ALIGN is the minimum of destination and source alignment, but we care here
7269 just about destination alignment. */
7271 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
7272 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
7274 if (CONST_INT_P (count_exp
))
7276 min_size
= max_size
= probable_max_size
= count
= expected_size
7277 = INTVAL (count_exp
);
7278 /* When COUNT is 0, there is nothing to do. */
7285 min_size
= INTVAL (min_size_exp
);
7287 max_size
= INTVAL (max_size_exp
);
7288 if (probable_max_size_exp
)
7289 probable_max_size
= INTVAL (probable_max_size_exp
);
7290 if (CONST_INT_P (expected_size_exp
))
7291 expected_size
= INTVAL (expected_size_exp
);
7294 /* Make sure we don't need to care about overflow later on. */
7295 if (count
> (HOST_WIDE_INT_1U
<< 30))
7298 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
7300 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
7302 /* Step 0: Decide on preferred algorithm, desired alignment and
7303 size of chunks to be copied by main loop. */
7304 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
7306 issetmem
&& val_exp
== const0_rtx
, have_as
,
7307 &dynamic_check
, &noalign
, false);
7310 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
7311 stringop_alg_names
[alg
]);
7315 gcc_assert (alg
!= no_stringop
);
7317 /* For now vector-version of memset is generated only for memory zeroing, as
7318 creating of promoted vector value is very cheap in this case. */
7319 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
7320 alg
= unrolled_loop
;
7323 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
7324 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
7326 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
7329 move_mode
= word_mode
;
7337 need_zero_guard
= true;
7341 need_zero_guard
= true;
7344 need_zero_guard
= true;
7345 unroll_factor
= (TARGET_64BIT
? 4 : 2);
7348 need_zero_guard
= true;
7350 /* Find the widest supported mode. */
7351 move_mode
= word_mode
;
7352 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
7353 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
7354 move_mode
= wider_mode
;
7356 if (TARGET_AVX256_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 128)
7359 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7360 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7361 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
7363 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
7364 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
7365 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
7366 move_mode
= word_mode
;
7368 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
7370 case rep_prefix_8_byte
:
7373 case rep_prefix_4_byte
:
7376 case rep_prefix_1_byte
:
7380 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
7381 epilogue_size_needed
= size_needed
;
7383 /* If we are going to call any library calls conditionally, make sure any
7384 pending stack adjustment happen before the first conditional branch,
7385 otherwise they will be emitted before the library call only and won't
7386 happen from the other branches. */
7387 if (dynamic_check
!= -1)
7388 do_pending_stack_adjust ();
7390 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
7391 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
7392 align
= desired_align
;
7394 /* Step 1: Prologue guard. */
7396 /* Alignment code needs count to be in register. */
7397 if (CONST_INT_P (count_exp
) && desired_align
> align
)
7399 if (INTVAL (count_exp
) > desired_align
7400 && INTVAL (count_exp
) > size_needed
)
7403 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
7404 if (align_bytes
<= 0)
7407 align_bytes
= desired_align
- align_bytes
;
7409 if (align_bytes
== 0)
7410 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
7412 gcc_assert (desired_align
>= 1 && align
>= 1);
7414 /* Misaligned move sequences handle both prologue and epilogue at once.
7415 Default code generation results in a smaller code for large alignments
7416 and also avoids redundant job when sizes are known precisely. */
7417 misaligned_prologue_used
7418 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7419 && MAX (desired_align
, epilogue_size_needed
) <= 32
7420 && desired_align
<= epilogue_size_needed
7421 && ((desired_align
> align
&& !align_bytes
)
7422 || (!count
&& epilogue_size_needed
> 1)));
7424 /* Do the cheap promotion to allow better CSE across the
7425 main loop and epilogue (ie one load of the big constant in the
7427 For now the misaligned move sequences do not have fast path
7428 without broadcasting. */
7429 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
7431 if (alg
== vector_loop
)
7433 gcc_assert (val_exp
== const0_rtx
);
7434 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
7435 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
7436 GET_MODE_SIZE (word_mode
),
7437 desired_align
, align
);
7441 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7442 desired_align
, align
);
7445 /* Misaligned move sequences handles both prologues and epilogues at once.
7446 Default code generation results in smaller code for large alignments and
7447 also avoids redundant job when sizes are known precisely. */
7448 if (misaligned_prologue_used
)
7450 /* Misaligned move prologue handled small blocks by itself. */
7451 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7452 (dst
, src
, &destreg
, &srcreg
,
7453 move_mode
, promoted_val
, vec_promoted_val
,
7456 desired_align
< align
7457 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
7458 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
7460 src
= change_address (src
, BLKmode
, srcreg
);
7461 dst
= change_address (dst
, BLKmode
, destreg
);
7462 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
7463 epilogue_size_needed
= 0;
7465 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
7467 /* It is possible that we copied enough so the main loop will not
7469 gcc_assert (size_needed
> 1);
7470 if (jump_around_label
== NULL_RTX
)
7471 jump_around_label
= gen_label_rtx ();
7472 emit_cmp_and_jump_insns (count_exp
,
7473 GEN_INT (size_needed
),
7474 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
7475 if (expected_size
== -1
7476 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7477 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7479 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7482 /* Ensure that alignment prologue won't copy past end of block. */
7483 else if (size_needed
> 1 || (desired_align
> 1 && desired_align
> align
))
7485 epilogue_size_needed
= MAX (size_needed
- 1, desired_align
- align
);
7486 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7487 Make sure it is power of 2. */
7488 epilogue_size_needed
= 1 << (floor_log2 (epilogue_size_needed
) + 1);
7490 /* To improve performance of small blocks, we jump around the VAL
7491 promoting mode. This mean that if the promoted VAL is not constant,
7492 we might not use it in the epilogue and have to use byte
7494 if (issetmem
&& epilogue_size_needed
> 2 && !promoted_val
)
7495 force_loopy_epilogue
= true;
7496 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7497 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7499 /* If main algorithm works on QImode, no epilogue is needed.
7500 For small sizes just don't align anything. */
7501 if (size_needed
== 1)
7502 desired_align
= align
;
7507 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7509 label
= gen_label_rtx ();
7510 emit_cmp_and_jump_insns (count_exp
,
7511 GEN_INT (epilogue_size_needed
),
7512 LTU
, 0, counter_mode (count_exp
), 1, label
);
7513 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
7514 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7516 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7520 /* Emit code to decide on runtime whether library call or inline should be
7522 if (dynamic_check
!= -1)
7524 if (!issetmem
&& CONST_INT_P (count_exp
))
7526 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
7528 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7529 count_exp
= const0_rtx
;
7535 rtx_code_label
*hot_label
= gen_label_rtx ();
7536 if (jump_around_label
== NULL_RTX
)
7537 jump_around_label
= gen_label_rtx ();
7538 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
7539 LEU
, 0, counter_mode (count_exp
),
7541 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
7543 set_storage_via_libcall (dst
, count_exp
, val_exp
);
7545 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7546 emit_jump (jump_around_label
);
7547 emit_label (hot_label
);
7551 /* Step 2: Alignment prologue. */
7552 /* Do the expensive promotion once we branched off the small blocks. */
7553 if (issetmem
&& !promoted_val
)
7554 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7555 desired_align
, align
);
7557 if (desired_align
> align
&& !misaligned_prologue_used
)
7559 if (align_bytes
== 0)
7561 /* Except for the first move in prologue, we no longer know
7562 constant offset in aliasing info. It don't seems to worth
7563 the pain to maintain it for the first move, so throw away
7565 dst
= change_address (dst
, BLKmode
, destreg
);
7567 src
= change_address (src
, BLKmode
, srcreg
);
7568 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
7569 promoted_val
, vec_promoted_val
,
7570 count_exp
, align
, desired_align
,
7572 /* At most desired_align - align bytes are copied. */
7573 if (min_size
< (unsigned)(desired_align
- align
))
7576 min_size
-= desired_align
- align
;
7580 /* If we know how many bytes need to be stored before dst is
7581 sufficiently aligned, maintain aliasing info accurately. */
7582 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
7590 count_exp
= plus_constant (counter_mode (count_exp
),
7591 count_exp
, -align_bytes
);
7592 count
-= align_bytes
;
7593 min_size
-= align_bytes
;
7594 max_size
-= align_bytes
;
7597 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
7598 && (count
< (unsigned HOST_WIDE_INT
) size_needed
7599 || (align_bytes
== 0
7600 && count
< ((unsigned HOST_WIDE_INT
) size_needed
7601 + desired_align
- align
))))
7603 /* It is possible that we copied enough so the main loop will not
7605 gcc_assert (size_needed
> 1);
7606 if (label
== NULL_RTX
)
7607 label
= gen_label_rtx ();
7608 emit_cmp_and_jump_insns (count_exp
,
7609 GEN_INT (size_needed
),
7610 LTU
, 0, counter_mode (count_exp
), 1, label
);
7611 if (expected_size
== -1
7612 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7613 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7615 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7618 if (label
&& size_needed
== 1)
7621 LABEL_NUSES (label
) = 1;
7623 epilogue_size_needed
= 1;
7625 promoted_val
= val_exp
;
7627 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
7628 epilogue_size_needed
= size_needed
;
7630 /* Step 3: Main loop. */
7641 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
, promoted_val
,
7642 count_exp
, move_mode
, unroll_factor
,
7643 expected_size
, issetmem
);
7646 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
,
7647 vec_promoted_val
, count_exp
, move_mode
,
7648 unroll_factor
, expected_size
, issetmem
);
7650 case rep_prefix_8_byte
:
7651 case rep_prefix_4_byte
:
7652 case rep_prefix_1_byte
:
7653 expand_set_or_cpymem_via_rep (dst
, src
, destreg
, srcreg
, promoted_val
,
7654 val_exp
, count_exp
, move_mode
, issetmem
);
7657 /* Adjust properly the offset of src and dest memory for aliasing. */
7658 if (CONST_INT_P (count_exp
))
7661 src
= adjust_automodify_address_nv (src
, BLKmode
, srcreg
,
7662 (count
/ size_needed
) * size_needed
);
7663 dst
= adjust_automodify_address_nv (dst
, BLKmode
, destreg
,
7664 (count
/ size_needed
) * size_needed
);
7669 src
= change_address (src
, BLKmode
, srcreg
);
7670 dst
= change_address (dst
, BLKmode
, destreg
);
7673 /* Step 4: Epilogue to copy the remaining bytes. */
7677 /* When the main loop is done, COUNT_EXP might hold original count,
7678 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7679 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7680 bytes. Compensate if needed. */
7682 if (size_needed
< epilogue_size_needed
)
7684 tmp
= expand_simple_binop (counter_mode (count_exp
), AND
, count_exp
,
7685 GEN_INT (size_needed
- 1), count_exp
, 1,
7687 if (tmp
!= count_exp
)
7688 emit_move_insn (count_exp
, tmp
);
7691 LABEL_NUSES (label
) = 1;
7694 if (count_exp
!= const0_rtx
&& epilogue_size_needed
> 1)
7696 if (force_loopy_epilogue
)
7697 expand_setmem_epilogue_via_loop (dst
, destreg
, val_exp
, count_exp
,
7698 epilogue_size_needed
);
7702 expand_setmem_epilogue (dst
, destreg
, promoted_val
,
7703 vec_promoted_val
, count_exp
,
7704 epilogue_size_needed
);
7706 expand_cpymem_epilogue (dst
, src
, destreg
, srcreg
, count_exp
,
7707 epilogue_size_needed
);
7710 if (jump_around_label
)
7711 emit_label (jump_around_label
);
7715 /* Expand cmpstrn or memcmp. */
7718 ix86_expand_cmpstrn_or_cmpmem (rtx result
, rtx src1
, rtx src2
,
7719 rtx length
, rtx align
, bool is_cmpstrn
)
7721 /* Expand strncmp and memcmp only with -minline-all-stringops since
7722 "repz cmpsb" can be much slower than strncmp and memcmp functions
7723 implemented with vector instructions, see
7725 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
7727 if (!TARGET_INLINE_ALL_STRINGOPS
)
7730 /* Can't use this if the user has appropriated ecx, esi or edi. */
7731 if (fixed_regs
[CX_REG
] || fixed_regs
[SI_REG
] || fixed_regs
[DI_REG
])
7736 /* For strncmp, length is the maximum length, which can be larger
7737 than actual string lengths. We can expand the cmpstrn pattern
7738 to "repz cmpsb" only if one of the strings is a constant so
7739 that expand_builtin_strncmp() can write the length argument to
7740 be the minimum of the const string length and the actual length
7741 argument. Otherwise, "repz cmpsb" may pass the 0 byte. */
7742 tree t1
= MEM_EXPR (src1
);
7743 tree t2
= MEM_EXPR (src2
);
7744 if (!((t1
&& TREE_CODE (t1
) == MEM_REF
7745 && TREE_CODE (TREE_OPERAND (t1
, 0)) == ADDR_EXPR
7746 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1
, 0), 0))
7748 || (t2
&& TREE_CODE (t2
) == MEM_REF
7749 && TREE_CODE (TREE_OPERAND (t2
, 0)) == ADDR_EXPR
7750 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2
, 0), 0))
7755 rtx addr1
= copy_addr_to_reg (XEXP (src1
, 0));
7756 rtx addr2
= copy_addr_to_reg (XEXP (src2
, 0));
7757 if (addr1
!= XEXP (src1
, 0))
7758 src1
= replace_equiv_address_nv (src1
, addr1
);
7759 if (addr2
!= XEXP (src2
, 0))
7760 src2
= replace_equiv_address_nv (src2
, addr2
);
7762 /* NB: Make a copy of the data length to avoid changing the original
7763 data length by cmpstrnqi patterns. */
7764 length
= ix86_zero_extend_to_Pmode (length
);
7765 rtx lengthreg
= gen_reg_rtx (Pmode
);
7766 emit_move_insn (lengthreg
, length
);
7768 /* If we are testing strict equality, we can use known alignment to
7769 good advantage. This may be possible with combine, particularly
7770 once cc0 is dead. */
7771 if (CONST_INT_P (length
))
7773 if (length
== const0_rtx
)
7775 emit_move_insn (result
, const0_rtx
);
7778 emit_insn (gen_cmpstrnqi_nz_1 (addr1
, addr2
, lengthreg
, align
,
7783 emit_insn (gen_cmp_1 (Pmode
, lengthreg
, lengthreg
));
7784 emit_insn (gen_cmpstrnqi_1 (addr1
, addr2
, lengthreg
, align
,
7788 rtx out
= gen_lowpart (QImode
, result
);
7789 emit_insn (gen_cmpintqi (out
));
7790 emit_move_insn (result
, gen_rtx_SIGN_EXTEND (SImode
, out
));
7795 /* Expand the appropriate insns for doing strlen if not just doing
7798 out = result, initialized with the start address
7799 align_rtx = alignment of the address.
7800 scratch = scratch register, initialized with the startaddress when
7801 not aligned, otherwise undefined
7803 This is just the body. It needs the initializations mentioned above and
7804 some address computing at the end. These things are done in i386.md. */
7807 ix86_expand_strlensi_unroll_1 (rtx out
, rtx src
, rtx align_rtx
)
7811 rtx_code_label
*align_2_label
= NULL
;
7812 rtx_code_label
*align_3_label
= NULL
;
7813 rtx_code_label
*align_4_label
= gen_label_rtx ();
7814 rtx_code_label
*end_0_label
= gen_label_rtx ();
7816 rtx tmpreg
= gen_reg_rtx (SImode
);
7817 rtx scratch
= gen_reg_rtx (SImode
);
7821 if (CONST_INT_P (align_rtx
))
7822 align
= INTVAL (align_rtx
);
7824 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7826 /* Is there a known alignment and is it less than 4? */
7829 rtx scratch1
= gen_reg_rtx (Pmode
);
7830 emit_move_insn (scratch1
, out
);
7831 /* Is there a known alignment and is it not 2? */
7834 align_3_label
= gen_label_rtx (); /* Label when aligned to 3-byte */
7835 align_2_label
= gen_label_rtx (); /* Label when aligned to 2-byte */
7837 /* Leave just the 3 lower bits. */
7838 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, GEN_INT (3),
7839 NULL_RTX
, 0, OPTAB_WIDEN
);
7841 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7842 Pmode
, 1, align_4_label
);
7843 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, EQ
, NULL
,
7844 Pmode
, 1, align_2_label
);
7845 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, GTU
, NULL
,
7846 Pmode
, 1, align_3_label
);
7850 /* Since the alignment is 2, we have to check 2 or 0 bytes;
7851 check if is aligned to 4 - byte. */
7853 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, const2_rtx
,
7854 NULL_RTX
, 0, OPTAB_WIDEN
);
7856 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7857 Pmode
, 1, align_4_label
);
7860 mem
= change_address (src
, QImode
, out
);
7862 /* Now compare the bytes. */
7864 /* Compare the first n unaligned byte on a byte per byte basis. */
7865 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
,
7866 QImode
, 1, end_0_label
);
7868 /* Increment the address. */
7869 emit_insn (gen_add2_insn (out
, const1_rtx
));
7871 /* Not needed with an alignment of 2 */
7874 emit_label (align_2_label
);
7876 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7879 emit_insn (gen_add2_insn (out
, const1_rtx
));
7881 emit_label (align_3_label
);
7884 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7887 emit_insn (gen_add2_insn (out
, const1_rtx
));
7890 /* Generate loop to check 4 bytes at a time. It is not a good idea to
7891 align this loop. It gives only huge programs, but does not help to
7893 emit_label (align_4_label
);
7895 mem
= change_address (src
, SImode
, out
);
7896 emit_move_insn (scratch
, mem
);
7897 emit_insn (gen_add2_insn (out
, GEN_INT (4)));
7899 /* This formula yields a nonzero result iff one of the bytes is zero.
7900 This saves three branches inside loop and many cycles. */
7902 emit_insn (gen_addsi3 (tmpreg
, scratch
, GEN_INT (-0x01010101)));
7903 emit_insn (gen_one_cmplsi2 (scratch
, scratch
));
7904 emit_insn (gen_andsi3 (tmpreg
, tmpreg
, scratch
));
7905 emit_insn (gen_andsi3 (tmpreg
, tmpreg
,
7906 gen_int_mode (0x80808080, SImode
)));
7907 emit_cmp_and_jump_insns (tmpreg
, const0_rtx
, EQ
, 0, SImode
, 1,
7912 rtx reg
= gen_reg_rtx (SImode
);
7913 rtx reg2
= gen_reg_rtx (Pmode
);
7914 emit_move_insn (reg
, tmpreg
);
7915 emit_insn (gen_lshrsi3 (reg
, reg
, GEN_INT (16)));
7917 /* If zero is not in the first two bytes, move two bytes forward. */
7918 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7919 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7920 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7921 emit_insn (gen_rtx_SET (tmpreg
,
7922 gen_rtx_IF_THEN_ELSE (SImode
, tmp
,
7925 /* Emit lea manually to avoid clobbering of flags. */
7926 emit_insn (gen_rtx_SET (reg2
, plus_constant (Pmode
, out
, 2)));
7928 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7929 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7930 emit_insn (gen_rtx_SET (out
,
7931 gen_rtx_IF_THEN_ELSE (Pmode
, tmp
,
7937 rtx_code_label
*end_2_label
= gen_label_rtx ();
7938 /* Is zero in the first two bytes? */
7940 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7941 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7942 tmp
= gen_rtx_NE (VOIDmode
, tmp
, const0_rtx
);
7943 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
7944 gen_rtx_LABEL_REF (VOIDmode
, end_2_label
),
7946 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
7947 JUMP_LABEL (tmp
) = end_2_label
;
7949 /* Not in the first two. Move two bytes forward. */
7950 emit_insn (gen_lshrsi3 (tmpreg
, tmpreg
, GEN_INT (16)));
7951 emit_insn (gen_add2_insn (out
, const2_rtx
));
7953 emit_label (end_2_label
);
7957 /* Avoid branch in fixing the byte. */
7958 tmpreg
= gen_lowpart (QImode
, tmpreg
);
7959 emit_insn (gen_addqi3_cconly_overflow (tmpreg
, tmpreg
));
7960 tmp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
7961 cmp
= gen_rtx_LTU (VOIDmode
, tmp
, const0_rtx
);
7962 emit_insn (gen_sub3_carry (Pmode
, out
, out
, GEN_INT (3), tmp
, cmp
));
7964 emit_label (end_0_label
);
7967 /* Expand strlen. */
7970 ix86_expand_strlen (rtx out
, rtx src
, rtx eoschar
, rtx align
)
7972 if (TARGET_UNROLL_STRLEN
7973 && TARGET_INLINE_ALL_STRINGOPS
7974 && eoschar
== const0_rtx
7977 /* The generic case of strlen expander is long. Avoid it's
7978 expanding unless TARGET_INLINE_ALL_STRINGOPS. */
7979 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
7980 /* Well it seems that some optimizer does not combine a call like
7981 foo(strlen(bar), strlen(bar));
7982 when the move and the subtraction is done here. It does calculate
7983 the length just once when these instructions are done inside of
7984 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
7985 often used and I use one fewer register for the lifetime of
7986 output_strlen_unroll() this is better. */
7988 emit_move_insn (out
, addr
);
7990 ix86_expand_strlensi_unroll_1 (out
, src
, align
);
7992 /* strlensi_unroll_1 returns the address of the zero at the end of
7993 the string, like memchr(), so compute the length by subtracting
7994 the start address. */
7995 emit_insn (gen_sub2_insn (out
, addr
));
8002 /* For given symbol (function) construct code to compute address of it's PLT
8003 entry in large x86-64 PIC model. */
8006 construct_plt_address (rtx symbol
)
8010 gcc_assert (GET_CODE (symbol
) == SYMBOL_REF
);
8011 gcc_assert (ix86_cmodel
== CM_LARGE_PIC
&& !TARGET_PECOFF
);
8012 gcc_assert (Pmode
== DImode
);
8014 tmp
= gen_reg_rtx (Pmode
);
8015 unspec
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, symbol
), UNSPEC_PLTOFF
);
8017 emit_move_insn (tmp
, gen_rtx_CONST (Pmode
, unspec
));
8018 emit_insn (gen_add2_insn (tmp
, pic_offset_table_rtx
));
8022 /* Additional registers that are clobbered by SYSV calls. */
8024 static int const x86_64_ms_sysv_extra_clobbered_registers
8025 [NUM_X86_64_MS_CLOBBERED_REGS
] =
8029 XMM8_REG
, XMM9_REG
, XMM10_REG
, XMM11_REG
,
8030 XMM12_REG
, XMM13_REG
, XMM14_REG
, XMM15_REG
8034 ix86_expand_call (rtx retval
, rtx fnaddr
, rtx callarg1
,
8036 rtx pop
, bool sibcall
)
8039 rtx use
= NULL
, call
;
8040 unsigned int vec_len
= 0;
8043 if (GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
8045 fndecl
= SYMBOL_REF_DECL (XEXP (fnaddr
, 0));
8047 && (lookup_attribute ("interrupt",
8048 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
)))))
8049 error ("interrupt service routine cannot be called directly");
8054 if (pop
== const0_rtx
)
8056 gcc_assert (!TARGET_64BIT
|| !pop
);
8058 if (TARGET_MACHO
&& !TARGET_64BIT
)
8061 if (flag_pic
&& GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
8062 fnaddr
= machopic_indirect_call_target (fnaddr
);
8067 /* Static functions and indirect calls don't need the pic register. Also,
8068 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
8069 it an indirect call. */
8070 rtx addr
= XEXP (fnaddr
, 0);
8072 && GET_CODE (addr
) == SYMBOL_REF
8073 && !SYMBOL_REF_LOCAL_P (addr
))
8076 && (SYMBOL_REF_DECL (addr
) == NULL_TREE
8077 || !lookup_attribute ("noplt",
8078 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr
)))))
8081 || (ix86_cmodel
== CM_LARGE_PIC
8082 && DEFAULT_ABI
!= MS_ABI
))
8084 use_reg (&use
, gen_rtx_REG (Pmode
,
8085 REAL_PIC_OFFSET_TABLE_REGNUM
));
8086 if (ix86_use_pseudo_pic_reg ())
8087 emit_move_insn (gen_rtx_REG (Pmode
,
8088 REAL_PIC_OFFSET_TABLE_REGNUM
),
8089 pic_offset_table_rtx
);
8092 else if (!TARGET_PECOFF
&& !TARGET_MACHO
)
8095 && ix86_cmodel
== CM_LARGE_PIC
8096 && DEFAULT_ABI
!= MS_ABI
)
8098 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
8100 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8101 fnaddr
= force_reg (Pmode
, fnaddr
);
8102 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
, fnaddr
);
8104 else if (TARGET_64BIT
)
8106 fnaddr
= gen_rtx_UNSPEC (Pmode
,
8107 gen_rtvec (1, addr
),
8109 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8113 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
8115 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8116 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
,
8119 fnaddr
= gen_const_mem (Pmode
, fnaddr
);
8120 /* Pmode may not be the same as word_mode for x32, which
8121 doesn't support indirect branch via 32-bit memory slot.
8122 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8123 indirect branch via x32 GOT slot is OK. */
8124 if (GET_MODE (fnaddr
) != word_mode
)
8125 fnaddr
= gen_rtx_ZERO_EXTEND (word_mode
, fnaddr
);
8126 fnaddr
= gen_rtx_MEM (QImode
, fnaddr
);
8131 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8132 parameters passed in vector registers. */
8134 && (INTVAL (callarg2
) > 0
8135 || (INTVAL (callarg2
) == 0
8136 && (TARGET_SSE
|| !flag_skip_rax_setup
))))
8138 rtx al
= gen_rtx_REG (QImode
, AX_REG
);
8139 emit_move_insn (al
, callarg2
);
8143 if (ix86_cmodel
== CM_LARGE_PIC
8146 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
8147 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
8148 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
8149 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8150 branch via x32 GOT slot is OK. */
8151 else if (!(TARGET_X32
8153 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
8154 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
8156 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
8157 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
8159 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
8160 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
8163 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
8166 call
= gen_rtx_SET (retval
, call
);
8167 vec
[vec_len
++] = call
;
8171 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
8172 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
8173 vec
[vec_len
++] = pop
;
8176 if (cfun
->machine
->no_caller_saved_registers
8178 || (!TREE_THIS_VOLATILE (fndecl
)
8179 && !lookup_attribute ("no_caller_saved_registers",
8180 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
8182 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
8183 bool is_64bit_ms_abi
= (TARGET_64BIT
8184 && ix86_function_abi (fndecl
) == MS_ABI
);
8185 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
8187 /* If there are no caller-saved registers, add all registers
8188 that are clobbered by the call which returns. */
8189 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
8191 && (ix86_call_used_regs
[i
] == 1
8192 || (ix86_call_used_regs
[i
] & c_mask
))
8193 && !STACK_REGNO_P (i
)
8194 && !MMX_REGNO_P (i
))
8196 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
8198 else if (TARGET_64BIT_MS_ABI
8199 && (!callarg2
|| INTVAL (callarg2
) != -2))
8203 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
8205 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
8206 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
8208 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
8211 /* Set here, but it may get cleared later. */
8212 if (TARGET_CALL_MS2SYSV_XLOGUES
)
8217 /* Don't break hot-patched functions. */
8218 else if (ix86_function_ms_hook_prologue (current_function_decl
))
8221 /* TODO: Cases not yet examined. */
8222 else if (flag_split_stack
)
8223 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8227 gcc_assert (!reload_completed
);
8228 cfun
->machine
->call_ms2sysv
= true;
8234 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
8235 rtx_insn
*call_insn
= emit_call_insn (call
);
8237 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;
8242 /* Split simple return with popping POPC bytes from stack to indirect
8243 branch with stack adjustment . */
8246 ix86_split_simple_return_pop_internal (rtx popc
)
8248 struct machine_function
*m
= cfun
->machine
;
8249 rtx ecx
= gen_rtx_REG (SImode
, CX_REG
);
8252 /* There is no "pascal" calling convention in any 64bit ABI. */
8253 gcc_assert (!TARGET_64BIT
);
8255 insn
= emit_insn (gen_pop (ecx
));
8256 m
->fs
.cfa_offset
-= UNITS_PER_WORD
;
8257 m
->fs
.sp_offset
-= UNITS_PER_WORD
;
8259 rtx x
= plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
);
8260 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
8261 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
8262 add_reg_note (insn
, REG_CFA_REGISTER
, gen_rtx_SET (ecx
, pc_rtx
));
8263 RTX_FRAME_RELATED_P (insn
) = 1;
8265 x
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, popc
);
8266 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
8267 insn
= emit_insn (x
);
8268 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
8269 RTX_FRAME_RELATED_P (insn
) = 1;
8271 /* Now return address is in ECX. */
8272 emit_jump_insn (gen_simple_return_indirect_internal (ecx
));
8275 /* Errors in the source file can cause expand_expr to return const0_rtx
8276 where we expect a vector. To avoid crashing, use one of the vector
8277 clear instructions. */
8280 safe_vector_operand (rtx x
, machine_mode mode
)
8282 if (x
== const0_rtx
)
8283 x
= CONST0_RTX (mode
);
8287 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8290 ix86_expand_binop_builtin (enum insn_code icode
, tree exp
, rtx target
)
8293 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8294 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8295 rtx op0
= expand_normal (arg0
);
8296 rtx op1
= expand_normal (arg1
);
8297 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8298 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8299 machine_mode mode1
= insn_data
[icode
].operand
[2].mode
;
8301 if (VECTOR_MODE_P (mode0
))
8302 op0
= safe_vector_operand (op0
, mode0
);
8303 if (VECTOR_MODE_P (mode1
))
8304 op1
= safe_vector_operand (op1
, mode1
);
8306 if (optimize
|| !target
8307 || GET_MODE (target
) != tmode
8308 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8309 target
= gen_reg_rtx (tmode
);
8311 if (GET_MODE (op1
) == SImode
&& mode1
== TImode
)
8313 rtx x
= gen_reg_rtx (V4SImode
);
8314 emit_insn (gen_sse2_loadd (x
, op1
));
8315 op1
= gen_lowpart (TImode
, x
);
8318 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8319 op0
= copy_to_mode_reg (mode0
, op0
);
8320 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode1
))
8321 op1
= copy_to_mode_reg (mode1
, op1
);
8323 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8332 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8335 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
8336 enum ix86_builtin_func_type m_type
,
8337 enum rtx_code sub_code
)
8340 unsigned int i
, nargs
;
8341 bool comparison_p
= false;
8343 bool last_arg_constant
= false;
8347 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8351 case MULTI_ARG_4_DF2_DI_I
:
8352 case MULTI_ARG_4_DF2_DI_I1
:
8353 case MULTI_ARG_4_SF2_SI_I
:
8354 case MULTI_ARG_4_SF2_SI_I1
:
8356 last_arg_constant
= true;
8359 case MULTI_ARG_3_SF
:
8360 case MULTI_ARG_3_DF
:
8361 case MULTI_ARG_3_SF2
:
8362 case MULTI_ARG_3_DF2
:
8363 case MULTI_ARG_3_DI
:
8364 case MULTI_ARG_3_SI
:
8365 case MULTI_ARG_3_SI_DI
:
8366 case MULTI_ARG_3_HI
:
8367 case MULTI_ARG_3_HI_SI
:
8368 case MULTI_ARG_3_QI
:
8369 case MULTI_ARG_3_DI2
:
8370 case MULTI_ARG_3_SI2
:
8371 case MULTI_ARG_3_HI2
:
8372 case MULTI_ARG_3_QI2
:
8376 case MULTI_ARG_2_SF
:
8377 case MULTI_ARG_2_DF
:
8378 case MULTI_ARG_2_DI
:
8379 case MULTI_ARG_2_SI
:
8380 case MULTI_ARG_2_HI
:
8381 case MULTI_ARG_2_QI
:
8385 case MULTI_ARG_2_DI_IMM
:
8386 case MULTI_ARG_2_SI_IMM
:
8387 case MULTI_ARG_2_HI_IMM
:
8388 case MULTI_ARG_2_QI_IMM
:
8390 last_arg_constant
= true;
8393 case MULTI_ARG_1_SF
:
8394 case MULTI_ARG_1_DF
:
8395 case MULTI_ARG_1_SF2
:
8396 case MULTI_ARG_1_DF2
:
8397 case MULTI_ARG_1_DI
:
8398 case MULTI_ARG_1_SI
:
8399 case MULTI_ARG_1_HI
:
8400 case MULTI_ARG_1_QI
:
8401 case MULTI_ARG_1_SI_DI
:
8402 case MULTI_ARG_1_HI_DI
:
8403 case MULTI_ARG_1_HI_SI
:
8404 case MULTI_ARG_1_QI_DI
:
8405 case MULTI_ARG_1_QI_SI
:
8406 case MULTI_ARG_1_QI_HI
:
8410 case MULTI_ARG_2_DI_CMP
:
8411 case MULTI_ARG_2_SI_CMP
:
8412 case MULTI_ARG_2_HI_CMP
:
8413 case MULTI_ARG_2_QI_CMP
:
8415 comparison_p
= true;
8418 case MULTI_ARG_2_SF_TF
:
8419 case MULTI_ARG_2_DF_TF
:
8420 case MULTI_ARG_2_DI_TF
:
8421 case MULTI_ARG_2_SI_TF
:
8422 case MULTI_ARG_2_HI_TF
:
8423 case MULTI_ARG_2_QI_TF
:
8432 if (optimize
|| !target
8433 || GET_MODE (target
) != tmode
8434 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8435 target
= gen_reg_rtx (tmode
);
8436 else if (memory_operand (target
, tmode
))
8439 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
8441 for (i
= 0; i
< nargs
; i
++)
8443 tree arg
= CALL_EXPR_ARG (exp
, i
);
8444 rtx op
= expand_normal (arg
);
8445 int adjust
= (comparison_p
) ? 1 : 0;
8446 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
8448 if (last_arg_constant
&& i
== nargs
- 1)
8450 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
8452 enum insn_code new_icode
= icode
;
8455 case CODE_FOR_xop_vpermil2v2df3
:
8456 case CODE_FOR_xop_vpermil2v4sf3
:
8457 case CODE_FOR_xop_vpermil2v4df3
:
8458 case CODE_FOR_xop_vpermil2v8sf3
:
8459 error ("the last argument must be a 2-bit immediate");
8460 return gen_reg_rtx (tmode
);
8461 case CODE_FOR_xop_rotlv2di3
:
8462 new_icode
= CODE_FOR_rotlv2di3
;
8464 case CODE_FOR_xop_rotlv4si3
:
8465 new_icode
= CODE_FOR_rotlv4si3
;
8467 case CODE_FOR_xop_rotlv8hi3
:
8468 new_icode
= CODE_FOR_rotlv8hi3
;
8470 case CODE_FOR_xop_rotlv16qi3
:
8471 new_icode
= CODE_FOR_rotlv16qi3
;
8473 if (CONST_INT_P (op
))
8475 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
8476 op
= GEN_INT (INTVAL (op
) & mask
);
8478 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
8484 && insn_data
[new_icode
].operand
[0].mode
== tmode
8485 && insn_data
[new_icode
].operand
[1].mode
== tmode
8486 && insn_data
[new_icode
].operand
[2].mode
== mode
8487 && insn_data
[new_icode
].operand
[0].predicate
8488 == insn_data
[icode
].operand
[0].predicate
8489 && insn_data
[new_icode
].operand
[1].predicate
8490 == insn_data
[icode
].operand
[1].predicate
);
8503 if (VECTOR_MODE_P (mode
))
8504 op
= safe_vector_operand (op
, mode
);
8506 /* If we aren't optimizing, only allow one memory operand to be
8508 if (memory_operand (op
, mode
))
8511 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
8514 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
8516 op
= force_reg (mode
, op
);
8525 pat
= GEN_FCN (icode
) (target
, xops
[0]);
8530 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
8531 GEN_INT ((int)sub_code
));
8532 else if (! comparison_p
)
8533 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
8536 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
8539 pat
= GEN_FCN (icode
) (target
, cmp_op
, xops
[0], xops
[1]);
8544 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
8548 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2], xops
[3]);
8562 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8563 insns with vec_merge. */
8566 ix86_expand_unop_vec_merge_builtin (enum insn_code icode
, tree exp
,
8570 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8571 rtx op1
, op0
= expand_normal (arg0
);
8572 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8573 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8575 if (optimize
|| !target
8576 || GET_MODE (target
) != tmode
8577 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8578 target
= gen_reg_rtx (tmode
);
8580 if (VECTOR_MODE_P (mode0
))
8581 op0
= safe_vector_operand (op0
, mode0
);
8583 if ((optimize
&& !register_operand (op0
, mode0
))
8584 || !insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8585 op0
= copy_to_mode_reg (mode0
, op0
);
8588 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode0
))
8589 op1
= copy_to_mode_reg (mode0
, op1
);
8591 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8598 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8601 ix86_expand_sse_compare (const struct builtin_description
*d
,
8602 tree exp
, rtx target
, bool swap
)
8605 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8606 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8607 rtx op0
= expand_normal (arg0
);
8608 rtx op1
= expand_normal (arg1
);
8610 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8611 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8612 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8613 enum rtx_code comparison
= d
->comparison
;
8615 if (VECTOR_MODE_P (mode0
))
8616 op0
= safe_vector_operand (op0
, mode0
);
8617 if (VECTOR_MODE_P (mode1
))
8618 op1
= safe_vector_operand (op1
, mode1
);
8620 /* Swap operands if we have a comparison that isn't available in
8623 std::swap (op0
, op1
);
8625 if (optimize
|| !target
8626 || GET_MODE (target
) != tmode
8627 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8628 target
= gen_reg_rtx (tmode
);
8630 if ((optimize
&& !register_operand (op0
, mode0
))
8631 || !insn_data
[d
->icode
].operand
[1].predicate (op0
, mode0
))
8632 op0
= copy_to_mode_reg (mode0
, op0
);
8633 if ((optimize
&& !register_operand (op1
, mode1
))
8634 || !insn_data
[d
->icode
].operand
[2].predicate (op1
, mode1
))
8635 op1
= copy_to_mode_reg (mode1
, op1
);
8637 op2
= gen_rtx_fmt_ee (comparison
, mode0
, op0
, op1
);
8638 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8645 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8648 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
8652 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8653 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8654 rtx op0
= expand_normal (arg0
);
8655 rtx op1
= expand_normal (arg1
);
8656 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8657 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8658 enum rtx_code comparison
= d
->comparison
;
8660 if (VECTOR_MODE_P (mode0
))
8661 op0
= safe_vector_operand (op0
, mode0
);
8662 if (VECTOR_MODE_P (mode1
))
8663 op1
= safe_vector_operand (op1
, mode1
);
8665 target
= gen_reg_rtx (SImode
);
8666 emit_move_insn (target
, const0_rtx
);
8667 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8669 if ((optimize
&& !register_operand (op0
, mode0
))
8670 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8671 op0
= copy_to_mode_reg (mode0
, op0
);
8672 if ((optimize
&& !register_operand (op1
, mode1
))
8673 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8674 op1
= copy_to_mode_reg (mode1
, op1
);
8676 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8680 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8681 gen_rtx_fmt_ee (comparison
, QImode
,
8685 return SUBREG_REG (target
);
8688 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8691 ix86_expand_sse_round (const struct builtin_description
*d
, tree exp
,
8695 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8696 rtx op1
, op0
= expand_normal (arg0
);
8697 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8698 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8700 if (optimize
|| target
== 0
8701 || GET_MODE (target
) != tmode
8702 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8703 target
= gen_reg_rtx (tmode
);
8705 if (VECTOR_MODE_P (mode0
))
8706 op0
= safe_vector_operand (op0
, mode0
);
8708 if ((optimize
&& !register_operand (op0
, mode0
))
8709 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8710 op0
= copy_to_mode_reg (mode0
, op0
);
8712 op1
= GEN_INT (d
->comparison
);
8714 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
);
8722 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
8723 tree exp
, rtx target
)
8726 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8727 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8728 rtx op0
= expand_normal (arg0
);
8729 rtx op1
= expand_normal (arg1
);
8731 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8732 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8733 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8735 if (optimize
|| target
== 0
8736 || GET_MODE (target
) != tmode
8737 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8738 target
= gen_reg_rtx (tmode
);
8740 op0
= safe_vector_operand (op0
, mode0
);
8741 op1
= safe_vector_operand (op1
, mode1
);
8743 if ((optimize
&& !register_operand (op0
, mode0
))
8744 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8745 op0
= copy_to_mode_reg (mode0
, op0
);
8746 if ((optimize
&& !register_operand (op1
, mode1
))
8747 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8748 op1
= copy_to_mode_reg (mode1
, op1
);
8750 op2
= GEN_INT (d
->comparison
);
8752 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8759 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8762 ix86_expand_sse_ptest (const struct builtin_description
*d
, tree exp
,
8766 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8767 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8768 rtx op0
= expand_normal (arg0
);
8769 rtx op1
= expand_normal (arg1
);
8770 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8771 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8772 enum rtx_code comparison
= d
->comparison
;
8774 if (VECTOR_MODE_P (mode0
))
8775 op0
= safe_vector_operand (op0
, mode0
);
8776 if (VECTOR_MODE_P (mode1
))
8777 op1
= safe_vector_operand (op1
, mode1
);
8779 target
= gen_reg_rtx (SImode
);
8780 emit_move_insn (target
, const0_rtx
);
8781 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8783 if ((optimize
&& !register_operand (op0
, mode0
))
8784 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8785 op0
= copy_to_mode_reg (mode0
, op0
);
8786 if ((optimize
&& !register_operand (op1
, mode1
))
8787 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8788 op1
= copy_to_mode_reg (mode1
, op1
);
8790 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8794 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8795 gen_rtx_fmt_ee (comparison
, QImode
,
8799 return SUBREG_REG (target
);
8802 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8805 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
8806 tree exp
, rtx target
)
8809 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8810 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8811 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8812 tree arg3
= CALL_EXPR_ARG (exp
, 3);
8813 tree arg4
= CALL_EXPR_ARG (exp
, 4);
8814 rtx scratch0
, scratch1
;
8815 rtx op0
= expand_normal (arg0
);
8816 rtx op1
= expand_normal (arg1
);
8817 rtx op2
= expand_normal (arg2
);
8818 rtx op3
= expand_normal (arg3
);
8819 rtx op4
= expand_normal (arg4
);
8820 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
8822 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8823 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8824 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8825 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
8826 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
8827 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
8828 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
8830 if (VECTOR_MODE_P (modev2
))
8831 op0
= safe_vector_operand (op0
, modev2
);
8832 if (VECTOR_MODE_P (modev4
))
8833 op2
= safe_vector_operand (op2
, modev4
);
8835 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8836 op0
= copy_to_mode_reg (modev2
, op0
);
8837 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
8838 op1
= copy_to_mode_reg (modei3
, op1
);
8839 if ((optimize
&& !register_operand (op2
, modev4
))
8840 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
8841 op2
= copy_to_mode_reg (modev4
, op2
);
8842 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
8843 op3
= copy_to_mode_reg (modei5
, op3
);
8845 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
8847 error ("the fifth argument must be an 8-bit immediate");
8851 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
8853 if (optimize
|| !target
8854 || GET_MODE (target
) != tmode0
8855 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8856 target
= gen_reg_rtx (tmode0
);
8858 scratch1
= gen_reg_rtx (tmode1
);
8860 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8862 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
8864 if (optimize
|| !target
8865 || GET_MODE (target
) != tmode1
8866 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8867 target
= gen_reg_rtx (tmode1
);
8869 scratch0
= gen_reg_rtx (tmode0
);
8871 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
8875 gcc_assert (d
->flag
);
8877 scratch0
= gen_reg_rtx (tmode0
);
8878 scratch1
= gen_reg_rtx (tmode1
);
8880 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8890 target
= gen_reg_rtx (SImode
);
8891 emit_move_insn (target
, const0_rtx
);
8892 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8895 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8896 gen_rtx_fmt_ee (EQ
, QImode
,
8897 gen_rtx_REG ((machine_mode
) d
->flag
,
8900 return SUBREG_REG (target
);
/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.

   D describes the builtin (icode, builtin code, and for the flag-reading
   variants the CC mode in D->flag), EXP is the CALL_EXPR, and TARGET is
   the suggested result location (may be NULL or unsuitable).  Returns the
   rtx holding the builtin's value.  */

static rtx
ix86_expand_sse_pcmpistr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  /* Query the modes the insn pattern requires: operands 0/1 are the two
     outputs (index and mask), 2/3 the vector inputs, 4 the immediate
     control byte.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  /* Force vector MEM operands into registers where required.  */
  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  /* The control operand must be an 8-bit immediate; it cannot be fixed
     up at run time.  */
  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
    {
      error ("the third argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
    {
      /* pcmpistri: the interesting result is the index (operand 0);
	 the mask output goes into a throw-away scratch.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
    }
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
    {
      /* pcmpistrm: the interesting result is the mask (operand 1);
	 the index output goes into a throw-away scratch.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
    }
  else
    {
      /* Flag-reading variants: both explicit outputs are dead; D->flag
	 identifies the CC mode of the flags register tested below.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Materialize the tested condition as a 0/1 value: zero a full
	 SImode register, then set only its low QImode part from the
	 comparison of the flags register against zero.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
9001 /* Fixup modeless constants to fit required mode. */
9004 fixup_modeless_constant (rtx x
, machine_mode mode
)
9006 if (GET_MODE (x
) == VOIDmode
)
9007 x
= convert_to_mode (mode
, x
, 1);
9011 /* Subroutine of ix86_expand_builtin to take care of insns with
9012 variable number of operands. */
9015 ix86_expand_args_builtin (const struct builtin_description
*d
,
9016 tree exp
, rtx target
)
9018 rtx pat
, real_target
;
9019 unsigned int i
, nargs
;
9020 unsigned int nargs_constant
= 0;
9021 unsigned int mask_pos
= 0;
9024 bool second_arg_count
= false;
9025 enum insn_code icode
= d
->icode
;
9026 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
9027 machine_mode tmode
= insn_p
->operand
[0].mode
;
9028 machine_mode rmode
= VOIDmode
;
9030 enum rtx_code comparison
= d
->comparison
;
9032 switch ((enum ix86_builtin_func_type
) d
->flag
)
9034 case V2DF_FTYPE_V2DF_ROUND
:
9035 case V4DF_FTYPE_V4DF_ROUND
:
9036 case V8DF_FTYPE_V8DF_ROUND
:
9037 case V4SF_FTYPE_V4SF_ROUND
:
9038 case V8SF_FTYPE_V8SF_ROUND
:
9039 case V16SF_FTYPE_V16SF_ROUND
:
9040 case V4SI_FTYPE_V4SF_ROUND
:
9041 case V8SI_FTYPE_V8SF_ROUND
:
9042 case V16SI_FTYPE_V16SF_ROUND
:
9043 return ix86_expand_sse_round (d
, exp
, target
);
9044 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
9045 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
9046 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
9047 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
9048 case INT_FTYPE_V8SF_V8SF_PTEST
:
9049 case INT_FTYPE_V4DI_V4DI_PTEST
:
9050 case INT_FTYPE_V4DF_V4DF_PTEST
:
9051 case INT_FTYPE_V4SF_V4SF_PTEST
:
9052 case INT_FTYPE_V2DI_V2DI_PTEST
:
9053 case INT_FTYPE_V2DF_V2DF_PTEST
:
9054 return ix86_expand_sse_ptest (d
, exp
, target
);
9055 case FLOAT128_FTYPE_FLOAT128
:
9056 case FLOAT_FTYPE_FLOAT
:
9058 case UINT_FTYPE_UINT
:
9059 case UINT16_FTYPE_UINT16
:
9060 case UINT64_FTYPE_INT
:
9061 case UINT64_FTYPE_UINT64
:
9062 case INT64_FTYPE_INT64
:
9063 case INT64_FTYPE_V4SF
:
9064 case INT64_FTYPE_V2DF
:
9065 case INT_FTYPE_V16QI
:
9066 case INT_FTYPE_V8QI
:
9067 case INT_FTYPE_V8SF
:
9068 case INT_FTYPE_V4DF
:
9069 case INT_FTYPE_V4SF
:
9070 case INT_FTYPE_V2DF
:
9071 case INT_FTYPE_V32QI
:
9072 case V16QI_FTYPE_V16QI
:
9073 case V8SI_FTYPE_V8SF
:
9074 case V8SI_FTYPE_V4SI
:
9075 case V8HI_FTYPE_V8HI
:
9076 case V8HI_FTYPE_V16QI
:
9077 case V8QI_FTYPE_V8QI
:
9078 case V8SF_FTYPE_V8SF
:
9079 case V8SF_FTYPE_V8SI
:
9080 case V8SF_FTYPE_V4SF
:
9081 case V8SF_FTYPE_V8HI
:
9082 case V4SI_FTYPE_V4SI
:
9083 case V4SI_FTYPE_V16QI
:
9084 case V4SI_FTYPE_V4SF
:
9085 case V4SI_FTYPE_V8SI
:
9086 case V4SI_FTYPE_V8HI
:
9087 case V4SI_FTYPE_V4DF
:
9088 case V4SI_FTYPE_V2DF
:
9089 case V4HI_FTYPE_V4HI
:
9090 case V4DF_FTYPE_V4DF
:
9091 case V4DF_FTYPE_V4SI
:
9092 case V4DF_FTYPE_V4SF
:
9093 case V4DF_FTYPE_V2DF
:
9094 case V4SF_FTYPE_V4SF
:
9095 case V4SF_FTYPE_V4SI
:
9096 case V4SF_FTYPE_V8SF
:
9097 case V4SF_FTYPE_V4DF
:
9098 case V4SF_FTYPE_V8HI
:
9099 case V4SF_FTYPE_V2DF
:
9100 case V2DI_FTYPE_V2DI
:
9101 case V2DI_FTYPE_V16QI
:
9102 case V2DI_FTYPE_V8HI
:
9103 case V2DI_FTYPE_V4SI
:
9104 case V2DF_FTYPE_V2DF
:
9105 case V2DF_FTYPE_V4SI
:
9106 case V2DF_FTYPE_V4DF
:
9107 case V2DF_FTYPE_V4SF
:
9108 case V2DF_FTYPE_V2SI
:
9109 case V2SI_FTYPE_V2SI
:
9110 case V2SI_FTYPE_V4SF
:
9111 case V2SI_FTYPE_V2SF
:
9112 case V2SI_FTYPE_V2DF
:
9113 case V2SF_FTYPE_V2SF
:
9114 case V2SF_FTYPE_V2SI
:
9115 case V32QI_FTYPE_V32QI
:
9116 case V32QI_FTYPE_V16QI
:
9117 case V16HI_FTYPE_V16HI
:
9118 case V16HI_FTYPE_V8HI
:
9119 case V8SI_FTYPE_V8SI
:
9120 case V16HI_FTYPE_V16QI
:
9121 case V8SI_FTYPE_V16QI
:
9122 case V4DI_FTYPE_V16QI
:
9123 case V8SI_FTYPE_V8HI
:
9124 case V4DI_FTYPE_V8HI
:
9125 case V4DI_FTYPE_V4SI
:
9126 case V4DI_FTYPE_V2DI
:
9133 case UHI_FTYPE_V16QI
:
9134 case USI_FTYPE_V32QI
:
9135 case UDI_FTYPE_V64QI
:
9136 case V16QI_FTYPE_UHI
:
9137 case V32QI_FTYPE_USI
:
9138 case V64QI_FTYPE_UDI
:
9139 case V8HI_FTYPE_UQI
:
9140 case V16HI_FTYPE_UHI
:
9141 case V32HI_FTYPE_USI
:
9142 case V4SI_FTYPE_UQI
:
9143 case V8SI_FTYPE_UQI
:
9144 case V4SI_FTYPE_UHI
:
9145 case V8SI_FTYPE_UHI
:
9146 case UQI_FTYPE_V8HI
:
9147 case UHI_FTYPE_V16HI
:
9148 case USI_FTYPE_V32HI
:
9149 case UQI_FTYPE_V4SI
:
9150 case UQI_FTYPE_V8SI
:
9151 case UHI_FTYPE_V16SI
:
9152 case UQI_FTYPE_V2DI
:
9153 case UQI_FTYPE_V4DI
:
9154 case UQI_FTYPE_V8DI
:
9155 case V16SI_FTYPE_UHI
:
9156 case V2DI_FTYPE_UQI
:
9157 case V4DI_FTYPE_UQI
:
9158 case V16SI_FTYPE_INT
:
9159 case V16SF_FTYPE_V8SF
:
9160 case V16SI_FTYPE_V8SI
:
9161 case V16SF_FTYPE_V4SF
:
9162 case V16SI_FTYPE_V4SI
:
9163 case V16SI_FTYPE_V16SF
:
9164 case V16SI_FTYPE_V16SI
:
9165 case V64QI_FTYPE_V64QI
:
9166 case V32HI_FTYPE_V32HI
:
9167 case V16SF_FTYPE_V16SF
:
9168 case V8DI_FTYPE_UQI
:
9169 case V8DI_FTYPE_V8DI
:
9170 case V8DF_FTYPE_V4DF
:
9171 case V8DF_FTYPE_V2DF
:
9172 case V8DF_FTYPE_V8DF
:
9173 case V4DI_FTYPE_V4DI
:
9174 case V16HI_FTYPE_V16SF
:
9175 case V8HI_FTYPE_V8SF
:
9176 case V8HI_FTYPE_V4SF
:
9179 case V4SF_FTYPE_V4SF_VEC_MERGE
:
9180 case V2DF_FTYPE_V2DF_VEC_MERGE
:
9181 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
9182 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
9183 case V16QI_FTYPE_V16QI_V16QI
:
9184 case V16QI_FTYPE_V8HI_V8HI
:
9185 case V16SF_FTYPE_V16SF_V16SF
:
9186 case V8QI_FTYPE_V8QI_V8QI
:
9187 case V8QI_FTYPE_V4HI_V4HI
:
9188 case V8HI_FTYPE_V8HI_V8HI
:
9189 case V8HI_FTYPE_V16QI_V16QI
:
9190 case V8HI_FTYPE_V4SI_V4SI
:
9191 case V8SF_FTYPE_V8SF_V8SF
:
9192 case V8SF_FTYPE_V8SF_V8SI
:
9193 case V8DF_FTYPE_V8DF_V8DF
:
9194 case V4SI_FTYPE_V4SI_V4SI
:
9195 case V4SI_FTYPE_V8HI_V8HI
:
9196 case V4SI_FTYPE_V2DF_V2DF
:
9197 case V4HI_FTYPE_V4HI_V4HI
:
9198 case V4HI_FTYPE_V8QI_V8QI
:
9199 case V4HI_FTYPE_V2SI_V2SI
:
9200 case V4DF_FTYPE_V4DF_V4DF
:
9201 case V4DF_FTYPE_V4DF_V4DI
:
9202 case V4SF_FTYPE_V4SF_V4SF
:
9203 case V4SF_FTYPE_V4SF_V4SI
:
9204 case V4SF_FTYPE_V4SF_V2SI
:
9205 case V4SF_FTYPE_V4SF_V2DF
:
9206 case V4SF_FTYPE_V4SF_UINT
:
9207 case V4SF_FTYPE_V4SF_DI
:
9208 case V4SF_FTYPE_V4SF_SI
:
9209 case V2DI_FTYPE_V2DI_V2DI
:
9210 case V2DI_FTYPE_V16QI_V16QI
:
9211 case V2DI_FTYPE_V4SI_V4SI
:
9212 case V2DI_FTYPE_V2DI_V16QI
:
9213 case V2SI_FTYPE_V2SI_V2SI
:
9214 case V2SI_FTYPE_V4HI_V4HI
:
9215 case V2SI_FTYPE_V2SF_V2SF
:
9216 case V2DF_FTYPE_V2DF_V2DF
:
9217 case V2DF_FTYPE_V2DF_V4SF
:
9218 case V2DF_FTYPE_V2DF_V2DI
:
9219 case V2DF_FTYPE_V2DF_DI
:
9220 case V2DF_FTYPE_V2DF_SI
:
9221 case V2DF_FTYPE_V2DF_UINT
:
9222 case V2SF_FTYPE_V2SF_V2SF
:
9223 case V1DI_FTYPE_V1DI_V1DI
:
9224 case V1DI_FTYPE_V8QI_V8QI
:
9225 case V1DI_FTYPE_V2SI_V2SI
:
9226 case V32QI_FTYPE_V16HI_V16HI
:
9227 case V16HI_FTYPE_V8SI_V8SI
:
9228 case V64QI_FTYPE_V64QI_V64QI
:
9229 case V32QI_FTYPE_V32QI_V32QI
:
9230 case V16HI_FTYPE_V32QI_V32QI
:
9231 case V16HI_FTYPE_V16HI_V16HI
:
9232 case V8SI_FTYPE_V4DF_V4DF
:
9233 case V8SI_FTYPE_V8SI_V8SI
:
9234 case V8SI_FTYPE_V16HI_V16HI
:
9235 case V4DI_FTYPE_V4DI_V4DI
:
9236 case V4DI_FTYPE_V8SI_V8SI
:
9237 case V8DI_FTYPE_V64QI_V64QI
:
9238 if (comparison
== UNKNOWN
)
9239 return ix86_expand_binop_builtin (icode
, exp
, target
);
9242 case V4SF_FTYPE_V4SF_V4SF_SWAP
:
9243 case V2DF_FTYPE_V2DF_V2DF_SWAP
:
9244 gcc_assert (comparison
!= UNKNOWN
);
9248 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
9249 case V16HI_FTYPE_V16HI_SI_COUNT
:
9250 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
9251 case V8SI_FTYPE_V8SI_SI_COUNT
:
9252 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
9253 case V4DI_FTYPE_V4DI_INT_COUNT
:
9254 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
9255 case V8HI_FTYPE_V8HI_SI_COUNT
:
9256 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
9257 case V4SI_FTYPE_V4SI_SI_COUNT
:
9258 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
9259 case V4HI_FTYPE_V4HI_SI_COUNT
:
9260 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
9261 case V2DI_FTYPE_V2DI_SI_COUNT
:
9262 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
9263 case V2SI_FTYPE_V2SI_SI_COUNT
:
9264 case V1DI_FTYPE_V1DI_V1DI_COUNT
:
9265 case V1DI_FTYPE_V1DI_SI_COUNT
:
9267 second_arg_count
= true;
9269 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
9270 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
9271 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
9272 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
9273 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
9274 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
9275 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
9276 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
9277 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
9278 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
9279 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
9280 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
9281 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
9282 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
9283 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
9284 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
9285 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
9286 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
9288 second_arg_count
= true;
9290 case UINT64_FTYPE_UINT64_UINT64
:
9291 case UINT_FTYPE_UINT_UINT
:
9292 case UINT_FTYPE_UINT_USHORT
:
9293 case UINT_FTYPE_UINT_UCHAR
:
9294 case UINT16_FTYPE_UINT16_INT
:
9295 case UINT8_FTYPE_UINT8_INT
:
9296 case UQI_FTYPE_UQI_UQI
:
9297 case UHI_FTYPE_UHI_UHI
:
9298 case USI_FTYPE_USI_USI
:
9299 case UDI_FTYPE_UDI_UDI
:
9300 case V16SI_FTYPE_V8DF_V8DF
:
9301 case V32HI_FTYPE_V16SF_V16SF
:
9302 case V16HI_FTYPE_V8SF_V8SF
:
9303 case V8HI_FTYPE_V4SF_V4SF
:
9304 case V16HI_FTYPE_V16SF_UHI
:
9305 case V8HI_FTYPE_V8SF_UQI
:
9306 case V8HI_FTYPE_V4SF_UQI
:
9309 case V2DI_FTYPE_V2DI_INT_CONVERT
:
9314 case V4DI_FTYPE_V4DI_INT_CONVERT
:
9319 case V8DI_FTYPE_V8DI_INT_CONVERT
:
9324 case V8HI_FTYPE_V8HI_INT
:
9325 case V8HI_FTYPE_V8SF_INT
:
9326 case V16HI_FTYPE_V16SF_INT
:
9327 case V8HI_FTYPE_V4SF_INT
:
9328 case V8SF_FTYPE_V8SF_INT
:
9329 case V4SF_FTYPE_V16SF_INT
:
9330 case V16SF_FTYPE_V16SF_INT
:
9331 case V4SI_FTYPE_V4SI_INT
:
9332 case V4SI_FTYPE_V8SI_INT
:
9333 case V4HI_FTYPE_V4HI_INT
:
9334 case V4DF_FTYPE_V4DF_INT
:
9335 case V4DF_FTYPE_V8DF_INT
:
9336 case V4SF_FTYPE_V4SF_INT
:
9337 case V4SF_FTYPE_V8SF_INT
:
9338 case V2DI_FTYPE_V2DI_INT
:
9339 case V2DF_FTYPE_V2DF_INT
:
9340 case V2DF_FTYPE_V4DF_INT
:
9341 case V16HI_FTYPE_V16HI_INT
:
9342 case V8SI_FTYPE_V8SI_INT
:
9343 case V16SI_FTYPE_V16SI_INT
:
9344 case V4SI_FTYPE_V16SI_INT
:
9345 case V4DI_FTYPE_V4DI_INT
:
9346 case V2DI_FTYPE_V4DI_INT
:
9347 case V4DI_FTYPE_V8DI_INT
:
9348 case UQI_FTYPE_UQI_UQI_CONST
:
9349 case UHI_FTYPE_UHI_UQI
:
9350 case USI_FTYPE_USI_UQI
:
9351 case UDI_FTYPE_UDI_UQI
:
9355 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
9356 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
9357 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
9358 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
9359 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
9360 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
9361 case UHI_FTYPE_V16SI_V16SI_UHI
:
9362 case UQI_FTYPE_V8DI_V8DI_UQI
:
9363 case V16HI_FTYPE_V16SI_V16HI_UHI
:
9364 case V16QI_FTYPE_V16SI_V16QI_UHI
:
9365 case V16QI_FTYPE_V8DI_V16QI_UQI
:
9366 case V16SF_FTYPE_V16SF_V16SF_UHI
:
9367 case V16SF_FTYPE_V4SF_V16SF_UHI
:
9368 case V16SI_FTYPE_SI_V16SI_UHI
:
9369 case V16SI_FTYPE_V16HI_V16SI_UHI
:
9370 case V16SI_FTYPE_V16QI_V16SI_UHI
:
9371 case V8SF_FTYPE_V4SF_V8SF_UQI
:
9372 case V4DF_FTYPE_V2DF_V4DF_UQI
:
9373 case V8SI_FTYPE_V4SI_V8SI_UQI
:
9374 case V8SI_FTYPE_SI_V8SI_UQI
:
9375 case V4SI_FTYPE_V4SI_V4SI_UQI
:
9376 case V4SI_FTYPE_SI_V4SI_UQI
:
9377 case V4DI_FTYPE_V2DI_V4DI_UQI
:
9378 case V4DI_FTYPE_DI_V4DI_UQI
:
9379 case V2DI_FTYPE_V2DI_V2DI_UQI
:
9380 case V2DI_FTYPE_DI_V2DI_UQI
:
9381 case V64QI_FTYPE_V64QI_V64QI_UDI
:
9382 case V64QI_FTYPE_V16QI_V64QI_UDI
:
9383 case V64QI_FTYPE_QI_V64QI_UDI
:
9384 case V32QI_FTYPE_V32QI_V32QI_USI
:
9385 case V32QI_FTYPE_V16QI_V32QI_USI
:
9386 case V32QI_FTYPE_QI_V32QI_USI
:
9387 case V16QI_FTYPE_V16QI_V16QI_UHI
:
9388 case V16QI_FTYPE_QI_V16QI_UHI
:
9389 case V32HI_FTYPE_V8HI_V32HI_USI
:
9390 case V32HI_FTYPE_HI_V32HI_USI
:
9391 case V16HI_FTYPE_V8HI_V16HI_UHI
:
9392 case V16HI_FTYPE_HI_V16HI_UHI
:
9393 case V8HI_FTYPE_V8HI_V8HI_UQI
:
9394 case V8HI_FTYPE_HI_V8HI_UQI
:
9395 case V8SF_FTYPE_V8HI_V8SF_UQI
:
9396 case V4SF_FTYPE_V8HI_V4SF_UQI
:
9397 case V8SI_FTYPE_V8SF_V8SI_UQI
:
9398 case V4SI_FTYPE_V4SF_V4SI_UQI
:
9399 case V4DI_FTYPE_V4SF_V4DI_UQI
:
9400 case V2DI_FTYPE_V4SF_V2DI_UQI
:
9401 case V4SF_FTYPE_V4DI_V4SF_UQI
:
9402 case V4SF_FTYPE_V2DI_V4SF_UQI
:
9403 case V4DF_FTYPE_V4DI_V4DF_UQI
:
9404 case V2DF_FTYPE_V2DI_V2DF_UQI
:
9405 case V16QI_FTYPE_V8HI_V16QI_UQI
:
9406 case V16QI_FTYPE_V16HI_V16QI_UHI
:
9407 case V16QI_FTYPE_V4SI_V16QI_UQI
:
9408 case V16QI_FTYPE_V8SI_V16QI_UQI
:
9409 case V8HI_FTYPE_V4SI_V8HI_UQI
:
9410 case V8HI_FTYPE_V8SI_V8HI_UQI
:
9411 case V16QI_FTYPE_V2DI_V16QI_UQI
:
9412 case V16QI_FTYPE_V4DI_V16QI_UQI
:
9413 case V8HI_FTYPE_V2DI_V8HI_UQI
:
9414 case V8HI_FTYPE_V4DI_V8HI_UQI
:
9415 case V4SI_FTYPE_V2DI_V4SI_UQI
:
9416 case V4SI_FTYPE_V4DI_V4SI_UQI
:
9417 case V32QI_FTYPE_V32HI_V32QI_USI
:
9418 case UHI_FTYPE_V16QI_V16QI_UHI
:
9419 case USI_FTYPE_V32QI_V32QI_USI
:
9420 case UDI_FTYPE_V64QI_V64QI_UDI
:
9421 case UQI_FTYPE_V8HI_V8HI_UQI
:
9422 case UHI_FTYPE_V16HI_V16HI_UHI
:
9423 case USI_FTYPE_V32HI_V32HI_USI
:
9424 case UQI_FTYPE_V4SI_V4SI_UQI
:
9425 case UQI_FTYPE_V8SI_V8SI_UQI
:
9426 case UQI_FTYPE_V2DI_V2DI_UQI
:
9427 case UQI_FTYPE_V4DI_V4DI_UQI
:
9428 case V4SF_FTYPE_V2DF_V4SF_UQI
:
9429 case V4SF_FTYPE_V4DF_V4SF_UQI
:
9430 case V16SI_FTYPE_V16SI_V16SI_UHI
:
9431 case V16SI_FTYPE_V4SI_V16SI_UHI
:
9432 case V2DI_FTYPE_V4SI_V2DI_UQI
:
9433 case V2DI_FTYPE_V8HI_V2DI_UQI
:
9434 case V2DI_FTYPE_V16QI_V2DI_UQI
:
9435 case V4DI_FTYPE_V4DI_V4DI_UQI
:
9436 case V4DI_FTYPE_V4SI_V4DI_UQI
:
9437 case V4DI_FTYPE_V8HI_V4DI_UQI
:
9438 case V4DI_FTYPE_V16QI_V4DI_UQI
:
9439 case V4DI_FTYPE_V4DF_V4DI_UQI
:
9440 case V2DI_FTYPE_V2DF_V2DI_UQI
:
9441 case V4SI_FTYPE_V4DF_V4SI_UQI
:
9442 case V4SI_FTYPE_V2DF_V4SI_UQI
:
9443 case V4SI_FTYPE_V8HI_V4SI_UQI
:
9444 case V4SI_FTYPE_V16QI_V4SI_UQI
:
9445 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
9446 case V8DF_FTYPE_V2DF_V8DF_UQI
:
9447 case V8DF_FTYPE_V4DF_V8DF_UQI
:
9448 case V8DF_FTYPE_V8DF_V8DF_UQI
:
9449 case V8SF_FTYPE_V8SF_V8SF_UQI
:
9450 case V8SF_FTYPE_V8SI_V8SF_UQI
:
9451 case V4DF_FTYPE_V4DF_V4DF_UQI
:
9452 case V4SF_FTYPE_V4SF_V4SF_UQI
:
9453 case V2DF_FTYPE_V2DF_V2DF_UQI
:
9454 case V2DF_FTYPE_V4SF_V2DF_UQI
:
9455 case V2DF_FTYPE_V4SI_V2DF_UQI
:
9456 case V4SF_FTYPE_V4SI_V4SF_UQI
:
9457 case V4DF_FTYPE_V4SF_V4DF_UQI
:
9458 case V4DF_FTYPE_V4SI_V4DF_UQI
:
9459 case V8SI_FTYPE_V8SI_V8SI_UQI
:
9460 case V8SI_FTYPE_V8HI_V8SI_UQI
:
9461 case V8SI_FTYPE_V16QI_V8SI_UQI
:
9462 case V8DF_FTYPE_V8SI_V8DF_UQI
:
9463 case V8DI_FTYPE_DI_V8DI_UQI
:
9464 case V16SF_FTYPE_V8SF_V16SF_UHI
:
9465 case V16SI_FTYPE_V8SI_V16SI_UHI
:
9466 case V16HI_FTYPE_V16HI_V16HI_UHI
:
9467 case V8HI_FTYPE_V16QI_V8HI_UQI
:
9468 case V16HI_FTYPE_V16QI_V16HI_UHI
:
9469 case V32HI_FTYPE_V32HI_V32HI_USI
:
9470 case V32HI_FTYPE_V32QI_V32HI_USI
:
9471 case V8DI_FTYPE_V16QI_V8DI_UQI
:
9472 case V8DI_FTYPE_V2DI_V8DI_UQI
:
9473 case V8DI_FTYPE_V4DI_V8DI_UQI
:
9474 case V8DI_FTYPE_V8DI_V8DI_UQI
:
9475 case V8DI_FTYPE_V8HI_V8DI_UQI
:
9476 case V8DI_FTYPE_V8SI_V8DI_UQI
:
9477 case V8HI_FTYPE_V8DI_V8HI_UQI
:
9478 case V8SI_FTYPE_V8DI_V8SI_UQI
:
9479 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
9480 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
9481 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
9482 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
9483 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
9484 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
9485 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
9486 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
9487 case V32HI_FTYPE_V16SF_V16SF_USI
:
9488 case V16HI_FTYPE_V8SF_V8SF_UHI
:
9489 case V8HI_FTYPE_V4SF_V4SF_UQI
:
9490 case V16HI_FTYPE_V16SF_V16HI_UHI
:
9491 case V8HI_FTYPE_V8SF_V8HI_UQI
:
9492 case V8HI_FTYPE_V4SF_V8HI_UQI
:
9493 case V16SF_FTYPE_V16SF_V32HI_V32HI
:
9494 case V8SF_FTYPE_V8SF_V16HI_V16HI
:
9495 case V4SF_FTYPE_V4SF_V8HI_V8HI
:
9498 case V32QI_FTYPE_V32QI_V32QI_INT
:
9499 case V16HI_FTYPE_V16HI_V16HI_INT
:
9500 case V16QI_FTYPE_V16QI_V16QI_INT
:
9501 case V4DI_FTYPE_V4DI_V4DI_INT
:
9502 case V8HI_FTYPE_V8HI_V8HI_INT
:
9503 case V8SI_FTYPE_V8SI_V8SI_INT
:
9504 case V8SI_FTYPE_V8SI_V4SI_INT
:
9505 case V8SF_FTYPE_V8SF_V8SF_INT
:
9506 case V8SF_FTYPE_V8SF_V4SF_INT
:
9507 case V4SI_FTYPE_V4SI_V4SI_INT
:
9508 case V4DF_FTYPE_V4DF_V4DF_INT
:
9509 case V16SF_FTYPE_V16SF_V16SF_INT
:
9510 case V16SF_FTYPE_V16SF_V4SF_INT
:
9511 case V16SI_FTYPE_V16SI_V4SI_INT
:
9512 case V4DF_FTYPE_V4DF_V2DF_INT
:
9513 case V4SF_FTYPE_V4SF_V4SF_INT
:
9514 case V2DI_FTYPE_V2DI_V2DI_INT
:
9515 case V4DI_FTYPE_V4DI_V2DI_INT
:
9516 case V2DF_FTYPE_V2DF_V2DF_INT
:
9517 case UQI_FTYPE_V8DI_V8UDI_INT
:
9518 case UQI_FTYPE_V8DF_V8DF_INT
:
9519 case UQI_FTYPE_V2DF_V2DF_INT
:
9520 case UQI_FTYPE_V4SF_V4SF_INT
:
9521 case UHI_FTYPE_V16SI_V16SI_INT
:
9522 case UHI_FTYPE_V16SF_V16SF_INT
:
9523 case V64QI_FTYPE_V64QI_V64QI_INT
:
9524 case V32HI_FTYPE_V32HI_V32HI_INT
:
9525 case V16SI_FTYPE_V16SI_V16SI_INT
:
9526 case V8DI_FTYPE_V8DI_V8DI_INT
:
9530 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
9535 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
9540 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
9545 case V2DI_FTYPE_V2DI_UINT_UINT
:
9549 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
9554 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
9560 case QI_FTYPE_V8DF_INT_UQI
:
9561 case QI_FTYPE_V4DF_INT_UQI
:
9562 case QI_FTYPE_V2DF_INT_UQI
:
9563 case HI_FTYPE_V16SF_INT_UHI
:
9564 case QI_FTYPE_V8SF_INT_UQI
:
9565 case QI_FTYPE_V4SF_INT_UQI
:
9566 case V4SI_FTYPE_V4SI_V4SI_UHI
:
9567 case V8SI_FTYPE_V8SI_V8SI_UHI
:
9572 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
9578 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
9584 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
9585 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
9586 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
9587 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
9588 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
9589 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
9590 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
9591 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
9592 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
9593 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
9594 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
9595 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
9596 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
9597 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
9598 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
9599 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
9600 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
9601 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
9602 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
9603 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
9604 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
9605 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
9606 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
9607 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
9608 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
9609 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
9610 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
9611 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
9612 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
9613 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
9614 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
9615 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
9616 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
9617 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
9618 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
9619 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
9620 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
9621 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
9622 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
9623 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
9624 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
9625 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
9626 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
9627 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
9628 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
9629 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
9630 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
9631 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
9632 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
9633 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
9634 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
9635 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI
:
9636 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI
:
9637 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI
:
9640 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
9641 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
9642 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
9643 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
9644 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
9648 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
9649 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
9650 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
9651 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
9652 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
9653 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
9654 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
9655 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
9656 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
9657 case USI_FTYPE_V32QI_V32QI_INT_USI
:
9658 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
9659 case USI_FTYPE_V32HI_V32HI_INT_USI
:
9660 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
9661 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
9666 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
9670 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
9671 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
9672 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI
:
9673 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI
:
9674 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI
:
9677 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
9678 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
9683 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
9684 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
9685 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
9686 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
9687 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
9688 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
9689 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
9690 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
9691 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
9692 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
9693 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
9694 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
9695 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
9696 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
9697 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
9698 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
9699 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
9700 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
9701 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
9702 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
9703 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
9704 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
9705 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
9706 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
9707 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
9708 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
9709 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
9710 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
9711 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
9712 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
9717 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
9718 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
9719 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
9720 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
9721 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
9722 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
9723 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
9724 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
9725 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
9726 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
9727 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
9728 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
9729 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
9730 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
9731 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
9732 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
9733 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
9734 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
9735 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
9736 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
9737 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
9738 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
9739 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
9740 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
9741 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
9742 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
9743 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
9748 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
9749 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
9750 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
9751 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
9752 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
9753 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
9754 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
9755 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
9756 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
9757 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
9762 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
9763 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
9764 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
9765 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
9766 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
9767 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
9768 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
9769 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
9770 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
9771 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
9772 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
9773 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
9783 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
9785 if (comparison
!= UNKNOWN
)
9787 gcc_assert (nargs
== 2);
9788 return ix86_expand_sse_compare (d
, exp
, target
, swap
);
9791 if (rmode
== VOIDmode
|| rmode
== tmode
)
9795 || GET_MODE (target
) != tmode
9796 || !insn_p
->operand
[0].predicate (target
, tmode
))
9797 target
= gen_reg_rtx (tmode
);
9798 else if (memory_operand (target
, tmode
))
9800 real_target
= target
;
9804 real_target
= gen_reg_rtx (tmode
);
9805 target
= lowpart_subreg (rmode
, real_target
, tmode
);
9808 for (i
= 0; i
< nargs
; i
++)
9810 tree arg
= CALL_EXPR_ARG (exp
, i
);
9811 rtx op
= expand_normal (arg
);
9812 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
9813 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
9815 if (second_arg_count
&& i
== 1)
9817 /* SIMD shift insns take either an 8-bit immediate or
9818 register as count. But builtin functions take int as
9819 count. If count doesn't match, we put it in register.
9820 The instructions are using 64-bit count, if op is just
9821 32-bit, zero-extend it, as negative shift counts
9822 are undefined behavior and zero-extension is more
9826 if (SCALAR_INT_MODE_P (GET_MODE (op
)))
9827 op
= convert_modes (mode
, GET_MODE (op
), op
, 1);
9829 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
9830 if (!insn_p
->operand
[i
+ 1].predicate (op
, mode
))
9831 op
= copy_to_reg (op
);
9834 else if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
9835 (!mask_pos
&& (nargs
- i
) <= nargs_constant
))
9840 case CODE_FOR_avx_vinsertf128v4di
:
9841 case CODE_FOR_avx_vextractf128v4di
:
9842 error ("the last argument must be an 1-bit immediate");
9845 case CODE_FOR_avx512f_cmpv8di3_mask
:
9846 case CODE_FOR_avx512f_cmpv16si3_mask
:
9847 case CODE_FOR_avx512f_ucmpv8di3_mask
:
9848 case CODE_FOR_avx512f_ucmpv16si3_mask
:
9849 case CODE_FOR_avx512vl_cmpv4di3_mask
:
9850 case CODE_FOR_avx512vl_cmpv8si3_mask
:
9851 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
9852 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
9853 case CODE_FOR_avx512vl_cmpv2di3_mask
:
9854 case CODE_FOR_avx512vl_cmpv4si3_mask
:
9855 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
9856 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
9857 error ("the last argument must be a 3-bit immediate");
9860 case CODE_FOR_sse4_1_roundsd
:
9861 case CODE_FOR_sse4_1_roundss
:
9863 case CODE_FOR_sse4_1_roundpd
:
9864 case CODE_FOR_sse4_1_roundps
:
9865 case CODE_FOR_avx_roundpd256
:
9866 case CODE_FOR_avx_roundps256
:
9868 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
9869 case CODE_FOR_sse4_1_roundps_sfix
:
9870 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
9871 case CODE_FOR_avx_roundps_sfix256
:
9873 case CODE_FOR_sse4_1_blendps
:
9874 case CODE_FOR_avx_blendpd256
:
9875 case CODE_FOR_avx_vpermilv4df
:
9876 case CODE_FOR_avx_vpermilv4df_mask
:
9877 case CODE_FOR_avx512f_getmantv8df_mask
:
9878 case CODE_FOR_avx512f_getmantv16sf_mask
:
9879 case CODE_FOR_avx512vl_getmantv8sf_mask
:
9880 case CODE_FOR_avx512vl_getmantv4df_mask
:
9881 case CODE_FOR_avx512vl_getmantv4sf_mask
:
9882 case CODE_FOR_avx512vl_getmantv2df_mask
:
9883 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
9884 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
9885 case CODE_FOR_avx512dq_rangepv4df_mask
:
9886 case CODE_FOR_avx512dq_rangepv8sf_mask
:
9887 case CODE_FOR_avx512dq_rangepv2df_mask
:
9888 case CODE_FOR_avx512dq_rangepv4sf_mask
:
9889 case CODE_FOR_avx_shufpd256_mask
:
9890 error ("the last argument must be a 4-bit immediate");
9893 case CODE_FOR_sha1rnds4
:
9894 case CODE_FOR_sse4_1_blendpd
:
9895 case CODE_FOR_avx_vpermilv2df
:
9896 case CODE_FOR_avx_vpermilv2df_mask
:
9897 case CODE_FOR_xop_vpermil2v2df3
:
9898 case CODE_FOR_xop_vpermil2v4sf3
:
9899 case CODE_FOR_xop_vpermil2v4df3
:
9900 case CODE_FOR_xop_vpermil2v8sf3
:
9901 case CODE_FOR_avx512f_vinsertf32x4_mask
:
9902 case CODE_FOR_avx512f_vinserti32x4_mask
:
9903 case CODE_FOR_avx512f_vextractf32x4_mask
:
9904 case CODE_FOR_avx512f_vextracti32x4_mask
:
9905 case CODE_FOR_sse2_shufpd
:
9906 case CODE_FOR_sse2_shufpd_mask
:
9907 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
9908 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
9909 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
9910 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
9911 error ("the last argument must be a 2-bit immediate");
9914 case CODE_FOR_avx_vextractf128v4df
:
9915 case CODE_FOR_avx_vextractf128v8sf
:
9916 case CODE_FOR_avx_vextractf128v8si
:
9917 case CODE_FOR_avx_vinsertf128v4df
:
9918 case CODE_FOR_avx_vinsertf128v8sf
:
9919 case CODE_FOR_avx_vinsertf128v8si
:
9920 case CODE_FOR_avx512f_vinsertf64x4_mask
:
9921 case CODE_FOR_avx512f_vinserti64x4_mask
:
9922 case CODE_FOR_avx512f_vextractf64x4_mask
:
9923 case CODE_FOR_avx512f_vextracti64x4_mask
:
9924 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
9925 case CODE_FOR_avx512dq_vinserti32x8_mask
:
9926 case CODE_FOR_avx512vl_vinsertv4df
:
9927 case CODE_FOR_avx512vl_vinsertv4di
:
9928 case CODE_FOR_avx512vl_vinsertv8sf
:
9929 case CODE_FOR_avx512vl_vinsertv8si
:
9930 error ("the last argument must be a 1-bit immediate");
9933 case CODE_FOR_avx_vmcmpv2df3
:
9934 case CODE_FOR_avx_vmcmpv4sf3
:
9935 case CODE_FOR_avx_cmpv2df3
:
9936 case CODE_FOR_avx_cmpv4sf3
:
9937 case CODE_FOR_avx_cmpv4df3
:
9938 case CODE_FOR_avx_cmpv8sf3
:
9939 case CODE_FOR_avx512f_cmpv8df3_mask
:
9940 case CODE_FOR_avx512f_cmpv16sf3_mask
:
9941 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
9942 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
9943 error ("the last argument must be a 5-bit immediate");
9947 switch (nargs_constant
)
9950 if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
9951 (!mask_pos
&& (nargs
- i
) == nargs_constant
))
9953 error ("the next to last argument must be an 8-bit immediate");
9958 error ("the last argument must be an 8-bit immediate");
9968 if (VECTOR_MODE_P (mode
))
9969 op
= safe_vector_operand (op
, mode
);
9971 /* If we aren't optimizing, only allow one memory operand to
9973 if (memory_operand (op
, mode
))
9976 op
= fixup_modeless_constant (op
, mode
);
9978 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
9980 if (optimize
|| !match
|| num_memory
> 1)
9981 op
= copy_to_mode_reg (mode
, op
);
9985 op
= copy_to_reg (op
);
9986 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
9996 pat
= GEN_FCN (icode
) (real_target
, xops
[0]);
9999 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1]);
10002 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1], xops
[2]);
10005 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
10009 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
10010 xops
[2], xops
[3], xops
[4]);
10013 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
10014 xops
[2], xops
[3], xops
[4], xops
[5]);
10017 gcc_unreachable ();
10027 /* Transform pattern of following layout:
10029 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
10035 ix86_erase_embedded_rounding (rtx pat
)
10037 if (GET_CODE (pat
) == INSN
)
10038 pat
= PATTERN (pat
);
10040 gcc_assert (GET_CODE (pat
) == SET
);
10041 rtx src
= SET_SRC (pat
);
10042 gcc_assert (XVECLEN (src
, 0) == 2);
10043 rtx p0
= XVECEXP (src
, 0, 0);
10044 gcc_assert (GET_CODE (src
) == UNSPEC
10045 && XINT (src
, 1) == UNSPEC_EMBEDDED_ROUNDING
);
10046 rtx res
= gen_rtx_SET (SET_DEST (pat
), p0
);
10050 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
10053 ix86_expand_sse_comi_round (const struct builtin_description
*d
,
10054 tree exp
, rtx target
)
10057 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10058 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10059 tree arg2
= CALL_EXPR_ARG (exp
, 2);
10060 tree arg3
= CALL_EXPR_ARG (exp
, 3);
10061 rtx op0
= expand_normal (arg0
);
10062 rtx op1
= expand_normal (arg1
);
10063 rtx op2
= expand_normal (arg2
);
10064 rtx op3
= expand_normal (arg3
);
10065 enum insn_code icode
= d
->icode
;
10066 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10067 machine_mode mode0
= insn_p
->operand
[0].mode
;
10068 machine_mode mode1
= insn_p
->operand
[1].mode
;
10070 /* See avxintrin.h for values. */
10071 static const enum rtx_code comparisons
[32] =
10073 EQ
, LT
, LE
, UNORDERED
, NE
, UNGE
, UNGT
, ORDERED
,
10074 UNEQ
, UNLT
, UNLE
, UNORDERED
, LTGT
, GE
, GT
, ORDERED
,
10075 EQ
, LT
, LE
, UNORDERED
, NE
, UNGE
, UNGT
, ORDERED
,
10076 UNEQ
, UNLT
, UNLE
, UNORDERED
, LTGT
, GE
, GT
, ORDERED
10078 static const bool ordereds
[32] =
10080 true, true, true, false, false, false, false, true,
10081 false, false, false, true, true, true, true, false,
10082 true, true, true, false, false, false, false, true,
10083 false, false, false, true, true, true, true, false
10085 static const bool non_signalings
[32] =
10087 true, false, false, true, true, false, false, true,
10088 true, false, false, true, true, false, false, true,
10089 false, true, true, false, false, true, true, false,
10090 false, true, true, false, false, true, true, false
10093 if (!CONST_INT_P (op2
))
10095 error ("the third argument must be comparison constant");
10098 if (INTVAL (op2
) < 0 || INTVAL (op2
) >= 32)
10100 error ("incorrect comparison mode");
10104 if (!insn_p
->operand
[2].predicate (op3
, SImode
))
10106 error ("incorrect rounding operand");
10110 if (VECTOR_MODE_P (mode0
))
10111 op0
= safe_vector_operand (op0
, mode0
);
10112 if (VECTOR_MODE_P (mode1
))
10113 op1
= safe_vector_operand (op1
, mode1
);
10115 enum rtx_code comparison
= comparisons
[INTVAL (op2
)];
10116 bool ordered
= ordereds
[INTVAL (op2
)];
10117 bool non_signaling
= non_signalings
[INTVAL (op2
)];
10118 rtx const_val
= const0_rtx
;
10120 bool check_unordered
= false;
10121 machine_mode mode
= CCFPmode
;
10122 switch (comparison
)
10127 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
10128 if (!non_signaling
)
10134 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
10144 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
10151 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
10152 if (!non_signaling
)
10159 case LE
: /* -> GE */
10160 case LT
: /* -> GT */
10161 case UNGE
: /* -> UNLE */
10162 case UNGT
: /* -> UNLT */
10163 std::swap (op0
, op1
);
10164 comparison
= swap_condition (comparison
);
10172 /* These are supported by CCFPmode. NB: Use ordered/signaling
10173 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
10174 with NAN operands. */
10175 if (ordered
== non_signaling
)
10176 ordered
= !ordered
;
10179 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10180 _CMP_EQ_OQ/_CMP_EQ_OS. */
10181 check_unordered
= true;
10185 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
10186 _CMP_NEQ_UQ/_CMP_NEQ_US. */
10187 gcc_assert (!ordered
);
10188 check_unordered
= true;
10190 const_val
= const1_rtx
;
10193 gcc_unreachable ();
10196 target
= gen_reg_rtx (SImode
);
10197 emit_move_insn (target
, const_val
);
10198 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10200 if ((optimize
&& !register_operand (op0
, mode0
))
10201 || !insn_p
->operand
[0].predicate (op0
, mode0
))
10202 op0
= copy_to_mode_reg (mode0
, op0
);
10203 if ((optimize
&& !register_operand (op1
, mode1
))
10204 || !insn_p
->operand
[1].predicate (op1
, mode1
))
10205 op1
= copy_to_mode_reg (mode1
, op1
);
10208 1. COMI: ordered and signaling.
10209 2. UCOMI: unordered and non-signaling.
10212 icode
= (icode
== CODE_FOR_sse_comi_round
10213 ? CODE_FOR_sse_ucomi_round
10214 : CODE_FOR_sse2_ucomi_round
);
10216 pat
= GEN_FCN (icode
) (op0
, op1
, op3
);
10220 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
10221 if (INTVAL (op3
) == NO_ROUND
)
10223 pat
= ix86_erase_embedded_rounding (pat
);
10227 set_dst
= SET_DEST (pat
);
10231 gcc_assert (GET_CODE (pat
) == SET
);
10232 set_dst
= SET_DEST (pat
);
10237 rtx_code_label
*label
= NULL
;
10239 /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
10240 with NAN operands. */
10241 if (check_unordered
)
10243 gcc_assert (comparison
== EQ
|| comparison
== NE
);
10245 rtx flag
= gen_rtx_REG (CCFPmode
, FLAGS_REG
);
10246 label
= gen_label_rtx ();
10247 rtx tmp
= gen_rtx_fmt_ee (UNORDERED
, VOIDmode
, flag
, const0_rtx
);
10248 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
10249 gen_rtx_LABEL_REF (VOIDmode
, label
),
10251 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
10254 /* NB: Set CCFPmode and check a different CCmode which is in subset
10256 if (GET_MODE (set_dst
) != mode
)
10258 gcc_assert (mode
== CCAmode
|| mode
== CCCmode
10259 || mode
== CCOmode
|| mode
== CCPmode
10260 || mode
== CCSmode
|| mode
== CCZmode
);
10261 set_dst
= gen_rtx_REG (mode
, FLAGS_REG
);
10264 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10265 gen_rtx_fmt_ee (comparison
, QImode
,
10270 emit_label (label
);
10272 return SUBREG_REG (target
);
10276 ix86_expand_round_builtin (const struct builtin_description
*d
,
10277 tree exp
, rtx target
)
10280 unsigned int i
, nargs
;
10282 enum insn_code icode
= d
->icode
;
10283 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10284 machine_mode tmode
= insn_p
->operand
[0].mode
;
10285 unsigned int nargs_constant
= 0;
10286 unsigned int redundant_embed_rnd
= 0;
10288 switch ((enum ix86_builtin_func_type
) d
->flag
)
10290 case UINT64_FTYPE_V2DF_INT
:
10291 case UINT64_FTYPE_V4SF_INT
:
10292 case UINT_FTYPE_V2DF_INT
:
10293 case UINT_FTYPE_V4SF_INT
:
10294 case INT64_FTYPE_V2DF_INT
:
10295 case INT64_FTYPE_V4SF_INT
:
10296 case INT_FTYPE_V2DF_INT
:
10297 case INT_FTYPE_V4SF_INT
:
10300 case V4SF_FTYPE_V4SF_UINT_INT
:
10301 case V4SF_FTYPE_V4SF_UINT64_INT
:
10302 case V2DF_FTYPE_V2DF_UINT64_INT
:
10303 case V4SF_FTYPE_V4SF_INT_INT
:
10304 case V4SF_FTYPE_V4SF_INT64_INT
:
10305 case V2DF_FTYPE_V2DF_INT64_INT
:
10306 case V4SF_FTYPE_V4SF_V4SF_INT
:
10307 case V2DF_FTYPE_V2DF_V2DF_INT
:
10308 case V4SF_FTYPE_V4SF_V2DF_INT
:
10309 case V2DF_FTYPE_V2DF_V4SF_INT
:
10312 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
10313 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
10314 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
10315 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
10316 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
10317 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
10318 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
10319 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
10320 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
10321 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
10322 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
10323 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
10324 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
10325 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
10328 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
10329 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
10330 nargs_constant
= 2;
10333 case INT_FTYPE_V4SF_V4SF_INT_INT
:
10334 case INT_FTYPE_V2DF_V2DF_INT_INT
:
10335 return ix86_expand_sse_comi_round (d
, exp
, target
);
10336 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
10337 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
10338 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
10339 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
10340 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
10341 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
10342 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT
:
10343 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
10344 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
10345 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT
:
10348 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
10349 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
10350 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT
:
10351 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT
:
10352 nargs_constant
= 4;
10355 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
10356 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
10357 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
10358 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
10359 nargs_constant
= 3;
10362 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
10363 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
10364 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
10365 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
10366 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
10367 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
10369 nargs_constant
= 4;
10371 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
10372 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
10373 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
10374 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
10376 nargs_constant
= 3;
10379 gcc_unreachable ();
10381 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
10385 || GET_MODE (target
) != tmode
10386 || !insn_p
->operand
[0].predicate (target
, tmode
))
10387 target
= gen_reg_rtx (tmode
);
10389 for (i
= 0; i
< nargs
; i
++)
10391 tree arg
= CALL_EXPR_ARG (exp
, i
);
10392 rtx op
= expand_normal (arg
);
10393 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10394 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
10396 if (i
== nargs
- nargs_constant
)
10402 case CODE_FOR_avx512f_getmantv8df_mask_round
:
10403 case CODE_FOR_avx512f_getmantv16sf_mask_round
:
10404 case CODE_FOR_avx512f_vgetmantv2df_round
:
10405 case CODE_FOR_avx512f_vgetmantv2df_mask_round
:
10406 case CODE_FOR_avx512f_vgetmantv4sf_round
:
10407 case CODE_FOR_avx512f_vgetmantv4sf_mask_round
:
10408 error ("the immediate argument must be a 4-bit immediate");
10410 case CODE_FOR_avx512f_cmpv8df3_mask_round
:
10411 case CODE_FOR_avx512f_cmpv16sf3_mask_round
:
10412 case CODE_FOR_avx512f_vmcmpv2df3_mask_round
:
10413 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round
:
10414 error ("the immediate argument must be a 5-bit immediate");
10417 error ("the immediate argument must be an 8-bit immediate");
10422 else if (i
== nargs
-1)
10424 if (!insn_p
->operand
[nargs
].predicate (op
, SImode
))
10426 error ("incorrect rounding operand");
10430 /* If there is no rounding use normal version of the pattern. */
10431 if (INTVAL (op
) == NO_ROUND
)
10432 redundant_embed_rnd
= 1;
10436 if (VECTOR_MODE_P (mode
))
10437 op
= safe_vector_operand (op
, mode
);
10439 op
= fixup_modeless_constant (op
, mode
);
10441 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10443 if (optimize
|| !match
)
10444 op
= copy_to_mode_reg (mode
, op
);
10448 op
= copy_to_reg (op
);
10449 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10459 pat
= GEN_FCN (icode
) (target
, xops
[0]);
10462 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
10465 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
10468 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
10472 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
10473 xops
[2], xops
[3], xops
[4]);
10476 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
10477 xops
[2], xops
[3], xops
[4], xops
[5]);
10480 gcc_unreachable ();
10486 if (redundant_embed_rnd
)
10487 pat
= ix86_erase_embedded_rounding (pat
);
10493 /* Subroutine of ix86_expand_builtin to take care of special insns
10494 with variable number of operands. */
10497 ix86_expand_special_args_builtin (const struct builtin_description
*d
,
10498 tree exp
, rtx target
)
10502 unsigned int i
, nargs
, arg_adjust
, memory
;
10503 bool aligned_mem
= false;
10505 enum insn_code icode
= d
->icode
;
10506 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10507 machine_mode tmode
= insn_p
->operand
[0].mode
;
10508 enum { load
, store
} klass
;
10510 switch ((enum ix86_builtin_func_type
) d
->flag
)
10512 case VOID_FTYPE_VOID
:
10513 emit_insn (GEN_FCN (icode
) (target
));
10515 case VOID_FTYPE_UINT64
:
10516 case VOID_FTYPE_UNSIGNED
:
10522 case INT_FTYPE_VOID
:
10523 case USHORT_FTYPE_VOID
:
10524 case UINT64_FTYPE_VOID
:
10525 case UINT_FTYPE_VOID
:
10526 case UINT8_FTYPE_VOID
:
10527 case UNSIGNED_FTYPE_VOID
:
10532 case UINT64_FTYPE_PUNSIGNED
:
10533 case V2DI_FTYPE_PV2DI
:
10534 case V4DI_FTYPE_PV4DI
:
10535 case V32QI_FTYPE_PCCHAR
:
10536 case V16QI_FTYPE_PCCHAR
:
10537 case V8SF_FTYPE_PCV4SF
:
10538 case V8SF_FTYPE_PCFLOAT
:
10539 case V4SF_FTYPE_PCFLOAT
:
10540 case V4DF_FTYPE_PCV2DF
:
10541 case V4DF_FTYPE_PCDOUBLE
:
10542 case V2DF_FTYPE_PCDOUBLE
:
10543 case VOID_FTYPE_PVOID
:
10544 case V8DI_FTYPE_PV8DI
:
10550 case CODE_FOR_sse4_1_movntdqa
:
10551 case CODE_FOR_avx2_movntdqa
:
10552 case CODE_FOR_avx512f_movntdqa
:
10553 aligned_mem
= true;
10559 case VOID_FTYPE_PV2SF_V4SF
:
10560 case VOID_FTYPE_PV8DI_V8DI
:
10561 case VOID_FTYPE_PV4DI_V4DI
:
10562 case VOID_FTYPE_PV2DI_V2DI
:
10563 case VOID_FTYPE_PCHAR_V32QI
:
10564 case VOID_FTYPE_PCHAR_V16QI
:
10565 case VOID_FTYPE_PFLOAT_V16SF
:
10566 case VOID_FTYPE_PFLOAT_V8SF
:
10567 case VOID_FTYPE_PFLOAT_V4SF
:
10568 case VOID_FTYPE_PDOUBLE_V8DF
:
10569 case VOID_FTYPE_PDOUBLE_V4DF
:
10570 case VOID_FTYPE_PDOUBLE_V2DF
:
10571 case VOID_FTYPE_PLONGLONG_LONGLONG
:
10572 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
10573 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
10574 case VOID_FTYPE_PINT_INT
:
10577 /* Reserve memory operand for target. */
10578 memory
= ARRAY_SIZE (xops
);
10581 /* These builtins and instructions require the memory
10582 to be properly aligned. */
10583 case CODE_FOR_avx_movntv4di
:
10584 case CODE_FOR_sse2_movntv2di
:
10585 case CODE_FOR_avx_movntv8sf
:
10586 case CODE_FOR_sse_movntv4sf
:
10587 case CODE_FOR_sse4a_vmmovntv4sf
:
10588 case CODE_FOR_avx_movntv4df
:
10589 case CODE_FOR_sse2_movntv2df
:
10590 case CODE_FOR_sse4a_vmmovntv2df
:
10591 case CODE_FOR_sse2_movntidi
:
10592 case CODE_FOR_sse_movntq
:
10593 case CODE_FOR_sse2_movntisi
:
10594 case CODE_FOR_avx512f_movntv16sf
:
10595 case CODE_FOR_avx512f_movntv8df
:
10596 case CODE_FOR_avx512f_movntv8di
:
10597 aligned_mem
= true;
10603 case VOID_FTYPE_PVOID_PCVOID
:
10609 case V4SF_FTYPE_V4SF_PCV2SF
:
10610 case V2DF_FTYPE_V2DF_PCDOUBLE
:
10615 case V8SF_FTYPE_PCV8SF_V8SI
:
10616 case V4DF_FTYPE_PCV4DF_V4DI
:
10617 case V4SF_FTYPE_PCV4SF_V4SI
:
10618 case V2DF_FTYPE_PCV2DF_V2DI
:
10619 case V8SI_FTYPE_PCV8SI_V8SI
:
10620 case V4DI_FTYPE_PCV4DI_V4DI
:
10621 case V4SI_FTYPE_PCV4SI_V4SI
:
10622 case V2DI_FTYPE_PCV2DI_V2DI
:
10623 case VOID_FTYPE_INT_INT64
:
10628 case VOID_FTYPE_PV8DF_V8DF_UQI
:
10629 case VOID_FTYPE_PV4DF_V4DF_UQI
:
10630 case VOID_FTYPE_PV2DF_V2DF_UQI
:
10631 case VOID_FTYPE_PV16SF_V16SF_UHI
:
10632 case VOID_FTYPE_PV8SF_V8SF_UQI
:
10633 case VOID_FTYPE_PV4SF_V4SF_UQI
:
10634 case VOID_FTYPE_PV8DI_V8DI_UQI
:
10635 case VOID_FTYPE_PV4DI_V4DI_UQI
:
10636 case VOID_FTYPE_PV2DI_V2DI_UQI
:
10637 case VOID_FTYPE_PV16SI_V16SI_UHI
:
10638 case VOID_FTYPE_PV8SI_V8SI_UQI
:
10639 case VOID_FTYPE_PV4SI_V4SI_UQI
:
10640 case VOID_FTYPE_PV64QI_V64QI_UDI
:
10641 case VOID_FTYPE_PV32HI_V32HI_USI
:
10642 case VOID_FTYPE_PV32QI_V32QI_USI
:
10643 case VOID_FTYPE_PV16QI_V16QI_UHI
:
10644 case VOID_FTYPE_PV16HI_V16HI_UHI
:
10645 case VOID_FTYPE_PV8HI_V8HI_UQI
:
10648 /* These builtins and instructions require the memory
10649 to be properly aligned. */
10650 case CODE_FOR_avx512f_storev16sf_mask
:
10651 case CODE_FOR_avx512f_storev16si_mask
:
10652 case CODE_FOR_avx512f_storev8df_mask
:
10653 case CODE_FOR_avx512f_storev8di_mask
:
10654 case CODE_FOR_avx512vl_storev8sf_mask
:
10655 case CODE_FOR_avx512vl_storev8si_mask
:
10656 case CODE_FOR_avx512vl_storev4df_mask
:
10657 case CODE_FOR_avx512vl_storev4di_mask
:
10658 case CODE_FOR_avx512vl_storev4sf_mask
:
10659 case CODE_FOR_avx512vl_storev4si_mask
:
10660 case CODE_FOR_avx512vl_storev2df_mask
:
10661 case CODE_FOR_avx512vl_storev2di_mask
:
10662 aligned_mem
= true;
10668 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
10669 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
10670 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
10671 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
10672 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
10673 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
10674 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
10675 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
10676 case VOID_FTYPE_PV8SI_V8DI_UQI
:
10677 case VOID_FTYPE_PV8HI_V8DI_UQI
:
10678 case VOID_FTYPE_PV16HI_V16SI_UHI
:
10679 case VOID_FTYPE_PUDI_V8DI_UQI
:
10680 case VOID_FTYPE_PV16QI_V16SI_UHI
:
10681 case VOID_FTYPE_PV4SI_V4DI_UQI
:
10682 case VOID_FTYPE_PUDI_V2DI_UQI
:
10683 case VOID_FTYPE_PUDI_V4DI_UQI
:
10684 case VOID_FTYPE_PUSI_V2DI_UQI
:
10685 case VOID_FTYPE_PV8HI_V8SI_UQI
:
10686 case VOID_FTYPE_PUDI_V4SI_UQI
:
10687 case VOID_FTYPE_PUSI_V4DI_UQI
:
10688 case VOID_FTYPE_PUHI_V2DI_UQI
:
10689 case VOID_FTYPE_PUDI_V8SI_UQI
:
10690 case VOID_FTYPE_PUSI_V4SI_UQI
:
10691 case VOID_FTYPE_PCHAR_V64QI_UDI
:
10692 case VOID_FTYPE_PCHAR_V32QI_USI
:
10693 case VOID_FTYPE_PCHAR_V16QI_UHI
:
10694 case VOID_FTYPE_PSHORT_V32HI_USI
:
10695 case VOID_FTYPE_PSHORT_V16HI_UHI
:
10696 case VOID_FTYPE_PSHORT_V8HI_UQI
:
10697 case VOID_FTYPE_PINT_V16SI_UHI
:
10698 case VOID_FTYPE_PINT_V8SI_UQI
:
10699 case VOID_FTYPE_PINT_V4SI_UQI
:
10700 case VOID_FTYPE_PINT64_V8DI_UQI
:
10701 case VOID_FTYPE_PINT64_V4DI_UQI
:
10702 case VOID_FTYPE_PINT64_V2DI_UQI
:
10703 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
10704 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
10705 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
10706 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
10707 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
10708 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
10709 case VOID_FTYPE_PV32QI_V32HI_USI
:
10710 case VOID_FTYPE_PV16QI_V16HI_UHI
:
10711 case VOID_FTYPE_PUDI_V8HI_UQI
:
10714 /* Reserve memory operand for target. */
10715 memory
= ARRAY_SIZE (xops
);
10717 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
10718 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
10719 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
10720 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
10721 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
10722 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
10723 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
10724 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
10725 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
10726 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
10727 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
10728 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
10729 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
10730 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
10731 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
10732 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
10733 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
10734 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
10737 /* These builtins and instructions require the memory
10738 to be properly aligned. */
10739 case CODE_FOR_avx512f_loadv16sf_mask
:
10740 case CODE_FOR_avx512f_loadv16si_mask
:
10741 case CODE_FOR_avx512f_loadv8df_mask
:
10742 case CODE_FOR_avx512f_loadv8di_mask
:
10743 case CODE_FOR_avx512vl_loadv8sf_mask
:
10744 case CODE_FOR_avx512vl_loadv8si_mask
:
10745 case CODE_FOR_avx512vl_loadv4df_mask
:
10746 case CODE_FOR_avx512vl_loadv4di_mask
:
10747 case CODE_FOR_avx512vl_loadv4sf_mask
:
10748 case CODE_FOR_avx512vl_loadv4si_mask
:
10749 case CODE_FOR_avx512vl_loadv2df_mask
:
10750 case CODE_FOR_avx512vl_loadv2di_mask
:
10751 case CODE_FOR_avx512bw_loadv64qi_mask
:
10752 case CODE_FOR_avx512vl_loadv32qi_mask
:
10753 case CODE_FOR_avx512vl_loadv16qi_mask
:
10754 case CODE_FOR_avx512bw_loadv32hi_mask
:
10755 case CODE_FOR_avx512vl_loadv16hi_mask
:
10756 case CODE_FOR_avx512vl_loadv8hi_mask
:
10757 aligned_mem
= true;
10763 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
10764 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
10765 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
10766 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
10767 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
10768 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
10769 case V16SI_FTYPE_PCINT_V16SI_UHI
:
10770 case V8SI_FTYPE_PCINT_V8SI_UQI
:
10771 case V4SI_FTYPE_PCINT_V4SI_UQI
:
10772 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
10773 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
10774 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
10775 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
10776 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
10777 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
10778 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
10779 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
10780 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
10786 gcc_unreachable ();
10789 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
10791 if (klass
== store
)
10793 arg
= CALL_EXPR_ARG (exp
, 0);
10794 op
= expand_normal (arg
);
10795 gcc_assert (target
== 0);
10798 op
= ix86_zero_extend_to_Pmode (op
);
10799 target
= gen_rtx_MEM (tmode
, op
);
10800 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
10801 on it. Try to improve it using get_pointer_alignment,
10802 and if the special builtin is one that requires strict
10803 mode alignment, also from it's GET_MODE_ALIGNMENT.
10804 Failure to do so could lead to ix86_legitimate_combined_insn
10805 rejecting all changes to such insns. */
10806 unsigned int align
= get_pointer_alignment (arg
);
10807 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (tmode
))
10808 align
= GET_MODE_ALIGNMENT (tmode
);
10809 if (MEM_ALIGN (target
) < align
)
10810 set_mem_align (target
, align
);
10813 target
= force_reg (tmode
, op
);
10821 || !register_operand (target
, tmode
)
10822 || GET_MODE (target
) != tmode
)
10823 target
= gen_reg_rtx (tmode
);
10826 for (i
= 0; i
< nargs
; i
++)
10828 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
10830 arg
= CALL_EXPR_ARG (exp
, i
+ arg_adjust
);
10831 op
= expand_normal (arg
);
10835 /* This must be the memory operand. */
10836 op
= ix86_zero_extend_to_Pmode (op
);
10837 op
= gen_rtx_MEM (mode
, op
);
10838 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
10839 on it. Try to improve it using get_pointer_alignment,
10840 and if the special builtin is one that requires strict
10841 mode alignment, also from it's GET_MODE_ALIGNMENT.
10842 Failure to do so could lead to ix86_legitimate_combined_insn
10843 rejecting all changes to such insns. */
10844 unsigned int align
= get_pointer_alignment (arg
);
10845 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (mode
))
10846 align
= GET_MODE_ALIGNMENT (mode
);
10847 if (MEM_ALIGN (op
) < align
)
10848 set_mem_align (op
, align
);
10852 /* This must be register. */
10853 if (VECTOR_MODE_P (mode
))
10854 op
= safe_vector_operand (op
, mode
);
10856 op
= fixup_modeless_constant (op
, mode
);
10858 /* NB: 3-operands load implied it's a mask load,
10859 and that mask operand shoud be at the end.
10860 Keep all-ones mask which would be simplified by the expander. */
10861 if (nargs
== 3 && i
== 2 && klass
== load
10862 && constm1_operand (op
, mode
))
10864 else if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
10865 op
= copy_to_mode_reg (mode
, op
);
10868 op
= copy_to_reg (op
);
10869 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
10879 pat
= GEN_FCN (icode
) (target
);
10882 pat
= GEN_FCN (icode
) (target
, xops
[0]);
10885 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
10888 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
10891 gcc_unreachable ();
10898 return klass
== store
? 0 : target
;
10901 /* Return the integer constant in ARG. Constrain it to be in the range
10902 of the subparts of VEC_TYPE; issue an error if not. */
10905 get_element_number (tree vec_type
, tree arg
)
10907 unsigned HOST_WIDE_INT elt
, max
= TYPE_VECTOR_SUBPARTS (vec_type
) - 1;
10909 if (!tree_fits_uhwi_p (arg
)
10910 || (elt
= tree_to_uhwi (arg
), elt
> max
))
10912 error ("selector must be an integer constant in the range "
10920 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10921 ix86_expand_vector_init. We DO have language-level syntax for this, in
10922 the form of (type){ init-list }. Except that since we can't place emms
10923 instructions from inside the compiler, we can't allow the use of MMX
10924 registers unless the user explicitly asks for it. So we do *not* define
10925 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
10926 we have builtins invoked by mmintrin.h that gives us license to emit
10927 these sorts of instructions. */
10930 ix86_expand_vec_init_builtin (tree type
, tree exp
, rtx target
)
10932 machine_mode tmode
= TYPE_MODE (type
);
10933 machine_mode inner_mode
= GET_MODE_INNER (tmode
);
10934 int i
, n_elt
= GET_MODE_NUNITS (tmode
);
10935 rtvec v
= rtvec_alloc (n_elt
);
10937 gcc_assert (VECTOR_MODE_P (tmode
));
10938 gcc_assert (call_expr_nargs (exp
) == n_elt
);
10940 for (i
= 0; i
< n_elt
; ++i
)
10942 rtx x
= expand_normal (CALL_EXPR_ARG (exp
, i
));
10943 RTVEC_ELT (v
, i
) = gen_lowpart (inner_mode
, x
);
10946 if (!target
|| !register_operand (target
, tmode
))
10947 target
= gen_reg_rtx (tmode
);
10949 ix86_expand_vector_init (true, target
, gen_rtx_PARALLEL (tmode
, v
));
10953 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10954 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
10955 had a language-level syntax for referencing vector elements. */
10958 ix86_expand_vec_ext_builtin (tree exp
, rtx target
)
10960 machine_mode tmode
, mode0
;
10965 arg0
= CALL_EXPR_ARG (exp
, 0);
10966 arg1
= CALL_EXPR_ARG (exp
, 1);
10968 op0
= expand_normal (arg0
);
10969 elt
= get_element_number (TREE_TYPE (arg0
), arg1
);
10971 tmode
= TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0
)));
10972 mode0
= TYPE_MODE (TREE_TYPE (arg0
));
10973 gcc_assert (VECTOR_MODE_P (mode0
));
10975 op0
= force_reg (mode0
, op0
);
10977 if (optimize
|| !target
|| !register_operand (target
, tmode
))
10978 target
= gen_reg_rtx (tmode
);
10980 ix86_expand_vector_extract (true, target
, op0
, elt
);
10985 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
10986 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
10987 a language-level syntax for referencing vector elements. */
10990 ix86_expand_vec_set_builtin (tree exp
)
10992 machine_mode tmode
, mode1
;
10993 tree arg0
, arg1
, arg2
;
10995 rtx op0
, op1
, target
;
10997 arg0
= CALL_EXPR_ARG (exp
, 0);
10998 arg1
= CALL_EXPR_ARG (exp
, 1);
10999 arg2
= CALL_EXPR_ARG (exp
, 2);
11001 tmode
= TYPE_MODE (TREE_TYPE (arg0
));
11002 mode1
= TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0
)));
11003 gcc_assert (VECTOR_MODE_P (tmode
));
11005 op0
= expand_expr (arg0
, NULL_RTX
, tmode
, EXPAND_NORMAL
);
11006 op1
= expand_expr (arg1
, NULL_RTX
, mode1
, EXPAND_NORMAL
);
11007 elt
= get_element_number (TREE_TYPE (arg0
), arg2
);
11009 if (GET_MODE (op1
) != mode1
&& GET_MODE (op1
) != VOIDmode
)
11010 op1
= convert_modes (mode1
, GET_MODE (op1
), op1
, true);
11012 op0
= force_reg (tmode
, op0
);
11013 op1
= force_reg (mode1
, op1
);
11015 /* OP0 is the source of these builtin functions and shouldn't be
11016 modified. Create a copy, use it and return it as target. */
11017 target
= gen_reg_rtx (tmode
);
11018 emit_move_insn (target
, op0
);
11019 ix86_expand_vector_set (true, target
, op1
, elt
);
11024 /* Expand an expression EXP that calls a built-in function,
11025 with result going to TARGET if that's convenient
11026 (and in mode MODE if that's convenient).
11027 SUBTARGET may be used as the target for computing one of EXP's operands.
11028 IGNORE is nonzero if the value is to be ignored. */
11031 ix86_expand_builtin (tree exp
, rtx target
, rtx subtarget
,
11032 machine_mode mode
, int ignore
)
11035 enum insn_code icode
, icode2
;
11036 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
11037 tree arg0
, arg1
, arg2
, arg3
, arg4
;
11038 rtx op0
, op1
, op2
, op3
, op4
, pat
, pat2
, insn
;
11039 machine_mode mode0
, mode1
, mode2
, mode3
, mode4
;
11040 unsigned int fcode
= DECL_MD_FUNCTION_CODE (fndecl
);
11042 /* For CPU builtins that can be folded, fold first and expand the fold. */
11045 case IX86_BUILTIN_CPU_INIT
:
11047 /* Make it call __cpu_indicator_init in libgcc. */
11048 tree call_expr
, fndecl
, type
;
11049 type
= build_function_type_list (integer_type_node
, NULL_TREE
);
11050 fndecl
= build_fn_decl ("__cpu_indicator_init", type
);
11051 call_expr
= build_call_expr (fndecl
, 0);
11052 return expand_expr (call_expr
, target
, mode
, EXPAND_NORMAL
);
11054 case IX86_BUILTIN_CPU_IS
:
11055 case IX86_BUILTIN_CPU_SUPPORTS
:
11057 tree arg0
= CALL_EXPR_ARG (exp
, 0);
11058 tree fold_expr
= fold_builtin_cpu (fndecl
, &arg0
);
11059 gcc_assert (fold_expr
!= NULL_TREE
);
11060 return expand_expr (fold_expr
, target
, mode
, EXPAND_NORMAL
);
11064 HOST_WIDE_INT isa
= ix86_isa_flags
;
11065 HOST_WIDE_INT isa2
= ix86_isa_flags2
;
11066 HOST_WIDE_INT bisa
= ix86_builtins_isa
[fcode
].isa
;
11067 HOST_WIDE_INT bisa2
= ix86_builtins_isa
[fcode
].isa2
;
11068 /* The general case is we require all the ISAs specified in bisa{,2}
11070 The exceptions are:
11071 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
11072 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
11073 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
11074 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
11075 OPTION_MASK_ISA2_AVXVNNI
11076 where for each such pair it is sufficient if either of the ISAs is
11077 enabled, plus if it is ored with other options also those others.
11078 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
11079 if (((bisa
& (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
))
11080 == (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
))
11081 && (isa
& (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
)) != 0)
11082 isa
|= (OPTION_MASK_ISA_SSE
| OPTION_MASK_ISA_3DNOW_A
);
11084 if (((bisa
& (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
))
11085 == (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
))
11086 && (isa
& (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
)) != 0)
11087 isa
|= (OPTION_MASK_ISA_SSE4_2
| OPTION_MASK_ISA_CRC32
);
11089 if (((bisa
& (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
))
11090 == (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
))
11091 && (isa
& (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
)) != 0)
11092 isa
|= (OPTION_MASK_ISA_FMA
| OPTION_MASK_ISA_FMA4
);
11094 if ((((bisa
& (OPTION_MASK_ISA_AVX512VNNI
| OPTION_MASK_ISA_AVX512VL
))
11095 == (OPTION_MASK_ISA_AVX512VNNI
| OPTION_MASK_ISA_AVX512VL
))
11096 || (bisa2
& OPTION_MASK_ISA2_AVXVNNI
) != 0)
11097 && (((isa
& (OPTION_MASK_ISA_AVX512VNNI
| OPTION_MASK_ISA_AVX512VL
))
11098 == (OPTION_MASK_ISA_AVX512VNNI
| OPTION_MASK_ISA_AVX512VL
))
11099 || (isa2
& OPTION_MASK_ISA2_AVXVNNI
) != 0))
11101 isa
|= OPTION_MASK_ISA_AVX512VNNI
| OPTION_MASK_ISA_AVX512VL
;
11102 isa2
|= OPTION_MASK_ISA2_AVXVNNI
;
11105 if ((bisa
& OPTION_MASK_ISA_MMX
) && !TARGET_MMX
&& TARGET_MMX_WITH_SSE
11106 /* __builtin_ia32_maskmovq requires MMX registers. */
11107 && fcode
!= IX86_BUILTIN_MASKMOVQ
)
11109 bisa
&= ~OPTION_MASK_ISA_MMX
;
11110 bisa
|= OPTION_MASK_ISA_SSE2
;
11113 if ((bisa
& isa
) != bisa
|| (bisa2
& isa2
) != bisa2
)
11115 bool add_abi_p
= bisa
& OPTION_MASK_ISA_64BIT
;
11116 if (TARGET_ABI_X32
)
11117 bisa
|= OPTION_MASK_ABI_X32
;
11119 bisa
|= OPTION_MASK_ABI_64
;
11120 char *opts
= ix86_target_string (bisa
, bisa2
, 0, 0, NULL
, NULL
,
11121 (enum fpmath_unit
) 0,
11122 (enum prefer_vector_width
) 0,
11125 error ("%qE needs unknown isa option", fndecl
);
11128 gcc_assert (opts
!= NULL
);
11129 error ("%qE needs isa option %s", fndecl
, opts
);
11132 return expand_call (exp
, target
, ignore
);
11137 case IX86_BUILTIN_MASKMOVQ
:
11138 case IX86_BUILTIN_MASKMOVDQU
:
11139 icode
= (fcode
== IX86_BUILTIN_MASKMOVQ
11140 ? CODE_FOR_mmx_maskmovq
11141 : CODE_FOR_sse2_maskmovdqu
);
11142 /* Note the arg order is different from the operand order. */
11143 arg1
= CALL_EXPR_ARG (exp
, 0);
11144 arg2
= CALL_EXPR_ARG (exp
, 1);
11145 arg0
= CALL_EXPR_ARG (exp
, 2);
11146 op0
= expand_normal (arg0
);
11147 op1
= expand_normal (arg1
);
11148 op2
= expand_normal (arg2
);
11149 mode0
= insn_data
[icode
].operand
[0].mode
;
11150 mode1
= insn_data
[icode
].operand
[1].mode
;
11151 mode2
= insn_data
[icode
].operand
[2].mode
;
11153 op0
= ix86_zero_extend_to_Pmode (op0
);
11154 op0
= gen_rtx_MEM (mode1
, op0
);
11156 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
11157 op0
= copy_to_mode_reg (mode0
, op0
);
11158 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
11159 op1
= copy_to_mode_reg (mode1
, op1
);
11160 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
11161 op2
= copy_to_mode_reg (mode2
, op2
);
11162 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11168 case IX86_BUILTIN_LDMXCSR
:
11169 op0
= expand_normal (CALL_EXPR_ARG (exp
, 0));
11170 target
= assign_386_stack_local (SImode
, SLOT_TEMP
);
11171 emit_move_insn (target
, op0
);
11172 emit_insn (gen_sse_ldmxcsr (target
));
11175 case IX86_BUILTIN_STMXCSR
:
11176 target
= assign_386_stack_local (SImode
, SLOT_TEMP
);
11177 emit_insn (gen_sse_stmxcsr (target
));
11178 return copy_to_mode_reg (SImode
, target
);
11180 case IX86_BUILTIN_CLFLUSH
:
11181 arg0
= CALL_EXPR_ARG (exp
, 0);
11182 op0
= expand_normal (arg0
);
11183 icode
= CODE_FOR_sse2_clflush
;
11184 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
11185 op0
= ix86_zero_extend_to_Pmode (op0
);
11187 emit_insn (gen_sse2_clflush (op0
));
11190 case IX86_BUILTIN_CLWB
:
11191 arg0
= CALL_EXPR_ARG (exp
, 0);
11192 op0
= expand_normal (arg0
);
11193 icode
= CODE_FOR_clwb
;
11194 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
11195 op0
= ix86_zero_extend_to_Pmode (op0
);
11197 emit_insn (gen_clwb (op0
));
11200 case IX86_BUILTIN_CLFLUSHOPT
:
11201 arg0
= CALL_EXPR_ARG (exp
, 0);
11202 op0
= expand_normal (arg0
);
11203 icode
= CODE_FOR_clflushopt
;
11204 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
11205 op0
= ix86_zero_extend_to_Pmode (op0
);
11207 emit_insn (gen_clflushopt (op0
));
11210 case IX86_BUILTIN_MONITOR
:
11211 case IX86_BUILTIN_MONITORX
:
11212 arg0
= CALL_EXPR_ARG (exp
, 0);
11213 arg1
= CALL_EXPR_ARG (exp
, 1);
11214 arg2
= CALL_EXPR_ARG (exp
, 2);
11215 op0
= expand_normal (arg0
);
11216 op1
= expand_normal (arg1
);
11217 op2
= expand_normal (arg2
);
11219 op0
= ix86_zero_extend_to_Pmode (op0
);
11221 op1
= copy_to_mode_reg (SImode
, op1
);
11223 op2
= copy_to_mode_reg (SImode
, op2
);
11225 emit_insn (fcode
== IX86_BUILTIN_MONITOR
11226 ? gen_sse3_monitor (Pmode
, op0
, op1
, op2
)
11227 : gen_monitorx (Pmode
, op0
, op1
, op2
));
11230 case IX86_BUILTIN_MWAIT
:
11231 arg0
= CALL_EXPR_ARG (exp
, 0);
11232 arg1
= CALL_EXPR_ARG (exp
, 1);
11233 op0
= expand_normal (arg0
);
11234 op1
= expand_normal (arg1
);
11236 op0
= copy_to_mode_reg (SImode
, op0
);
11238 op1
= copy_to_mode_reg (SImode
, op1
);
11239 emit_insn (gen_sse3_mwait (op0
, op1
));
11242 case IX86_BUILTIN_MWAITX
:
11243 arg0
= CALL_EXPR_ARG (exp
, 0);
11244 arg1
= CALL_EXPR_ARG (exp
, 1);
11245 arg2
= CALL_EXPR_ARG (exp
, 2);
11246 op0
= expand_normal (arg0
);
11247 op1
= expand_normal (arg1
);
11248 op2
= expand_normal (arg2
);
11250 op0
= copy_to_mode_reg (SImode
, op0
);
11252 op1
= copy_to_mode_reg (SImode
, op1
);
11254 op2
= copy_to_mode_reg (SImode
, op2
);
11255 emit_insn (gen_mwaitx (op0
, op1
, op2
));
11258 case IX86_BUILTIN_UMONITOR
:
11259 arg0
= CALL_EXPR_ARG (exp
, 0);
11260 op0
= expand_normal (arg0
);
11262 op0
= ix86_zero_extend_to_Pmode (op0
);
11263 emit_insn (gen_umonitor (Pmode
, op0
));
11266 case IX86_BUILTIN_UMWAIT
:
11267 case IX86_BUILTIN_TPAUSE
:
11268 arg0
= CALL_EXPR_ARG (exp
, 0);
11269 arg1
= CALL_EXPR_ARG (exp
, 1);
11270 op0
= expand_normal (arg0
);
11271 op1
= expand_normal (arg1
);
11274 op0
= copy_to_mode_reg (SImode
, op0
);
11276 op1
= force_reg (DImode
, op1
);
11280 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11281 NULL
, 1, OPTAB_DIRECT
);
11284 case IX86_BUILTIN_UMWAIT
:
11285 icode
= CODE_FOR_umwait_rex64
;
11287 case IX86_BUILTIN_TPAUSE
:
11288 icode
= CODE_FOR_tpause_rex64
;
11291 gcc_unreachable ();
11294 op2
= gen_lowpart (SImode
, op2
);
11295 op1
= gen_lowpart (SImode
, op1
);
11296 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11302 case IX86_BUILTIN_UMWAIT
:
11303 icode
= CODE_FOR_umwait
;
11305 case IX86_BUILTIN_TPAUSE
:
11306 icode
= CODE_FOR_tpause
;
11309 gcc_unreachable ();
11311 pat
= GEN_FCN (icode
) (op0
, op1
);
11320 || !register_operand (target
, QImode
))
11321 target
= gen_reg_rtx (QImode
);
11323 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11325 emit_insn (gen_rtx_SET (target
, pat
));
11329 case IX86_BUILTIN_TESTUI
:
11330 emit_insn (gen_testui ());
11333 || !register_operand (target
, QImode
))
11334 target
= gen_reg_rtx (QImode
);
11336 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11338 emit_insn (gen_rtx_SET (target
, pat
));
11342 case IX86_BUILTIN_CLZERO
:
11343 arg0
= CALL_EXPR_ARG (exp
, 0);
11344 op0
= expand_normal (arg0
);
11346 op0
= ix86_zero_extend_to_Pmode (op0
);
11347 emit_insn (gen_clzero (Pmode
, op0
));
11350 case IX86_BUILTIN_CLDEMOTE
:
11351 arg0
= CALL_EXPR_ARG (exp
, 0);
11352 op0
= expand_normal (arg0
);
11353 icode
= CODE_FOR_cldemote
;
11354 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
11355 op0
= ix86_zero_extend_to_Pmode (op0
);
11357 emit_insn (gen_cldemote (op0
));
11360 case IX86_BUILTIN_LOADIWKEY
:
11362 arg0
= CALL_EXPR_ARG (exp
, 0);
11363 arg1
= CALL_EXPR_ARG (exp
, 1);
11364 arg2
= CALL_EXPR_ARG (exp
, 2);
11365 arg3
= CALL_EXPR_ARG (exp
, 3);
11367 op0
= expand_normal (arg0
);
11368 op1
= expand_normal (arg1
);
11369 op2
= expand_normal (arg2
);
11370 op3
= expand_normal (arg3
);
11373 op0
= copy_to_mode_reg (V2DImode
, op0
);
11375 op1
= copy_to_mode_reg (V2DImode
, op1
);
11377 op2
= copy_to_mode_reg (V2DImode
, op2
);
11379 op3
= copy_to_mode_reg (SImode
, op3
);
11381 emit_insn (gen_loadiwkey (op0
, op1
, op2
, op3
));
11386 case IX86_BUILTIN_AESDEC128KLU8
:
11387 icode
= CODE_FOR_aesdec128klu8
;
11388 goto aesdecenc_expand
;
11390 case IX86_BUILTIN_AESDEC256KLU8
:
11391 icode
= CODE_FOR_aesdec256klu8
;
11392 goto aesdecenc_expand
;
11394 case IX86_BUILTIN_AESENC128KLU8
:
11395 icode
= CODE_FOR_aesenc128klu8
;
11396 goto aesdecenc_expand
;
11398 case IX86_BUILTIN_AESENC256KLU8
:
11399 icode
= CODE_FOR_aesenc256klu8
;
11403 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i *odata
11404 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i idata
11405 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
11407 op0
= expand_normal (arg0
);
11408 op1
= expand_normal (arg1
);
11409 op2
= expand_normal (arg2
);
11411 if (!address_operand (op0
, V2DImode
))
11413 op0
= convert_memory_address (Pmode
, op0
);
11414 op0
= copy_addr_to_reg (op0
);
11416 op0
= gen_rtx_MEM (V2DImode
, op0
);
11419 op1
= copy_to_mode_reg (V2DImode
, op1
);
11421 if (!address_operand (op2
, VOIDmode
))
11423 op2
= convert_memory_address (Pmode
, op2
);
11424 op2
= copy_addr_to_reg (op2
);
11426 op2
= gen_rtx_MEM (BLKmode
, op2
);
11428 emit_insn (GEN_FCN (icode
) (op1
, op1
, op2
));
11431 target
= gen_reg_rtx (QImode
);
11433 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCZmode
, FLAGS_REG
),
11435 emit_insn (gen_rtx_SET (target
, pat
));
11437 emit_insn (gen_rtx_SET (op0
, op1
));
11441 case IX86_BUILTIN_AESDECWIDE128KLU8
:
11442 icode
= CODE_FOR_aesdecwide128klu8
;
11443 goto wideaesdecenc_expand
;
11445 case IX86_BUILTIN_AESDECWIDE256KLU8
:
11446 icode
= CODE_FOR_aesdecwide256klu8
;
11447 goto wideaesdecenc_expand
;
11449 case IX86_BUILTIN_AESENCWIDE128KLU8
:
11450 icode
= CODE_FOR_aesencwide128klu8
;
11451 goto wideaesdecenc_expand
;
11453 case IX86_BUILTIN_AESENCWIDE256KLU8
:
11454 icode
= CODE_FOR_aesencwide256klu8
;
11456 wideaesdecenc_expand
:
11461 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i * odata
11462 arg1
= CALL_EXPR_ARG (exp
, 1); // const __m128i * idata
11463 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
11465 op0
= expand_normal (arg0
);
11466 op1
= expand_normal (arg1
);
11467 op2
= expand_normal (arg2
);
11469 if (!address_operand (op2
, VOIDmode
))
11471 op2
= convert_memory_address (Pmode
, op2
);
11472 op2
= copy_addr_to_reg (op2
);
11474 op2
= gen_rtx_MEM (BLKmode
, op2
);
11476 for (i
= 0; i
< 8; i
++)
11478 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
11480 op
= gen_rtx_MEM (V2DImode
,
11481 plus_constant (Pmode
, op1
, (i
* 16)));
11483 emit_move_insn (xmm_regs
[i
], op
);
11486 emit_insn (GEN_FCN (icode
) (op2
));
11489 target
= gen_reg_rtx (QImode
);
11491 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCZmode
, FLAGS_REG
),
11493 emit_insn (gen_rtx_SET (target
, pat
));
11495 for (i
= 0; i
< 8; i
++)
11497 op
= gen_rtx_MEM (V2DImode
,
11498 plus_constant (Pmode
, op0
, (i
* 16)));
11499 emit_move_insn (op
, xmm_regs
[i
]);
11504 case IX86_BUILTIN_ENCODEKEY128U32
:
11506 rtx op
, xmm_regs
[7];
11508 arg0
= CALL_EXPR_ARG (exp
, 0); // unsigned int htype
11509 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i key
11510 arg2
= CALL_EXPR_ARG (exp
, 2); // void *h
11512 op0
= expand_normal (arg0
);
11513 op1
= expand_normal (arg1
);
11514 op2
= expand_normal (arg2
);
11517 op0
= copy_to_mode_reg (SImode
, op0
);
11519 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (0));
11520 emit_move_insn (op
, op1
);
11522 for (i
= 0; i
< 3; i
++)
11523 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
11526 target
= gen_reg_rtx (SImode
);
11528 emit_insn (gen_encodekey128u32 (target
, op0
));
11530 for (i
= 0; i
< 3; i
++)
11532 op
= gen_rtx_MEM (V2DImode
,
11533 plus_constant (Pmode
, op2
, (i
* 16)));
11534 emit_move_insn (op
, xmm_regs
[i
]);
11539 case IX86_BUILTIN_ENCODEKEY256U32
:
11541 rtx op
, xmm_regs
[7];
11543 arg0
= CALL_EXPR_ARG (exp
, 0); // unsigned int htype
11544 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i keylow
11545 arg2
= CALL_EXPR_ARG (exp
, 2); // __m128i keyhi
11546 arg3
= CALL_EXPR_ARG (exp
, 3); // void *h
11548 op0
= expand_normal (arg0
);
11549 op1
= expand_normal (arg1
);
11550 op2
= expand_normal (arg2
);
11551 op3
= expand_normal (arg3
);
11554 op0
= copy_to_mode_reg (SImode
, op0
);
11556 /* Force to use xmm0, xmm1 for keylow, keyhi*/
11557 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (0));
11558 emit_move_insn (op
, op1
);
11559 op
= gen_rtx_REG (V2DImode
, GET_SSE_REGNO (1));
11560 emit_move_insn (op
, op2
);
11562 for (i
= 0; i
< 4; i
++)
11563 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
11566 target
= gen_reg_rtx (SImode
);
11568 emit_insn (gen_encodekey256u32 (target
, op0
));
11570 for (i
= 0; i
< 4; i
++)
11572 op
= gen_rtx_MEM (V2DImode
,
11573 plus_constant (Pmode
, op3
, (i
* 16)));
11574 emit_move_insn (op
, xmm_regs
[i
]);
11580 case IX86_BUILTIN_VEC_INIT_V2SI
:
11581 case IX86_BUILTIN_VEC_INIT_V4HI
:
11582 case IX86_BUILTIN_VEC_INIT_V8QI
:
11583 return ix86_expand_vec_init_builtin (TREE_TYPE (exp
), exp
, target
);
11585 case IX86_BUILTIN_VEC_EXT_V2DF
:
11586 case IX86_BUILTIN_VEC_EXT_V2DI
:
11587 case IX86_BUILTIN_VEC_EXT_V4SF
:
11588 case IX86_BUILTIN_VEC_EXT_V4SI
:
11589 case IX86_BUILTIN_VEC_EXT_V8HI
:
11590 case IX86_BUILTIN_VEC_EXT_V2SI
:
11591 case IX86_BUILTIN_VEC_EXT_V4HI
:
11592 case IX86_BUILTIN_VEC_EXT_V16QI
:
11593 return ix86_expand_vec_ext_builtin (exp
, target
);
11595 case IX86_BUILTIN_VEC_SET_V2DI
:
11596 case IX86_BUILTIN_VEC_SET_V4SF
:
11597 case IX86_BUILTIN_VEC_SET_V4SI
:
11598 case IX86_BUILTIN_VEC_SET_V8HI
:
11599 case IX86_BUILTIN_VEC_SET_V4HI
:
11600 case IX86_BUILTIN_VEC_SET_V16QI
:
11601 return ix86_expand_vec_set_builtin (exp
);
11603 case IX86_BUILTIN_NANQ
:
11604 case IX86_BUILTIN_NANSQ
:
11605 return expand_call (exp
, target
, ignore
);
11607 case IX86_BUILTIN_RDPID
:
11609 op0
= gen_reg_rtx (word_mode
);
11613 insn
= gen_rdpid_rex64 (op0
);
11614 op0
= convert_to_mode (SImode
, op0
, 1);
11617 insn
= gen_rdpid (op0
);
11622 || !register_operand (target
, SImode
))
11623 target
= gen_reg_rtx (SImode
);
11625 emit_move_insn (target
, op0
);
11628 case IX86_BUILTIN_2INTERSECTD512
:
11629 case IX86_BUILTIN_2INTERSECTQ512
:
11630 case IX86_BUILTIN_2INTERSECTD256
:
11631 case IX86_BUILTIN_2INTERSECTQ256
:
11632 case IX86_BUILTIN_2INTERSECTD128
:
11633 case IX86_BUILTIN_2INTERSECTQ128
:
11634 arg0
= CALL_EXPR_ARG (exp
, 0);
11635 arg1
= CALL_EXPR_ARG (exp
, 1);
11636 arg2
= CALL_EXPR_ARG (exp
, 2);
11637 arg3
= CALL_EXPR_ARG (exp
, 3);
11638 op0
= expand_normal (arg0
);
11639 op1
= expand_normal (arg1
);
11640 op2
= expand_normal (arg2
);
11641 op3
= expand_normal (arg3
);
11643 if (!address_operand (op0
, VOIDmode
))
11645 op0
= convert_memory_address (Pmode
, op0
);
11646 op0
= copy_addr_to_reg (op0
);
11648 if (!address_operand (op1
, VOIDmode
))
11650 op1
= convert_memory_address (Pmode
, op1
);
11651 op1
= copy_addr_to_reg (op1
);
11656 case IX86_BUILTIN_2INTERSECTD512
:
11658 icode
= CODE_FOR_avx512vp2intersect_2intersectv16si
;
11660 case IX86_BUILTIN_2INTERSECTQ512
:
11662 icode
= CODE_FOR_avx512vp2intersect_2intersectv8di
;
11664 case IX86_BUILTIN_2INTERSECTD256
:
11666 icode
= CODE_FOR_avx512vp2intersect_2intersectv8si
;
11668 case IX86_BUILTIN_2INTERSECTQ256
:
11670 icode
= CODE_FOR_avx512vp2intersect_2intersectv4di
;
11672 case IX86_BUILTIN_2INTERSECTD128
:
11674 icode
= CODE_FOR_avx512vp2intersect_2intersectv4si
;
11676 case IX86_BUILTIN_2INTERSECTQ128
:
11678 icode
= CODE_FOR_avx512vp2intersect_2intersectv2di
;
11681 gcc_unreachable ();
11684 mode2
= insn_data
[icode
].operand
[1].mode
;
11685 mode3
= insn_data
[icode
].operand
[2].mode
;
11686 if (!insn_data
[icode
].operand
[1].predicate (op2
, mode2
))
11687 op2
= copy_to_mode_reg (mode2
, op2
);
11688 if (!insn_data
[icode
].operand
[2].predicate (op3
, mode3
))
11689 op3
= copy_to_mode_reg (mode3
, op3
);
11691 op4
= gen_reg_rtx (mode4
);
11692 emit_insn (GEN_FCN (icode
) (op4
, op2
, op3
));
11693 mode0
= mode4
== P2HImode
? HImode
: QImode
;
11694 emit_move_insn (gen_rtx_MEM (mode0
, op0
),
11695 gen_lowpart (mode0
, op4
));
11696 emit_move_insn (gen_rtx_MEM (mode0
, op1
),
11697 gen_highpart (mode0
, op4
));
11701 case IX86_BUILTIN_RDPMC
:
11702 case IX86_BUILTIN_RDTSC
:
11703 case IX86_BUILTIN_RDTSCP
:
11704 case IX86_BUILTIN_XGETBV
:
11706 op0
= gen_reg_rtx (DImode
);
11707 op1
= gen_reg_rtx (DImode
);
11709 if (fcode
== IX86_BUILTIN_RDPMC
)
11711 arg0
= CALL_EXPR_ARG (exp
, 0);
11712 op2
= expand_normal (arg0
);
11713 if (!register_operand (op2
, SImode
))
11714 op2
= copy_to_mode_reg (SImode
, op2
);
11716 insn
= (TARGET_64BIT
11717 ? gen_rdpmc_rex64 (op0
, op1
, op2
)
11718 : gen_rdpmc (op0
, op2
));
11721 else if (fcode
== IX86_BUILTIN_XGETBV
)
11723 arg0
= CALL_EXPR_ARG (exp
, 0);
11724 op2
= expand_normal (arg0
);
11725 if (!register_operand (op2
, SImode
))
11726 op2
= copy_to_mode_reg (SImode
, op2
);
11728 insn
= (TARGET_64BIT
11729 ? gen_xgetbv_rex64 (op0
, op1
, op2
)
11730 : gen_xgetbv (op0
, op2
));
11733 else if (fcode
== IX86_BUILTIN_RDTSC
)
11735 insn
= (TARGET_64BIT
11736 ? gen_rdtsc_rex64 (op0
, op1
)
11737 : gen_rdtsc (op0
));
11742 op2
= gen_reg_rtx (SImode
);
11744 insn
= (TARGET_64BIT
11745 ? gen_rdtscp_rex64 (op0
, op1
, op2
)
11746 : gen_rdtscp (op0
, op2
));
11749 arg0
= CALL_EXPR_ARG (exp
, 0);
11750 op4
= expand_normal (arg0
);
11751 if (!address_operand (op4
, VOIDmode
))
11753 op4
= convert_memory_address (Pmode
, op4
);
11754 op4
= copy_addr_to_reg (op4
);
11756 emit_move_insn (gen_rtx_MEM (SImode
, op4
), op2
);
11760 || !register_operand (target
, DImode
))
11761 target
= gen_reg_rtx (DImode
);
11765 op1
= expand_simple_binop (DImode
, ASHIFT
, op1
, GEN_INT (32),
11766 op1
, 1, OPTAB_DIRECT
);
11767 op0
= expand_simple_binop (DImode
, IOR
, op0
, op1
,
11768 op0
, 1, OPTAB_DIRECT
);
11771 emit_move_insn (target
, op0
);
11774 case IX86_BUILTIN_ENQCMD
:
11775 case IX86_BUILTIN_ENQCMDS
:
11776 case IX86_BUILTIN_MOVDIR64B
:
11778 arg0
= CALL_EXPR_ARG (exp
, 0);
11779 arg1
= CALL_EXPR_ARG (exp
, 1);
11780 op0
= expand_normal (arg0
);
11781 op1
= expand_normal (arg1
);
11783 op0
= ix86_zero_extend_to_Pmode (op0
);
11784 if (!address_operand (op1
, VOIDmode
))
11786 op1
= convert_memory_address (Pmode
, op1
);
11787 op1
= copy_addr_to_reg (op1
);
11789 op1
= gen_rtx_MEM (XImode
, op1
);
11791 if (fcode
== IX86_BUILTIN_MOVDIR64B
)
11793 emit_insn (gen_movdir64b (Pmode
, op0
, op1
));
11799 || !register_operand (target
, SImode
))
11800 target
= gen_reg_rtx (SImode
);
11802 emit_move_insn (target
, const0_rtx
);
11803 target
= gen_rtx_SUBREG (QImode
, target
, 0);
11805 int unspecv
= (fcode
== IX86_BUILTIN_ENQCMD
11807 : UNSPECV_ENQCMDS
);
11808 icode
= code_for_enqcmd (unspecv
, Pmode
);
11809 emit_insn (GEN_FCN (icode
) (op0
, op1
));
11812 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
11813 gen_rtx_fmt_ee (EQ
, QImode
,
11814 gen_rtx_REG (CCZmode
, FLAGS_REG
),
11816 return SUBREG_REG (target
);
11819 case IX86_BUILTIN_FXSAVE
:
11820 case IX86_BUILTIN_FXRSTOR
:
11821 case IX86_BUILTIN_FXSAVE64
:
11822 case IX86_BUILTIN_FXRSTOR64
:
11823 case IX86_BUILTIN_FNSTENV
:
11824 case IX86_BUILTIN_FLDENV
:
11828 case IX86_BUILTIN_FXSAVE
:
11829 icode
= CODE_FOR_fxsave
;
11831 case IX86_BUILTIN_FXRSTOR
:
11832 icode
= CODE_FOR_fxrstor
;
11834 case IX86_BUILTIN_FXSAVE64
:
11835 icode
= CODE_FOR_fxsave64
;
11837 case IX86_BUILTIN_FXRSTOR64
:
11838 icode
= CODE_FOR_fxrstor64
;
11840 case IX86_BUILTIN_FNSTENV
:
11841 icode
= CODE_FOR_fnstenv
;
11843 case IX86_BUILTIN_FLDENV
:
11844 icode
= CODE_FOR_fldenv
;
11847 gcc_unreachable ();
11850 arg0
= CALL_EXPR_ARG (exp
, 0);
11851 op0
= expand_normal (arg0
);
11853 if (!address_operand (op0
, VOIDmode
))
11855 op0
= convert_memory_address (Pmode
, op0
);
11856 op0
= copy_addr_to_reg (op0
);
11858 op0
= gen_rtx_MEM (mode0
, op0
);
11860 pat
= GEN_FCN (icode
) (op0
);
11865 case IX86_BUILTIN_XSETBV
:
11866 arg0
= CALL_EXPR_ARG (exp
, 0);
11867 arg1
= CALL_EXPR_ARG (exp
, 1);
11868 op0
= expand_normal (arg0
);
11869 op1
= expand_normal (arg1
);
11872 op0
= copy_to_mode_reg (SImode
, op0
);
11874 op1
= force_reg (DImode
, op1
);
11878 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11879 NULL
, 1, OPTAB_DIRECT
);
11881 icode
= CODE_FOR_xsetbv_rex64
;
11883 op2
= gen_lowpart (SImode
, op2
);
11884 op1
= gen_lowpart (SImode
, op1
);
11885 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11889 icode
= CODE_FOR_xsetbv
;
11891 pat
= GEN_FCN (icode
) (op0
, op1
);
11897 case IX86_BUILTIN_XSAVE
:
11898 case IX86_BUILTIN_XRSTOR
:
11899 case IX86_BUILTIN_XSAVE64
:
11900 case IX86_BUILTIN_XRSTOR64
:
11901 case IX86_BUILTIN_XSAVEOPT
:
11902 case IX86_BUILTIN_XSAVEOPT64
:
11903 case IX86_BUILTIN_XSAVES
:
11904 case IX86_BUILTIN_XRSTORS
:
11905 case IX86_BUILTIN_XSAVES64
:
11906 case IX86_BUILTIN_XRSTORS64
:
11907 case IX86_BUILTIN_XSAVEC
:
11908 case IX86_BUILTIN_XSAVEC64
:
11909 arg0
= CALL_EXPR_ARG (exp
, 0);
11910 arg1
= CALL_EXPR_ARG (exp
, 1);
11911 op0
= expand_normal (arg0
);
11912 op1
= expand_normal (arg1
);
11914 if (!address_operand (op0
, VOIDmode
))
11916 op0
= convert_memory_address (Pmode
, op0
);
11917 op0
= copy_addr_to_reg (op0
);
11919 op0
= gen_rtx_MEM (BLKmode
, op0
);
11921 op1
= force_reg (DImode
, op1
);
11925 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11926 NULL
, 1, OPTAB_DIRECT
);
11929 case IX86_BUILTIN_XSAVE
:
11930 icode
= CODE_FOR_xsave_rex64
;
11932 case IX86_BUILTIN_XRSTOR
:
11933 icode
= CODE_FOR_xrstor_rex64
;
11935 case IX86_BUILTIN_XSAVE64
:
11936 icode
= CODE_FOR_xsave64
;
11938 case IX86_BUILTIN_XRSTOR64
:
11939 icode
= CODE_FOR_xrstor64
;
11941 case IX86_BUILTIN_XSAVEOPT
:
11942 icode
= CODE_FOR_xsaveopt_rex64
;
11944 case IX86_BUILTIN_XSAVEOPT64
:
11945 icode
= CODE_FOR_xsaveopt64
;
11947 case IX86_BUILTIN_XSAVES
:
11948 icode
= CODE_FOR_xsaves_rex64
;
11950 case IX86_BUILTIN_XRSTORS
:
11951 icode
= CODE_FOR_xrstors_rex64
;
11953 case IX86_BUILTIN_XSAVES64
:
11954 icode
= CODE_FOR_xsaves64
;
11956 case IX86_BUILTIN_XRSTORS64
:
11957 icode
= CODE_FOR_xrstors64
;
11959 case IX86_BUILTIN_XSAVEC
:
11960 icode
= CODE_FOR_xsavec_rex64
;
11962 case IX86_BUILTIN_XSAVEC64
:
11963 icode
= CODE_FOR_xsavec64
;
11966 gcc_unreachable ();
11969 op2
= gen_lowpart (SImode
, op2
);
11970 op1
= gen_lowpart (SImode
, op1
);
11971 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11977 case IX86_BUILTIN_XSAVE
:
11978 icode
= CODE_FOR_xsave
;
11980 case IX86_BUILTIN_XRSTOR
:
11981 icode
= CODE_FOR_xrstor
;
11983 case IX86_BUILTIN_XSAVEOPT
:
11984 icode
= CODE_FOR_xsaveopt
;
11986 case IX86_BUILTIN_XSAVES
:
11987 icode
= CODE_FOR_xsaves
;
11989 case IX86_BUILTIN_XRSTORS
:
11990 icode
= CODE_FOR_xrstors
;
11992 case IX86_BUILTIN_XSAVEC
:
11993 icode
= CODE_FOR_xsavec
;
11996 gcc_unreachable ();
11998 pat
= GEN_FCN (icode
) (op0
, op1
);
12005 case IX86_BUILTIN_LLWPCB
:
12006 arg0
= CALL_EXPR_ARG (exp
, 0);
12007 op0
= expand_normal (arg0
);
12009 if (!register_operand (op0
, Pmode
))
12010 op0
= ix86_zero_extend_to_Pmode (op0
);
12011 emit_insn (gen_lwp_llwpcb (Pmode
, op0
));
12014 case IX86_BUILTIN_SLWPCB
:
12016 || !register_operand (target
, Pmode
))
12017 target
= gen_reg_rtx (Pmode
);
12018 emit_insn (gen_lwp_slwpcb (Pmode
, target
));
12021 case IX86_BUILTIN_LWPVAL32
:
12022 case IX86_BUILTIN_LWPVAL64
:
12023 case IX86_BUILTIN_LWPINS32
:
12024 case IX86_BUILTIN_LWPINS64
:
12025 mode
= ((fcode
== IX86_BUILTIN_LWPVAL32
12026 || fcode
== IX86_BUILTIN_LWPINS32
)
12027 ? SImode
: DImode
);
12029 if (fcode
== IX86_BUILTIN_LWPVAL32
12030 || fcode
== IX86_BUILTIN_LWPVAL64
)
12031 icode
= code_for_lwp_lwpval (mode
);
12033 icode
= code_for_lwp_lwpins (mode
);
12035 arg0
= CALL_EXPR_ARG (exp
, 0);
12036 arg1
= CALL_EXPR_ARG (exp
, 1);
12037 arg2
= CALL_EXPR_ARG (exp
, 2);
12038 op0
= expand_normal (arg0
);
12039 op1
= expand_normal (arg1
);
12040 op2
= expand_normal (arg2
);
12041 mode0
= insn_data
[icode
].operand
[0].mode
;
12043 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12044 op0
= copy_to_mode_reg (mode0
, op0
);
12045 if (!insn_data
[icode
].operand
[1].predicate (op1
, SImode
))
12046 op1
= copy_to_mode_reg (SImode
, op1
);
12048 if (!CONST_INT_P (op2
))
12050 error ("the last argument must be a 32-bit immediate");
12054 emit_insn (GEN_FCN (icode
) (op0
, op1
, op2
));
12056 if (fcode
== IX86_BUILTIN_LWPINS32
12057 || fcode
== IX86_BUILTIN_LWPINS64
)
12060 || !nonimmediate_operand (target
, QImode
))
12061 target
= gen_reg_rtx (QImode
);
12063 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12065 emit_insn (gen_rtx_SET (target
, pat
));
12072 case IX86_BUILTIN_BEXTRI32
:
12073 case IX86_BUILTIN_BEXTRI64
:
12074 mode
= (fcode
== IX86_BUILTIN_BEXTRI32
? SImode
: DImode
);
12076 arg0
= CALL_EXPR_ARG (exp
, 0);
12077 arg1
= CALL_EXPR_ARG (exp
, 1);
12078 op0
= expand_normal (arg0
);
12079 op1
= expand_normal (arg1
);
12081 if (!CONST_INT_P (op1
))
12083 error ("last argument must be an immediate");
12088 unsigned char lsb_index
= UINTVAL (op1
);
12089 unsigned char length
= UINTVAL (op1
) >> 8;
12091 unsigned char bitsize
= GET_MODE_BITSIZE (mode
);
12093 icode
= code_for_tbm_bextri (mode
);
12095 mode1
= insn_data
[icode
].operand
[1].mode
;
12096 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode1
))
12097 op0
= copy_to_mode_reg (mode1
, op0
);
12099 mode0
= insn_data
[icode
].operand
[0].mode
;
12101 || !register_operand (target
, mode0
))
12102 target
= gen_reg_rtx (mode0
);
12104 if (length
== 0 || lsb_index
>= bitsize
)
12106 emit_move_insn (target
, const0_rtx
);
12110 if (length
+ lsb_index
> bitsize
)
12111 length
= bitsize
- lsb_index
;
12113 op1
= GEN_INT (length
);
12114 op2
= GEN_INT (lsb_index
);
12116 emit_insn (GEN_FCN (icode
) (target
, op0
, op1
, op2
));
12120 case IX86_BUILTIN_RDRAND16_STEP
:
12124 case IX86_BUILTIN_RDRAND32_STEP
:
12128 case IX86_BUILTIN_RDRAND64_STEP
:
12132 arg0
= CALL_EXPR_ARG (exp
, 0);
12133 op1
= expand_normal (arg0
);
12134 if (!address_operand (op1
, VOIDmode
))
12136 op1
= convert_memory_address (Pmode
, op1
);
12137 op1
= copy_addr_to_reg (op1
);
12140 op0
= gen_reg_rtx (mode
);
12141 emit_insn (gen_rdrand (mode
, op0
));
12143 emit_move_insn (gen_rtx_MEM (mode
, op1
), op0
);
12145 op1
= force_reg (SImode
, const1_rtx
);
12147 /* Emit SImode conditional move. */
12148 if (mode
== HImode
)
12150 if (TARGET_ZERO_EXTEND_WITH_AND
12151 && optimize_function_for_speed_p (cfun
))
12153 op2
= force_reg (SImode
, const0_rtx
);
12155 emit_insn (gen_movstricthi
12156 (gen_lowpart (HImode
, op2
), op0
));
12160 op2
= gen_reg_rtx (SImode
);
12162 emit_insn (gen_zero_extendhisi2 (op2
, op0
));
12165 else if (mode
== SImode
)
12168 op2
= gen_rtx_SUBREG (SImode
, op0
, 0);
12171 || !register_operand (target
, SImode
))
12172 target
= gen_reg_rtx (SImode
);
12174 pat
= gen_rtx_GEU (VOIDmode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12176 emit_insn (gen_rtx_SET (target
,
12177 gen_rtx_IF_THEN_ELSE (SImode
, pat
, op2
, op1
)));
12180 case IX86_BUILTIN_RDSEED16_STEP
:
12184 case IX86_BUILTIN_RDSEED32_STEP
:
12188 case IX86_BUILTIN_RDSEED64_STEP
:
12192 arg0
= CALL_EXPR_ARG (exp
, 0);
12193 op1
= expand_normal (arg0
);
12194 if (!address_operand (op1
, VOIDmode
))
12196 op1
= convert_memory_address (Pmode
, op1
);
12197 op1
= copy_addr_to_reg (op1
);
12200 op0
= gen_reg_rtx (mode
);
12201 emit_insn (gen_rdseed (mode
, op0
));
12203 emit_move_insn (gen_rtx_MEM (mode
, op1
), op0
);
12205 op2
= gen_reg_rtx (QImode
);
12207 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12209 emit_insn (gen_rtx_SET (op2
, pat
));
12212 || !register_operand (target
, SImode
))
12213 target
= gen_reg_rtx (SImode
);
12215 emit_insn (gen_zero_extendqisi2 (target
, op2
));
12218 case IX86_BUILTIN_SBB32
:
12219 icode
= CODE_FOR_subborrowsi
;
12220 icode2
= CODE_FOR_subborrowsi_0
;
12226 case IX86_BUILTIN_SBB64
:
12227 icode
= CODE_FOR_subborrowdi
;
12228 icode2
= CODE_FOR_subborrowdi_0
;
12234 case IX86_BUILTIN_ADDCARRYX32
:
12235 icode
= CODE_FOR_addcarrysi
;
12236 icode2
= CODE_FOR_addcarrysi_0
;
12242 case IX86_BUILTIN_ADDCARRYX64
:
12243 icode
= CODE_FOR_addcarrydi
;
12244 icode2
= CODE_FOR_addcarrydi_0
;
12250 arg0
= CALL_EXPR_ARG (exp
, 0); /* unsigned char c_in. */
12251 arg1
= CALL_EXPR_ARG (exp
, 1); /* unsigned int src1. */
12252 arg2
= CALL_EXPR_ARG (exp
, 2); /* unsigned int src2. */
12253 arg3
= CALL_EXPR_ARG (exp
, 3); /* unsigned int *sum_out. */
12255 op1
= expand_normal (arg0
);
12256 if (!integer_zerop (arg0
))
12257 op1
= copy_to_mode_reg (QImode
, convert_to_mode (QImode
, op1
, 1));
12259 op2
= expand_normal (arg1
);
12260 if (!register_operand (op2
, mode0
))
12261 op2
= copy_to_mode_reg (mode0
, op2
);
12263 op3
= expand_normal (arg2
);
12264 if (!register_operand (op3
, mode0
))
12265 op3
= copy_to_mode_reg (mode0
, op3
);
12267 op4
= expand_normal (arg3
);
12268 if (!address_operand (op4
, VOIDmode
))
12270 op4
= convert_memory_address (Pmode
, op4
);
12271 op4
= copy_addr_to_reg (op4
);
12274 op0
= gen_reg_rtx (mode0
);
12275 if (integer_zerop (arg0
))
12277 /* If arg0 is 0, optimize right away into add or sub
12278 instruction that sets CCCmode flags. */
12279 op1
= gen_rtx_REG (mode2
, FLAGS_REG
);
12280 emit_insn (GEN_FCN (icode2
) (op0
, op2
, op3
));
12284 /* Generate CF from input operand. */
12285 emit_insn (gen_addqi3_cconly_overflow (op1
, constm1_rtx
));
12287 /* Generate instruction that consumes CF. */
12288 op1
= gen_rtx_REG (CCCmode
, FLAGS_REG
);
12289 pat
= gen_rtx_LTU (mode1
, op1
, const0_rtx
);
12290 pat2
= gen_rtx_LTU (mode0
, op1
, const0_rtx
);
12291 emit_insn (GEN_FCN (icode
) (op0
, op2
, op3
, op1
, pat
, pat2
));
12294 /* Return current CF value. */
12296 target
= gen_reg_rtx (QImode
);
12298 pat
= gen_rtx_LTU (QImode
, op1
, const0_rtx
);
12299 emit_insn (gen_rtx_SET (target
, pat
));
12301 /* Store the result. */
12302 emit_move_insn (gen_rtx_MEM (mode0
, op4
), op0
);
12306 case IX86_BUILTIN_READ_FLAGS
:
12307 emit_insn (gen_push (gen_rtx_REG (word_mode
, FLAGS_REG
)));
12310 || target
== NULL_RTX
12311 || !nonimmediate_operand (target
, word_mode
)
12312 || GET_MODE (target
) != word_mode
)
12313 target
= gen_reg_rtx (word_mode
);
12315 emit_insn (gen_pop (target
));
12318 case IX86_BUILTIN_WRITE_FLAGS
:
12320 arg0
= CALL_EXPR_ARG (exp
, 0);
12321 op0
= expand_normal (arg0
);
12322 if (!general_no_elim_operand (op0
, word_mode
))
12323 op0
= copy_to_mode_reg (word_mode
, op0
);
12325 emit_insn (gen_push (op0
));
12326 emit_insn (gen_pop (gen_rtx_REG (word_mode
, FLAGS_REG
)));
12329 case IX86_BUILTIN_KTESTC8
:
12330 icode
= CODE_FOR_ktestqi
;
12334 case IX86_BUILTIN_KTESTZ8
:
12335 icode
= CODE_FOR_ktestqi
;
12339 case IX86_BUILTIN_KTESTC16
:
12340 icode
= CODE_FOR_ktesthi
;
12344 case IX86_BUILTIN_KTESTZ16
:
12345 icode
= CODE_FOR_ktesthi
;
12349 case IX86_BUILTIN_KTESTC32
:
12350 icode
= CODE_FOR_ktestsi
;
12354 case IX86_BUILTIN_KTESTZ32
:
12355 icode
= CODE_FOR_ktestsi
;
12359 case IX86_BUILTIN_KTESTC64
:
12360 icode
= CODE_FOR_ktestdi
;
12364 case IX86_BUILTIN_KTESTZ64
:
12365 icode
= CODE_FOR_ktestdi
;
12369 case IX86_BUILTIN_KORTESTC8
:
12370 icode
= CODE_FOR_kortestqi
;
12374 case IX86_BUILTIN_KORTESTZ8
:
12375 icode
= CODE_FOR_kortestqi
;
12379 case IX86_BUILTIN_KORTESTC16
:
12380 icode
= CODE_FOR_kortesthi
;
12384 case IX86_BUILTIN_KORTESTZ16
:
12385 icode
= CODE_FOR_kortesthi
;
12389 case IX86_BUILTIN_KORTESTC32
:
12390 icode
= CODE_FOR_kortestsi
;
12394 case IX86_BUILTIN_KORTESTZ32
:
12395 icode
= CODE_FOR_kortestsi
;
12399 case IX86_BUILTIN_KORTESTC64
:
12400 icode
= CODE_FOR_kortestdi
;
12404 case IX86_BUILTIN_KORTESTZ64
:
12405 icode
= CODE_FOR_kortestdi
;
12409 arg0
= CALL_EXPR_ARG (exp
, 0); /* Mask reg src1. */
12410 arg1
= CALL_EXPR_ARG (exp
, 1); /* Mask reg src2. */
12411 op0
= expand_normal (arg0
);
12412 op1
= expand_normal (arg1
);
12414 mode0
= insn_data
[icode
].operand
[0].mode
;
12415 mode1
= insn_data
[icode
].operand
[1].mode
;
12417 if (GET_MODE (op0
) != VOIDmode
)
12418 op0
= force_reg (GET_MODE (op0
), op0
);
12420 op0
= gen_lowpart (mode0
, op0
);
12422 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12423 op0
= copy_to_mode_reg (mode0
, op0
);
12425 if (GET_MODE (op1
) != VOIDmode
)
12426 op1
= force_reg (GET_MODE (op1
), op1
);
12428 op1
= gen_lowpart (mode1
, op1
);
12430 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12431 op1
= copy_to_mode_reg (mode1
, op1
);
12433 target
= gen_reg_rtx (QImode
);
12435 /* Emit kortest. */
12436 emit_insn (GEN_FCN (icode
) (op0
, op1
));
12437 /* And use setcc to return result from flags. */
12438 ix86_expand_setcc (target
, EQ
,
12439 gen_rtx_REG (mode3
, FLAGS_REG
), const0_rtx
);
12442 case IX86_BUILTIN_GATHERSIV2DF
:
12443 icode
= CODE_FOR_avx2_gathersiv2df
;
12445 case IX86_BUILTIN_GATHERSIV4DF
:
12446 icode
= CODE_FOR_avx2_gathersiv4df
;
12448 case IX86_BUILTIN_GATHERDIV2DF
:
12449 icode
= CODE_FOR_avx2_gatherdiv2df
;
12451 case IX86_BUILTIN_GATHERDIV4DF
:
12452 icode
= CODE_FOR_avx2_gatherdiv4df
;
12454 case IX86_BUILTIN_GATHERSIV4SF
:
12455 icode
= CODE_FOR_avx2_gathersiv4sf
;
12457 case IX86_BUILTIN_GATHERSIV8SF
:
12458 icode
= CODE_FOR_avx2_gathersiv8sf
;
12460 case IX86_BUILTIN_GATHERDIV4SF
:
12461 icode
= CODE_FOR_avx2_gatherdiv4sf
;
12463 case IX86_BUILTIN_GATHERDIV8SF
:
12464 icode
= CODE_FOR_avx2_gatherdiv8sf
;
12466 case IX86_BUILTIN_GATHERSIV2DI
:
12467 icode
= CODE_FOR_avx2_gathersiv2di
;
12469 case IX86_BUILTIN_GATHERSIV4DI
:
12470 icode
= CODE_FOR_avx2_gathersiv4di
;
12472 case IX86_BUILTIN_GATHERDIV2DI
:
12473 icode
= CODE_FOR_avx2_gatherdiv2di
;
12475 case IX86_BUILTIN_GATHERDIV4DI
:
12476 icode
= CODE_FOR_avx2_gatherdiv4di
;
12478 case IX86_BUILTIN_GATHERSIV4SI
:
12479 icode
= CODE_FOR_avx2_gathersiv4si
;
12481 case IX86_BUILTIN_GATHERSIV8SI
:
12482 icode
= CODE_FOR_avx2_gathersiv8si
;
12484 case IX86_BUILTIN_GATHERDIV4SI
:
12485 icode
= CODE_FOR_avx2_gatherdiv4si
;
12487 case IX86_BUILTIN_GATHERDIV8SI
:
12488 icode
= CODE_FOR_avx2_gatherdiv8si
;
12490 case IX86_BUILTIN_GATHERALTSIV4DF
:
12491 icode
= CODE_FOR_avx2_gathersiv4df
;
12493 case IX86_BUILTIN_GATHERALTDIV8SF
:
12494 icode
= CODE_FOR_avx2_gatherdiv8sf
;
12496 case IX86_BUILTIN_GATHERALTSIV4DI
:
12497 icode
= CODE_FOR_avx2_gathersiv4di
;
12499 case IX86_BUILTIN_GATHERALTDIV8SI
:
12500 icode
= CODE_FOR_avx2_gatherdiv8si
;
12502 case IX86_BUILTIN_GATHER3SIV16SF
:
12503 icode
= CODE_FOR_avx512f_gathersiv16sf
;
12505 case IX86_BUILTIN_GATHER3SIV8DF
:
12506 icode
= CODE_FOR_avx512f_gathersiv8df
;
12508 case IX86_BUILTIN_GATHER3DIV16SF
:
12509 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
12511 case IX86_BUILTIN_GATHER3DIV8DF
:
12512 icode
= CODE_FOR_avx512f_gatherdiv8df
;
12514 case IX86_BUILTIN_GATHER3SIV16SI
:
12515 icode
= CODE_FOR_avx512f_gathersiv16si
;
12517 case IX86_BUILTIN_GATHER3SIV8DI
:
12518 icode
= CODE_FOR_avx512f_gathersiv8di
;
12520 case IX86_BUILTIN_GATHER3DIV16SI
:
12521 icode
= CODE_FOR_avx512f_gatherdiv16si
;
12523 case IX86_BUILTIN_GATHER3DIV8DI
:
12524 icode
= CODE_FOR_avx512f_gatherdiv8di
;
12526 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12527 icode
= CODE_FOR_avx512f_gathersiv8df
;
12529 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12530 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
12532 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12533 icode
= CODE_FOR_avx512f_gathersiv8di
;
12535 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12536 icode
= CODE_FOR_avx512f_gatherdiv16si
;
12538 case IX86_BUILTIN_GATHER3SIV2DF
:
12539 icode
= CODE_FOR_avx512vl_gathersiv2df
;
12541 case IX86_BUILTIN_GATHER3SIV4DF
:
12542 icode
= CODE_FOR_avx512vl_gathersiv4df
;
12544 case IX86_BUILTIN_GATHER3DIV2DF
:
12545 icode
= CODE_FOR_avx512vl_gatherdiv2df
;
12547 case IX86_BUILTIN_GATHER3DIV4DF
:
12548 icode
= CODE_FOR_avx512vl_gatherdiv4df
;
12550 case IX86_BUILTIN_GATHER3SIV4SF
:
12551 icode
= CODE_FOR_avx512vl_gathersiv4sf
;
12553 case IX86_BUILTIN_GATHER3SIV8SF
:
12554 icode
= CODE_FOR_avx512vl_gathersiv8sf
;
12556 case IX86_BUILTIN_GATHER3DIV4SF
:
12557 icode
= CODE_FOR_avx512vl_gatherdiv4sf
;
12559 case IX86_BUILTIN_GATHER3DIV8SF
:
12560 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
12562 case IX86_BUILTIN_GATHER3SIV2DI
:
12563 icode
= CODE_FOR_avx512vl_gathersiv2di
;
12565 case IX86_BUILTIN_GATHER3SIV4DI
:
12566 icode
= CODE_FOR_avx512vl_gathersiv4di
;
12568 case IX86_BUILTIN_GATHER3DIV2DI
:
12569 icode
= CODE_FOR_avx512vl_gatherdiv2di
;
12571 case IX86_BUILTIN_GATHER3DIV4DI
:
12572 icode
= CODE_FOR_avx512vl_gatherdiv4di
;
12574 case IX86_BUILTIN_GATHER3SIV4SI
:
12575 icode
= CODE_FOR_avx512vl_gathersiv4si
;
12577 case IX86_BUILTIN_GATHER3SIV8SI
:
12578 icode
= CODE_FOR_avx512vl_gathersiv8si
;
12580 case IX86_BUILTIN_GATHER3DIV4SI
:
12581 icode
= CODE_FOR_avx512vl_gatherdiv4si
;
12583 case IX86_BUILTIN_GATHER3DIV8SI
:
12584 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
12586 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12587 icode
= CODE_FOR_avx512vl_gathersiv4df
;
12589 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12590 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
12592 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12593 icode
= CODE_FOR_avx512vl_gathersiv4di
;
12595 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12596 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
12598 case IX86_BUILTIN_SCATTERSIV16SF
:
12599 icode
= CODE_FOR_avx512f_scattersiv16sf
;
12601 case IX86_BUILTIN_SCATTERSIV8DF
:
12602 icode
= CODE_FOR_avx512f_scattersiv8df
;
12604 case IX86_BUILTIN_SCATTERDIV16SF
:
12605 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
12607 case IX86_BUILTIN_SCATTERDIV8DF
:
12608 icode
= CODE_FOR_avx512f_scatterdiv8df
;
12610 case IX86_BUILTIN_SCATTERSIV16SI
:
12611 icode
= CODE_FOR_avx512f_scattersiv16si
;
12613 case IX86_BUILTIN_SCATTERSIV8DI
:
12614 icode
= CODE_FOR_avx512f_scattersiv8di
;
12616 case IX86_BUILTIN_SCATTERDIV16SI
:
12617 icode
= CODE_FOR_avx512f_scatterdiv16si
;
12619 case IX86_BUILTIN_SCATTERDIV8DI
:
12620 icode
= CODE_FOR_avx512f_scatterdiv8di
;
12622 case IX86_BUILTIN_SCATTERSIV8SF
:
12623 icode
= CODE_FOR_avx512vl_scattersiv8sf
;
12625 case IX86_BUILTIN_SCATTERSIV4SF
:
12626 icode
= CODE_FOR_avx512vl_scattersiv4sf
;
12628 case IX86_BUILTIN_SCATTERSIV4DF
:
12629 icode
= CODE_FOR_avx512vl_scattersiv4df
;
12631 case IX86_BUILTIN_SCATTERSIV2DF
:
12632 icode
= CODE_FOR_avx512vl_scattersiv2df
;
12634 case IX86_BUILTIN_SCATTERDIV8SF
:
12635 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
12637 case IX86_BUILTIN_SCATTERDIV4SF
:
12638 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
12640 case IX86_BUILTIN_SCATTERDIV4DF
:
12641 icode
= CODE_FOR_avx512vl_scatterdiv4df
;
12643 case IX86_BUILTIN_SCATTERDIV2DF
:
12644 icode
= CODE_FOR_avx512vl_scatterdiv2df
;
12646 case IX86_BUILTIN_SCATTERSIV8SI
:
12647 icode
= CODE_FOR_avx512vl_scattersiv8si
;
12649 case IX86_BUILTIN_SCATTERSIV4SI
:
12650 icode
= CODE_FOR_avx512vl_scattersiv4si
;
12652 case IX86_BUILTIN_SCATTERSIV4DI
:
12653 icode
= CODE_FOR_avx512vl_scattersiv4di
;
12655 case IX86_BUILTIN_SCATTERSIV2DI
:
12656 icode
= CODE_FOR_avx512vl_scattersiv2di
;
12658 case IX86_BUILTIN_SCATTERDIV8SI
:
12659 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
12661 case IX86_BUILTIN_SCATTERDIV4SI
:
12662 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
12664 case IX86_BUILTIN_SCATTERDIV4DI
:
12665 icode
= CODE_FOR_avx512vl_scatterdiv4di
;
12667 case IX86_BUILTIN_SCATTERDIV2DI
:
12668 icode
= CODE_FOR_avx512vl_scatterdiv2di
;
12670 case IX86_BUILTIN_GATHERPFDPD
:
12671 icode
= CODE_FOR_avx512pf_gatherpfv8sidf
;
12672 goto vec_prefetch_gen
;
12673 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12674 icode
= CODE_FOR_avx512f_scattersiv8df
;
12676 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12677 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
12679 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12680 icode
= CODE_FOR_avx512f_scattersiv8di
;
12682 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12683 icode
= CODE_FOR_avx512f_scatterdiv16si
;
12685 case IX86_BUILTIN_SCATTERALTSIV4DF
:
12686 icode
= CODE_FOR_avx512vl_scattersiv4df
;
12688 case IX86_BUILTIN_SCATTERALTDIV8SF
:
12689 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
12691 case IX86_BUILTIN_SCATTERALTSIV4DI
:
12692 icode
= CODE_FOR_avx512vl_scattersiv4di
;
12694 case IX86_BUILTIN_SCATTERALTDIV8SI
:
12695 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
12697 case IX86_BUILTIN_SCATTERALTSIV2DF
:
12698 icode
= CODE_FOR_avx512vl_scattersiv2df
;
12700 case IX86_BUILTIN_SCATTERALTDIV4SF
:
12701 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
12703 case IX86_BUILTIN_SCATTERALTSIV2DI
:
12704 icode
= CODE_FOR_avx512vl_scattersiv2di
;
12706 case IX86_BUILTIN_SCATTERALTDIV4SI
:
12707 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
12709 case IX86_BUILTIN_GATHERPFDPS
:
12710 icode
= CODE_FOR_avx512pf_gatherpfv16sisf
;
12711 goto vec_prefetch_gen
;
12712 case IX86_BUILTIN_GATHERPFQPD
:
12713 icode
= CODE_FOR_avx512pf_gatherpfv8didf
;
12714 goto vec_prefetch_gen
;
12715 case IX86_BUILTIN_GATHERPFQPS
:
12716 icode
= CODE_FOR_avx512pf_gatherpfv8disf
;
12717 goto vec_prefetch_gen
;
12718 case IX86_BUILTIN_SCATTERPFDPD
:
12719 icode
= CODE_FOR_avx512pf_scatterpfv8sidf
;
12720 goto vec_prefetch_gen
;
12721 case IX86_BUILTIN_SCATTERPFDPS
:
12722 icode
= CODE_FOR_avx512pf_scatterpfv16sisf
;
12723 goto vec_prefetch_gen
;
12724 case IX86_BUILTIN_SCATTERPFQPD
:
12725 icode
= CODE_FOR_avx512pf_scatterpfv8didf
;
12726 goto vec_prefetch_gen
;
12727 case IX86_BUILTIN_SCATTERPFQPS
:
12728 icode
= CODE_FOR_avx512pf_scatterpfv8disf
;
12729 goto vec_prefetch_gen
;
12733 rtx (*gen
) (rtx
, rtx
);
12735 arg0
= CALL_EXPR_ARG (exp
, 0);
12736 arg1
= CALL_EXPR_ARG (exp
, 1);
12737 arg2
= CALL_EXPR_ARG (exp
, 2);
12738 arg3
= CALL_EXPR_ARG (exp
, 3);
12739 arg4
= CALL_EXPR_ARG (exp
, 4);
12740 op0
= expand_normal (arg0
);
12741 op1
= expand_normal (arg1
);
12742 op2
= expand_normal (arg2
);
12743 op3
= expand_normal (arg3
);
12744 op4
= expand_normal (arg4
);
12745 /* Note the arg order is different from the operand order. */
12746 mode0
= insn_data
[icode
].operand
[1].mode
;
12747 mode2
= insn_data
[icode
].operand
[3].mode
;
12748 mode3
= insn_data
[icode
].operand
[4].mode
;
12749 mode4
= insn_data
[icode
].operand
[5].mode
;
12751 if (target
== NULL_RTX
12752 || GET_MODE (target
) != insn_data
[icode
].operand
[0].mode
12753 || !insn_data
[icode
].operand
[0].predicate (target
,
12754 GET_MODE (target
)))
12755 subtarget
= gen_reg_rtx (insn_data
[icode
].operand
[0].mode
);
12757 subtarget
= target
;
12761 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12762 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12763 half
= gen_reg_rtx (V8SImode
);
12764 if (!nonimmediate_operand (op2
, V16SImode
))
12765 op2
= copy_to_mode_reg (V16SImode
, op2
);
12766 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12769 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12770 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12771 case IX86_BUILTIN_GATHERALTSIV4DF
:
12772 case IX86_BUILTIN_GATHERALTSIV4DI
:
12773 half
= gen_reg_rtx (V4SImode
);
12774 if (!nonimmediate_operand (op2
, V8SImode
))
12775 op2
= copy_to_mode_reg (V8SImode
, op2
);
12776 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
12779 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12780 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12781 half
= gen_reg_rtx (mode0
);
12782 if (mode0
== V8SFmode
)
12783 gen
= gen_vec_extract_lo_v16sf
;
12785 gen
= gen_vec_extract_lo_v16si
;
12786 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12787 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12788 emit_insn (gen (half
, op0
));
12790 op3
= lowpart_subreg (QImode
, op3
, HImode
);
12792 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12793 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12794 case IX86_BUILTIN_GATHERALTDIV8SF
:
12795 case IX86_BUILTIN_GATHERALTDIV8SI
:
12796 half
= gen_reg_rtx (mode0
);
12797 if (mode0
== V4SFmode
)
12798 gen
= gen_vec_extract_lo_v8sf
;
12800 gen
= gen_vec_extract_lo_v8si
;
12801 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12802 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12803 emit_insn (gen (half
, op0
));
12805 if (VECTOR_MODE_P (GET_MODE (op3
)))
12807 half
= gen_reg_rtx (mode0
);
12808 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12809 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12810 emit_insn (gen (half
, op3
));
12818 /* Force memory operand only with base register here. But we
12819 don't want to do it on memory operand for other builtin
12821 op1
= ix86_zero_extend_to_Pmode (op1
);
12823 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
12824 op0
= copy_to_mode_reg (mode0
, op0
);
12825 if (!insn_data
[icode
].operand
[2].predicate (op1
, Pmode
))
12826 op1
= copy_to_mode_reg (Pmode
, op1
);
12827 if (!insn_data
[icode
].operand
[3].predicate (op2
, mode2
))
12828 op2
= copy_to_mode_reg (mode2
, op2
);
12830 op3
= fixup_modeless_constant (op3
, mode3
);
12832 if (GET_MODE (op3
) == mode3
|| GET_MODE (op3
) == VOIDmode
)
12834 if (!insn_data
[icode
].operand
[4].predicate (op3
, mode3
))
12835 op3
= copy_to_mode_reg (mode3
, op3
);
12839 op3
= copy_to_reg (op3
);
12840 op3
= lowpart_subreg (mode3
, op3
, GET_MODE (op3
));
12842 if (!insn_data
[icode
].operand
[5].predicate (op4
, mode4
))
12844 error ("the last argument must be scale 1, 2, 4, 8");
12848 /* Optimize. If mask is known to have all high bits set,
12849 replace op0 with pc_rtx to signal that the instruction
12850 overwrites the whole destination and doesn't use its
12851 previous contents. */
12854 if (TREE_CODE (arg3
) == INTEGER_CST
)
12856 if (integer_all_onesp (arg3
))
12859 else if (TREE_CODE (arg3
) == VECTOR_CST
)
12861 unsigned int negative
= 0;
12862 for (i
= 0; i
< VECTOR_CST_NELTS (arg3
); ++i
)
12864 tree cst
= VECTOR_CST_ELT (arg3
, i
);
12865 if (TREE_CODE (cst
) == INTEGER_CST
12866 && tree_int_cst_sign_bit (cst
))
12868 else if (TREE_CODE (cst
) == REAL_CST
12869 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst
)))
12872 if (negative
== TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3
)))
12875 else if (TREE_CODE (arg3
) == SSA_NAME
12876 && TREE_CODE (TREE_TYPE (arg3
)) == VECTOR_TYPE
)
12878 /* Recognize also when mask is like:
12879 __v2df src = _mm_setzero_pd ();
12880 __v2df mask = _mm_cmpeq_pd (src, src);
12882 __v8sf src = _mm256_setzero_ps ();
12883 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12884 as that is a cheaper way to load all ones into
12885 a register than having to load a constant from
12887 gimple
*def_stmt
= SSA_NAME_DEF_STMT (arg3
);
12888 if (is_gimple_call (def_stmt
))
12890 tree fndecl
= gimple_call_fndecl (def_stmt
);
12892 && fndecl_built_in_p (fndecl
, BUILT_IN_MD
))
12893 switch (DECL_MD_FUNCTION_CODE (fndecl
))
12895 case IX86_BUILTIN_CMPPD
:
12896 case IX86_BUILTIN_CMPPS
:
12897 case IX86_BUILTIN_CMPPD256
:
12898 case IX86_BUILTIN_CMPPS256
:
12899 if (!integer_zerop (gimple_call_arg (def_stmt
, 2)))
12902 case IX86_BUILTIN_CMPEQPD
:
12903 case IX86_BUILTIN_CMPEQPS
:
12904 if (initializer_zerop (gimple_call_arg (def_stmt
, 0))
12905 && initializer_zerop (gimple_call_arg (def_stmt
,
12916 pat
= GEN_FCN (icode
) (subtarget
, op0
, op1
, op2
, op3
, op4
);
12923 case IX86_BUILTIN_GATHER3DIV16SF
:
12924 if (target
== NULL_RTX
)
12925 target
= gen_reg_rtx (V8SFmode
);
12926 emit_insn (gen_vec_extract_lo_v16sf (target
, subtarget
));
12928 case IX86_BUILTIN_GATHER3DIV16SI
:
12929 if (target
== NULL_RTX
)
12930 target
= gen_reg_rtx (V8SImode
);
12931 emit_insn (gen_vec_extract_lo_v16si (target
, subtarget
));
12933 case IX86_BUILTIN_GATHER3DIV8SF
:
12934 case IX86_BUILTIN_GATHERDIV8SF
:
12935 if (target
== NULL_RTX
)
12936 target
= gen_reg_rtx (V4SFmode
);
12937 emit_insn (gen_vec_extract_lo_v8sf (target
, subtarget
));
12939 case IX86_BUILTIN_GATHER3DIV8SI
:
12940 case IX86_BUILTIN_GATHERDIV8SI
:
12941 if (target
== NULL_RTX
)
12942 target
= gen_reg_rtx (V4SImode
);
12943 emit_insn (gen_vec_extract_lo_v8si (target
, subtarget
));
12946 target
= subtarget
;
12952 arg0
= CALL_EXPR_ARG (exp
, 0);
12953 arg1
= CALL_EXPR_ARG (exp
, 1);
12954 arg2
= CALL_EXPR_ARG (exp
, 2);
12955 arg3
= CALL_EXPR_ARG (exp
, 3);
12956 arg4
= CALL_EXPR_ARG (exp
, 4);
12957 op0
= expand_normal (arg0
);
12958 op1
= expand_normal (arg1
);
12959 op2
= expand_normal (arg2
);
12960 op3
= expand_normal (arg3
);
12961 op4
= expand_normal (arg4
);
12962 mode1
= insn_data
[icode
].operand
[1].mode
;
12963 mode2
= insn_data
[icode
].operand
[2].mode
;
12964 mode3
= insn_data
[icode
].operand
[3].mode
;
12965 mode4
= insn_data
[icode
].operand
[4].mode
;
12967 /* Scatter instruction stores operand op3 to memory with
12968 indices from op2 and scale from op4 under writemask op1.
12969 If index operand op2 has more elements then source operand
12970 op3 one need to use only its low half. And vice versa. */
12973 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12974 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12975 half
= gen_reg_rtx (V8SImode
);
12976 if (!nonimmediate_operand (op2
, V16SImode
))
12977 op2
= copy_to_mode_reg (V16SImode
, op2
);
12978 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12981 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12982 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12983 half
= gen_reg_rtx (mode3
);
12984 if (mode3
== V8SFmode
)
12985 gen
= gen_vec_extract_lo_v16sf
;
12987 gen
= gen_vec_extract_lo_v16si
;
12988 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12989 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12990 emit_insn (gen (half
, op3
));
12993 case IX86_BUILTIN_SCATTERALTSIV4DF
:
12994 case IX86_BUILTIN_SCATTERALTSIV4DI
:
12995 half
= gen_reg_rtx (V4SImode
);
12996 if (!nonimmediate_operand (op2
, V8SImode
))
12997 op2
= copy_to_mode_reg (V8SImode
, op2
);
12998 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
13001 case IX86_BUILTIN_SCATTERALTDIV8SF
:
13002 case IX86_BUILTIN_SCATTERALTDIV8SI
:
13003 half
= gen_reg_rtx (mode3
);
13004 if (mode3
== V4SFmode
)
13005 gen
= gen_vec_extract_lo_v8sf
;
13007 gen
= gen_vec_extract_lo_v8si
;
13008 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
13009 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
13010 emit_insn (gen (half
, op3
));
13013 case IX86_BUILTIN_SCATTERALTSIV2DF
:
13014 case IX86_BUILTIN_SCATTERALTSIV2DI
:
13015 if (!nonimmediate_operand (op2
, V4SImode
))
13016 op2
= copy_to_mode_reg (V4SImode
, op2
);
13018 case IX86_BUILTIN_SCATTERALTDIV4SF
:
13019 case IX86_BUILTIN_SCATTERALTDIV4SI
:
13020 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
13021 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
13027 /* Force memory operand only with base register here. But we
13028 don't want to do it on memory operand for other builtin
13030 op0
= force_reg (Pmode
, convert_to_mode (Pmode
, op0
, 1));
13032 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
13033 op0
= copy_to_mode_reg (Pmode
, op0
);
13035 op1
= fixup_modeless_constant (op1
, mode1
);
13037 if (GET_MODE (op1
) == mode1
|| GET_MODE (op1
) == VOIDmode
)
13039 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
13040 op1
= copy_to_mode_reg (mode1
, op1
);
13044 op1
= copy_to_reg (op1
);
13045 op1
= lowpart_subreg (mode1
, op1
, GET_MODE (op1
));
13048 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
13049 op2
= copy_to_mode_reg (mode2
, op2
);
13051 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
13052 op3
= copy_to_mode_reg (mode3
, op3
);
13054 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
13056 error ("the last argument must be scale 1, 2, 4, 8");
13060 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
13068 arg0
= CALL_EXPR_ARG (exp
, 0);
13069 arg1
= CALL_EXPR_ARG (exp
, 1);
13070 arg2
= CALL_EXPR_ARG (exp
, 2);
13071 arg3
= CALL_EXPR_ARG (exp
, 3);
13072 arg4
= CALL_EXPR_ARG (exp
, 4);
13073 op0
= expand_normal (arg0
);
13074 op1
= expand_normal (arg1
);
13075 op2
= expand_normal (arg2
);
13076 op3
= expand_normal (arg3
);
13077 op4
= expand_normal (arg4
);
13078 mode0
= insn_data
[icode
].operand
[0].mode
;
13079 mode1
= insn_data
[icode
].operand
[1].mode
;
13080 mode3
= insn_data
[icode
].operand
[3].mode
;
13081 mode4
= insn_data
[icode
].operand
[4].mode
;
13083 op0
= fixup_modeless_constant (op0
, mode0
);
13085 if (GET_MODE (op0
) == mode0
|| GET_MODE (op0
) == VOIDmode
)
13087 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
13088 op0
= copy_to_mode_reg (mode0
, op0
);
13092 op0
= copy_to_reg (op0
);
13093 op0
= lowpart_subreg (mode0
, op0
, GET_MODE (op0
));
13096 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
13097 op1
= copy_to_mode_reg (mode1
, op1
);
13099 /* Force memory operand only with base register here. But we
13100 don't want to do it on memory operand for other builtin
13102 op2
= force_reg (Pmode
, convert_to_mode (Pmode
, op2
, 1));
13104 if (!insn_data
[icode
].operand
[2].predicate (op2
, Pmode
))
13105 op2
= copy_to_mode_reg (Pmode
, op2
);
13107 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
13109 error ("the forth argument must be scale 1, 2, 4, 8");
13113 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
13115 error ("incorrect hint operand");
13119 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
13127 case IX86_BUILTIN_XABORT
:
13128 icode
= CODE_FOR_xabort
;
13129 arg0
= CALL_EXPR_ARG (exp
, 0);
13130 op0
= expand_normal (arg0
);
13131 mode0
= insn_data
[icode
].operand
[0].mode
;
13132 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
13134 error ("the argument to %<xabort%> intrinsic must "
13135 "be an 8-bit immediate");
13138 emit_insn (gen_xabort (op0
));
13141 case IX86_BUILTIN_RDSSPD
:
13142 case IX86_BUILTIN_RDSSPQ
:
13143 mode
= (fcode
== IX86_BUILTIN_RDSSPD
? SImode
: DImode
);
13146 || !register_operand (target
, mode
))
13147 target
= gen_reg_rtx (mode
);
13149 op0
= force_reg (mode
, const0_rtx
);
13151 emit_insn (gen_rdssp (mode
, target
, op0
));
13154 case IX86_BUILTIN_INCSSPD
:
13155 case IX86_BUILTIN_INCSSPQ
:
13156 mode
= (fcode
== IX86_BUILTIN_INCSSPD
? SImode
: DImode
);
13158 arg0
= CALL_EXPR_ARG (exp
, 0);
13159 op0
= expand_normal (arg0
);
13161 op0
= force_reg (mode
, op0
);
13163 emit_insn (gen_incssp (mode
, op0
));
13166 case IX86_BUILTIN_HRESET
:
13167 icode
= CODE_FOR_hreset
;
13168 arg0
= CALL_EXPR_ARG (exp
, 0);
13169 op0
= expand_normal (arg0
);
13170 op0
= force_reg (SImode
, op0
);
13171 emit_insn (gen_hreset (op0
));
13174 case IX86_BUILTIN_RSTORSSP
:
13175 case IX86_BUILTIN_CLRSSBSY
:
13176 arg0
= CALL_EXPR_ARG (exp
, 0);
13177 op0
= expand_normal (arg0
);
13178 icode
= (fcode
== IX86_BUILTIN_RSTORSSP
13179 ? CODE_FOR_rstorssp
13180 : CODE_FOR_clrssbsy
);
13182 if (!address_operand (op0
, VOIDmode
))
13184 op0
= convert_memory_address (Pmode
, op0
);
13185 op0
= copy_addr_to_reg (op0
);
13187 emit_insn (GEN_FCN (icode
) (gen_rtx_MEM (DImode
, op0
)));
13190 case IX86_BUILTIN_WRSSD
:
13191 case IX86_BUILTIN_WRSSQ
:
13192 case IX86_BUILTIN_WRUSSD
:
13193 case IX86_BUILTIN_WRUSSQ
:
13194 mode
= ((fcode
== IX86_BUILTIN_WRSSD
13195 || fcode
== IX86_BUILTIN_WRUSSD
)
13196 ? SImode
: DImode
);
13198 arg0
= CALL_EXPR_ARG (exp
, 0);
13199 op0
= expand_normal (arg0
);
13200 arg1
= CALL_EXPR_ARG (exp
, 1);
13201 op1
= expand_normal (arg1
);
13203 op0
= force_reg (mode
, op0
);
13205 if (!address_operand (op1
, VOIDmode
))
13207 op1
= convert_memory_address (Pmode
, op1
);
13208 op1
= copy_addr_to_reg (op1
);
13210 op1
= gen_rtx_MEM (mode
, op1
);
13212 icode
= ((fcode
== IX86_BUILTIN_WRSSD
13213 || fcode
== IX86_BUILTIN_WRSSQ
)
13214 ? code_for_wrss (mode
)
13215 : code_for_wruss (mode
));
13216 emit_insn (GEN_FCN (icode
) (op0
, op1
));
13220 case IX86_BUILTIN_VZEROUPPER
:
13221 cfun
->machine
->has_explicit_vzeroupper
= true;
13228 if (fcode
>= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
13229 && fcode
<= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST
)
13231 i
= fcode
- IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
;
13232 return ix86_expand_special_args_builtin (bdesc_special_args
+ i
, exp
,
13236 if (fcode
>= IX86_BUILTIN__BDESC_ARGS_FIRST
13237 && fcode
<= IX86_BUILTIN__BDESC_ARGS_LAST
)
13239 i
= fcode
- IX86_BUILTIN__BDESC_ARGS_FIRST
;
13240 rtx (*fcn
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
13241 rtx (*fcn_mask
) (rtx
, rtx
, rtx
, rtx
, rtx
);
13242 rtx (*fcn_maskz
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
13244 machine_mode mode
, wide_mode
, nar_mode
;
13246 nar_mode
= V4SFmode
;
13248 wide_mode
= V64SFmode
;
13249 fcn_mask
= gen_avx5124fmaddps_4fmaddps_mask
;
13250 fcn_maskz
= gen_avx5124fmaddps_4fmaddps_maskz
;
13254 case IX86_BUILTIN_4FMAPS
:
13255 fcn
= gen_avx5124fmaddps_4fmaddps
;
13259 case IX86_BUILTIN_4DPWSSD
:
13260 nar_mode
= V4SImode
;
13262 wide_mode
= V64SImode
;
13263 fcn
= gen_avx5124vnniw_vp4dpwssd
;
13267 case IX86_BUILTIN_4DPWSSDS
:
13268 nar_mode
= V4SImode
;
13270 wide_mode
= V64SImode
;
13271 fcn
= gen_avx5124vnniw_vp4dpwssds
;
13275 case IX86_BUILTIN_4FNMAPS
:
13276 fcn
= gen_avx5124fmaddps_4fnmaddps
;
13280 case IX86_BUILTIN_4FNMAPS_MASK
:
13281 fcn_mask
= gen_avx5124fmaddps_4fnmaddps_mask
;
13282 fcn_maskz
= gen_avx5124fmaddps_4fnmaddps_maskz
;
13285 case IX86_BUILTIN_4DPWSSD_MASK
:
13286 nar_mode
= V4SImode
;
13288 wide_mode
= V64SImode
;
13289 fcn_mask
= gen_avx5124vnniw_vp4dpwssd_mask
;
13290 fcn_maskz
= gen_avx5124vnniw_vp4dpwssd_maskz
;
13293 case IX86_BUILTIN_4DPWSSDS_MASK
:
13294 nar_mode
= V4SImode
;
13296 wide_mode
= V64SImode
;
13297 fcn_mask
= gen_avx5124vnniw_vp4dpwssds_mask
;
13298 fcn_maskz
= gen_avx5124vnniw_vp4dpwssds_maskz
;
13301 case IX86_BUILTIN_4FMAPS_MASK
:
13311 wide_reg
= gen_reg_rtx (wide_mode
);
13312 for (i
= 0; i
< 4; i
++)
13314 args
[i
] = CALL_EXPR_ARG (exp
, i
);
13315 ops
[i
] = expand_normal (args
[i
]);
13317 emit_move_insn (gen_rtx_SUBREG (mode
, wide_reg
, i
* 64),
13321 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
13322 accum
= force_reg (mode
, accum
);
13324 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
13325 addr
= force_reg (Pmode
, addr
);
13327 mem
= gen_rtx_MEM (nar_mode
, addr
);
13329 target
= gen_reg_rtx (mode
);
13331 emit_move_insn (target
, accum
);
13334 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
13338 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
13340 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
13342 if (CONST_INT_P (mask
))
13343 mask
= fixup_modeless_constant (mask
, HImode
);
13345 mask
= force_reg (HImode
, mask
);
13347 if (GET_MODE (mask
) != HImode
)
13348 mask
= gen_rtx_SUBREG (HImode
, mask
, 0);
13350 /* If merge is 0 then we're about to emit z-masked variant. */
13351 if (const0_operand (merge
, mode
))
13352 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
13353 /* If merge is the same as accum then emit merge-masked variant. */
13354 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
13356 merge
= force_reg (mode
, merge
);
13357 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
13359 /* Merge with something unknown might happen if we z-mask w/ -O0. */
13362 target
= gen_reg_rtx (mode
);
13363 emit_move_insn (target
, merge
);
13364 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
13370 case IX86_BUILTIN_4FNMASS
:
13371 fcn
= gen_avx5124fmaddps_4fnmaddss
;
13375 case IX86_BUILTIN_4FMASS
:
13376 fcn
= gen_avx5124fmaddps_4fmaddss
;
13380 case IX86_BUILTIN_4FNMASS_MASK
:
13381 fcn_mask
= gen_avx5124fmaddps_4fnmaddss_mask
;
13382 fcn_maskz
= gen_avx5124fmaddps_4fnmaddss_maskz
;
13385 case IX86_BUILTIN_4FMASS_MASK
:
13394 fcn_mask
= gen_avx5124fmaddps_4fmaddss_mask
;
13395 fcn_maskz
= gen_avx5124fmaddps_4fmaddss_maskz
;
13399 wide_reg
= gen_reg_rtx (V64SFmode
);
13400 for (i
= 0; i
< 4; i
++)
13403 args
[i
] = CALL_EXPR_ARG (exp
, i
);
13404 ops
[i
] = expand_normal (args
[i
]);
13406 tmp
= gen_reg_rtx (SFmode
);
13407 emit_move_insn (tmp
, gen_rtx_SUBREG (SFmode
, ops
[i
], 0));
13409 emit_move_insn (gen_rtx_SUBREG (V16SFmode
, wide_reg
, i
* 64),
13410 gen_rtx_SUBREG (V16SFmode
, tmp
, 0));
13413 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
13414 accum
= force_reg (V4SFmode
, accum
);
13416 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
13417 addr
= force_reg (Pmode
, addr
);
13419 mem
= gen_rtx_MEM (V4SFmode
, addr
);
13421 target
= gen_reg_rtx (V4SFmode
);
13423 emit_move_insn (target
, accum
);
13426 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
13430 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
13432 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
13434 if (CONST_INT_P (mask
))
13435 mask
= fixup_modeless_constant (mask
, QImode
);
13437 mask
= force_reg (QImode
, mask
);
13439 if (GET_MODE (mask
) != QImode
)
13440 mask
= gen_rtx_SUBREG (QImode
, mask
, 0);
13442 /* If merge is 0 then we're about to emit z-masked variant. */
13443 if (const0_operand (merge
, mode
))
13444 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
13445 /* If merge is the same as accum then emit merge-masked
13447 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
13449 merge
= force_reg (mode
, merge
);
13450 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
13452 /* Merge with something unknown might happen if we z-mask
13456 target
= gen_reg_rtx (mode
);
13457 emit_move_insn (target
, merge
);
13458 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
13463 case IX86_BUILTIN_RDPID
:
13464 return ix86_expand_special_args_builtin (bdesc_args
+ i
, exp
,
13466 case IX86_BUILTIN_FABSQ
:
13467 case IX86_BUILTIN_COPYSIGNQ
:
13469 /* Emit a normal call if SSE isn't available. */
13470 return expand_call (exp
, target
, ignore
);
13473 return ix86_expand_args_builtin (bdesc_args
+ i
, exp
, target
);
13477 if (fcode
>= IX86_BUILTIN__BDESC_COMI_FIRST
13478 && fcode
<= IX86_BUILTIN__BDESC_COMI_LAST
)
13480 i
= fcode
- IX86_BUILTIN__BDESC_COMI_FIRST
;
13481 return ix86_expand_sse_comi (bdesc_comi
+ i
, exp
, target
);
13484 if (fcode
>= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13485 && fcode
<= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST
)
13487 i
= fcode
- IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
;
13488 return ix86_expand_round_builtin (bdesc_round_args
+ i
, exp
, target
);
13491 if (fcode
>= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13492 && fcode
<= IX86_BUILTIN__BDESC_PCMPESTR_LAST
)
13494 i
= fcode
- IX86_BUILTIN__BDESC_PCMPESTR_FIRST
;
13495 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr
+ i
, exp
, target
);
13498 if (fcode
>= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13499 && fcode
<= IX86_BUILTIN__BDESC_PCMPISTR_LAST
)
13501 i
= fcode
- IX86_BUILTIN__BDESC_PCMPISTR_FIRST
;
13502 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr
+ i
, exp
, target
);
13505 if (fcode
>= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13506 && fcode
<= IX86_BUILTIN__BDESC_MULTI_ARG_LAST
)
13508 i
= fcode
- IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
;
13509 const struct builtin_description
*d
= bdesc_multi_arg
+ i
;
13510 return ix86_expand_multi_arg_builtin (d
->icode
, exp
, target
,
13511 (enum ix86_builtin_func_type
)
13512 d
->flag
, d
->comparison
);
13515 if (fcode
>= IX86_BUILTIN__BDESC_CET_FIRST
13516 && fcode
<= IX86_BUILTIN__BDESC_CET_LAST
)
13518 i
= fcode
- IX86_BUILTIN__BDESC_CET_FIRST
;
13519 return ix86_expand_special_args_builtin (bdesc_cet
+ i
, exp
,
13523 gcc_unreachable ();
13526 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13527 fill target with val via vec_duplicate. */
13530 ix86_vector_duplicate_value (machine_mode mode
, rtx target
, rtx val
)
13536 /* First attempt to recognize VAL as-is. */
13537 dup
= gen_vec_duplicate (mode
, val
);
13538 insn
= emit_insn (gen_rtx_SET (target
, dup
));
13539 if (recog_memoized (insn
) < 0)
13542 machine_mode innermode
= GET_MODE_INNER (mode
);
13545 /* If that fails, force VAL into a register. */
13548 reg
= force_reg (innermode
, val
);
13549 if (GET_MODE (reg
) != innermode
)
13550 reg
= gen_lowpart (innermode
, reg
);
13551 SET_SRC (PATTERN (insn
)) = gen_vec_duplicate (mode
, reg
);
13552 seq
= get_insns ();
13555 emit_insn_before (seq
, insn
);
13557 ok
= recog_memoized (insn
) >= 0;
13563 /* Get a vector mode of the same size as the original but with elements
13564 twice as wide. This is only guaranteed to apply to integral vectors. */
13566 static machine_mode
13567 get_mode_wider_vector (machine_mode o
)
13569 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13570 machine_mode n
= GET_MODE_WIDER_MODE (o
).require ();
13571 gcc_assert (GET_MODE_NUNITS (o
) == GET_MODE_NUNITS (n
) * 2);
13572 gcc_assert (GET_MODE_SIZE (o
) == GET_MODE_SIZE (n
));
13576 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d
*d
);
13577 static bool expand_vec_perm_1 (struct expand_vec_perm_d
*d
);
13579 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13580 with all elements equal to VAR. Return true if successful. */
13583 ix86_expand_vector_init_duplicate (bool mmx_ok
, machine_mode mode
,
13584 rtx target
, rtx val
)
13608 return ix86_vector_duplicate_value (mode
, target
, val
);
13613 if (TARGET_SSE
|| TARGET_3DNOW_A
)
13617 val
= gen_lowpart (SImode
, val
);
13618 x
= gen_rtx_TRUNCATE (HImode
, val
);
13619 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
13620 emit_insn (gen_rtx_SET (target
, x
));
13632 return ix86_vector_duplicate_value (mode
, target
, val
);
13636 struct expand_vec_perm_d dperm
;
13640 memset (&dperm
, 0, sizeof (dperm
));
13641 dperm
.target
= target
;
13642 dperm
.vmode
= mode
;
13643 dperm
.nelt
= GET_MODE_NUNITS (mode
);
13644 dperm
.op0
= dperm
.op1
= gen_reg_rtx (mode
);
13645 dperm
.one_operand_p
= true;
13647 /* Extend to SImode using a paradoxical SUBREG. */
13648 tmp1
= gen_reg_rtx (SImode
);
13649 emit_move_insn (tmp1
, gen_lowpart (SImode
, val
));
13651 /* Insert the SImode value as low element of a V4SImode vector. */
13652 tmp2
= gen_reg_rtx (V4SImode
);
13653 emit_insn (gen_vec_setv4si_0 (tmp2
, CONST0_RTX (V4SImode
), tmp1
));
13654 emit_move_insn (dperm
.op0
, gen_lowpart (mode
, tmp2
));
13656 ok
= (expand_vec_perm_1 (&dperm
)
13657 || expand_vec_perm_broadcast_1 (&dperm
));
13665 return ix86_vector_duplicate_value (mode
, target
, val
);
13672 /* Replicate the value once into the next wider mode and recurse. */
13674 machine_mode smode
, wsmode
, wvmode
;
13677 smode
= GET_MODE_INNER (mode
);
13678 wvmode
= get_mode_wider_vector (mode
);
13679 wsmode
= GET_MODE_INNER (wvmode
);
13681 val
= convert_modes (wsmode
, smode
, val
, true);
13682 x
= expand_simple_binop (wsmode
, ASHIFT
, val
,
13683 GEN_INT (GET_MODE_BITSIZE (smode
)),
13684 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13685 val
= expand_simple_binop (wsmode
, IOR
, val
, x
, x
, 1, OPTAB_LIB_WIDEN
);
13687 x
= gen_reg_rtx (wvmode
);
13688 ok
= ix86_expand_vector_init_duplicate (mmx_ok
, wvmode
, x
, val
);
13690 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), x
));
13697 return ix86_vector_duplicate_value (mode
, target
, val
);
13700 machine_mode hvmode
= (mode
== V16HImode
? V8HImode
: V16QImode
);
13701 rtx x
= gen_reg_rtx (hvmode
);
13703 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13706 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13707 emit_insn (gen_rtx_SET (target
, x
));
13713 if (TARGET_AVX512BW
)
13714 return ix86_vector_duplicate_value (mode
, target
, val
);
13717 machine_mode hvmode
= (mode
== V32HImode
? V16HImode
: V32QImode
);
13718 rtx x
= gen_reg_rtx (hvmode
);
13720 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13723 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13724 emit_insn (gen_rtx_SET (target
, x
));
13733 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13734 whose ONE_VAR element is VAR, and other elements are zero. Return true
13738 ix86_expand_vector_init_one_nonzero (bool mmx_ok
, machine_mode mode
,
13739 rtx target
, rtx var
, int one_var
)
13741 machine_mode vsimode
;
13744 bool use_vector_set
= false;
13745 rtx (*gen_vec_set_0
) (rtx
, rtx
, rtx
) = NULL
;
13750 /* For SSE4.1, we normally use vector set. But if the second
13751 element is zero and inter-unit moves are OK, we use movq
13753 use_vector_set
= (TARGET_64BIT
&& TARGET_SSE4_1
13754 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13760 use_vector_set
= TARGET_SSE4_1
;
13763 use_vector_set
= TARGET_SSE2
;
13766 use_vector_set
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
13769 use_vector_set
= TARGET_SSE
|| TARGET_3DNOW_A
;
13773 use_vector_set
= TARGET_AVX
;
13776 use_vector_set
= TARGET_AVX
;
13777 gen_vec_set_0
= gen_vec_setv8si_0
;
13780 use_vector_set
= TARGET_AVX
;
13781 gen_vec_set_0
= gen_vec_setv8sf_0
;
13784 use_vector_set
= TARGET_AVX
;
13785 gen_vec_set_0
= gen_vec_setv4df_0
;
13788 /* Use ix86_expand_vector_set in 64bit mode only. */
13789 use_vector_set
= TARGET_AVX
&& TARGET_64BIT
;
13790 gen_vec_set_0
= gen_vec_setv4di_0
;
13793 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13794 gen_vec_set_0
= gen_vec_setv16si_0
;
13797 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13798 gen_vec_set_0
= gen_vec_setv16sf_0
;
13801 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13802 gen_vec_set_0
= gen_vec_setv8df_0
;
13805 /* Use ix86_expand_vector_set in 64bit mode only. */
13806 use_vector_set
= TARGET_AVX512F
&& TARGET_64BIT
&& one_var
== 0;
13807 gen_vec_set_0
= gen_vec_setv8di_0
;
13813 if (use_vector_set
)
13815 if (gen_vec_set_0
&& one_var
== 0)
13817 var
= force_reg (GET_MODE_INNER (mode
), var
);
13818 emit_insn (gen_vec_set_0 (target
, CONST0_RTX (mode
), var
));
13821 emit_insn (gen_rtx_SET (target
, CONST0_RTX (mode
)));
13822 var
= force_reg (GET_MODE_INNER (mode
), var
);
13823 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
13839 var
= force_reg (GET_MODE_INNER (mode
), var
);
13840 x
= gen_rtx_VEC_CONCAT (mode
, var
, CONST0_RTX (GET_MODE_INNER (mode
)));
13841 emit_insn (gen_rtx_SET (target
, x
));
13846 if (!REG_P (target
) || REGNO (target
) < FIRST_PSEUDO_REGISTER
)
13847 new_target
= gen_reg_rtx (mode
);
13849 new_target
= target
;
13850 var
= force_reg (GET_MODE_INNER (mode
), var
);
13851 x
= gen_rtx_VEC_DUPLICATE (mode
, var
);
13852 x
= gen_rtx_VEC_MERGE (mode
, x
, CONST0_RTX (mode
), const1_rtx
);
13853 emit_insn (gen_rtx_SET (new_target
, x
));
13856 /* We need to shuffle the value to the correct position, so
13857 create a new pseudo to store the intermediate result. */
13859 /* With SSE2, we can use the integer shuffle insns. */
13860 if (mode
!= V4SFmode
&& TARGET_SSE2
)
13862 emit_insn (gen_sse2_pshufd_1 (new_target
, new_target
,
13864 GEN_INT (one_var
== 1 ? 0 : 1),
13865 GEN_INT (one_var
== 2 ? 0 : 1),
13866 GEN_INT (one_var
== 3 ? 0 : 1)));
13867 if (target
!= new_target
)
13868 emit_move_insn (target
, new_target
);
13872 /* Otherwise convert the intermediate result to V4SFmode and
13873 use the SSE1 shuffle instructions. */
13874 if (mode
!= V4SFmode
)
13876 tmp
= gen_reg_rtx (V4SFmode
);
13877 emit_move_insn (tmp
, gen_lowpart (V4SFmode
, new_target
));
13882 emit_insn (gen_sse_shufps_v4sf (tmp
, tmp
, tmp
,
13884 GEN_INT (one_var
== 1 ? 0 : 1),
13885 GEN_INT (one_var
== 2 ? 0+4 : 1+4),
13886 GEN_INT (one_var
== 3 ? 0+4 : 1+4)));
13888 if (mode
!= V4SFmode
)
13889 emit_move_insn (target
, gen_lowpart (V4SImode
, tmp
));
13890 else if (tmp
!= target
)
13891 emit_move_insn (target
, tmp
);
13893 else if (target
!= new_target
)
13894 emit_move_insn (target
, new_target
);
13899 vsimode
= V4SImode
;
13905 vsimode
= V2SImode
;
13911 /* Zero extend the variable element to SImode and recurse. */
13912 var
= convert_modes (SImode
, GET_MODE_INNER (mode
), var
, true);
13914 x
= gen_reg_rtx (vsimode
);
13915 if (!ix86_expand_vector_init_one_nonzero (mmx_ok
, vsimode
, x
,
13917 gcc_unreachable ();
13919 emit_move_insn (target
, gen_lowpart (mode
, x
));
13927 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13928 consisting of the values in VALS. It is known that all elements
13929 except ONE_VAR are constants. Return true if successful. */
13932 ix86_expand_vector_init_one_var (bool mmx_ok
, machine_mode mode
,
13933 rtx target
, rtx vals
, int one_var
)
13935 rtx var
= XVECEXP (vals
, 0, one_var
);
13936 machine_mode wmode
;
13939 const_vec
= copy_rtx (vals
);
13940 XVECEXP (const_vec
, 0, one_var
) = CONST0_RTX (GET_MODE_INNER (mode
));
13941 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (const_vec
, 0));
13949 /* For the two element vectors, it's just as easy to use
13950 the general case. */
13954 /* Use ix86_expand_vector_set in 64bit mode only. */
13975 if (TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
)
13980 /* There's no way to set one QImode entry easily. Combine
13981 the variable value with its adjacent constant value, and
13982 promote to an HImode set. */
13983 x
= XVECEXP (vals
, 0, one_var
^ 1);
13986 var
= convert_modes (HImode
, QImode
, var
, true);
13987 var
= expand_simple_binop (HImode
, ASHIFT
, var
, GEN_INT (8),
13988 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13989 x
= GEN_INT (INTVAL (x
) & 0xff);
13993 var
= convert_modes (HImode
, QImode
, var
, true);
13994 x
= gen_int_mode (UINTVAL (x
) << 8, HImode
);
13996 if (x
!= const0_rtx
)
13997 var
= expand_simple_binop (HImode
, IOR
, var
, x
, var
,
13998 1, OPTAB_LIB_WIDEN
);
14000 x
= gen_reg_rtx (wmode
);
14001 emit_move_insn (x
, gen_lowpart (wmode
, const_vec
));
14002 ix86_expand_vector_set (mmx_ok
, x
, var
, one_var
>> 1);
14004 emit_move_insn (target
, gen_lowpart (mode
, x
));
14011 emit_move_insn (target
, const_vec
);
14012 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
14016 /* A subroutine of ix86_expand_vector_init_general. Use vector
14017 concatenate to handle the most general case: all values variable,
14018 and none identical. */
14021 ix86_expand_vector_init_concat (machine_mode mode
,
14022 rtx target
, rtx
*ops
, int n
)
14024 machine_mode half_mode
= VOIDmode
;
14035 half_mode
= V8SImode
;
14038 half_mode
= V8SFmode
;
14041 half_mode
= V4DImode
;
14044 half_mode
= V4DFmode
;
14047 half_mode
= V4SImode
;
14050 half_mode
= V4SFmode
;
14053 half_mode
= V2DImode
;
14056 half_mode
= V2DFmode
;
14059 half_mode
= V2SImode
;
14062 half_mode
= V2SFmode
;
14065 half_mode
= DImode
;
14068 half_mode
= SImode
;
14071 half_mode
= DFmode
;
14074 half_mode
= SFmode
;
14077 gcc_unreachable ();
14080 if (!register_operand (ops
[1], half_mode
))
14081 ops
[1] = force_reg (half_mode
, ops
[1]);
14082 if (!register_operand (ops
[0], half_mode
))
14083 ops
[0] = force_reg (half_mode
, ops
[0]);
14084 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, ops
[0],
14092 half_mode
= V2DImode
;
14095 half_mode
= V2DFmode
;
14098 half_mode
= V2SImode
;
14101 half_mode
= V2SFmode
;
14104 gcc_unreachable ();
14112 half_mode
= V4DImode
;
14115 half_mode
= V4DFmode
;
14118 half_mode
= V4SImode
;
14121 half_mode
= V4SFmode
;
14124 gcc_unreachable ();
14132 half_mode
= V8SImode
;
14135 half_mode
= V8SFmode
;
14138 gcc_unreachable ();
14143 /* FIXME: We process inputs backward to help RA. PR 36222. */
14145 for (j
= 1; j
!= -1; j
--)
14147 half
[j
] = gen_reg_rtx (half_mode
);
14151 v
= gen_rtvec (2, ops
[i
-1], ops
[i
]);
14155 v
= gen_rtvec (4, ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
14159 v
= gen_rtvec (8, ops
[i
-7], ops
[i
-6], ops
[i
-5], ops
[i
-4],
14160 ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
14164 gcc_unreachable ();
14166 ix86_expand_vector_init (false, half
[j
],
14167 gen_rtx_PARALLEL (half_mode
, v
));
14170 ix86_expand_vector_init_concat (mode
, target
, half
, 2);
14174 gcc_unreachable ();
14178 /* A subroutine of ix86_expand_vector_init_general. Use vector
14179 interleave to handle the most general case: all values variable,
14180 and none identical. */
14183 ix86_expand_vector_init_interleave (machine_mode mode
,
14184 rtx target
, rtx
*ops
, int n
)
14186 machine_mode first_imode
, second_imode
, third_imode
, inner_mode
;
14189 rtx (*gen_load_even
) (rtx
, rtx
, rtx
);
14190 rtx (*gen_interleave_first_low
) (rtx
, rtx
, rtx
);
14191 rtx (*gen_interleave_second_low
) (rtx
, rtx
, rtx
);
14196 gen_load_even
= gen_vec_setv8hi
;
14197 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
14198 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
14199 inner_mode
= HImode
;
14200 first_imode
= V4SImode
;
14201 second_imode
= V2DImode
;
14202 third_imode
= VOIDmode
;
14205 gen_load_even
= gen_vec_setv16qi
;
14206 gen_interleave_first_low
= gen_vec_interleave_lowv8hi
;
14207 gen_interleave_second_low
= gen_vec_interleave_lowv4si
;
14208 inner_mode
= QImode
;
14209 first_imode
= V8HImode
;
14210 second_imode
= V4SImode
;
14211 third_imode
= V2DImode
;
14214 gcc_unreachable ();
14217 for (i
= 0; i
< n
; i
++)
14219 /* Extend the odd elment to SImode using a paradoxical SUBREG. */
14220 op0
= gen_reg_rtx (SImode
);
14221 emit_move_insn (op0
, gen_lowpart (SImode
, ops
[i
+ i
]));
14223 /* Insert the SImode value as low element of V4SImode vector. */
14224 op1
= gen_reg_rtx (V4SImode
);
14225 op0
= gen_rtx_VEC_MERGE (V4SImode
,
14226 gen_rtx_VEC_DUPLICATE (V4SImode
,
14228 CONST0_RTX (V4SImode
),
14230 emit_insn (gen_rtx_SET (op1
, op0
));
14232 /* Cast the V4SImode vector back to a vector in orignal mode. */
14233 op0
= gen_reg_rtx (mode
);
14234 emit_move_insn (op0
, gen_lowpart (mode
, op1
));
14236 /* Load even elements into the second position. */
14237 emit_insn (gen_load_even (op0
,
14238 force_reg (inner_mode
,
14242 /* Cast vector to FIRST_IMODE vector. */
14243 ops
[i
] = gen_reg_rtx (first_imode
);
14244 emit_move_insn (ops
[i
], gen_lowpart (first_imode
, op0
));
14247 /* Interleave low FIRST_IMODE vectors. */
14248 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
14250 op0
= gen_reg_rtx (first_imode
);
14251 emit_insn (gen_interleave_first_low (op0
, ops
[i
], ops
[i
+ 1]));
14253 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
14254 ops
[j
] = gen_reg_rtx (second_imode
);
14255 emit_move_insn (ops
[j
], gen_lowpart (second_imode
, op0
));
14258 /* Interleave low SECOND_IMODE vectors. */
14259 switch (second_imode
)
14262 for (i
= j
= 0; i
< n
/ 2; i
+= 2, j
++)
14264 op0
= gen_reg_rtx (second_imode
);
14265 emit_insn (gen_interleave_second_low (op0
, ops
[i
],
14268 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
14270 ops
[j
] = gen_reg_rtx (third_imode
);
14271 emit_move_insn (ops
[j
], gen_lowpart (third_imode
, op0
));
14273 second_imode
= V2DImode
;
14274 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
14278 op0
= gen_reg_rtx (second_imode
);
14279 emit_insn (gen_interleave_second_low (op0
, ops
[0],
14282 /* Cast the SECOND_IMODE vector back to a vector on original
14284 emit_insn (gen_rtx_SET (target
, gen_lowpart (mode
, op0
)));
14288 gcc_unreachable ();
14292 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
14293 all values variable, and none identical. */
14296 ix86_expand_vector_init_general (bool mmx_ok
, machine_mode mode
,
14297 rtx target
, rtx vals
)
14299 rtx ops
[64], op0
, op1
, op2
, op3
, op4
, op5
;
14300 machine_mode half_mode
= VOIDmode
;
14301 machine_mode quarter_mode
= VOIDmode
;
14308 if (!mmx_ok
&& !TARGET_SSE
)
14324 n
= GET_MODE_NUNITS (mode
);
14325 for (i
= 0; i
< n
; i
++)
14326 ops
[i
] = XVECEXP (vals
, 0, i
);
14327 ix86_expand_vector_init_concat (mode
, target
, ops
, n
);
14331 for (i
= 0; i
< 2; i
++)
14332 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
14333 op0
= gen_reg_rtx (V4DImode
);
14334 ix86_expand_vector_init_concat (V4DImode
, op0
, ops
, 2);
14335 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
14339 for (i
= 0; i
< 4; i
++)
14340 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
14341 ops
[4] = gen_reg_rtx (V4DImode
);
14342 ix86_expand_vector_init_concat (V4DImode
, ops
[4], ops
, 2);
14343 ops
[5] = gen_reg_rtx (V4DImode
);
14344 ix86_expand_vector_init_concat (V4DImode
, ops
[5], ops
+ 2, 2);
14345 op0
= gen_reg_rtx (V8DImode
);
14346 ix86_expand_vector_init_concat (V8DImode
, op0
, ops
+ 4, 2);
14347 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
14351 half_mode
= V16QImode
;
14355 half_mode
= V8HImode
;
14359 n
= GET_MODE_NUNITS (mode
);
14360 for (i
= 0; i
< n
; i
++)
14361 ops
[i
] = XVECEXP (vals
, 0, i
);
14362 op0
= gen_reg_rtx (half_mode
);
14363 op1
= gen_reg_rtx (half_mode
);
14364 ix86_expand_vector_init_interleave (half_mode
, op0
, ops
,
14366 ix86_expand_vector_init_interleave (half_mode
, op1
,
14367 &ops
[n
>> 1], n
>> 2);
14368 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op0
, op1
)));
14372 quarter_mode
= V16QImode
;
14373 half_mode
= V32QImode
;
14377 quarter_mode
= V8HImode
;
14378 half_mode
= V16HImode
;
14382 n
= GET_MODE_NUNITS (mode
);
14383 for (i
= 0; i
< n
; i
++)
14384 ops
[i
] = XVECEXP (vals
, 0, i
);
14385 op0
= gen_reg_rtx (quarter_mode
);
14386 op1
= gen_reg_rtx (quarter_mode
);
14387 op2
= gen_reg_rtx (quarter_mode
);
14388 op3
= gen_reg_rtx (quarter_mode
);
14389 op4
= gen_reg_rtx (half_mode
);
14390 op5
= gen_reg_rtx (half_mode
);
14391 ix86_expand_vector_init_interleave (quarter_mode
, op0
, ops
,
14393 ix86_expand_vector_init_interleave (quarter_mode
, op1
,
14394 &ops
[n
>> 2], n
>> 3);
14395 ix86_expand_vector_init_interleave (quarter_mode
, op2
,
14396 &ops
[n
>> 1], n
>> 3);
14397 ix86_expand_vector_init_interleave (quarter_mode
, op3
,
14398 &ops
[(n
>> 1) | (n
>> 2)], n
>> 3);
14399 emit_insn (gen_rtx_SET (op4
, gen_rtx_VEC_CONCAT (half_mode
, op0
, op1
)));
14400 emit_insn (gen_rtx_SET (op5
, gen_rtx_VEC_CONCAT (half_mode
, op2
, op3
)));
14401 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op4
, op5
)));
14405 if (!TARGET_SSE4_1
)
14413 /* Don't use ix86_expand_vector_init_interleave if we can't
14414 move from GPR to SSE register directly. */
14415 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
14418 n
= GET_MODE_NUNITS (mode
);
14419 for (i
= 0; i
< n
; i
++)
14420 ops
[i
] = XVECEXP (vals
, 0, i
);
14421 ix86_expand_vector_init_interleave (mode
, target
, ops
, n
>> 1);
14429 gcc_unreachable ();
14433 int i
, j
, n_elts
, n_words
, n_elt_per_word
;
14434 machine_mode inner_mode
;
14435 rtx words
[4], shift
;
14437 inner_mode
= GET_MODE_INNER (mode
);
14438 n_elts
= GET_MODE_NUNITS (mode
);
14439 n_words
= GET_MODE_SIZE (mode
) / UNITS_PER_WORD
;
14440 n_elt_per_word
= n_elts
/ n_words
;
14441 shift
= GEN_INT (GET_MODE_BITSIZE (inner_mode
));
14443 for (i
= 0; i
< n_words
; ++i
)
14445 rtx word
= NULL_RTX
;
14447 for (j
= 0; j
< n_elt_per_word
; ++j
)
14449 rtx elt
= XVECEXP (vals
, 0, (i
+1)*n_elt_per_word
- j
- 1);
14450 elt
= convert_modes (word_mode
, inner_mode
, elt
, true);
14456 word
= expand_simple_binop (word_mode
, ASHIFT
, word
, shift
,
14457 word
, 1, OPTAB_LIB_WIDEN
);
14458 word
= expand_simple_binop (word_mode
, IOR
, word
, elt
,
14459 word
, 1, OPTAB_LIB_WIDEN
);
14467 emit_move_insn (target
, gen_lowpart (mode
, words
[0]));
14468 else if (n_words
== 2)
14470 rtx tmp
= gen_reg_rtx (mode
);
14471 emit_clobber (tmp
);
14472 emit_move_insn (gen_lowpart (word_mode
, tmp
), words
[0]);
14473 emit_move_insn (gen_highpart (word_mode
, tmp
), words
[1]);
14474 emit_move_insn (target
, tmp
);
14476 else if (n_words
== 4)
14478 rtx tmp
= gen_reg_rtx (V4SImode
);
14479 gcc_assert (word_mode
== SImode
);
14480 vals
= gen_rtx_PARALLEL (V4SImode
, gen_rtvec_v (4, words
));
14481 ix86_expand_vector_init_general (false, V4SImode
, tmp
, vals
);
14482 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
14485 gcc_unreachable ();
14489 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14490 instructions unless MMX_OK is true. */
14493 ix86_expand_vector_init (bool mmx_ok
, rtx target
, rtx vals
)
14495 machine_mode mode
= GET_MODE (target
);
14496 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14497 int n_elts
= GET_MODE_NUNITS (mode
);
14498 int n_var
= 0, one_var
= -1;
14499 bool all_same
= true, all_const_zero
= true;
14503 /* Handle first initialization from vector elts. */
14504 if (n_elts
!= XVECLEN (vals
, 0))
14506 rtx subtarget
= target
;
14507 x
= XVECEXP (vals
, 0, 0);
14508 gcc_assert (GET_MODE_INNER (GET_MODE (x
)) == inner_mode
);
14509 if (GET_MODE_NUNITS (GET_MODE (x
)) * 2 == n_elts
)
14511 rtx ops
[2] = { XVECEXP (vals
, 0, 0), XVECEXP (vals
, 0, 1) };
14512 if (inner_mode
== QImode
|| inner_mode
== HImode
)
14514 unsigned int n_bits
= n_elts
* GET_MODE_SIZE (inner_mode
);
14515 mode
= mode_for_vector (SImode
, n_bits
/ 4).require ();
14516 inner_mode
= mode_for_vector (SImode
, n_bits
/ 8).require ();
14517 ops
[0] = gen_lowpart (inner_mode
, ops
[0]);
14518 ops
[1] = gen_lowpart (inner_mode
, ops
[1]);
14519 subtarget
= gen_reg_rtx (mode
);
14521 ix86_expand_vector_init_concat (mode
, subtarget
, ops
, 2);
14522 if (subtarget
!= target
)
14523 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), subtarget
));
14526 gcc_unreachable ();
14529 for (i
= 0; i
< n_elts
; ++i
)
14531 x
= XVECEXP (vals
, 0, i
);
14532 if (!(CONST_SCALAR_INT_P (x
)
14533 || CONST_DOUBLE_P (x
)
14534 || CONST_FIXED_P (x
)))
14535 n_var
++, one_var
= i
;
14536 else if (x
!= CONST0_RTX (inner_mode
))
14537 all_const_zero
= false;
14538 if (i
> 0 && !rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
14542 /* Constants are best loaded from the constant pool. */
14545 emit_move_insn (target
, gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0)));
14549 /* If all values are identical, broadcast the value. */
14551 && ix86_expand_vector_init_duplicate (mmx_ok
, mode
, target
,
14552 XVECEXP (vals
, 0, 0)))
14555 /* Values where only one field is non-constant are best loaded from
14556 the pool and overwritten via move later. */
14560 && ix86_expand_vector_init_one_nonzero (mmx_ok
, mode
, target
,
14561 XVECEXP (vals
, 0, one_var
),
14565 if (ix86_expand_vector_init_one_var (mmx_ok
, mode
, target
, vals
, one_var
))
14569 ix86_expand_vector_init_general (mmx_ok
, mode
, target
, vals
);
14573 V setg (V v, int idx, T val)
14575 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
14576 V valv = (V){val, val, val, val, val, val, val, val};
14577 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
14578 v = (v & ~mask) | (valv & mask);
14582 ix86_expand_vector_set_var (rtx target
, rtx val
, rtx idx
)
14585 machine_mode mode
= GET_MODE (target
);
14586 machine_mode cmp_mode
= mode
;
14587 int n_elts
= GET_MODE_NUNITS (mode
);
14588 rtx valv
,idxv
,constv
,idx_tmp
;
14591 /* 512-bits vector byte/word broadcast and comparison only available
14592 under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
14593 when without TARGET_AVX512BW. */
14594 if ((mode
== V32HImode
|| mode
== V64QImode
) && !TARGET_AVX512BW
)
14596 gcc_assert (TARGET_AVX512F
);
14597 rtx vhi
, vlo
, idx_hi
;
14598 machine_mode half_mode
;
14599 rtx (*extract_hi
)(rtx
, rtx
);
14600 rtx (*extract_lo
)(rtx
, rtx
);
14602 if (mode
== V32HImode
)
14604 half_mode
= V16HImode
;
14605 extract_hi
= gen_vec_extract_hi_v32hi
;
14606 extract_lo
= gen_vec_extract_lo_v32hi
;
14610 half_mode
= V32QImode
;
14611 extract_hi
= gen_vec_extract_hi_v64qi
;
14612 extract_lo
= gen_vec_extract_lo_v64qi
;
14615 vhi
= gen_reg_rtx (half_mode
);
14616 vlo
= gen_reg_rtx (half_mode
);
14617 idx_hi
= gen_reg_rtx (GET_MODE (idx
));
14618 emit_insn (extract_hi (vhi
, target
));
14619 emit_insn (extract_lo (vlo
, target
));
14622 vec
[2] = GEN_INT (n_elts
/2);
14623 ix86_expand_binary_operator (MINUS
, GET_MODE (idx
), vec
);
14624 ix86_expand_vector_set_var (vhi
, val
, idx_hi
);
14625 ix86_expand_vector_set_var (vlo
, val
, idx
);
14626 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, vlo
, vhi
)));
14630 if (FLOAT_MODE_P (GET_MODE_INNER (mode
)))
14635 cmp_mode
= V2DImode
;
14638 cmp_mode
= V4DImode
;
14641 cmp_mode
= V8DImode
;
14644 cmp_mode
= V4SImode
;
14647 cmp_mode
= V8SImode
;
14650 cmp_mode
= V16SImode
;
14653 gcc_unreachable ();
14657 for (int i
= 0; i
!= n_elts
; i
++)
14658 vec
[i
] = GEN_INT (i
);
14659 constv
= gen_rtx_CONST_VECTOR (cmp_mode
, gen_rtvec_v (n_elts
, vec
));
14660 valv
= gen_reg_rtx (mode
);
14661 idxv
= gen_reg_rtx (cmp_mode
);
14662 idx_tmp
= convert_to_mode (GET_MODE_INNER (cmp_mode
), idx
, 1);
14664 ok
= ix86_expand_vector_init_duplicate (false, mode
, valv
, val
);
14666 ok
= ix86_expand_vector_init_duplicate (false, cmp_mode
, idxv
, idx_tmp
);
14671 vec
[3] = gen_rtx_EQ (mode
, idxv
, constv
);
14674 ok
= ix86_expand_int_vcond (vec
);
14679 ix86_expand_vector_set (bool mmx_ok
, rtx target
, rtx val
, int elt
)
14681 machine_mode mode
= GET_MODE (target
);
14682 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14683 machine_mode half_mode
;
14684 bool use_vec_merge
= false;
14686 static rtx (*gen_extract
[6][2]) (rtx
, rtx
)
14688 { gen_vec_extract_lo_v32qi
, gen_vec_extract_hi_v32qi
},
14689 { gen_vec_extract_lo_v16hi
, gen_vec_extract_hi_v16hi
},
14690 { gen_vec_extract_lo_v8si
, gen_vec_extract_hi_v8si
},
14691 { gen_vec_extract_lo_v4di
, gen_vec_extract_hi_v4di
},
14692 { gen_vec_extract_lo_v8sf
, gen_vec_extract_hi_v8sf
},
14693 { gen_vec_extract_lo_v4df
, gen_vec_extract_hi_v4df
}
14695 static rtx (*gen_insert
[6][2]) (rtx
, rtx
, rtx
)
14697 { gen_vec_set_lo_v32qi
, gen_vec_set_hi_v32qi
},
14698 { gen_vec_set_lo_v16hi
, gen_vec_set_hi_v16hi
},
14699 { gen_vec_set_lo_v8si
, gen_vec_set_hi_v8si
},
14700 { gen_vec_set_lo_v4di
, gen_vec_set_hi_v4di
},
14701 { gen_vec_set_lo_v8sf
, gen_vec_set_hi_v8sf
},
14702 { gen_vec_set_lo_v4df
, gen_vec_set_hi_v4df
}
14705 machine_mode mmode
= VOIDmode
;
14706 rtx (*gen_blendm
) (rtx
, rtx
, rtx
, rtx
);
14711 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14719 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14720 ix86_expand_vector_extract (true, tmp
, target
, 1 - elt
);
14722 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14724 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14725 emit_insn (gen_rtx_SET (target
, tmp
));
14731 use_vec_merge
= TARGET_SSE4_1
&& TARGET_64BIT
;
14735 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14736 ix86_expand_vector_extract (false, tmp
, target
, 1 - elt
);
14738 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14740 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14741 emit_insn (gen_rtx_SET (target
, tmp
));
14745 /* NB: For ELT == 0, use standard scalar operation patterns which
14746 preserve the rest of the vector for combiner:
14749 (vec_duplicate:V2DF (reg:DF))
14759 /* For the two element vectors, we implement a VEC_CONCAT with
14760 the extraction of the other element. */
14762 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (1 - elt
)));
14763 tmp
= gen_rtx_VEC_SELECT (inner_mode
, target
, tmp
);
14766 op0
= val
, op1
= tmp
;
14768 op0
= tmp
, op1
= val
;
14770 tmp
= gen_rtx_VEC_CONCAT (mode
, op0
, op1
);
14771 emit_insn (gen_rtx_SET (target
, tmp
));
14776 use_vec_merge
= TARGET_SSE4_1
;
14783 use_vec_merge
= true;
14787 /* tmp = target = A B C D */
14788 tmp
= copy_to_reg (target
);
14789 /* target = A A B B */
14790 emit_insn (gen_vec_interleave_lowv4sf (target
, target
, target
));
14791 /* target = X A B B */
14792 ix86_expand_vector_set (false, target
, val
, 0);
14793 /* target = A X C D */
14794 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14795 const1_rtx
, const0_rtx
,
14796 GEN_INT (2+4), GEN_INT (3+4)));
14800 /* tmp = target = A B C D */
14801 tmp
= copy_to_reg (target
);
14802 /* tmp = X B C D */
14803 ix86_expand_vector_set (false, tmp
, val
, 0);
14804 /* target = A B X D */
14805 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14806 const0_rtx
, const1_rtx
,
14807 GEN_INT (0+4), GEN_INT (3+4)));
14811 /* tmp = target = A B C D */
14812 tmp
= copy_to_reg (target
);
14813 /* tmp = X B C D */
14814 ix86_expand_vector_set (false, tmp
, val
, 0);
14815 /* target = A B X D */
14816 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14817 const0_rtx
, const1_rtx
,
14818 GEN_INT (2+4), GEN_INT (0+4)));
14822 gcc_unreachable ();
14827 use_vec_merge
= TARGET_SSE4_1
;
14831 /* Element 0 handled by vec_merge below. */
14834 use_vec_merge
= true;
14840 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14841 store into element 0, then shuffle them back. */
14845 order
[0] = GEN_INT (elt
);
14846 order
[1] = const1_rtx
;
14847 order
[2] = const2_rtx
;
14848 order
[3] = GEN_INT (3);
14849 order
[elt
] = const0_rtx
;
14851 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14852 order
[1], order
[2], order
[3]));
14854 ix86_expand_vector_set (false, target
, val
, 0);
14856 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14857 order
[1], order
[2], order
[3]));
14861 /* For SSE1, we have to reuse the V4SF code. */
14862 rtx t
= gen_reg_rtx (V4SFmode
);
14863 emit_move_insn (t
, gen_lowpart (V4SFmode
, target
));
14864 ix86_expand_vector_set (false, t
, gen_lowpart (SFmode
, val
), elt
);
14865 emit_move_insn (target
, gen_lowpart (mode
, t
));
14870 use_vec_merge
= TARGET_SSE2
;
14873 use_vec_merge
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
14877 use_vec_merge
= TARGET_SSE4_1
;
14881 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14885 half_mode
= V16QImode
;
14891 half_mode
= V8HImode
;
14897 half_mode
= V4SImode
;
14903 half_mode
= V2DImode
;
14909 half_mode
= V4SFmode
;
14915 half_mode
= V2DFmode
;
14921 /* Compute offset. */
14925 gcc_assert (i
<= 1);
14927 /* Extract the half. */
14928 tmp
= gen_reg_rtx (half_mode
);
14929 emit_insn (gen_extract
[j
][i
] (tmp
, target
));
14931 /* Put val in tmp at elt. */
14932 ix86_expand_vector_set (false, tmp
, val
, elt
);
14935 emit_insn (gen_insert
[j
][i
] (target
, target
, tmp
));
14939 if (TARGET_AVX512F
)
14942 gen_blendm
= gen_avx512f_blendmv8df
;
14947 if (TARGET_AVX512F
)
14950 gen_blendm
= gen_avx512f_blendmv8di
;
14955 if (TARGET_AVX512F
)
14958 gen_blendm
= gen_avx512f_blendmv16sf
;
14963 if (TARGET_AVX512F
)
14966 gen_blendm
= gen_avx512f_blendmv16si
;
14971 if (TARGET_AVX512BW
)
14974 gen_blendm
= gen_avx512bw_blendmv32hi
;
14976 else if (TARGET_AVX512F
)
14978 half_mode
= E_V8HImode
;
14985 if (TARGET_AVX512BW
)
14988 gen_blendm
= gen_avx512bw_blendmv64qi
;
14990 else if (TARGET_AVX512F
)
14992 half_mode
= E_V16QImode
;
14999 /* Compute offset. */
15003 gcc_assert (i
<= 3);
15006 /* Extract the quarter. */
15007 tmp
= gen_reg_rtx (V4SImode
);
15008 rtx tmp2
= gen_lowpart (V16SImode
, target
);
15009 rtx mask
= gen_reg_rtx (QImode
);
15011 emit_move_insn (mask
, constm1_rtx
);
15012 emit_insn (gen_avx512f_vextracti32x4_mask (tmp
, tmp2
, GEN_INT (i
),
15015 tmp2
= gen_reg_rtx (half_mode
);
15016 emit_move_insn (tmp2
, gen_lowpart (half_mode
, tmp
));
15019 /* Put val in tmp at elt. */
15020 ix86_expand_vector_set (false, tmp
, val
, elt
);
15023 tmp2
= gen_reg_rtx (V16SImode
);
15024 rtx tmp3
= gen_lowpart (V16SImode
, target
);
15025 mask
= gen_reg_rtx (HImode
);
15026 emit_move_insn (mask
, constm1_rtx
);
15027 tmp
= gen_lowpart (V4SImode
, tmp
);
15028 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2
, tmp3
, tmp
, GEN_INT (i
),
15030 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
15038 if (mmode
!= VOIDmode
)
15040 tmp
= gen_reg_rtx (mode
);
15041 emit_insn (gen_rtx_SET (tmp
, gen_rtx_VEC_DUPLICATE (mode
, val
)));
15042 /* The avx512*_blendm<mode> expanders have different operand order
15043 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
15044 elements where the mask is set and second input operand otherwise,
15045 in {sse,avx}*_*blend* the first input operand is used for elements
15046 where the mask is clear and second input operand otherwise. */
15047 emit_insn (gen_blendm (target
, target
, tmp
,
15049 gen_int_mode (HOST_WIDE_INT_1U
<< elt
,
15052 else if (use_vec_merge
)
15055 tmp
= gen_rtx_VEC_DUPLICATE (mode
, val
);
15056 tmp
= gen_rtx_VEC_MERGE (mode
, tmp
, target
,
15057 GEN_INT (HOST_WIDE_INT_1U
<< elt
));
15058 emit_insn (gen_rtx_SET (target
, tmp
));
15062 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
15064 emit_move_insn (mem
, target
);
15066 tmp
= adjust_address (mem
, inner_mode
, elt
* GET_MODE_SIZE (inner_mode
));
15067 emit_move_insn (tmp
, val
);
15069 emit_move_insn (target
, mem
);
15074 ix86_expand_vector_extract (bool mmx_ok
, rtx target
, rtx vec
, int elt
)
15076 machine_mode mode
= GET_MODE (vec
);
15077 machine_mode inner_mode
= GET_MODE_INNER (mode
);
15078 bool use_vec_extr
= false;
15084 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
15098 use_vec_extr
= true;
15102 use_vec_extr
= TARGET_SSE4_1
;
15114 tmp
= gen_reg_rtx (mode
);
15115 emit_insn (gen_sse_shufps_v4sf (tmp
, vec
, vec
,
15116 GEN_INT (elt
), GEN_INT (elt
),
15117 GEN_INT (elt
+4), GEN_INT (elt
+4)));
15121 tmp
= gen_reg_rtx (mode
);
15122 emit_insn (gen_vec_interleave_highv4sf (tmp
, vec
, vec
));
15126 gcc_unreachable ();
15129 use_vec_extr
= true;
15134 use_vec_extr
= TARGET_SSE4_1
;
15148 tmp
= gen_reg_rtx (mode
);
15149 emit_insn (gen_sse2_pshufd_1 (tmp
, vec
,
15150 GEN_INT (elt
), GEN_INT (elt
),
15151 GEN_INT (elt
), GEN_INT (elt
)));
15155 tmp
= gen_reg_rtx (mode
);
15156 emit_insn (gen_vec_interleave_highv4si (tmp
, vec
, vec
));
15160 gcc_unreachable ();
15163 use_vec_extr
= true;
15168 /* For SSE1, we have to reuse the V4SF code. */
15169 ix86_expand_vector_extract (false, gen_lowpart (SFmode
, target
),
15170 gen_lowpart (V4SFmode
, vec
), elt
);
15176 use_vec_extr
= TARGET_SSE2
;
15179 use_vec_extr
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
15183 use_vec_extr
= TARGET_SSE4_1
;
15187 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC
))
15189 tmp
= gen_reg_rtx (SImode
);
15190 ix86_expand_vector_extract (false, tmp
, gen_lowpart (V4SImode
, vec
),
15192 emit_insn (gen_rtx_SET (target
, gen_lowpart (QImode
, tmp
)));
15200 tmp
= gen_reg_rtx (V4SFmode
);
15202 emit_insn (gen_vec_extract_lo_v8sf (tmp
, vec
));
15204 emit_insn (gen_vec_extract_hi_v8sf (tmp
, vec
));
15205 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15213 tmp
= gen_reg_rtx (V2DFmode
);
15215 emit_insn (gen_vec_extract_lo_v4df (tmp
, vec
));
15217 emit_insn (gen_vec_extract_hi_v4df (tmp
, vec
));
15218 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
15226 tmp
= gen_reg_rtx (V16QImode
);
15228 emit_insn (gen_vec_extract_lo_v32qi (tmp
, vec
));
15230 emit_insn (gen_vec_extract_hi_v32qi (tmp
, vec
));
15231 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
15239 tmp
= gen_reg_rtx (V8HImode
);
15241 emit_insn (gen_vec_extract_lo_v16hi (tmp
, vec
));
15243 emit_insn (gen_vec_extract_hi_v16hi (tmp
, vec
));
15244 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
15252 tmp
= gen_reg_rtx (V4SImode
);
15254 emit_insn (gen_vec_extract_lo_v8si (tmp
, vec
));
15256 emit_insn (gen_vec_extract_hi_v8si (tmp
, vec
));
15257 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15265 tmp
= gen_reg_rtx (V2DImode
);
15267 emit_insn (gen_vec_extract_lo_v4di (tmp
, vec
));
15269 emit_insn (gen_vec_extract_hi_v4di (tmp
, vec
));
15270 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
15276 if (TARGET_AVX512BW
)
15278 tmp
= gen_reg_rtx (V16HImode
);
15280 emit_insn (gen_vec_extract_lo_v32hi (tmp
, vec
));
15282 emit_insn (gen_vec_extract_hi_v32hi (tmp
, vec
));
15283 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
15289 if (TARGET_AVX512BW
)
15291 tmp
= gen_reg_rtx (V32QImode
);
15293 emit_insn (gen_vec_extract_lo_v64qi (tmp
, vec
));
15295 emit_insn (gen_vec_extract_hi_v64qi (tmp
, vec
));
15296 ix86_expand_vector_extract (false, target
, tmp
, elt
& 31);
15302 tmp
= gen_reg_rtx (V8SFmode
);
15304 emit_insn (gen_vec_extract_lo_v16sf (tmp
, vec
));
15306 emit_insn (gen_vec_extract_hi_v16sf (tmp
, vec
));
15307 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
15311 tmp
= gen_reg_rtx (V4DFmode
);
15313 emit_insn (gen_vec_extract_lo_v8df (tmp
, vec
));
15315 emit_insn (gen_vec_extract_hi_v8df (tmp
, vec
));
15316 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15320 tmp
= gen_reg_rtx (V8SImode
);
15322 emit_insn (gen_vec_extract_lo_v16si (tmp
, vec
));
15324 emit_insn (gen_vec_extract_hi_v16si (tmp
, vec
));
15325 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
15329 tmp
= gen_reg_rtx (V4DImode
);
15331 emit_insn (gen_vec_extract_lo_v8di (tmp
, vec
));
15333 emit_insn (gen_vec_extract_hi_v8di (tmp
, vec
));
15334 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15338 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
15339 /* ??? Could extract the appropriate HImode element and shift. */
15348 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (elt
)));
15349 tmp
= gen_rtx_VEC_SELECT (inner_mode
, vec
, tmp
);
15351 /* Let the rtl optimizers know about the zero extension performed. */
15352 if (inner_mode
== QImode
|| inner_mode
== HImode
)
15354 tmp
= gen_rtx_ZERO_EXTEND (SImode
, tmp
);
15355 target
= gen_lowpart (SImode
, target
);
15358 emit_insn (gen_rtx_SET (target
, tmp
));
15362 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
15364 emit_move_insn (mem
, vec
);
15366 tmp
= adjust_address (mem
, inner_mode
, elt
*GET_MODE_SIZE (inner_mode
));
15367 emit_move_insn (target
, tmp
);
15371 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
15372 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
15373 The upper bits of DEST are undefined, though they shouldn't cause
15374 exceptions (some bits from src or all zeros are ok). */
15377 emit_reduc_half (rtx dest
, rtx src
, int i
)
15380 switch (GET_MODE (src
))
15384 tem
= gen_sse_movhlps (dest
, src
, src
);
15386 tem
= gen_sse_shufps_v4sf (dest
, src
, src
, const1_rtx
, const1_rtx
,
15387 GEN_INT (1 + 4), GEN_INT (1 + 4));
15390 tem
= gen_vec_interleave_highv2df (dest
, src
, src
);
15396 d
= gen_reg_rtx (V1TImode
);
15397 tem
= gen_sse2_lshrv1ti3 (d
, gen_lowpart (V1TImode
, src
),
15402 tem
= gen_avx_vperm2f128v8sf3 (dest
, src
, src
, const1_rtx
);
15404 tem
= gen_avx_shufps256 (dest
, src
, src
,
15405 GEN_INT (i
== 128 ? 2 + (3 << 2) : 1));
15409 tem
= gen_avx_vperm2f128v4df3 (dest
, src
, src
, const1_rtx
);
15411 tem
= gen_avx_shufpd256 (dest
, src
, src
, const1_rtx
);
15419 if (GET_MODE (dest
) != V4DImode
)
15420 d
= gen_reg_rtx (V4DImode
);
15421 tem
= gen_avx2_permv2ti (d
, gen_lowpart (V4DImode
, src
),
15422 gen_lowpart (V4DImode
, src
),
15427 d
= gen_reg_rtx (V2TImode
);
15428 tem
= gen_avx2_lshrv2ti3 (d
, gen_lowpart (V2TImode
, src
),
15436 d
= gen_reg_rtx (V4TImode
);
15437 tem
= gen_avx512bw_lshrv4ti3 (d
, gen_lowpart (V4TImode
, src
),
15447 tem
= gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode
, dest
),
15448 gen_lowpart (V16SImode
, src
),
15449 gen_lowpart (V16SImode
, src
),
15450 GEN_INT (0x4 + (i
== 512 ? 4 : 0)),
15451 GEN_INT (0x5 + (i
== 512 ? 4 : 0)),
15452 GEN_INT (0x6 + (i
== 512 ? 4 : 0)),
15453 GEN_INT (0x7 + (i
== 512 ? 4 : 0)),
15454 GEN_INT (0xC), GEN_INT (0xD),
15455 GEN_INT (0xE), GEN_INT (0xF),
15456 GEN_INT (0x10), GEN_INT (0x11),
15457 GEN_INT (0x12), GEN_INT (0x13),
15458 GEN_INT (0x14), GEN_INT (0x15),
15459 GEN_INT (0x16), GEN_INT (0x17));
15461 tem
= gen_avx512f_pshufd_1 (gen_lowpart (V16SImode
, dest
),
15462 gen_lowpart (V16SImode
, src
),
15463 GEN_INT (i
== 128 ? 0x2 : 0x1),
15467 GEN_INT (i
== 128 ? 0x6 : 0x5),
15471 GEN_INT (i
== 128 ? 0xA : 0x9),
15475 GEN_INT (i
== 128 ? 0xE : 0xD),
15481 gcc_unreachable ();
15485 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
15488 /* Expand a vector reduction. FN is the binary pattern to reduce;
15489 DEST is the destination; IN is the input vector. */
15492 ix86_expand_reduc (rtx (*fn
) (rtx
, rtx
, rtx
), rtx dest
, rtx in
)
15494 rtx half
, dst
, vec
= in
;
15495 machine_mode mode
= GET_MODE (in
);
15498 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15500 && mode
== V8HImode
15501 && fn
== gen_uminv8hi3
)
15503 emit_insn (gen_sse4_1_phminposuw (dest
, in
));
15507 for (i
= GET_MODE_BITSIZE (mode
);
15508 i
> GET_MODE_UNIT_BITSIZE (mode
);
15511 half
= gen_reg_rtx (mode
);
15512 emit_reduc_half (half
, vec
, i
);
15513 if (i
== GET_MODE_UNIT_BITSIZE (mode
) * 2)
15516 dst
= gen_reg_rtx (mode
);
15517 emit_insn (fn (dst
, half
, vec
));
15522 /* Output code to perform a conditional jump to LABEL, if C2 flag in
15523 FP status register is set. */
15526 ix86_emit_fp_unordered_jump (rtx label
)
15528 rtx reg
= gen_reg_rtx (HImode
);
15532 emit_insn (gen_x86_fnstsw_1 (reg
));
15534 if (TARGET_SAHF
&& (TARGET_USE_SAHF
|| optimize_insn_for_size_p ()))
15536 emit_insn (gen_x86_sahf_1 (reg
));
15538 temp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
15539 temp
= gen_rtx_UNORDERED (VOIDmode
, temp
, const0_rtx
);
15543 emit_insn (gen_testqi_ext_1_ccno (reg
, GEN_INT (0x04)));
15545 temp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15546 temp
= gen_rtx_NE (VOIDmode
, temp
, const0_rtx
);
15549 temp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, temp
,
15550 gen_rtx_LABEL_REF (VOIDmode
, label
),
15552 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, temp
));
15553 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
15554 JUMP_LABEL (insn
) = label
;
15557 /* Output code to perform an sinh XFmode calculation. */
15559 void ix86_emit_i387_sinh (rtx op0
, rtx op1
)
15561 rtx e1
= gen_reg_rtx (XFmode
);
15562 rtx e2
= gen_reg_rtx (XFmode
);
15563 rtx scratch
= gen_reg_rtx (HImode
);
15564 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15565 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15567 rtx_code_label
*jump_label
= gen_label_rtx ();
15570 /* scratch = fxam (op1) */
15571 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15573 /* e1 = expm1 (|op1|) */
15574 emit_insn (gen_absxf2 (e2
, op1
));
15575 emit_insn (gen_expm1xf2 (e1
, e2
));
15577 /* e2 = e1 / (e1 + 1.0) + e1 */
15578 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15579 emit_insn (gen_addxf3 (e2
, e1
, cst1
));
15580 emit_insn (gen_divxf3 (e2
, e1
, e2
));
15581 emit_insn (gen_addxf3 (e2
, e2
, e1
));
15583 /* flags = signbit (op1) */
15584 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15586 /* if (flags) then e2 = -e2 */
15587 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15588 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
15589 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15591 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15592 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15593 JUMP_LABEL (insn
) = jump_label
;
15595 emit_insn (gen_negxf2 (e2
, e2
));
15597 emit_label (jump_label
);
15598 LABEL_NUSES (jump_label
) = 1;
15600 /* op0 = 0.5 * e2 */
15601 half
= force_reg (XFmode
, half
);
15602 emit_insn (gen_mulxf3 (op0
, e2
, half
));
15605 /* Output code to perform an cosh XFmode calculation. */
15607 void ix86_emit_i387_cosh (rtx op0
, rtx op1
)
15609 rtx e1
= gen_reg_rtx (XFmode
);
15610 rtx e2
= gen_reg_rtx (XFmode
);
15611 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15614 /* e1 = exp (op1) */
15615 emit_insn (gen_expxf2 (e1
, op1
));
15617 /* e2 = e1 + 1.0 / e1 */
15618 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15619 emit_insn (gen_divxf3 (e2
, cst1
, e1
));
15620 emit_insn (gen_addxf3 (e2
, e1
, e2
));
15622 /* op0 = 0.5 * e2 */
15623 half
= force_reg (XFmode
, half
);
15624 emit_insn (gen_mulxf3 (op0
, e2
, half
));
15627 /* Output code to perform an tanh XFmode calculation. */
15629 void ix86_emit_i387_tanh (rtx op0
, rtx op1
)
15631 rtx e1
= gen_reg_rtx (XFmode
);
15632 rtx e2
= gen_reg_rtx (XFmode
);
15633 rtx scratch
= gen_reg_rtx (HImode
);
15634 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15636 rtx_code_label
*jump_label
= gen_label_rtx ();
15639 /* scratch = fxam (op1) */
15640 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15642 /* e1 = expm1 (-|2 * op1|) */
15643 emit_insn (gen_addxf3 (e2
, op1
, op1
));
15644 emit_insn (gen_absxf2 (e2
, e2
));
15645 emit_insn (gen_negxf2 (e2
, e2
));
15646 emit_insn (gen_expm1xf2 (e1
, e2
));
15648 /* e2 = e1 / (e1 + 2.0) */
15649 cst2
= force_reg (XFmode
, CONST2_RTX (XFmode
));
15650 emit_insn (gen_addxf3 (e2
, e1
, cst2
));
15651 emit_insn (gen_divxf3 (e2
, e1
, e2
));
15653 /* flags = signbit (op1) */
15654 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15656 /* if (!flags) then e2 = -e2 */
15657 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15658 gen_rtx_NE (VOIDmode
, flags
, const0_rtx
),
15659 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15661 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15662 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15663 JUMP_LABEL (insn
) = jump_label
;
15665 emit_insn (gen_negxf2 (e2
, e2
));
15667 emit_label (jump_label
);
15668 LABEL_NUSES (jump_label
) = 1;
15670 emit_move_insn (op0
, e2
);
15673 /* Output code to perform an asinh XFmode calculation. */
15675 void ix86_emit_i387_asinh (rtx op0
, rtx op1
)
15677 rtx e1
= gen_reg_rtx (XFmode
);
15678 rtx e2
= gen_reg_rtx (XFmode
);
15679 rtx scratch
= gen_reg_rtx (HImode
);
15680 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15682 rtx_code_label
*jump_label
= gen_label_rtx ();
15685 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15686 emit_insn (gen_mulxf3 (e1
, op1
, op1
));
15687 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15688 emit_insn (gen_addxf3 (e2
, e1
, cst1
));
15689 emit_insn (gen_sqrtxf2 (e2
, e2
));
15690 emit_insn (gen_addxf3 (e2
, e2
, cst1
));
15693 emit_insn (gen_divxf3 (e1
, e1
, e2
));
15695 /* scratch = fxam (op1) */
15696 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15698 /* e1 = e1 + |op1| */
15699 emit_insn (gen_absxf2 (e2
, op1
));
15700 emit_insn (gen_addxf3 (e1
, e1
, e2
));
15702 /* e2 = log1p (e1) */
15703 ix86_emit_i387_log1p (e2
, e1
);
15705 /* flags = signbit (op1) */
15706 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15708 /* if (flags) then e2 = -e2 */
15709 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15710 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
15711 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15713 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15714 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15715 JUMP_LABEL (insn
) = jump_label
;
15717 emit_insn (gen_negxf2 (e2
, e2
));
15719 emit_label (jump_label
);
15720 LABEL_NUSES (jump_label
) = 1;
15722 emit_move_insn (op0
, e2
);
15725 /* Output code to perform an acosh XFmode calculation. */
15727 void ix86_emit_i387_acosh (rtx op0
, rtx op1
)
15729 rtx e1
= gen_reg_rtx (XFmode
);
15730 rtx e2
= gen_reg_rtx (XFmode
);
15731 rtx cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15733 /* e2 = sqrt (op1 + 1.0) */
15734 emit_insn (gen_addxf3 (e2
, op1
, cst1
));
15735 emit_insn (gen_sqrtxf2 (e2
, e2
));
15737 /* e1 = sqrt (op1 - 1.0) */
15738 emit_insn (gen_subxf3 (e1
, op1
, cst1
));
15739 emit_insn (gen_sqrtxf2 (e1
, e1
));
15742 emit_insn (gen_mulxf3 (e1
, e1
, e2
));
15744 /* e1 = e1 + op1 */
15745 emit_insn (gen_addxf3 (e1
, e1
, op1
));
15747 /* op0 = log (e1) */
15748 emit_insn (gen_logxf2 (op0
, e1
));
15751 /* Output code to perform an atanh XFmode calculation. */
15753 void ix86_emit_i387_atanh (rtx op0
, rtx op1
)
15755 rtx e1
= gen_reg_rtx (XFmode
);
15756 rtx e2
= gen_reg_rtx (XFmode
);
15757 rtx scratch
= gen_reg_rtx (HImode
);
15758 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15759 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15761 rtx_code_label
*jump_label
= gen_label_rtx ();
15764 /* scratch = fxam (op1) */
15765 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15768 emit_insn (gen_absxf2 (e2
, op1
));
15770 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15771 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15772 emit_insn (gen_addxf3 (e1
, e2
, cst1
));
15773 emit_insn (gen_addxf3 (e2
, e2
, e2
));
15774 emit_insn (gen_negxf2 (e2
, e2
));
15775 emit_insn (gen_divxf3 (e1
, e2
, e1
));
15777 /* e2 = log1p (e1) */
15778 ix86_emit_i387_log1p (e2
, e1
);
15780 /* flags = signbit (op1) */
15781 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15783 /* if (!flags) then e2 = -e2 */
15784 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15785 gen_rtx_NE (VOIDmode
, flags
, const0_rtx
),
15786 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15788 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15789 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15790 JUMP_LABEL (insn
) = jump_label
;
15792 emit_insn (gen_negxf2 (e2
, e2
));
15794 emit_label (jump_label
);
15795 LABEL_NUSES (jump_label
) = 1;
15797 /* op0 = 0.5 * e2 */
15798 half
= force_reg (XFmode
, half
);
15799 emit_insn (gen_mulxf3 (op0
, e2
, half
));
15802 /* Output code to perform a log1p XFmode calculation. */
15804 void ix86_emit_i387_log1p (rtx op0
, rtx op1
)
15806 rtx_code_label
*label1
= gen_label_rtx ();
15807 rtx_code_label
*label2
= gen_label_rtx ();
15809 rtx tmp
= gen_reg_rtx (XFmode
);
15810 rtx res
= gen_reg_rtx (XFmode
);
15811 rtx cst
, cstln2
, cst1
;
15814 cst
= const_double_from_real_value
15815 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode
), XFmode
);
15816 cstln2
= force_reg (XFmode
, standard_80387_constant_rtx (4)); /* fldln2 */
15818 emit_insn (gen_absxf2 (tmp
, op1
));
15820 cst
= force_reg (XFmode
, cst
);
15821 ix86_expand_branch (GE
, tmp
, cst
, label1
);
15822 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
15823 insn
= get_last_insn ();
15824 JUMP_LABEL (insn
) = label1
;
15826 emit_insn (gen_fyl2xp1xf3_i387 (res
, op1
, cstln2
));
15827 emit_jump (label2
);
15829 emit_label (label1
);
15830 LABEL_NUSES (label1
) = 1;
15832 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15833 emit_insn (gen_rtx_SET (tmp
, gen_rtx_PLUS (XFmode
, op1
, cst1
)));
15834 emit_insn (gen_fyl2xxf3_i387 (res
, tmp
, cstln2
));
15836 emit_label (label2
);
15837 LABEL_NUSES (label2
) = 1;
15839 emit_move_insn (op0
, res
);
15842 /* Emit code for round calculation. */
15843 void ix86_emit_i387_round (rtx op0
, rtx op1
)
15845 machine_mode inmode
= GET_MODE (op1
);
15846 machine_mode outmode
= GET_MODE (op0
);
15847 rtx e1
= gen_reg_rtx (XFmode
);
15848 rtx e2
= gen_reg_rtx (XFmode
);
15849 rtx scratch
= gen_reg_rtx (HImode
);
15850 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15851 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15852 rtx res
= gen_reg_rtx (outmode
);
15853 rtx_code_label
*jump_label
= gen_label_rtx ();
15854 rtx (*floor_insn
) (rtx
, rtx
);
15855 rtx (*neg_insn
) (rtx
, rtx
);
15863 tmp
= gen_reg_rtx (XFmode
);
15865 emit_insn (gen_rtx_SET (tmp
, gen_rtx_FLOAT_EXTEND (XFmode
, op1
)));
15871 gcc_unreachable ();
15877 floor_insn
= gen_frndintxf2_floor
;
15878 neg_insn
= gen_negsf2
;
15881 floor_insn
= gen_frndintxf2_floor
;
15882 neg_insn
= gen_negdf2
;
15885 floor_insn
= gen_frndintxf2_floor
;
15886 neg_insn
= gen_negxf2
;
15889 floor_insn
= gen_lfloorxfhi2
;
15890 neg_insn
= gen_neghi2
;
15893 floor_insn
= gen_lfloorxfsi2
;
15894 neg_insn
= gen_negsi2
;
15897 floor_insn
= gen_lfloorxfdi2
;
15898 neg_insn
= gen_negdi2
;
15901 gcc_unreachable ();
15904 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15906 /* scratch = fxam(op1) */
15907 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15909 /* e1 = fabs(op1) */
15910 emit_insn (gen_absxf2 (e1
, op1
));
15912 /* e2 = e1 + 0.5 */
15913 half
= force_reg (XFmode
, half
);
15914 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (XFmode
, e1
, half
)));
15916 /* res = floor(e2) */
15922 tmp
= gen_reg_rtx (XFmode
);
15924 emit_insn (floor_insn (tmp
, e2
));
15925 emit_insn (gen_rtx_SET (res
,
15926 gen_rtx_UNSPEC (outmode
, gen_rtvec (1, tmp
),
15927 UNSPEC_TRUNC_NOOP
)));
15931 emit_insn (floor_insn (res
, e2
));
15934 /* flags = signbit(a) */
15935 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15937 /* if (flags) then res = -res */
15938 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15939 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
15940 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15942 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15943 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15944 JUMP_LABEL (insn
) = jump_label
;
15946 emit_insn (neg_insn (res
, res
));
15948 emit_label (jump_label
);
15949 LABEL_NUSES (jump_label
) = 1;
15951 emit_move_insn (op0
, res
);
15954 /* Output code to perform a Newton-Rhapson approximation of a single precision
15955 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
15957 void ix86_emit_swdivsf (rtx res
, rtx a
, rtx b
, machine_mode mode
)
15959 rtx x0
, x1
, e0
, e1
;
15961 x0
= gen_reg_rtx (mode
);
15962 e0
= gen_reg_rtx (mode
);
15963 e1
= gen_reg_rtx (mode
);
15964 x1
= gen_reg_rtx (mode
);
15966 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15968 b
= force_reg (mode
, b
);
15970 /* x0 = rcp(b) estimate */
15971 if (mode
== V16SFmode
|| mode
== V8DFmode
)
15973 if (TARGET_AVX512ER
)
15975 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
15978 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, a
, x0
)));
15982 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
15986 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
15990 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, b
)));
15993 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, e0
)));
15996 emit_insn (gen_rtx_SET (e1
, gen_rtx_PLUS (mode
, x0
, x0
)));
15999 emit_insn (gen_rtx_SET (x1
, gen_rtx_MINUS (mode
, e1
, e0
)));
16002 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, a
, x1
)));
16005 /* Output code to perform a Newton-Rhapson approximation of a
16006 single precision floating point [reciprocal] square root. */
16008 void ix86_emit_swsqrtsf (rtx res
, rtx a
, machine_mode mode
, bool recip
)
16010 rtx x0
, e0
, e1
, e2
, e3
, mthree
, mhalf
;
16014 x0
= gen_reg_rtx (mode
);
16015 e0
= gen_reg_rtx (mode
);
16016 e1
= gen_reg_rtx (mode
);
16017 e2
= gen_reg_rtx (mode
);
16018 e3
= gen_reg_rtx (mode
);
16020 if (TARGET_AVX512ER
&& mode
== V16SFmode
)
16023 /* res = rsqrt28(a) estimate */
16024 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
16028 /* x0 = rsqrt28(a) estimate */
16029 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
16031 /* res = rcp28(x0) estimate */
16032 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, x0
),
16038 real_from_integer (&r
, VOIDmode
, -3, SIGNED
);
16039 mthree
= const_double_from_real_value (r
, SFmode
);
16041 real_arithmetic (&r
, NEGATE_EXPR
, &dconsthalf
, NULL
);
16042 mhalf
= const_double_from_real_value (r
, SFmode
);
16043 unspec
= UNSPEC_RSQRT
;
16045 if (VECTOR_MODE_P (mode
))
16047 mthree
= ix86_build_const_vector (mode
, true, mthree
);
16048 mhalf
= ix86_build_const_vector (mode
, true, mhalf
);
16049 /* There is no 512-bit rsqrt. There is however rsqrt14. */
16050 if (GET_MODE_SIZE (mode
) == 64)
16051 unspec
= UNSPEC_RSQRT14
;
16054 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
16055 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
16057 a
= force_reg (mode
, a
);
16059 /* x0 = rsqrt(a) estimate */
16060 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
16063 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
16066 rtx zero
= force_reg (mode
, CONST0_RTX(mode
));
16069 /* Handle masked compare. */
16070 if (VECTOR_MODE_P (mode
) && GET_MODE_SIZE (mode
) == 64)
16072 mask
= gen_reg_rtx (HImode
);
16073 /* Imm value 0x4 corresponds to not-equal comparison. */
16074 emit_insn (gen_avx512f_cmpv16sf3 (mask
, zero
, a
, GEN_INT (0x4)));
16075 emit_insn (gen_avx512f_blendmv16sf (x0
, zero
, x0
, mask
));
16079 mask
= gen_reg_rtx (mode
);
16080 emit_insn (gen_rtx_SET (mask
, gen_rtx_NE (mode
, zero
, a
)));
16081 emit_insn (gen_rtx_SET (x0
, gen_rtx_AND (mode
, x0
, mask
)));
16085 mthree
= force_reg (mode
, mthree
);
16088 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, a
)));
16090 unsigned vector_size
= GET_MODE_SIZE (mode
);
16092 || (TARGET_AVX512F
&& vector_size
== 64)
16093 || (TARGET_AVX512VL
&& (vector_size
== 32 || vector_size
== 16)))
16094 emit_insn (gen_rtx_SET (e2
,
16095 gen_rtx_FMA (mode
, e0
, x0
, mthree
)));
16099 emit_insn (gen_rtx_SET (e1
, gen_rtx_MULT (mode
, e0
, x0
)));
16102 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (mode
, e1
, mthree
)));
16105 mhalf
= force_reg (mode
, mhalf
);
16107 /* e3 = -.5 * x0 */
16108 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, x0
, mhalf
)));
16110 /* e3 = -.5 * e0 */
16111 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, e0
, mhalf
)));
16112 /* ret = e2 * e3 */
16113 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, e2
, e3
)));
16116 /* Expand fabs (OP0) and return a new rtx that holds the result. The
16117 mask for masking out the sign-bit is stored in *SMASK, if that is
16121 ix86_expand_sse_fabs (rtx op0
, rtx
*smask
)
16123 machine_mode vmode
, mode
= GET_MODE (op0
);
16126 xa
= gen_reg_rtx (mode
);
16127 if (mode
== SFmode
)
16129 else if (mode
== DFmode
)
16133 mask
= ix86_build_signbit_mask (vmode
, VECTOR_MODE_P (mode
), true);
16134 if (!VECTOR_MODE_P (mode
))
16136 /* We need to generate a scalar mode mask in this case. */
16137 rtx tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, const0_rtx
));
16138 tmp
= gen_rtx_VEC_SELECT (mode
, mask
, tmp
);
16139 mask
= gen_reg_rtx (mode
);
16140 emit_insn (gen_rtx_SET (mask
, tmp
));
16142 emit_insn (gen_rtx_SET (xa
, gen_rtx_AND (mode
, op0
, mask
)));
16150 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
16151 swapping the operands if SWAP_OPERANDS is true. The expanded
16152 code is a forward jump to a newly created label in case the
16153 comparison is true. The generated label rtx is returned. */
16154 static rtx_code_label
*
16155 ix86_expand_sse_compare_and_jump (enum rtx_code code
, rtx op0
, rtx op1
,
16156 bool swap_operands
)
16158 bool unordered_compare
= ix86_unordered_fp_compare (code
);
16159 rtx_code_label
*label
;
16163 std::swap (op0
, op1
);
16165 label
= gen_label_rtx ();
16166 tmp
= gen_rtx_COMPARE (CCFPmode
, op0
, op1
);
16167 if (unordered_compare
)
16168 tmp
= gen_rtx_UNSPEC (CCFPmode
, gen_rtvec (1, tmp
), UNSPEC_NOTRAP
);
16169 reg
= gen_rtx_REG (CCFPmode
, FLAGS_REG
);
16170 emit_insn (gen_rtx_SET (reg
, tmp
));
16171 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, reg
, const0_rtx
);
16172 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
16173 gen_rtx_LABEL_REF (VOIDmode
, label
), pc_rtx
);
16174 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
16175 JUMP_LABEL (tmp
) = label
;
16180 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
16181 using comparison code CODE. Operands are swapped for the comparison if
16182 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
16184 ix86_expand_sse_compare_mask (enum rtx_code code
, rtx op0
, rtx op1
,
16185 bool swap_operands
)
16187 rtx (*insn
)(rtx
, rtx
, rtx
, rtx
);
16188 machine_mode mode
= GET_MODE (op0
);
16189 rtx mask
= gen_reg_rtx (mode
);
16192 std::swap (op0
, op1
);
16194 insn
= mode
== DFmode
? gen_setcc_df_sse
: gen_setcc_sf_sse
;
16196 emit_insn (insn (mask
, op0
, op1
,
16197 gen_rtx_fmt_ee (code
, mode
, op0
, op1
)));
16201 /* Expand copysign from SIGN to the positive value ABS_VALUE
16202 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
16206 ix86_sse_copysign_to_positive (rtx result
, rtx abs_value
, rtx sign
, rtx mask
)
16208 machine_mode mode
= GET_MODE (sign
);
16209 rtx sgn
= gen_reg_rtx (mode
);
16210 if (mask
== NULL_RTX
)
16212 machine_mode vmode
;
16214 if (mode
== SFmode
)
16216 else if (mode
== DFmode
)
16221 mask
= ix86_build_signbit_mask (vmode
, VECTOR_MODE_P (mode
), false);
16222 if (!VECTOR_MODE_P (mode
))
16224 /* We need to generate a scalar mode mask in this case. */
16225 rtx tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, const0_rtx
));
16226 tmp
= gen_rtx_VEC_SELECT (mode
, mask
, tmp
);
16227 mask
= gen_reg_rtx (mode
);
16228 emit_insn (gen_rtx_SET (mask
, tmp
));
16232 mask
= gen_rtx_NOT (mode
, mask
);
16233 emit_insn (gen_rtx_SET (sgn
, gen_rtx_AND (mode
, mask
, sign
)));
16234 emit_insn (gen_rtx_SET (result
, gen_rtx_IOR (mode
, abs_value
, sgn
)));
16237 /* Expand SSE sequence for computing lround from OP1 storing
16241 ix86_expand_lround (rtx op0
, rtx op1
)
16243 /* C code for the stuff we're doing below:
16244 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
16247 machine_mode mode
= GET_MODE (op1
);
16248 const struct real_format
*fmt
;
16249 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
16252 /* load nextafter (0.5, 0.0) */
16253 fmt
= REAL_MODE_FORMAT (mode
);
16254 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
16255 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
16257 /* adj = copysign (0.5, op1) */
16258 adj
= force_reg (mode
, const_double_from_real_value (pred_half
, mode
));
16259 ix86_sse_copysign_to_positive (adj
, adj
, force_reg (mode
, op1
), NULL_RTX
);
16261 /* adj = op1 + adj */
16262 adj
= expand_simple_binop (mode
, PLUS
, adj
, op1
, NULL_RTX
, 0, OPTAB_DIRECT
);
16264 /* op0 = (imode)adj */
16265 expand_fix (op0
, adj
, 0);
16268 /* Expand SSE2 sequence for computing lround from OPERAND1 storing
16272 ix86_expand_lfloorceil (rtx op0
, rtx op1
, bool do_floor
)
16274 /* C code for the stuff we're doing below (for do_floor):
16276 xi -= (double)xi > op1 ? 1 : 0;
16279 machine_mode fmode
= GET_MODE (op1
);
16280 machine_mode imode
= GET_MODE (op0
);
16281 rtx ireg
, freg
, tmp
;
16282 rtx_code_label
*label
;
16284 /* reg = (long)op1 */
16285 ireg
= gen_reg_rtx (imode
);
16286 expand_fix (ireg
, op1
, 0);
16288 /* freg = (double)reg */
16289 freg
= gen_reg_rtx (fmode
);
16290 expand_float (freg
, ireg
, 0);
16292 /* ireg = (freg > op1) ? ireg - 1 : ireg */
16293 label
= ix86_expand_sse_compare_and_jump (UNLE
,
16294 freg
, op1
, !do_floor
);
16295 tmp
= expand_simple_binop (imode
, do_floor
? MINUS
: PLUS
,
16296 ireg
, const1_rtx
, NULL_RTX
, 0, OPTAB_DIRECT
);
16297 emit_move_insn (ireg
, tmp
);
16299 emit_label (label
);
16300 LABEL_NUSES (label
) = 1;
16302 emit_move_insn (op0
, ireg
);
16305 /* Generate and return a rtx of mode MODE for 2**n where n is the number
16306 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
16309 ix86_gen_TWO52 (machine_mode mode
)
16311 const struct real_format
*fmt
;
16312 REAL_VALUE_TYPE TWO52r
;
16315 fmt
= REAL_MODE_FORMAT (mode
);
16316 real_2expN (&TWO52r
, fmt
->p
- 1, mode
);
16317 TWO52
= const_double_from_real_value (TWO52r
, mode
);
16318 TWO52
= force_reg (mode
, TWO52
);
16323 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
16326 ix86_expand_rint (rtx operand0
, rtx operand1
)
16328 /* C code for the stuff we're doing below:
16329 xa = fabs (operand1);
16330 if (!isless (xa, 2**52))
16333 if (flag_rounding_math)
16335 two52 = copysign (two52, operand1);
16338 xa = xa + two52 - two52;
16339 return copysign (xa, operand1);
16341 machine_mode mode
= GET_MODE (operand0
);
16342 rtx res
, xa
, TWO52
, mask
;
16343 rtx_code_label
*label
;
16345 TWO52
= ix86_gen_TWO52 (mode
);
16347 /* Temporary for holding the result, initialized to the input
16348 operand to ease control flow. */
16349 res
= copy_to_reg (operand1
);
16351 /* xa = abs (operand1) */
16352 xa
= ix86_expand_sse_fabs (res
, &mask
);
16354 /* if (!isless (xa, TWO52)) goto label; */
16355 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16357 if (flag_rounding_math
)
16359 ix86_sse_copysign_to_positive (TWO52
, TWO52
, res
, mask
);
16363 xa
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16364 xa
= expand_simple_binop (mode
, MINUS
, xa
, TWO52
, xa
, 0, OPTAB_DIRECT
);
16366 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16367 if (HONOR_SIGNED_ZEROS (mode
) && flag_rounding_math
)
16368 xa
= ix86_expand_sse_fabs (xa
, NULL
);
16370 ix86_sse_copysign_to_positive (res
, xa
, res
, mask
);
16372 emit_label (label
);
16373 LABEL_NUSES (label
) = 1;
16375 emit_move_insn (operand0
, res
);
16378 /* Expand SSE2 sequence for computing floor or ceil
16379 from OPERAND1 storing into OPERAND0. */
16381 ix86_expand_floorceil (rtx operand0
, rtx operand1
, bool do_floor
)
16383 /* C code for the stuff we expand below.
16384 double xa = fabs (x), x2;
16385 if (!isless (xa, TWO52))
16387 x2 = (double)(long)x;
16396 if (HONOR_SIGNED_ZEROS (mode))
16397 return copysign (x2, x);
16400 machine_mode mode
= GET_MODE (operand0
);
16401 rtx xa
, xi
, TWO52
, tmp
, one
, res
, mask
;
16402 rtx_code_label
*label
;
16404 TWO52
= ix86_gen_TWO52 (mode
);
16406 /* Temporary for holding the result, initialized to the input
16407 operand to ease control flow. */
16408 res
= copy_to_reg (operand1
);
16410 /* xa = abs (operand1) */
16411 xa
= ix86_expand_sse_fabs (res
, &mask
);
16413 /* if (!isless (xa, TWO52)) goto label; */
16414 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16416 /* xa = (double)(long)x */
16417 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
16418 expand_fix (xi
, res
, 0);
16419 expand_float (xa
, xi
, 0);
16422 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
16424 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16425 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
16426 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
16427 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
16428 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16429 if (HONOR_SIGNED_ZEROS (mode
))
16431 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16432 if (do_floor
&& flag_rounding_math
)
16433 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
16435 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
16437 emit_move_insn (res
, tmp
);
16439 emit_label (label
);
16440 LABEL_NUSES (label
) = 1;
16442 emit_move_insn (operand0
, res
);
16445 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
16446 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16447 that is only available on 64bit targets. */
16449 ix86_expand_floorceildf_32 (rtx operand0
, rtx operand1
, bool do_floor
)
16451 /* C code for the stuff we expand below.
16452 double xa = fabs (x), x2;
16453 if (!isless (xa, TWO52))
16455 xa = xa + TWO52 - TWO52;
16456 x2 = copysign (xa, x);
16465 if (HONOR_SIGNED_ZEROS (mode))
16466 x2 = copysign (x2, x);
16469 machine_mode mode
= GET_MODE (operand0
);
16470 rtx xa
, TWO52
, tmp
, one
, res
, mask
;
16471 rtx_code_label
*label
;
16473 TWO52
= ix86_gen_TWO52 (mode
);
16475 /* Temporary for holding the result, initialized to the input
16476 operand to ease control flow. */
16477 res
= copy_to_reg (operand1
);
16479 /* xa = abs (operand1) */
16480 xa
= ix86_expand_sse_fabs (res
, &mask
);
16482 /* if (!isless (xa, TWO52)) goto label; */
16483 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16485 /* xa = xa + TWO52 - TWO52; */
16486 xa
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16487 xa
= expand_simple_binop (mode
, MINUS
, xa
, TWO52
, xa
, 0, OPTAB_DIRECT
);
16489 /* xa = copysign (xa, operand1) */
16490 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
16493 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
16495 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16496 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
16497 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
16498 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
16499 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16500 if (HONOR_SIGNED_ZEROS (mode
))
16502 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16503 if (do_floor
&& flag_rounding_math
)
16504 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
16506 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
16508 emit_move_insn (res
, tmp
);
16510 emit_label (label
);
16511 LABEL_NUSES (label
) = 1;
16513 emit_move_insn (operand0
, res
);
16516 /* Expand SSE sequence for computing trunc
16517 from OPERAND1 storing into OPERAND0. */
16519 ix86_expand_trunc (rtx operand0
, rtx operand1
)
16521 /* C code for SSE variant we expand below.
16522 double xa = fabs (x), x2;
16523 if (!isless (xa, TWO52))
16525 x2 = (double)(long)x;
16526 if (HONOR_SIGNED_ZEROS (mode))
16527 return copysign (x2, x);
16530 machine_mode mode
= GET_MODE (operand0
);
16531 rtx xa
, xi
, TWO52
, res
, mask
;
16532 rtx_code_label
*label
;
16534 TWO52
= ix86_gen_TWO52 (mode
);
16536 /* Temporary for holding the result, initialized to the input
16537 operand to ease control flow. */
16538 res
= copy_to_reg (operand1
);
16540 /* xa = abs (operand1) */
16541 xa
= ix86_expand_sse_fabs (res
, &mask
);
16543 /* if (!isless (xa, TWO52)) goto label; */
16544 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16546 /* xa = (double)(long)x */
16547 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
16548 expand_fix (xi
, res
, 0);
16549 expand_float (xa
, xi
, 0);
16551 if (HONOR_SIGNED_ZEROS (mode
))
16552 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
16554 emit_move_insn (res
, xa
);
16556 emit_label (label
);
16557 LABEL_NUSES (label
) = 1;
16559 emit_move_insn (operand0
, res
);
16562 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16563 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16564 that is only available on 64bit targets. */
16566 ix86_expand_truncdf_32 (rtx operand0
, rtx operand1
)
16568 machine_mode mode
= GET_MODE (operand0
);
16569 rtx xa
, xa2
, TWO52
, tmp
, one
, res
, mask
;
16570 rtx_code_label
*label
;
16572 /* C code for SSE variant we expand below.
16573 double xa = fabs (x), x2;
16574 if (!isless (xa, TWO52))
16576 xa2 = xa + TWO52 - TWO52;
16580 x2 = copysign (xa2, x);
16584 TWO52
= ix86_gen_TWO52 (mode
);
16586 /* Temporary for holding the result, initialized to the input
16587 operand to ease control flow. */
16588 res
=copy_to_reg (operand1
);
16590 /* xa = abs (operand1) */
16591 xa
= ix86_expand_sse_fabs (res
, &mask
);
16593 /* if (!isless (xa, TWO52)) goto label; */
16594 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16596 /* xa2 = xa + TWO52 - TWO52; */
16597 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16598 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
16601 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
16603 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
16604 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa2
, xa
, false);
16605 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
16606 tmp
= expand_simple_binop (mode
, MINUS
,
16607 xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16608 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16609 if (HONOR_SIGNED_ZEROS (mode
) && flag_rounding_math
)
16610 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
16612 /* res = copysign (xa2, operand1) */
16613 ix86_sse_copysign_to_positive (res
, tmp
, res
, mask
);
16615 emit_label (label
);
16616 LABEL_NUSES (label
) = 1;
16618 emit_move_insn (operand0
, res
);
16621 /* Expand SSE sequence for computing round
16622 from OPERAND1 storing into OPERAND0. */
16624 ix86_expand_round (rtx operand0
, rtx operand1
)
16626 /* C code for the stuff we're doing below:
16627 double xa = fabs (x);
16628 if (!isless (xa, TWO52))
16630 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16631 return copysign (xa, x);
16633 machine_mode mode
= GET_MODE (operand0
);
16634 rtx res
, TWO52
, xa
, xi
, half
, mask
;
16635 rtx_code_label
*label
;
16636 const struct real_format
*fmt
;
16637 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
16639 /* Temporary for holding the result, initialized to the input
16640 operand to ease control flow. */
16641 res
= copy_to_reg (operand1
);
16643 TWO52
= ix86_gen_TWO52 (mode
);
16644 xa
= ix86_expand_sse_fabs (res
, &mask
);
16645 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16647 /* load nextafter (0.5, 0.0) */
16648 fmt
= REAL_MODE_FORMAT (mode
);
16649 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
16650 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
16652 /* xa = xa + 0.5 */
16653 half
= force_reg (mode
, const_double_from_real_value (pred_half
, mode
));
16654 xa
= expand_simple_binop (mode
, PLUS
, xa
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
16656 /* xa = (double)(int64_t)xa */
16657 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
16658 expand_fix (xi
, xa
, 0);
16659 expand_float (xa
, xi
, 0);
16661 /* res = copysign (xa, operand1) */
16662 ix86_sse_copysign_to_positive (res
, xa
, res
, mask
);
16664 emit_label (label
);
16665 LABEL_NUSES (label
) = 1;
16667 emit_move_insn (operand0
, res
);
16670 /* Expand SSE sequence for computing round from OPERAND1 storing
16671 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16672 that is only available on 64bit targets. */
16674 ix86_expand_rounddf_32 (rtx operand0
, rtx operand1
)
16676 /* C code for the stuff we expand below.
16677 double xa = fabs (x), xa2, x2;
16678 if (!isless (xa, TWO52))
16680 Using the absolute value and copying back sign makes
16681 -0.0 -> -0.0 correct.
16682 xa2 = xa + TWO52 - TWO52;
16687 else if (dxa > 0.5)
16689 x2 = copysign (xa2, x);
16692 machine_mode mode
= GET_MODE (operand0
);
16693 rtx xa
, xa2
, dxa
, TWO52
, tmp
, half
, mhalf
, one
, res
, mask
;
16694 rtx_code_label
*label
;
16696 TWO52
= ix86_gen_TWO52 (mode
);
16698 /* Temporary for holding the result, initialized to the input
16699 operand to ease control flow. */
16700 res
= copy_to_reg (operand1
);
16702 /* xa = abs (operand1) */
16703 xa
= ix86_expand_sse_fabs (res
, &mask
);
16705 /* if (!isless (xa, TWO52)) goto label; */
16706 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16708 /* xa2 = xa + TWO52 - TWO52; */
16709 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16710 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
16712 /* dxa = xa2 - xa; */
16713 dxa
= expand_simple_binop (mode
, MINUS
, xa2
, xa
, NULL_RTX
, 0, OPTAB_DIRECT
);
16715 /* generate 0.5, 1.0 and -0.5 */
16716 half
= force_reg (mode
, const_double_from_real_value (dconsthalf
, mode
));
16717 one
= expand_simple_binop (mode
, PLUS
, half
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
16718 mhalf
= expand_simple_binop (mode
, MINUS
, half
, one
, NULL_RTX
,
16722 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16723 tmp
= ix86_expand_sse_compare_mask (UNGT
, dxa
, half
, false);
16724 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
16725 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16726 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16727 tmp
= ix86_expand_sse_compare_mask (UNGE
, mhalf
, dxa
, false);
16728 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
16729 xa2
= expand_simple_binop (mode
, PLUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16731 /* res = copysign (xa2, operand1) */
16732 ix86_sse_copysign_to_positive (res
, xa2
, res
, mask
);
16734 emit_label (label
);
16735 LABEL_NUSES (label
) = 1;
16737 emit_move_insn (operand0
, res
);
16740 /* Expand SSE sequence for computing round
16741 from OP1 storing into OP0 using sse4 round insn. */
16743 ix86_expand_round_sse4 (rtx op0
, rtx op1
)
16745 machine_mode mode
= GET_MODE (op0
);
16746 rtx e1
, e2
, res
, half
;
16747 const struct real_format
*fmt
;
16748 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
16749 rtx (*gen_copysign
) (rtx
, rtx
, rtx
);
16750 rtx (*gen_round
) (rtx
, rtx
, rtx
);
16755 gen_copysign
= gen_copysignsf3
;
16756 gen_round
= gen_sse4_1_roundsf2
;
16759 gen_copysign
= gen_copysigndf3
;
16760 gen_round
= gen_sse4_1_rounddf2
;
16763 gcc_unreachable ();
16766 /* round (a) = trunc (a + copysign (0.5, a)) */
16768 /* load nextafter (0.5, 0.0) */
16769 fmt
= REAL_MODE_FORMAT (mode
);
16770 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
16771 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
16772 half
= const_double_from_real_value (pred_half
, mode
);
16774 /* e1 = copysign (0.5, op1) */
16775 e1
= gen_reg_rtx (mode
);
16776 emit_insn (gen_copysign (e1
, half
, op1
));
16778 /* e2 = op1 + e1 */
16779 e2
= expand_simple_binop (mode
, PLUS
, op1
, e1
, NULL_RTX
, 0, OPTAB_DIRECT
);
16781 /* res = trunc (e2) */
16782 res
= gen_reg_rtx (mode
);
16783 emit_insn (gen_round (res
, e2
, GEN_INT (ROUND_TRUNC
)));
16785 emit_move_insn (op0
, res
);
16788 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16789 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16790 insn every time. */
16792 static GTY(()) rtx_insn
*vselect_insn
;
16794 /* Initialize vselect_insn. */
16797 init_vselect_insn (void)
16802 x
= gen_rtx_PARALLEL (VOIDmode
, rtvec_alloc (MAX_VECT_LEN
));
16803 for (i
= 0; i
< MAX_VECT_LEN
; ++i
)
16804 XVECEXP (x
, 0, i
) = const0_rtx
;
16805 x
= gen_rtx_VEC_SELECT (V2DFmode
, gen_rtx_VEC_CONCAT (V4DFmode
, const0_rtx
,
16807 x
= gen_rtx_SET (const0_rtx
, x
);
16809 vselect_insn
= emit_insn (x
);
16813 /* Construct (set target (vec_select op0 (parallel perm))) and
16814 return true if that's a valid instruction in the active ISA. */
16817 expand_vselect (rtx target
, rtx op0
, const unsigned char *perm
,
16818 unsigned nelt
, bool testing_p
)
16821 rtx x
, save_vconcat
;
16824 if (vselect_insn
== NULL_RTX
)
16825 init_vselect_insn ();
16827 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 1);
16828 PUT_NUM_ELEM (XVEC (x
, 0), nelt
);
16829 for (i
= 0; i
< nelt
; ++i
)
16830 XVECEXP (x
, 0, i
) = GEN_INT (perm
[i
]);
16831 save_vconcat
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
16832 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = op0
;
16833 PUT_MODE (SET_SRC (PATTERN (vselect_insn
)), GET_MODE (target
));
16834 SET_DEST (PATTERN (vselect_insn
)) = target
;
16835 icode
= recog_memoized (vselect_insn
);
16837 if (icode
>= 0 && !testing_p
)
16838 emit_insn (copy_rtx (PATTERN (vselect_insn
)));
16840 SET_DEST (PATTERN (vselect_insn
)) = const0_rtx
;
16841 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = save_vconcat
;
16842 INSN_CODE (vselect_insn
) = -1;
16847 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16850 expand_vselect_vconcat (rtx target
, rtx op0
, rtx op1
,
16851 const unsigned char *perm
, unsigned nelt
,
16854 machine_mode v2mode
;
16858 if (vselect_insn
== NULL_RTX
)
16859 init_vselect_insn ();
16861 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0
)).exists (&v2mode
))
16863 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
16864 PUT_MODE (x
, v2mode
);
16867 ok
= expand_vselect (target
, x
, perm
, nelt
, testing_p
);
16868 XEXP (x
, 0) = const0_rtx
;
16869 XEXP (x
, 1) = const0_rtx
;
16873 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16874 using movss or movsd. */
16876 expand_vec_perm_movs (struct expand_vec_perm_d
*d
)
16878 machine_mode vmode
= d
->vmode
;
16879 unsigned i
, nelt
= d
->nelt
;
16882 if (d
->one_operand_p
)
16885 if (!(TARGET_SSE
&& vmode
== V4SFmode
)
16886 && !(TARGET_MMX_WITH_SSE
&& vmode
== V2SFmode
)
16887 && !(TARGET_SSE2
&& vmode
== V2DFmode
))
16890 /* Only the first element is changed. */
16891 if (d
->perm
[0] != nelt
&& d
->perm
[0] != 0)
16893 for (i
= 1; i
< nelt
; ++i
)
16894 if (d
->perm
[i
] != i
+ nelt
- d
->perm
[0])
16900 if (d
->perm
[0] == nelt
)
16901 x
= gen_rtx_VEC_MERGE (vmode
, d
->op1
, d
->op0
, GEN_INT (1));
16903 x
= gen_rtx_VEC_MERGE (vmode
, d
->op0
, d
->op1
, GEN_INT (1));
16905 emit_insn (gen_rtx_SET (d
->target
, x
));
16910 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16911 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16914 expand_vec_perm_blend (struct expand_vec_perm_d
*d
)
16916 machine_mode mmode
, vmode
= d
->vmode
;
16917 unsigned i
, nelt
= d
->nelt
;
16918 unsigned HOST_WIDE_INT mask
;
16919 rtx target
, op0
, op1
, maskop
, x
;
16920 rtx rperm
[32], vperm
;
16922 if (d
->one_operand_p
)
16924 if (TARGET_AVX512F
&& GET_MODE_SIZE (vmode
) == 64
16925 && (TARGET_AVX512BW
16926 || GET_MODE_UNIT_SIZE (vmode
) >= 4))
16928 else if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
16930 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
16932 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
16937 /* This is a blend, not a permute. Elements must stay in their
16938 respective lanes. */
16939 for (i
= 0; i
< nelt
; ++i
)
16941 unsigned e
= d
->perm
[i
];
16942 if (!(e
== i
|| e
== i
+ nelt
))
16949 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16950 decision should be extracted elsewhere, so that we only try that
16951 sequence once all budget==3 options have been tried. */
16952 target
= d
->target
;
16971 for (i
= 0; i
< nelt
; ++i
)
16972 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
16976 for (i
= 0; i
< 2; ++i
)
16977 mask
|= (d
->perm
[i
] >= 2 ? 15 : 0) << (i
* 4);
16982 for (i
= 0; i
< 4; ++i
)
16983 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
16988 /* See if bytes move in pairs so we can use pblendw with
16989 an immediate argument, rather than pblendvb with a vector
16991 for (i
= 0; i
< 16; i
+= 2)
16992 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16995 for (i
= 0; i
< nelt
; ++i
)
16996 rperm
[i
] = (d
->perm
[i
] < nelt
? const0_rtx
: constm1_rtx
);
16999 vperm
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
17000 vperm
= force_reg (vmode
, vperm
);
17002 if (GET_MODE_SIZE (vmode
) == 16)
17003 emit_insn (gen_sse4_1_pblendvb (target
, op0
, op1
, vperm
));
17005 emit_insn (gen_avx2_pblendvb (target
, op0
, op1
, vperm
));
17006 if (target
!= d
->target
)
17007 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17011 for (i
= 0; i
< 8; ++i
)
17012 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
17017 target
= gen_reg_rtx (vmode
);
17018 op0
= gen_lowpart (vmode
, op0
);
17019 op1
= gen_lowpart (vmode
, op1
);
17023 /* See if bytes move in pairs. If not, vpblendvb must be used. */
17024 for (i
= 0; i
< 32; i
+= 2)
17025 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
17027 /* See if bytes move in quadruplets. If yes, vpblendd
17028 with immediate can be used. */
17029 for (i
= 0; i
< 32; i
+= 4)
17030 if (d
->perm
[i
] + 2 != d
->perm
[i
+ 2])
17034 /* See if bytes move the same in both lanes. If yes,
17035 vpblendw with immediate can be used. */
17036 for (i
= 0; i
< 16; i
+= 2)
17037 if (d
->perm
[i
] + 16 != d
->perm
[i
+ 16])
17040 /* Use vpblendw. */
17041 for (i
= 0; i
< 16; ++i
)
17042 mask
|= (d
->perm
[i
* 2] >= 32) << i
;
17047 /* Use vpblendd. */
17048 for (i
= 0; i
< 8; ++i
)
17049 mask
|= (d
->perm
[i
* 4] >= 32) << i
;
17054 /* See if words move in pairs. If yes, vpblendd can be used. */
17055 for (i
= 0; i
< 16; i
+= 2)
17056 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
17060 /* See if words move the same in both lanes. If not,
17061 vpblendvb must be used. */
17062 for (i
= 0; i
< 8; i
++)
17063 if (d
->perm
[i
] + 8 != d
->perm
[i
+ 8])
17065 /* Use vpblendvb. */
17066 for (i
= 0; i
< 32; ++i
)
17067 rperm
[i
] = (d
->perm
[i
/ 2] < 16 ? const0_rtx
: constm1_rtx
);
17071 target
= gen_reg_rtx (vmode
);
17072 op0
= gen_lowpart (vmode
, op0
);
17073 op1
= gen_lowpart (vmode
, op1
);
17074 goto finish_pblendvb
;
17077 /* Use vpblendw. */
17078 for (i
= 0; i
< 16; ++i
)
17079 mask
|= (d
->perm
[i
] >= 16) << i
;
17083 /* Use vpblendd. */
17084 for (i
= 0; i
< 8; ++i
)
17085 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
17090 /* Use vpblendd. */
17091 for (i
= 0; i
< 4; ++i
)
17092 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
17097 gcc_unreachable ();
17120 if (mmode
!= VOIDmode
)
17121 maskop
= force_reg (mmode
, gen_int_mode (mask
, mmode
));
17123 maskop
= GEN_INT (mask
);
17125 /* This matches five different patterns with the different modes. */
17126 x
= gen_rtx_VEC_MERGE (vmode
, op1
, op0
, maskop
);
17127 x
= gen_rtx_SET (target
, x
);
17129 if (target
!= d
->target
)
17130 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17135 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17136 in terms of the variable form of vpermilps.
17138 Note that we will have already failed the immediate input vpermilps,
17139 which requires that the high and low part shuffle be identical; the
17140 variable form doesn't require that. */
17143 expand_vec_perm_vpermil (struct expand_vec_perm_d
*d
)
17145 rtx rperm
[8], vperm
;
17148 if (!TARGET_AVX
|| d
->vmode
!= V8SFmode
|| !d
->one_operand_p
)
17151 /* We can only permute within the 128-bit lane. */
17152 for (i
= 0; i
< 8; ++i
)
17154 unsigned e
= d
->perm
[i
];
17155 if (i
< 4 ? e
>= 4 : e
< 4)
17162 for (i
= 0; i
< 8; ++i
)
17164 unsigned e
= d
->perm
[i
];
17166 /* Within each 128-bit lane, the elements of op0 are numbered
17167 from 0 and the elements of op1 are numbered from 4. */
17173 rperm
[i
] = GEN_INT (e
);
17176 vperm
= gen_rtx_CONST_VECTOR (V8SImode
, gen_rtvec_v (8, rperm
));
17177 vperm
= force_reg (V8SImode
, vperm
);
17178 emit_insn (gen_avx_vpermilvarv8sf3 (d
->target
, d
->op0
, vperm
));
17183 /* Return true if permutation D can be performed as VMODE permutation
17187 valid_perm_using_mode_p (machine_mode vmode
, struct expand_vec_perm_d
*d
)
17189 unsigned int i
, j
, chunk
;
17191 if (GET_MODE_CLASS (vmode
) != MODE_VECTOR_INT
17192 || GET_MODE_CLASS (d
->vmode
) != MODE_VECTOR_INT
17193 || GET_MODE_SIZE (vmode
) != GET_MODE_SIZE (d
->vmode
))
17196 if (GET_MODE_NUNITS (vmode
) >= d
->nelt
)
17199 chunk
= d
->nelt
/ GET_MODE_NUNITS (vmode
);
17200 for (i
= 0; i
< d
->nelt
; i
+= chunk
)
17201 if (d
->perm
[i
] & (chunk
- 1))
17204 for (j
= 1; j
< chunk
; ++j
)
17205 if (d
->perm
[i
] + j
!= d
->perm
[i
+ j
])
17211 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17212 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
17215 expand_vec_perm_pshufb (struct expand_vec_perm_d
*d
)
17217 unsigned i
, nelt
, eltsz
, mask
;
17218 unsigned char perm
[64];
17219 machine_mode vmode
= V16QImode
;
17220 rtx rperm
[64], vperm
, target
, op0
, op1
;
17224 if (!d
->one_operand_p
)
17226 if (!TARGET_XOP
|| GET_MODE_SIZE (d
->vmode
) != 16)
17229 && valid_perm_using_mode_p (V2TImode
, d
))
17234 /* Use vperm2i128 insn. The pattern uses
17235 V4DImode instead of V2TImode. */
17236 target
= d
->target
;
17237 if (d
->vmode
!= V4DImode
)
17238 target
= gen_reg_rtx (V4DImode
);
17239 op0
= gen_lowpart (V4DImode
, d
->op0
);
17240 op1
= gen_lowpart (V4DImode
, d
->op1
);
17242 = GEN_INT ((d
->perm
[0] / (nelt
/ 2))
17243 | ((d
->perm
[nelt
/ 2] / (nelt
/ 2)) * 16));
17244 emit_insn (gen_avx2_permv2ti (target
, op0
, op1
, rperm
[0]));
17245 if (target
!= d
->target
)
17246 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17254 if (GET_MODE_SIZE (d
->vmode
) == 16)
17259 else if (GET_MODE_SIZE (d
->vmode
) == 32)
17264 /* V4DImode should be already handled through
17265 expand_vselect by vpermq instruction. */
17266 gcc_assert (d
->vmode
!= V4DImode
);
17269 if (d
->vmode
== V8SImode
17270 || d
->vmode
== V16HImode
17271 || d
->vmode
== V32QImode
)
17273 /* First see if vpermq can be used for
17274 V8SImode/V16HImode/V32QImode. */
17275 if (valid_perm_using_mode_p (V4DImode
, d
))
17277 for (i
= 0; i
< 4; i
++)
17278 perm
[i
] = (d
->perm
[i
* nelt
/ 4] * 4 / nelt
) & 3;
17281 target
= gen_reg_rtx (V4DImode
);
17282 if (expand_vselect (target
, gen_lowpart (V4DImode
, d
->op0
),
17285 emit_move_insn (d
->target
,
17286 gen_lowpart (d
->vmode
, target
));
17292 /* Next see if vpermd can be used. */
17293 if (valid_perm_using_mode_p (V8SImode
, d
))
17296 /* Or if vpermps can be used. */
17297 else if (d
->vmode
== V8SFmode
)
17300 if (vmode
== V32QImode
)
17302 /* vpshufb only works intra lanes, it is not
17303 possible to shuffle bytes in between the lanes. */
17304 for (i
= 0; i
< nelt
; ++i
)
17305 if ((d
->perm
[i
] ^ i
) & (nelt
/ 2))
17309 else if (GET_MODE_SIZE (d
->vmode
) == 64)
17311 if (!TARGET_AVX512BW
)
17314 /* If vpermq didn't work, vpshufb won't work either. */
17315 if (d
->vmode
== V8DFmode
|| d
->vmode
== V8DImode
)
17319 if (d
->vmode
== V16SImode
17320 || d
->vmode
== V32HImode
17321 || d
->vmode
== V64QImode
)
17323 /* First see if vpermq can be used for
17324 V16SImode/V32HImode/V64QImode. */
17325 if (valid_perm_using_mode_p (V8DImode
, d
))
17327 for (i
= 0; i
< 8; i
++)
17328 perm
[i
] = (d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7;
17331 target
= gen_reg_rtx (V8DImode
);
17332 if (expand_vselect (target
, gen_lowpart (V8DImode
, d
->op0
),
17335 emit_move_insn (d
->target
,
17336 gen_lowpart (d
->vmode
, target
));
17342 /* Next see if vpermd can be used. */
17343 if (valid_perm_using_mode_p (V16SImode
, d
))
17346 /* Or if vpermps can be used. */
17347 else if (d
->vmode
== V16SFmode
)
17349 if (vmode
== V64QImode
)
17351 /* vpshufb only works intra lanes, it is not
17352 possible to shuffle bytes in between the lanes. */
17353 for (i
= 0; i
< nelt
; ++i
)
17354 if ((d
->perm
[i
] ^ i
) & (3 * nelt
/ 4))
17365 if (vmode
== V8SImode
)
17366 for (i
= 0; i
< 8; ++i
)
17367 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7);
17368 else if (vmode
== V16SImode
)
17369 for (i
= 0; i
< 16; ++i
)
17370 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 16] * 16 / nelt
) & 15);
17373 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
17374 if (!d
->one_operand_p
)
17375 mask
= 2 * nelt
- 1;
17376 else if (vmode
== V16QImode
)
17378 else if (vmode
== V64QImode
)
17379 mask
= nelt
/ 4 - 1;
17381 mask
= nelt
/ 2 - 1;
17383 for (i
= 0; i
< nelt
; ++i
)
17385 unsigned j
, e
= d
->perm
[i
] & mask
;
17386 for (j
= 0; j
< eltsz
; ++j
)
17387 rperm
[i
* eltsz
+ j
] = GEN_INT (e
* eltsz
+ j
);
17391 vperm
= gen_rtx_CONST_VECTOR (vmode
,
17392 gen_rtvec_v (GET_MODE_NUNITS (vmode
), rperm
));
17393 vperm
= force_reg (vmode
, vperm
);
17395 target
= d
->target
;
17396 if (d
->vmode
!= vmode
)
17397 target
= gen_reg_rtx (vmode
);
17398 op0
= gen_lowpart (vmode
, d
->op0
);
17399 if (d
->one_operand_p
)
17401 if (vmode
== V16QImode
)
17402 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, vperm
));
17403 else if (vmode
== V32QImode
)
17404 emit_insn (gen_avx2_pshufbv32qi3 (target
, op0
, vperm
));
17405 else if (vmode
== V64QImode
)
17406 emit_insn (gen_avx512bw_pshufbv64qi3 (target
, op0
, vperm
));
17407 else if (vmode
== V8SFmode
)
17408 emit_insn (gen_avx2_permvarv8sf (target
, op0
, vperm
));
17409 else if (vmode
== V8SImode
)
17410 emit_insn (gen_avx2_permvarv8si (target
, op0
, vperm
));
17411 else if (vmode
== V16SFmode
)
17412 emit_insn (gen_avx512f_permvarv16sf (target
, op0
, vperm
));
17413 else if (vmode
== V16SImode
)
17414 emit_insn (gen_avx512f_permvarv16si (target
, op0
, vperm
));
17416 gcc_unreachable ();
17420 op1
= gen_lowpart (vmode
, d
->op1
);
17421 emit_insn (gen_xop_pperm (target
, op0
, op1
, vperm
));
17423 if (target
!= d
->target
)
17424 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17429 /* For V*[QHS]Imode permutations, check if the same permutation
17430 can't be performed in a 2x, 4x or 8x wider inner mode. */
17433 canonicalize_vector_int_perm (const struct expand_vec_perm_d
*d
,
17434 struct expand_vec_perm_d
*nd
)
17437 machine_mode mode
= VOIDmode
;
17441 case E_V16QImode
: mode
= V8HImode
; break;
17442 case E_V32QImode
: mode
= V16HImode
; break;
17443 case E_V64QImode
: mode
= V32HImode
; break;
17444 case E_V8HImode
: mode
= V4SImode
; break;
17445 case E_V16HImode
: mode
= V8SImode
; break;
17446 case E_V32HImode
: mode
= V16SImode
; break;
17447 case E_V4SImode
: mode
= V2DImode
; break;
17448 case E_V8SImode
: mode
= V4DImode
; break;
17449 case E_V16SImode
: mode
= V8DImode
; break;
17450 default: return false;
17452 for (i
= 0; i
< d
->nelt
; i
+= 2)
17453 if ((d
->perm
[i
] & 1) || d
->perm
[i
+ 1] != d
->perm
[i
] + 1)
17456 nd
->nelt
= d
->nelt
/ 2;
17457 for (i
= 0; i
< nd
->nelt
; i
++)
17458 nd
->perm
[i
] = d
->perm
[2 * i
] / 2;
17459 if (GET_MODE_INNER (mode
) != DImode
)
17460 canonicalize_vector_int_perm (nd
, nd
);
17463 nd
->one_operand_p
= d
->one_operand_p
;
17464 nd
->testing_p
= d
->testing_p
;
17465 if (d
->op0
== d
->op1
)
17466 nd
->op0
= nd
->op1
= gen_lowpart (nd
->vmode
, d
->op0
);
17469 nd
->op0
= gen_lowpart (nd
->vmode
, d
->op0
);
17470 nd
->op1
= gen_lowpart (nd
->vmode
, d
->op1
);
17473 nd
->target
= gen_raw_REG (nd
->vmode
, LAST_VIRTUAL_REGISTER
+ 1);
17475 nd
->target
= gen_reg_rtx (nd
->vmode
);
17480 /* Try to expand one-operand permutation with constant mask. */
17483 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d
*d
)
17485 machine_mode mode
= GET_MODE (d
->op0
);
17486 machine_mode maskmode
= mode
;
17487 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
17488 rtx target
, op0
, mask
;
17491 if (!rtx_equal_p (d
->op0
, d
->op1
))
17494 if (!TARGET_AVX512F
)
17500 gen
= gen_avx512f_permvarv16si
;
17503 gen
= gen_avx512f_permvarv16sf
;
17504 maskmode
= V16SImode
;
17507 gen
= gen_avx512f_permvarv8di
;
17510 gen
= gen_avx512f_permvarv8df
;
17511 maskmode
= V8DImode
;
17517 target
= d
->target
;
17519 for (int i
= 0; i
< d
->nelt
; ++i
)
17520 vec
[i
] = GEN_INT (d
->perm
[i
]);
17521 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
17522 emit_insn (gen (target
, op0
, force_reg (maskmode
, mask
)));
17526 static bool expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool);
17528 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
17529 in a single instruction. */
17532 expand_vec_perm_1 (struct expand_vec_perm_d
*d
)
17534 unsigned i
, nelt
= d
->nelt
;
17535 struct expand_vec_perm_d nd
;
17537 /* Check plain VEC_SELECT first, because AVX has instructions that could
17538 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17539 input where SEL+CONCAT may not. */
17540 if (d
->one_operand_p
)
17542 int mask
= nelt
- 1;
17543 bool identity_perm
= true;
17544 bool broadcast_perm
= true;
17546 for (i
= 0; i
< nelt
; i
++)
17548 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17549 if (nd
.perm
[i
] != i
)
17550 identity_perm
= false;
17552 broadcast_perm
= false;
17558 emit_move_insn (d
->target
, d
->op0
);
17561 else if (broadcast_perm
&& TARGET_AVX2
)
17563 /* Use vpbroadcast{b,w,d}. */
17564 rtx (*gen
) (rtx
, rtx
) = NULL
;
17568 if (TARGET_AVX512BW
)
17569 gen
= gen_avx512bw_vec_dupv64qi_1
;
17572 gen
= gen_avx2_pbroadcastv32qi_1
;
17575 if (TARGET_AVX512BW
)
17576 gen
= gen_avx512bw_vec_dupv32hi_1
;
17579 gen
= gen_avx2_pbroadcastv16hi_1
;
17582 if (TARGET_AVX512F
)
17583 gen
= gen_avx512f_vec_dupv16si_1
;
17586 gen
= gen_avx2_pbroadcastv8si_1
;
17589 gen
= gen_avx2_pbroadcastv16qi
;
17592 gen
= gen_avx2_pbroadcastv8hi
;
17595 if (TARGET_AVX512F
)
17596 gen
= gen_avx512f_vec_dupv16sf_1
;
17599 gen
= gen_avx2_vec_dupv8sf_1
;
17602 if (TARGET_AVX512F
)
17603 gen
= gen_avx512f_vec_dupv8df_1
;
17606 if (TARGET_AVX512F
)
17607 gen
= gen_avx512f_vec_dupv8di_1
;
17609 /* For other modes prefer other shuffles this function creates. */
17615 emit_insn (gen (d
->target
, d
->op0
));
17620 if (expand_vselect (d
->target
, d
->op0
, nd
.perm
, nelt
, d
->testing_p
))
17623 /* There are plenty of patterns in sse.md that are written for
17624 SEL+CONCAT and are not replicated for a single op. Perhaps
17625 that should be changed, to avoid the nastiness here. */
17627 /* Recognize interleave style patterns, which means incrementing
17628 every other permutation operand. */
17629 for (i
= 0; i
< nelt
; i
+= 2)
17631 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17632 nd
.perm
[i
+ 1] = (d
->perm
[i
+ 1] & mask
) + nelt
;
17634 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17638 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17641 for (i
= 0; i
< nelt
; i
+= 4)
17643 nd
.perm
[i
+ 0] = d
->perm
[i
+ 0] & mask
;
17644 nd
.perm
[i
+ 1] = d
->perm
[i
+ 1] & mask
;
17645 nd
.perm
[i
+ 2] = (d
->perm
[i
+ 2] & mask
) + nelt
;
17646 nd
.perm
[i
+ 3] = (d
->perm
[i
+ 3] & mask
) + nelt
;
17649 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17655 /* Try movss/movsd instructions. */
17656 if (expand_vec_perm_movs (d
))
17659 /* Finally, try the fully general two operand permute. */
17660 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op1
, d
->perm
, nelt
,
17664 /* Recognize interleave style patterns with reversed operands. */
17665 if (!d
->one_operand_p
)
17667 for (i
= 0; i
< nelt
; ++i
)
17669 unsigned e
= d
->perm
[i
];
17677 if (expand_vselect_vconcat (d
->target
, d
->op1
, d
->op0
, nd
.perm
, nelt
,
17682 /* Try the SSE4.1 blend variable merge instructions. */
17683 if (expand_vec_perm_blend (d
))
17686 /* Try one of the AVX vpermil variable permutations. */
17687 if (expand_vec_perm_vpermil (d
))
17690 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17691 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17692 if (expand_vec_perm_pshufb (d
))
17695 /* Try the AVX2 vpalignr instruction. */
17696 if (expand_vec_perm_palignr (d
, true))
17699 /* Try the AVX512F vperm{s,d} instructions. */
17700 if (ix86_expand_vec_one_operand_perm_avx512 (d
))
17703 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17704 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX
, NULL_RTX
, NULL_RTX
, NULL_RTX
, d
))
17707 /* See if we can get the same permutation in different vector integer
17709 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
17712 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
17718 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17719 in terms of a pair of pshuflw + pshufhw instructions. */
17722 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d
*d
)
17724 unsigned char perm2
[MAX_VECT_LEN
];
17728 if (d
->vmode
!= V8HImode
|| !d
->one_operand_p
)
17731 /* The two permutations only operate in 64-bit lanes. */
17732 for (i
= 0; i
< 4; ++i
)
17733 if (d
->perm
[i
] >= 4)
17735 for (i
= 4; i
< 8; ++i
)
17736 if (d
->perm
[i
] < 4)
17742 /* Emit the pshuflw. */
17743 memcpy (perm2
, d
->perm
, 4);
17744 for (i
= 4; i
< 8; ++i
)
17746 ok
= expand_vselect (d
->target
, d
->op0
, perm2
, 8, d
->testing_p
);
17749 /* Emit the pshufhw. */
17750 memcpy (perm2
+ 4, d
->perm
+ 4, 4);
17751 for (i
= 0; i
< 4; ++i
)
17753 ok
= expand_vselect (d
->target
, d
->target
, perm2
, 8, d
->testing_p
);
17759 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17760 the permutation using the SSSE3 palignr instruction. This succeeds
17761 when all of the elements in PERM fit within one vector and we merely
17762 need to shift them down so that a single vector permutation has a
17763 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
17764 the vpalignr instruction itself can perform the requested permutation. */
17767 expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool single_insn_only_p
)
17769 unsigned i
, nelt
= d
->nelt
;
17770 unsigned min
, max
, minswap
, maxswap
;
17771 bool in_order
, ok
, swap
= false;
17773 struct expand_vec_perm_d dcopy
;
17775 /* Even with AVX, palignr only operates on 128-bit vectors,
17776 in AVX2 palignr operates on both 128-bit lanes. */
17777 if ((!TARGET_SSSE3
|| GET_MODE_SIZE (d
->vmode
) != 16)
17778 && (!TARGET_AVX2
|| GET_MODE_SIZE (d
->vmode
) != 32))
17783 minswap
= 2 * nelt
;
17785 for (i
= 0; i
< nelt
; ++i
)
17787 unsigned e
= d
->perm
[i
];
17788 unsigned eswap
= d
->perm
[i
] ^ nelt
;
17789 if (GET_MODE_SIZE (d
->vmode
) == 32)
17791 e
= (e
& ((nelt
/ 2) - 1)) | ((e
& nelt
) >> 1);
17792 eswap
= e
^ (nelt
/ 2);
17798 if (eswap
< minswap
)
17800 if (eswap
> maxswap
)
17804 || max
- min
>= (GET_MODE_SIZE (d
->vmode
) == 32 ? nelt
/ 2 : nelt
))
17806 if (d
->one_operand_p
17808 || maxswap
- minswap
>= (GET_MODE_SIZE (d
->vmode
) == 32
17809 ? nelt
/ 2 : nelt
))
17816 /* Given that we have SSSE3, we know we'll be able to implement the
17817 single operand permutation after the palignr with pshufb for
17818 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17820 if (d
->testing_p
&& GET_MODE_SIZE (d
->vmode
) == 16 && !single_insn_only_p
)
17826 dcopy
.op0
= d
->op1
;
17827 dcopy
.op1
= d
->op0
;
17828 for (i
= 0; i
< nelt
; ++i
)
17829 dcopy
.perm
[i
] ^= nelt
;
17833 for (i
= 0; i
< nelt
; ++i
)
17835 unsigned e
= dcopy
.perm
[i
];
17836 if (GET_MODE_SIZE (d
->vmode
) == 32
17838 && (e
& (nelt
/ 2 - 1)) < min
)
17839 e
= e
- min
- (nelt
/ 2);
17846 dcopy
.one_operand_p
= true;
17848 if (single_insn_only_p
&& !in_order
)
17851 /* For AVX2, test whether we can permute the result in one instruction. */
17856 dcopy
.op1
= dcopy
.op0
;
17857 return expand_vec_perm_1 (&dcopy
);
17860 shift
= GEN_INT (min
* GET_MODE_UNIT_BITSIZE (d
->vmode
));
17861 if (GET_MODE_SIZE (d
->vmode
) == 16)
17863 target
= gen_reg_rtx (TImode
);
17864 emit_insn (gen_ssse3_palignrti (target
, gen_lowpart (TImode
, dcopy
.op1
),
17865 gen_lowpart (TImode
, dcopy
.op0
), shift
));
17869 target
= gen_reg_rtx (V2TImode
);
17870 emit_insn (gen_avx2_palignrv2ti (target
,
17871 gen_lowpart (V2TImode
, dcopy
.op1
),
17872 gen_lowpart (V2TImode
, dcopy
.op0
),
17876 dcopy
.op0
= dcopy
.op1
= gen_lowpart (d
->vmode
, target
);
17878 /* Test for the degenerate case where the alignment by itself
17879 produces the desired permutation. */
17882 emit_move_insn (d
->target
, dcopy
.op0
);
17886 ok
= expand_vec_perm_1 (&dcopy
);
17887 gcc_assert (ok
|| GET_MODE_SIZE (d
->vmode
) == 32);
17892 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17893 the permutation using the SSE4_1 pblendv instruction. Potentially
17894 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
17897 expand_vec_perm_pblendv (struct expand_vec_perm_d
*d
)
17899 unsigned i
, which
, nelt
= d
->nelt
;
17900 struct expand_vec_perm_d dcopy
, dcopy1
;
17901 machine_mode vmode
= d
->vmode
;
17904 /* Use the same checks as in expand_vec_perm_blend. */
17905 if (d
->one_operand_p
)
17907 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
17909 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
17911 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
17916 /* Figure out where permutation elements stay not in their
17917 respective lanes. */
17918 for (i
= 0, which
= 0; i
< nelt
; ++i
)
17920 unsigned e
= d
->perm
[i
];
17922 which
|= (e
< nelt
? 1 : 2);
17924 /* We can pblend the part where elements stay not in their
17925 respective lanes only when these elements are all in one
17926 half of a permutation.
17927 {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
17928 lanes, but both 8 and 9 >= 8
17929 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
17930 respective lanes and 8 >= 8, but 2 not. */
17931 if (which
!= 1 && which
!= 2)
17933 if (d
->testing_p
&& GET_MODE_SIZE (vmode
) == 16)
17936 /* First we apply one operand permutation to the part where
17937 elements stay not in their respective lanes. */
17940 dcopy
.op0
= dcopy
.op1
= d
->op1
;
17942 dcopy
.op0
= dcopy
.op1
= d
->op0
;
17944 dcopy
.target
= gen_reg_rtx (vmode
);
17945 dcopy
.one_operand_p
= true;
17947 for (i
= 0; i
< nelt
; ++i
)
17948 dcopy
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
17950 ok
= expand_vec_perm_1 (&dcopy
);
17951 if (GET_MODE_SIZE (vmode
) != 16 && !ok
)
17958 /* Next we put permuted elements into their positions. */
17961 dcopy1
.op1
= dcopy
.target
;
17963 dcopy1
.op0
= dcopy
.target
;
17965 for (i
= 0; i
< nelt
; ++i
)
17966 dcopy1
.perm
[i
] = ((d
->perm
[i
] >= nelt
) ? (nelt
+ i
) : i
);
17968 ok
= expand_vec_perm_blend (&dcopy1
);
17974 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
);
17976 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17977 a two vector permutation into a single vector permutation by using
17978 an interleave operation to merge the vectors. */
17981 expand_vec_perm_interleave2 (struct expand_vec_perm_d
*d
)
17983 struct expand_vec_perm_d dremap
, dfinal
;
17984 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
17985 unsigned HOST_WIDE_INT contents
;
17986 unsigned char remap
[2 * MAX_VECT_LEN
];
17988 bool ok
, same_halves
= false;
17990 if (GET_MODE_SIZE (d
->vmode
) == 16)
17992 if (d
->one_operand_p
)
17995 else if (GET_MODE_SIZE (d
->vmode
) == 32)
17999 /* For 32-byte modes allow even d->one_operand_p.
18000 The lack of cross-lane shuffling in some instructions
18001 might prevent a single insn shuffle. */
18003 dfinal
.testing_p
= true;
18004 /* If expand_vec_perm_interleave3 can expand this into
18005 a 3 insn sequence, give up and let it be expanded as
18006 3 insn sequence. While that is one insn longer,
18007 it doesn't need a memory operand and in the common
18008 case that both interleave low and high permutations
18009 with the same operands are adjacent needs 4 insns
18010 for both after CSE. */
18011 if (expand_vec_perm_interleave3 (&dfinal
))
18017 /* Examine from whence the elements come. */
18019 for (i
= 0; i
< nelt
; ++i
)
18020 contents
|= HOST_WIDE_INT_1U
<< d
->perm
[i
];
18022 memset (remap
, 0xff, sizeof (remap
));
18025 if (GET_MODE_SIZE (d
->vmode
) == 16)
18027 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
18029 /* Split the two input vectors into 4 halves. */
18030 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
18035 /* If the elements from the low halves use interleave low, and similarly
18036 for interleave high. If the elements are from mis-matched halves, we
18037 can use shufps for V4SF/V4SI or do a DImode shuffle. */
18038 if ((contents
& (h1
| h3
)) == contents
)
18041 for (i
= 0; i
< nelt2
; ++i
)
18044 remap
[i
+ nelt
] = i
* 2 + 1;
18045 dremap
.perm
[i
* 2] = i
;
18046 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
18048 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
18049 dremap
.vmode
= V4SFmode
;
18051 else if ((contents
& (h2
| h4
)) == contents
)
18054 for (i
= 0; i
< nelt2
; ++i
)
18056 remap
[i
+ nelt2
] = i
* 2;
18057 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
18058 dremap
.perm
[i
* 2] = i
+ nelt2
;
18059 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
18061 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
18062 dremap
.vmode
= V4SFmode
;
18064 else if ((contents
& (h1
| h4
)) == contents
)
18067 for (i
= 0; i
< nelt2
; ++i
)
18070 remap
[i
+ nelt
+ nelt2
] = i
+ nelt2
;
18071 dremap
.perm
[i
] = i
;
18072 dremap
.perm
[i
+ nelt2
] = i
+ nelt
+ nelt2
;
18077 dremap
.vmode
= V2DImode
;
18079 dremap
.perm
[0] = 0;
18080 dremap
.perm
[1] = 3;
18083 else if ((contents
& (h2
| h3
)) == contents
)
18086 for (i
= 0; i
< nelt2
; ++i
)
18088 remap
[i
+ nelt2
] = i
;
18089 remap
[i
+ nelt
] = i
+ nelt2
;
18090 dremap
.perm
[i
] = i
+ nelt2
;
18091 dremap
.perm
[i
+ nelt2
] = i
+ nelt
;
18096 dremap
.vmode
= V2DImode
;
18098 dremap
.perm
[0] = 1;
18099 dremap
.perm
[1] = 2;
18107 unsigned int nelt4
= nelt
/ 4, nzcnt
= 0;
18108 unsigned HOST_WIDE_INT q
[8];
18109 unsigned int nonzero_halves
[4];
18111 /* Split the two input vectors into 8 quarters. */
18112 q
[0] = (HOST_WIDE_INT_1U
<< nelt4
) - 1;
18113 for (i
= 1; i
< 8; ++i
)
18114 q
[i
] = q
[0] << (nelt4
* i
);
18115 for (i
= 0; i
< 4; ++i
)
18116 if (((q
[2 * i
] | q
[2 * i
+ 1]) & contents
) != 0)
18118 nonzero_halves
[nzcnt
] = i
;
18124 gcc_assert (d
->one_operand_p
);
18125 nonzero_halves
[1] = nonzero_halves
[0];
18126 same_halves
= true;
18128 else if (d
->one_operand_p
)
18130 gcc_assert (nonzero_halves
[0] == 0);
18131 gcc_assert (nonzero_halves
[1] == 1);
18136 if (d
->perm
[0] / nelt2
== nonzero_halves
[1])
18138 /* Attempt to increase the likelihood that dfinal
18139 shuffle will be intra-lane. */
18140 std::swap (nonzero_halves
[0], nonzero_halves
[1]);
18143 /* vperm2f128 or vperm2i128. */
18144 for (i
= 0; i
< nelt2
; ++i
)
18146 remap
[i
+ nonzero_halves
[1] * nelt2
] = i
+ nelt2
;
18147 remap
[i
+ nonzero_halves
[0] * nelt2
] = i
;
18148 dremap
.perm
[i
+ nelt2
] = i
+ nonzero_halves
[1] * nelt2
;
18149 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * nelt2
;
18152 if (d
->vmode
!= V8SFmode
18153 && d
->vmode
!= V4DFmode
18154 && d
->vmode
!= V8SImode
)
18156 dremap
.vmode
= V8SImode
;
18158 for (i
= 0; i
< 4; ++i
)
18160 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * 4;
18161 dremap
.perm
[i
+ 4] = i
+ nonzero_halves
[1] * 4;
18165 else if (d
->one_operand_p
)
18167 else if (TARGET_AVX2
18168 && (contents
& (q
[0] | q
[2] | q
[4] | q
[6])) == contents
)
18171 for (i
= 0; i
< nelt4
; ++i
)
18174 remap
[i
+ nelt
] = i
* 2 + 1;
18175 remap
[i
+ nelt2
] = i
* 2 + nelt2
;
18176 remap
[i
+ nelt
+ nelt2
] = i
* 2 + nelt2
+ 1;
18177 dremap
.perm
[i
* 2] = i
;
18178 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
18179 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
;
18180 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
;
18183 else if (TARGET_AVX2
18184 && (contents
& (q
[1] | q
[3] | q
[5] | q
[7])) == contents
)
18187 for (i
= 0; i
< nelt4
; ++i
)
18189 remap
[i
+ nelt4
] = i
* 2;
18190 remap
[i
+ nelt
+ nelt4
] = i
* 2 + 1;
18191 remap
[i
+ nelt2
+ nelt4
] = i
* 2 + nelt2
;
18192 remap
[i
+ nelt
+ nelt2
+ nelt4
] = i
* 2 + nelt2
+ 1;
18193 dremap
.perm
[i
* 2] = i
+ nelt4
;
18194 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt4
;
18195 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
+ nelt4
;
18196 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
+ nelt4
;
18203 /* Use the remapping array set up above to move the elements from their
18204 swizzled locations into their final destinations. */
18206 for (i
= 0; i
< nelt
; ++i
)
18208 unsigned e
= remap
[d
->perm
[i
]];
18209 gcc_assert (e
< nelt
);
18210 /* If same_halves is true, both halves of the remapped vector are the
18211 same. Avoid cross-lane accesses if possible. */
18212 if (same_halves
&& i
>= nelt2
)
18214 gcc_assert (e
< nelt2
);
18215 dfinal
.perm
[i
] = e
+ nelt2
;
18218 dfinal
.perm
[i
] = e
;
18222 dremap
.target
= gen_reg_rtx (dremap
.vmode
);
18223 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
18225 dfinal
.op1
= dfinal
.op0
;
18226 dfinal
.one_operand_p
= true;
18228 /* Test if the final remap can be done with a single insn. For V4SFmode or
18229 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
18231 ok
= expand_vec_perm_1 (&dfinal
);
18232 seq
= get_insns ();
18241 if (dremap
.vmode
!= dfinal
.vmode
)
18243 dremap
.op0
= gen_lowpart (dremap
.vmode
, dremap
.op0
);
18244 dremap
.op1
= gen_lowpart (dremap
.vmode
, dremap
.op1
);
18247 ok
= expand_vec_perm_1 (&dremap
);
18254 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18255 a single vector cross-lane permutation into vpermq followed
18256 by any of the single insn permutations. */
18259 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d
*d
)
18261 struct expand_vec_perm_d dremap
, dfinal
;
18262 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, nelt4
= nelt
/ 4;
18263 unsigned contents
[2];
18267 && (d
->vmode
== V32QImode
|| d
->vmode
== V16HImode
)
18268 && d
->one_operand_p
))
18273 for (i
= 0; i
< nelt2
; ++i
)
18275 contents
[0] |= 1u << (d
->perm
[i
] / nelt4
);
18276 contents
[1] |= 1u << (d
->perm
[i
+ nelt2
] / nelt4
);
18279 for (i
= 0; i
< 2; ++i
)
18281 unsigned int cnt
= 0;
18282 for (j
= 0; j
< 4; ++j
)
18283 if ((contents
[i
] & (1u << j
)) != 0 && ++cnt
> 2)
18291 dremap
.vmode
= V4DImode
;
18293 dremap
.target
= gen_reg_rtx (V4DImode
);
18294 dremap
.op0
= gen_lowpart (V4DImode
, d
->op0
);
18295 dremap
.op1
= dremap
.op0
;
18296 dremap
.one_operand_p
= true;
18297 for (i
= 0; i
< 2; ++i
)
18299 unsigned int cnt
= 0;
18300 for (j
= 0; j
< 4; ++j
)
18301 if ((contents
[i
] & (1u << j
)) != 0)
18302 dremap
.perm
[2 * i
+ cnt
++] = j
;
18303 for (; cnt
< 2; ++cnt
)
18304 dremap
.perm
[2 * i
+ cnt
] = 0;
18308 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
18309 dfinal
.op1
= dfinal
.op0
;
18310 dfinal
.one_operand_p
= true;
18311 for (i
= 0, j
= 0; i
< nelt
; ++i
)
18315 dfinal
.perm
[i
] = (d
->perm
[i
] & (nelt4
- 1)) | (j
? nelt2
: 0);
18316 if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
])
18318 else if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
+ 1])
18319 dfinal
.perm
[i
] |= nelt4
;
18321 gcc_unreachable ();
18324 ok
= expand_vec_perm_1 (&dremap
);
18327 ok
= expand_vec_perm_1 (&dfinal
);
18333 static bool canonicalize_perm (struct expand_vec_perm_d
*d
);
18335 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
18336 a vector permutation using two instructions, vperm2f128 resp.
18337 vperm2i128 followed by any single in-lane permutation. */
18340 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d
*d
)
18342 struct expand_vec_perm_d dfirst
, dsecond
;
18343 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, perm
;
18347 || GET_MODE_SIZE (d
->vmode
) != 32
18348 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
))
18352 dsecond
.one_operand_p
= false;
18353 dsecond
.testing_p
= true;
18355 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
18356 immediate. For perm < 16 the second permutation uses
18357 d->op0 as first operand, for perm >= 16 it uses d->op1
18358 as first operand. The second operand is the result of
18360 for (perm
= 0; perm
< 32; perm
++)
18362 /* Ignore permutations which do not move anything cross-lane. */
18365 /* The second shuffle for e.g. V4DFmode has
18366 0123 and ABCD operands.
18367 Ignore AB23, as 23 is already in the second lane
18368 of the first operand. */
18369 if ((perm
& 0xc) == (1 << 2)) continue;
18370 /* And 01CD, as 01 is in the first lane of the first
18372 if ((perm
& 3) == 0) continue;
18373 /* And 4567, as then the vperm2[fi]128 doesn't change
18374 anything on the original 4567 second operand. */
18375 if ((perm
& 0xf) == ((3 << 2) | 2)) continue;
18379 /* The second shuffle for e.g. V4DFmode has
18380 4567 and ABCD operands.
18381 Ignore AB67, as 67 is already in the second lane
18382 of the first operand. */
18383 if ((perm
& 0xc) == (3 << 2)) continue;
18384 /* And 45CD, as 45 is in the first lane of the first
18386 if ((perm
& 3) == 2) continue;
18387 /* And 0123, as then the vperm2[fi]128 doesn't change
18388 anything on the original 0123 first operand. */
18389 if ((perm
& 0xf) == (1 << 2)) continue;
18392 for (i
= 0; i
< nelt
; i
++)
18394 j
= d
->perm
[i
] / nelt2
;
18395 if (j
== ((perm
>> (2 * (i
>= nelt2
))) & 3))
18396 dsecond
.perm
[i
] = nelt
+ (i
& nelt2
) + (d
->perm
[i
] & (nelt2
- 1));
18397 else if (j
== (unsigned) (i
>= nelt2
) + 2 * (perm
>= 16))
18398 dsecond
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
18406 ok
= expand_vec_perm_1 (&dsecond
);
18417 /* Found a usable second shuffle. dfirst will be
18418 vperm2f128 on d->op0 and d->op1. */
18419 dsecond
.testing_p
= false;
18421 dfirst
.target
= gen_reg_rtx (d
->vmode
);
18422 for (i
= 0; i
< nelt
; i
++)
18423 dfirst
.perm
[i
] = (i
& (nelt2
- 1))
18424 + ((perm
>> (2 * (i
>= nelt2
))) & 3) * nelt2
;
18426 canonicalize_perm (&dfirst
);
18427 ok
= expand_vec_perm_1 (&dfirst
);
18430 /* And dsecond is some single insn shuffle, taking
18431 d->op0 and result of vperm2f128 (if perm < 16) or
18432 d->op1 and result of vperm2f128 (otherwise). */
18434 dsecond
.op0
= dsecond
.op1
;
18435 dsecond
.op1
= dfirst
.target
;
18437 ok
= expand_vec_perm_1 (&dsecond
);
18443 /* For one operand, the only useful vperm2f128 permutation is 0x01
18445 if (d
->one_operand_p
)
18452 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18453 a two vector permutation using 2 intra-lane interleave insns
18454 and cross-lane shuffle for 32-byte vectors. */
18457 expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
)
18460 rtx (*gen
) (rtx
, rtx
, rtx
);
18462 if (d
->one_operand_p
)
18464 if (TARGET_AVX2
&& GET_MODE_SIZE (d
->vmode
) == 32)
18466 else if (TARGET_AVX
&& (d
->vmode
== V8SFmode
|| d
->vmode
== V4DFmode
))
18472 if (d
->perm
[0] != 0 && d
->perm
[0] != nelt
/ 2)
18474 for (i
= 0; i
< nelt
; i
+= 2)
18475 if (d
->perm
[i
] != d
->perm
[0] + i
/ 2
18476 || d
->perm
[i
+ 1] != d
->perm
[0] + i
/ 2 + nelt
)
18486 gen
= gen_vec_interleave_highv32qi
;
18488 gen
= gen_vec_interleave_lowv32qi
;
18492 gen
= gen_vec_interleave_highv16hi
;
18494 gen
= gen_vec_interleave_lowv16hi
;
18498 gen
= gen_vec_interleave_highv8si
;
18500 gen
= gen_vec_interleave_lowv8si
;
18504 gen
= gen_vec_interleave_highv4di
;
18506 gen
= gen_vec_interleave_lowv4di
;
18510 gen
= gen_vec_interleave_highv8sf
;
18512 gen
= gen_vec_interleave_lowv8sf
;
18516 gen
= gen_vec_interleave_highv4df
;
18518 gen
= gen_vec_interleave_lowv4df
;
18521 gcc_unreachable ();
18524 emit_insn (gen (d
->target
, d
->op0
, d
->op1
));
18528 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18529 a single vector permutation using a single intra-lane vector
18530 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18531 the non-swapped and swapped vectors together. */
18534 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
18536 struct expand_vec_perm_d dfirst
, dsecond
;
18537 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
18540 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
18544 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
18545 || !d
->one_operand_p
)
18549 for (i
= 0; i
< nelt
; i
++)
18550 dfirst
.perm
[i
] = 0xff;
18551 for (i
= 0, msk
= 0; i
< nelt
; i
++)
18553 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
18554 if (dfirst
.perm
[j
] != 0xff && dfirst
.perm
[j
] != d
->perm
[i
])
18556 dfirst
.perm
[j
] = d
->perm
[i
];
18560 for (i
= 0; i
< nelt
; i
++)
18561 if (dfirst
.perm
[i
] == 0xff)
18562 dfirst
.perm
[i
] = i
;
18565 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18568 ok
= expand_vec_perm_1 (&dfirst
);
18569 seq
= get_insns ();
18581 dsecond
.op0
= dfirst
.target
;
18582 dsecond
.op1
= dfirst
.target
;
18583 dsecond
.one_operand_p
= true;
18584 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18585 for (i
= 0; i
< nelt
; i
++)
18586 dsecond
.perm
[i
] = i
^ nelt2
;
18588 ok
= expand_vec_perm_1 (&dsecond
);
18591 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
18592 emit_insn (blend (d
->target
, dfirst
.target
, dsecond
.target
, GEN_INT (msk
)));
18596 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18597 permutation using two vperm2f128, followed by a vshufpd insn blending
18598 the two vectors together. */
18601 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d
*d
)
18603 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
18606 if (!TARGET_AVX
|| (d
->vmode
!= V4DFmode
))
18616 dfirst
.perm
[0] = (d
->perm
[0] & ~1);
18617 dfirst
.perm
[1] = (d
->perm
[0] & ~1) + 1;
18618 dfirst
.perm
[2] = (d
->perm
[2] & ~1);
18619 dfirst
.perm
[3] = (d
->perm
[2] & ~1) + 1;
18620 dsecond
.perm
[0] = (d
->perm
[1] & ~1);
18621 dsecond
.perm
[1] = (d
->perm
[1] & ~1) + 1;
18622 dsecond
.perm
[2] = (d
->perm
[3] & ~1);
18623 dsecond
.perm
[3] = (d
->perm
[3] & ~1) + 1;
18624 dthird
.perm
[0] = (d
->perm
[0] % 2);
18625 dthird
.perm
[1] = (d
->perm
[1] % 2) + 4;
18626 dthird
.perm
[2] = (d
->perm
[2] % 2) + 2;
18627 dthird
.perm
[3] = (d
->perm
[3] % 2) + 6;
18629 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18630 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18631 dthird
.op0
= dfirst
.target
;
18632 dthird
.op1
= dsecond
.target
;
18633 dthird
.one_operand_p
= false;
18635 canonicalize_perm (&dfirst
);
18636 canonicalize_perm (&dsecond
);
18638 ok
= expand_vec_perm_1 (&dfirst
)
18639 && expand_vec_perm_1 (&dsecond
)
18640 && expand_vec_perm_1 (&dthird
);
18647 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d
*);
18649 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18650 a two vector permutation using two intra-lane vector
18651 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18652 the non-swapped and swapped vectors together. */
18655 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
18657 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
18658 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, which1
= 0, which2
= 0;
18659 rtx_insn
*seq1
, *seq2
;
18661 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
18665 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
18666 || d
->one_operand_p
)
18671 for (i
= 0; i
< nelt
; i
++)
18673 dfirst
.perm
[i
] = 0xff;
18674 dsecond
.perm
[i
] = 0xff;
18676 for (i
= 0, msk
= 0; i
< nelt
; i
++)
18678 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
18681 dfirst
.perm
[j
] = d
->perm
[i
];
18682 which1
|= (d
->perm
[i
] < nelt
? 1 : 2);
18686 dsecond
.perm
[j
] = d
->perm
[i
];
18687 which2
|= (d
->perm
[i
] < nelt
? 1 : 2);
18691 if (msk
== 0 || msk
== (1U << nelt
) - 1)
18696 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18697 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18700 for (i
= 0; i
< nelt
; i
++)
18702 if (dfirst
.perm
[i
] == 0xff)
18703 dfirst
.perm
[i
] = (which1
== 2 ? i
+ nelt
: i
);
18704 if (dsecond
.perm
[i
] == 0xff)
18705 dsecond
.perm
[i
] = (which2
== 2 ? i
+ nelt
: i
);
18707 canonicalize_perm (&dfirst
);
18709 ok
= ix86_expand_vec_perm_const_1 (&dfirst
);
18710 seq1
= get_insns ();
18716 canonicalize_perm (&dsecond
);
18718 ok
= ix86_expand_vec_perm_const_1 (&dsecond
);
18719 seq2
= get_insns ();
18732 dthird
.op0
= dsecond
.target
;
18733 dthird
.op1
= dsecond
.target
;
18734 dthird
.one_operand_p
= true;
18735 dthird
.target
= gen_reg_rtx (dthird
.vmode
);
18736 for (i
= 0; i
< nelt
; i
++)
18737 dthird
.perm
[i
] = i
^ nelt2
;
18739 ok
= expand_vec_perm_1 (&dthird
);
18742 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
18743 emit_insn (blend (d
->target
, dfirst
.target
, dthird
.target
, GEN_INT (msk
)));
18747 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
18748 permutation with two pshufb insns and an ior. We should have already
18749 failed all two instruction sequences. */
18752 expand_vec_perm_pshufb2 (struct expand_vec_perm_d
*d
)
18754 rtx rperm
[2][16], vperm
, l
, h
, op
, m128
;
18755 unsigned int i
, nelt
, eltsz
;
18757 if (!TARGET_SSSE3
|| GET_MODE_SIZE (d
->vmode
) != 16)
18759 gcc_assert (!d
->one_operand_p
);
18765 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
18767 /* Generate two permutation masks. If the required element is within
18768 the given vector it is shuffled into the proper lane. If the required
18769 element is in the other vector, force a zero into the lane by setting
18770 bit 7 in the permutation mask. */
18771 m128
= GEN_INT (-128);
18772 for (i
= 0; i
< nelt
; ++i
)
18774 unsigned j
, e
= d
->perm
[i
];
18775 unsigned which
= (e
>= nelt
);
18779 for (j
= 0; j
< eltsz
; ++j
)
18781 rperm
[which
][i
*eltsz
+ j
] = GEN_INT (e
*eltsz
+ j
);
18782 rperm
[1-which
][i
*eltsz
+ j
] = m128
;
18786 vperm
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, rperm
[0]));
18787 vperm
= force_reg (V16QImode
, vperm
);
18789 l
= gen_reg_rtx (V16QImode
);
18790 op
= gen_lowpart (V16QImode
, d
->op0
);
18791 emit_insn (gen_ssse3_pshufbv16qi3 (l
, op
, vperm
));
18793 vperm
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, rperm
[1]));
18794 vperm
= force_reg (V16QImode
, vperm
);
18796 h
= gen_reg_rtx (V16QImode
);
18797 op
= gen_lowpart (V16QImode
, d
->op1
);
18798 emit_insn (gen_ssse3_pshufbv16qi3 (h
, op
, vperm
));
18801 if (d
->vmode
!= V16QImode
)
18802 op
= gen_reg_rtx (V16QImode
);
18803 emit_insn (gen_iorv16qi3 (op
, l
, h
));
18804 if (op
!= d
->target
)
18805 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
18810 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
18811 with two vpshufb insns, vpermq and vpor. We should have already failed
18812 all two or three instruction sequences. */
18815 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d
*d
)
18817 rtx rperm
[2][32], vperm
, l
, h
, hp
, op
, m128
;
18818 unsigned int i
, nelt
, eltsz
;
18821 || !d
->one_operand_p
18822 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
18829 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
18831 /* Generate two permutation masks. If the required element is within
18832 the same lane, it is shuffled in. If the required element from the
18833 other lane, force a zero by setting bit 7 in the permutation mask.
18834 In the other mask the mask has non-negative elements if element
18835 is requested from the other lane, but also moved to the other lane,
18836 so that the result of vpshufb can have the two V2TImode halves
18838 m128
= GEN_INT (-128);
18839 for (i
= 0; i
< nelt
; ++i
)
18841 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
18842 unsigned which
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
18844 for (j
= 0; j
< eltsz
; ++j
)
18846 rperm
[!!which
][(i
* eltsz
+ j
) ^ which
] = GEN_INT (e
* eltsz
+ j
);
18847 rperm
[!which
][(i
* eltsz
+ j
) ^ (which
^ 16)] = m128
;
18851 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
18852 vperm
= force_reg (V32QImode
, vperm
);
18854 h
= gen_reg_rtx (V32QImode
);
18855 op
= gen_lowpart (V32QImode
, d
->op0
);
18856 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
18858 /* Swap the 128-byte lanes of h into hp. */
18859 hp
= gen_reg_rtx (V4DImode
);
18860 op
= gen_lowpart (V4DImode
, h
);
18861 emit_insn (gen_avx2_permv4di_1 (hp
, op
, const2_rtx
, GEN_INT (3), const0_rtx
,
18864 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
18865 vperm
= force_reg (V32QImode
, vperm
);
18867 l
= gen_reg_rtx (V32QImode
);
18868 op
= gen_lowpart (V32QImode
, d
->op0
);
18869 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
18872 if (d
->vmode
!= V32QImode
)
18873 op
= gen_reg_rtx (V32QImode
);
18874 emit_insn (gen_iorv32qi3 (op
, l
, gen_lowpart (V32QImode
, hp
)));
18875 if (op
!= d
->target
)
18876 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
18881 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18882 and extract-odd permutations of two V32QImode and V16QImode operand
18883 with two vpshufb insns, vpor and vpermq. We should have already
18884 failed all two or three instruction sequences. */
18887 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d
*d
)
18889 rtx rperm
[2][32], vperm
, l
, h
, ior
, op
, m128
;
18890 unsigned int i
, nelt
, eltsz
;
18893 || d
->one_operand_p
18894 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
18897 for (i
= 0; i
< d
->nelt
; ++i
)
18898 if ((d
->perm
[i
] ^ (i
* 2)) & (3 * d
->nelt
/ 2))
18905 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
18907 /* Generate two permutation masks. In the first permutation mask
18908 the first quarter will contain indexes for the first half
18909 of the op0, the second quarter will contain bit 7 set, third quarter
18910 will contain indexes for the second half of the op0 and the
18911 last quarter bit 7 set. In the second permutation mask
18912 the first quarter will contain bit 7 set, the second quarter
18913 indexes for the first half of the op1, the third quarter bit 7 set
18914 and last quarter indexes for the second half of the op1.
18915 I.e. the first mask e.g. for V32QImode extract even will be:
18916 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18917 (all values masked with 0xf except for -128) and second mask
18918 for extract even will be
18919 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18920 m128
= GEN_INT (-128);
18921 for (i
= 0; i
< nelt
; ++i
)
18923 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
18924 unsigned which
= d
->perm
[i
] >= nelt
;
18925 unsigned xorv
= (i
>= nelt
/ 4 && i
< 3 * nelt
/ 4) ? 24 : 0;
18927 for (j
= 0; j
< eltsz
; ++j
)
18929 rperm
[which
][(i
* eltsz
+ j
) ^ xorv
] = GEN_INT (e
* eltsz
+ j
);
18930 rperm
[1 - which
][(i
* eltsz
+ j
) ^ xorv
] = m128
;
18934 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
18935 vperm
= force_reg (V32QImode
, vperm
);
18937 l
= gen_reg_rtx (V32QImode
);
18938 op
= gen_lowpart (V32QImode
, d
->op0
);
18939 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
18941 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
18942 vperm
= force_reg (V32QImode
, vperm
);
18944 h
= gen_reg_rtx (V32QImode
);
18945 op
= gen_lowpart (V32QImode
, d
->op1
);
18946 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
18948 ior
= gen_reg_rtx (V32QImode
);
18949 emit_insn (gen_iorv32qi3 (ior
, l
, h
));
18951 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18952 op
= gen_reg_rtx (V4DImode
);
18953 ior
= gen_lowpart (V4DImode
, ior
);
18954 emit_insn (gen_avx2_permv4di_1 (op
, ior
, const0_rtx
, const2_rtx
,
18955 const1_rtx
, GEN_INT (3)));
18956 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
18961 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18962 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18963 with two "and" and "pack" or two "shift" and "pack" insns. We should
18964 have already failed all two instruction sequences. */
18967 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d
*d
)
18969 rtx op
, dop0
, dop1
, t
;
18970 unsigned i
, odd
, c
, s
, nelt
= d
->nelt
;
18971 bool end_perm
= false;
18972 machine_mode half_mode
;
18973 rtx (*gen_and
) (rtx
, rtx
, rtx
);
18974 rtx (*gen_pack
) (rtx
, rtx
, rtx
);
18975 rtx (*gen_shift
) (rtx
, rtx
, rtx
);
18977 if (d
->one_operand_p
)
18983 /* Required for "pack". */
18984 if (!TARGET_SSE4_1
)
18988 half_mode
= V4SImode
;
18989 gen_and
= gen_andv4si3
;
18990 gen_pack
= gen_sse4_1_packusdw
;
18991 gen_shift
= gen_lshrv4si3
;
18994 /* No check as all instructions are SSE2. */
18997 half_mode
= V8HImode
;
18998 gen_and
= gen_andv8hi3
;
18999 gen_pack
= gen_sse2_packuswb
;
19000 gen_shift
= gen_lshrv8hi3
;
19007 half_mode
= V8SImode
;
19008 gen_and
= gen_andv8si3
;
19009 gen_pack
= gen_avx2_packusdw
;
19010 gen_shift
= gen_lshrv8si3
;
19018 half_mode
= V16HImode
;
19019 gen_and
= gen_andv16hi3
;
19020 gen_pack
= gen_avx2_packuswb
;
19021 gen_shift
= gen_lshrv16hi3
;
19025 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
19026 general shuffles. */
19030 /* Check that permutation is even or odd. */
19035 for (i
= 1; i
< nelt
; ++i
)
19036 if (d
->perm
[i
] != 2 * i
+ odd
)
19042 dop0
= gen_reg_rtx (half_mode
);
19043 dop1
= gen_reg_rtx (half_mode
);
19046 t
= gen_const_vec_duplicate (half_mode
, GEN_INT (c
));
19047 t
= force_reg (half_mode
, t
);
19048 emit_insn (gen_and (dop0
, t
, gen_lowpart (half_mode
, d
->op0
)));
19049 emit_insn (gen_and (dop1
, t
, gen_lowpart (half_mode
, d
->op1
)));
19053 emit_insn (gen_shift (dop0
,
19054 gen_lowpart (half_mode
, d
->op0
),
19056 emit_insn (gen_shift (dop1
,
19057 gen_lowpart (half_mode
, d
->op1
),
19060 /* In AVX2 for 256 bit case we need to permute pack result. */
19061 if (TARGET_AVX2
&& end_perm
)
19063 op
= gen_reg_rtx (d
->vmode
);
19064 t
= gen_reg_rtx (V4DImode
);
19065 emit_insn (gen_pack (op
, dop0
, dop1
));
19066 emit_insn (gen_avx2_permv4di_1 (t
,
19067 gen_lowpart (V4DImode
, op
),
19072 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, t
));
19075 emit_insn (gen_pack (d
->target
, dop0
, dop1
));
19080 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
19081 and extract-odd permutations of two V64QI operands
19082 with two "shifts", two "truncs" and one "concat" insns for "odd"
19083 and two "truncs" and one concat insn for "even."
19084 Have already failed all two instruction sequences. */
19087 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d
*d
)
19089 rtx t1
, t2
, t3
, t4
;
19090 unsigned i
, odd
, nelt
= d
->nelt
;
19092 if (!TARGET_AVX512BW
19093 || d
->one_operand_p
19094 || d
->vmode
!= V64QImode
)
19097 /* Check that permutation is even or odd. */
19102 for (i
= 1; i
< nelt
; ++i
)
19103 if (d
->perm
[i
] != 2 * i
+ odd
)
19112 t1
= gen_reg_rtx (V32HImode
);
19113 t2
= gen_reg_rtx (V32HImode
);
19114 emit_insn (gen_lshrv32hi3 (t1
,
19115 gen_lowpart (V32HImode
, d
->op0
),
19117 emit_insn (gen_lshrv32hi3 (t2
,
19118 gen_lowpart (V32HImode
, d
->op1
),
19123 t1
= gen_lowpart (V32HImode
, d
->op0
);
19124 t2
= gen_lowpart (V32HImode
, d
->op1
);
19127 t3
= gen_reg_rtx (V32QImode
);
19128 t4
= gen_reg_rtx (V32QImode
);
19129 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3
, t1
));
19130 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4
, t2
));
19131 emit_insn (gen_avx_vec_concatv64qi (d
->target
, t3
, t4
));
19136 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
19137 and extract-odd permutations. */
19140 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d
*d
, unsigned odd
)
19142 rtx t1
, t2
, t3
, t4
, t5
;
19149 t1
= gen_reg_rtx (V4DFmode
);
19150 t2
= gen_reg_rtx (V4DFmode
);
19152 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19153 emit_insn (gen_avx_vperm2f128v4df3 (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
19154 emit_insn (gen_avx_vperm2f128v4df3 (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
19156 /* Now an unpck[lh]pd will produce the result required. */
19158 t3
= gen_avx_unpckhpd256 (d
->target
, t1
, t2
);
19160 t3
= gen_avx_unpcklpd256 (d
->target
, t1
, t2
);
19166 int mask
= odd
? 0xdd : 0x88;
19170 t1
= gen_reg_rtx (V8SFmode
);
19171 t2
= gen_reg_rtx (V8SFmode
);
19172 t3
= gen_reg_rtx (V8SFmode
);
19174 /* Shuffle within the 128-bit lanes to produce:
19175 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
19176 emit_insn (gen_avx_shufps256 (t1
, d
->op0
, d
->op1
,
19179 /* Shuffle the lanes around to produce:
19180 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
19181 emit_insn (gen_avx_vperm2f128v8sf3 (t2
, t1
, t1
,
19184 /* Shuffle within the 128-bit lanes to produce:
19185 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
19186 emit_insn (gen_avx_shufps256 (t3
, t1
, t2
, GEN_INT (0x44)));
19188 /* Shuffle within the 128-bit lanes to produce:
19189 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
19190 emit_insn (gen_avx_shufps256 (t2
, t1
, t2
, GEN_INT (0xee)));
19192 /* Shuffle the lanes around to produce:
19193 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
19194 emit_insn (gen_avx_vperm2f128v8sf3 (d
->target
, t3
, t2
,
19204 /* These are always directly implementable by expand_vec_perm_1. */
19205 gcc_unreachable ();
19208 gcc_assert (TARGET_MMX_WITH_SSE
);
19209 /* We have no suitable instructions. */
19217 /* We need 2*log2(N)-1 operations to achieve odd/even
19218 with interleave. */
19219 t1
= gen_reg_rtx (V4HImode
);
19220 emit_insn (gen_mmx_punpckhwd (t1
, d
->op0
, d
->op1
));
19221 emit_insn (gen_mmx_punpcklwd (d
->target
, d
->op0
, d
->op1
));
19223 t2
= gen_mmx_punpckhwd (d
->target
, d
->target
, t1
);
19225 t2
= gen_mmx_punpcklwd (d
->target
, d
->target
, t1
);
19231 return expand_vec_perm_even_odd_pack (d
);
19232 else if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
19233 return expand_vec_perm_pshufb2 (d
);
19238 /* We need 2*log2(N)-1 operations to achieve odd/even
19239 with interleave. */
19240 t1
= gen_reg_rtx (V8HImode
);
19241 t2
= gen_reg_rtx (V8HImode
);
19242 emit_insn (gen_vec_interleave_highv8hi (t1
, d
->op0
, d
->op1
));
19243 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->op0
, d
->op1
));
19244 emit_insn (gen_vec_interleave_highv8hi (t2
, d
->target
, t1
));
19245 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t1
));
19247 t3
= gen_vec_interleave_highv8hi (d
->target
, d
->target
, t2
);
19249 t3
= gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t2
);
19255 return expand_vec_perm_even_odd_pack (d
);
19259 return expand_vec_perm_even_odd_pack (d
);
19262 return expand_vec_perm_even_odd_trunc (d
);
19267 struct expand_vec_perm_d d_copy
= *d
;
19268 d_copy
.vmode
= V4DFmode
;
19270 d_copy
.target
= gen_raw_REG (V4DFmode
, LAST_VIRTUAL_REGISTER
+ 1);
19272 d_copy
.target
= gen_reg_rtx (V4DFmode
);
19273 d_copy
.op0
= gen_lowpart (V4DFmode
, d
->op0
);
19274 d_copy
.op1
= gen_lowpart (V4DFmode
, d
->op1
);
19275 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
19278 emit_move_insn (d
->target
,
19279 gen_lowpart (V4DImode
, d_copy
.target
));
19288 t1
= gen_reg_rtx (V4DImode
);
19289 t2
= gen_reg_rtx (V4DImode
);
19291 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19292 emit_insn (gen_avx2_permv2ti (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
19293 emit_insn (gen_avx2_permv2ti (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
19295 /* Now an vpunpck[lh]qdq will produce the result required. */
19297 t3
= gen_avx2_interleave_highv4di (d
->target
, t1
, t2
);
19299 t3
= gen_avx2_interleave_lowv4di (d
->target
, t1
, t2
);
19306 struct expand_vec_perm_d d_copy
= *d
;
19307 d_copy
.vmode
= V8SFmode
;
19309 d_copy
.target
= gen_raw_REG (V8SFmode
, LAST_VIRTUAL_REGISTER
+ 1);
19311 d_copy
.target
= gen_reg_rtx (V8SFmode
);
19312 d_copy
.op0
= gen_lowpart (V8SFmode
, d
->op0
);
19313 d_copy
.op1
= gen_lowpart (V8SFmode
, d
->op1
);
19314 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
19317 emit_move_insn (d
->target
,
19318 gen_lowpart (V8SImode
, d_copy
.target
));
19327 t1
= gen_reg_rtx (V8SImode
);
19328 t2
= gen_reg_rtx (V8SImode
);
19329 t3
= gen_reg_rtx (V4DImode
);
19330 t4
= gen_reg_rtx (V4DImode
);
19331 t5
= gen_reg_rtx (V4DImode
);
19333 /* Shuffle the lanes around into
19334 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
19335 emit_insn (gen_avx2_permv2ti (t3
, gen_lowpart (V4DImode
, d
->op0
),
19336 gen_lowpart (V4DImode
, d
->op1
),
19338 emit_insn (gen_avx2_permv2ti (t4
, gen_lowpart (V4DImode
, d
->op0
),
19339 gen_lowpart (V4DImode
, d
->op1
),
19342 /* Swap the 2nd and 3rd position in each lane into
19343 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
19344 emit_insn (gen_avx2_pshufdv3 (t1
, gen_lowpart (V8SImode
, t3
),
19345 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19346 emit_insn (gen_avx2_pshufdv3 (t2
, gen_lowpart (V8SImode
, t4
),
19347 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19349 /* Now an vpunpck[lh]qdq will produce
19350 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
19352 t3
= gen_avx2_interleave_highv4di (t5
, gen_lowpart (V4DImode
, t1
),
19353 gen_lowpart (V4DImode
, t2
));
19355 t3
= gen_avx2_interleave_lowv4di (t5
, gen_lowpart (V4DImode
, t1
),
19356 gen_lowpart (V4DImode
, t2
));
19358 emit_move_insn (d
->target
, gen_lowpart (V8SImode
, t5
));
19362 gcc_unreachable ();
19368 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
19369 extract-even and extract-odd permutations. */
19372 expand_vec_perm_even_odd (struct expand_vec_perm_d
*d
)
19374 unsigned i
, odd
, nelt
= d
->nelt
;
19377 if (odd
!= 0 && odd
!= 1)
19380 for (i
= 1; i
< nelt
; ++i
)
19381 if (d
->perm
[i
] != 2 * i
+ odd
)
19384 return expand_vec_perm_even_odd_1 (d
, odd
);
19387 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
19388 permutations. We assume that expand_vec_perm_1 has already failed. */
19391 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d
*d
)
19393 unsigned elt
= d
->perm
[0], nelt2
= d
->nelt
/ 2;
19394 machine_mode vmode
= d
->vmode
;
19395 unsigned char perm2
[4];
19396 rtx op0
= d
->op0
, dest
;
19403 /* These are special-cased in sse.md so that we can optionally
19404 use the vbroadcast instruction. They expand to two insns
19405 if the input happens to be in a register. */
19406 gcc_unreachable ();
19414 /* These are always implementable using standard shuffle patterns. */
19415 gcc_unreachable ();
19419 /* These can be implemented via interleave. We save one insn by
19420 stopping once we have promoted to V4SImode and then use pshufd. */
19426 rtx (*gen
) (rtx
, rtx
, rtx
)
19427 = vmode
== V16QImode
? gen_vec_interleave_lowv16qi
19428 : gen_vec_interleave_lowv8hi
;
19432 gen
= vmode
== V16QImode
? gen_vec_interleave_highv16qi
19433 : gen_vec_interleave_highv8hi
;
19438 dest
= gen_reg_rtx (vmode
);
19439 emit_insn (gen (dest
, op0
, op0
));
19440 vmode
= get_mode_wider_vector (vmode
);
19441 op0
= gen_lowpart (vmode
, dest
);
19443 while (vmode
!= V4SImode
);
19445 memset (perm2
, elt
, 4);
19446 dest
= gen_reg_rtx (V4SImode
);
19447 ok
= expand_vselect (dest
, op0
, perm2
, 4, d
->testing_p
);
19450 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
19458 /* For AVX2 broadcasts of the first element vpbroadcast* or
19459 vpermq should be used by expand_vec_perm_1. */
19460 gcc_assert (!TARGET_AVX2
|| d
->perm
[0]);
19464 gcc_unreachable ();
19468 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
19469 broadcast permutations. */
19472 expand_vec_perm_broadcast (struct expand_vec_perm_d
*d
)
19474 unsigned i
, elt
, nelt
= d
->nelt
;
19476 if (!d
->one_operand_p
)
19480 for (i
= 1; i
< nelt
; ++i
)
19481 if (d
->perm
[i
] != elt
)
19484 return expand_vec_perm_broadcast_1 (d
);
19487 /* Implement arbitrary permutations of two V64QImode operands
19488 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
19490 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d
*d
)
19492 if (!TARGET_AVX512BW
|| !(d
->vmode
== V64QImode
))
19498 struct expand_vec_perm_d ds
[2];
19499 rtx rperm
[128], vperm
, target0
, target1
;
19500 unsigned int i
, nelt
;
19501 machine_mode vmode
;
19506 for (i
= 0; i
< 2; i
++)
19509 ds
[i
].vmode
= V32HImode
;
19511 ds
[i
].target
= gen_reg_rtx (V32HImode
);
19512 ds
[i
].op0
= gen_lowpart (V32HImode
, d
->op0
);
19513 ds
[i
].op1
= gen_lowpart (V32HImode
, d
->op1
);
19516 /* Prepare permutations such that the first one takes care of
19517 putting the even bytes into the right positions or one higher
19518 positions (ds[0]) and the second one takes care of
19519 putting the odd bytes into the right positions or one below
19522 for (i
= 0; i
< nelt
; i
++)
19524 ds
[i
& 1].perm
[i
/ 2] = d
->perm
[i
] / 2;
19527 rperm
[i
] = constm1_rtx
;
19528 rperm
[i
+ 64] = GEN_INT ((i
& 14) + (d
->perm
[i
] & 1));
19532 rperm
[i
] = GEN_INT ((i
& 14) + (d
->perm
[i
] & 1));
19533 rperm
[i
+ 64] = constm1_rtx
;
19537 bool ok
= expand_vec_perm_1 (&ds
[0]);
19539 ds
[0].target
= gen_lowpart (V64QImode
, ds
[0].target
);
19541 ok
= expand_vec_perm_1 (&ds
[1]);
19543 ds
[1].target
= gen_lowpart (V64QImode
, ds
[1].target
);
19545 vperm
= gen_rtx_CONST_VECTOR (V64QImode
, gen_rtvec_v (64, rperm
));
19546 vperm
= force_reg (vmode
, vperm
);
19547 target0
= gen_reg_rtx (V64QImode
);
19548 emit_insn (gen_avx512bw_pshufbv64qi3 (target0
, ds
[0].target
, vperm
));
19550 vperm
= gen_rtx_CONST_VECTOR (V64QImode
, gen_rtvec_v (64, rperm
+ 64));
19551 vperm
= force_reg (vmode
, vperm
);
19552 target1
= gen_reg_rtx (V64QImode
);
19553 emit_insn (gen_avx512bw_pshufbv64qi3 (target1
, ds
[1].target
, vperm
));
19555 emit_insn (gen_iorv64qi3 (d
->target
, target0
, target1
));
19559 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
19560 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
19561 all the shorter instruction sequences. */
19564 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d
*d
)
19566 rtx rperm
[4][32], vperm
, l
[2], h
[2], op
, m128
;
19567 unsigned int i
, nelt
, eltsz
;
19571 || d
->one_operand_p
19572 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
19579 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
19581 /* Generate 4 permutation masks. If the required element is within
19582 the same lane, it is shuffled in. If the required element from the
19583 other lane, force a zero by setting bit 7 in the permutation mask.
19584 In the other mask the mask has non-negative elements if element
19585 is requested from the other lane, but also moved to the other lane,
19586 so that the result of vpshufb can have the two V2TImode halves
19588 m128
= GEN_INT (-128);
19589 for (i
= 0; i
< 32; ++i
)
19591 rperm
[0][i
] = m128
;
19592 rperm
[1][i
] = m128
;
19593 rperm
[2][i
] = m128
;
19594 rperm
[3][i
] = m128
;
19600 for (i
= 0; i
< nelt
; ++i
)
19602 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
19603 unsigned xlane
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
19604 unsigned int which
= ((d
->perm
[i
] & nelt
) ? 2 : 0) + (xlane
? 1 : 0);
19606 for (j
= 0; j
< eltsz
; ++j
)
19607 rperm
[which
][(i
* eltsz
+ j
) ^ xlane
] = GEN_INT (e
* eltsz
+ j
);
19608 used
[which
] = true;
19611 for (i
= 0; i
< 2; ++i
)
19613 if (!used
[2 * i
+ 1])
19618 vperm
= gen_rtx_CONST_VECTOR (V32QImode
,
19619 gen_rtvec_v (32, rperm
[2 * i
+ 1]));
19620 vperm
= force_reg (V32QImode
, vperm
);
19621 h
[i
] = gen_reg_rtx (V32QImode
);
19622 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
19623 emit_insn (gen_avx2_pshufbv32qi3 (h
[i
], op
, vperm
));
19626 /* Swap the 128-byte lanes of h[X]. */
19627 for (i
= 0; i
< 2; ++i
)
19629 if (h
[i
] == NULL_RTX
)
19631 op
= gen_reg_rtx (V4DImode
);
19632 emit_insn (gen_avx2_permv4di_1 (op
, gen_lowpart (V4DImode
, h
[i
]),
19633 const2_rtx
, GEN_INT (3), const0_rtx
,
19635 h
[i
] = gen_lowpart (V32QImode
, op
);
19638 for (i
= 0; i
< 2; ++i
)
19645 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[2 * i
]));
19646 vperm
= force_reg (V32QImode
, vperm
);
19647 l
[i
] = gen_reg_rtx (V32QImode
);
19648 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
19649 emit_insn (gen_avx2_pshufbv32qi3 (l
[i
], op
, vperm
));
19652 for (i
= 0; i
< 2; ++i
)
19656 op
= gen_reg_rtx (V32QImode
);
19657 emit_insn (gen_iorv32qi3 (op
, l
[i
], h
[i
]));
19664 gcc_assert (l
[0] && l
[1]);
19666 if (d
->vmode
!= V32QImode
)
19667 op
= gen_reg_rtx (V32QImode
);
19668 emit_insn (gen_iorv32qi3 (op
, l
[0], l
[1]));
19669 if (op
!= d
->target
)
19670 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
19674 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
19675 taken care of, perform the expansion in D and return true on success. */
19678 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
19680 /* Try a single instruction expansion. */
19681 if (expand_vec_perm_1 (d
))
19684 /* Try sequences of two instructions. */
19686 if (expand_vec_perm_pshuflw_pshufhw (d
))
19689 if (expand_vec_perm_palignr (d
, false))
19692 if (expand_vec_perm_interleave2 (d
))
19695 if (expand_vec_perm_broadcast (d
))
19698 if (expand_vec_perm_vpermq_perm_1 (d
))
19701 if (expand_vec_perm_vperm2f128 (d
))
19704 if (expand_vec_perm_pblendv (d
))
19707 /* Try sequences of three instructions. */
19709 if (expand_vec_perm_even_odd_pack (d
))
19712 if (expand_vec_perm_2vperm2f128_vshuf (d
))
19715 if (expand_vec_perm_pshufb2 (d
))
19718 if (expand_vec_perm_interleave3 (d
))
19721 if (expand_vec_perm_vperm2f128_vblend (d
))
19724 /* Try sequences of four instructions. */
19726 if (expand_vec_perm_even_odd_trunc (d
))
19728 if (expand_vec_perm_vpshufb2_vpermq (d
))
19731 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d
))
19734 if (expand_vec_perm_vpermt2_vpshub2 (d
))
19737 /* ??? Look for narrow permutations whose element orderings would
19738 allow the promotion to a wider mode. */
19740 /* ??? Look for sequences of interleave or a wider permute that place
19741 the data into the correct lanes for a half-vector shuffle like
19742 pshuf[lh]w or vpermilps. */
19744 /* ??? Look for sequences of interleave that produce the desired results.
19745 The combinatorics of punpck[lh] get pretty ugly... */
19747 if (expand_vec_perm_even_odd (d
))
19750 /* Even longer sequences. */
19751 if (expand_vec_perm_vpshufb4_vpermq2 (d
))
19754 /* See if we can get the same permutation in different vector integer
19756 struct expand_vec_perm_d nd
;
19757 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
19760 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
19764 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
19765 if (expand_vec_perm2_vperm2f128_vblend (d
))
19771 /* If a permutation only uses one operand, make it clear. Returns true
19772 if the permutation references both operands. */
19775 canonicalize_perm (struct expand_vec_perm_d
*d
)
19777 int i
, which
, nelt
= d
->nelt
;
19779 for (i
= which
= 0; i
< nelt
; ++i
)
19780 which
|= (d
->perm
[i
] < nelt
? 1 : 2);
19782 d
->one_operand_p
= true;
19789 if (!rtx_equal_p (d
->op0
, d
->op1
))
19791 d
->one_operand_p
= false;
19794 /* The elements of PERM do not suggest that only the first operand
19795 is used, but both operands are identical. Allow easier matching
19796 of the permutation by folding the permutation into the single
19801 for (i
= 0; i
< nelt
; ++i
)
19802 d
->perm
[i
] &= nelt
- 1;
19811 return (which
== 3);
19814 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19817 ix86_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
19818 rtx op1
, const vec_perm_indices
&sel
)
19820 struct expand_vec_perm_d d
;
19821 unsigned char perm
[MAX_VECT_LEN
];
19822 unsigned int i
, nelt
, which
;
19830 gcc_assert (VECTOR_MODE_P (d
.vmode
));
19831 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
19832 d
.testing_p
= !target
;
19834 gcc_assert (sel
.length () == nelt
);
19835 gcc_checking_assert (sizeof (d
.perm
) == sizeof (perm
));
19837 /* Given sufficient ISA support we can just return true here
19838 for selected vector modes. */
19845 if (!TARGET_AVX512F
)
19847 /* All implementable with a single vperm[it]2 insn. */
19852 if (!TARGET_AVX512BW
)
19855 /* All implementable with a single vperm[it]2 insn. */
19859 if (!TARGET_AVX512BW
)
19862 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19871 if (d
.testing_p
&& TARGET_AVX512VL
)
19872 /* All implementable with a single vperm[it]2 insn. */
19878 if (d
.testing_p
&& TARGET_AVX2
)
19879 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19885 if (d
.testing_p
&& TARGET_AVX2
)
19886 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19893 /* Fall through. */
19898 /* All implementable with a single vpperm insn. */
19899 if (d
.testing_p
&& TARGET_XOP
)
19901 /* All implementable with 2 pshufb + 1 ior. */
19902 if (d
.testing_p
&& TARGET_SSSE3
)
19908 if (!TARGET_MMX_WITH_SSE
)
19915 /* All implementable with shufpd or unpck[lh]pd. */
19923 for (i
= which
= 0; i
< nelt
; ++i
)
19925 unsigned char e
= sel
[i
];
19926 gcc_assert (e
< 2 * nelt
);
19929 which
|= (e
< nelt
? 1 : 2);
19934 /* For all elements from second vector, fold the elements to first. */
19936 for (i
= 0; i
< nelt
; ++i
)
19939 /* Check whether the mask can be applied to the vector type. */
19940 d
.one_operand_p
= (which
!= 3);
19942 /* Implementable with shufps or pshufd. */
19943 if (d
.one_operand_p
19944 && (d
.vmode
== V4SFmode
|| d
.vmode
== V2SFmode
19945 || d
.vmode
== V4SImode
|| d
.vmode
== V2SImode
))
19948 /* Otherwise we have to go through the motions and see if we can
19949 figure out how to generate the requested permutation. */
19950 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
19951 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
19952 if (!d
.one_operand_p
)
19953 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
19956 bool ret
= ix86_expand_vec_perm_const_1 (&d
);
19962 two_args
= canonicalize_perm (&d
);
19964 /* If one of the operands is a zero vector, try to match pmovzx. */
19965 if (two_args
&& (d
.op0
== CONST0_RTX (vmode
) || d
.op1
== CONST0_RTX (vmode
)))
19967 struct expand_vec_perm_d dzero
= d
;
19968 if (d
.op0
== CONST0_RTX (vmode
))
19970 d
.op1
= dzero
.op1
= force_reg (vmode
, d
.op1
);
19971 std::swap (dzero
.op0
, dzero
.op1
);
19972 for (i
= 0; i
< nelt
; ++i
)
19973 dzero
.perm
[i
] ^= nelt
;
19976 d
.op0
= dzero
.op0
= force_reg (vmode
, d
.op0
);
19978 if (expand_vselect_vconcat (dzero
.target
, dzero
.op0
, dzero
.op1
,
19979 dzero
.perm
, nelt
, dzero
.testing_p
))
19983 /* Force operands into registers. */
19984 rtx nop0
= force_reg (vmode
, d
.op0
);
19985 if (d
.op0
== d
.op1
)
19988 d
.op1
= force_reg (vmode
, d
.op1
);
19990 if (ix86_expand_vec_perm_const_1 (&d
))
19993 /* If the selector says both arguments are needed, but the operands are the
19994 same, the above tried to expand with one_operand_p and flattened selector.
19995 If that didn't work, retry without one_operand_p; we succeeded with that
19997 if (two_args
&& d
.one_operand_p
)
19999 d
.one_operand_p
= false;
20000 memcpy (d
.perm
, perm
, sizeof (perm
));
20001 return ix86_expand_vec_perm_const_1 (&d
);
20008 ix86_expand_vec_extract_even_odd (rtx targ
, rtx op0
, rtx op1
, unsigned odd
)
20010 struct expand_vec_perm_d d
;
20016 d
.vmode
= GET_MODE (targ
);
20017 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
20018 d
.one_operand_p
= false;
20019 d
.testing_p
= false;
20021 for (i
= 0; i
< nelt
; ++i
)
20022 d
.perm
[i
] = i
* 2 + odd
;
20024 /* We'll either be able to implement the permutation directly... */
20025 if (expand_vec_perm_1 (&d
))
20028 /* ... or we use the special-case patterns. */
20029 expand_vec_perm_even_odd_1 (&d
, odd
);
20033 ix86_expand_vec_interleave (rtx targ
, rtx op0
, rtx op1
, bool high_p
)
20035 struct expand_vec_perm_d d
;
20036 unsigned i
, nelt
, base
;
20042 d
.vmode
= GET_MODE (targ
);
20043 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
20044 d
.one_operand_p
= false;
20045 d
.testing_p
= false;
20047 base
= high_p
? nelt
/ 2 : 0;
20048 for (i
= 0; i
< nelt
/ 2; ++i
)
20050 d
.perm
[i
* 2] = i
+ base
;
20051 d
.perm
[i
* 2 + 1] = i
+ base
+ nelt
;
20054 /* Note that for AVX this isn't one instruction. */
20055 ok
= ix86_expand_vec_perm_const_1 (&d
);
20059 /* Optimize vector MUL generation for V8QI, V16QI and V32QI
20060 under TARGET_AVX512BW. i.e. for v16qi a * b, it has
20062 vpmovzxbw ymm2, xmm0
20063 vpmovzxbw ymm3, xmm1
20064 vpmullw ymm4, ymm2, ymm3
20067 it would take less instructions than ix86_expand_vecop_qihi.
20068 Return true if success. */
20071 ix86_expand_vecmul_qihi (rtx dest
, rtx op1
, rtx op2
)
20073 machine_mode himode
, qimode
= GET_MODE (dest
);
20074 rtx hop1
, hop2
, hdest
;
20075 rtx (*gen_extend
)(rtx
, rtx
);
20076 rtx (*gen_truncate
)(rtx
, rtx
);
20078 /* There's no V64HImode multiplication instruction. */
20079 if (qimode
== E_V64QImode
)
20082 /* vpmovwb only available under AVX512BW. */
20083 if (!TARGET_AVX512BW
)
20085 if ((qimode
== V8QImode
|| qimode
== V16QImode
)
20086 && !TARGET_AVX512VL
)
20088 /* Not generate zmm instruction when prefer 128/256 bit vector width. */
20089 if (qimode
== V32QImode
20090 && (TARGET_PREFER_AVX128
|| TARGET_PREFER_AVX256
))
20097 gen_extend
= gen_zero_extendv8qiv8hi2
;
20098 gen_truncate
= gen_truncv8hiv8qi2
;
20101 himode
= V16HImode
;
20102 gen_extend
= gen_zero_extendv16qiv16hi2
;
20103 gen_truncate
= gen_truncv16hiv16qi2
;
20106 himode
= V32HImode
;
20107 gen_extend
= gen_zero_extendv32qiv32hi2
;
20108 gen_truncate
= gen_truncv32hiv32qi2
;
20111 gcc_unreachable ();
20114 hop1
= gen_reg_rtx (himode
);
20115 hop2
= gen_reg_rtx (himode
);
20116 hdest
= gen_reg_rtx (himode
);
20117 emit_insn (gen_extend (hop1
, op1
));
20118 emit_insn (gen_extend (hop2
, op2
));
20119 emit_insn (gen_rtx_SET (hdest
, simplify_gen_binary (MULT
, himode
,
20121 emit_insn (gen_truncate (dest
, hdest
));
20125 /* Expand a vector operation shift by constant for a V*QImode in terms of the
20126 same operation on V*HImode. Return true if success. */
20128 ix86_expand_vec_shift_qihi_constant (enum rtx_code code
, rtx dest
, rtx op1
, rtx op2
)
20130 machine_mode qimode
, himode
;
20131 HOST_WIDE_INT and_constant
, xor_constant
;
20132 HOST_WIDE_INT shift_amount
;
20133 rtx vec_const_and
, vec_const_xor
;
20134 rtx tmp
, op1_subreg
;
20135 rtx (*gen_shift
) (rtx
, rtx
, rtx
);
20136 rtx (*gen_and
) (rtx
, rtx
, rtx
);
20137 rtx (*gen_xor
) (rtx
, rtx
, rtx
);
20138 rtx (*gen_sub
) (rtx
, rtx
, rtx
);
20140 /* Only optimize shift by constant. */
20141 if (!CONST_INT_P (op2
))
20144 qimode
= GET_MODE (dest
);
20145 shift_amount
= INTVAL (op2
);
20146 /* Do nothing when shift amount greater equal 8. */
20147 if (shift_amount
> 7)
20150 gcc_assert (code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
);
20151 /* Record sign bit. */
20152 xor_constant
= 1 << (8 - shift_amount
- 1);
20154 /* Zero upper/lower bits shift from left/right element. */
20156 = (code
== ASHIFT
? 256 - (1 << shift_amount
)
20157 : (1 << (8 - shift_amount
)) - 1);
20166 : (code
== ASHIFTRT
) ? gen_ashrv8hi3
: gen_lshrv8hi3
);
20167 gen_and
= gen_andv16qi3
;
20168 gen_xor
= gen_xorv16qi3
;
20169 gen_sub
= gen_subv16qi3
;
20172 himode
= V16HImode
;
20176 : (code
== ASHIFTRT
) ? gen_ashrv16hi3
: gen_lshrv16hi3
);
20177 gen_and
= gen_andv32qi3
;
20178 gen_xor
= gen_xorv32qi3
;
20179 gen_sub
= gen_subv32qi3
;
20182 himode
= V32HImode
;
20186 : (code
== ASHIFTRT
) ? gen_ashrv32hi3
: gen_lshrv32hi3
);
20187 gen_and
= gen_andv64qi3
;
20188 gen_xor
= gen_xorv64qi3
;
20189 gen_sub
= gen_subv64qi3
;
20192 gcc_unreachable ();
20195 tmp
= gen_reg_rtx (himode
);
20196 vec_const_and
= gen_reg_rtx (qimode
);
20197 op1_subreg
= lowpart_subreg (himode
, op1
, qimode
);
20199 /* For ASHIFT and LSHIFTRT, perform operation like
20200 vpsllw/vpsrlw $shift_amount, %op1, %dest.
20201 vpand %vec_const_and, %dest. */
20202 emit_insn (gen_shift (tmp
, op1_subreg
, op2
));
20203 emit_move_insn (dest
, simplify_gen_subreg (qimode
, tmp
, himode
, 0));
20204 emit_move_insn (vec_const_and
,
20205 ix86_build_const_vector (qimode
, true,
20206 gen_int_mode (and_constant
, QImode
)));
20207 emit_insn (gen_and (dest
, dest
, vec_const_and
));
20209 /* For ASHIFTRT, perform extra operation like
20210 vpxor %vec_const_xor, %dest, %dest
20211 vpsubb %vec_const_xor, %dest, %dest */
20212 if (code
== ASHIFTRT
)
20214 vec_const_xor
= gen_reg_rtx (qimode
);
20215 emit_move_insn (vec_const_xor
,
20216 ix86_build_const_vector (qimode
, true,
20217 gen_int_mode (xor_constant
, QImode
)));
20218 emit_insn (gen_xor (dest
, dest
, vec_const_xor
));
20219 emit_insn (gen_sub (dest
, dest
, vec_const_xor
));
20224 /* Expand a vector operation CODE for a V*QImode in terms of the
20225 same operation on V*HImode. */
20228 ix86_expand_vecop_qihi (enum rtx_code code
, rtx dest
, rtx op1
, rtx op2
)
20230 machine_mode qimode
= GET_MODE (dest
);
20231 machine_mode himode
;
20232 rtx (*gen_il
) (rtx
, rtx
, rtx
);
20233 rtx (*gen_ih
) (rtx
, rtx
, rtx
);
20234 rtx op1_l
, op1_h
, op2_l
, op2_h
, res_l
, res_h
;
20235 struct expand_vec_perm_d d
;
20236 bool ok
, full_interleave
;
20237 bool uns_p
= false;
20244 gen_il
= gen_vec_interleave_lowv16qi
;
20245 gen_ih
= gen_vec_interleave_highv16qi
;
20248 himode
= V16HImode
;
20249 gen_il
= gen_avx2_interleave_lowv32qi
;
20250 gen_ih
= gen_avx2_interleave_highv32qi
;
20253 himode
= V32HImode
;
20254 gen_il
= gen_avx512bw_interleave_lowv64qi
;
20255 gen_ih
= gen_avx512bw_interleave_highv64qi
;
20258 gcc_unreachable ();
20261 op2_l
= op2_h
= op2
;
20265 /* Unpack data such that we've got a source byte in each low byte of
20266 each word. We don't care what goes into the high byte of each word.
20267 Rather than trying to get zero in there, most convenient is to let
20268 it be a copy of the low byte. */
20269 op2_l
= gen_reg_rtx (qimode
);
20270 op2_h
= gen_reg_rtx (qimode
);
20271 emit_insn (gen_il (op2_l
, op2
, op2
));
20272 emit_insn (gen_ih (op2_h
, op2
, op2
));
20274 op1_l
= gen_reg_rtx (qimode
);
20275 op1_h
= gen_reg_rtx (qimode
);
20276 emit_insn (gen_il (op1_l
, op1
, op1
));
20277 emit_insn (gen_ih (op1_h
, op1
, op1
));
20278 full_interleave
= qimode
== V16QImode
;
20286 op1_l
= gen_reg_rtx (himode
);
20287 op1_h
= gen_reg_rtx (himode
);
20288 ix86_expand_sse_unpack (op1_l
, op1
, uns_p
, false);
20289 ix86_expand_sse_unpack (op1_h
, op1
, uns_p
, true);
20290 full_interleave
= true;
20293 gcc_unreachable ();
20296 /* Perform the operation. */
20297 res_l
= expand_simple_binop (himode
, code
, op1_l
, op2_l
, NULL_RTX
,
20299 res_h
= expand_simple_binop (himode
, code
, op1_h
, op2_h
, NULL_RTX
,
20301 gcc_assert (res_l
&& res_h
);
20303 /* Merge the data back into the right place. */
20305 d
.op0
= gen_lowpart (qimode
, res_l
);
20306 d
.op1
= gen_lowpart (qimode
, res_h
);
20308 d
.nelt
= GET_MODE_NUNITS (qimode
);
20309 d
.one_operand_p
= false;
20310 d
.testing_p
= false;
20312 if (full_interleave
)
20314 /* For SSE2, we used an full interleave, so the desired
20315 results are in the even elements. */
20316 for (i
= 0; i
< d
.nelt
; ++i
)
20321 /* For AVX, the interleave used above was not cross-lane. So the
20322 extraction is evens but with the second and third quarter swapped.
20323 Happily, that is even one insn shorter than even extraction.
20324 For AVX512BW we have 4 lanes. We extract evens from within a lane,
20325 always first from the first and then from the second source operand,
20326 the index bits above the low 4 bits remains the same.
20327 Thus, for d.nelt == 32 we want permutation
20328 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
20329 and for d.nelt == 64 we want permutation
20330 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
20331 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
20332 for (i
= 0; i
< d
.nelt
; ++i
)
20333 d
.perm
[i
] = ((i
* 2) & 14) + ((i
& 8) ? d
.nelt
: 0) + (i
& ~15);
20336 ok
= ix86_expand_vec_perm_const_1 (&d
);
20339 set_unique_reg_note (get_last_insn (), REG_EQUAL
,
20340 gen_rtx_fmt_ee (code
, qimode
, op1
, op2
));
20343 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
20344 if op is CONST_VECTOR with all odd elements equal to their
20345 preceding element. */
20348 const_vector_equal_evenodd_p (rtx op
)
20350 machine_mode mode
= GET_MODE (op
);
20351 int i
, nunits
= GET_MODE_NUNITS (mode
);
20352 if (GET_CODE (op
) != CONST_VECTOR
20353 || nunits
!= CONST_VECTOR_NUNITS (op
))
20355 for (i
= 0; i
< nunits
; i
+= 2)
20356 if (CONST_VECTOR_ELT (op
, i
) != CONST_VECTOR_ELT (op
, i
+ 1))
20362 ix86_expand_mul_widen_evenodd (rtx dest
, rtx op1
, rtx op2
,
20363 bool uns_p
, bool odd_p
)
20365 machine_mode mode
= GET_MODE (op1
);
20366 machine_mode wmode
= GET_MODE (dest
);
20368 rtx orig_op1
= op1
, orig_op2
= op2
;
20370 if (!nonimmediate_operand (op1
, mode
))
20371 op1
= force_reg (mode
, op1
);
20372 if (!nonimmediate_operand (op2
, mode
))
20373 op2
= force_reg (mode
, op2
);
20375 /* We only play even/odd games with vectors of SImode. */
20376 gcc_assert (mode
== V4SImode
|| mode
== V8SImode
|| mode
== V16SImode
);
20378 /* If we're looking for the odd results, shift those members down to
20379 the even slots. For some cpus this is faster than a PSHUFD. */
20382 /* For XOP use vpmacsdqh, but only for smult, as it is only
20384 if (TARGET_XOP
&& mode
== V4SImode
&& !uns_p
)
20386 x
= force_reg (wmode
, CONST0_RTX (wmode
));
20387 emit_insn (gen_xop_pmacsdqh (dest
, op1
, op2
, x
));
20391 x
= GEN_INT (GET_MODE_UNIT_BITSIZE (mode
));
20392 if (!const_vector_equal_evenodd_p (orig_op1
))
20393 op1
= expand_binop (wmode
, lshr_optab
, gen_lowpart (wmode
, op1
),
20394 x
, NULL
, 1, OPTAB_DIRECT
);
20395 if (!const_vector_equal_evenodd_p (orig_op2
))
20396 op2
= expand_binop (wmode
, lshr_optab
, gen_lowpart (wmode
, op2
),
20397 x
, NULL
, 1, OPTAB_DIRECT
);
20398 op1
= gen_lowpart (mode
, op1
);
20399 op2
= gen_lowpart (mode
, op2
);
20402 if (mode
== V16SImode
)
20405 x
= gen_vec_widen_umult_even_v16si (dest
, op1
, op2
);
20407 x
= gen_vec_widen_smult_even_v16si (dest
, op1
, op2
);
20409 else if (mode
== V8SImode
)
20412 x
= gen_vec_widen_umult_even_v8si (dest
, op1
, op2
);
20414 x
= gen_vec_widen_smult_even_v8si (dest
, op1
, op2
);
20417 x
= gen_vec_widen_umult_even_v4si (dest
, op1
, op2
);
20418 else if (TARGET_SSE4_1
)
20419 x
= gen_sse4_1_mulv2siv2di3 (dest
, op1
, op2
);
20422 rtx s1
, s2
, t0
, t1
, t2
;
20424 /* The easiest way to implement this without PMULDQ is to go through
20425 the motions as if we are performing a full 64-bit multiply. With
20426 the exception that we need to do less shuffling of the elements. */
20428 /* Compute the sign-extension, aka highparts, of the two operands. */
20429 s1
= ix86_expand_sse_cmp (gen_reg_rtx (mode
), GT
, CONST0_RTX (mode
),
20430 op1
, pc_rtx
, pc_rtx
);
20431 s2
= ix86_expand_sse_cmp (gen_reg_rtx (mode
), GT
, CONST0_RTX (mode
),
20432 op2
, pc_rtx
, pc_rtx
);
20434 /* Multiply LO(A) * HI(B), and vice-versa. */
20435 t1
= gen_reg_rtx (wmode
);
20436 t2
= gen_reg_rtx (wmode
);
20437 emit_insn (gen_vec_widen_umult_even_v4si (t1
, s1
, op2
));
20438 emit_insn (gen_vec_widen_umult_even_v4si (t2
, s2
, op1
));
20440 /* Multiply LO(A) * LO(B). */
20441 t0
= gen_reg_rtx (wmode
);
20442 emit_insn (gen_vec_widen_umult_even_v4si (t0
, op1
, op2
));
20444 /* Combine and shift the highparts into place. */
20445 t1
= expand_binop (wmode
, add_optab
, t1
, t2
, t1
, 1, OPTAB_DIRECT
);
20446 t1
= expand_binop (wmode
, ashl_optab
, t1
, GEN_INT (32), t1
,
20449 /* Combine high and low parts. */
20450 force_expand_binop (wmode
, add_optab
, t0
, t1
, dest
, 1, OPTAB_DIRECT
);
20457 ix86_expand_mul_widen_hilo (rtx dest
, rtx op1
, rtx op2
,
20458 bool uns_p
, bool high_p
)
20460 machine_mode wmode
= GET_MODE (dest
);
20461 machine_mode mode
= GET_MODE (op1
);
20462 rtx t1
, t2
, t3
, t4
, mask
;
20467 t1
= gen_reg_rtx (mode
);
20468 t2
= gen_reg_rtx (mode
);
20469 if (TARGET_XOP
&& !uns_p
)
20471 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
20472 shuffle the elements once so that all elements are in the right
20473 place for immediate use: { A C B D }. */
20474 emit_insn (gen_sse2_pshufd_1 (t1
, op1
, const0_rtx
, const2_rtx
,
20475 const1_rtx
, GEN_INT (3)));
20476 emit_insn (gen_sse2_pshufd_1 (t2
, op2
, const0_rtx
, const2_rtx
,
20477 const1_rtx
, GEN_INT (3)));
20481 /* Put the elements into place for the multiply. */
20482 ix86_expand_vec_interleave (t1
, op1
, op1
, high_p
);
20483 ix86_expand_vec_interleave (t2
, op2
, op2
, high_p
);
20486 ix86_expand_mul_widen_evenodd (dest
, t1
, t2
, uns_p
, high_p
);
20490 /* Shuffle the elements between the lanes. After this we
20491 have { A B E F | C D G H } for each operand. */
20492 t1
= gen_reg_rtx (V4DImode
);
20493 t2
= gen_reg_rtx (V4DImode
);
20494 emit_insn (gen_avx2_permv4di_1 (t1
, gen_lowpart (V4DImode
, op1
),
20495 const0_rtx
, const2_rtx
,
20496 const1_rtx
, GEN_INT (3)));
20497 emit_insn (gen_avx2_permv4di_1 (t2
, gen_lowpart (V4DImode
, op2
),
20498 const0_rtx
, const2_rtx
,
20499 const1_rtx
, GEN_INT (3)));
20501 /* Shuffle the elements within the lanes. After this we
20502 have { A A B B | C C D D } or { E E F F | G G H H }. */
20503 t3
= gen_reg_rtx (V8SImode
);
20504 t4
= gen_reg_rtx (V8SImode
);
20505 mask
= GEN_INT (high_p
20506 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
20507 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
20508 emit_insn (gen_avx2_pshufdv3 (t3
, gen_lowpart (V8SImode
, t1
), mask
));
20509 emit_insn (gen_avx2_pshufdv3 (t4
, gen_lowpart (V8SImode
, t2
), mask
));
20511 ix86_expand_mul_widen_evenodd (dest
, t3
, t4
, uns_p
, false);
20516 t1
= expand_binop (mode
, smul_optab
, op1
, op2
, NULL_RTX
,
20517 uns_p
, OPTAB_DIRECT
);
20518 t2
= expand_binop (mode
,
20519 uns_p
? umul_highpart_optab
: smul_highpart_optab
,
20520 op1
, op2
, NULL_RTX
, uns_p
, OPTAB_DIRECT
);
20521 gcc_assert (t1
&& t2
);
20523 t3
= gen_reg_rtx (mode
);
20524 ix86_expand_vec_interleave (t3
, t1
, t2
, high_p
);
20525 emit_move_insn (dest
, gen_lowpart (wmode
, t3
));
20533 t1
= gen_reg_rtx (wmode
);
20534 t2
= gen_reg_rtx (wmode
);
20535 ix86_expand_sse_unpack (t1
, op1
, uns_p
, high_p
);
20536 ix86_expand_sse_unpack (t2
, op2
, uns_p
, high_p
);
20538 emit_insn (gen_rtx_SET (dest
, gen_rtx_MULT (wmode
, t1
, t2
)));
20542 gcc_unreachable ();
20547 ix86_expand_sse2_mulv4si3 (rtx op0
, rtx op1
, rtx op2
)
20549 rtx res_1
, res_2
, res_3
, res_4
;
20551 res_1
= gen_reg_rtx (V4SImode
);
20552 res_2
= gen_reg_rtx (V4SImode
);
20553 res_3
= gen_reg_rtx (V2DImode
);
20554 res_4
= gen_reg_rtx (V2DImode
);
20555 ix86_expand_mul_widen_evenodd (res_3
, op1
, op2
, true, false);
20556 ix86_expand_mul_widen_evenodd (res_4
, op1
, op2
, true, true);
20558 /* Move the results in element 2 down to element 1; we don't care
20559 what goes in elements 2 and 3. Then we can merge the parts
20560 back together with an interleave.
20562 Note that two other sequences were tried:
20563 (1) Use interleaves at the start instead of psrldq, which allows
20564 us to use a single shufps to merge things back at the end.
20565 (2) Use shufps here to combine the two vectors, then pshufd to
20566 put the elements in the correct order.
20567 In both cases the cost of the reformatting stall was too high
20568 and the overall sequence slower. */
20570 emit_insn (gen_sse2_pshufd_1 (res_1
, gen_lowpart (V4SImode
, res_3
),
20571 const0_rtx
, const2_rtx
,
20572 const0_rtx
, const0_rtx
));
20573 emit_insn (gen_sse2_pshufd_1 (res_2
, gen_lowpart (V4SImode
, res_4
),
20574 const0_rtx
, const2_rtx
,
20575 const0_rtx
, const0_rtx
));
20576 res_1
= emit_insn (gen_vec_interleave_lowv4si (op0
, res_1
, res_2
));
20578 set_unique_reg_note (res_1
, REG_EQUAL
, gen_rtx_MULT (V4SImode
, op1
, op2
));
20582 ix86_expand_sse2_mulvxdi3 (rtx op0
, rtx op1
, rtx op2
)
20584 machine_mode mode
= GET_MODE (op0
);
20585 rtx t1
, t2
, t3
, t4
, t5
, t6
;
20587 if (TARGET_AVX512DQ
&& mode
== V8DImode
)
20588 emit_insn (gen_avx512dq_mulv8di3 (op0
, op1
, op2
));
20589 else if (TARGET_AVX512DQ
&& TARGET_AVX512VL
&& mode
== V4DImode
)
20590 emit_insn (gen_avx512dq_mulv4di3 (op0
, op1
, op2
));
20591 else if (TARGET_AVX512DQ
&& TARGET_AVX512VL
&& mode
== V2DImode
)
20592 emit_insn (gen_avx512dq_mulv2di3 (op0
, op1
, op2
));
20593 else if (TARGET_XOP
&& mode
== V2DImode
)
20595 /* op1: A,B,C,D, op2: E,F,G,H */
20596 op1
= gen_lowpart (V4SImode
, op1
);
20597 op2
= gen_lowpart (V4SImode
, op2
);
20599 t1
= gen_reg_rtx (V4SImode
);
20600 t2
= gen_reg_rtx (V4SImode
);
20601 t3
= gen_reg_rtx (V2DImode
);
20602 t4
= gen_reg_rtx (V2DImode
);
20605 emit_insn (gen_sse2_pshufd_1 (t1
, op1
,
20611 /* t2: (B*E),(A*F),(D*G),(C*H) */
20612 emit_insn (gen_mulv4si3 (t2
, t1
, op2
));
20614 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
20615 emit_insn (gen_xop_phadddq (t3
, t2
));
20617 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
20618 emit_insn (gen_ashlv2di3 (t4
, t3
, GEN_INT (32)));
20620 /* Multiply lower parts and add all */
20621 t5
= gen_reg_rtx (V2DImode
);
20622 emit_insn (gen_vec_widen_umult_even_v4si (t5
,
20623 gen_lowpart (V4SImode
, op1
),
20624 gen_lowpart (V4SImode
, op2
)));
20625 force_expand_binop (mode
, add_optab
, t5
, t4
, op0
, 1, OPTAB_DIRECT
);
20629 machine_mode nmode
;
20630 rtx (*umul
) (rtx
, rtx
, rtx
);
20632 if (mode
== V2DImode
)
20634 umul
= gen_vec_widen_umult_even_v4si
;
20637 else if (mode
== V4DImode
)
20639 umul
= gen_vec_widen_umult_even_v8si
;
20642 else if (mode
== V8DImode
)
20644 umul
= gen_vec_widen_umult_even_v16si
;
20648 gcc_unreachable ();
20651 /* Multiply low parts. */
20652 t1
= gen_reg_rtx (mode
);
20653 emit_insn (umul (t1
, gen_lowpart (nmode
, op1
), gen_lowpart (nmode
, op2
)));
20655 /* Shift input vectors right 32 bits so we can multiply high parts. */
20657 t2
= expand_binop (mode
, lshr_optab
, op1
, t6
, NULL
, 1, OPTAB_DIRECT
);
20658 t3
= expand_binop (mode
, lshr_optab
, op2
, t6
, NULL
, 1, OPTAB_DIRECT
);
20660 /* Multiply high parts by low parts. */
20661 t4
= gen_reg_rtx (mode
);
20662 t5
= gen_reg_rtx (mode
);
20663 emit_insn (umul (t4
, gen_lowpart (nmode
, t2
), gen_lowpart (nmode
, op2
)));
20664 emit_insn (umul (t5
, gen_lowpart (nmode
, t3
), gen_lowpart (nmode
, op1
)));
20666 /* Combine and shift the highparts back. */
20667 t4
= expand_binop (mode
, add_optab
, t4
, t5
, t4
, 1, OPTAB_DIRECT
);
20668 t4
= expand_binop (mode
, ashl_optab
, t4
, t6
, t4
, 1, OPTAB_DIRECT
);
20670 /* Combine high and low parts. */
20671 force_expand_binop (mode
, add_optab
, t1
, t4
, op0
, 1, OPTAB_DIRECT
);
20674 set_unique_reg_note (get_last_insn (), REG_EQUAL
,
20675 gen_rtx_MULT (mode
, op1
, op2
));
20678 /* Return 1 if control tansfer instruction INSN
20679 should be encoded with notrack prefix. */
20682 ix86_notrack_prefixed_insn_p (rtx_insn
*insn
)
20684 if (!insn
|| !((flag_cf_protection
& CF_BRANCH
)))
20689 rtx call
= get_call_rtx_from (insn
);
20690 gcc_assert (call
!= NULL_RTX
);
20691 rtx addr
= XEXP (call
, 0);
20693 /* Do not emit 'notrack' if it's not an indirect call. */
20695 && GET_CODE (XEXP (addr
, 0)) == SYMBOL_REF
)
20698 return find_reg_note (insn
, REG_CALL_NOCF_CHECK
, 0);
20701 if (JUMP_P (insn
) && !flag_cet_switch
)
20703 rtx target
= JUMP_LABEL (insn
);
20704 if (target
== NULL_RTX
|| ANY_RETURN_P (target
))
20707 /* Check the jump is a switch table. */
20708 rtx_insn
*label
= as_a
<rtx_insn
*> (target
);
20709 rtx_insn
*table
= next_insn (label
);
20710 if (table
== NULL_RTX
|| !JUMP_TABLE_DATA_P (table
))
20718 /* Calculate integer abs() using only SSE2 instructions. */
20721 ix86_expand_sse2_abs (rtx target
, rtx input
)
20723 machine_mode mode
= GET_MODE (target
);
20730 /* For 64-bit signed integer X, with SSE4.2 use
20731 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
20732 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
20733 32 and use logical instead of arithmetic right shift (which is
20734 unimplemented) and subtract. */
20737 tmp0
= gen_reg_rtx (mode
);
20738 tmp1
= gen_reg_rtx (mode
);
20739 emit_move_insn (tmp1
, CONST0_RTX (mode
));
20740 if (mode
== E_V2DImode
)
20741 emit_insn (gen_sse4_2_gtv2di3 (tmp0
, tmp1
, input
));
20743 emit_insn (gen_avx2_gtv4di3 (tmp0
, tmp1
, input
));
20747 tmp0
= expand_simple_binop (mode
, LSHIFTRT
, input
,
20748 GEN_INT (GET_MODE_UNIT_BITSIZE (mode
)
20749 - 1), NULL
, 0, OPTAB_DIRECT
);
20750 tmp0
= expand_simple_unop (mode
, NEG
, tmp0
, NULL
, false);
20753 tmp1
= expand_simple_binop (mode
, XOR
, tmp0
, input
,
20754 NULL
, 0, OPTAB_DIRECT
);
20755 x
= expand_simple_binop (mode
, MINUS
, tmp1
, tmp0
,
20756 target
, 0, OPTAB_DIRECT
);
20760 /* For 32-bit signed integer X, the best way to calculate the absolute
20761 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
20762 tmp0
= expand_simple_binop (mode
, ASHIFTRT
, input
,
20763 GEN_INT (GET_MODE_UNIT_BITSIZE (mode
) - 1),
20764 NULL
, 0, OPTAB_DIRECT
);
20765 tmp1
= expand_simple_binop (mode
, XOR
, tmp0
, input
,
20766 NULL
, 0, OPTAB_DIRECT
);
20767 x
= expand_simple_binop (mode
, MINUS
, tmp1
, tmp0
,
20768 target
, 0, OPTAB_DIRECT
);
20772 /* For 16-bit signed integer X, the best way to calculate the absolute
20773 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
20774 tmp0
= expand_unop (mode
, neg_optab
, input
, NULL_RTX
, 0);
20776 x
= expand_simple_binop (mode
, SMAX
, tmp0
, input
,
20777 target
, 0, OPTAB_DIRECT
);
20781 /* For 8-bit signed integer X, the best way to calculate the absolute
20782 value of X is min ((unsigned char) X, (unsigned char) (-X)),
20783 as SSE2 provides the PMINUB insn. */
20784 tmp0
= expand_unop (mode
, neg_optab
, input
, NULL_RTX
, 0);
20786 x
= expand_simple_binop (V16QImode
, UMIN
, tmp0
, input
,
20787 target
, 0, OPTAB_DIRECT
);
20791 gcc_unreachable ();
20795 emit_move_insn (target
, x
);
20798 /* Expand an extract from a vector register through pextr insn.
20799 Return true if successful. */
20802 ix86_expand_pextr (rtx
*operands
)
20804 rtx dst
= operands
[0];
20805 rtx src
= operands
[1];
20807 unsigned int size
= INTVAL (operands
[2]);
20808 unsigned int pos
= INTVAL (operands
[3]);
20810 if (SUBREG_P (dst
))
20812 /* Reject non-lowpart subregs. */
20813 if (SUBREG_BYTE (dst
) > 0)
20815 dst
= SUBREG_REG (dst
);
20818 if (SUBREG_P (src
))
20820 pos
+= SUBREG_BYTE (src
) * BITS_PER_UNIT
;
20821 src
= SUBREG_REG (src
);
20824 switch (GET_MODE (src
))
20832 machine_mode srcmode
, dstmode
;
20835 if (!int_mode_for_size (size
, 0).exists (&dstmode
))
20841 if (!TARGET_SSE4_1
)
20843 srcmode
= V16QImode
;
20849 srcmode
= V8HImode
;
20853 if (!TARGET_SSE4_1
)
20855 srcmode
= V4SImode
;
20859 gcc_assert (TARGET_64BIT
);
20860 if (!TARGET_SSE4_1
)
20862 srcmode
= V2DImode
;
20869 /* Reject extractions from misaligned positions. */
20870 if (pos
& (size
-1))
20873 if (GET_MODE (dst
) == dstmode
)
20876 d
= gen_reg_rtx (dstmode
);
20878 /* Construct insn pattern. */
20879 pat
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (pos
/ size
)));
20880 pat
= gen_rtx_VEC_SELECT (dstmode
, gen_lowpart (srcmode
, src
), pat
);
20882 /* Let the rtl optimizers know about the zero extension performed. */
20883 if (dstmode
== QImode
|| dstmode
== HImode
)
20885 pat
= gen_rtx_ZERO_EXTEND (SImode
, pat
);
20886 d
= gen_lowpart (SImode
, d
);
20889 emit_insn (gen_rtx_SET (d
, pat
));
20892 emit_move_insn (dst
, gen_lowpart (GET_MODE (dst
), d
));
20901 /* Expand an insert into a vector register through pinsr insn.
20902 Return true if successful. */
20905 ix86_expand_pinsr (rtx
*operands
)
20907 rtx dst
= operands
[0];
20908 rtx src
= operands
[3];
20910 unsigned int size
= INTVAL (operands
[1]);
20911 unsigned int pos
= INTVAL (operands
[2]);
20913 if (SUBREG_P (dst
))
20915 pos
+= SUBREG_BYTE (dst
) * BITS_PER_UNIT
;
20916 dst
= SUBREG_REG (dst
);
20919 switch (GET_MODE (dst
))
20927 machine_mode srcmode
, dstmode
;
20928 rtx (*pinsr
)(rtx
, rtx
, rtx
, rtx
);
20931 if (!int_mode_for_size (size
, 0).exists (&srcmode
))
20937 if (!TARGET_SSE4_1
)
20939 dstmode
= V16QImode
;
20940 pinsr
= gen_sse4_1_pinsrb
;
20946 dstmode
= V8HImode
;
20947 pinsr
= gen_sse2_pinsrw
;
20951 if (!TARGET_SSE4_1
)
20953 dstmode
= V4SImode
;
20954 pinsr
= gen_sse4_1_pinsrd
;
20958 gcc_assert (TARGET_64BIT
);
20959 if (!TARGET_SSE4_1
)
20961 dstmode
= V2DImode
;
20962 pinsr
= gen_sse4_1_pinsrq
;
20969 /* Reject insertions to misaligned positions. */
20970 if (pos
& (size
-1))
20973 if (SUBREG_P (src
))
20975 unsigned int srcpos
= SUBREG_BYTE (src
);
20981 extr_ops
[0] = gen_reg_rtx (srcmode
);
20982 extr_ops
[1] = gen_lowpart (srcmode
, SUBREG_REG (src
));
20983 extr_ops
[2] = GEN_INT (size
);
20984 extr_ops
[3] = GEN_INT (srcpos
* BITS_PER_UNIT
);
20986 if (!ix86_expand_pextr (extr_ops
))
20992 src
= gen_lowpart (srcmode
, SUBREG_REG (src
));
20995 if (GET_MODE (dst
) == dstmode
)
20998 d
= gen_reg_rtx (dstmode
);
21000 emit_insn (pinsr (d
, gen_lowpart (dstmode
, dst
),
21001 gen_lowpart (srcmode
, src
),
21002 GEN_INT (1 << (pos
/ size
))));
21004 emit_move_insn (dst
, gen_lowpart (GET_MODE (dst
), d
));
21013 /* All CPUs prefer to avoid cross-lane operations so perform reductions
21014 upper against lower halves up to SSE reg size. */
21017 ix86_split_reduction (machine_mode mode
)
21019 /* Reduce lowpart against highpart until we reach SSE reg width to
21020 avoid cross-lane operations. */
21046 /* Generate call to __divmoddi4. */
21049 ix86_expand_divmod_libfunc (rtx libfunc
, machine_mode mode
,
21051 rtx
*quot_p
, rtx
*rem_p
)
21053 rtx rem
= assign_386_stack_local (mode
, SLOT_TEMP
);
21055 rtx quot
= emit_library_call_value (libfunc
, NULL_RTX
, LCT_NORMAL
,
21056 mode
, op0
, mode
, op1
, mode
,
21057 XEXP (rem
, 0), Pmode
);
21062 #include "gt-i386-expand.h"