/* Copyright (C) 1988-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */
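/* For example, splitting a DImode register on a 32-bit target yields its
   SImode low part and high part, while a DImode memory operand is split
   into two SImode memory references at byte offsets 0 and 4.  */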
void
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuse to split volatile memory addresses,
         but we still have to handle it.  */
      if (MEM_P (op))
        {
          if (mem_op && rtx_equal_p (op, mem_op))
            {
              lo_half[num] = lo_half[mem_num];
              hi_half[num] = hi_half[mem_num];
            }
          else
            {
              mem_op = op;
              mem_num = num;
              lo_half[num] = adjust_address (op, half_mode, 0);
              hi_half[num] = adjust_address (op, half_mode, byte);
            }
        }
      else
        {
          lo_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), 0);
          hi_half[num] = simplify_gen_subreg (half_mode, op,
                                              GET_MODE (op) == VOIDmode
                                              ? mode : GET_MODE (op), byte);
        }
    }
}
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */
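/* Note that the xor form clobbers the flags register, so when it is used
   the emitted pattern is a PARALLEL of the set and a flags clobber, e.g.
   for clearing %eax:
     (parallel [(set (reg:SI ax) (const_int 0))
                (clobber (reg:CC flags))])  */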
void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  /* Avoid complex sets of likely spilled hard registers before reload.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
    {
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;
      ix86_expand_move (mode, operands);
      operands[0] = op0;
      operands[1] = tmp;
      op1 = tmp;
    }

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
        break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
        {
          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
                                (TARGET_64BIT
                                 ? UNSPEC_GOTPCREL
                                 : UNSPEC_GOT));
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());
        }
      else
        {
          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
          if (tmp)
            {
              op1 = tmp;
              if (!addend)
                break;
            }
          else
            {
              op1 = operands[1];
              break;
            }
        }

      if (addend)
        {
          op1 = force_operand (op1, NULL_RTX);
          op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                     op0, 1, OPTAB_DIRECT);
        }
      else
        op1 = force_operand (op1, op0);

      if (op1 == op0)
        return;

      op1 = convert_to_mode (mode, op1, 1);

    default:
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      if (TARGET_MACHO && !TARGET_64BIT)
        {
#if TARGET_MACHO
          /* dynamic-no-pic */
          if (MACHOPIC_INDIRECT)
            {
              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);
              if (MACHOPIC_PURE)
                op1 = machopic_legitimize_pic_address (op1, mode,
                                                       temp == op1 ? 0 : temp);
            }
          if (op0 != op1 && GET_CODE (op0) != MEM)
            {
              rtx insn = gen_rtx_SET (op0, op1);
              emit_insn (insn);
              return;
            }
          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);
          else
            {
              rtx temp = op0;
              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);
              if (temp == op0)
                return;
              op1 = temp;
            }
#endif
        }
      else
        {
          if (MEM_P (op0))
            op1 = force_reg (mode, op1);
          else if (!(TARGET_64BIT
                     && x86_64_movabs_operand (op1, DImode)))
            {
              rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
              op1 = legitimize_pic_address (op1, reg);
              if (op0 == op1)
                return;
              op1 = convert_to_mode (mode, op1, 1);
            }
        }
    }
  else
    {
      if (MEM_P (op0)
          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))
          && MEM_P (op1))
        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
         to get them CSEed.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
          && optimize)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ()
          && CONST_DOUBLE_P (op1))
        {
          /* If we are loading a floating point constant to a register,
             force the value to memory now, since we'll get better code
             out the back end.  */

          op1 = validize_mem (force_const_mem (mode, op1));
          if (!register_operand (op0, mode))
            {
              rtx temp = gen_reg_rtx (mode);
              emit_insn (gen_rtx_SET (temp, op1));
              emit_move_insn (op0, temp);
              return;
            }
        }
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
          || (SUBREG_P (op1)
              && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_P (op0)
              && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
        {
          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
          if (r)
            r = validize_mem (r);
          else
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
        }
      else
        op1 = validize_mem (force_const_mem (mode, op1));
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
        op1 = force_reg (mode, op1);

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
/* Split 32-byte AVX unaligned load and store if needed.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      if (mode != V32QImode)
        {
          if (!MEM_P (op0))
            {
              orig_op0 = op0;
              op0 = gen_reg_rtx (V32QImode);
            }
          else
            op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);
          mode = V32QImode;
        }
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
       movaps reg, reg
     else
       movss reg, reg
     if (x86_sse_partial_reg_dependency == true)
       movapd reg, reg
     else
       movsd reg, reg

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)
     else
       movsd mem, reg

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
       movups mem, reg

     if (x86_sse_partial_reg_dependency == true)
       {
         xorps  reg, reg
         movlps mem, reg
         movhps mem+8, reg
       }
     else
       {
         movlps mem, reg
         movhps mem+8, reg
       }

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
       movupd mem, reg

     if (x86_sse_split_regs == true)
       {
         movlpd mem, reg
         movhpd mem+8, reg
       }
     else
       {
         movsd  mem, reg
         unpcklpd reg, reg
       }  */
void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
        ix86_avx256_split_vector_move_misalign (op0, op1);
      else
        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
        emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          rtx zero;

          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)
            {
              emit_clobber (op0);
              zero = op0;
            }
          else
            {
              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);
            }

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));
        }
      else
        {
          rtx t;

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);
          else
            t = op0;

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));
          else
            emit_clobber (t);

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));
        }
    }
  else if (MEM_P (op0))
    {
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));
        }
      else
        {
          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
        }
    }
  else
    gcc_unreachable ();
}
/* Move bits 64:95 to bits 32:63.  */

static void
ix86_move_vector_high_sse_to_mmx (rtx op)
{
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
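  /* The selector picks SImode element 2 of the source into element 1 of
     the destination, i.e. bits 64:95 land in bits 32:63; the remaining
     lanes of the selector are don't-care for the 64-bit result.  */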
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
  emit_insn (insn);
}
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */
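/* Packing through SSE leaves the packed form of operand 1 in bits 0:31 of
   the 128-bit destination and that of operand 2 in bits 64:95, so the
   final ix86_move_vector_high_sse_to_mmx call moves the latter down to
   bits 32:63 to form the 64-bit MMX result.  */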
void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                            nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
                                            nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
                                                    op1, op2));
  emit_insn (insn);

  ix86_move_vector_high_sse_to_mmx (op0);
}
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
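/* As with ix86_split_mmx_pack, the 64-bit MMX operands are operated on as
   the low halves of SSE registers: the two sources are concatenated and a
   vec_select interleaves their elements; for the high-part variants the
   relevant bits are then moved down from bits 64:127.  */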
void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  switch (mode)
    {
    case E_V8QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (16,
                                          GEN_INT (0), GEN_INT (16),
                                          GEN_INT (1), GEN_INT (17),
                                          GEN_INT (2), GEN_INT (18),
                                          GEN_INT (3), GEN_INT (19),
                                          GEN_INT (4), GEN_INT (20),
                                          GEN_INT (5), GEN_INT (21),
                                          GEN_INT (6), GEN_INT (22),
                                          GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (8,
                                          GEN_INT (0), GEN_INT (8),
                                          GEN_INT (1), GEN_INT (9),
                                          GEN_INT (2), GEN_INT (10),
                                          GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4,
                                          GEN_INT (0), GEN_INT (4),
                                          GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  if (high_p)
    {
      /* Move bits 64:127 to bits 0:63.  */
      mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                          GEN_INT (0), GEN_INT (0)));
      dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */

static bool
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
    return false;

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
    return false;
  if (rtx_equal_p (dst, src2))
    return true;

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
    return false;
  if (immediate_operand (src1, mode))
    return true;

  /* Lowest priority is that memory references should come second.  */
  if (MEM_P (src2))
    return false;
  if (MEM_P (src1))
    return true;

  return false;
}
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
        {
          src2 = force_reg (mode, src2);
          src1 = src2;
        }
      else if (rtx_equal_p (dst, src1))
        src2 = force_reg (mode, src2);
      else
        src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
/* Similarly, but assume that the destination has already been
   set up properly.  */

void
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
                                    machine_mode mode, rtx operands[])
{
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);
}
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, then just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */

void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
                             rtx operands[])
{
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1))
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
                                     rtx operands[])
{
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
        {
        case E_V4SFmode:
        case E_V8SFmode:
        case E_V16SFmode:
        case E_V2DFmode:
        case E_V4DFmode:
        case E_V8DFmode:
          dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
          if (GET_CODE (op2) == CONST_VECTOR)
            {
              op2 = gen_lowpart (GET_MODE (dst), op2);
              op2 = force_reg (GET_MODE (dst), op2);
            }
          else
            {
              op1 = operands[1];
              op2 = SUBREG_REG (operands[2]);
              if (!vector_operand (op2, GET_MODE (dst)))
                op2 = force_reg (GET_MODE (dst), op2);
            }
          op1 = SUBREG_REG (op1);
          if (!vector_operand (op1, GET_MODE (dst)))
            op1 = force_reg (GET_MODE (dst), op1);
          emit_insn (gen_rtx_SET (dst,
                                  gen_rtx_fmt_ee (code, GET_MODE (dst),
                                                  op1, op2)));
          emit_move_insn (operands[0], gen_lowpart (mode, dst));
          return;
        default:
          break;
        }
    }
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
                                          operands[2])));
}
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */

bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
                         rtx operands[3])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
      && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
            && (mode == HImode
                || mode == SImode
                || (TARGET_64BIT && mode == DImode))
            && satisfies_constraint_L (src2));

  return true;
}
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, then just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */

void
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
                            rtx operands[])
{
  bool matching_memory = false;
  rtx src, dst, op, clob;

  dst = operands[0];
  src = operands[1];

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst))
    {
      if (rtx_equal_p (dst, src))
        matching_memory = true;
      else
        dst = gen_reg_rtx (mode);
    }

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

  if (code == NOT)
    emit_insn (op);
  else
    {
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
/* Predict just emitted jump instruction to be taken with probability PROB.  */

static void
predict_jump (int prob)
{
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
}
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */
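/* The fast path is selected by a single IOR of dividend and divisor
   followed by a TEST against -0x100: if no bit above the low byte is set,
   both values fit in 8 bits and a HImode-by-QImode udivmod leaves the
   quotient in AL and the remainder in AH.  */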
void
ix86_split_idivmod (machine_mode mode, rtx operands[],
                    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
        {
          if (GET_MODE (operands[1]) == SImode)
            gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
          else
            gen_divmod4_1
              = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
        }
      else
        gen_divmod4_1
          = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divimod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
                                 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
                               gen_rtx_LABEL_REF (VOIDmode, qimode_label),
                               pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divimod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
                            operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
        div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
        mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
                               GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
                    (operands[0], tmp1,
                     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */

void
ix86_emit_binop (enum rtx_code code, machine_mode mode,
                 rtx dst, rtx src)
{
  rtx op, clob;

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));

  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
}
/* Return true if regno1 def is nearest to the insn.  */

static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
        {
          prev = PREV_INSN (prev);
          continue;
        }
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
        return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
        return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  */
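/* For example, "lea 0x4(%rbx,%rcx,2), %rax" can become a move of %rcx into
   %rax, a multiply-by-2 of %rax (emitted as MULT but intended to become a
   left shift), an add of %rbx and finally an add of the displacement 4,
   all of which execute on the ALU.  */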
void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
        {
          /* If we have a case r1 = r1 + C * r2 then we
             should use multiplication which is very
             expensive.  Assume cost model is wrong if we
             have such case here.  */
          gcc_assert (regno2 != regno0);

          for (adds = parts.scale; adds > 0; adds--)
            ix86_emit_binop (PLUS, mode, target, parts.index);
        }
      else
        {
          /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));

          /* Use shift for scaling, but emit it as MULT instead
             to avoid it being immediately peephole2 optimized back
             into lea.  */
          ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

          if (parts.base)
            ix86_emit_binop (PLUS, mode, target, parts.base);

          if (parts.disp && parts.disp != const0_rtx)
            ix86_emit_binop (PLUS, mode, target, parts.disp);
        }
    }
  else if (!parts.base && !parts.index)
    {
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
        {
          if (regno0 != regno2)
            emit_insn (gen_rtx_SET (target, parts.index));
        }
      else if (!parts.index)
        {
          if (regno0 != regno1)
            emit_insn (gen_rtx_SET (target, parts.base));
        }
      else
        {
          if (regno0 == regno1)
            tmp = parts.index;
          else if (regno0 == regno2)
            tmp = parts.base;
          else
            {
              rtx tmp1;

              /* Find better operand for SET instruction, depending
                 on which definition is farther from the insn.  */
              if (find_nearest_reg_def (insn, regno1, regno2))
                tmp = parts.index, tmp1 = parts.base;
              else
                tmp = parts.base, tmp1 = parts.index;

              emit_insn (gen_rtx_SET (target, tmp));

              if (parts.disp && parts.disp != const0_rtx)
                ix86_emit_binop (PLUS, mode, target, parts.disp);

              ix86_emit_binop (PLUS, mode, target, tmp1);
              return;
            }

          ix86_emit_binop (PLUS, mode, target, tmp);
        }

      if (parts.disp && parts.disp != const0_rtx)
        ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
        emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
        emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
        emit_insn (gen_sse_movss (value, value, input));
      else
        emit_insn (gen_sse2_movsd (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  emit_insn (gen_xorv4si3 (value, value, large));
}
static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
                                                 machine_mode mode, rtx target,
                                                 rtx var, int one_var);
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */
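/* The recipe: interleave the two 32-bit halves with the exponent words
   0x43300000 and 0x45300000, producing two doubles equal to 2^52 + lo and
   2^84 + hi * 2^32; subtracting those biases leaves lo and hi * 2^32, and
   summing the pair rounds to the final DFmode result.  */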
void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
                            gen_rtvec (4, GEN_INT (0x43300000UL),
                                       GEN_INT (0x45300000UL),
                                       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
/* Not used, but eases macroization of patterns.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}

static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
                           NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);

  if (x != target)
    emit_move_insn (target, x);
}
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.  */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
                           0, OPTAB_DIRECT);

  if (x != target)
    emit_move_insn (target, x);
}
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
                                NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
                                NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
                               0, OPTAB_DIRECT);
  fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
                               0, OPTAB_DIRECT);
  if (!rtx_equal_p (target, fp_hi))
    emit_move_insn (target, fp_hi);
}
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
                                OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
                                NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
                                OPTAB_DIRECT);
  tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
                                OPTAB_DIRECT);
  if (tmp[7] != target)
    emit_move_insn (target, tmp[7]);
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
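/* In other words, for lanes where VAL >= 0x1p31 the returned value is
   VAL - 0x1p31 and the corresponding lane of *XORP has bit 31 set, so the
   caller can XOR the signed conversion result with *XORP to recover the
   unsigned result.  */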
rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
                                0, OPTAB_DIRECT);
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
                                 gen_lowpart (intmode, tmp[0]),
                                 GEN_INT (31), NULL_RTX, 0,
                                 OPTAB_DIRECT);
  else
    {
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
                                   gen_lowpart (intmode, tmp[0]),
                                   two31, NULL_RTX, 0,
                                   OPTAB_DIRECT);
    }
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
                              0, OPTAB_DIRECT);
}
/* Generate code for floating point ABS or NEG.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                                rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  if (vector_mode || mode == TFmode)
    use_sse = true;
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
        vmode = V4SFmode;
      else if (mode == DFmode)
        vmode = V2DFmode;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
         Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
        par = gen_rtvec (2, set, use);
      else
        {
          clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
          par = gen_rtvec (3, set, use, clob);
        }
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.  */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
                               rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
        {
          set = gen_int_mode (0x7fffffff, SImode);
          absneg_op = AND;
        }
      else
        {
          set = gen_int_mode (0x80000000, SImode);
          absneg_op = XOR;
        }
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
        {
          dst = gen_lowpart (DImode, operands[0]);
          dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

          if (code == ABS)
            set = const0_rtx;
          else
            set = gen_rtx_NOT (DImode, dst);
        }
      else
        {
          dst = gen_highpart (SImode, operands[0]);

          if (code == ABS)
            {
              set = gen_int_mode (0x7fffffff, SImode);
              absneg_op = AND;
            }
          else
            {
              set = gen_int_mode (0x80000000, SImode);
              absneg_op = XOR;
            }
          set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
        }
      break;

    case E_XFmode:
      dst = gen_rtx_REG (SImode,
                         REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
        {
          set = GEN_INT (0x7fff);
          absneg_op = AND;
        }
      else
        {
          set = GEN_INT (0x8000);
          absneg_op = XOR;
        }
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
/* Expand a copysign operation.  Special case operand 0 being a constant.  */
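/* The result is assembled with bit operations: the sign bit is taken from
   operands[2] using a sign-bit mask, and the remaining bits come from
   operands[1], which is reduced to its absolute value when it is a
   constant.  */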
void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, op1, mask;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else if (mode == TFmode)
    vmode = mode;
  else
    gcc_unreachable ();

  mask = ix86_build_signbit_mask (vmode, 0, 0);

  if (CONST_DOUBLE_P (op0))
    {
      if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
        op0 = simplify_unary_operation (ABS, mode, op0, mode);

      if (mode == SFmode || mode == DFmode)
        {
          if (op0 == CONST0_RTX (mode))
            op0 = CONST0_RTX (vmode);
          else
            {
              rtx v = ix86_build_const_vector (vmode, false, op0);

              op0 = force_reg (vmode, v);
            }
        }
      else if (op0 != CONST0_RTX (mode))
        op0 = force_reg (mode, op0);

      emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
    }
  else
    {
      rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);

      emit_insn (gen_copysign3_var
                 (mode, dest, NULL_RTX, op0, op1, nmask, mask));
    }
}
/* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
   be a constant, and so has already been expanded into a vector constant.  */

void
ix86_split_copysign_const (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, mask, x;

  dest = operands[0];
  op0 = operands[1];
  mask = operands[3];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  dest = lowpart_subreg (vmode, dest, mode);
  x = gen_rtx_AND (vmode, dest, mask);
  emit_insn (gen_rtx_SET (dest, x));

  if (op0 != CONST0_RTX (vmode))
    {
      x = gen_rtx_IOR (vmode, dest, op0);
      emit_insn (gen_rtx_SET (dest, x));
    }
}
/* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
   so we have to do two masks.  */

void
ix86_split_copysign_var (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, scratch, op0, op1, mask, nmask, x;

  dest = operands[0];
  scratch = operands[1];
  op0 = operands[2];
  op1 = operands[3];
  nmask = operands[4];
  mask = operands[5];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  if (rtx_equal_p (op0, op1))
    {
      /* Shouldn't happen often (it's useless, obviously), but when it does
         we'd generate incorrect code if we continue below.  */
      emit_move_insn (dest, op0);
      return;
    }

  if (REG_P (mask) && REGNO (dest) == REGNO (mask))     /* alternative 0 */
    {
      gcc_assert (REGNO (op1) == REGNO (scratch));

      x = gen_rtx_AND (vmode, scratch, mask);
      emit_insn (gen_rtx_SET (scratch, x));

      dest = mask;
      op0 = lowpart_subreg (vmode, op0, mode);
      x = gen_rtx_NOT (vmode, dest);
      x = gen_rtx_AND (vmode, x, op0);
      emit_insn (gen_rtx_SET (dest, x));
    }
  else
    {
      if (REGNO (op1) == REGNO (scratch))               /* alternative 1,3 */
        {
          x = gen_rtx_AND (vmode, scratch, mask);
        }
      else                                              /* alternative 2,4 */
        {
          gcc_assert (REGNO (mask) == REGNO (scratch));
          op1 = lowpart_subreg (vmode, op1, mode);
          x = gen_rtx_AND (vmode, scratch, op1);
        }
      emit_insn (gen_rtx_SET (scratch, x));

      if (REGNO (op0) == REGNO (dest))                  /* alternative 1,2 */
        {
          dest = lowpart_subreg (vmode, op0, mode);
          x = gen_rtx_AND (vmode, dest, nmask);
        }
      else                                              /* alternative 3,4 */
        {
          gcc_assert (REGNO (nmask) == REGNO (dest));
          dest = nmask;
          op0 = lowpart_subreg (vmode, op0, mode);
          x = gen_rtx_AND (vmode, dest, op0);
        }
      emit_insn (gen_rtx_SET (dest, x));
    }

  x = gen_rtx_IOR (vmode, dest, scratch);
  emit_insn (gen_rtx_SET (dest, x));
}
/* Expand an xorsign operation.  */

void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, op1, mask;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    gcc_unreachable ();

  mask = ix86_build_signbit_mask (vmode, 0, 0);

  emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
}
/* Deconstruct an xorsign operation into bit masks.  */

void
ix86_split_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, op0, mask, x;

  dest = operands[0];
  op0 = operands[1];
  mask = operands[3];

  mode = GET_MODE (dest);
  vmode = GET_MODE (mask);

  dest = lowpart_subreg (vmode, dest, mode);
  x = gen_rtx_AND (vmode, dest, mask);
  emit_insn (gen_rtx_SET (dest, x));

  op0 = lowpart_subreg (vmode, op0, mode);
  x = gen_rtx_XOR (vmode, dest, op0);
  emit_insn (gen_rtx_SET (dest, x));
}
static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparsion with boolean result, transform
     it using ptest instruction.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;

      gcc_assert (code == EQ || code == NE);
      /* Generate XOR since we can't check that one operand is zero vector.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
      tmp = gen_lowpart (p_mode, tmp);
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
                              gen_rtx_UNSPEC (CCmode,
                                              gen_rtvec (2, tmp, tmp),
                                              UNSPEC_PTEST)));
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_DImode:
      if (TARGET_64BIT)
        goto simple;
      /* For 32-bit target DI comparison may be performed on
         SSE registers.  To allow this we should avoid split
         to SI mode which is achieved by doing xor in DI mode
         and then comparing with zero (which is recognized by
         STV pass).  We don't compare using xor when optimizing
         for size.  */
      if (!optimize_insn_for_size_p ()
          && TARGET_STV
          && (code == EQ || code == NE))
        {
          op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
          op1 = const0_rtx;
        }
      /* FALLTHRU */
    case E_TImode:
      /* Expand DImode branch into multiple compare+branch.  */
      {
        rtx lo[2], hi[2];
        rtx_code_label *label2;
        enum rtx_code code1, code2, code3;
        machine_mode submode;

        if (CONSTANT_P (op0) && !CONSTANT_P (op1))
          {
            std::swap (op0, op1);
            code = swap_condition (code);
          }

        split_double_mode (mode, &op0, 1, lo+0, hi+0);
        split_double_mode (mode, &op1, 1, lo+1, hi+1);

        submode = mode == DImode ? SImode : DImode;

        /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
           avoid two branches.  This costs one extra insn, so disable when
           optimizing for size.  */

        if ((code == EQ || code == NE)
            && (!optimize_insn_for_size_p ()
                || hi[1] == const0_rtx || lo[1] == const0_rtx))
          {
            rtx xor0, xor1;

            xor1 = hi[0];
            if (hi[1] != const0_rtx)
              xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
                                   NULL_RTX, 0, OPTAB_WIDEN);

            xor0 = lo[0];
            if (lo[1] != const0_rtx)
              xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
                                   NULL_RTX, 0, OPTAB_WIDEN);

            tmp = expand_binop (submode, ior_optab, xor1, xor0,
                                NULL_RTX, 0, OPTAB_WIDEN);

            ix86_expand_branch (code, tmp, const0_rtx, label);
            return;
          }

        /* Otherwise, if we are doing less-than or greater-or-equal-than,
           op1 is a constant and the low word is zero, then we can just
           examine the high word.  Similarly for low word -1 and
           less-or-equal-than or greater-than.  */

        if (CONST_INT_P (hi[1]))
          switch (code)
            {
            case LT: case LTU: case GE: case GEU:
              if (lo[1] == const0_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            case LE: case LEU: case GT: case GTU:
              if (lo[1] == constm1_rtx)
                {
                  ix86_expand_branch (code, hi[0], hi[1], label);
                  return;
                }
              break;
            default:
              break;
            }

        /* Emulate comparisons that do not depend on Zero flag with
           double-word subtraction.  Note that only Overflow, Sign
           and Carry flags are valid, so swap arguments and condition
           of comparisons that would otherwise test Zero flag.  */

        switch (code)
          {
          case LE: case LEU: case GT: case GTU:
            std::swap (lo[0], lo[1]);
            std::swap (hi[0], hi[1]);
            code = swap_condition (code);
            /* FALLTHRU */

          case LT: case LTU: case GE: case GEU:
            {
              bool uns = (code == LTU || code == GEU);
              rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
                = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

              if (!nonimmediate_operand (lo[0], submode))
                lo[0] = force_reg (submode, lo[0]);
              if (!x86_64_general_operand (lo[1], submode))
                lo[1] = force_reg (submode, lo[1]);

              if (!register_operand (hi[0], submode))
                hi[0] = force_reg (submode, hi[0]);
              if ((uns && !nonimmediate_operand (hi[1], submode))
                  || (!uns && !x86_64_general_operand (hi[1], submode)))
                hi[1] = force_reg (submode, hi[1]);

              emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

              tmp = gen_rtx_SCRATCH (submode);
              emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

              tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
              ix86_expand_branch (code, tmp, const0_rtx, label);
              return;
            }

          default:
            break;
          }

        /* Otherwise, we need two or three jumps.  */

        label2 = gen_label_rtx ();

        code1 = code;
        code2 = swap_condition (code);
        code3 = unsigned_condition (code);

        switch (code)
          {
          case LT: case GT: case LTU: case GTU:
            break;

          case LE:   code1 = LT;  code2 = GT;  break;
          case GE:   code1 = GT;  code2 = LT;  break;
          case LEU:  code1 = LTU; code2 = GTU; break;
          case GEU:  code1 = GTU; code2 = LTU; break;

          case EQ:   code1 = UNKNOWN; code2 = NE;  break;
          case NE:   code2 = UNKNOWN; break;

          default:
            gcc_unreachable ();
          }

        /*
         * a < b =>
         *    if (hi(a) < hi(b)) goto true;
         *    if (hi(a) > hi(b)) goto false;
         *    if (lo(a) < lo(b)) goto true;
         *  false:
         */

        if (code1 != UNKNOWN)
          ix86_expand_branch (code1, hi[0], hi[1], label);
        if (code2 != UNKNOWN)
          ix86_expand_branch (code2, hi[0], hi[1], label2);

        ix86_expand_branch (code3, lo[0], lo[1], label);

        if (code2 != UNKNOWN)
          emit_label (label2);
        return;
      }

    default:
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
/* Figure out whether to use unordered fp comparisons.  */

static bool
ix86_unordered_fp_compare (enum rtx_code code)
{
  if (!TARGET_IEEE_FP)
    return false;

  switch (code)
    {
    case LT: case LE: case GT: case GE: case LTGT:
      return false;

    case EQ: case NE:
    case UNORDERED: case ORDERED:
    case UNLT: case UNLE: case UNGT: case UNGE: case UNEQ:
      return true;

    default:
      gcc_unreachable ();
    }
}
/* Return a comparison we can do and that it is equivalent to
   swap_condition (code) apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
{
  switch (code)
    {
    case GT:                   /* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:                   /* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT:                 /* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE:                 /* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
    default:
      return swap_condition (code);
    }
}
/* Return cost of comparison CODE using the best strategy for performance.
   All following functions do use number of instructions as a cost metrics.
   In future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */

static int
ix86_fp_comparison_cost (enum rtx_code code)
{
  int arith_cost;

  /* The cost of code using bit-twiddling on %ah.  */
  switch (code)
    {
    case UNLE: case UNLT: case LTGT:
    case GT: case GE:
    case UNORDERED: case ORDERED: case UNEQ:
      arith_cost = 4;
      break;
    case LT: case NE: case EQ: case UNGE:
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      break;
    case LE: case UNGT:
      arith_cost = TARGET_IEEE_FP ? 6 : 4;
      break;
    default:
      gcc_unreachable ();
    }

  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;
    default:
      return arith_cost;
    }
}
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
          || (op_mode == XFmode
              && ! (standard_80387_constant_p (op0) == 1
                    || standard_80387_constant_p (op1) == 1)
              && GET_CODE (op1) != FLOAT)
          || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
         things around if they appear profitable, otherwise force op0
         into a register.  */

      if (standard_80387_constant_p (op0) == 0
          || (MEM_P (op0)
              && ! (standard_80387_constant_p (op1) == 0
                    || MEM_P (op1))))
        {
          enum rtx_code new_code = ix86_fp_swap_condition (code);
          if (new_code != UNKNOWN)
            {
              std::swap (op0, op1);
              code = new_code;
            }
        }

      if (!REG_P (op0))
        op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
        {
          int tmp = standard_80387_constant_p (op1);
          if (tmp == 0)
            op1 = validize_mem (force_const_mem (op_mode, op1));
          else if (tmp == 1)
            {
              if (TARGET_CMOVE)
                op1 = force_reg (op_mode, op1);
            }
          else
            op1 = force_reg (op_mode, op1);
        }
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
        op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)

  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))

    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */
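      /* A reading aid for the masks below: after FNSTSW the x87 condition
	 bits land in AH as C0 = 0x01, C2 = 0x04 and C3 = 0x40, so 0x45
	 selects C3|C2|C0 and 0x44 selects C3|C2.  When SAHF copies AH into
	 EFLAGS, C0 maps to CF, C2 to PF and C3 to ZF, which is what the
	 CCNOmode tests rely on.  */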
	  if (code == GT || !TARGET_IEEE_FP)

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));

	  if (code == LT && TARGET_IEEE_FP)

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));

	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));

	  if (code == GE || !TARGET_IEEE_FP)

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));

	  if (code == LE && TARGET_IEEE_FP)

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));

	  if (code == EQ && TARGET_IEEE_FP)

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));

	  if (code == NE && TARGET_IEEE_FP)

	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));

	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
/* Generate insn patterns to do an integer compare of OPERANDS.  */

ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)

  machine_mode cmpmode;

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))

      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);

    ret = ix86_expand_int_compare (code, op0, op1);
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */

ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)

    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))

  if (SCALAR_FLOAT_MODE_P (mode))

      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)

	  std::swap (op0, op1);
	  code = swap_condition (code);

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));

	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)

      emit_insn (compare_seq);

  if (!INTEGRAL_MODE_P (mode))

      /* Convert a==0 into (unsigned)a<1.  */

      if (op1 != const0_rtx)

      code = (code == EQ ? LTU : GEU);

      /* Convert a>b into b<a or a>=b-1.  */

      if (CONST_INT_P (op1))

	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))

	  code = (code == GTU ? GEU : LTU);

	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);

      /* Convert a>=0 into (unsigned)a<0x80000000.  */

      if (mode == DImode || op1 != const0_rtx)

      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);

      if (mode == DImode || op1 != constm1_rtx)

      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);

  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))

      if (!can_create_pseudo_p ())

      op0 = force_reg (mode, op0);

  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
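  /* In scalar terms the conversions above amount to, roughly:
       a == 0          ->  (unsigned) a <  1               (LTU)
       a >  b, b const ->  (unsigned) a >= b + 1           (GEU)
       a >= 0          ->  (unsigned) a <  0x80000000      (LTU)
       a <  0          ->  (unsigned) a >= 0x80000000      (GEU)
     so that the caller only ever has to consume the carry flag, as the
     assert above checks.  */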
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */

ix86_expand_int_addcc (rtx operands[])

  enum rtx_code code = GET_CODE (operands[1]);

  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);

  rtx val = const0_rtx;

  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)

  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))

  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)

      code = ix86_fp_compare_code_to_integer (code);

	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));

	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;

    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
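  /* Illustrative effect (a sketch, not the literal emitted RTL): for a
     conditional increment such as

	 x += (a < b);

     the expansion boils down to

	 cmp  a, b      ; the carry-flag compare prepared above
	 adc  $0, x     ; x += CF

     and the decrement case uses sbb instead; the condition is reversed
     beforehand when needed so the carry has the right sense.  */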
ix86_expand_int_movcc (rtx operands[])

  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;

  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode

  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (operands[2])
      && CONST_INT_P (operands[3]))

      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (operands[2]);
      HOST_WIDE_INT cf = INTVAL (operands[3]);

      /* Sign bit compares are better done using shifts than we do by using

      if (sign_bit_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))

	  /* Detect overlap between destination and compare sources.  */

	  if (!sign_bit_compare_p)

	      compare_code = GET_CODE (compare_op);

	      flags = XEXP (compare_op, 0);

	      if (GET_MODE (flags) == CCFPmode)

		  = ix86_fp_compare_code_to_integer (compare_code);

	      /* To simplify rest of code, restrict to the GEU case.  */
	      if (compare_code == LTU)

		  compare_code = reverse_condition (compare_code);
		  code = reverse_condition (code);

		    PUT_CODE (compare_op,
			      reverse_condition_maybe_unordered
			        (GET_CODE (compare_op)));

		    PUT_CODE (compare_op,
			      reverse_condition (GET_CODE (compare_op)));

	      if (reg_overlap_mentioned_p (out, op0)
		  || reg_overlap_mentioned_p (out, op1))
		tmp = gen_reg_rtx (mode);

		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));

		emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
						 flags, compare_op));

	      if (code == GT || code == GE)
		code = reverse_condition (code);

	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);

	      tmp = expand_simple_binop (mode, PLUS,
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	      tmp = expand_simple_binop (mode, IOR,
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	  else if (diff == -1 && ct)

	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);

	      tmp = expand_simple_binop (mode, PLUS,
					 copy_rtx (tmp), GEN_INT (cf),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	       *	andl cf - ct, dest

	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);

	      tmp = expand_simple_binop (mode, AND,
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	      tmp = expand_simple_binop (mode, PLUS,
					 copy_rtx (tmp), GEN_INT (ct),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	  if (!rtx_equal_p (tmp, out))
	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
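	  /* Rough sketch of what this carry-flag path boils down to for a
	     conditional move between two integer constants:

		 cmp  op0, op1
		 sbb  dest, dest        ; dest = CF ? -1 : 0
		 and  $(cf - ct), dest
		 add  $ct, dest

	     so dest ends up as one of the two constants without a branch;
	     the comparison is reversed beforehand where necessary so the
	     ct/cf mapping comes out right, and the special diff cases above
	     use even shorter add/or or not/add forms.  */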
	  machine_mode cmp_mode = GET_MODE (op0);
	  enum rtx_code new_code;

	  if (SCALAR_FLOAT_MODE_P (cmp_mode))

	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

	      /* We may be reversing a non-trapping
		 comparison to a trapping comparison.  */
	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
		  && code != EQ && code != NE
		  && code != ORDERED && code != UNORDERED)

	      new_code = reverse_condition_maybe_unordered (code);

	    new_code = ix86_reverse_condition (code, cmp_mode);
	  if (new_code != UNKNOWN)

      compare_code = UNKNOWN;
      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
	  && CONST_INT_P (op1))

	  if (op1 == const0_rtx
	      && (code == LT || code == GE))
	    compare_code = code;
	  else if (op1 == constm1_rtx)

	  else if (code == GT)

      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
      if (compare_code != UNKNOWN
	  && GET_MODE (op0) == GET_MODE (out)
	  && (cf == -1 || ct == -1))

	  /* If lea code below could be used, only optimize
	     if it results in a 2 insn sequence.  */

	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
		 || diff == 3 || diff == 5 || diff == 9)
	      || (compare_code == LT && ct == -1)
	      || (compare_code == GE && cf == -1))

	       * notl op1	(if necessary)

		code = reverse_condition (code);

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	      out = expand_simple_binop (mode, IOR,
					 out, 1, OPTAB_DIRECT);
	      if (out != operands[0])
		emit_move_insn (operands[0], out);

      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
	   || diff == 3 || diff == 5 || diff == 9)
	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)

	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))

	   *		lea cf(dest*(ct-cf)),dest

	   * This also catches the degenerate setcc-only case.

	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to get arithmetics done in proper mode to match.  */

	    tmp = copy_rtx (out);

	      out1 = copy_rtx (out);
	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));

		  tmp = gen_rtx_PLUS (mode, tmp, out1);

	    tmp = plus_constant (mode, tmp, cf);

	  if (!rtx_equal_p (tmp, out))

	      out = force_operand (tmp, copy_rtx (out));

	      emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));

	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));
       * General case:			Jumpful:
       *   xorl dest,dest		cmpl op1, op2
       *   cmpl op1, op2		movl ct, dest
       *   decl dest			movl cf, dest
       *   andl (cf-ct),dest		 1:
       *
       * This is reasonably steep, but branch mispredict costs are
       * high on modern cpus, so consider failing only if optimizing

      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
	  && BRANCH_COST (optimize_insn_for_speed_p (),

	  machine_mode cmp_mode = GET_MODE (op0);
	  enum rtx_code new_code;

	  if (SCALAR_FLOAT_MODE_P (cmp_mode))

	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

	      /* We may be reversing a non-trapping
		 comparison to a trapping comparison.  */
	      if (HONOR_NANS (cmp_mode) && flag_trapping_math
		  && code != EQ && code != NE
		  && code != ORDERED && code != UNORDERED)

	      new_code = reverse_condition_maybe_unordered (code);

	      new_code = ix86_reverse_condition (code, cmp_mode);
	      if (compare_code != UNKNOWN && new_code != UNKNOWN)
		compare_code = reverse_condition (compare_code);

	  if (new_code != UNKNOWN)

	  if (compare_code != UNKNOWN)

	      /* notl op1	(if needed)
		 For x < 0 (resp. x <= -1) there will be no notl,
		 so if possible swap the constants to get rid of the
		 True/false will be -1/0 while code below (store flag
		 followed by decrement) is 0/-1, so the constants need
		 to be exchanged once more.  */

	      if (compare_code == GE || !cf)

		  code = reverse_condition (code);

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
					 copy_rtx (out), 1, OPTAB_DIRECT);

	  out = expand_simple_binop (mode, AND, copy_rtx (out),
				     gen_int_mode (cf - ct, mode),
				     copy_rtx (out), 1, OPTAB_DIRECT);

	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
				       copy_rtx (out), 1, OPTAB_DIRECT);

	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))

      /* Try a few things more with specific constants and a variable.  */

      rtx var, orig_out, out, tmp;

      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)

      /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */

      if (CONST_INT_P (operands[2]))

	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
	    operands[3] = constm1_rtx, op = and_optab;
	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
	    operands[3] = const0_rtx, op = ior_optab;

      else if (CONST_INT_P (operands[3]))

	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)

	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
		operands[1] = simplify_gen_relational (LT, VOIDmode,

	      operands[2] = constm1_rtx;

	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
	    operands[2] = const0_rtx, op = ior_optab;

      orig_out = operands[0];
      tmp = gen_reg_rtx (mode);

      /* Recurse to get the constant loaded.  */
      if (!ix86_expand_int_movcc (operands))

      /* Mask in the interesting variable.  */
      out = expand_binop (mode, op, var, tmp, orig_out, 0,

      if (!rtx_equal_p (out, orig_out))
	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));

   * For comparison with above,

  if (! nonimmediate_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  if (! nonimmediate_operand (operands[3], mode))
    operands[3] = force_reg (mode, operands[3]);

  if (! register_operand (operands[2], VOIDmode)
	  || ! register_operand (operands[3], VOIDmode)))
    operands[2] = force_reg (mode, operands[2]);

      && ! register_operand (operands[3], VOIDmode))
    operands[3] = force_reg (mode, operands[3]);

  emit_insn (compare_seq);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode,
						compare_op, operands[2],
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)

  else if (code == UNGE)

      std::swap (if_true, if_false);

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))

  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)

      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);

      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);

  emit_insn (gen_rtx_SET (dest, tmp));
/* Return true if MODE is valid for vector compare to mask register;
   same result for conditional vector move with mask register.  */

ix86_valid_mask_cmp_mode (machine_mode mode)

  /* XOP has its own vector conditional movement.  */
  if (TARGET_XOP && !TARGET_AVX512F)

  /* AVX512F is needed for mask operation.  */
  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))

  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vector.  */
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int vector_size = GET_MODE_SIZE (mode);
  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)

  return vector_size == 64 || TARGET_AVX512VL;
/* Return true if integer mask comparison should be used.  */

ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
		     rtx op_true, rtx op_false)

  if (GET_MODE_SIZE (mode) == 64)

  /* When op_true is NULL, op_false must be NULL, or vice versa.  */
  gcc_assert (!op_true == !op_false);

  /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
     vector dest is required.  */
  if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))

  /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
  if (op_false == CONST0_RTX (mode)
      || op_true == CONST0_RTX (mode)
      || (INTEGRAL_MODE_P (mode)
	  && (op_true == CONSTM1_RTX (mode)
	      || op_false == CONSTM1_RTX (mode))))
/* Expand an SSE comparison.  Return the register with the result.  */

ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)

  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))

      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);

      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;

    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  int (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)

      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);

    emit_insn (gen_rtx_SET (dest, x));
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */

ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)

  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))

      emit_move_insn (dest, op_true);

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))

      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);

  /* In AVX512F the result of comparison is an integer mask.  */
      && GET_MODE_CLASS (cmpmode) == MODE_INT)

      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))

	  rtx n = gen_reg_rtx (cmpmode);
	  if (cmpmode == E_DImode && !TARGET_64BIT)
	    emit_insn (gen_knotdi (n, cmp));

	    emit_insn (gen_rtx_SET (n, gen_rtx_fmt_e (NOT, cmpmode, cmp)));

	  /* Reverse op_true op_false.  */
	  std::swap (op_true, op_false);

      rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
      emit_insn (gen_rtx_SET (dest, vec_merge));

  else if (vector_all_ones_operand (op_true, mode)
	   && op_false == CONST0_RTX (mode))

      emit_insn (gen_rtx_SET (dest, cmp));

  else if (op_false == CONST0_RTX (mode))

      op_true = force_reg (mode, op_true);
      x = gen_rtx_AND (mode, cmp, op_true);
      emit_insn (gen_rtx_SET (dest, x));

  else if (op_true == CONST0_RTX (mode))

      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      x = gen_rtx_AND (mode, x, op_false);
      emit_insn (gen_rtx_SET (dest, x));

  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))

      op_false = force_reg (mode, op_false);
      x = gen_rtx_IOR (mode, cmp, op_false);
      emit_insn (gen_rtx_SET (dest, x));

  else if (TARGET_XOP)

      op_true = force_reg (mode, op_true);

      if (!nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
      rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

      if (!vector_operand (op_true, mode))
	op_true = force_reg (mode, op_true);

      op_false = force_reg (mode, op_false);

	  gen = gen_sse4_1_blendvps;

	  gen = gen_sse4_1_blendvpd;

	  gen = gen_sse4_1_blendvss;
	  op_true = force_reg (mode, op_true);

	  gen = gen_sse4_1_blendvsd;
	  op_true = force_reg (mode, op_true);

	  gen = gen_sse4_1_pblendvb;
	  if (mode != V16QImode)
	    d = gen_reg_rtx (V16QImode);
	  op_false = gen_lowpart (V16QImode, op_false);
	  op_true = gen_lowpart (V16QImode, op_true);
	  cmp = gen_lowpart (V16QImode, cmp);

	  gen = gen_avx_blendvps256;

	  gen = gen_avx_blendvpd256;

	  gen = gen_avx2_pblendvb;
	  if (mode != V32QImode)
	    d = gen_reg_rtx (V32QImode);
	  op_false = gen_lowpart (V32QImode, op_false);
	  op_true = gen_lowpart (V32QImode, op_true);
	  cmp = gen_lowpart (V32QImode, cmp);

	  gen = gen_avx512bw_blendmv64qi;

	  gen = gen_avx512bw_blendmv32hi;

	  gen = gen_avx512f_blendmv16si;

	  gen = gen_avx512f_blendmv8di;

	  gen = gen_avx512f_blendmv8df;

	  gen = gen_avx512f_blendmv16sf;

	  emit_insn (gen (d, op_false, op_true, cmp));

	    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));

	  op_true = force_reg (mode, op_true);

	  t2 = gen_reg_rtx (mode);

	    t3 = gen_reg_rtx (mode);

	  x = gen_rtx_AND (mode, op_true, cmp);
	  emit_insn (gen_rtx_SET (t2, x));

	  x = gen_rtx_NOT (mode, cmp);
	  x = gen_rtx_AND (mode, x, op_false);
	  emit_insn (gen_rtx_SET (t3, x));

	  x = gen_rtx_IOR (mode, t3, t2);
	  emit_insn (gen_rtx_SET (dest, x));
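	  /* The generic fallback above is the classic mask blend

		 dest = (cmp & op_true) | (~cmp & op_false)

	     which is what the dedicated blendv/vblendm instructions used in
	     the earlier branches compute in a single step.  */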
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)

      /* AVX supports all the needed comparisons.  */

      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */

      /* These are supported directly.  */

      /* AVX has 3 operand comparisons, no need to swap anything.  */

      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))

      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is

      std::swap (*pop0, *pop1);
      code = swap_condition (code);
/* Expand a floating-point conditional move.  Return true if successful.  */

ix86_expand_fp_movcc (rtx operands[])

  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)

      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))

      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))

      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);

  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));
/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */

ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)

/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */

ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)

/* Return immediate value to be used in UNSPEC_PCMP
   for comparison CODE in MODE.  */

ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)

  if (FLOAT_MODE_P (mode))
    return ix86_fp_cmp_code_to_pcmp_immediate (code);
  return ix86_int_cmp_code_to_pcmp_immediate (code);

/* Expand AVX-512 vector comparison.  */

ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)

  machine_mode mask_mode = GET_MODE (dest);
  machine_mode cmp_mode = GET_MODE (cmp_op0);
  rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));

    unspec_code = UNSPEC_UNSIGNED_PCMP;

    unspec_code = UNSPEC_PCMP;

  unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),

  emit_insn (gen_rtx_SET (dest, unspec));
/* Expand fp vector comparison.  */

ix86_expand_fp_vec_cmp (rtx operands[])

  enum rtx_code code = GET_CODE (operands[1]);

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)

  switch (GET_CODE (operands[1]))

      temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				  operands[3], NULL, NULL);
      cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				 operands[3], NULL, NULL);

      temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				  operands[3], NULL, NULL);
      cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				 operands[3], NULL, NULL);

      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,

      cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
			 rtx op_true, rtx op_false, bool *negate)

  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
      && (mode == V16QImode || mode == V8HImode
	  || mode == V4SImode || mode == V2DImode))

  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))

      /* Canonicalize the comparison to EQ, GT, GTU.  */

	  code = reverse_condition (code);

	  code = reverse_condition (code);

	  std::swap (cop0, cop1);
	  code = swap_condition (code);

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)

	    /* SSE4.1 supports EQ.  */

	    /* SSE4.2 supports GT/GTU.  */

	  rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
	  rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);

	    std::swap (optrue, opfalse);

	  /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
	     not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
	     min (x, y) == x).  While we add one instruction (the minimum),
	     we remove the need for two instructions in the negation, as the
	     result is done this way.
	     When using masks, do it for SI/DImode element types, as it is shorter
	     than the two subtractions.  */
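	  /* The identity being relied on: umin (x, y) == x  <=>  x <= y
	     (and likewise smin for the signed case), so the inverted result
	     of a GT/GTU compare can be produced directly as a min followed
	     by an equality compare.  */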
	      && GET_MODE_SIZE (mode) != 64
	      && vector_all_ones_operand (opfalse, data_mode)
	      && optrue == CONST0_RTX (data_mode))

		  && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
		  /* Don't do it if not using integer masks and we'd end up with
		     the right values in the registers though.  */
		  && (GET_MODE_SIZE (mode) == 64
		      || !vector_all_ones_operand (optrue, data_mode)
		      || opfalse != CONST0_RTX (data_mode))))

	      rtx (*gen) (rtx, rtx, rtx) = NULL;

		  gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;

		  gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);

		  gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;

		  gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;

		  gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;

		  if (TARGET_AVX512VL)

		      gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
		      cop0 = force_reg (mode, cop0);
		      cop1 = force_reg (mode, cop1);

		  if (code == GTU && TARGET_SSE2)
		    gen = gen_uminv16qi3;
		  else if (code == GT && TARGET_SSE4_1)
		    gen = gen_sminv16qi3;

		  if (code == GTU && TARGET_SSE4_1)
		    gen = gen_uminv8hi3;
		  else if (code == GT && TARGET_SSE2)
		    gen = gen_sminv8hi3;

		  gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;

		  if (TARGET_AVX512VL)

		      gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
		      cop0 = force_reg (mode, cop0);
		      cop1 = force_reg (mode, cop1);

		  rtx tem = gen_reg_rtx (mode);
		  if (!vector_operand (cop0, mode))
		    cop0 = force_reg (mode, cop0);
		  if (!vector_operand (cop1, mode))
		    cop1 = force_reg (mode, cop1);

		  emit_insn (gen (tem, cop0, cop1));

	  /* Unsigned parallel compare is not supported by the hardware.
	     Play some tricks to turn this into a signed comparison
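	  /* The standard trick, for reference: x <u y holds exactly when
	     (x - 0x80...0) <s (y - 0x80...0), i.e. flipping the sign bit of
	     both operands turns the unsigned order into the signed one, so
	     the signed pcmpgt instructions can be used.  The saturating
	     subtraction variant below instead tests x >u y via
	     (x -us y) != 0.  */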
	  cop0 = force_reg (mode, cop0);

		/* Subtract (-(INT MAX) - 1) from both operands to make

		mask = ix86_build_signbit_mask (mode, true, false);
		t1 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t1, cop0, mask));

		t2 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t2, cop1, mask));

		/* Perform a parallel unsigned saturating subtraction.  */
		x = gen_reg_rtx (mode);
		emit_insn (gen_rtx_SET
			   (x, gen_rtx_US_MINUS (mode, cop0, cop1)));

		cop1 = CONST0_RTX (mode);

    std::swap (op_true, op_false);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)

      x = ix86_expand_sse_cmp (dest, code, cop0, cop1,

      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,

      if (GET_MODE (x) == mode)
	x = gen_lowpart (data_mode, x);
/* Expand integer vector comparison.  */

ix86_expand_int_vec_cmp (rtx operands[])

  rtx_code code = GET_CODE (operands[1]);
  bool negate = false;
  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
				     operands[3], NULL, NULL, &negate);

      cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
				     CONST0_RTX (GET_MODE (cmp)),
				     NULL, NULL, &negate);

      gcc_assert (!negate);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);
/* Expand a floating-point vector conditional move; a vcond operation
   rather than a movcc operation.  */

ix86_expand_fp_vcond (rtx operands[])

  enum rtx_code code = GET_CODE (operands[3]);

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[4], &operands[5]);
  if (code == UNKNOWN)

  switch (GET_CODE (operands[3]))

      temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
				  operands[5], operands[0], operands[0]);
      cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
				 operands[5], operands[1], operands[2]);

      temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
				  operands[5], operands[0], operands[0]);
      cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
				 operands[5], operands[1], operands[2]);

      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,

      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);

  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
				 operands[5], operands[1], operands[2]))

  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
			     operands[1], operands[2]);
  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4390 ix86_expand_int_vcond (rtx operands
[])
4392 machine_mode data_mode
= GET_MODE (operands
[0]);
4393 machine_mode mode
= GET_MODE (operands
[4]);
4394 enum rtx_code code
= GET_CODE (operands
[3]);
4395 bool negate
= false;
4401 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4402 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4403 if ((code
== LT
|| code
== GE
)
4404 && data_mode
== mode
4405 && cop1
== CONST0_RTX (mode
)
4406 && operands
[1 + (code
== LT
)] == CONST0_RTX (data_mode
)
4407 && GET_MODE_UNIT_SIZE (data_mode
) > 1
4408 && GET_MODE_UNIT_SIZE (data_mode
) <= 8
4409 && (GET_MODE_SIZE (data_mode
) == 16
4410 || (TARGET_AVX2
&& GET_MODE_SIZE (data_mode
) == 32)))
4412 rtx negop
= operands
[2 - (code
== LT
)];
4413 int shift
= GET_MODE_UNIT_BITSIZE (data_mode
) - 1;
4414 if (negop
== CONST1_RTX (data_mode
))
4416 rtx res
= expand_simple_binop (mode
, LSHIFTRT
, cop0
, GEN_INT (shift
),
4417 operands
[0], 1, OPTAB_DIRECT
);
4418 if (res
!= operands
[0])
4419 emit_move_insn (operands
[0], res
);
4422 else if (GET_MODE_INNER (data_mode
) != DImode
4423 && vector_all_ones_operand (negop
, data_mode
))
4425 rtx res
= expand_simple_binop (mode
, ASHIFTRT
, cop0
, GEN_INT (shift
),
4426 operands
[0], 0, OPTAB_DIRECT
);
4427 if (res
!= operands
[0])
4428 emit_move_insn (operands
[0], res
);
4433 if (!nonimmediate_operand (cop1
, mode
))
4434 cop1
= force_reg (mode
, cop1
);
4435 if (!general_operand (operands
[1], data_mode
))
4436 operands
[1] = force_reg (data_mode
, operands
[1]);
4437 if (!general_operand (operands
[2], data_mode
))
4438 operands
[2] = force_reg (data_mode
, operands
[2]);
4440 x
= ix86_expand_int_sse_cmp (operands
[0], code
, cop0
, cop1
,
4441 operands
[1], operands
[2], &negate
);
4446 ix86_expand_sse_movcc (operands
[0], x
, operands
[1+negate
],
4447 operands
[2-negate
]);
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;

      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;

      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;

      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;

      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;

      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;

      gen = gen_avx512f_vpermt2varv16si3;

      if (TARGET_AVX512VL)

	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;

      if (TARGET_AVX512VL)

	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;

      gen = gen_avx512f_vpermt2varv16sf3;
      maskmode = V16SImode;

      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;

      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;

      gen = gen_avx512f_vpermt2varv8di3;

      if (TARGET_AVX512VL)

	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;

      if (TARGET_AVX512VL)

	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;

      gen = gen_avx512f_vpermt2varv8df3;
      maskmode = V8DImode;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */

      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
4572 ix86_expand_vec_perm (rtx operands
[])
4574 rtx target
= operands
[0];
4575 rtx op0
= operands
[1];
4576 rtx op1
= operands
[2];
4577 rtx mask
= operands
[3];
4578 rtx t1
, t2
, t3
, t4
, t5
, t6
, t7
, t8
, vt
, vt2
, vec
[32];
4579 machine_mode mode
= GET_MODE (op0
);
4580 machine_mode maskmode
= GET_MODE (mask
);
4582 bool one_operand_shuffle
= rtx_equal_p (op0
, op1
);
4584 /* Number of elements in the vector. */
4585 w
= GET_MODE_NUNITS (mode
);
4586 e
= GET_MODE_UNIT_SIZE (mode
);
4587 gcc_assert (w
<= 64);
4589 if (TARGET_AVX512F
&& one_operand_shuffle
)
4591 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4595 gen
=gen_avx512f_permvarv16si
;
4598 gen
= gen_avx512f_permvarv16sf
;
4601 gen
= gen_avx512f_permvarv8di
;
4604 gen
= gen_avx512f_permvarv8df
;
4611 emit_insn (gen (target
, op0
, mask
));
4616 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
4621 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
4623 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4624 an constant shuffle operand. With a tiny bit of effort we can
4625 use VPERMD instead. A re-interpretation stall for V4DFmode is
4626 unfortunate but there's no avoiding it.
4627 Similarly for V16HImode we don't have instructions for variable
4628 shuffling, while for V32QImode we can use after preparing suitable
4629 masks vpshufb; vpshufb; vpermq; vpor. */
4631 if (mode
== V16HImode
)
4633 maskmode
= mode
= V32QImode
;
4639 maskmode
= mode
= V8SImode
;
4643 t1
= gen_reg_rtx (maskmode
);
4645 /* Replicate the low bits of the V4DImode mask into V8SImode:
4647 t1 = { A A B B C C D D }. */
4648 for (i
= 0; i
< w
/ 2; ++i
)
4649 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
4650 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4651 vt
= force_reg (maskmode
, vt
);
4652 mask
= gen_lowpart (maskmode
, mask
);
4653 if (maskmode
== V8SImode
)
4654 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
4656 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
4658 /* Multiply the shuffle indicies by two. */
4659 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
4662 /* Add one to the odd shuffle indicies:
4663 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4664 for (i
= 0; i
< w
/ 2; ++i
)
4666 vec
[i
* 2] = const0_rtx
;
4667 vec
[i
* 2 + 1] = const1_rtx
;
4669 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4670 vt
= validize_mem (force_const_mem (maskmode
, vt
));
4671 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
4674 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4675 operands
[3] = mask
= t1
;
4676 target
= gen_reg_rtx (mode
);
4677 op0
= gen_lowpart (mode
, op0
);
4678 op1
= gen_lowpart (mode
, op1
);
	  /* The VPERMD and VPERMPS instructions already properly ignore
	     the high bits of the shuffle elements.  No need for us to
	     perform an AND ourselves.  */
	  if (one_operand_shuffle)

	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));

	      t1 = gen_reg_rtx (V8SImode);
	      t2 = gen_reg_rtx (V8SImode);
	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));

	  mask = gen_lowpart (V8SImode, mask);
	  if (one_operand_shuffle)
	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));

	      t1 = gen_reg_rtx (V8SFmode);
	      t2 = gen_reg_rtx (V8SFmode);
	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));

	  /* By combining the two 128-bit input vectors into one 256-bit
	     input vector, we can use VPERMD and VPERMPS for the full
	     two-operand shuffle.  */
	  t1 = gen_reg_rtx (V8SImode);
	  t2 = gen_reg_rtx (V8SImode);
	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));

	  t1 = gen_reg_rtx (V8SFmode);
	  t2 = gen_reg_rtx (V8SImode);
	  mask = gen_lowpart (V4SImode, mask);
	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));

	  t1 = gen_reg_rtx (V32QImode);
	  t2 = gen_reg_rtx (V32QImode);
	  t3 = gen_reg_rtx (V32QImode);
	  vt2 = GEN_INT (-128);
	  vt = gen_const_vec_duplicate (V32QImode, vt2);
	  vt = force_reg (V32QImode, vt);
	  for (i = 0; i < 32; i++)
	    vec[i] = i < 16 ? vt2 : const0_rtx;
	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
	  vt2 = force_reg (V32QImode, vt2);
	  /* From mask create two adjusted masks, which contain the same
	     bits as mask in the low 7 bits of each vector element.
	     The first mask will have the most significant bit clear
	     if it requests element from the same 128-bit lane
	     and MSB set if it requests element from the other 128-bit lane.
	     The second mask will have the opposite values of the MSB,
	     and additionally will have its 128-bit lanes swapped.
	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
	     stands for other 12 bytes.  */
	  /* The bit whether element is from the same lane or the other
	     lane is bit 4, so shift it up by 3 to the MSB position.  */
	  t5 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
	  /* Clear MSB bits from the mask just in case it had them set.  */
	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
	  /* After this t1 will have MSB set for elements from other lane.  */
	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
	  /* Clear bits other than MSB.  */
	  emit_insn (gen_andv32qi3 (t1, t1, vt));
	  /* Or in the lower bits from mask into t3.  */
	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
	  /* And invert MSB bits in t1, so MSB is set for elements from the same
	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
	  /* Swap 128-bit lanes in t3.  */
	  t6 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  /* And or in the lower bits from mask into t1.  */
	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
	  if (one_operand_shuffle)

	      /* Each of these shuffles will put 0s in places where
		 element from the other 128-bit lane is needed, otherwise
		 will shuffle in the requested value.  */
	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
						gen_lowpart (V32QImode, t6)));
	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
	      /* For t3 the 128-bit lanes are swapped again.  */
	      t7 = gen_reg_rtx (V4DImode);
	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
					      const2_rtx, GEN_INT (3),
					      const0_rtx, const1_rtx));
	      /* And oring both together leads to the result.  */
	      emit_insn (gen_iorv32qi3 (target, t1,
					gen_lowpart (V32QImode, t7)));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	      t4 = gen_reg_rtx (V32QImode);
	      /* Similar to the above one_operand_shuffle code, just repeated
		 twice for each operand.  merge_two:
		 code will merge the two results together.  */
	      emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
						gen_lowpart (V32QImode, t6)));
	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
						gen_lowpart (V32QImode, t6)));
	      emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
	      t7 = gen_reg_rtx (V4DImode);
	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
					      const2_rtx, GEN_INT (3),
					      const0_rtx, const1_rtx));
	      t8 = gen_reg_rtx (V4DImode);
	      emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
					      const2_rtx, GEN_INT (3),
					      const0_rtx, const1_rtx));
	      emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
	      emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));

  gcc_assert (GET_MODE_SIZE (mode) <= 16);

      /* The XOP VPPERM insn supports three inputs.  By ignoring the
	 one_operand_shuffle special case, we avoid creating another
	 set of constant vectors in memory.  */
      one_operand_shuffle = false;

      /* mask = mask & {2*w-1, ...} */
      vt = GEN_INT (2*w - 1);

      /* mask = mask & {w-1, ...} */
      vt = GEN_INT (w - 1);

  vt = gen_const_vec_duplicate (maskmode, vt);
  mask = expand_simple_binop (maskmode, AND, mask, vt,
			      NULL_RTX, 0, OPTAB_DIRECT);

  /* For non-QImode operations, convert the word permutation control
     into a byte permutation control.  */
  if (mode != V16QImode)

      mask = expand_simple_binop (maskmode, ASHIFT, mask,
				  GEN_INT (exact_log2 (e)),
				  NULL_RTX, 0, OPTAB_DIRECT);

      /* Convert mask to vector of chars.  */
      mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));

      /* Replicate each of the input bytes into byte positions:
	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i/e * e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));

	emit_insn (gen_xop_pperm (mask, mask, mask, vt));

	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));

      /* Convert it into the byte positions by doing
	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i % e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      emit_insn (gen_addv16qi3 (mask, mask, vt));

  /* The actual shuffle operations all operate on V16QImode.  */
  op0 = gen_lowpart (V16QImode, op0);
  op1 = gen_lowpart (V16QImode, op1);

      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_xop_pperm (target, op0, op1, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));

  else if (one_operand_shuffle)

      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));

      /* Shuffle the two input vectors independently.  */
      t1 = gen_reg_rtx (V16QImode);
      t2 = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));

      /* Then merge them together.  The key is whether any given control
	 element contained a bit set that indicates the second word.  */

      if (maskmode == V2DImode && !TARGET_SSE4_1)

	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
	     more shuffle to convert the V2DI input mask into a V4SI
	     input mask.  At which point the masking that expand_int_vcond
	     will work as desired.  */
	  rtx t3 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
					const0_rtx, const0_rtx,
					const2_rtx, const2_rtx));

	  maskmode = V4SImode;

      vt = gen_const_vec_duplicate (maskmode, vt);
      vt = force_reg (maskmode, vt);
      mask = expand_simple_binop (maskmode, AND, mask, vt,
				  NULL_RTX, 0, OPTAB_DIRECT);

      if (GET_MODE (target) != mode)
	target = gen_reg_rtx (mode);

      xops[1] = gen_lowpart (mode, t2);
      xops[2] = gen_lowpart (mode, t1);
      xops[3] = gen_rtx_EQ (maskmode, mask, vt);

      ok = ix86_expand_int_vcond (xops);

      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
4965 true if we should do zero extension, else sign extension. HIGH_P is
4966 true if we want the N/2 high elements, else the low elements. */
4969 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
4971 machine_mode imode
= GET_MODE (src
);
4976 rtx (*unpack
)(rtx
, rtx
);
4977 rtx (*extract
)(rtx
, rtx
) = NULL
;
4978 machine_mode halfmode
= BLKmode
;
4984 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
4986 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
4987 halfmode
= V32QImode
;
4989 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
4993 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
4995 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
4996 halfmode
= V16QImode
;
4998 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
5002 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
5004 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
5005 halfmode
= V16HImode
;
5007 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
5011 unpack
= gen_avx2_zero_extendv8hiv8si2
;
5013 unpack
= gen_avx2_sign_extendv8hiv8si2
;
5014 halfmode
= V8HImode
;
5016 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
5020 unpack
= gen_avx512f_zero_extendv8siv8di2
;
5022 unpack
= gen_avx512f_sign_extendv8siv8di2
;
5023 halfmode
= V8SImode
;
5025 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
5029 unpack
= gen_avx2_zero_extendv4siv4di2
;
5031 unpack
= gen_avx2_sign_extendv4siv4di2
;
5032 halfmode
= V4SImode
;
5034 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
5038 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
5040 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
5044 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
5046 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5050 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5052 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5058 if (GET_MODE_SIZE (imode
) >= 32)
5060 tmp
= gen_reg_rtx (halfmode
);
5061 emit_insn (extract (tmp
, src
));
5065 /* Shift higher 8 bytes to lower 8 bytes. */
5066 tmp
= gen_reg_rtx (V1TImode
);
5067 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5069 tmp
= gen_lowpart (imode
, tmp
);
5074 emit_insn (unpack (dest
, tmp
));
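      /* Without SSE4.1 the widening is done with the interleave patterns
         below: SRC is interleaved either with zero (for zero extension) or
         with the result of the signed comparison 0 > SRC (for sign
         extension), so the inserted half of every widened element carries
         the proper zero or sign fill.  */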
      rtx (*unpack)(rtx, rtx, rtx);

      switch (imode)
        {
        case E_V16QImode:
          unpack = (high_p ? gen_vec_interleave_highv16qi
                    : gen_vec_interleave_lowv16qi);
          break;
        case E_V8HImode:
          unpack = (high_p ? gen_vec_interleave_highv8hi
                    : gen_vec_interleave_lowv8hi);
          break;
        case E_V4SImode:
          unpack = (high_p ? gen_vec_interleave_highv4si
                    : gen_vec_interleave_lowv4si);
          break;
        default:
          gcc_unreachable ();
        }

      if (unsigned_p)
        tmp = force_reg (imode, CONST0_RTX (imode));
      else
        tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
                                   src, pc_rtx, pc_rtx);

      rtx tmp2 = gen_reg_rtx (imode);
      emit_insn (unpack (tmp2, src, tmp));
      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
    }
}
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating point and non-offsettable memory operands.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally three parts are generated.  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size, i;

  if (!TARGET_64BIT)
    size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;
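  /* Concretely: on 32-bit targets DFmode (8 bytes) splits into two SImode
     parts and XFmode into three, while on 64-bit targets the 16-byte XFmode
     and TFmode values split into two parts.  */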
  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsettable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);
      gcc_assert (ok);
      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (GET_CODE (operand) == CONST_VECTOR)
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
         the operand may actually have a different mode now.  That's
         ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
        {
          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              for (i = 0; i < size; i++)
                parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, SImode, 0);
              parts[0] = operand;
              for (i = 1; i < size; i++)
                parts[i] = adjust_address (operand, SImode, 4 * i);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              const REAL_VALUE_TYPE *r;
              long l[4];

              r = CONST_DOUBLE_REAL_VALUE (operand);
              switch (mode)
                {
                case E_TFmode:
                  real_to_target (l, r, mode);
                  parts[3] = gen_int_mode (l[3], SImode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_XFmode:
                  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
                     long double may not be 80-bit.  */
                  real_to_target (l, r, mode);
                  parts[2] = gen_int_mode (l[2], SImode);
                  break;
                case E_DFmode:
                  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
                  break;
                default:
                  gcc_unreachable ();
                }
              parts[1] = gen_int_mode (l[1], SImode);
              parts[0] = gen_int_mode (l[0], SImode);
            }
          else
            gcc_unreachable ();
        }
    }
  else
    {
      if (mode == TImode)
        split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
        {
          machine_mode upper_mode = mode == XFmode ? SImode : DImode;
          if (REG_P (operand))
            {
              gcc_assert (reload_completed);
              parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
              parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
            }
          else if (offsettable_memref_p (operand))
            {
              operand = adjust_address (operand, DImode, 0);
              parts[0] = operand;
              parts[1] = adjust_address (operand, upper_mode, 8);
            }
          else if (CONST_DOUBLE_P (operand))
            {
              long l[4];

              real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

              /* real_to_target puts 32-bit pieces in each long.  */
              parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
                                       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
                                          << 32), DImode);

              if (upper_mode == SImode)
                parts[1] = gen_int_mode (l[2], SImode);
              else
                parts[1]
                  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
                                  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
                                     << 32), DImode);
            }
          else
            gcc_unreachable ();
        }
    }

  return size;
}
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */

void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
         fp moves, that force all constants to memory to allow combining.  */
      if (MEM_P (operands[1])
          && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
          && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
        operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
        {
          operands[0] = copy_rtx (operands[0]);
          PUT_MODE (operands[0], word_mode);
        }
      else
        operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
                || offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
          && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
        src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
         automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
        part[1][i] = change_address (part[1][i],
                                     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      for (i = 0; i < nparts; i++)
        {
          collisionparts[i]
            = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
          if (collisionparts[i])
            collisions++;
        }

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts[1])
        {
          std::swap (part[0][1], part[0][2]);
          std::swap (part[1][1], part[1][2]);
        }
      else if (collisions == 1
               && nparts == 4
               && (collisionparts[1] || collisionparts[2]))
        {
          if (collisionparts[1])
            {
              std::swap (part[0][1], part[0][2]);
              std::swap (part[1][1], part[1][2]);
            }
          else
            {
              std::swap (part[0][2], part[0][3]);
              std::swap (part[1][2], part[1][3]);
            }
        }

      /* If there are more collisions, we can't handle it by reordering.
         Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
        {
          rtx base, addr;

          collisions = 1;

          base = part[0][nparts - 1];

          /* Handle the case when the last part isn't valid for lea.
             Happens in 64-bit mode storing the 12-byte XFmode.  */
          if (GET_MODE (base) != Pmode)
            base = gen_rtx_REG (Pmode, REGNO (base));

          addr = XEXP (part[1][0], 0);
          if (TARGET_TLS_DIRECT_SEG_REFS)
            {
              struct ix86_address parts;
              int ok = ix86_decompose_address (addr, &parts);
              gcc_assert (ok);
              /* It is not valid to use %gs: or %fs: in lea.  */
              gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
            }
          emit_insn (gen_rtx_SET (base, addr));
          part[1][0] = replace_equiv_address (part[1][0], base);
          for (i = 1; i < nparts; i++)
            {
              rtx tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
              part[1][i] = replace_equiv_address (part[1][i], tmp);
            }
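          /* After the lea, every source part is addressed relative to BASE,
             so only the move into the register serving as BASE (the last
             destination part) can still overlap a source address; it is
             accounted for as the single remaining collision.  */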
        }
    }

  if (push)
    {
      if (!TARGET_64BIT)
        {
          if (nparts == 3)
            {
              if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
                emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
              emit_move_insn (part[0][2], part[1][2]);
            }
          else if (nparts == 4)
            {
              emit_move_insn (part[0][3], part[1][3]);
              emit_move_insn (part[0][2], part[1][2]);
            }
        }
      else
        {
          /* In 64bit mode we don't have 32bit push available.  In case this
             is a register, it is OK - we will just use the larger
             counterpart.  We also retype memory - this comes from an attempt
             to avoid the REX prefix on moving the second half of a TFmode
             value.  */
          if (GET_MODE (part[1][1]) == SImode)
            {
              switch (GET_CODE (part[1][1]))
                {
                case MEM:
                  part[1][1] = adjust_address (part[1][1], DImode, 0);
                  break;

                case REG:
                  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
                  break;

                default:
                  gcc_unreachable ();
                }

              if (GET_MODE (part[1][0]) == SImode)
                part[1][0] = part[1][1];
            }
        }
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
           || (nparts == 3
               && REGNO (part[0][0]) == REGNO (part[1][2]))
           || (nparts == 4
               && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
          && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
        {
          operands[2 + i] = part[0][j];
          operands[6 + i] = part[1][j];
        }
    }
  else
    {
      for (i = 0; i < nparts; i++)
        {
          operands[2 + i] = part[0][i];
          operands[6 + i] = part[1][i];
        }
    }

  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
  if (optimize_insn_for_size_p ())
    {
      for (j = 0; j < nparts - 1; j++)
        if (CONST_INT_P (operands[6 + j])
            && operands[6 + j] != const0_rtx
            && REG_P (operands[2 + j]))
          for (i = j; i < nparts - 1; i++)
            if (CONST_INT_P (operands[7 + i])
                && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
              operands[7 + i] = operands[2 + j];
    }

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);
}
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
          && !optimize_insn_for_size_p ()))
    {
      while (count-- > 0)
        emit_insn (gen_add2_insn (operand, operand));
    }
  else
    {
      rtx (*insn)(rtx, rtx, rtx);

      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}
void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          emit_move_insn (high[0], low[1]);
          emit_move_insn (low[0], const0_rtx);

          if (count > half_width)
            ix86_expand_ashl_const (high[0], count - half_width, mode);
        }
      else
        {
          gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
          ix86_expand_ashl_const (low[0], count, mode);
        }
      return;
    }

  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen QImode-capable registers, then 1 << N
         can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
        {
          rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

          ix86_expand_clear (low[0]);
          ix86_expand_clear (high[0]);
          emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

          d = gen_lowpart (QImode, low[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_EQ (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));

          d = gen_lowpart (QImode, high[0]);
          d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
          s = gen_rtx_NE (QImode, flags, const0_rtx);
          emit_insn (gen_rtx_SET (d, s));
        }

      /* Otherwise, we can get the same results by manually performing
         a bit extract operation on bit 5/6, and then performing the two
         shifts.  The two methods of getting 0/1 into low/high are exactly
         the same size.  Avoiding the shift in the bit extract case helps
         pentium4 a bit; no one else seems to care much either way.  */
      else
        {
          rtx (*gen_lshr3)(rtx, rtx, rtx);
          rtx (*gen_and3)(rtx, rtx, rtx);
          rtx (*gen_xor3)(rtx, rtx, rtx);
          HOST_WIDE_INT bits;
          rtx x;

          if (mode == DImode)
            {
              gen_lshr3 = gen_lshrsi3;
              gen_and3 = gen_andsi3;
              gen_xor3 = gen_xorsi3;
              bits = 5;
            }
          else
            {
              gen_lshr3 = gen_lshrdi3;
              gen_and3 = gen_anddi3;
              gen_xor3 = gen_xordi3;
              bits = 6;
            }

          if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
            x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
          else
            x = gen_lowpart (half_mode, operands[2]);
          emit_insn (gen_rtx_SET (high[0], x));

          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
          emit_insn (gen_and3 (high[0], high[0], const1_rtx));
          emit_move_insn (low[0], high[0]);
          emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
        }
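      /* In both variants exactly one of LOW[0]/HIGH[0] now holds 1: LOW[0]
         when bit log2(half_width) of the count is clear, HIGH[0] when it is
         set.  Shifting both halves by the count (which the shift instruction
         reduces modulo half_width) therefore leaves 1 << N correctly split
         across the pair.  */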
      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
         know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
        emit_move_insn (high[0], low[0]);
      else
        emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
                 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
        {
          emit_move_insn (high[0], high[1]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));
          emit_move_insn (low[0], high[0]);
        }
      else if (count >= half_width)
        {
          emit_move_insn (low[0], high[1]);
          emit_move_insn (high[0], low[0]);
          emit_insn (gen_ashr3 (high[0], high[0],
                                GEN_INT (half_width - 1)));

          if (count > half_width)
            emit_insn (gen_ashr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
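      /* At this point LOW[0]/HIGH[0] hold the correct result whenever the
         count is below half_width.  The adjustment patterns below fix up the
         remaining case: when bit log2(half_width) of the count is set they
         move the high half (and its sign fill) down into the low half.  */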
      if (TARGET_CMOVE && scratch)
        {
          emit_move_insn (scratch, high[0]);
          emit_insn (gen_ashr3 (scratch, scratch,
                                GEN_INT (half_width - 1)));
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_3
                   (half_mode, low[0], high[0], operands[2]));
    }
}
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
        {
          emit_move_insn (low[0], high[1]);
          ix86_expand_clear (high[0]);

          if (count > half_width)
            emit_insn (gen_lshr3 (low[0], low[0],
                                  GEN_INT (count - half_width)));
        }
      else
        {
          gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

          if (!rtx_equal_p (operands[0], operands[1]))
            emit_move_insn (operands[0], operands[1]);

          emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
          emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
        }
    }
  else
    {
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
        emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
        {
          ix86_expand_clear (scratch);
          emit_insn (gen_x86_shift_adj_1
                     (half_mode, low[0], high[0], operands[2], scratch));
        }
      else
        emit_insn (gen_x86_shift_adj_2
                   (half_mode, low[0], high[0], operands[2]));
    }
}
/* Return the mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */

static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
/* When ISSETMEM is FALSE, output a simple loop to move memory from SRCPTR
   to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size is
   COUNT, specified in bytes.  When ISSETMEM is TRUE, output the equivalent
   loop to set memory by VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */

static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx count, machine_mode mode, int unroll,
                               int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
                              NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
                               true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     smallest power of two, containing in PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
         we can save registers by using a single temporary.
         Also using 4 temporaries is overkill in 32bit mode.  */
      if (!TARGET_64BIT && 0)
        {
          for (i = 0; i < unroll; i++)
            {
              if (i)
                {
                  destmem = adjust_address (copy_rtx (destmem), mode,
                                            GET_MODE_SIZE (mode));
                  srcmem = adjust_address (copy_rtx (srcmem), mode,
                                           GET_MODE_SIZE (mode));
                }
              emit_move_insn (destmem, srcmem);
            }
        }
      else
        {
          rtx tmpreg[4];
          gcc_assert (unroll <= 4);
          for (i = 0; i < unroll; i++)
            {
              tmpreg[i] = gen_reg_rtx (mode);
              if (i)
                srcmem = adjust_address (copy_rtx (srcmem), mode,
                                         GET_MODE_SIZE (mode));
              emit_move_insn (tmpreg[i], srcmem);
            }
          for (i = 0; i < unroll; i++)
            {
              if (i)
                destmem = adjust_address (copy_rtx (destmem), mode,
                                          GET_MODE_SIZE (mode));
              emit_move_insn (destmem, tmpreg[i]);
            }
        }
    }
  else
    for (i = 0; i < unroll; i++)
      {
        if (i)
          destmem = adjust_address (copy_rtx (destmem), mode,
                                    GET_MODE_SIZE (mode));
        emit_move_insn (destmem, value);
      }

  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
                           true, top_label);
  if (expected_size != -1)
    {
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
        predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
        predict_jump (REG_BR_PROB_BASE - 1);
      else
        predict_jump (REG_BR_PROB_BASE
                      - (REG_BR_PROB_BASE + expected_size / 2)
                        / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
                                 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
        emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
/* Divide COUNTREG by SCALE.  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
                            GEN_INT (exact_log2 (scale)),
                            NULL, 1, OPTAB_DIRECT);
  return sc;
}
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have same meaning as for previous function.  */

static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
                              rtx destptr, rtx srcptr, rtx value, rtx orig_value,
                              rtx count,
                              machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
                                                       GET_MODE_SIZE (mode)));
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
                                GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
        = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
        srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
        {
          srcexp = gen_rtx_ASHIFT (Pmode, countreg,
                                   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
          srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
        }
      else
        srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
        {
          rounded_count
            = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
          srcmem = shallow_copy_rtx (srcmem);
          set_mem_size (srcmem, rounded_count);
        }
      else
        {
          if (MEM_SIZE_KNOWN_P (srcmem))
            clear_mem_size (srcmem);
        }
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
                              destexp, srcexp));
    }
}
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is the updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  HOST_WIDE_INT piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
         || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
          || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
        {
          move_mode = word_mode;
          piece_size = GET_MODE_SIZE (move_mode);
          code = optab_handler (mov_optab, move_mode);
        }
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
         a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      emit_move_insn (destptr,
                      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
                      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
                                          piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
/* Helper function for the string operations below.  Test whether VARIABLE
   is aligned to VALUE bytes.  If true, jump to the label.  */

static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
                           1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
                        rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
         relaxed, but it'll require a bit more complicated epilogue
         expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
        {
          if (epilogue_size & i)
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
        }
      return;
    }
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
                                   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
                                     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.  */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          src = change_address (srcmem, HImode, srcptr);
          dest = change_address (destmem, HImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          src = change_address (srcmem, QImode, srcptr);
          dest = change_address (destmem, QImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
  else
    {
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, HImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, HImode, tmp);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, QImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, QImode, tmp);
          emit_move_insn (dest, src);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
}
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is the updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
             HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  HOST_WIDE_INT piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
        {
          emit_insn (gen_strset (destptr, dst, promoted_val));
          dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                              piece_size);
          continue;
        }

      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
                      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
                                          piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
                                 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
                               GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
                                 gen_lowpart (QImode, value), count, QImode,
                                 1, max_size / 2, true);
}
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
                        rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
         relaxed, but it'll require a bit more complicated epilogue
         expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
        {
          if (epilogue_size & i)
            {
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
        }
      return;
    }
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
        {
          dest = change_address (destmem, DImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
        }
      else
        {
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strset (destptr, dest, value));
          dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
          emit_insn (gen_strset (destptr, dest, value));
        }
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
/* Adjust COUNTER by the VALUE.  */
static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   Return value is updated DESTMEM.  */

static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx vec_value, rtx count, int align,
                               int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
        {
          rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
          if (issetmem)
            {
              if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
                destmem = emit_memset (destmem, destptr, vec_value, i);
              else
                destmem = emit_memset (destmem, destptr, value, i);
            }
          else
            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
          ix86_adjust_counter (count, i);
          emit_label (label);
          LABEL_NUSES (label) = 1;
          set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
        }
    }
  return destmem;
}
/* Test if COUNT&SIZE is nonzero and if so, expand cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr,
                               rtx value, rtx vec_value,
                               rtx count, int size,
                               rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
        {
          if (GET_MODE (value) == VOIDmode && size > 8)
            mode = Pmode;
          else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
            mode = GET_MODE (value);
        }
      else
        mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
        mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
        mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (mode, value));
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
= offset_address (destmem
, count
, 1);
6518 destmem
= offset_address (destmem
, GEN_INT (-2 * size
),
6519 GET_MODE_SIZE (mode
));
6522 srcmem
= offset_address (srcmem
, count
, 1);
6523 srcmem
= offset_address (srcmem
, GEN_INT (-2 * size
),
6524 GET_MODE_SIZE (mode
));
6526 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
6529 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
6532 emit_move_insn (destmem
, srcmem
);
6533 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
6535 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
6537 emit_jump_insn (gen_jump (done_label
));
6541 LABEL_NUSES (label
) = 1;
/* Handle small memcpy (up to SIZE, which is supposed to be a small power
   of 2) and get ready for the main copy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying
   SIZE bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for
   new bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4.  Bigger sizes are handled analogously
       if (COUNT & 4)
         {
           copy 4 bytes from SRCPTR to DESTPTR
           copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
           goto done_label
         }
       if (!COUNT)
         goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
         {
           copy 2 bytes from SRCPTR to DESTPTR
           copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
         }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
         Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
                                                            rtx *destptr, rtx *srcptr,
                                                            machine_mode mode,
                                                            rtx value, rtx vec_value,
                                                            rtx *count,
                                                            rtx_code_label **done_label,
                                                            int size,
                                                            int desired_align,
                                                            int align,
                                                            unsigned HOST_WIDE_INT *min_size,
                                                            bool dynamic_check,
                                                            bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Choose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT) size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
        *done_label = gen_label_rtx ();

      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
                               1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      for (; size2 > 2; size2 >>= 1)
        expand_small_cpymem_or_setmem (destmem, srcmem,
                                       *destptr, *srcptr,
                                       value, vec_value,
                                       *count,
                                       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so.  */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
                               1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
        {
          srcmem = change_address (srcmem, QImode, *srcptr);
          emit_move_insn (destmem, srcmem);
        }

      /* Handle sizes 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
        {
          srcmem = change_address (srcmem, HImode, *srcptr);
          srcmem = offset_address (srcmem, *count, 1);
          srcmem = offset_address (srcmem, GEN_INT (-2), 2);
          emit_move_insn (destmem, srcmem);
        }

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT) size
                || UINTVAL (*count) >= (unsigned HOST_WIDE_INT) size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
      emit_label (loop_label);
      LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, mode_value);
      else
        {
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
        }
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }

  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
                            GEN_INT (-size - prolog_size),
                            1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
                               GEN_INT (-size - prolog_size),
                               1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
        emit_move_insn (destmem, mode_value);
      else
        {
          srcmem = offset_address (srcmem, modesize, 1);
          emit_move_insn (destmem, srcmem);
        }
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
                                      GEN_INT (prolog_size),
                                      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
        REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
                                      GEN_INT (-desired_align),
                                      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
                                       *destptr,
                                       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
        *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
                                       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT) (size + prolog_size))
        *min_size
          = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT) size);
      else
        *min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
         library we need precise value.  */
      if (dynamic_check)
        *count = expand_simple_binop (GET_MODE (*count), AND, *count,
                                      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
        *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
                                      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
        *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
                                      (unsigned HOST_WIDE_INT) size));
      if (*min_size)
        *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT) size);
    }
}
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST,
   which is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
                                        rtx srcreg, rtx value, rtx vec_value,
                                        int desired_align, int align_bytes,
                                        bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
        {
          if (issetmem)
            {
              if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
                dst = emit_memset (dst, destreg, vec_value, piece_size);
              else
                dst = emit_memset (dst, destreg, value, piece_size);
            }
          else
            dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
          copied_bytes += piece_size;
        }
    }
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      int src_align_bytes = get_mem_align_offset (src, desired_align
                                                       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
        src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
        {
          unsigned int src_align;
          for (src_align = desired_align; src_align >= 2; src_align >>= 1)
            {
              if ((src_align_bytes & (src_align - 1))
                  == (align_bytes & (src_align - 1)))
                break;
            }
          if (src_align > (unsigned int) desired_align)
            src_align = desired_align;
          if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
            set_mem_align (src, src_align * BITS_PER_UNIT);
        }
      if (MEM_SIZE_KNOWN_P (orig_src))
        set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */
static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
        return false;
      if (fixed_regs[CX_REG]
          || fixed_regs[DI_REG]
          || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
        return false;
    }
  return true;
}
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
            unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
            bool memset, bool zero_memset, bool have_as,
            int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
          && (max_size < 256
              || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
        max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      if (!count || (count & 3) || (memset && !zero_memset))
        return alg_usable_p (rep_prefix_1_byte, memset, have_as)
               ? rep_prefix_1_byte : loop_1_byte;
      else
        return alg_usable_p (rep_prefix_4_byte, memset, have_as)
               ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     set up.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
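      /* Walk the cost table's {max, alg} entries, which are ordered by
         increasing size, and take the first usable entry whose MAX covers
         EXPECTED_SIZE (-1 meaning unbounded).  */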
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
        {
          /* We get here if the algorithms that were not libcall-based
             were rep-prefix based and we are unable to use rep prefixes
             based on global register usage.  Break out of the loop and
             use the heuristic below.  */
          if (algs->size[i].max == 0)
            break;
          if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
            {
              enum stringop_alg candidate = algs->size[i].alg;

              if (candidate != libcall
                  && alg_usable_p (candidate, memset, have_as))
                {
                  alg = candidate;
                  alg_noalign = algs->size[i].noalign;
                }
              /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
                 last non-libcall inline algorithm.  */
              if (TARGET_INLINE_ALL_STRINGOPS)
                {
                  /* When the current size is best to be copied by a libcall,
                     but we are still forced to inline, run the heuristic below
                     that will pick code for medium sized blocks.  */
                  if (alg != libcall)
                    {
                      *noalign = alg_noalign;
                      return alg;
                    }
                  else if (!any_alg_usable_p)
                    break;
                }
              else if (alg_usable_p (candidate, memset, have_as)
                       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
                            && candidate == rep_prefix_1_byte
                            /* NB: If min_size != max_size, size is
                               unknown.  */
                            && min_size != max_size))
                {
                  *noalign = algs->size[i].noalign;
                  return candidate;
                }
            }
        }
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
          || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
         then recursing on smaller sizes or same size isn't going to
         find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
        {
          /* Pick something reasonable.  */
          if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
            *dynamic_check = 128;
          return loop_1_byte;
        }
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
                        zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
        *dynamic_check = max;
      else
        gcc_assert (alg != libcall);
      return alg;
    }
  return (alg_usable_p (algs->unknown_size, memset, have_as)
          ? algs->unknown_size : libcall);
}
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
static int
decide_alignment (int align,
                  enum stringop_alg alg,
                  int expected_size,
                  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (move_mode == VOIDmode)
    return align;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks.
     copying whole cacheline at once.  */
  if (TARGET_CPU_P (PENTIUMPRO)
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of width specified by MODE.  This is essentially
   a * 0x01010101, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
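/* Illustrative sketch (added, not from the original sources): for
   MODE == DImode and VAL == 0x5A, the hand-unwound shift/or sequence
   below broadcasts the byte like this:

       reg        = 0x000000000000005A
       reg |= reg << 8    -> 0x0000000000005A5A
       reg |= reg << 16   -> 0x000000005A5A5A5A
       reg |= reg << 32   -> 0x5A5A5A5A5A5A5A5A

   i.e. the same value a multiply by 0x0101010101010101 would give,
   using only shifts and ors.  */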
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
	v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;
  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
	 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
	emit_insn (gen_insv_1 (mode, reg, reg));
      else
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
	return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
   be needed by main loop copying SIZE_NEEDED chunks and prologue getting
   alignment from ALIGN to DESIRED_ALIGN.  */

static rtx
promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
				int align)
{
  rtx promoted_val;

  if (TARGET_64BIT
      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
    promoted_val = promote_duplicated_reg (DImode, val);
  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
    promoted_val = promote_duplicated_reg (SImode, val);
  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
    promoted_val = promote_duplicated_reg (HImode, val);
  else
    promoted_val = val;

  return promoted_val;
}
/* Copy the address to a Pmode register.  This is used for x32 to
   truncate DImode TLS address to a SImode register.  */

static rtx
ix86_copy_addr_to_reg (rtx addr)
{
  rtx reg;
  if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
    {
      reg = copy_addr_to_reg (addr);
      REG_POINTER (reg) = 1;
      return reg;
    }
  else
    {
      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
      reg = copy_to_mode_reg (DImode, addr);
      REG_POINTER (reg) = 1;
      return gen_rtx_SUBREG (SImode, reg, 0);
    }
}
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

     1) Prologue guard: Conditional that jumps up to epilogues for small
	blocks that can be handled by epilogue alone.  This is faster
	but also needed for correctness, since the prologue assumes the block
	is larger than the desired alignment.

	Optional dynamic check for size and libcall for large
	blocks is emitted here too, with -minline-stringops-dynamically.

     2) Prologue: copy first few bytes in order to get destination
	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
	copied.  We emit either a jump tree on power of two sized
	blocks, or a byte loop.

     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.

     4) Epilogue: code copying tail of the block that is too small to be
	handled by main body (or up to size guarded by prologue guard).

   Misaligned move sequence:

     1) Misaligned move prologue/epilogue containing:
	  a) Prologue handling small memory blocks and jumping to done_label
	     (skipped if blocks are known to be large enough)
	  b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
	     needed by single possibly misaligned move
	     (skipped if alignment is not needed)
	  c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves

     2) Zero size guard dispatching to done_label, if needed

     3) Dispatch to library call, if needed

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with specified algorithm.  */
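/* Illustrative sketch only (added comment, not emitted verbatim by this
   function): for an aligned memcpy expansion the generated code roughly
   corresponds to

       if (count < epilogue_size_needed) goto epilogue;   // 1) prologue guard
       copy a few bytes until dst % DESIRED_ALIGN == 0;   // 2) prologue
       copy SIZE_NEEDED-byte chunks in a loop;            // 3) main body
     epilogue:
       copy the remaining count % epilogue_size_needed;   // 4) epilogue

   with the misaligned variant replacing 1), 2) and 4) by overlapping,
   possibly misaligned head/tail moves.  */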
7236 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
7237 rtx align_exp
, rtx expected_align_exp
,
7238 rtx expected_size_exp
, rtx min_size_exp
,
7239 rtx max_size_exp
, rtx probable_max_size_exp
,
7244 rtx_code_label
*label
= NULL
;
7246 rtx_code_label
*jump_around_label
= NULL
;
7247 HOST_WIDE_INT align
= 1;
7248 unsigned HOST_WIDE_INT count
= 0;
7249 HOST_WIDE_INT expected_size
= -1;
7250 int size_needed
= 0, epilogue_size_needed
;
7251 int desired_align
= 0, align_bytes
= 0;
7252 enum stringop_alg alg
;
7253 rtx promoted_val
= NULL
;
7254 rtx vec_promoted_val
= NULL
;
7255 bool force_loopy_epilogue
= false;
7257 bool need_zero_guard
= false;
7259 machine_mode move_mode
= VOIDmode
;
7260 machine_mode wider_mode
;
7261 int unroll_factor
= 1;
7262 /* TODO: Once value ranges are available, fill in proper data. */
7263 unsigned HOST_WIDE_INT min_size
= 0;
7264 unsigned HOST_WIDE_INT max_size
= -1;
7265 unsigned HOST_WIDE_INT probable_max_size
= -1;
7266 bool misaligned_prologue_used
= false;
7269 if (CONST_INT_P (align_exp
))
7270 align
= INTVAL (align_exp
);
  /* i386 can do misaligned access at reasonably increased cost.  */
7272 if (CONST_INT_P (expected_align_exp
)
7273 && INTVAL (expected_align_exp
) > align
)
7274 align
= INTVAL (expected_align_exp
);
7275 /* ALIGN is the minimum of destination and source alignment, but we care here
7276 just about destination alignment. */
7278 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
7279 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
7281 if (CONST_INT_P (count_exp
))
7283 min_size
= max_size
= probable_max_size
= count
= expected_size
7284 = INTVAL (count_exp
);
7285 /* When COUNT is 0, there is nothing to do. */
7292 min_size
= INTVAL (min_size_exp
);
7294 max_size
= INTVAL (max_size_exp
);
7295 if (probable_max_size_exp
)
7296 probable_max_size
= INTVAL (probable_max_size_exp
);
7297 if (CONST_INT_P (expected_size_exp
))
7298 expected_size
= INTVAL (expected_size_exp
);
7301 /* Make sure we don't need to care about overflow later on. */
7302 if (count
> (HOST_WIDE_INT_1U
<< 30))
7305 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
7307 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
7309 /* Step 0: Decide on preferred algorithm, desired alignment and
7310 size of chunks to be copied by main loop. */
7311 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
7313 issetmem
&& val_exp
== const0_rtx
, have_as
,
7314 &dynamic_check
, &noalign
, false);
7317 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
7318 stringop_alg_names
[alg
]);
7322 gcc_assert (alg
!= no_stringop
);
  /* For now the vector version of memset is generated only for memory zeroing, as
     creating a promoted vector value is very cheap in this case.  */
7326 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
7327 alg
= unrolled_loop
;
7330 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
7331 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
7333 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
7336 move_mode
= word_mode
;
7344 need_zero_guard
= true;
7348 need_zero_guard
= true;
7351 need_zero_guard
= true;
7352 unroll_factor
= (TARGET_64BIT
? 4 : 2);
7355 need_zero_guard
= true;
7357 /* Find the widest supported mode. */
7358 move_mode
= word_mode
;
7359 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
7360 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
7361 move_mode
= wider_mode
;
7363 if (TARGET_AVX256_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 128)
7366 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7367 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7368 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
7370 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
7371 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
7372 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
7373 move_mode
= word_mode
;
7375 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
7377 case rep_prefix_8_byte
:
7380 case rep_prefix_4_byte
:
7383 case rep_prefix_1_byte
:
7387 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
7388 epilogue_size_needed
= size_needed
;
7390 /* If we are going to call any library calls conditionally, make sure any
7391 pending stack adjustment happen before the first conditional branch,
7392 otherwise they will be emitted before the library call only and won't
7393 happen from the other branches. */
7394 if (dynamic_check
!= -1)
7395 do_pending_stack_adjust ();
7397 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
7398 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
7399 align
= desired_align
;
7401 /* Step 1: Prologue guard. */
7403 /* Alignment code needs count to be in register. */
7404 if (CONST_INT_P (count_exp
) && desired_align
> align
)
7406 if (INTVAL (count_exp
) > desired_align
7407 && INTVAL (count_exp
) > size_needed
)
7410 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
7411 if (align_bytes
<= 0)
7414 align_bytes
= desired_align
- align_bytes
;
7416 if (align_bytes
== 0)
7417 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
7419 gcc_assert (desired_align
>= 1 && align
>= 1);
  /* Misaligned move sequences handle both prologue and epilogue at once.
     Default code generation results in smaller code for large alignments
     and also avoids redundant work when sizes are known precisely.  */
7424 misaligned_prologue_used
7425 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7426 && MAX (desired_align
, epilogue_size_needed
) <= 32
7427 && desired_align
<= epilogue_size_needed
7428 && ((desired_align
> align
&& !align_bytes
)
7429 || (!count
&& epilogue_size_needed
> 1)));
  /* Do the cheap promotion to allow better CSE across the
     main loop and epilogue (i.e. one load of the big constant in the
     beginning).
     For now the misaligned move sequences do not have fast path
     without broadcasting.  */
7436 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
7438 if (alg
== vector_loop
)
7440 gcc_assert (val_exp
== const0_rtx
);
7441 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
7442 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
7443 GET_MODE_SIZE (word_mode
),
7444 desired_align
, align
);
7448 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7449 desired_align
, align
);
  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments and
     also avoids redundant work when sizes are known precisely.  */
7455 if (misaligned_prologue_used
)
      /* The misaligned move prologue handles small blocks by itself.  */
7458 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7459 (dst
, src
, &destreg
, &srcreg
,
7460 move_mode
, promoted_val
, vec_promoted_val
,
7463 desired_align
< align
7464 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
7465 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
7467 src
= change_address (src
, BLKmode
, srcreg
);
7468 dst
= change_address (dst
, BLKmode
, destreg
);
7469 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
7470 epilogue_size_needed
= 0;
7472 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
7474 /* It is possible that we copied enough so the main loop will not
7476 gcc_assert (size_needed
> 1);
7477 if (jump_around_label
== NULL_RTX
)
7478 jump_around_label
= gen_label_rtx ();
7479 emit_cmp_and_jump_insns (count_exp
,
7480 GEN_INT (size_needed
),
7481 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
7482 if (expected_size
== -1
7483 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7484 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7486 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7489 /* Ensure that alignment prologue won't copy past end of block. */
7490 else if (size_needed
> 1 || (desired_align
> 1 && desired_align
> align
))
7492 epilogue_size_needed
= MAX (size_needed
- 1, desired_align
- align
);
7493 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
7494 Make sure it is power of 2. */
7495 epilogue_size_needed
= 1 << (floor_log2 (epilogue_size_needed
) + 1);
      /* To improve performance of small blocks, we jump around the VAL
	 promoting mode.  This means that if the promoted VAL is not constant,
	 we might not use it in the epilogue and have to use a byte
	 loop instead.  */
7501 if (issetmem
&& epilogue_size_needed
> 2 && !promoted_val
)
7502 force_loopy_epilogue
= true;
7503 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7504 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7506 /* If main algorithm works on QImode, no epilogue is needed.
7507 For small sizes just don't align anything. */
7508 if (size_needed
== 1)
7509 desired_align
= align
;
7514 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7516 label
= gen_label_rtx ();
7517 emit_cmp_and_jump_insns (count_exp
,
7518 GEN_INT (epilogue_size_needed
),
7519 LTU
, 0, counter_mode (count_exp
), 1, label
);
7520 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
7521 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7523 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
  /* Emit code to decide at runtime whether a library call or inline code should be
     used.  */
7529 if (dynamic_check
!= -1)
7531 if (!issetmem
&& CONST_INT_P (count_exp
))
7533 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
7535 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7536 count_exp
= const0_rtx
;
7542 rtx_code_label
*hot_label
= gen_label_rtx ();
7543 if (jump_around_label
== NULL_RTX
)
7544 jump_around_label
= gen_label_rtx ();
7545 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
7546 LEU
, 0, counter_mode (count_exp
),
7548 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
7550 set_storage_via_libcall (dst
, count_exp
, val_exp
);
7552 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7553 emit_jump (jump_around_label
);
7554 emit_label (hot_label
);
7558 /* Step 2: Alignment prologue. */
7559 /* Do the expensive promotion once we branched off the small blocks. */
7560 if (issetmem
&& !promoted_val
)
7561 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7562 desired_align
, align
);
7564 if (desired_align
> align
&& !misaligned_prologue_used
)
7566 if (align_bytes
== 0)
	  /* Except for the first move in prologue, we no longer know
	     constant offset in aliasing info.  It doesn't seem worth
	     the pain to maintain it for the first move, so throw away
	     the info early.  */
7572 dst
= change_address (dst
, BLKmode
, destreg
);
7574 src
= change_address (src
, BLKmode
, srcreg
);
7575 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
7576 promoted_val
, vec_promoted_val
,
7577 count_exp
, align
, desired_align
,
7579 /* At most desired_align - align bytes are copied. */
7580 if (min_size
< (unsigned)(desired_align
- align
))
7583 min_size
-= desired_align
- align
;
7587 /* If we know how many bytes need to be stored before dst is
7588 sufficiently aligned, maintain aliasing info accurately. */
7589 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
7597 count_exp
= plus_constant (counter_mode (count_exp
),
7598 count_exp
, -align_bytes
);
7599 count
-= align_bytes
;
7600 min_size
-= align_bytes
;
7601 max_size
-= align_bytes
;
7604 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
7605 && (count
< (unsigned HOST_WIDE_INT
) size_needed
7606 || (align_bytes
== 0
7607 && count
< ((unsigned HOST_WIDE_INT
) size_needed
7608 + desired_align
- align
))))
      /* It is possible that we copied enough so the main loop will not
	 execute.  */
7612 gcc_assert (size_needed
> 1);
7613 if (label
== NULL_RTX
)
7614 label
= gen_label_rtx ();
7615 emit_cmp_and_jump_insns (count_exp
,
7616 GEN_INT (size_needed
),
7617 LTU
, 0, counter_mode (count_exp
), 1, label
);
7618 if (expected_size
== -1
7619 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7620 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7622 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7625 if (label
&& size_needed
== 1)
7628 LABEL_NUSES (label
) = 1;
7630 epilogue_size_needed
= 1;
7632 promoted_val
= val_exp
;
7634 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
7635 epilogue_size_needed
= size_needed
;
7637 /* Step 3: Main loop. */
7648 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
, promoted_val
,
7649 count_exp
, move_mode
, unroll_factor
,
7650 expected_size
, issetmem
);
7653 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
,
7654 vec_promoted_val
, count_exp
, move_mode
,
7655 unroll_factor
, expected_size
, issetmem
);
7657 case rep_prefix_8_byte
:
7658 case rep_prefix_4_byte
:
7659 case rep_prefix_1_byte
:
7660 expand_set_or_cpymem_via_rep (dst
, src
, destreg
, srcreg
, promoted_val
,
7661 val_exp
, count_exp
, move_mode
, issetmem
);
7664 /* Adjust properly the offset of src and dest memory for aliasing. */
7665 if (CONST_INT_P (count_exp
))
7668 src
= adjust_automodify_address_nv (src
, BLKmode
, srcreg
,
7669 (count
/ size_needed
) * size_needed
);
7670 dst
= adjust_automodify_address_nv (dst
, BLKmode
, destreg
,
7671 (count
/ size_needed
) * size_needed
);
7676 src
= change_address (src
, BLKmode
, srcreg
);
7677 dst
= change_address (dst
, BLKmode
, destreg
);
7680 /* Step 4: Epilogue to copy the remaining bytes. */
7684 /* When the main loop is done, COUNT_EXP might hold original count,
7685 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
7686 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
7687 bytes. Compensate if needed. */
7689 if (size_needed
< epilogue_size_needed
)
7691 tmp
= expand_simple_binop (counter_mode (count_exp
), AND
, count_exp
,
7692 GEN_INT (size_needed
- 1), count_exp
, 1,
7694 if (tmp
!= count_exp
)
7695 emit_move_insn (count_exp
, tmp
);
7698 LABEL_NUSES (label
) = 1;
7701 if (count_exp
!= const0_rtx
&& epilogue_size_needed
> 1)
7703 if (force_loopy_epilogue
)
7704 expand_setmem_epilogue_via_loop (dst
, destreg
, val_exp
, count_exp
,
7705 epilogue_size_needed
);
7709 expand_setmem_epilogue (dst
, destreg
, promoted_val
,
7710 vec_promoted_val
, count_exp
,
7711 epilogue_size_needed
);
7713 expand_cpymem_epilogue (dst
, src
, destreg
, srcreg
, count_exp
,
7714 epilogue_size_needed
);
7717 if (jump_around_label
)
7718 emit_label (jump_around_label
);
7722 /* Expand cmpstrn or memcmp. */
7725 ix86_expand_cmpstrn_or_cmpmem (rtx result
, rtx src1
, rtx src2
,
7726 rtx length
, rtx align
, bool is_cmpstrn
)
7728 /* Expand strncmp and memcmp only with -minline-all-stringops since
7729 "repz cmpsb" can be much slower than strncmp and memcmp functions
7730 implemented with vector instructions, see
7732 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
7734 if (!TARGET_INLINE_ALL_STRINGOPS
)
7737 /* Can't use this if the user has appropriated ecx, esi or edi. */
7738 if (fixed_regs
[CX_REG
] || fixed_regs
[SI_REG
] || fixed_regs
[DI_REG
])
7743 /* For strncmp, length is the maximum length, which can be larger
7744 than actual string lengths. We can expand the cmpstrn pattern
7745 to "repz cmpsb" only if one of the strings is a constant so
7746 that expand_builtin_strncmp() can write the length argument to
7747 be the minimum of the const string length and the actual length
7748 argument. Otherwise, "repz cmpsb" may pass the 0 byte. */
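  /* Illustrative example (added, not from the original comment): for
     strncmp (s, "ab", 100) the constant operand lets expand_builtin_strncmp
     shrink the length argument to MIN (strlen ("ab") + 1, 100) = 3, so the
     emitted "repz cmpsb" stops at the terminating NUL instead of comparing
     up to 100 bytes past the end of either string.  */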
7749 tree t1
= MEM_EXPR (src1
);
7750 tree t2
= MEM_EXPR (src2
);
7751 if (!((t1
&& TREE_CODE (t1
) == MEM_REF
7752 && TREE_CODE (TREE_OPERAND (t1
, 0)) == ADDR_EXPR
7753 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1
, 0), 0))
7755 || (t2
&& TREE_CODE (t2
) == MEM_REF
7756 && TREE_CODE (TREE_OPERAND (t2
, 0)) == ADDR_EXPR
7757 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2
, 0), 0))
7762 rtx addr1
= copy_addr_to_reg (XEXP (src1
, 0));
7763 rtx addr2
= copy_addr_to_reg (XEXP (src2
, 0));
7764 if (addr1
!= XEXP (src1
, 0))
7765 src1
= replace_equiv_address_nv (src1
, addr1
);
7766 if (addr2
!= XEXP (src2
, 0))
7767 src2
= replace_equiv_address_nv (src2
, addr2
);
7769 /* NB: Make a copy of the data length to avoid changing the original
7770 data length by cmpstrnqi patterns. */
7771 length
= ix86_zero_extend_to_Pmode (length
);
7772 rtx lengthreg
= gen_reg_rtx (Pmode
);
7773 emit_move_insn (lengthreg
, length
);
7775 /* If we are testing strict equality, we can use known alignment to
7776 good advantage. This may be possible with combine, particularly
7777 once cc0 is dead. */
7778 if (CONST_INT_P (length
))
7780 if (length
== const0_rtx
)
7782 emit_move_insn (result
, const0_rtx
);
7785 emit_insn (gen_cmpstrnqi_nz_1 (addr1
, addr2
, lengthreg
, align
,
7790 emit_insn (gen_cmp_1 (Pmode
, lengthreg
, lengthreg
));
7791 emit_insn (gen_cmpstrnqi_1 (addr1
, addr2
, lengthreg
, align
,
7795 rtx out
= gen_lowpart (QImode
, result
);
7796 emit_insn (gen_cmpintqi (out
));
7797 emit_move_insn (result
, gen_rtx_SIGN_EXTEND (SImode
, out
));
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the start address when
	not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */
7814 ix86_expand_strlensi_unroll_1 (rtx out
, rtx src
, rtx align_rtx
)
7818 rtx_code_label
*align_2_label
= NULL
;
7819 rtx_code_label
*align_3_label
= NULL
;
7820 rtx_code_label
*align_4_label
= gen_label_rtx ();
7821 rtx_code_label
*end_0_label
= gen_label_rtx ();
7823 rtx tmpreg
= gen_reg_rtx (SImode
);
7824 rtx scratch
= gen_reg_rtx (SImode
);
7828 if (CONST_INT_P (align_rtx
))
7829 align
= INTVAL (align_rtx
);
7831 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7833 /* Is there a known alignment and is it less than 4? */
7836 rtx scratch1
= gen_reg_rtx (Pmode
);
7837 emit_move_insn (scratch1
, out
);
7838 /* Is there a known alignment and is it not 2? */
7841 align_3_label
= gen_label_rtx (); /* Label when aligned to 3-byte */
7842 align_2_label
= gen_label_rtx (); /* Label when aligned to 2-byte */
7844 /* Leave just the 3 lower bits. */
7845 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, GEN_INT (3),
7846 NULL_RTX
, 0, OPTAB_WIDEN
);
7848 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7849 Pmode
, 1, align_4_label
);
7850 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, EQ
, NULL
,
7851 Pmode
, 1, align_2_label
);
7852 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, GTU
, NULL
,
7853 Pmode
, 1, align_3_label
);
      /* Since the alignment is 2, we have to check 2 or 0 bytes;
	 check if it is aligned to 4 bytes.  */
7860 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, const2_rtx
,
7861 NULL_RTX
, 0, OPTAB_WIDEN
);
7863 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7864 Pmode
, 1, align_4_label
);
7867 mem
= change_address (src
, QImode
, out
);
7869 /* Now compare the bytes. */
7871 /* Compare the first n unaligned byte on a byte per byte basis. */
7872 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
,
7873 QImode
, 1, end_0_label
);
7875 /* Increment the address. */
7876 emit_insn (gen_add2_insn (out
, const1_rtx
));
7878 /* Not needed with an alignment of 2 */
7881 emit_label (align_2_label
);
7883 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7886 emit_insn (gen_add2_insn (out
, const1_rtx
));
7888 emit_label (align_3_label
);
7891 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7894 emit_insn (gen_add2_insn (out
, const1_rtx
));
  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop: it only enlarges the program and does not speed it up.  */
7900 emit_label (align_4_label
);
7902 mem
= change_address (src
, SImode
, out
);
7903 emit_move_insn (scratch
, mem
);
7904 emit_insn (gen_add2_insn (out
, GEN_INT (4)));
7906 /* This formula yields a nonzero result iff one of the bytes is zero.
7907 This saves three branches inside loop and many cycles. */
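  /* Worked example (added for illustration, not part of the original
     comment): for the word x = 0x41004242, whose second-highest byte is
     zero, the steps below compute

       x - 0x01010101            = 0x3FFF4141
       ~x                        = 0xBEFFBDBD
       (x - 0x01010101) & ~x     = 0x3EFF0101
       ... & 0x80808080          = 0x00800000   != 0 -> a zero byte exists

     whereas for a word with no zero byte the high bit of every byte in
     (x - 0x01010101) & ~x is clear and the final result is 0.  */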
7909 emit_insn (gen_addsi3 (tmpreg
, scratch
, GEN_INT (-0x01010101)));
7910 emit_insn (gen_one_cmplsi2 (scratch
, scratch
));
7911 emit_insn (gen_andsi3 (tmpreg
, tmpreg
, scratch
));
7912 emit_insn (gen_andsi3 (tmpreg
, tmpreg
,
7913 gen_int_mode (0x80808080, SImode
)));
7914 emit_cmp_and_jump_insns (tmpreg
, const0_rtx
, EQ
, 0, SImode
, 1,
7919 rtx reg
= gen_reg_rtx (SImode
);
7920 rtx reg2
= gen_reg_rtx (Pmode
);
7921 emit_move_insn (reg
, tmpreg
);
7922 emit_insn (gen_lshrsi3 (reg
, reg
, GEN_INT (16)));
7924 /* If zero is not in the first two bytes, move two bytes forward. */
7925 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7926 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7927 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7928 emit_insn (gen_rtx_SET (tmpreg
,
7929 gen_rtx_IF_THEN_ELSE (SImode
, tmp
,
7932 /* Emit lea manually to avoid clobbering of flags. */
7933 emit_insn (gen_rtx_SET (reg2
, plus_constant (Pmode
, out
, 2)));
7935 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7936 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7937 emit_insn (gen_rtx_SET (out
,
7938 gen_rtx_IF_THEN_ELSE (Pmode
, tmp
,
7944 rtx_code_label
*end_2_label
= gen_label_rtx ();
7945 /* Is zero in the first two bytes? */
7947 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7948 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7949 tmp
= gen_rtx_NE (VOIDmode
, tmp
, const0_rtx
);
7950 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
7951 gen_rtx_LABEL_REF (VOIDmode
, end_2_label
),
7953 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
7954 JUMP_LABEL (tmp
) = end_2_label
;
7956 /* Not in the first two. Move two bytes forward. */
7957 emit_insn (gen_lshrsi3 (tmpreg
, tmpreg
, GEN_INT (16)));
7958 emit_insn (gen_add2_insn (out
, const2_rtx
));
7960 emit_label (end_2_label
);
7964 /* Avoid branch in fixing the byte. */
7965 tmpreg
= gen_lowpart (QImode
, tmpreg
);
7966 emit_insn (gen_addqi3_cconly_overflow (tmpreg
, tmpreg
));
7967 tmp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
7968 cmp
= gen_rtx_LTU (VOIDmode
, tmp
, const0_rtx
);
7969 emit_insn (gen_sub3_carry (Pmode
, out
, out
, GEN_INT (3), tmp
, cmp
));
7971 emit_label (end_0_label
);
/* Expand strlen.  */

bool
ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
{
  if (TARGET_UNROLL_STRLEN
      && TARGET_INLINE_ALL_STRINGOPS
      && eoschar == const0_rtx
      && optimize > 1)
    {
      /* The generic case of strlen expander is long.  Avoid expanding it
	 unless TARGET_INLINE_ALL_STRINGOPS.  */
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      /* Well it seems that some optimizer does not combine a call like
	 foo(strlen(bar), strlen(bar));
	 when the move and the subtraction is done here.  It does calculate
	 the length just once when these instructions are done inside of
	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
	 often used and I use one fewer register for the lifetime of
	 output_strlen_unroll() this is better.  */

      emit_move_insn (out, addr);

      ix86_expand_strlensi_unroll_1 (out, src, align);

      /* strlensi_unroll_1 returns the address of the zero at the end of
	 the string, like memchr(), so compute the length by subtracting
	 the start address.  */
      emit_insn (gen_sub2_insn (out, addr));
      return true;
    }
  else
    return false;
}
/* For a given symbol (function) construct code to compute the address of its PLT
   entry in large x86-64 PIC model.  */
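/* Illustrative note (added, not in the original sources): in the large PIC
   model the PLT entry is not reachable with a 32-bit displacement, so its
   address is computed explicitly as

       tmp  = symbol@PLTOFF          ; UNSPEC_PLTOFF constant
       tmp += pic_offset_table_rtx   ; GOT base register

   and the call is then made indirectly through TMP.  */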
static rtx
construct_plt_address (rtx symbol)
{
  rtx tmp, unspec;

  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
  gcc_assert (Pmode == DImode);

  tmp = gen_reg_rtx (Pmode);
  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);

  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
  emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
  return tmp;
}
/* Additional registers that are clobbered by SYSV calls.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
		 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
8041 ix86_expand_call (rtx retval
, rtx fnaddr
, rtx callarg1
,
8043 rtx pop
, bool sibcall
)
8046 rtx use
= NULL
, call
;
8047 unsigned int vec_len
= 0;
8050 if (GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
8052 fndecl
= SYMBOL_REF_DECL (XEXP (fnaddr
, 0));
8054 && (lookup_attribute ("interrupt",
8055 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
)))))
8056 error ("interrupt service routine cannot be called directly");
8061 if (pop
== const0_rtx
)
8063 gcc_assert (!TARGET_64BIT
|| !pop
);
8065 if (TARGET_MACHO
&& !TARGET_64BIT
)
8068 if (flag_pic
&& GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
8069 fnaddr
= machopic_indirect_call_target (fnaddr
);
8074 /* Static functions and indirect calls don't need the pic register. Also,
8075 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
8076 it an indirect call. */
8077 rtx addr
= XEXP (fnaddr
, 0);
8079 && GET_CODE (addr
) == SYMBOL_REF
8080 && !SYMBOL_REF_LOCAL_P (addr
))
8083 && (SYMBOL_REF_DECL (addr
) == NULL_TREE
8084 || !lookup_attribute ("noplt",
8085 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr
)))))
8088 || (ix86_cmodel
== CM_LARGE_PIC
8089 && DEFAULT_ABI
!= MS_ABI
))
8091 use_reg (&use
, gen_rtx_REG (Pmode
,
8092 REAL_PIC_OFFSET_TABLE_REGNUM
));
8093 if (ix86_use_pseudo_pic_reg ())
8094 emit_move_insn (gen_rtx_REG (Pmode
,
8095 REAL_PIC_OFFSET_TABLE_REGNUM
),
8096 pic_offset_table_rtx
);
8099 else if (!TARGET_PECOFF
&& !TARGET_MACHO
)
8102 && ix86_cmodel
== CM_LARGE_PIC
8103 && DEFAULT_ABI
!= MS_ABI
)
8105 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
8107 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8108 fnaddr
= force_reg (Pmode
, fnaddr
);
8109 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
, fnaddr
);
8111 else if (TARGET_64BIT
)
8113 fnaddr
= gen_rtx_UNSPEC (Pmode
,
8114 gen_rtvec (1, addr
),
8116 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8120 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
8122 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8123 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
,
8126 fnaddr
= gen_const_mem (Pmode
, fnaddr
);
8127 /* Pmode may not be the same as word_mode for x32, which
8128 doesn't support indirect branch via 32-bit memory slot.
8129 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8130 indirect branch via x32 GOT slot is OK. */
8131 if (GET_MODE (fnaddr
) != word_mode
)
8132 fnaddr
= gen_rtx_ZERO_EXTEND (word_mode
, fnaddr
);
8133 fnaddr
= gen_rtx_MEM (QImode
, fnaddr
);
8138 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8139 parameters passed in vector registers. */
8141 && (INTVAL (callarg2
) > 0
8142 || (INTVAL (callarg2
) == 0
8143 && (TARGET_SSE
|| !flag_skip_rax_setup
))))
8145 rtx al
= gen_rtx_REG (QImode
, AX_REG
);
8146 emit_move_insn (al
, callarg2
);
8150 if (ix86_cmodel
== CM_LARGE_PIC
8153 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
8154 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
8155 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
8156 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8157 branch via x32 GOT slot is OK. */
8158 else if (!(TARGET_X32
8160 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
8161 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
8163 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
8164 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
8166 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
8167 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
8170 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
8173 call
= gen_rtx_SET (retval
, call
);
8174 vec
[vec_len
++] = call
;
8178 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
8179 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
8180 vec
[vec_len
++] = pop
;
8183 if (cfun
->machine
->no_caller_saved_registers
8185 || (!TREE_THIS_VOLATILE (fndecl
)
8186 && !lookup_attribute ("no_caller_saved_registers",
8187 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
8189 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
8190 bool is_64bit_ms_abi
= (TARGET_64BIT
8191 && ix86_function_abi (fndecl
) == MS_ABI
);
8192 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
8194 /* If there are no caller-saved registers, add all registers
8195 that are clobbered by the call which returns. */
8196 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
8198 && (ix86_call_used_regs
[i
] == 1
8199 || (ix86_call_used_regs
[i
] & c_mask
))
8200 && !STACK_REGNO_P (i
)
8201 && !MMX_REGNO_P (i
))
8203 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
8205 else if (TARGET_64BIT_MS_ABI
8206 && (!callarg2
|| INTVAL (callarg2
) != -2))
8210 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
8212 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
8213 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
8215 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
8218 /* Set here, but it may get cleared later. */
8219 if (TARGET_CALL_MS2SYSV_XLOGUES
)
8224 /* Don't break hot-patched functions. */
8225 else if (ix86_function_ms_hook_prologue (current_function_decl
))
8228 /* TODO: Cases not yet examined. */
8229 else if (flag_split_stack
)
8230 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8234 gcc_assert (!reload_completed
);
8235 cfun
->machine
->call_ms2sysv
= true;
8241 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
8242 rtx_insn
*call_insn
= emit_call_insn (call
);
8244 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;
/* Split a simple return with popping POPC bytes from the stack into an
   indirect branch with stack adjustment.  */
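/* Illustrative sketch of the emitted sequence (added comment, not from the
   original sources), for e.g. POPC == 8 in 32-bit code:

       popl  %ecx          ; return address -> %ecx
       addl  $8, %esp      ; drop the callee-popped argument bytes
       jmp   *%ecx         ; return

   which replaces the single "ret $8" instruction.  */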
void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
/* Errors in the source file can cause expand_expr to return const0_rtx
   where we expect a vector.  To avoid crashing, use one of the vector
   clear instructions.  */

static rtx
safe_vector_operand (rtx x, machine_mode mode)
{
  if (x == const0_rtx)
    x = CONST0_RTX (mode);
  return x;
}
/* Subroutine of ix86_expand_builtin to take care of binop insns.  */

static rtx
ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;
  machine_mode mode1 = insn_data[icode].operand[2].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (GET_MODE (op1) == SImode && mode1 == TImode)
    {
      rtx x = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_loadd (x, op1));
      op1 = gen_lowpart (TImode, x);
    }

  if (!insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if (!insn_data[icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;

  emit_insn (pat);

  return target;
}
8339 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8342 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
8343 enum ix86_builtin_func_type m_type
,
8344 enum rtx_code sub_code
)
8347 unsigned int i
, nargs
;
8348 bool comparison_p
= false;
8350 bool last_arg_constant
= false;
8354 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8358 case MULTI_ARG_4_DF2_DI_I
:
8359 case MULTI_ARG_4_DF2_DI_I1
:
8360 case MULTI_ARG_4_SF2_SI_I
:
8361 case MULTI_ARG_4_SF2_SI_I1
:
8363 last_arg_constant
= true;
8366 case MULTI_ARG_3_SF
:
8367 case MULTI_ARG_3_DF
:
8368 case MULTI_ARG_3_SF2
:
8369 case MULTI_ARG_3_DF2
:
8370 case MULTI_ARG_3_DI
:
8371 case MULTI_ARG_3_SI
:
8372 case MULTI_ARG_3_SI_DI
:
8373 case MULTI_ARG_3_HI
:
8374 case MULTI_ARG_3_HI_SI
:
8375 case MULTI_ARG_3_QI
:
8376 case MULTI_ARG_3_DI2
:
8377 case MULTI_ARG_3_SI2
:
8378 case MULTI_ARG_3_HI2
:
8379 case MULTI_ARG_3_QI2
:
8383 case MULTI_ARG_2_SF
:
8384 case MULTI_ARG_2_DF
:
8385 case MULTI_ARG_2_DI
:
8386 case MULTI_ARG_2_SI
:
8387 case MULTI_ARG_2_HI
:
8388 case MULTI_ARG_2_QI
:
8392 case MULTI_ARG_2_DI_IMM
:
8393 case MULTI_ARG_2_SI_IMM
:
8394 case MULTI_ARG_2_HI_IMM
:
8395 case MULTI_ARG_2_QI_IMM
:
8397 last_arg_constant
= true;
8400 case MULTI_ARG_1_SF
:
8401 case MULTI_ARG_1_DF
:
8402 case MULTI_ARG_1_SF2
:
8403 case MULTI_ARG_1_DF2
:
8404 case MULTI_ARG_1_DI
:
8405 case MULTI_ARG_1_SI
:
8406 case MULTI_ARG_1_HI
:
8407 case MULTI_ARG_1_QI
:
8408 case MULTI_ARG_1_SI_DI
:
8409 case MULTI_ARG_1_HI_DI
:
8410 case MULTI_ARG_1_HI_SI
:
8411 case MULTI_ARG_1_QI_DI
:
8412 case MULTI_ARG_1_QI_SI
:
8413 case MULTI_ARG_1_QI_HI
:
8417 case MULTI_ARG_2_DI_CMP
:
8418 case MULTI_ARG_2_SI_CMP
:
8419 case MULTI_ARG_2_HI_CMP
:
8420 case MULTI_ARG_2_QI_CMP
:
8422 comparison_p
= true;
8425 case MULTI_ARG_2_SF_TF
:
8426 case MULTI_ARG_2_DF_TF
:
8427 case MULTI_ARG_2_DI_TF
:
8428 case MULTI_ARG_2_SI_TF
:
8429 case MULTI_ARG_2_HI_TF
:
8430 case MULTI_ARG_2_QI_TF
:
8439 if (optimize
|| !target
8440 || GET_MODE (target
) != tmode
8441 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8442 target
= gen_reg_rtx (tmode
);
8443 else if (memory_operand (target
, tmode
))
8446 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
8448 for (i
= 0; i
< nargs
; i
++)
8450 tree arg
= CALL_EXPR_ARG (exp
, i
);
8451 rtx op
= expand_normal (arg
);
8452 int adjust
= (comparison_p
) ? 1 : 0;
8453 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
8455 if (last_arg_constant
&& i
== nargs
- 1)
8457 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
8459 enum insn_code new_icode
= icode
;
8462 case CODE_FOR_xop_vpermil2v2df3
:
8463 case CODE_FOR_xop_vpermil2v4sf3
:
8464 case CODE_FOR_xop_vpermil2v4df3
:
8465 case CODE_FOR_xop_vpermil2v8sf3
:
8466 error ("the last argument must be a 2-bit immediate");
8467 return gen_reg_rtx (tmode
);
8468 case CODE_FOR_xop_rotlv2di3
:
8469 new_icode
= CODE_FOR_rotlv2di3
;
8471 case CODE_FOR_xop_rotlv4si3
:
8472 new_icode
= CODE_FOR_rotlv4si3
;
8474 case CODE_FOR_xop_rotlv8hi3
:
8475 new_icode
= CODE_FOR_rotlv8hi3
;
8477 case CODE_FOR_xop_rotlv16qi3
:
8478 new_icode
= CODE_FOR_rotlv16qi3
;
8480 if (CONST_INT_P (op
))
8482 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
8483 op
= GEN_INT (INTVAL (op
) & mask
);
8485 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
8491 && insn_data
[new_icode
].operand
[0].mode
== tmode
8492 && insn_data
[new_icode
].operand
[1].mode
== tmode
8493 && insn_data
[new_icode
].operand
[2].mode
== mode
8494 && insn_data
[new_icode
].operand
[0].predicate
8495 == insn_data
[icode
].operand
[0].predicate
8496 && insn_data
[new_icode
].operand
[1].predicate
8497 == insn_data
[icode
].operand
[1].predicate
);
8510 if (VECTOR_MODE_P (mode
))
8511 op
= safe_vector_operand (op
, mode
);
	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
8515 if (memory_operand (op
, mode
))
8518 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
8521 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
8523 op
= force_reg (mode
, op
);
8532 pat
= GEN_FCN (icode
) (target
, xops
[0]);
8537 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
8538 GEN_INT ((int)sub_code
));
8539 else if (! comparison_p
)
8540 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
8543 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
8546 pat
= GEN_FCN (icode
) (target
, cmp_op
, xops
[0], xops
[1]);
8551 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
8555 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2], xops
[3]);
8569 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8570 insns with vec_merge. */
8573 ix86_expand_unop_vec_merge_builtin (enum insn_code icode
, tree exp
,
8577 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8578 rtx op1
, op0
= expand_normal (arg0
);
8579 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8580 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8582 if (optimize
|| !target
8583 || GET_MODE (target
) != tmode
8584 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8585 target
= gen_reg_rtx (tmode
);
8587 if (VECTOR_MODE_P (mode0
))
8588 op0
= safe_vector_operand (op0
, mode0
);
8590 if ((optimize
&& !register_operand (op0
, mode0
))
8591 || !insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8592 op0
= copy_to_mode_reg (mode0
, op0
);
8595 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode0
))
8596 op1
= copy_to_mode_reg (mode0
, op1
);
8598 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8605 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8608 ix86_expand_sse_compare (const struct builtin_description
*d
,
8609 tree exp
, rtx target
, bool swap
)
8612 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8613 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8614 rtx op0
= expand_normal (arg0
);
8615 rtx op1
= expand_normal (arg1
);
8617 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8618 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8619 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8620 enum rtx_code comparison
= d
->comparison
;
8622 if (VECTOR_MODE_P (mode0
))
8623 op0
= safe_vector_operand (op0
, mode0
);
8624 if (VECTOR_MODE_P (mode1
))
8625 op1
= safe_vector_operand (op1
, mode1
);
8627 /* Swap operands if we have a comparison that isn't available in
8630 std::swap (op0
, op1
);
8632 if (optimize
|| !target
8633 || GET_MODE (target
) != tmode
8634 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8635 target
= gen_reg_rtx (tmode
);
8637 if ((optimize
&& !register_operand (op0
, mode0
))
8638 || !insn_data
[d
->icode
].operand
[1].predicate (op0
, mode0
))
8639 op0
= copy_to_mode_reg (mode0
, op0
);
8640 if ((optimize
&& !register_operand (op1
, mode1
))
8641 || !insn_data
[d
->icode
].operand
[2].predicate (op1
, mode1
))
8642 op1
= copy_to_mode_reg (mode1
, op1
);
8644 op2
= gen_rtx_fmt_ee (comparison
, mode0
, op0
, op1
);
8645 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8652 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8655 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
8659 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8660 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8661 rtx op0
= expand_normal (arg0
);
8662 rtx op1
= expand_normal (arg1
);
8663 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8664 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8665 enum rtx_code comparison
= d
->comparison
;
8667 if (VECTOR_MODE_P (mode0
))
8668 op0
= safe_vector_operand (op0
, mode0
);
8669 if (VECTOR_MODE_P (mode1
))
8670 op1
= safe_vector_operand (op1
, mode1
);
8672 target
= gen_reg_rtx (SImode
);
8673 emit_move_insn (target
, const0_rtx
);
8674 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8676 if ((optimize
&& !register_operand (op0
, mode0
))
8677 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8678 op0
= copy_to_mode_reg (mode0
, op0
);
8679 if ((optimize
&& !register_operand (op1
, mode1
))
8680 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8681 op1
= copy_to_mode_reg (mode1
, op1
);
8683 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8687 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8688 gen_rtx_fmt_ee (comparison
, QImode
,
8692 return SUBREG_REG (target
);
8695 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8698 ix86_expand_sse_round (const struct builtin_description
*d
, tree exp
,
8702 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8703 rtx op1
, op0
= expand_normal (arg0
);
8704 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8705 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8707 if (optimize
|| target
== 0
8708 || GET_MODE (target
) != tmode
8709 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8710 target
= gen_reg_rtx (tmode
);
8712 if (VECTOR_MODE_P (mode0
))
8713 op0
= safe_vector_operand (op0
, mode0
);
8715 if ((optimize
&& !register_operand (op0
, mode0
))
8716 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8717 op0
= copy_to_mode_reg (mode0
, op0
);
8719 op1
= GEN_INT (d
->comparison
);
8721 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
);
8729 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
8730 tree exp
, rtx target
)
8733 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8734 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8735 rtx op0
= expand_normal (arg0
);
8736 rtx op1
= expand_normal (arg1
);
8738 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8739 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8740 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8742 if (optimize
|| target
== 0
8743 || GET_MODE (target
) != tmode
8744 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8745 target
= gen_reg_rtx (tmode
);
8747 op0
= safe_vector_operand (op0
, mode0
);
8748 op1
= safe_vector_operand (op1
, mode1
);
8750 if ((optimize
&& !register_operand (op0
, mode0
))
8751 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8752 op0
= copy_to_mode_reg (mode0
, op0
);
8753 if ((optimize
&& !register_operand (op1
, mode1
))
8754 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8755 op1
= copy_to_mode_reg (mode1
, op1
);
8757 op2
= GEN_INT (d
->comparison
);
8759 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8766 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8769 ix86_expand_sse_ptest (const struct builtin_description
*d
, tree exp
,
8773 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8774 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8775 rtx op0
= expand_normal (arg0
);
8776 rtx op1
= expand_normal (arg1
);
8777 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8778 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8779 enum rtx_code comparison
= d
->comparison
;
8781 if (VECTOR_MODE_P (mode0
))
8782 op0
= safe_vector_operand (op0
, mode0
);
8783 if (VECTOR_MODE_P (mode1
))
8784 op1
= safe_vector_operand (op1
, mode1
);
8786 target
= gen_reg_rtx (SImode
);
8787 emit_move_insn (target
, const0_rtx
);
8788 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8790 if ((optimize
&& !register_operand (op0
, mode0
))
8791 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8792 op0
= copy_to_mode_reg (mode0
, op0
);
8793 if ((optimize
&& !register_operand (op1
, mode1
))
8794 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8795 op1
= copy_to_mode_reg (mode1
, op1
);
8797 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8801 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8802 gen_rtx_fmt_ee (comparison
, QImode
,
8806 return SUBREG_REG (target
);
8809 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8812 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
8813 tree exp
, rtx target
)
8816 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8817 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8818 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8819 tree arg3
= CALL_EXPR_ARG (exp
, 3);
8820 tree arg4
= CALL_EXPR_ARG (exp
, 4);
8821 rtx scratch0
, scratch1
;
8822 rtx op0
= expand_normal (arg0
);
8823 rtx op1
= expand_normal (arg1
);
8824 rtx op2
= expand_normal (arg2
);
8825 rtx op3
= expand_normal (arg3
);
8826 rtx op4
= expand_normal (arg4
);
8827 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
8829 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8830 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8831 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8832 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
8833 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
8834 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
8835 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
8837 if (VECTOR_MODE_P (modev2
))
8838 op0
= safe_vector_operand (op0
, modev2
);
8839 if (VECTOR_MODE_P (modev4
))
8840 op2
= safe_vector_operand (op2
, modev4
);
8842 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8843 op0
= copy_to_mode_reg (modev2
, op0
);
8844 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
8845 op1
= copy_to_mode_reg (modei3
, op1
);
8846 if ((optimize
&& !register_operand (op2
, modev4
))
8847 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
8848 op2
= copy_to_mode_reg (modev4
, op2
);
8849 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
8850 op3
= copy_to_mode_reg (modei5
, op3
);
8852 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
8854 error ("the fifth argument must be an 8-bit immediate");
8858 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
8860 if (optimize
|| !target
8861 || GET_MODE (target
) != tmode0
8862 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8863 target
= gen_reg_rtx (tmode0
);
8865 scratch1
= gen_reg_rtx (tmode1
);
8867 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8869 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
8871 if (optimize
|| !target
8872 || GET_MODE (target
) != tmode1
8873 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8874 target
= gen_reg_rtx (tmode1
);
8876 scratch0
= gen_reg_rtx (tmode0
);
8878 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
8882 gcc_assert (d
->flag
);
8884 scratch0
= gen_reg_rtx (tmode0
);
8885 scratch1
= gen_reg_rtx (tmode1
);
8887 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8897 target
= gen_reg_rtx (SImode
);
8898 emit_move_insn (target
, const0_rtx
);
8899 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8902 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8903 gen_rtx_fmt_ee (EQ
, QImode
,
8904 gen_rtx_REG ((machine_mode
) d
->flag
,
8907 return SUBREG_REG (target
);
8914 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8917 ix86_expand_sse_pcmpistr (const struct builtin_description
*d
,
8918 tree exp
, rtx target
)
8921 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8922 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8923 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8924 rtx scratch0
, scratch1
;
8925 rtx op0
= expand_normal (arg0
);
8926 rtx op1
= expand_normal (arg1
);
8927 rtx op2
= expand_normal (arg2
);
8928 machine_mode tmode0
, tmode1
, modev2
, modev3
, modeimm
;
8930 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8931 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8932 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8933 modev3
= insn_data
[d
->icode
].operand
[3].mode
;
8934 modeimm
= insn_data
[d
->icode
].operand
[4].mode
;
8936 if (VECTOR_MODE_P (modev2
))
8937 op0
= safe_vector_operand (op0
, modev2
);
8938 if (VECTOR_MODE_P (modev3
))
8939 op1
= safe_vector_operand (op1
, modev3
);
8941 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8942 op0
= copy_to_mode_reg (modev2
, op0
);
8943 if ((optimize
&& !register_operand (op1
, modev3
))
8944 || !insn_data
[d
->icode
].operand
[3].predicate (op1
, modev3
))
8945 op1
= copy_to_mode_reg (modev3
, op1
);
8947 if (!insn_data
[d
->icode
].operand
[4].predicate (op2
, modeimm
))
8949 error ("the third argument must be an 8-bit immediate");
8953 if (d
->code
== IX86_BUILTIN_PCMPISTRI128
)
8955 if (optimize
|| !target
8956 || GET_MODE (target
) != tmode0
8957 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8958 target
= gen_reg_rtx (tmode0
);
8960 scratch1
= gen_reg_rtx (tmode1
);
8962 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
);
8964 else if (d
->code
== IX86_BUILTIN_PCMPISTRM128
)
8966 if (optimize
|| !target
8967 || GET_MODE (target
) != tmode1
8968 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8969 target
= gen_reg_rtx (tmode1
);
8971 scratch0
= gen_reg_rtx (tmode0
);
8973 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
);
8977 gcc_assert (d
->flag
);
8979 scratch0
= gen_reg_rtx (tmode0
);
8980 scratch1
= gen_reg_rtx (tmode1
);
8982 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
);
8992 target
= gen_reg_rtx (SImode
);
8993 emit_move_insn (target
, const0_rtx
);
8994 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8997 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8998 gen_rtx_fmt_ee (EQ
, QImode
,
8999 gen_rtx_REG ((machine_mode
) d
->flag
,
9002 return SUBREG_REG (target
);
9008 /* Fixup modeless constants to fit required mode. */
9011 fixup_modeless_constant (rtx x
, machine_mode mode
)
9013 if (GET_MODE (x
) == VOIDmode
)
9014 x
= convert_to_mode (mode
, x
, 1);
9018 /* Subroutine of ix86_expand_builtin to take care of insns with
9019 variable number of operands. */
9022 ix86_expand_args_builtin (const struct builtin_description
*d
,
9023 tree exp
, rtx target
)
9025 rtx pat
, real_target
;
9026 unsigned int i
, nargs
;
9027 unsigned int nargs_constant
= 0;
9028 unsigned int mask_pos
= 0;
9031 bool second_arg_count
= false;
9032 enum insn_code icode
= d
->icode
;
9033 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
9034 machine_mode tmode
= insn_p
->operand
[0].mode
;
9035 machine_mode rmode
= VOIDmode
;
9037 enum rtx_code comparison
= d
->comparison
;
9039 switch ((enum ix86_builtin_func_type
) d
->flag
)
    case V2DF_FTYPE_V2DF_ROUND:
    case V4DF_FTYPE_V4DF_ROUND:
    case V8DF_FTYPE_V8DF_ROUND:
    case V4SF_FTYPE_V4SF_ROUND:
    case V8SF_FTYPE_V8SF_ROUND:
    case V16SF_FTYPE_V16SF_ROUND:
    case V4SI_FTYPE_V4SF_ROUND:
    case V8SI_FTYPE_V8SF_ROUND:
    case V16SI_FTYPE_V16SF_ROUND:
      return ix86_expand_sse_round (d, exp, target);
    case V4SI_FTYPE_V2DF_V2DF_ROUND:
    case V8SI_FTYPE_V4DF_V4DF_ROUND:
    case V16SI_FTYPE_V8DF_V8DF_ROUND:
      return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
    case INT_FTYPE_V8SF_V8SF_PTEST:
    case INT_FTYPE_V4DI_V4DI_PTEST:
    case INT_FTYPE_V4DF_V4DF_PTEST:
    case INT_FTYPE_V4SF_V4SF_PTEST:
    case INT_FTYPE_V2DI_V2DI_PTEST:
    case INT_FTYPE_V2DF_V2DF_PTEST:
      return ix86_expand_sse_ptest (d, exp, target);
9062 case FLOAT128_FTYPE_FLOAT128
:
9063 case FLOAT_FTYPE_FLOAT
:
9065 case UINT_FTYPE_UINT
:
9066 case UINT16_FTYPE_UINT16
:
9067 case UINT64_FTYPE_INT
:
9068 case UINT64_FTYPE_UINT64
:
9069 case INT64_FTYPE_INT64
:
9070 case INT64_FTYPE_V4SF
:
9071 case INT64_FTYPE_V2DF
:
9072 case INT_FTYPE_V16QI
:
9073 case INT_FTYPE_V8QI
:
9074 case INT_FTYPE_V8SF
:
9075 case INT_FTYPE_V4DF
:
9076 case INT_FTYPE_V4SF
:
9077 case INT_FTYPE_V2DF
:
9078 case INT_FTYPE_V32QI
:
9079 case V16QI_FTYPE_V16QI
:
9080 case V8SI_FTYPE_V8SF
:
9081 case V8SI_FTYPE_V4SI
:
9082 case V8HI_FTYPE_V8HI
:
9083 case V8HI_FTYPE_V16QI
:
9084 case V8QI_FTYPE_V8QI
:
9085 case V8SF_FTYPE_V8SF
:
9086 case V8SF_FTYPE_V8SI
:
9087 case V8SF_FTYPE_V4SF
:
9088 case V8SF_FTYPE_V8HI
:
9089 case V4SI_FTYPE_V4SI
:
9090 case V4SI_FTYPE_V16QI
:
9091 case V4SI_FTYPE_V4SF
:
9092 case V4SI_FTYPE_V8SI
:
9093 case V4SI_FTYPE_V8HI
:
9094 case V4SI_FTYPE_V4DF
:
9095 case V4SI_FTYPE_V2DF
:
9096 case V4HI_FTYPE_V4HI
:
9097 case V4DF_FTYPE_V4DF
:
9098 case V4DF_FTYPE_V4SI
:
9099 case V4DF_FTYPE_V4SF
:
9100 case V4DF_FTYPE_V2DF
:
9101 case V4SF_FTYPE_V4SF
:
9102 case V4SF_FTYPE_V4SI
:
9103 case V4SF_FTYPE_V8SF
:
9104 case V4SF_FTYPE_V4DF
:
9105 case V4SF_FTYPE_V8HI
:
9106 case V4SF_FTYPE_V2DF
:
9107 case V2DI_FTYPE_V2DI
:
9108 case V2DI_FTYPE_V16QI
:
9109 case V2DI_FTYPE_V8HI
:
9110 case V2DI_FTYPE_V4SI
:
9111 case V2DF_FTYPE_V2DF
:
9112 case V2DF_FTYPE_V4SI
:
9113 case V2DF_FTYPE_V4DF
:
9114 case V2DF_FTYPE_V4SF
:
9115 case V2DF_FTYPE_V2SI
:
9116 case V2SI_FTYPE_V2SI
:
9117 case V2SI_FTYPE_V4SF
:
9118 case V2SI_FTYPE_V2SF
:
9119 case V2SI_FTYPE_V2DF
:
9120 case V2SF_FTYPE_V2SF
:
9121 case V2SF_FTYPE_V2SI
:
9122 case V32QI_FTYPE_V32QI
:
9123 case V32QI_FTYPE_V16QI
:
9124 case V16HI_FTYPE_V16HI
:
9125 case V16HI_FTYPE_V8HI
:
9126 case V8SI_FTYPE_V8SI
:
9127 case V16HI_FTYPE_V16QI
:
9128 case V8SI_FTYPE_V16QI
:
9129 case V4DI_FTYPE_V16QI
:
9130 case V8SI_FTYPE_V8HI
:
9131 case V4DI_FTYPE_V8HI
:
9132 case V4DI_FTYPE_V4SI
:
9133 case V4DI_FTYPE_V2DI
:
9140 case UHI_FTYPE_V16QI
:
9141 case USI_FTYPE_V32QI
:
9142 case UDI_FTYPE_V64QI
:
9143 case V16QI_FTYPE_UHI
:
9144 case V32QI_FTYPE_USI
:
9145 case V64QI_FTYPE_UDI
:
9146 case V8HI_FTYPE_UQI
:
9147 case V16HI_FTYPE_UHI
:
9148 case V32HI_FTYPE_USI
:
9149 case V4SI_FTYPE_UQI
:
9150 case V8SI_FTYPE_UQI
:
9151 case V4SI_FTYPE_UHI
:
9152 case V8SI_FTYPE_UHI
:
9153 case UQI_FTYPE_V8HI
:
9154 case UHI_FTYPE_V16HI
:
9155 case USI_FTYPE_V32HI
:
9156 case UQI_FTYPE_V4SI
:
9157 case UQI_FTYPE_V8SI
:
9158 case UHI_FTYPE_V16SI
:
9159 case UQI_FTYPE_V2DI
:
9160 case UQI_FTYPE_V4DI
:
9161 case UQI_FTYPE_V8DI
:
9162 case V16SI_FTYPE_UHI
:
9163 case V2DI_FTYPE_UQI
:
9164 case V4DI_FTYPE_UQI
:
9165 case V16SI_FTYPE_INT
:
9166 case V16SF_FTYPE_V8SF
:
9167 case V16SI_FTYPE_V8SI
:
9168 case V16SF_FTYPE_V4SF
:
9169 case V16SI_FTYPE_V4SI
:
9170 case V16SI_FTYPE_V16SF
:
9171 case V16SI_FTYPE_V16SI
:
9172 case V64QI_FTYPE_V64QI
:
9173 case V32HI_FTYPE_V32HI
:
9174 case V16SF_FTYPE_V16SF
:
9175 case V8DI_FTYPE_UQI
:
9176 case V8DI_FTYPE_V8DI
:
9177 case V8DF_FTYPE_V4DF
:
9178 case V8DF_FTYPE_V2DF
:
9179 case V8DF_FTYPE_V8DF
:
9180 case V4DI_FTYPE_V4DI
:
9181 case V16HI_FTYPE_V16SF
:
9182 case V8HI_FTYPE_V8SF
:
9183 case V8HI_FTYPE_V4SF
:
9186 case V4SF_FTYPE_V4SF_VEC_MERGE
:
9187 case V2DF_FTYPE_V2DF_VEC_MERGE
:
9188 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
9189 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
9190 case V16QI_FTYPE_V16QI_V16QI
:
9191 case V16QI_FTYPE_V8HI_V8HI
:
9192 case V16SF_FTYPE_V16SF_V16SF
:
9193 case V8QI_FTYPE_V8QI_V8QI
:
9194 case V8QI_FTYPE_V4HI_V4HI
:
9195 case V8HI_FTYPE_V8HI_V8HI
:
9196 case V8HI_FTYPE_V16QI_V16QI
:
9197 case V8HI_FTYPE_V4SI_V4SI
:
9198 case V8SF_FTYPE_V8SF_V8SF
:
9199 case V8SF_FTYPE_V8SF_V8SI
:
9200 case V8DF_FTYPE_V8DF_V8DF
:
9201 case V4SI_FTYPE_V4SI_V4SI
:
9202 case V4SI_FTYPE_V8HI_V8HI
:
9203 case V4SI_FTYPE_V2DF_V2DF
:
9204 case V4HI_FTYPE_V4HI_V4HI
:
9205 case V4HI_FTYPE_V8QI_V8QI
:
9206 case V4HI_FTYPE_V2SI_V2SI
:
9207 case V4DF_FTYPE_V4DF_V4DF
:
9208 case V4DF_FTYPE_V4DF_V4DI
:
9209 case V4SF_FTYPE_V4SF_V4SF
:
9210 case V4SF_FTYPE_V4SF_V4SI
:
9211 case V4SF_FTYPE_V4SF_V2SI
:
9212 case V4SF_FTYPE_V4SF_V2DF
:
9213 case V4SF_FTYPE_V4SF_UINT
:
9214 case V4SF_FTYPE_V4SF_DI
:
9215 case V4SF_FTYPE_V4SF_SI
:
9216 case V2DI_FTYPE_V2DI_V2DI
:
9217 case V2DI_FTYPE_V16QI_V16QI
:
9218 case V2DI_FTYPE_V4SI_V4SI
:
9219 case V2DI_FTYPE_V2DI_V16QI
:
9220 case V2SI_FTYPE_V2SI_V2SI
:
9221 case V2SI_FTYPE_V4HI_V4HI
:
9222 case V2SI_FTYPE_V2SF_V2SF
:
9223 case V2DF_FTYPE_V2DF_V2DF
:
9224 case V2DF_FTYPE_V2DF_V4SF
:
9225 case V2DF_FTYPE_V2DF_V2DI
:
9226 case V2DF_FTYPE_V2DF_DI
:
9227 case V2DF_FTYPE_V2DF_SI
:
9228 case V2DF_FTYPE_V2DF_UINT
:
9229 case V2SF_FTYPE_V2SF_V2SF
:
9230 case V1DI_FTYPE_V1DI_V1DI
:
9231 case V1DI_FTYPE_V8QI_V8QI
:
9232 case V1DI_FTYPE_V2SI_V2SI
:
9233 case V32QI_FTYPE_V16HI_V16HI
:
9234 case V16HI_FTYPE_V8SI_V8SI
:
9235 case V64QI_FTYPE_V64QI_V64QI
:
9236 case V32QI_FTYPE_V32QI_V32QI
:
9237 case V16HI_FTYPE_V32QI_V32QI
:
9238 case V16HI_FTYPE_V16HI_V16HI
:
9239 case V8SI_FTYPE_V4DF_V4DF
:
9240 case V8SI_FTYPE_V8SI_V8SI
:
9241 case V8SI_FTYPE_V16HI_V16HI
:
9242 case V4DI_FTYPE_V4DI_V4DI
:
9243 case V4DI_FTYPE_V8SI_V8SI
:
9244 case V8DI_FTYPE_V64QI_V64QI
:
      if (comparison == UNKNOWN)
        return ix86_expand_binop_builtin (icode, exp, target);
    case V4SF_FTYPE_V4SF_V4SF_SWAP:
    case V2DF_FTYPE_V2DF_V2DF_SWAP:
      gcc_assert (comparison != UNKNOWN);
9255 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
9256 case V16HI_FTYPE_V16HI_SI_COUNT
:
9257 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
9258 case V8SI_FTYPE_V8SI_SI_COUNT
:
9259 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
9260 case V4DI_FTYPE_V4DI_INT_COUNT
:
9261 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
9262 case V8HI_FTYPE_V8HI_SI_COUNT
:
9263 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
9264 case V4SI_FTYPE_V4SI_SI_COUNT
:
9265 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
9266 case V4HI_FTYPE_V4HI_SI_COUNT
:
9267 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
9268 case V2DI_FTYPE_V2DI_SI_COUNT
:
9269 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
9270 case V2SI_FTYPE_V2SI_SI_COUNT
:
9271 case V1DI_FTYPE_V1DI_V1DI_COUNT
:
9272 case V1DI_FTYPE_V1DI_SI_COUNT
:
      second_arg_count = true;
9276 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
9277 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
9278 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
9279 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
9280 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
9281 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
9282 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
9283 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
9284 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
9285 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
9286 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
9287 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
9288 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
9289 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
9290 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
9291 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
9292 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
9293 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
      second_arg_count = true;
9297 case UINT64_FTYPE_UINT64_UINT64
:
9298 case UINT_FTYPE_UINT_UINT
:
9299 case UINT_FTYPE_UINT_USHORT
:
9300 case UINT_FTYPE_UINT_UCHAR
:
9301 case UINT16_FTYPE_UINT16_INT
:
9302 case UINT8_FTYPE_UINT8_INT
:
9303 case UQI_FTYPE_UQI_UQI
:
9304 case UHI_FTYPE_UHI_UHI
:
9305 case USI_FTYPE_USI_USI
:
9306 case UDI_FTYPE_UDI_UDI
:
9307 case V16SI_FTYPE_V8DF_V8DF
:
9308 case V32HI_FTYPE_V16SF_V16SF
:
9309 case V16HI_FTYPE_V8SF_V8SF
:
9310 case V8HI_FTYPE_V4SF_V4SF
:
9311 case V16HI_FTYPE_V16SF_UHI
:
9312 case V8HI_FTYPE_V8SF_UQI
:
9313 case V8HI_FTYPE_V4SF_UQI
:
9316 case V2DI_FTYPE_V2DI_INT_CONVERT
:
9321 case V4DI_FTYPE_V4DI_INT_CONVERT
:
9326 case V8DI_FTYPE_V8DI_INT_CONVERT
:
9331 case V8HI_FTYPE_V8HI_INT
:
9332 case V8HI_FTYPE_V8SF_INT
:
9333 case V16HI_FTYPE_V16SF_INT
:
9334 case V8HI_FTYPE_V4SF_INT
:
9335 case V8SF_FTYPE_V8SF_INT
:
9336 case V4SF_FTYPE_V16SF_INT
:
9337 case V16SF_FTYPE_V16SF_INT
:
9338 case V4SI_FTYPE_V4SI_INT
:
9339 case V4SI_FTYPE_V8SI_INT
:
9340 case V4HI_FTYPE_V4HI_INT
:
9341 case V4DF_FTYPE_V4DF_INT
:
9342 case V4DF_FTYPE_V8DF_INT
:
9343 case V4SF_FTYPE_V4SF_INT
:
9344 case V4SF_FTYPE_V8SF_INT
:
9345 case V2DI_FTYPE_V2DI_INT
:
9346 case V2DF_FTYPE_V2DF_INT
:
9347 case V2DF_FTYPE_V4DF_INT
:
9348 case V16HI_FTYPE_V16HI_INT
:
9349 case V8SI_FTYPE_V8SI_INT
:
9350 case V16SI_FTYPE_V16SI_INT
:
9351 case V4SI_FTYPE_V16SI_INT
:
9352 case V4DI_FTYPE_V4DI_INT
:
9353 case V2DI_FTYPE_V4DI_INT
:
9354 case V4DI_FTYPE_V8DI_INT
:
9355 case UQI_FTYPE_UQI_UQI_CONST
:
9356 case UHI_FTYPE_UHI_UQI
:
9357 case USI_FTYPE_USI_UQI
:
9358 case UDI_FTYPE_UDI_UQI
:
9362 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
9363 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
9364 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
9365 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
9366 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
9367 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
9368 case UHI_FTYPE_V16SI_V16SI_UHI
:
9369 case UQI_FTYPE_V8DI_V8DI_UQI
:
9370 case V16HI_FTYPE_V16SI_V16HI_UHI
:
9371 case V16QI_FTYPE_V16SI_V16QI_UHI
:
9372 case V16QI_FTYPE_V8DI_V16QI_UQI
:
9373 case V16SF_FTYPE_V16SF_V16SF_UHI
:
9374 case V16SF_FTYPE_V4SF_V16SF_UHI
:
9375 case V16SI_FTYPE_SI_V16SI_UHI
:
9376 case V16SI_FTYPE_V16HI_V16SI_UHI
:
9377 case V16SI_FTYPE_V16QI_V16SI_UHI
:
9378 case V8SF_FTYPE_V4SF_V8SF_UQI
:
9379 case V4DF_FTYPE_V2DF_V4DF_UQI
:
9380 case V8SI_FTYPE_V4SI_V8SI_UQI
:
9381 case V8SI_FTYPE_SI_V8SI_UQI
:
9382 case V4SI_FTYPE_V4SI_V4SI_UQI
:
9383 case V4SI_FTYPE_SI_V4SI_UQI
:
9384 case V4DI_FTYPE_V2DI_V4DI_UQI
:
9385 case V4DI_FTYPE_DI_V4DI_UQI
:
9386 case V2DI_FTYPE_V2DI_V2DI_UQI
:
9387 case V2DI_FTYPE_DI_V2DI_UQI
:
9388 case V64QI_FTYPE_V64QI_V64QI_UDI
:
9389 case V64QI_FTYPE_V16QI_V64QI_UDI
:
9390 case V64QI_FTYPE_QI_V64QI_UDI
:
9391 case V32QI_FTYPE_V32QI_V32QI_USI
:
9392 case V32QI_FTYPE_V16QI_V32QI_USI
:
9393 case V32QI_FTYPE_QI_V32QI_USI
:
9394 case V16QI_FTYPE_V16QI_V16QI_UHI
:
9395 case V16QI_FTYPE_QI_V16QI_UHI
:
9396 case V32HI_FTYPE_V8HI_V32HI_USI
:
9397 case V32HI_FTYPE_HI_V32HI_USI
:
9398 case V16HI_FTYPE_V8HI_V16HI_UHI
:
9399 case V16HI_FTYPE_HI_V16HI_UHI
:
9400 case V8HI_FTYPE_V8HI_V8HI_UQI
:
9401 case V8HI_FTYPE_HI_V8HI_UQI
:
9402 case V8SF_FTYPE_V8HI_V8SF_UQI
:
9403 case V4SF_FTYPE_V8HI_V4SF_UQI
:
9404 case V8SI_FTYPE_V8SF_V8SI_UQI
:
9405 case V4SI_FTYPE_V4SF_V4SI_UQI
:
9406 case V4DI_FTYPE_V4SF_V4DI_UQI
:
9407 case V2DI_FTYPE_V4SF_V2DI_UQI
:
9408 case V4SF_FTYPE_V4DI_V4SF_UQI
:
9409 case V4SF_FTYPE_V2DI_V4SF_UQI
:
9410 case V4DF_FTYPE_V4DI_V4DF_UQI
:
9411 case V2DF_FTYPE_V2DI_V2DF_UQI
:
9412 case V16QI_FTYPE_V8HI_V16QI_UQI
:
9413 case V16QI_FTYPE_V16HI_V16QI_UHI
:
9414 case V16QI_FTYPE_V4SI_V16QI_UQI
:
9415 case V16QI_FTYPE_V8SI_V16QI_UQI
:
9416 case V8HI_FTYPE_V4SI_V8HI_UQI
:
9417 case V8HI_FTYPE_V8SI_V8HI_UQI
:
9418 case V16QI_FTYPE_V2DI_V16QI_UQI
:
9419 case V16QI_FTYPE_V4DI_V16QI_UQI
:
9420 case V8HI_FTYPE_V2DI_V8HI_UQI
:
9421 case V8HI_FTYPE_V4DI_V8HI_UQI
:
9422 case V4SI_FTYPE_V2DI_V4SI_UQI
:
9423 case V4SI_FTYPE_V4DI_V4SI_UQI
:
9424 case V32QI_FTYPE_V32HI_V32QI_USI
:
9425 case UHI_FTYPE_V16QI_V16QI_UHI
:
9426 case USI_FTYPE_V32QI_V32QI_USI
:
9427 case UDI_FTYPE_V64QI_V64QI_UDI
:
9428 case UQI_FTYPE_V8HI_V8HI_UQI
:
9429 case UHI_FTYPE_V16HI_V16HI_UHI
:
9430 case USI_FTYPE_V32HI_V32HI_USI
:
9431 case UQI_FTYPE_V4SI_V4SI_UQI
:
9432 case UQI_FTYPE_V8SI_V8SI_UQI
:
9433 case UQI_FTYPE_V2DI_V2DI_UQI
:
9434 case UQI_FTYPE_V4DI_V4DI_UQI
:
9435 case V4SF_FTYPE_V2DF_V4SF_UQI
:
9436 case V4SF_FTYPE_V4DF_V4SF_UQI
:
9437 case V16SI_FTYPE_V16SI_V16SI_UHI
:
9438 case V16SI_FTYPE_V4SI_V16SI_UHI
:
9439 case V2DI_FTYPE_V4SI_V2DI_UQI
:
9440 case V2DI_FTYPE_V8HI_V2DI_UQI
:
9441 case V2DI_FTYPE_V16QI_V2DI_UQI
:
9442 case V4DI_FTYPE_V4DI_V4DI_UQI
:
9443 case V4DI_FTYPE_V4SI_V4DI_UQI
:
9444 case V4DI_FTYPE_V8HI_V4DI_UQI
:
9445 case V4DI_FTYPE_V16QI_V4DI_UQI
:
9446 case V4DI_FTYPE_V4DF_V4DI_UQI
:
9447 case V2DI_FTYPE_V2DF_V2DI_UQI
:
9448 case V4SI_FTYPE_V4DF_V4SI_UQI
:
9449 case V4SI_FTYPE_V2DF_V4SI_UQI
:
9450 case V4SI_FTYPE_V8HI_V4SI_UQI
:
9451 case V4SI_FTYPE_V16QI_V4SI_UQI
:
9452 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
9453 case V8DF_FTYPE_V2DF_V8DF_UQI
:
9454 case V8DF_FTYPE_V4DF_V8DF_UQI
:
9455 case V8DF_FTYPE_V8DF_V8DF_UQI
:
9456 case V8SF_FTYPE_V8SF_V8SF_UQI
:
9457 case V8SF_FTYPE_V8SI_V8SF_UQI
:
9458 case V4DF_FTYPE_V4DF_V4DF_UQI
:
9459 case V4SF_FTYPE_V4SF_V4SF_UQI
:
9460 case V2DF_FTYPE_V2DF_V2DF_UQI
:
9461 case V2DF_FTYPE_V4SF_V2DF_UQI
:
9462 case V2DF_FTYPE_V4SI_V2DF_UQI
:
9463 case V4SF_FTYPE_V4SI_V4SF_UQI
:
9464 case V4DF_FTYPE_V4SF_V4DF_UQI
:
9465 case V4DF_FTYPE_V4SI_V4DF_UQI
:
9466 case V8SI_FTYPE_V8SI_V8SI_UQI
:
9467 case V8SI_FTYPE_V8HI_V8SI_UQI
:
9468 case V8SI_FTYPE_V16QI_V8SI_UQI
:
9469 case V8DF_FTYPE_V8SI_V8DF_UQI
:
9470 case V8DI_FTYPE_DI_V8DI_UQI
:
9471 case V16SF_FTYPE_V8SF_V16SF_UHI
:
9472 case V16SI_FTYPE_V8SI_V16SI_UHI
:
9473 case V16HI_FTYPE_V16HI_V16HI_UHI
:
9474 case V8HI_FTYPE_V16QI_V8HI_UQI
:
9475 case V16HI_FTYPE_V16QI_V16HI_UHI
:
9476 case V32HI_FTYPE_V32HI_V32HI_USI
:
9477 case V32HI_FTYPE_V32QI_V32HI_USI
:
9478 case V8DI_FTYPE_V16QI_V8DI_UQI
:
9479 case V8DI_FTYPE_V2DI_V8DI_UQI
:
9480 case V8DI_FTYPE_V4DI_V8DI_UQI
:
9481 case V8DI_FTYPE_V8DI_V8DI_UQI
:
9482 case V8DI_FTYPE_V8HI_V8DI_UQI
:
9483 case V8DI_FTYPE_V8SI_V8DI_UQI
:
9484 case V8HI_FTYPE_V8DI_V8HI_UQI
:
9485 case V8SI_FTYPE_V8DI_V8SI_UQI
:
9486 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
9487 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
9488 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
9489 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
9490 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
9491 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
9492 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
9493 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
9494 case V32HI_FTYPE_V16SF_V16SF_USI
:
9495 case V16HI_FTYPE_V8SF_V8SF_UHI
:
9496 case V8HI_FTYPE_V4SF_V4SF_UQI
:
9497 case V16HI_FTYPE_V16SF_V16HI_UHI
:
9498 case V8HI_FTYPE_V8SF_V8HI_UQI
:
9499 case V8HI_FTYPE_V4SF_V8HI_UQI
:
9500 case V16SF_FTYPE_V16SF_V32HI_V32HI
:
9501 case V8SF_FTYPE_V8SF_V16HI_V16HI
:
9502 case V4SF_FTYPE_V4SF_V8HI_V8HI
:
9505 case V32QI_FTYPE_V32QI_V32QI_INT
:
9506 case V16HI_FTYPE_V16HI_V16HI_INT
:
9507 case V16QI_FTYPE_V16QI_V16QI_INT
:
9508 case V4DI_FTYPE_V4DI_V4DI_INT
:
9509 case V8HI_FTYPE_V8HI_V8HI_INT
:
9510 case V8SI_FTYPE_V8SI_V8SI_INT
:
9511 case V8SI_FTYPE_V8SI_V4SI_INT
:
9512 case V8SF_FTYPE_V8SF_V8SF_INT
:
9513 case V8SF_FTYPE_V8SF_V4SF_INT
:
9514 case V4SI_FTYPE_V4SI_V4SI_INT
:
9515 case V4DF_FTYPE_V4DF_V4DF_INT
:
9516 case V16SF_FTYPE_V16SF_V16SF_INT
:
9517 case V16SF_FTYPE_V16SF_V4SF_INT
:
9518 case V16SI_FTYPE_V16SI_V4SI_INT
:
9519 case V4DF_FTYPE_V4DF_V2DF_INT
:
9520 case V4SF_FTYPE_V4SF_V4SF_INT
:
9521 case V2DI_FTYPE_V2DI_V2DI_INT
:
9522 case V4DI_FTYPE_V4DI_V2DI_INT
:
9523 case V2DF_FTYPE_V2DF_V2DF_INT
:
9524 case UQI_FTYPE_V8DI_V8UDI_INT
:
9525 case UQI_FTYPE_V8DF_V8DF_INT
:
9526 case UQI_FTYPE_V2DF_V2DF_INT
:
9527 case UQI_FTYPE_V4SF_V4SF_INT
:
9528 case UHI_FTYPE_V16SI_V16SI_INT
:
9529 case UHI_FTYPE_V16SF_V16SF_INT
:
9530 case V64QI_FTYPE_V64QI_V64QI_INT
:
9531 case V32HI_FTYPE_V32HI_V32HI_INT
:
9532 case V16SI_FTYPE_V16SI_V16SI_INT
:
9533 case V8DI_FTYPE_V8DI_V8DI_INT
:
9537 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
9542 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
9547 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
9552 case V2DI_FTYPE_V2DI_UINT_UINT
:
9556 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
9561 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
9567 case QI_FTYPE_V8DF_INT_UQI
:
9568 case QI_FTYPE_V4DF_INT_UQI
:
9569 case QI_FTYPE_V2DF_INT_UQI
:
9570 case HI_FTYPE_V16SF_INT_UHI
:
9571 case QI_FTYPE_V8SF_INT_UQI
:
9572 case QI_FTYPE_V4SF_INT_UQI
:
9573 case V4SI_FTYPE_V4SI_V4SI_UHI
:
9574 case V8SI_FTYPE_V8SI_V8SI_UHI
:
9579 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
9585 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
9591 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
9592 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
9593 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
9594 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
9595 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
9596 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
9597 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
9598 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
9599 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
9600 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
9601 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
9602 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
9603 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
9604 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
9605 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
9606 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
9607 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
9608 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
9609 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
9610 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
9611 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
9612 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
9613 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
9614 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
9615 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
9616 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
9617 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
9618 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
9619 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
9620 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
9621 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
9622 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
9623 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
9624 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
9625 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
9626 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
9627 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
9628 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
9629 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
9630 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
9631 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
9632 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
9633 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
9634 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
9635 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
9636 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
9637 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
9638 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
9639 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
9640 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
9641 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
9642 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI
:
9643 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI
:
9644 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI
:
9647 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
9648 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
9649 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
9650 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
9651 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
9655 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
9656 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
9657 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
9658 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
9659 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
9660 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
9661 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
9662 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
9663 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
9664 case USI_FTYPE_V32QI_V32QI_INT_USI
:
9665 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
9666 case USI_FTYPE_V32HI_V32HI_INT_USI
:
9667 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
9668 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
9673 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
9677 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
9678 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
9679 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI
:
9680 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI
:
9681 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI
:
9684 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
9685 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
9690 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
9691 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
9692 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
9693 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
9694 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
9695 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
9696 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
9697 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
9698 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
9699 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
9700 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
9701 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
9702 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
9703 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
9704 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
9705 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
9706 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
9707 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
9708 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
9709 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
9710 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
9711 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
9712 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
9713 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
9714 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
9715 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
9716 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
9717 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
9718 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
9719 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
9724 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
9725 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
9726 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
9727 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
9728 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
9729 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
9730 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
9731 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
9732 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
9733 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
9734 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
9735 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
9736 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
9737 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
9738 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
9739 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
9740 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
9741 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
9742 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
9743 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
9744 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
9745 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
9746 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
9747 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
9748 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
9749 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
9750 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
9755 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
9756 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
9757 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
9758 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
9759 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
9760 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
9761 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
9762 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
9763 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
9764 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
9769 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
9770 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
9771 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
9772 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
9773 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
9774 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
9775 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
9776 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
9777 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
9778 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
9779 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
9780 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (comparison != UNKNOWN)
    {
      gcc_assert (nargs == 2);
      return ix86_expand_sse_compare (d, exp, target, swap);
    }
  if (rmode == VOIDmode || rmode == tmode)
    {
      if (optimize
          || target == 0
          || GET_MODE (target) != tmode
          || !insn_p->operand[0].predicate (target, tmode))
        target = gen_reg_rtx (tmode);
      else if (memory_operand (target, tmode))
        num_memory++;
      real_target = target;
    }
  else
    {
      real_target = gen_reg_rtx (tmode);
      target = lowpart_subreg (rmode, real_target, tmode);
    }
  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);
      if (second_arg_count && i == 1)
        {
          /* SIMD shift insns take either an 8-bit immediate or
             a register as count.  But builtin functions take int as
             count.  If the count doesn't match, we put it in a register.
             The instructions use a 64-bit count; if op is just
             32-bit, zero-extend it, as negative shift counts
             are undefined behavior and zero-extension is more
             efficient.  */
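          /* Example (a sketch): the psll/psrl/psra builtins declared with
             the *_COUNT function types pass their count as a plain int, so
             a 32-bit count is widened here and then forced into a register
             if the insn predicate still rejects it.  */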
          if (SCALAR_INT_MODE_P (GET_MODE (op)))
            op = convert_modes (mode, GET_MODE (op), op, 1);
          else
            op = lowpart_subreg (mode, op, GET_MODE (op));
          if (!insn_p->operand[i + 1].predicate (op, mode))
            op = copy_to_reg (op);
        }
      else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
               (!mask_pos && (nargs - i) <= nargs_constant))
        {
          switch (icode)
            {
            case CODE_FOR_avx_vinsertf128v4di:
            case CODE_FOR_avx_vextractf128v4di:
              error ("the last argument must be a 1-bit immediate");
9852 case CODE_FOR_avx512f_cmpv8di3_mask
:
9853 case CODE_FOR_avx512f_cmpv16si3_mask
:
9854 case CODE_FOR_avx512f_ucmpv8di3_mask
:
9855 case CODE_FOR_avx512f_ucmpv16si3_mask
:
9856 case CODE_FOR_avx512vl_cmpv4di3_mask
:
9857 case CODE_FOR_avx512vl_cmpv8si3_mask
:
9858 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
9859 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
9860 case CODE_FOR_avx512vl_cmpv2di3_mask
:
9861 case CODE_FOR_avx512vl_cmpv4si3_mask
:
9862 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
9863 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
9864 error ("the last argument must be a 3-bit immediate");
9867 case CODE_FOR_sse4_1_roundsd
:
9868 case CODE_FOR_sse4_1_roundss
:
9870 case CODE_FOR_sse4_1_roundpd
:
9871 case CODE_FOR_sse4_1_roundps
:
9872 case CODE_FOR_avx_roundpd256
:
9873 case CODE_FOR_avx_roundps256
:
9875 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
9876 case CODE_FOR_sse4_1_roundps_sfix
:
9877 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
9878 case CODE_FOR_avx_roundps_sfix256
:
9880 case CODE_FOR_sse4_1_blendps
:
9881 case CODE_FOR_avx_blendpd256
:
9882 case CODE_FOR_avx_vpermilv4df
:
9883 case CODE_FOR_avx_vpermilv4df_mask
:
9884 case CODE_FOR_avx512f_getmantv8df_mask
:
9885 case CODE_FOR_avx512f_getmantv16sf_mask
:
9886 case CODE_FOR_avx512vl_getmantv8sf_mask
:
9887 case CODE_FOR_avx512vl_getmantv4df_mask
:
9888 case CODE_FOR_avx512vl_getmantv4sf_mask
:
9889 case CODE_FOR_avx512vl_getmantv2df_mask
:
9890 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
9891 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
9892 case CODE_FOR_avx512dq_rangepv4df_mask
:
9893 case CODE_FOR_avx512dq_rangepv8sf_mask
:
9894 case CODE_FOR_avx512dq_rangepv2df_mask
:
9895 case CODE_FOR_avx512dq_rangepv4sf_mask
:
9896 case CODE_FOR_avx_shufpd256_mask
:
9897 error ("the last argument must be a 4-bit immediate");
9900 case CODE_FOR_sha1rnds4
:
9901 case CODE_FOR_sse4_1_blendpd
:
9902 case CODE_FOR_avx_vpermilv2df
:
9903 case CODE_FOR_avx_vpermilv2df_mask
:
9904 case CODE_FOR_xop_vpermil2v2df3
:
9905 case CODE_FOR_xop_vpermil2v4sf3
:
9906 case CODE_FOR_xop_vpermil2v4df3
:
9907 case CODE_FOR_xop_vpermil2v8sf3
:
9908 case CODE_FOR_avx512f_vinsertf32x4_mask
:
9909 case CODE_FOR_avx512f_vinserti32x4_mask
:
9910 case CODE_FOR_avx512f_vextractf32x4_mask
:
9911 case CODE_FOR_avx512f_vextracti32x4_mask
:
9912 case CODE_FOR_sse2_shufpd
:
9913 case CODE_FOR_sse2_shufpd_mask
:
9914 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
9915 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
9916 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
9917 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
9918 error ("the last argument must be a 2-bit immediate");
9921 case CODE_FOR_avx_vextractf128v4df
:
9922 case CODE_FOR_avx_vextractf128v8sf
:
9923 case CODE_FOR_avx_vextractf128v8si
:
9924 case CODE_FOR_avx_vinsertf128v4df
:
9925 case CODE_FOR_avx_vinsertf128v8sf
:
9926 case CODE_FOR_avx_vinsertf128v8si
:
9927 case CODE_FOR_avx512f_vinsertf64x4_mask
:
9928 case CODE_FOR_avx512f_vinserti64x4_mask
:
9929 case CODE_FOR_avx512f_vextractf64x4_mask
:
9930 case CODE_FOR_avx512f_vextracti64x4_mask
:
9931 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
9932 case CODE_FOR_avx512dq_vinserti32x8_mask
:
9933 case CODE_FOR_avx512vl_vinsertv4df
:
9934 case CODE_FOR_avx512vl_vinsertv4di
:
9935 case CODE_FOR_avx512vl_vinsertv8sf
:
9936 case CODE_FOR_avx512vl_vinsertv8si
:
9937 error ("the last argument must be a 1-bit immediate");
9940 case CODE_FOR_avx_vmcmpv2df3
:
9941 case CODE_FOR_avx_vmcmpv4sf3
:
9942 case CODE_FOR_avx_cmpv2df3
:
9943 case CODE_FOR_avx_cmpv4sf3
:
9944 case CODE_FOR_avx_cmpv4df3
:
9945 case CODE_FOR_avx_cmpv8sf3
:
9946 case CODE_FOR_avx512f_cmpv8df3_mask
:
9947 case CODE_FOR_avx512f_cmpv16sf3_mask
:
9948 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
9949 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
9950 error ("the last argument must be a 5-bit immediate");
            default:
              switch (nargs_constant)
                {
                case 2:
                  if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
                      (!mask_pos && (nargs - i) == nargs_constant))
                    {
                      error ("the next to last argument must be an 8-bit immediate");
                      break;
                    }
                  /* FALLTHRU */
                case 1:
                  error ("the last argument must be an 8-bit immediate");
                  break;
                }
            }
        }
      else
        {
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          /* If we aren't optimizing, only allow one memory operand to
             be generated.  */
          if (memory_operand (op, mode))
            num_memory++;

          op = fixup_modeless_constant (op, mode);

          if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            {
              if (optimize || !match || num_memory > 1)
                op = copy_to_mode_reg (mode, op);
            }
          else
            {
              op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
            }
        }
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (real_target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
                             xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
                             xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
                             xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }
/* Transform pattern of following layout:
     (set A
       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set (A B)).  */

static rtx
ix86_erase_embedded_rounding (rtx pat)
{
  if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);

  gcc_assert (GET_CODE (pat) == SET);
  rtx src = SET_SRC (pat);
  gcc_assert (XVECLEN (src, 0) == 2);
  rtx p0 = XVECEXP (src, 0, 0);
  gcc_assert (GET_CODE (src) == UNSPEC
              && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
  return res;
}
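
/* Example (a sketch): an insn like
     (set (reg:V2DF 100)
          (unspec [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
                   (const_int 8)] UNSPEC_EMBEDDED_ROUNDING))
   would be rewritten to
     (set (reg:V2DF 100)
          (plus:V2DF (reg:V2DF 101) (reg:V2DF 102))).  */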
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with rounding.  */

static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
                            tree exp, rtx target)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;
  /* See avxintrin.h for values.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false,
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false
    };
  static const bool non_signalings[32] =
    {
      true,  false, false, true,  true,  false, false, true,
      true,  false, false, true,  true,  false, false, true,
      false, true,  true,  false, false, true,  true,  false,
      false, true,  true,  false, false, true,  true,  false
    };
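
  /* Rough guide to the mapping, assuming the usual <avxintrin.h>
     encodings: _CMP_EQ_OQ (0) -> EQ, _CMP_LT_OS (1) -> LT,
     _CMP_UNORD_Q (3) -> UNORDERED, _CMP_NEQ_UQ (4) -> NE, and the upper
     sixteen entries repeat the predicates with the opposite signaling
     behaviour.  */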
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be a comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);
  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
      /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
      if (!non_signaling)

      /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */

      /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */

      /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
      if (!non_signaling)

    case LE:    /* -> GE  */
    case LT:    /* -> GT  */
    case UNGE:  /* -> UNLE  */
    case UNGT:  /* -> UNLT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);

      /* These are supported by CCFPmode.  NB: Use ordered/signaling
         COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
         with NAN operands.  */
      if (ordered == non_signaling)
        ordered = !ordered;

      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
         _CMP_EQ_OQ/_CMP_EQ_OS.  */
      check_unordered = true;

      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
         _CMP_NEQ_UQ/_CMP_NEQ_US.  */
      gcc_assert (!ordered);
      check_unordered = true;
      const_val = const1_rtx;

    default:
      gcc_unreachable ();
    }
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);
  /*
     1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.
   */
  if (non_signaling)
    icode = (icode == CODE_FOR_sse_comi_round
             ? CODE_FOR_sse_ucomi_round
             : CODE_FOR_sse2_ucomi_round);

  pat = GEN_FCN (icode) (op0, op1, op3);

  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      pat = ix86_erase_embedded_rounding (pat);

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }
  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, checking ZF alone isn't sufficient
     with NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }
  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
                  || mode == CCOmode || mode == CCPmode
                  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                          gen_rtx_fmt_ee (comparison, QImode,
                                          set_dst,
                                          const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
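
/* Example (a sketch): an intrinsic such as
   _mm_comi_round_sd (a, b, _CMP_GE_OS, _MM_FROUND_NO_EXC) reaches this
   routine with op2 selecting the GE predicate and op3 carrying the SAE
   rounding operand.  */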
static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
                           tree exp, rtx target)
{
  unsigned int i, nargs;
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  unsigned int nargs_constant = 0;
  unsigned int redundant_embed_rnd = 0;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
10297 case UINT64_FTYPE_V2DF_INT
:
10298 case UINT64_FTYPE_V4SF_INT
:
10299 case UINT_FTYPE_V2DF_INT
:
10300 case UINT_FTYPE_V4SF_INT
:
10301 case INT64_FTYPE_V2DF_INT
:
10302 case INT64_FTYPE_V4SF_INT
:
10303 case INT_FTYPE_V2DF_INT
:
10304 case INT_FTYPE_V4SF_INT
:
10307 case V4SF_FTYPE_V4SF_UINT_INT
:
10308 case V4SF_FTYPE_V4SF_UINT64_INT
:
10309 case V2DF_FTYPE_V2DF_UINT64_INT
:
10310 case V4SF_FTYPE_V4SF_INT_INT
:
10311 case V4SF_FTYPE_V4SF_INT64_INT
:
10312 case V2DF_FTYPE_V2DF_INT64_INT
:
10313 case V4SF_FTYPE_V4SF_V4SF_INT
:
10314 case V2DF_FTYPE_V2DF_V2DF_INT
:
10315 case V4SF_FTYPE_V4SF_V2DF_INT
:
10316 case V2DF_FTYPE_V2DF_V4SF_INT
:
10319 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
10320 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
10321 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
10322 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
10323 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
10324 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
10325 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
10326 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
10327 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
10328 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
10329 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
10330 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
10331 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
10332 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
10335 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
10336 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
10337 nargs_constant
= 2;
10340 case INT_FTYPE_V4SF_V4SF_INT_INT
:
10341 case INT_FTYPE_V2DF_V2DF_INT_INT
:
10342 return ix86_expand_sse_comi_round (d
, exp
, target
);
10343 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
10344 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
10345 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
10346 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
10347 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
10348 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
10349 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT
:
10350 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
10351 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
10352 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT
:
10355 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
10356 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
10357 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT
:
10358 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT
:
10359 nargs_constant
= 4;
10362 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
10363 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
10364 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
10365 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
10366 nargs_constant
= 3;
10369 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
10370 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
10371 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
10372 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
10373 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
10374 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
10376 nargs_constant
= 4;
10378 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
10379 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
10380 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
10381 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
10383 nargs_constant
= 3;
10386 gcc_unreachable ();
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
        {
          if (!match)
            switch (icode)
              {
              case CODE_FOR_avx512f_getmantv8df_mask_round:
              case CODE_FOR_avx512f_getmantv16sf_mask_round:
              case CODE_FOR_avx512f_vgetmantv2df_round:
              case CODE_FOR_avx512f_vgetmantv2df_mask_round:
              case CODE_FOR_avx512f_vgetmantv4sf_round:
              case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
                error ("the immediate argument must be a 4-bit immediate");
                return const0_rtx;

              case CODE_FOR_avx512f_cmpv8df3_mask_round:
              case CODE_FOR_avx512f_cmpv16sf3_mask_round:
              case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
              case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
                error ("the immediate argument must be a 5-bit immediate");
                return const0_rtx;

              default:
                error ("the immediate argument must be an 8-bit immediate");
                return const0_rtx;
              }
        }
      else if (i == nargs - 1)
        {
          if (!insn_p->operand[nargs].predicate (op, SImode))
            {
              error ("incorrect rounding operand");
              return const0_rtx;
            }

          /* If there is no rounding, use the normal version of the pattern.  */
          if (INTVAL (op) == NO_ROUND)
            redundant_embed_rnd = 1;
        }
      else
        {
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          op = fixup_modeless_constant (op, mode);

          if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            {
              if (optimize || !match)
                op = copy_to_mode_reg (mode, op);
            }
          else
            {
              op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
            }
        }
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
                             xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
                             xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
                             xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }
  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);
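
/* Example (a sketch): a round builtin invoked with
   _MM_FROUND_CUR_DIRECTION supplies NO_ROUND here, so the
   embedded-rounding unspec is stripped and the plain (non-rounding)
   pattern is emitted instead.  */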
/* Subroutine of ix86_expand_builtin to take care of special insns
   with variable number of operands.  */

static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
                                  tree exp, rtx target)
{
  unsigned int i, nargs, arg_adjust, memory;
  bool aligned_mem = false;
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  enum { load, store } klass;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case VOID_FTYPE_VOID:
      emit_insn (GEN_FCN (icode) (target));
10522 case VOID_FTYPE_UINT64
:
10523 case VOID_FTYPE_UNSIGNED
:
10529 case INT_FTYPE_VOID
:
10530 case USHORT_FTYPE_VOID
:
10531 case UINT64_FTYPE_VOID
:
10532 case UINT_FTYPE_VOID
:
10533 case UINT8_FTYPE_VOID
:
10534 case UNSIGNED_FTYPE_VOID
:
10539 case UINT64_FTYPE_PUNSIGNED
:
10540 case V2DI_FTYPE_PV2DI
:
10541 case V4DI_FTYPE_PV4DI
:
10542 case V32QI_FTYPE_PCCHAR
:
10543 case V16QI_FTYPE_PCCHAR
:
10544 case V8SF_FTYPE_PCV4SF
:
10545 case V8SF_FTYPE_PCFLOAT
:
10546 case V4SF_FTYPE_PCFLOAT
:
10547 case V4DF_FTYPE_PCV2DF
:
10548 case V4DF_FTYPE_PCDOUBLE
:
10549 case V2DF_FTYPE_PCDOUBLE
:
10550 case VOID_FTYPE_PVOID
:
10551 case V8DI_FTYPE_PV8DI
:
10557 case CODE_FOR_sse4_1_movntdqa
:
10558 case CODE_FOR_avx2_movntdqa
:
10559 case CODE_FOR_avx512f_movntdqa
:
10560 aligned_mem
= true;
10566 case VOID_FTYPE_PV2SF_V4SF
:
10567 case VOID_FTYPE_PV8DI_V8DI
:
10568 case VOID_FTYPE_PV4DI_V4DI
:
10569 case VOID_FTYPE_PV2DI_V2DI
:
10570 case VOID_FTYPE_PCHAR_V32QI
:
10571 case VOID_FTYPE_PCHAR_V16QI
:
10572 case VOID_FTYPE_PFLOAT_V16SF
:
10573 case VOID_FTYPE_PFLOAT_V8SF
:
10574 case VOID_FTYPE_PFLOAT_V4SF
:
10575 case VOID_FTYPE_PDOUBLE_V8DF
:
10576 case VOID_FTYPE_PDOUBLE_V4DF
:
10577 case VOID_FTYPE_PDOUBLE_V2DF
:
10578 case VOID_FTYPE_PLONGLONG_LONGLONG
:
10579 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
10580 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
10581 case VOID_FTYPE_PINT_INT
:
10584 /* Reserve memory operand for target. */
10585 memory
= ARRAY_SIZE (xops
);
10588 /* These builtins and instructions require the memory
10589 to be properly aligned. */
10590 case CODE_FOR_avx_movntv4di
:
10591 case CODE_FOR_sse2_movntv2di
:
10592 case CODE_FOR_avx_movntv8sf
:
10593 case CODE_FOR_sse_movntv4sf
:
10594 case CODE_FOR_sse4a_vmmovntv4sf
:
10595 case CODE_FOR_avx_movntv4df
:
10596 case CODE_FOR_sse2_movntv2df
:
10597 case CODE_FOR_sse4a_vmmovntv2df
:
10598 case CODE_FOR_sse2_movntidi
:
10599 case CODE_FOR_sse_movntq
:
10600 case CODE_FOR_sse2_movntisi
:
10601 case CODE_FOR_avx512f_movntv16sf
:
10602 case CODE_FOR_avx512f_movntv8df
:
10603 case CODE_FOR_avx512f_movntv8di
:
10604 aligned_mem
= true;
10610 case VOID_FTYPE_PVOID_PCVOID
:
10616 case V4SF_FTYPE_V4SF_PCV2SF
:
10617 case V2DF_FTYPE_V2DF_PCDOUBLE
:
10622 case V8SF_FTYPE_PCV8SF_V8SI
:
10623 case V4DF_FTYPE_PCV4DF_V4DI
:
10624 case V4SF_FTYPE_PCV4SF_V4SI
:
10625 case V2DF_FTYPE_PCV2DF_V2DI
:
10626 case V8SI_FTYPE_PCV8SI_V8SI
:
10627 case V4DI_FTYPE_PCV4DI_V4DI
:
10628 case V4SI_FTYPE_PCV4SI_V4SI
:
10629 case V2DI_FTYPE_PCV2DI_V2DI
:
10630 case VOID_FTYPE_INT_INT64
:
10635 case VOID_FTYPE_PV8DF_V8DF_UQI
:
10636 case VOID_FTYPE_PV4DF_V4DF_UQI
:
10637 case VOID_FTYPE_PV2DF_V2DF_UQI
:
10638 case VOID_FTYPE_PV16SF_V16SF_UHI
:
10639 case VOID_FTYPE_PV8SF_V8SF_UQI
:
10640 case VOID_FTYPE_PV4SF_V4SF_UQI
:
10641 case VOID_FTYPE_PV8DI_V8DI_UQI
:
10642 case VOID_FTYPE_PV4DI_V4DI_UQI
:
10643 case VOID_FTYPE_PV2DI_V2DI_UQI
:
10644 case VOID_FTYPE_PV16SI_V16SI_UHI
:
10645 case VOID_FTYPE_PV8SI_V8SI_UQI
:
10646 case VOID_FTYPE_PV4SI_V4SI_UQI
:
10647 case VOID_FTYPE_PV64QI_V64QI_UDI
:
10648 case VOID_FTYPE_PV32HI_V32HI_USI
:
10649 case VOID_FTYPE_PV32QI_V32QI_USI
:
10650 case VOID_FTYPE_PV16QI_V16QI_UHI
:
10651 case VOID_FTYPE_PV16HI_V16HI_UHI
:
10652 case VOID_FTYPE_PV8HI_V8HI_UQI
:
10655 /* These builtins and instructions require the memory
10656 to be properly aligned. */
10657 case CODE_FOR_avx512f_storev16sf_mask
:
10658 case CODE_FOR_avx512f_storev16si_mask
:
10659 case CODE_FOR_avx512f_storev8df_mask
:
10660 case CODE_FOR_avx512f_storev8di_mask
:
10661 case CODE_FOR_avx512vl_storev8sf_mask
:
10662 case CODE_FOR_avx512vl_storev8si_mask
:
10663 case CODE_FOR_avx512vl_storev4df_mask
:
10664 case CODE_FOR_avx512vl_storev4di_mask
:
10665 case CODE_FOR_avx512vl_storev4sf_mask
:
10666 case CODE_FOR_avx512vl_storev4si_mask
:
10667 case CODE_FOR_avx512vl_storev2df_mask
:
10668 case CODE_FOR_avx512vl_storev2di_mask
:
10669 aligned_mem
= true;
10675 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
10676 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
10677 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
10678 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
10679 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
10680 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
10681 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
10682 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
10683 case VOID_FTYPE_PV8SI_V8DI_UQI
:
10684 case VOID_FTYPE_PV8HI_V8DI_UQI
:
10685 case VOID_FTYPE_PV16HI_V16SI_UHI
:
10686 case VOID_FTYPE_PUDI_V8DI_UQI
:
10687 case VOID_FTYPE_PV16QI_V16SI_UHI
:
10688 case VOID_FTYPE_PV4SI_V4DI_UQI
:
10689 case VOID_FTYPE_PUDI_V2DI_UQI
:
10690 case VOID_FTYPE_PUDI_V4DI_UQI
:
10691 case VOID_FTYPE_PUSI_V2DI_UQI
:
10692 case VOID_FTYPE_PV8HI_V8SI_UQI
:
10693 case VOID_FTYPE_PUDI_V4SI_UQI
:
10694 case VOID_FTYPE_PUSI_V4DI_UQI
:
10695 case VOID_FTYPE_PUHI_V2DI_UQI
:
10696 case VOID_FTYPE_PUDI_V8SI_UQI
:
10697 case VOID_FTYPE_PUSI_V4SI_UQI
:
10698 case VOID_FTYPE_PCHAR_V64QI_UDI
:
10699 case VOID_FTYPE_PCHAR_V32QI_USI
:
10700 case VOID_FTYPE_PCHAR_V16QI_UHI
:
10701 case VOID_FTYPE_PSHORT_V32HI_USI
:
10702 case VOID_FTYPE_PSHORT_V16HI_UHI
:
10703 case VOID_FTYPE_PSHORT_V8HI_UQI
:
10704 case VOID_FTYPE_PINT_V16SI_UHI
:
10705 case VOID_FTYPE_PINT_V8SI_UQI
:
10706 case VOID_FTYPE_PINT_V4SI_UQI
:
10707 case VOID_FTYPE_PINT64_V8DI_UQI
:
10708 case VOID_FTYPE_PINT64_V4DI_UQI
:
10709 case VOID_FTYPE_PINT64_V2DI_UQI
:
10710 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
10711 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
10712 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
10713 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
10714 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
10715 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
10716 case VOID_FTYPE_PV32QI_V32HI_USI
:
10717 case VOID_FTYPE_PV16QI_V16HI_UHI
:
10718 case VOID_FTYPE_PUDI_V8HI_UQI
:
10721 /* Reserve memory operand for target. */
10722 memory
= ARRAY_SIZE (xops
);
10724 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
10725 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
10726 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
10727 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
10728 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
10729 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
10730 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
10731 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
10732 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
10733 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
10734 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
10735 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
10736 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
10737 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
10738 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
10739 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
10740 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
10741 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
10744 /* These builtins and instructions require the memory
10745 to be properly aligned. */
10746 case CODE_FOR_avx512f_loadv16sf_mask
:
10747 case CODE_FOR_avx512f_loadv16si_mask
:
10748 case CODE_FOR_avx512f_loadv8df_mask
:
10749 case CODE_FOR_avx512f_loadv8di_mask
:
10750 case CODE_FOR_avx512vl_loadv8sf_mask
:
10751 case CODE_FOR_avx512vl_loadv8si_mask
:
10752 case CODE_FOR_avx512vl_loadv4df_mask
:
10753 case CODE_FOR_avx512vl_loadv4di_mask
:
10754 case CODE_FOR_avx512vl_loadv4sf_mask
:
10755 case CODE_FOR_avx512vl_loadv4si_mask
:
10756 case CODE_FOR_avx512vl_loadv2df_mask
:
10757 case CODE_FOR_avx512vl_loadv2di_mask
:
10758 case CODE_FOR_avx512bw_loadv64qi_mask
:
10759 case CODE_FOR_avx512vl_loadv32qi_mask
:
10760 case CODE_FOR_avx512vl_loadv16qi_mask
:
10761 case CODE_FOR_avx512bw_loadv32hi_mask
:
10762 case CODE_FOR_avx512vl_loadv16hi_mask
:
10763 case CODE_FOR_avx512vl_loadv8hi_mask
:
10764 aligned_mem
= true;
10770 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
10771 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
10772 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
10773 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
10774 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
10775 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
10776 case V16SI_FTYPE_PCINT_V16SI_UHI
:
10777 case V8SI_FTYPE_PCINT_V8SI_UQI
:
10778 case V4SI_FTYPE_PCINT_V4SI_UQI
:
10779 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
10780 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
10781 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
10782 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
10783 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
10784 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
10785 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
10786 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
10787 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
10793 gcc_unreachable ();
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (klass == store)
    {
      arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
      gcc_assert (target == 0);
      if (memory)
        {
          op = ix86_zero_extend_to_Pmode (op);
          target = gen_rtx_MEM (tmode, op);
          /* target at this point has just BITS_PER_UNIT MEM_ALIGN
             on it.  Try to improve it using get_pointer_alignment,
             and if the special builtin is one that requires strict
             mode alignment, also from its GET_MODE_ALIGNMENT.
             Failure to do so could lead to ix86_legitimate_combined_insn
             rejecting all changes to such insns.  */
          unsigned int align = get_pointer_alignment (arg);
          if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
            align = GET_MODE_ALIGNMENT (tmode);
          if (MEM_ALIGN (target) < align)
            set_mem_align (target, align);
        }
      else
        target = force_reg (tmode, op);
    }
  else
    {
      if (optimize
          || target == 0
          || !register_operand (target, tmode)
          || GET_MODE (target) != tmode)
        target = gen_reg_rtx (tmode);
    }
  for (i = 0; i < nargs; i++)
    {
      machine_mode mode = insn_p->operand[i + 1].mode;

      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = expand_normal (arg);
      if (i == memory)
        {
          /* This must be the memory operand.  */
          op = ix86_zero_extend_to_Pmode (op);
          op = gen_rtx_MEM (mode, op);
          /* op at this point has just BITS_PER_UNIT MEM_ALIGN
             on it.  Try to improve it using get_pointer_alignment,
             and if the special builtin is one that requires strict
             mode alignment, also from its GET_MODE_ALIGNMENT.
             Failure to do so could lead to ix86_legitimate_combined_insn
             rejecting all changes to such insns.  */
          unsigned int align = get_pointer_alignment (arg);
          if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
            align = GET_MODE_ALIGNMENT (mode);
          if (MEM_ALIGN (op) < align)
            set_mem_align (op, align);
        }
      else
        {
          /* This must be a register.  */
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          op = fixup_modeless_constant (op, mode);

          /* NB: a 3-operand load implies it's a mask load,
             and that mask operand should be at the end.
             Keep an all-ones mask, which would be simplified by the expander.  */
          if (nargs == 3 && i == 2 && klass == load
              && constm1_operand (op, mode))
            ;
          else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            op = copy_to_mode_reg (mode, op);
          else
            {
              op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
            }
        }
  switch (nargs)
    {
    case 0:
      pat = GEN_FCN (icode) (target);
      break;
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    default:
      gcc_unreachable ();
    }
  return klass == store ? 0 : target;
}
/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.  */

static unsigned HOST_WIDE_INT
get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!tree_fits_uhwi_p (arg)
      || (elt = tree_to_uhwi (arg), elt > max))
    {
      error ("selector must be an integer constant in the range "
             "[0, %wu]", max);
      return 0;
    }

  return elt;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that gives us license to emit
   these sorts of instructions.  */

static rtx
ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  machine_mode tmode = TYPE_MODE (type);
  machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}
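
/* Example (a sketch): _mm_set_pi16 in mmintrin.h expands to
   __builtin_ia32_vec_init_v4hi, which is routed through this helper rather
   than through a (deliberately absent) MMX vec_init pattern.  */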
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   had a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */

static rtx
ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  unsigned HOST_WIDE_INT elt;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);

  return target;
}
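
/* Example (a sketch): _mm_insert_pi16 in xmmintrin.h expands to
   __builtin_ia32_vec_set_v4hi, which lands here; the source vector is
   copied first so the builtin's argument is left untouched.  */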
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */
rtx
ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
		     machine_mode mode, int ignore)
{
  size_t i;
  enum insn_code icode, icode2;
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  tree arg0, arg1, arg2, arg3, arg4;
  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
  machine_mode mode0, mode1, mode2, mode3, mode4;
  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
  /* For CPU builtins that can be folded, fold first and expand the fold.  */
  switch (fcode)
    {
    case IX86_BUILTIN_CPU_INIT:
      {
	/* Make it call __cpu_indicator_init in libgcc.  */
	tree call_expr, fndecl, type;
	type = build_function_type_list (integer_type_node, NULL_TREE);
	fndecl = build_fn_decl ("__cpu_indicator_init", type);
	call_expr = build_call_expr (fndecl, 0);
	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
      }
    case IX86_BUILTIN_CPU_IS:
    case IX86_BUILTIN_CPU_SUPPORTS:
      {
	tree arg0 = CALL_EXPR_ARG (exp, 0);
	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
	gcc_assert (fold_expr != NULL_TREE);
	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
      }
    default:
      break;
    }
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;

  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXVNNI
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
  if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
       == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
      && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
    isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);

  if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
       == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
      && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
    isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);

  if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
       == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
      && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
    isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);

  if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	== (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
       || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
      && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	   == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	  || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
    {
      isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
      isa2 |= OPTION_MASK_ISA2_AVXVNNI;
    }

  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
      /* __builtin_ia32_maskmovq requires MMX registers.  */
      && fcode != IX86_BUILTIN_MASKMOVQ)
    {
      bisa &= ~OPTION_MASK_ISA_MMX;
      bisa |= OPTION_MASK_ISA_SSE2;
    }

  if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
    {
      bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
      if (TARGET_ABI_X32)
	bisa |= OPTION_MASK_ABI_X32;
      else
	bisa |= OPTION_MASK_ABI_64;
      char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
				       (enum fpmath_unit) 0,
				       (enum prefer_vector_width) 0,
				       add_abi_p);
      if (!opts)
	error ("%qE needs unknown isa option", fndecl);
      else
	{
	  gcc_assert (opts != NULL);
	  error ("%qE needs isa option %s", fndecl, opts);
	  free (opts);
	}
      return expand_call (exp, target, ignore);
    }
  switch (fcode)
    {
    case IX86_BUILTIN_MASKMOVQ:
    case IX86_BUILTIN_MASKMOVDQU:
      icode = (fcode == IX86_BUILTIN_MASKMOVQ
	       ? CODE_FOR_mmx_maskmovq
	       : CODE_FOR_sse2_maskmovdqu);
      /* Note the arg order is different from the operand order.  */
      arg1 = CALL_EXPR_ARG (exp, 0);
      arg2 = CALL_EXPR_ARG (exp, 1);
      arg0 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;

      op0 = ix86_zero_extend_to_Pmode (op0);
      op0 = gen_rtx_MEM (mode1, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);
      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      pat = GEN_FCN (icode) (op0, op1, op2);
      if (!pat)
	return 0;
      emit_insn (pat);
      return 0;
    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);
    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
      return 0;
    case IX86_BUILTIN_MONITOR:
    case IX86_BUILTIN_MONITORX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);

      emit_insn (fcode == IX86_BUILTIN_MONITOR
		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
		 : gen_monitorx (Pmode, op0, op1, op2));
      return 0;
    case IX86_BUILTIN_MWAIT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      emit_insn (gen_sse3_mwait (op0, op1));
      return 0;
    case IX86_BUILTIN_MWAITX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);
      emit_insn (gen_mwaitx (op0, op1, op2));
      return 0;
    case IX86_BUILTIN_UMONITOR:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_umonitor (Pmode, op0));
      return 0;
    case IX86_BUILTIN_UMWAIT:
    case IX86_BUILTIN_TPAUSE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait_rex64;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause_rex64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (!pat)
	return 0;

      emit_insn (pat);

      if (target == 0
	  || !register_operand (target, QImode))
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));
      return target;
    case IX86_BUILTIN_TESTUI:
      emit_insn (gen_testui ());

      if (target == 0
	  || !register_operand (target, QImode))
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));
      return target;
    case IX86_BUILTIN_CLZERO:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_clzero (Pmode, op0));
      return 0;
    case IX86_BUILTIN_CLDEMOTE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_cldemote;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_cldemote (op0));
      return 0;
    case IX86_BUILTIN_LOADIWKEY:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);

      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (V2DImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (V2DImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (V2DImode, op2);
      if (!REG_P (op3))
	op3 = copy_to_mode_reg (SImode, op3);

      emit_insn (gen_loadiwkey (op0, op1, op2, op3));
      return 0;
    case IX86_BUILTIN_AESDEC128KLU8:
      icode = CODE_FOR_aesdec128klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESDEC256KLU8:
      icode = CODE_FOR_aesdec256klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESENC128KLU8:
      icode = CODE_FOR_aesenc128klu8;
      goto aesdecenc_expand;

    case IX86_BUILTIN_AESENC256KLU8:
      icode = CODE_FOR_aesenc256klu8;

    aesdecenc_expand:

      arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
      arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
      arg2 = CALL_EXPR_ARG (exp, 2); // const void *p

      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);

      if (!address_operand (op0, V2DImode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (V2DImode, op0);

      if (!REG_P (op1))
	op1 = copy_to_mode_reg (V2DImode, op1);

      if (!address_operand (op2, VOIDmode))
	{
	  op2 = convert_memory_address (Pmode, op2);
	  op2 = copy_addr_to_reg (op2);
	}
      op2 = gen_rtx_MEM (BLKmode, op2);

      emit_insn (GEN_FCN (icode) (op1, op1, op2));

      if (target == 0)
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
			const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      emit_insn (gen_rtx_SET (op0, op1));
      return target;
    case IX86_BUILTIN_AESDECWIDE128KLU8:
      icode = CODE_FOR_aesdecwide128klu8;
      goto wideaesdecenc_expand;

    case IX86_BUILTIN_AESDECWIDE256KLU8:
      icode = CODE_FOR_aesdecwide256klu8;
      goto wideaesdecenc_expand;

    case IX86_BUILTIN_AESENCWIDE128KLU8:
      icode = CODE_FOR_aesencwide128klu8;
      goto wideaesdecenc_expand;

    case IX86_BUILTIN_AESENCWIDE256KLU8:
      icode = CODE_FOR_aesencwide256klu8;

    wideaesdecenc_expand:
      {
	rtx xmm_regs[8];
	rtx op;

	arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
	arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
	arg2 = CALL_EXPR_ARG (exp, 2); // const void *p

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);

	if (!address_operand (op2, VOIDmode))
	  {
	    op2 = convert_memory_address (Pmode, op2);
	    op2 = copy_addr_to_reg (op2);
	  }
	op2 = gen_rtx_MEM (BLKmode, op2);

	for (i = 0; i < 8; i++)
	  {
	    xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op1, (i * 16)));

	    emit_move_insn (xmm_regs[i], op);
	  }

	emit_insn (GEN_FCN (icode) (op2));

	if (target == 0)
	  target = gen_reg_rtx (QImode);

	pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
			  const0_rtx);
	emit_insn (gen_rtx_SET (target, pat));

	for (i = 0; i < 8; i++)
	  {
	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op0, (i * 16)));
	    emit_move_insn (op, xmm_regs[i]);
	  }

	return target;
      }
    case IX86_BUILTIN_ENCODEKEY128U32:
      {
	rtx op, xmm_regs[7];

	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
	arg2 = CALL_EXPR_ARG (exp, 2); // void *h

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);

	if (!REG_P (op0))
	  op0 = copy_to_mode_reg (SImode, op0);

	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
	emit_move_insn (op, op1);

	for (i = 0; i < 3; i++)
	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

	if (target == 0)
	  target = gen_reg_rtx (SImode);

	emit_insn (gen_encodekey128u32 (target, op0));

	for (i = 0; i < 3; i++)
	  {
	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op2, (i * 16)));
	    emit_move_insn (op, xmm_regs[i]);
	  }

	return target;
      }
    case IX86_BUILTIN_ENCODEKEY256U32:
      {
	rtx op, xmm_regs[7];

	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
	arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
	arg3 = CALL_EXPR_ARG (exp, 3); // void *h

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);
	op3 = expand_normal (arg3);

	if (!REG_P (op0))
	  op0 = copy_to_mode_reg (SImode, op0);

	/* Force to use xmm0, xmm1 for keylow, keyhi.  */
	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
	emit_move_insn (op, op1);
	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
	emit_move_insn (op, op2);

	for (i = 0; i < 4; i++)
	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

	if (target == 0)
	  target = gen_reg_rtx (SImode);

	emit_insn (gen_encodekey256u32 (target, op0));

	for (i = 0; i < 4; i++)
	  {
	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op3, (i * 16)));
	    emit_move_insn (op, xmm_regs[i]);
	  }

	return target;
      }
    case IX86_BUILTIN_VEC_INIT_V2SI:
    case IX86_BUILTIN_VEC_INIT_V4HI:
    case IX86_BUILTIN_VEC_INIT_V8QI:
      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);

    case IX86_BUILTIN_VEC_EXT_V2DF:
    case IX86_BUILTIN_VEC_EXT_V2DI:
    case IX86_BUILTIN_VEC_EXT_V4SF:
    case IX86_BUILTIN_VEC_EXT_V4SI:
    case IX86_BUILTIN_VEC_EXT_V8HI:
    case IX86_BUILTIN_VEC_EXT_V2SI:
    case IX86_BUILTIN_VEC_EXT_V4HI:
    case IX86_BUILTIN_VEC_EXT_V16QI:
      return ix86_expand_vec_ext_builtin (exp, target);

    case IX86_BUILTIN_VEC_SET_V2DI:
    case IX86_BUILTIN_VEC_SET_V4SF:
    case IX86_BUILTIN_VEC_SET_V4SI:
    case IX86_BUILTIN_VEC_SET_V8HI:
    case IX86_BUILTIN_VEC_SET_V4HI:
    case IX86_BUILTIN_VEC_SET_V16QI:
      return ix86_expand_vec_set_builtin (exp);

    case IX86_BUILTIN_NANQ:
    case IX86_BUILTIN_NANSQ:
      return expand_call (exp, target, ignore);
    case IX86_BUILTIN_RDPID:

      op0 = gen_reg_rtx (word_mode);

      if (TARGET_64BIT)
	{
	  insn = gen_rdpid_rex64 (op0);
	  op0 = convert_to_mode (SImode, op0, 1);
	}
      else
	insn = gen_rdpid (op0);

      emit_insn (insn);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_move_insn (target, op0);
      return target;
    case IX86_BUILTIN_2INTERSECTD512:
    case IX86_BUILTIN_2INTERSECTQ512:
    case IX86_BUILTIN_2INTERSECTD256:
    case IX86_BUILTIN_2INTERSECTQ256:
    case IX86_BUILTIN_2INTERSECTD128:
    case IX86_BUILTIN_2INTERSECTQ128:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      switch (fcode)
	{
	case IX86_BUILTIN_2INTERSECTD512:
	  mode4 = P2HImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ512:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
	  break;
	case IX86_BUILTIN_2INTERSECTD256:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ256:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
	  break;
	case IX86_BUILTIN_2INTERSECTD128:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ128:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
	  break;
	default:
	  gcc_unreachable ();
	}

      mode2 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[2].mode;
      if (!insn_data[icode].operand[1].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      if (!insn_data[icode].operand[2].predicate (op3, mode3))
	op3 = copy_to_mode_reg (mode3, op3);

      op4 = gen_reg_rtx (mode4);
      emit_insn (GEN_FCN (icode) (op4, op2, op3));
      mode0 = mode4 == P2HImode ? HImode : QImode;
      emit_move_insn (gen_rtx_MEM (mode0, op0),
		      gen_lowpart (mode0, op4));
      emit_move_insn (gen_rtx_MEM (mode0, op1),
		      gen_highpart (mode0, op4));

      return 0;
    case IX86_BUILTIN_RDPMC:
    case IX86_BUILTIN_RDTSC:
    case IX86_BUILTIN_RDTSCP:
    case IX86_BUILTIN_XGETBV:

      op0 = gen_reg_rtx (DImode);
      op1 = gen_reg_rtx (DImode);

      if (fcode == IX86_BUILTIN_RDPMC)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_rdpmc_rex64 (op0, op1, op2)
		  : gen_rdpmc (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_XGETBV)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_xgetbv_rex64 (op0, op1, op2)
		  : gen_xgetbv (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_RDTSC)
	{
	  insn = (TARGET_64BIT
		  ? gen_rdtsc_rex64 (op0, op1)
		  : gen_rdtsc (op0));
	  emit_insn (insn);
	}
      else
	{
	  op2 = gen_reg_rtx (SImode);

	  insn = (TARGET_64BIT
		  ? gen_rdtscp_rex64 (op0, op1, op2)
		  : gen_rdtscp (op0, op2));
	  emit_insn (insn);

	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op4 = expand_normal (arg0);
	  if (!address_operand (op4, VOIDmode))
	    {
	      op4 = convert_memory_address (Pmode, op4);
	      op4 = copy_addr_to_reg (op4);
	    }
	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
	}

      if (target == 0
	  || !register_operand (target, DImode))
	target = gen_reg_rtx (DImode);

      if (TARGET_64BIT)
	{
	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
				     op1, 1, OPTAB_DIRECT);
	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
				     op0, 1, OPTAB_DIRECT);
	}

      emit_move_insn (target, op0);
      return target;
    case IX86_BUILTIN_ENQCMD:
    case IX86_BUILTIN_ENQCMDS:
    case IX86_BUILTIN_MOVDIR64B:

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      op0 = ix86_zero_extend_to_Pmode (op0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}
      op1 = gen_rtx_MEM (XImode, op1);

      if (fcode == IX86_BUILTIN_MOVDIR64B)
	{
	  emit_insn (gen_movdir64b (Pmode, op0, op1));
	  return 0;
	}
      else
	{
	  if (target == 0
	      || !register_operand (target, SImode))
	    target = gen_reg_rtx (SImode);

	  emit_move_insn (target, const0_rtx);
	  target = gen_rtx_SUBREG (QImode, target, 0);

	  int unspecv = (fcode == IX86_BUILTIN_ENQCMD
			 ? UNSPECV_ENQCMD
			 : UNSPECV_ENQCMDS);
	  icode = code_for_enqcmd (unspecv, Pmode);
	  emit_insn (GEN_FCN (icode) (op0, op1));

	  emit_insn
	    (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (EQ, QImode,
					  gen_rtx_REG (CCZmode, FLAGS_REG),
					  const0_rtx)));

	  return SUBREG_REG (target);
	}
    case IX86_BUILTIN_FXSAVE:
    case IX86_BUILTIN_FXRSTOR:
    case IX86_BUILTIN_FXSAVE64:
    case IX86_BUILTIN_FXRSTOR64:
    case IX86_BUILTIN_FNSTENV:
    case IX86_BUILTIN_FLDENV:
      mode0 = BLKmode;
      switch (fcode)
	{
	case IX86_BUILTIN_FXSAVE:
	  icode = CODE_FOR_fxsave;
	  break;
	case IX86_BUILTIN_FXRSTOR:
	  icode = CODE_FOR_fxrstor;
	  break;
	case IX86_BUILTIN_FXSAVE64:
	  icode = CODE_FOR_fxsave64;
	  break;
	case IX86_BUILTIN_FXRSTOR64:
	  icode = CODE_FOR_fxrstor64;
	  break;
	case IX86_BUILTIN_FNSTENV:
	  icode = CODE_FOR_fnstenv;
	  break;
	case IX86_BUILTIN_FLDENV:
	  icode = CODE_FOR_fldenv;
	  break;
	default:
	  gcc_unreachable ();
	}

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (mode0, op0);

      pat = GEN_FCN (icode) (op0);
      if (pat)
	emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XSETBV:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);

	  icode = CODE_FOR_xsetbv_rex64;

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  icode = CODE_FOR_xsetbv;

	  pat = GEN_FCN (icode) (op0, op1);
	}
      if (pat)
	emit_insn (pat);
      return 0;
    case IX86_BUILTIN_XSAVE:
    case IX86_BUILTIN_XRSTOR:
    case IX86_BUILTIN_XSAVE64:
    case IX86_BUILTIN_XRSTOR64:
    case IX86_BUILTIN_XSAVEOPT:
    case IX86_BUILTIN_XSAVEOPT64:
    case IX86_BUILTIN_XSAVES:
    case IX86_BUILTIN_XRSTORS:
    case IX86_BUILTIN_XSAVES64:
    case IX86_BUILTIN_XRSTORS64:
    case IX86_BUILTIN_XSAVEC:
    case IX86_BUILTIN_XSAVEC64:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      op0 = gen_rtx_MEM (BLKmode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_XSAVE:
	      icode = CODE_FOR_xsave_rex64;
	      break;
	    case IX86_BUILTIN_XRSTOR:
	      icode = CODE_FOR_xrstor_rex64;
	      break;
	    case IX86_BUILTIN_XSAVE64:
	      icode = CODE_FOR_xsave64;
	      break;
	    case IX86_BUILTIN_XRSTOR64:
	      icode = CODE_FOR_xrstor64;
	      break;
	    case IX86_BUILTIN_XSAVEOPT:
	      icode = CODE_FOR_xsaveopt_rex64;
	      break;
	    case IX86_BUILTIN_XSAVEOPT64:
	      icode = CODE_FOR_xsaveopt64;
	      break;
	    case IX86_BUILTIN_XSAVES:
	      icode = CODE_FOR_xsaves_rex64;
	      break;
	    case IX86_BUILTIN_XRSTORS:
	      icode = CODE_FOR_xrstors_rex64;
	      break;
	    case IX86_BUILTIN_XSAVES64:
	      icode = CODE_FOR_xsaves64;
	      break;
	    case IX86_BUILTIN_XRSTORS64:
	      icode = CODE_FOR_xrstors64;
	      break;
	    case IX86_BUILTIN_XSAVEC:
	      icode = CODE_FOR_xsavec_rex64;
	      break;
	    case IX86_BUILTIN_XSAVEC64:
	      icode = CODE_FOR_xsavec64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_XSAVE:
	      icode = CODE_FOR_xsave;
	      break;
	    case IX86_BUILTIN_XRSTOR:
	      icode = CODE_FOR_xrstor;
	      break;
	    case IX86_BUILTIN_XSAVEOPT:
	      icode = CODE_FOR_xsaveopt;
	      break;
	    case IX86_BUILTIN_XSAVES:
	      icode = CODE_FOR_xsaves;
	      break;
	    case IX86_BUILTIN_XRSTORS:
	      icode = CODE_FOR_xrstors;
	      break;
	    case IX86_BUILTIN_XSAVEC:
	      icode = CODE_FOR_xsavec;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (pat)
	emit_insn (pat);
      return 0;
    case IX86_BUILTIN_LLWPCB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      if (!register_operand (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_lwp_llwpcb (Pmode, op0));
      return 0;

    case IX86_BUILTIN_SLWPCB:
      if (!target
	  || !register_operand (target, Pmode))
	target = gen_reg_rtx (Pmode);
      emit_insn (gen_lwp_slwpcb (Pmode, target));
      return target;
    case IX86_BUILTIN_LWPVAL32:
    case IX86_BUILTIN_LWPVAL64:
    case IX86_BUILTIN_LWPINS32:
    case IX86_BUILTIN_LWPINS64:
      mode = ((fcode == IX86_BUILTIN_LWPVAL32
	       || fcode == IX86_BUILTIN_LWPINS32)
	      ? SImode : DImode);

      if (fcode == IX86_BUILTIN_LWPVAL32
	  || fcode == IX86_BUILTIN_LWPVAL64)
	icode = code_for_lwp_lwpval (mode);
      else
	icode = code_for_lwp_lwpins (mode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, SImode))
	op1 = copy_to_mode_reg (SImode, op1);

      if (!CONST_INT_P (op2))
	{
	  error ("the last argument must be a 32-bit immediate");
	  return const0_rtx;
	}

      emit_insn (GEN_FCN (icode) (op0, op1, op2));

      if (fcode == IX86_BUILTIN_LWPINS32
	  || fcode == IX86_BUILTIN_LWPINS64)
	{
	  if (target == 0
	      || !nonimmediate_operand (target, QImode))
	    target = gen_reg_rtx (QImode);

	  pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			    const0_rtx);
	  emit_insn (gen_rtx_SET (target, pat));
	  return target;
	}
      else
	return 0;
    case IX86_BUILTIN_BEXTRI32:
    case IX86_BUILTIN_BEXTRI64:
      mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!CONST_INT_P (op1))
	{
	  error ("last argument must be an immediate");
	  return const0_rtx;
	}
      else
	{
	  unsigned char lsb_index = UINTVAL (op1);
	  unsigned char length = UINTVAL (op1) >> 8;

	  unsigned char bitsize = GET_MODE_BITSIZE (mode);

	  icode = code_for_tbm_bextri (mode);

	  mode1 = insn_data[icode].operand[1].mode;
	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
	    op0 = copy_to_mode_reg (mode1, op0);

	  mode0 = insn_data[icode].operand[0].mode;
	  if (target == 0
	      || !register_operand (target, mode0))
	    target = gen_reg_rtx (mode0);

	  if (length == 0 || lsb_index >= bitsize)
	    {
	      emit_move_insn (target, const0_rtx);
	      return target;
	    }

	  if (length + lsb_index > bitsize)
	    length = bitsize - lsb_index;

	  op1 = GEN_INT (length);
	  op2 = GEN_INT (lsb_index);

	  emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
	  return target;
	}
    case IX86_BUILTIN_RDRAND16_STEP:
      mode = HImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND32_STEP:
      mode = SImode;
      goto rdrand_step;

    case IX86_BUILTIN_RDRAND64_STEP:
      mode = DImode;

    rdrand_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode);
      emit_insn (gen_rdrand (mode, op0));

      emit_move_insn (gen_rtx_MEM (mode, op1), op0);

      op1 = force_reg (SImode, const1_rtx);

      /* Emit SImode conditional move.  */
      if (mode == HImode)
	{
	  if (TARGET_ZERO_EXTEND_WITH_AND
	      && optimize_function_for_speed_p (cfun))
	    {
	      op2 = force_reg (SImode, const0_rtx);

	      emit_insn (gen_movstricthi
			 (gen_lowpart (HImode, op2), op0));
	    }
	  else
	    {
	      op2 = gen_reg_rtx (SImode);

	      emit_insn (gen_zero_extendhisi2 (op2, op0));
	    }
	}
      else if (mode == SImode)
	op2 = op0;
      else
	op2 = gen_rtx_SUBREG (SImode, op0, 0);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (target,
			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
      return target;
    case IX86_BUILTIN_RDSEED16_STEP:
      mode = HImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED32_STEP:
      mode = SImode;
      goto rdseed_step;

    case IX86_BUILTIN_RDSEED64_STEP:
      mode = DImode;

    rdseed_step:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op1 = expand_normal (arg0);
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      op0 = gen_reg_rtx (mode);
      emit_insn (gen_rdseed (mode, op0));

      emit_move_insn (gen_rtx_MEM (mode, op1), op0);

      op2 = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			 const0_rtx);
      emit_insn (gen_rtx_SET (op2, pat));

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_insn (gen_zero_extendqisi2 (target, op2));
      return target;
    case IX86_BUILTIN_SBB32:
      icode = CODE_FOR_subborrowsi;
      icode2 = CODE_FOR_subborrowsi_0;

    case IX86_BUILTIN_SBB64:
      icode = CODE_FOR_subborrowdi;
      icode2 = CODE_FOR_subborrowdi_0;

    case IX86_BUILTIN_ADDCARRYX32:
      icode = CODE_FOR_addcarrysi;
      icode2 = CODE_FOR_addcarrysi_0;

    case IX86_BUILTIN_ADDCARRYX64:
      icode = CODE_FOR_addcarrydi;
      icode2 = CODE_FOR_addcarrydi_0;

      arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
      arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
      arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */

      op1 = expand_normal (arg0);
      if (!integer_zerop (arg0))
	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));

      op2 = expand_normal (arg1);
      if (!register_operand (op2, mode0))
	op2 = copy_to_mode_reg (mode0, op2);

      op3 = expand_normal (arg2);
      if (!register_operand (op3, mode0))
	op3 = copy_to_mode_reg (mode0, op3);

      op4 = expand_normal (arg3);
      if (!address_operand (op4, VOIDmode))
	{
	  op4 = convert_memory_address (Pmode, op4);
	  op4 = copy_addr_to_reg (op4);
	}

      op0 = gen_reg_rtx (mode0);
      if (integer_zerop (arg0))
	{
	  /* If arg0 is 0, optimize right away into add or sub
	     instruction that sets CCCmode flags.  */
	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
	}
      else
	{
	  /* Generate CF from input operand.  */
	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));

	  /* Generate instruction that consumes CF.  */
	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
	}

      /* Return current CF value.  */
      if (target == 0)
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, op1, const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      /* Store the result.  */
      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);

      return target;
    case IX86_BUILTIN_READ_FLAGS:
      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));

      if (optimize
	  || target == NULL_RTX
	  || !nonimmediate_operand (target, word_mode)
	  || GET_MODE (target) != word_mode)
	target = gen_reg_rtx (word_mode);

      emit_insn (gen_pop (target));
      return target;
    case IX86_BUILTIN_WRITE_FLAGS:

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!general_no_elim_operand (op0, word_mode))
	op0 = copy_to_mode_reg (word_mode, op0);

      emit_insn (gen_push (op0));
      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
      return 0;
    case IX86_BUILTIN_KTESTC8:
      icode = CODE_FOR_ktestqi;

    case IX86_BUILTIN_KTESTZ8:
      icode = CODE_FOR_ktestqi;

    case IX86_BUILTIN_KTESTC16:
      icode = CODE_FOR_ktesthi;

    case IX86_BUILTIN_KTESTZ16:
      icode = CODE_FOR_ktesthi;

    case IX86_BUILTIN_KTESTC32:
      icode = CODE_FOR_ktestsi;

    case IX86_BUILTIN_KTESTZ32:
      icode = CODE_FOR_ktestsi;

    case IX86_BUILTIN_KTESTC64:
      icode = CODE_FOR_ktestdi;

    case IX86_BUILTIN_KTESTZ64:
      icode = CODE_FOR_ktestdi;

    case IX86_BUILTIN_KORTESTC8:
      icode = CODE_FOR_kortestqi;

    case IX86_BUILTIN_KORTESTZ8:
      icode = CODE_FOR_kortestqi;

    case IX86_BUILTIN_KORTESTC16:
      icode = CODE_FOR_kortesthi;

    case IX86_BUILTIN_KORTESTZ16:
      icode = CODE_FOR_kortesthi;

    case IX86_BUILTIN_KORTESTC32:
      icode = CODE_FOR_kortestsi;

    case IX86_BUILTIN_KORTESTZ32:
      icode = CODE_FOR_kortestsi;

    case IX86_BUILTIN_KORTESTC64:
      icode = CODE_FOR_kortestdi;

    case IX86_BUILTIN_KORTESTZ64:
      icode = CODE_FOR_kortestdi;
      arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;

      if (GET_MODE (op0) != VOIDmode)
	op0 = force_reg (GET_MODE (op0), op0);

      op0 = gen_lowpart (mode0, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);

      if (GET_MODE (op1) != VOIDmode)
	op1 = force_reg (GET_MODE (op1), op1);

      op1 = gen_lowpart (mode1, op1);

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);

      target = gen_reg_rtx (QImode);

      /* Emit kortest.  */
      emit_insn (GEN_FCN (icode) (op0, op1));
      /* And use setcc to return result from flags.  */
      ix86_expand_setcc (target, EQ,
			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
      return target;
    case IX86_BUILTIN_GATHERSIV2DF:
      icode = CODE_FOR_avx2_gathersiv2df;

    case IX86_BUILTIN_GATHERSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;

    case IX86_BUILTIN_GATHERDIV2DF:
      icode = CODE_FOR_avx2_gatherdiv2df;

    case IX86_BUILTIN_GATHERDIV4DF:
      icode = CODE_FOR_avx2_gatherdiv4df;

    case IX86_BUILTIN_GATHERSIV4SF:
      icode = CODE_FOR_avx2_gathersiv4sf;

    case IX86_BUILTIN_GATHERSIV8SF:
      icode = CODE_FOR_avx2_gathersiv8sf;

    case IX86_BUILTIN_GATHERDIV4SF:
      icode = CODE_FOR_avx2_gatherdiv4sf;

    case IX86_BUILTIN_GATHERDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;

    case IX86_BUILTIN_GATHERSIV2DI:
      icode = CODE_FOR_avx2_gathersiv2di;

    case IX86_BUILTIN_GATHERSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;

    case IX86_BUILTIN_GATHERDIV2DI:
      icode = CODE_FOR_avx2_gatherdiv2di;

    case IX86_BUILTIN_GATHERDIV4DI:
      icode = CODE_FOR_avx2_gatherdiv4di;

    case IX86_BUILTIN_GATHERSIV4SI:
      icode = CODE_FOR_avx2_gathersiv4si;

    case IX86_BUILTIN_GATHERSIV8SI:
      icode = CODE_FOR_avx2_gathersiv8si;

    case IX86_BUILTIN_GATHERDIV4SI:
      icode = CODE_FOR_avx2_gatherdiv4si;

    case IX86_BUILTIN_GATHERDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;

    case IX86_BUILTIN_GATHERALTSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;

    case IX86_BUILTIN_GATHERALTDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;

    case IX86_BUILTIN_GATHERALTSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;

    case IX86_BUILTIN_GATHERALTDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;

    case IX86_BUILTIN_GATHER3SIV16SF:
      icode = CODE_FOR_avx512f_gathersiv16sf;

    case IX86_BUILTIN_GATHER3SIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;

    case IX86_BUILTIN_GATHER3DIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;

    case IX86_BUILTIN_GATHER3DIV8DF:
      icode = CODE_FOR_avx512f_gatherdiv8df;

    case IX86_BUILTIN_GATHER3SIV16SI:
      icode = CODE_FOR_avx512f_gathersiv16si;

    case IX86_BUILTIN_GATHER3SIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;

    case IX86_BUILTIN_GATHER3DIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;

    case IX86_BUILTIN_GATHER3DIV8DI:
      icode = CODE_FOR_avx512f_gatherdiv8di;

    case IX86_BUILTIN_GATHER3ALTSIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;

    case IX86_BUILTIN_GATHER3ALTDIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;

    case IX86_BUILTIN_GATHER3ALTSIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;

    case IX86_BUILTIN_GATHER3ALTDIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;

    case IX86_BUILTIN_GATHER3SIV2DF:
      icode = CODE_FOR_avx512vl_gathersiv2df;

    case IX86_BUILTIN_GATHER3SIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;

    case IX86_BUILTIN_GATHER3DIV2DF:
      icode = CODE_FOR_avx512vl_gatherdiv2df;

    case IX86_BUILTIN_GATHER3DIV4DF:
      icode = CODE_FOR_avx512vl_gatherdiv4df;

    case IX86_BUILTIN_GATHER3SIV4SF:
      icode = CODE_FOR_avx512vl_gathersiv4sf;

    case IX86_BUILTIN_GATHER3SIV8SF:
      icode = CODE_FOR_avx512vl_gathersiv8sf;

    case IX86_BUILTIN_GATHER3DIV4SF:
      icode = CODE_FOR_avx512vl_gatherdiv4sf;

    case IX86_BUILTIN_GATHER3DIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;

    case IX86_BUILTIN_GATHER3SIV2DI:
      icode = CODE_FOR_avx512vl_gathersiv2di;

    case IX86_BUILTIN_GATHER3SIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;

    case IX86_BUILTIN_GATHER3DIV2DI:
      icode = CODE_FOR_avx512vl_gatherdiv2di;

    case IX86_BUILTIN_GATHER3DIV4DI:
      icode = CODE_FOR_avx512vl_gatherdiv4di;

    case IX86_BUILTIN_GATHER3SIV4SI:
      icode = CODE_FOR_avx512vl_gathersiv4si;

    case IX86_BUILTIN_GATHER3SIV8SI:
      icode = CODE_FOR_avx512vl_gathersiv8si;

    case IX86_BUILTIN_GATHER3DIV4SI:
      icode = CODE_FOR_avx512vl_gatherdiv4si;

    case IX86_BUILTIN_GATHER3DIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;

    case IX86_BUILTIN_GATHER3ALTSIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;

    case IX86_BUILTIN_GATHER3ALTDIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;

    case IX86_BUILTIN_GATHER3ALTSIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;

    case IX86_BUILTIN_GATHER3ALTDIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;

    case IX86_BUILTIN_SCATTERSIV16SF:
      icode = CODE_FOR_avx512f_scattersiv16sf;

    case IX86_BUILTIN_SCATTERSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;

    case IX86_BUILTIN_SCATTERDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;

    case IX86_BUILTIN_SCATTERDIV8DF:
      icode = CODE_FOR_avx512f_scatterdiv8df;

    case IX86_BUILTIN_SCATTERSIV16SI:
      icode = CODE_FOR_avx512f_scattersiv16si;

    case IX86_BUILTIN_SCATTERSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;

    case IX86_BUILTIN_SCATTERDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;

    case IX86_BUILTIN_SCATTERDIV8DI:
      icode = CODE_FOR_avx512f_scatterdiv8di;

    case IX86_BUILTIN_SCATTERSIV8SF:
      icode = CODE_FOR_avx512vl_scattersiv8sf;

    case IX86_BUILTIN_SCATTERSIV4SF:
      icode = CODE_FOR_avx512vl_scattersiv4sf;

    case IX86_BUILTIN_SCATTERSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;

    case IX86_BUILTIN_SCATTERSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;

    case IX86_BUILTIN_SCATTERDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;

    case IX86_BUILTIN_SCATTERDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;

    case IX86_BUILTIN_SCATTERDIV4DF:
      icode = CODE_FOR_avx512vl_scatterdiv4df;

    case IX86_BUILTIN_SCATTERDIV2DF:
      icode = CODE_FOR_avx512vl_scatterdiv2df;

    case IX86_BUILTIN_SCATTERSIV8SI:
      icode = CODE_FOR_avx512vl_scattersiv8si;

    case IX86_BUILTIN_SCATTERSIV4SI:
      icode = CODE_FOR_avx512vl_scattersiv4si;

    case IX86_BUILTIN_SCATTERSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;

    case IX86_BUILTIN_SCATTERSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;

    case IX86_BUILTIN_SCATTERDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;

    case IX86_BUILTIN_SCATTERDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;

    case IX86_BUILTIN_SCATTERDIV4DI:
      icode = CODE_FOR_avx512vl_scatterdiv4di;

    case IX86_BUILTIN_SCATTERDIV2DI:
      icode = CODE_FOR_avx512vl_scatterdiv2di;

    case IX86_BUILTIN_GATHERPFDPD:
      icode = CODE_FOR_avx512pf_gatherpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;

    case IX86_BUILTIN_SCATTERALTDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;

    case IX86_BUILTIN_SCATTERALTSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;

    case IX86_BUILTIN_SCATTERALTDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;

    case IX86_BUILTIN_SCATTERALTSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;

    case IX86_BUILTIN_SCATTERALTDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;

    case IX86_BUILTIN_SCATTERALTSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;

    case IX86_BUILTIN_SCATTERALTDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;

    case IX86_BUILTIN_SCATTERALTSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;

    case IX86_BUILTIN_SCATTERALTDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;

    case IX86_BUILTIN_SCATTERALTSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;

    case IX86_BUILTIN_SCATTERALTDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;

    case IX86_BUILTIN_GATHERPFDPS:
      icode = CODE_FOR_avx512pf_gatherpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPD:
      icode = CODE_FOR_avx512pf_gatherpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPS:
      icode = CODE_FOR_avx512pf_gatherpfv8disf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPD:
      icode = CODE_FOR_avx512pf_scatterpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPS:
      icode = CODE_FOR_avx512pf_scatterpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPD:
      icode = CODE_FOR_avx512pf_scatterpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPS:
      icode = CODE_FOR_avx512pf_scatterpfv8disf;
      goto vec_prefetch_gen;
12740 rtx (*gen
) (rtx
, rtx
);
12742 arg0
= CALL_EXPR_ARG (exp
, 0);
12743 arg1
= CALL_EXPR_ARG (exp
, 1);
12744 arg2
= CALL_EXPR_ARG (exp
, 2);
12745 arg3
= CALL_EXPR_ARG (exp
, 3);
12746 arg4
= CALL_EXPR_ARG (exp
, 4);
12747 op0
= expand_normal (arg0
);
12748 op1
= expand_normal (arg1
);
12749 op2
= expand_normal (arg2
);
12750 op3
= expand_normal (arg3
);
12751 op4
= expand_normal (arg4
);
12752 /* Note the arg order is different from the operand order. */
12753 mode0
= insn_data
[icode
].operand
[1].mode
;
12754 mode2
= insn_data
[icode
].operand
[3].mode
;
12755 mode3
= insn_data
[icode
].operand
[4].mode
;
12756 mode4
= insn_data
[icode
].operand
[5].mode
;
12758 if (target
== NULL_RTX
12759 || GET_MODE (target
) != insn_data
[icode
].operand
[0].mode
12760 || !insn_data
[icode
].operand
[0].predicate (target
,
12761 GET_MODE (target
)))
12762 subtarget
= gen_reg_rtx (insn_data
[icode
].operand
[0].mode
);
12764 subtarget
= target
;
12768 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12769 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12770 half
= gen_reg_rtx (V8SImode
);
12771 if (!nonimmediate_operand (op2
, V16SImode
))
12772 op2
= copy_to_mode_reg (V16SImode
, op2
);
12773 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12776 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12777 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12778 case IX86_BUILTIN_GATHERALTSIV4DF
:
12779 case IX86_BUILTIN_GATHERALTSIV4DI
:
12780 half
= gen_reg_rtx (V4SImode
);
12781 if (!nonimmediate_operand (op2
, V8SImode
))
12782 op2
= copy_to_mode_reg (V8SImode
, op2
);
12783 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
12786 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12787 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12788 half
= gen_reg_rtx (mode0
);
12789 if (mode0
== V8SFmode
)
12790 gen
= gen_vec_extract_lo_v16sf
;
12792 gen
= gen_vec_extract_lo_v16si
;
12793 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12794 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12795 emit_insn (gen (half
, op0
));
12797 op3
= lowpart_subreg (QImode
, op3
, HImode
);
12799 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12800 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12801 case IX86_BUILTIN_GATHERALTDIV8SF
:
12802 case IX86_BUILTIN_GATHERALTDIV8SI
:
12803 half
= gen_reg_rtx (mode0
);
12804 if (mode0
== V4SFmode
)
12805 gen
= gen_vec_extract_lo_v8sf
;
12807 gen
= gen_vec_extract_lo_v8si
;
12808 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12809 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12810 emit_insn (gen (half
, op0
));
12812 if (VECTOR_MODE_P (GET_MODE (op3
)))
12814 half
= gen_reg_rtx (mode0
);
12815 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12816 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12817 emit_insn (gen (half
, op3
));
12825 /* Force memory operand only with base register here. But we
12826 don't want to do it on memory operand for other builtin
12828 op1
= ix86_zero_extend_to_Pmode (op1
);
12830 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
12831 op0
= copy_to_mode_reg (mode0
, op0
);
12832 if (!insn_data
[icode
].operand
[2].predicate (op1
, Pmode
))
12833 op1
= copy_to_mode_reg (Pmode
, op1
);
12834 if (!insn_data
[icode
].operand
[3].predicate (op2
, mode2
))
12835 op2
= copy_to_mode_reg (mode2
, op2
);
12837 op3
= fixup_modeless_constant (op3
, mode3
);
12839 if (GET_MODE (op3
) == mode3
|| GET_MODE (op3
) == VOIDmode
)
12841 if (!insn_data
[icode
].operand
[4].predicate (op3
, mode3
))
12842 op3
= copy_to_mode_reg (mode3
, op3
);
12846 op3
= copy_to_reg (op3
);
12847 op3
= lowpart_subreg (mode3
, op3
, GET_MODE (op3
));
12849 if (!insn_data
[icode
].operand
[5].predicate (op4
, mode4
))
12851 error ("the last argument must be scale 1, 2, 4, 8");
12855 /* Optimize. If mask is known to have all high bits set,
12856 replace op0 with pc_rtx to signal that the instruction
12857 overwrites the whole destination and doesn't use its
12858 previous contents. */
12861 if (TREE_CODE (arg3
) == INTEGER_CST
)
12863 if (integer_all_onesp (arg3
))
12866 else if (TREE_CODE (arg3
) == VECTOR_CST
)
12868 unsigned int negative
= 0;
12869 for (i
= 0; i
< VECTOR_CST_NELTS (arg3
); ++i
)
12871 tree cst
= VECTOR_CST_ELT (arg3
, i
);
12872 if (TREE_CODE (cst
) == INTEGER_CST
12873 && tree_int_cst_sign_bit (cst
))
12875 else if (TREE_CODE (cst
) == REAL_CST
12876 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst
)))
12879 if (negative
== TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3
)))
12882 else if (TREE_CODE (arg3
) == SSA_NAME
12883 && TREE_CODE (TREE_TYPE (arg3
)) == VECTOR_TYPE
)
12885 /* Recognize also when mask is like:
12886 __v2df src = _mm_setzero_pd ();
12887 __v2df mask = _mm_cmpeq_pd (src, src);
12889 __v8sf src = _mm256_setzero_ps ();
12890 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
12891 as that is a cheaper way to load all ones into
12892 a register than having to load a constant from
12894 gimple
*def_stmt
= SSA_NAME_DEF_STMT (arg3
);
12895 if (is_gimple_call (def_stmt
))
12897 tree fndecl
= gimple_call_fndecl (def_stmt
);
12899 && fndecl_built_in_p (fndecl
, BUILT_IN_MD
))
12900 switch (DECL_MD_FUNCTION_CODE (fndecl
))
12902 case IX86_BUILTIN_CMPPD
:
12903 case IX86_BUILTIN_CMPPS
:
12904 case IX86_BUILTIN_CMPPD256
:
12905 case IX86_BUILTIN_CMPPS256
:
12906 if (!integer_zerop (gimple_call_arg (def_stmt
, 2)))
12909 case IX86_BUILTIN_CMPEQPD
:
12910 case IX86_BUILTIN_CMPEQPS
:
12911 if (initializer_zerop (gimple_call_arg (def_stmt
, 0))
12912 && initializer_zerop (gimple_call_arg (def_stmt
,
12923 pat
= GEN_FCN (icode
) (subtarget
, op0
, op1
, op2
, op3
, op4
);
12930 case IX86_BUILTIN_GATHER3DIV16SF
:
12931 if (target
== NULL_RTX
)
12932 target
= gen_reg_rtx (V8SFmode
);
12933 emit_insn (gen_vec_extract_lo_v16sf (target
, subtarget
));
12935 case IX86_BUILTIN_GATHER3DIV16SI
:
12936 if (target
== NULL_RTX
)
12937 target
= gen_reg_rtx (V8SImode
);
12938 emit_insn (gen_vec_extract_lo_v16si (target
, subtarget
));
12940 case IX86_BUILTIN_GATHER3DIV8SF
:
12941 case IX86_BUILTIN_GATHERDIV8SF
:
12942 if (target
== NULL_RTX
)
12943 target
= gen_reg_rtx (V4SFmode
);
12944 emit_insn (gen_vec_extract_lo_v8sf (target
, subtarget
));
12946 case IX86_BUILTIN_GATHER3DIV8SI
:
12947 case IX86_BUILTIN_GATHERDIV8SI
:
12948 if (target
== NULL_RTX
)
12949 target
= gen_reg_rtx (V4SImode
);
12950 emit_insn (gen_vec_extract_lo_v8si (target
, subtarget
));
12953 target
= subtarget
;
12959 arg0
= CALL_EXPR_ARG (exp
, 0);
12960 arg1
= CALL_EXPR_ARG (exp
, 1);
12961 arg2
= CALL_EXPR_ARG (exp
, 2);
12962 arg3
= CALL_EXPR_ARG (exp
, 3);
12963 arg4
= CALL_EXPR_ARG (exp
, 4);
12964 op0
= expand_normal (arg0
);
12965 op1
= expand_normal (arg1
);
12966 op2
= expand_normal (arg2
);
12967 op3
= expand_normal (arg3
);
12968 op4
= expand_normal (arg4
);
12969 mode1
= insn_data
[icode
].operand
[1].mode
;
12970 mode2
= insn_data
[icode
].operand
[2].mode
;
12971 mode3
= insn_data
[icode
].operand
[3].mode
;
12972 mode4
= insn_data
[icode
].operand
[4].mode
;
12974 /* Scatter instruction stores operand op3 to memory with
12975 indices from op2 and scale from op4 under writemask op1.
12976 If index operand op2 has more elements then source operand
12977 op3 one need to use only its low half. And vice versa. */
12980 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12981 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12982 half
= gen_reg_rtx (V8SImode
);
12983 if (!nonimmediate_operand (op2
, V16SImode
))
12984 op2
= copy_to_mode_reg (V16SImode
, op2
);
12985 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12988 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12989 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12990 half
= gen_reg_rtx (mode3
);
12991 if (mode3
== V8SFmode
)
12992 gen
= gen_vec_extract_lo_v16sf
;
12994 gen
= gen_vec_extract_lo_v16si
;
12995 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12996 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12997 emit_insn (gen (half
, op3
));
13000 case IX86_BUILTIN_SCATTERALTSIV4DF
:
13001 case IX86_BUILTIN_SCATTERALTSIV4DI
:
13002 half
= gen_reg_rtx (V4SImode
);
13003 if (!nonimmediate_operand (op2
, V8SImode
))
13004 op2
= copy_to_mode_reg (V8SImode
, op2
);
13005 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
13008 case IX86_BUILTIN_SCATTERALTDIV8SF
:
13009 case IX86_BUILTIN_SCATTERALTDIV8SI
:
13010 half
= gen_reg_rtx (mode3
);
13011 if (mode3
== V4SFmode
)
13012 gen
= gen_vec_extract_lo_v8sf
;
13014 gen
= gen_vec_extract_lo_v8si
;
13015 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
13016 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
13017 emit_insn (gen (half
, op3
));
13020 case IX86_BUILTIN_SCATTERALTSIV2DF
:
13021 case IX86_BUILTIN_SCATTERALTSIV2DI
:
13022 if (!nonimmediate_operand (op2
, V4SImode
))
13023 op2
= copy_to_mode_reg (V4SImode
, op2
);
13025 case IX86_BUILTIN_SCATTERALTDIV4SF
:
13026 case IX86_BUILTIN_SCATTERALTDIV4SI
:
13027 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
13028 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
13034 /* Force memory operand only with base register here. But we
13035 don't want to do it on memory operand for other builtin
13037 op0
= force_reg (Pmode
, convert_to_mode (Pmode
, op0
, 1));
13039 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
13040 op0
= copy_to_mode_reg (Pmode
, op0
);
13042 op1
= fixup_modeless_constant (op1
, mode1
);
13044 if (GET_MODE (op1
) == mode1
|| GET_MODE (op1
) == VOIDmode
)
13046 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
13047 op1
= copy_to_mode_reg (mode1
, op1
);
13051 op1
= copy_to_reg (op1
);
13052 op1
= lowpart_subreg (mode1
, op1
, GET_MODE (op1
));
13055 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
13056 op2
= copy_to_mode_reg (mode2
, op2
);
13058 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
13059 op3
= copy_to_mode_reg (mode3
, op3
);
13061 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
13063 error ("the last argument must be scale 1, 2, 4, 8");
13067 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
13075 arg0
= CALL_EXPR_ARG (exp
, 0);
13076 arg1
= CALL_EXPR_ARG (exp
, 1);
13077 arg2
= CALL_EXPR_ARG (exp
, 2);
13078 arg3
= CALL_EXPR_ARG (exp
, 3);
13079 arg4
= CALL_EXPR_ARG (exp
, 4);
13080 op0
= expand_normal (arg0
);
13081 op1
= expand_normal (arg1
);
13082 op2
= expand_normal (arg2
);
13083 op3
= expand_normal (arg3
);
13084 op4
= expand_normal (arg4
);
13085 mode0
= insn_data
[icode
].operand
[0].mode
;
13086 mode1
= insn_data
[icode
].operand
[1].mode
;
13087 mode3
= insn_data
[icode
].operand
[3].mode
;
13088 mode4
= insn_data
[icode
].operand
[4].mode
;
13090 op0
= fixup_modeless_constant (op0
, mode0
);
13092 if (GET_MODE (op0
) == mode0
|| GET_MODE (op0
) == VOIDmode
)
13094 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
13095 op0
= copy_to_mode_reg (mode0
, op0
);
13099 op0
= copy_to_reg (op0
);
13100 op0
= lowpart_subreg (mode0
, op0
, GET_MODE (op0
));
13103 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
13104 op1
= copy_to_mode_reg (mode1
, op1
);
13106 /* Force memory operand only with base register here. But we
13107 don't want to do it on memory operand for other builtin
13109 op2
= force_reg (Pmode
, convert_to_mode (Pmode
, op2
, 1));
13111 if (!insn_data
[icode
].operand
[2].predicate (op2
, Pmode
))
13112 op2
= copy_to_mode_reg (Pmode
, op2
);
13114 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
13116 error ("the forth argument must be scale 1, 2, 4, 8");
13120 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
13122 error ("incorrect hint operand");
13126 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
13134 case IX86_BUILTIN_XABORT:
13135 icode = CODE_FOR_xabort;
13136 arg0 = CALL_EXPR_ARG (exp, 0);
13137 op0 = expand_normal (arg0);
13138 mode0 = insn_data[icode].operand[0].mode;
13139 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13141 error ("the argument to %<xabort%> intrinsic must "
13142        "be an 8-bit immediate");
13145 emit_insn (gen_xabort (op0));
13148 case IX86_BUILTIN_RDSSPD:
13149 case IX86_BUILTIN_RDSSPQ:
13150 mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
13153     || !register_operand (target, mode))
13154 target = gen_reg_rtx (mode);
13156 op0 = force_reg (mode, const0_rtx);
13158 emit_insn (gen_rdssp (mode, target, op0));
13161 case IX86_BUILTIN_INCSSPD:
13162 case IX86_BUILTIN_INCSSPQ:
13163 mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
13165 arg0 = CALL_EXPR_ARG (exp, 0);
13166 op0 = expand_normal (arg0);
13168 op0 = force_reg (mode, op0);
13170 emit_insn (gen_incssp (mode, op0));
13173 case IX86_BUILTIN_HRESET:
13174 icode = CODE_FOR_hreset;
13175 arg0 = CALL_EXPR_ARG (exp, 0);
13176 op0 = expand_normal (arg0);
13177 op0 = force_reg (SImode, op0);
13178 emit_insn (gen_hreset (op0));
13181 case IX86_BUILTIN_RSTORSSP:
13182 case IX86_BUILTIN_CLRSSBSY:
13183 arg0 = CALL_EXPR_ARG (exp, 0);
13184 op0 = expand_normal (arg0);
13185 icode = (fcode == IX86_BUILTIN_RSTORSSP
13186          ? CODE_FOR_rstorssp
13187          : CODE_FOR_clrssbsy);
13189 if (!address_operand (op0, VOIDmode))
13191 op0 = convert_memory_address (Pmode, op0);
13192 op0 = copy_addr_to_reg (op0);
13194 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
13197 case IX86_BUILTIN_WRSSD:
13198 case IX86_BUILTIN_WRSSQ:
13199 case IX86_BUILTIN_WRUSSD:
13200 case IX86_BUILTIN_WRUSSQ:
13201 mode = ((fcode == IX86_BUILTIN_WRSSD
13202          || fcode == IX86_BUILTIN_WRUSSD)
13203         ? SImode : DImode);
13205 arg0 = CALL_EXPR_ARG (exp, 0);
13206 op0 = expand_normal (arg0);
13207 arg1 = CALL_EXPR_ARG (exp, 1);
13208 op1 = expand_normal (arg1);
13210 op0 = force_reg (mode, op0);
13212 if (!address_operand (op1, VOIDmode))
13214 op1 = convert_memory_address (Pmode, op1);
13215 op1 = copy_addr_to_reg (op1);
13217 op1 = gen_rtx_MEM (mode, op1);
13219 icode = ((fcode == IX86_BUILTIN_WRSSD
13220           || fcode == IX86_BUILTIN_WRSSQ)
13221          ? code_for_wrss (mode)
13222          : code_for_wruss (mode));
13223 emit_insn (GEN_FCN (icode) (op0, op1));
13227 case IX86_BUILTIN_VZEROUPPER:
13228 cfun->machine->has_explicit_vzeroupper = true;
13235 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
13236     && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
13238 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
13239 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
13243 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
13244     && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
13246 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
13247 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
13248 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
13249 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
13251 machine_mode mode, wide_mode, nar_mode;
13253 nar_mode = V4SFmode;
13255 wide_mode = V64SFmode;
13256 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
13257 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
13261 case IX86_BUILTIN_4FMAPS:
13262 fcn = gen_avx5124fmaddps_4fmaddps;
13266 case IX86_BUILTIN_4DPWSSD:
13267 nar_mode = V4SImode;
13269 wide_mode = V64SImode;
13270 fcn = gen_avx5124vnniw_vp4dpwssd;
13274 case IX86_BUILTIN_4DPWSSDS:
13275 nar_mode = V4SImode;
13277 wide_mode = V64SImode;
13278 fcn = gen_avx5124vnniw_vp4dpwssds;
13282 case IX86_BUILTIN_4FNMAPS:
13283 fcn = gen_avx5124fmaddps_4fnmaddps;
13287 case IX86_BUILTIN_4FNMAPS_MASK:
13288 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
13289 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
13292 case IX86_BUILTIN_4DPWSSD_MASK:
13293 nar_mode = V4SImode;
13295 wide_mode = V64SImode;
13296 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
13297 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
13300 case IX86_BUILTIN_4DPWSSDS_MASK:
13301 nar_mode = V4SImode;
13303 wide_mode = V64SImode;
13304 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
13305 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
13308 case IX86_BUILTIN_4FMAPS_MASK:
13318 wide_reg
= gen_reg_rtx (wide_mode
);
13319 for (i
= 0; i
< 4; i
++)
13321 args
[i
] = CALL_EXPR_ARG (exp
, i
);
13322 ops
[i
] = expand_normal (args
[i
]);
13324 emit_move_insn (gen_rtx_SUBREG (mode
, wide_reg
, i
* 64),
13328 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
13329 accum
= force_reg (mode
, accum
);
13331 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
13332 addr
= force_reg (Pmode
, addr
);
13334 mem
= gen_rtx_MEM (nar_mode
, addr
);
13336 target
= gen_reg_rtx (mode
);
13338 emit_move_insn (target
, accum
);
13341 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
13345 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
13347 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
13349 if (CONST_INT_P (mask
))
13350 mask
= fixup_modeless_constant (mask
, HImode
);
13352 mask
= force_reg (HImode
, mask
);
13354 if (GET_MODE (mask
) != HImode
)
13355 mask
= gen_rtx_SUBREG (HImode
, mask
, 0);
13357 /* If merge is 0 then we're about to emit z-masked variant. */
13358 if (const0_operand (merge
, mode
))
13359 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
13360 /* If merge is the same as accum then emit merge-masked variant. */
13361 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
13363 merge
= force_reg (mode
, merge
);
13364 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
13366 /* Merge with something unknown might happen if we z-mask w/ -O0. */
13369 target
= gen_reg_rtx (mode
);
13370 emit_move_insn (target
, merge
);
13371 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
13377 case IX86_BUILTIN_4FNMASS
:
13378 fcn
= gen_avx5124fmaddps_4fnmaddss
;
13382 case IX86_BUILTIN_4FMASS
:
13383 fcn
= gen_avx5124fmaddps_4fmaddss
;
13387 case IX86_BUILTIN_4FNMASS_MASK
:
13388 fcn_mask
= gen_avx5124fmaddps_4fnmaddss_mask
;
13389 fcn_maskz
= gen_avx5124fmaddps_4fnmaddss_maskz
;
13392 case IX86_BUILTIN_4FMASS_MASK
:
13401 fcn_mask
= gen_avx5124fmaddps_4fmaddss_mask
;
13402 fcn_maskz
= gen_avx5124fmaddps_4fmaddss_maskz
;
13406 wide_reg
= gen_reg_rtx (V64SFmode
);
13407 for (i
= 0; i
< 4; i
++)
13410 args
[i
] = CALL_EXPR_ARG (exp
, i
);
13411 ops
[i
] = expand_normal (args
[i
]);
13413 tmp
= gen_reg_rtx (SFmode
);
13414 emit_move_insn (tmp
, gen_rtx_SUBREG (SFmode
, ops
[i
], 0));
13416 emit_move_insn (gen_rtx_SUBREG (V16SFmode
, wide_reg
, i
* 64),
13417 gen_rtx_SUBREG (V16SFmode
, tmp
, 0));
13420 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
13421 accum
= force_reg (V4SFmode
, accum
);
13423 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
13424 addr
= force_reg (Pmode
, addr
);
13426 mem
= gen_rtx_MEM (V4SFmode
, addr
);
13428 target
= gen_reg_rtx (V4SFmode
);
13430 emit_move_insn (target
, accum
);
13433 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
13437 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
13439 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
13441 if (CONST_INT_P (mask
))
13442 mask
= fixup_modeless_constant (mask
, QImode
);
13444 mask
= force_reg (QImode
, mask
);
13446 if (GET_MODE (mask
) != QImode
)
13447 mask
= gen_rtx_SUBREG (QImode
, mask
, 0);
13449 /* If merge is 0 then we're about to emit z-masked variant. */
13450 if (const0_operand (merge
, mode
))
13451 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
13452 /* If merge is the same as accum then emit merge-masked variant.  */
13454 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
13456 merge
= force_reg (mode
, merge
);
13457 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
13459 /* Merge with something unknown might happen if we z-mask w/ -O0.  */
13463 target
= gen_reg_rtx (mode
);
13464 emit_move_insn (target
, merge
);
13465 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
13470 case IX86_BUILTIN_RDPID:
13471 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
13473 case IX86_BUILTIN_FABSQ:
13474 case IX86_BUILTIN_COPYSIGNQ:
13476 /* Emit a normal call if SSE isn't available. */
13477 return expand_call (exp, target, ignore);
13480 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
13484 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
13485     && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
13487 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
13488 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
13491 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13492     && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
13494 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
13495 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
13498 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13499     && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
13501 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
13502 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
13505 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13506     && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
13508 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
13509 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
13512 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13513     && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
13515 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
13516 const struct builtin_description *d = bdesc_multi_arg + i;
13517 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
13518                                       (enum ix86_builtin_func_type)
13519                                       d->flag, d->comparison);
13522 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
13523     && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
13525 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
13526 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
13530 gcc_unreachable ();
13533 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
13534 fill target with val via vec_duplicate. */
13537 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
13543 /* First attempt to recognize VAL as-is. */
13544 dup = gen_vec_duplicate (mode, val);
13545 insn = emit_insn (gen_rtx_SET (target, dup));
13546 if (recog_memoized (insn) < 0)
13549 machine_mode innermode = GET_MODE_INNER (mode);
13552 /* If that fails, force VAL into a register. */
13555 reg = force_reg (innermode, val);
13556 if (GET_MODE (reg) != innermode)
13557 reg = gen_lowpart (innermode, reg);
13558 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
13559 seq = get_insns ();
13562 emit_insn_before (seq, insn);
13564 ok = recog_memoized (insn) >= 0;
13570 /* Get a vector mode of the same size as the original but with elements
13571 twice as wide. This is only guaranteed to apply to integral vectors. */
13573 static machine_mode
13574 get_mode_wider_vector (machine_mode o)
13576 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
13577 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
13578 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
13579 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
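/* For example, under this ordering V16QImode maps to V8HImode and V8HImode
   maps to V4SImode: the total size stays the same, the number of units is
   halved, and each unit becomes twice as wide.  */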
13583 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
13584 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
13586 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13587 with all elements equal to VAR. Return true if successful. */
13590 ix86_expand_vector_init_duplicate (bool mmx_ok
, machine_mode mode
,
13591 rtx target
, rtx val
)
13615 return ix86_vector_duplicate_value (mode
, target
, val
);
13620 if (TARGET_SSE
|| TARGET_3DNOW_A
)
13624 val
= gen_lowpart (SImode
, val
);
13625 x
= gen_rtx_TRUNCATE (HImode
, val
);
13626 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
13627 emit_insn (gen_rtx_SET (target
, x
));
13639 return ix86_vector_duplicate_value (mode
, target
, val
);
13643 struct expand_vec_perm_d dperm
;
13647 memset (&dperm
, 0, sizeof (dperm
));
13648 dperm
.target
= target
;
13649 dperm
.vmode
= mode
;
13650 dperm
.nelt
= GET_MODE_NUNITS (mode
);
13651 dperm
.op0
= dperm
.op1
= gen_reg_rtx (mode
);
13652 dperm
.one_operand_p
= true;
13654 /* Extend to SImode using a paradoxical SUBREG. */
13655 tmp1
= gen_reg_rtx (SImode
);
13656 emit_move_insn (tmp1
, gen_lowpart (SImode
, val
));
13658 /* Insert the SImode value as low element of a V4SImode vector. */
13659 tmp2
= gen_reg_rtx (V4SImode
);
13660 emit_insn (gen_vec_setv4si_0 (tmp2
, CONST0_RTX (V4SImode
), tmp1
));
13661 emit_move_insn (dperm
.op0
, gen_lowpart (mode
, tmp2
));
13663 ok
= (expand_vec_perm_1 (&dperm
)
13664 || expand_vec_perm_broadcast_1 (&dperm
));
13672 return ix86_vector_duplicate_value (mode
, target
, val
);
13679 /* Replicate the value once into the next wider mode and recurse. */
13681 machine_mode smode
, wsmode
, wvmode
;
13684 smode
= GET_MODE_INNER (mode
);
13685 wvmode
= get_mode_wider_vector (mode
);
13686 wsmode
= GET_MODE_INNER (wvmode
);
13688 val
= convert_modes (wsmode
, smode
, val
, true);
13689 x
= expand_simple_binop (wsmode
, ASHIFT
, val
,
13690 GEN_INT (GET_MODE_BITSIZE (smode
)),
13691 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13692 val
= expand_simple_binop (wsmode
, IOR
, val
, x
, x
, 1, OPTAB_LIB_WIDEN
);
13694 x
= gen_reg_rtx (wvmode
);
13695 ok
= ix86_expand_vector_init_duplicate (mmx_ok
, wvmode
, x
, val
);
13697 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), x
));
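/* As a rough illustration of the widening step above (B is just an
   illustrative byte value, not a name used elsewhere): to broadcast a
   QImode value B when no single-instruction broadcast exists, the value
   is first doubled into an HImode scalar

     unsigned short w = ((unsigned short) B << 8) | B;

   and that HImode value is then broadcast into the vector of HImode
   elements with the same overall size, which is bit-identical to the
   desired QImode-element vector.  The recursion repeats until a natively
   supported broadcast width is reached.  */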
13704 return ix86_vector_duplicate_value (mode
, target
, val
);
13707 machine_mode hvmode
= (mode
== V16HImode
? V8HImode
: V16QImode
);
13708 rtx x
= gen_reg_rtx (hvmode
);
13710 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13713 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13714 emit_insn (gen_rtx_SET (target
, x
));
13720 if (TARGET_AVX512BW
)
13721 return ix86_vector_duplicate_value (mode
, target
, val
);
13724 machine_mode hvmode
= (mode
== V32HImode
? V16HImode
: V32QImode
);
13725 rtx x
= gen_reg_rtx (hvmode
);
13727 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13730 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13731 emit_insn (gen_rtx_SET (target
, x
));
13740 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13741 whose ONE_VAR element is VAR, and other elements are zero. Return true
13745 ix86_expand_vector_init_one_nonzero (bool mmx_ok
, machine_mode mode
,
13746 rtx target
, rtx var
, int one_var
)
13748 machine_mode vsimode
;
13751 bool use_vector_set
= false;
13752 rtx (*gen_vec_set_0
) (rtx
, rtx
, rtx
) = NULL
;
13757 /* For SSE4.1, we normally use vector set. But if the second
13758 element is zero and inter-unit moves are OK, we use movq
13760 use_vector_set
= (TARGET_64BIT
&& TARGET_SSE4_1
13761 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13767 use_vector_set
= TARGET_SSE4_1
;
13770 use_vector_set
= TARGET_SSE2
;
13773 use_vector_set
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
13776 use_vector_set
= TARGET_SSE
|| TARGET_3DNOW_A
;
13780 use_vector_set
= TARGET_AVX
;
13783 use_vector_set
= TARGET_AVX
;
13784 gen_vec_set_0
= gen_vec_setv8si_0
;
13787 use_vector_set
= TARGET_AVX
;
13788 gen_vec_set_0
= gen_vec_setv8sf_0
;
13791 use_vector_set
= TARGET_AVX
;
13792 gen_vec_set_0
= gen_vec_setv4df_0
;
13795 /* Use ix86_expand_vector_set in 64bit mode only. */
13796 use_vector_set
= TARGET_AVX
&& TARGET_64BIT
;
13797 gen_vec_set_0
= gen_vec_setv4di_0
;
13800 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13801 gen_vec_set_0
= gen_vec_setv16si_0
;
13804 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13805 gen_vec_set_0
= gen_vec_setv16sf_0
;
13808 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13809 gen_vec_set_0
= gen_vec_setv8df_0
;
13812 /* Use ix86_expand_vector_set in 64bit mode only. */
13813 use_vector_set
= TARGET_AVX512F
&& TARGET_64BIT
&& one_var
== 0;
13814 gen_vec_set_0
= gen_vec_setv8di_0
;
13820 if (use_vector_set
)
13822 if (gen_vec_set_0
&& one_var
== 0)
13824 var
= force_reg (GET_MODE_INNER (mode
), var
);
13825 emit_insn (gen_vec_set_0 (target
, CONST0_RTX (mode
), var
));
13828 emit_insn (gen_rtx_SET (target
, CONST0_RTX (mode
)));
13829 var
= force_reg (GET_MODE_INNER (mode
), var
);
13830 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
13846 var
= force_reg (GET_MODE_INNER (mode
), var
);
13847 x
= gen_rtx_VEC_CONCAT (mode
, var
, CONST0_RTX (GET_MODE_INNER (mode
)));
13848 emit_insn (gen_rtx_SET (target
, x
));
13853 if (!REG_P (target
) || REGNO (target
) < FIRST_PSEUDO_REGISTER
)
13854 new_target
= gen_reg_rtx (mode
);
13856 new_target
= target
;
13857 var
= force_reg (GET_MODE_INNER (mode
), var
);
13858 x
= gen_rtx_VEC_DUPLICATE (mode
, var
);
13859 x
= gen_rtx_VEC_MERGE (mode
, x
, CONST0_RTX (mode
), const1_rtx
);
13860 emit_insn (gen_rtx_SET (new_target
, x
));
13863 /* We need to shuffle the value to the correct position, so
13864 create a new pseudo to store the intermediate result. */
13866 /* With SSE2, we can use the integer shuffle insns. */
13867 if (mode
!= V4SFmode
&& TARGET_SSE2
)
13869 emit_insn (gen_sse2_pshufd_1 (new_target
, new_target
,
13871 GEN_INT (one_var
== 1 ? 0 : 1),
13872 GEN_INT (one_var
== 2 ? 0 : 1),
13873 GEN_INT (one_var
== 3 ? 0 : 1)));
13874 if (target
!= new_target
)
13875 emit_move_insn (target
, new_target
);
13879 /* Otherwise convert the intermediate result to V4SFmode and
13880 use the SSE1 shuffle instructions. */
13881 if (mode
!= V4SFmode
)
13883 tmp
= gen_reg_rtx (V4SFmode
);
13884 emit_move_insn (tmp
, gen_lowpart (V4SFmode
, new_target
));
13889 emit_insn (gen_sse_shufps_v4sf (tmp
, tmp
, tmp
,
13891 GEN_INT (one_var
== 1 ? 0 : 1),
13892 GEN_INT (one_var
== 2 ? 0+4 : 1+4),
13893 GEN_INT (one_var
== 3 ? 0+4 : 1+4)));
13895 if (mode
!= V4SFmode
)
13896 emit_move_insn (target
, gen_lowpart (V4SImode
, tmp
));
13897 else if (tmp
!= target
)
13898 emit_move_insn (target
, tmp
);
13900 else if (target
!= new_target
)
13901 emit_move_insn (target
, new_target
);
13906 vsimode
= V4SImode
;
13912 vsimode
= V2SImode
;
13918 /* Zero extend the variable element to SImode and recurse. */
13919 var
= convert_modes (SImode
, GET_MODE_INNER (mode
), var
, true);
13921 x
= gen_reg_rtx (vsimode
);
13922 if (!ix86_expand_vector_init_one_nonzero (mmx_ok
, vsimode
, x
,
13924 gcc_unreachable ();
13926 emit_move_insn (target
, gen_lowpart (mode
, x
));
13934 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13935 consisting of the values in VALS. It is known that all elements
13936 except ONE_VAR are constants. Return true if successful. */
13939 ix86_expand_vector_init_one_var (bool mmx_ok
, machine_mode mode
,
13940 rtx target
, rtx vals
, int one_var
)
13942 rtx var
= XVECEXP (vals
, 0, one_var
);
13943 machine_mode wmode
;
13946 const_vec
= copy_rtx (vals
);
13947 XVECEXP (const_vec
, 0, one_var
) = CONST0_RTX (GET_MODE_INNER (mode
));
13948 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (const_vec
, 0));
13956 /* For the two element vectors, it's just as easy to use
13957 the general case. */
13961 /* Use ix86_expand_vector_set in 64bit mode only. */
13982 if (TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
)
13987 /* There's no way to set one QImode entry easily. Combine
13988 the variable value with its adjacent constant value, and
13989 promote to an HImode set. */
13990 x
= XVECEXP (vals
, 0, one_var
^ 1);
13993 var
= convert_modes (HImode
, QImode
, var
, true);
13994 var
= expand_simple_binop (HImode
, ASHIFT
, var
, GEN_INT (8),
13995 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13996 x
= GEN_INT (INTVAL (x
) & 0xff);
14000 var
= convert_modes (HImode
, QImode
, var
, true);
14001 x
= gen_int_mode (UINTVAL (x
) << 8, HImode
);
14003 if (x
!= const0_rtx
)
14004 var
= expand_simple_binop (HImode
, IOR
, var
, x
, var
,
14005 1, OPTAB_LIB_WIDEN
);
14007 x
= gen_reg_rtx (wmode
);
14008 emit_move_insn (x
, gen_lowpart (wmode
, const_vec
));
14009 ix86_expand_vector_set (mmx_ok
, x
, var
, one_var
>> 1);
14011 emit_move_insn (target
, gen_lowpart (mode
, x
));
14018 emit_move_insn (target
, const_vec
);
14019 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
14023 /* A subroutine of ix86_expand_vector_init_general. Use vector
14024 concatenate to handle the most general case: all values variable,
14025 and none identical. */
14028 ix86_expand_vector_init_concat (machine_mode mode
,
14029 rtx target
, rtx
*ops
, int n
)
14031 machine_mode half_mode
= VOIDmode
;
14042 half_mode
= V8SImode
;
14045 half_mode
= V8SFmode
;
14048 half_mode
= V4DImode
;
14051 half_mode
= V4DFmode
;
14054 half_mode
= V4SImode
;
14057 half_mode
= V4SFmode
;
14060 half_mode
= V2DImode
;
14063 half_mode
= V2DFmode
;
14066 half_mode
= V2SImode
;
14069 half_mode
= V2SFmode
;
14072 half_mode
= DImode
;
14075 half_mode
= SImode
;
14078 half_mode
= DFmode
;
14081 half_mode
= SFmode
;
14084 gcc_unreachable ();
14087 if (!register_operand (ops
[1], half_mode
))
14088 ops
[1] = force_reg (half_mode
, ops
[1]);
14089 if (!register_operand (ops
[0], half_mode
))
14090 ops
[0] = force_reg (half_mode
, ops
[0]);
14091 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, ops
[0],
14099 half_mode
= V2DImode
;
14102 half_mode
= V2DFmode
;
14105 half_mode
= V2SImode
;
14108 half_mode
= V2SFmode
;
14111 gcc_unreachable ();
14119 half_mode
= V4DImode
;
14122 half_mode
= V4DFmode
;
14125 half_mode
= V4SImode
;
14128 half_mode
= V4SFmode
;
14131 gcc_unreachable ();
14139 half_mode
= V8SImode
;
14142 half_mode
= V8SFmode
;
14145 gcc_unreachable ();
14150 /* FIXME: We process inputs backward to help RA. PR 36222. */
14152 for (j
= 1; j
!= -1; j
--)
14154 half
[j
] = gen_reg_rtx (half_mode
);
14158 v
= gen_rtvec (2, ops
[i
-1], ops
[i
]);
14162 v
= gen_rtvec (4, ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
14166 v
= gen_rtvec (8, ops
[i
-7], ops
[i
-6], ops
[i
-5], ops
[i
-4],
14167 ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
14171 gcc_unreachable ();
14173 ix86_expand_vector_init (false, half
[j
],
14174 gen_rtx_PARALLEL (half_mode
, v
));
14177 ix86_expand_vector_init_concat (mode
, target
, half
, 2);
14181 gcc_unreachable ();
14185 /* A subroutine of ix86_expand_vector_init_general. Use vector
14186 interleave to handle the most general case: all values variable,
14187 and none identical. */
14190 ix86_expand_vector_init_interleave (machine_mode mode
,
14191 rtx target
, rtx
*ops
, int n
)
14193 machine_mode first_imode
, second_imode
, third_imode
, inner_mode
;
14196 rtx (*gen_load_even
) (rtx
, rtx
, rtx
);
14197 rtx (*gen_interleave_first_low
) (rtx
, rtx
, rtx
);
14198 rtx (*gen_interleave_second_low
) (rtx
, rtx
, rtx
);
14203 gen_load_even
= gen_vec_setv8hi
;
14204 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
14205 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
14206 inner_mode
= HImode
;
14207 first_imode
= V4SImode
;
14208 second_imode
= V2DImode
;
14209 third_imode
= VOIDmode
;
14212 gen_load_even
= gen_vec_setv16qi
;
14213 gen_interleave_first_low
= gen_vec_interleave_lowv8hi
;
14214 gen_interleave_second_low
= gen_vec_interleave_lowv4si
;
14215 inner_mode
= QImode
;
14216 first_imode
= V8HImode
;
14217 second_imode
= V4SImode
;
14218 third_imode
= V2DImode
;
14221 gcc_unreachable ();
14224 for (i
= 0; i
< n
; i
++)
14226 /* Extend the odd element to SImode using a paradoxical SUBREG. */
14227 op0
= gen_reg_rtx (SImode
);
14228 emit_move_insn (op0
, gen_lowpart (SImode
, ops
[i
+ i
]));
14230 /* Insert the SImode value as low element of V4SImode vector. */
14231 op1
= gen_reg_rtx (V4SImode
);
14232 op0
= gen_rtx_VEC_MERGE (V4SImode
,
14233 gen_rtx_VEC_DUPLICATE (V4SImode
,
14235 CONST0_RTX (V4SImode
),
14237 emit_insn (gen_rtx_SET (op1
, op0
));
14239 /* Cast the V4SImode vector back to a vector in the original mode. */
14240 op0
= gen_reg_rtx (mode
);
14241 emit_move_insn (op0
, gen_lowpart (mode
, op1
));
14243 /* Load even elements into the second position. */
14244 emit_insn (gen_load_even (op0
,
14245 force_reg (inner_mode
,
14249 /* Cast vector to FIRST_IMODE vector. */
14250 ops
[i
] = gen_reg_rtx (first_imode
);
14251 emit_move_insn (ops
[i
], gen_lowpart (first_imode
, op0
));
14254 /* Interleave low FIRST_IMODE vectors. */
14255 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
14257 op0
= gen_reg_rtx (first_imode
);
14258 emit_insn (gen_interleave_first_low (op0
, ops
[i
], ops
[i
+ 1]));
14260 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
14261 ops
[j
] = gen_reg_rtx (second_imode
);
14262 emit_move_insn (ops
[j
], gen_lowpart (second_imode
, op0
));
14265 /* Interleave low SECOND_IMODE vectors. */
14266 switch (second_imode
)
14269 for (i
= j
= 0; i
< n
/ 2; i
+= 2, j
++)
14271 op0
= gen_reg_rtx (second_imode
);
14272 emit_insn (gen_interleave_second_low (op0
, ops
[i
],
14275 /* Cast the SECOND_IMODE vector to the THIRD_IMODE vector.  */
14277 ops
[j
] = gen_reg_rtx (third_imode
);
14278 emit_move_insn (ops
[j
], gen_lowpart (third_imode
, op0
));
14280 second_imode
= V2DImode
;
14281 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
14285 op0
= gen_reg_rtx (second_imode
);
14286 emit_insn (gen_interleave_second_low (op0
, ops
[0],
14289 /* Cast the SECOND_IMODE vector back to a vector in the original mode.  */
14291 emit_insn (gen_rtx_SET (target
, gen_lowpart (mode
, op0
)));
14295 gcc_unreachable ();
14299 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
14300 all values variable, and none identical. */
14303 ix86_expand_vector_init_general (bool mmx_ok
, machine_mode mode
,
14304 rtx target
, rtx vals
)
14306 rtx ops
[64], op0
, op1
, op2
, op3
, op4
, op5
;
14307 machine_mode half_mode
= VOIDmode
;
14308 machine_mode quarter_mode
= VOIDmode
;
14315 if (!mmx_ok
&& !TARGET_SSE
)
14331 n
= GET_MODE_NUNITS (mode
);
14332 for (i
= 0; i
< n
; i
++)
14333 ops
[i
] = XVECEXP (vals
, 0, i
);
14334 ix86_expand_vector_init_concat (mode
, target
, ops
, n
);
14338 for (i
= 0; i
< 2; i
++)
14339 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
14340 op0
= gen_reg_rtx (V4DImode
);
14341 ix86_expand_vector_init_concat (V4DImode
, op0
, ops
, 2);
14342 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
14346 for (i
= 0; i
< 4; i
++)
14347 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
14348 ops
[4] = gen_reg_rtx (V4DImode
);
14349 ix86_expand_vector_init_concat (V4DImode
, ops
[4], ops
, 2);
14350 ops
[5] = gen_reg_rtx (V4DImode
);
14351 ix86_expand_vector_init_concat (V4DImode
, ops
[5], ops
+ 2, 2);
14352 op0
= gen_reg_rtx (V8DImode
);
14353 ix86_expand_vector_init_concat (V8DImode
, op0
, ops
+ 4, 2);
14354 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
14358 half_mode
= V16QImode
;
14362 half_mode
= V8HImode
;
14366 n
= GET_MODE_NUNITS (mode
);
14367 for (i
= 0; i
< n
; i
++)
14368 ops
[i
] = XVECEXP (vals
, 0, i
);
14369 op0
= gen_reg_rtx (half_mode
);
14370 op1
= gen_reg_rtx (half_mode
);
14371 ix86_expand_vector_init_interleave (half_mode
, op0
, ops
,
14373 ix86_expand_vector_init_interleave (half_mode
, op1
,
14374 &ops
[n
>> 1], n
>> 2);
14375 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op0
, op1
)));
14379 quarter_mode
= V16QImode
;
14380 half_mode
= V32QImode
;
14384 quarter_mode
= V8HImode
;
14385 half_mode
= V16HImode
;
14389 n
= GET_MODE_NUNITS (mode
);
14390 for (i
= 0; i
< n
; i
++)
14391 ops
[i
] = XVECEXP (vals
, 0, i
);
14392 op0
= gen_reg_rtx (quarter_mode
);
14393 op1
= gen_reg_rtx (quarter_mode
);
14394 op2
= gen_reg_rtx (quarter_mode
);
14395 op3
= gen_reg_rtx (quarter_mode
);
14396 op4
= gen_reg_rtx (half_mode
);
14397 op5
= gen_reg_rtx (half_mode
);
14398 ix86_expand_vector_init_interleave (quarter_mode
, op0
, ops
,
14400 ix86_expand_vector_init_interleave (quarter_mode
, op1
,
14401 &ops
[n
>> 2], n
>> 3);
14402 ix86_expand_vector_init_interleave (quarter_mode
, op2
,
14403 &ops
[n
>> 1], n
>> 3);
14404 ix86_expand_vector_init_interleave (quarter_mode
, op3
,
14405 &ops
[(n
>> 1) | (n
>> 2)], n
>> 3);
14406 emit_insn (gen_rtx_SET (op4
, gen_rtx_VEC_CONCAT (half_mode
, op0
, op1
)));
14407 emit_insn (gen_rtx_SET (op5
, gen_rtx_VEC_CONCAT (half_mode
, op2
, op3
)));
14408 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op4
, op5
)));
14412 if (!TARGET_SSE4_1
)
14420 /* Don't use ix86_expand_vector_init_interleave if we can't
14421 move from GPR to SSE register directly. */
14422 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
14425 n
= GET_MODE_NUNITS (mode
);
14426 for (i
= 0; i
< n
; i
++)
14427 ops
[i
] = XVECEXP (vals
, 0, i
);
14428 ix86_expand_vector_init_interleave (mode
, target
, ops
, n
>> 1);
14436 gcc_unreachable ();
14440 int i
, j
, n_elts
, n_words
, n_elt_per_word
;
14441 machine_mode inner_mode
;
14442 rtx words
[4], shift
;
14444 inner_mode
= GET_MODE_INNER (mode
);
14445 n_elts
= GET_MODE_NUNITS (mode
);
14446 n_words
= GET_MODE_SIZE (mode
) / UNITS_PER_WORD
;
14447 n_elt_per_word
= n_elts
/ n_words
;
14448 shift
= GEN_INT (GET_MODE_BITSIZE (inner_mode
));
14450 for (i
= 0; i
< n_words
; ++i
)
14452 rtx word
= NULL_RTX
;
14454 for (j
= 0; j
< n_elt_per_word
; ++j
)
14456 rtx elt
= XVECEXP (vals
, 0, (i
+1)*n_elt_per_word
- j
- 1);
14457 elt
= convert_modes (word_mode
, inner_mode
, elt
, true);
14463 word
= expand_simple_binop (word_mode
, ASHIFT
, word
, shift
,
14464 word
, 1, OPTAB_LIB_WIDEN
);
14465 word
= expand_simple_binop (word_mode
, IOR
, word
, elt
,
14466 word
, 1, OPTAB_LIB_WIDEN
);
14474 emit_move_insn (target
, gen_lowpart (mode
, words
[0]));
14475 else if (n_words
== 2)
14477 rtx tmp
= gen_reg_rtx (mode
);
14478 emit_clobber (tmp
);
14479 emit_move_insn (gen_lowpart (word_mode
, tmp
), words
[0]);
14480 emit_move_insn (gen_highpart (word_mode
, tmp
), words
[1]);
14481 emit_move_insn (target
, tmp
);
14483 else if (n_words
== 4)
14485 rtx tmp
= gen_reg_rtx (V4SImode
);
14486 gcc_assert (word_mode
== SImode
);
14487 vals
= gen_rtx_PARALLEL (V4SImode
, gen_rtvec_v (4, words
));
14488 ix86_expand_vector_init_general (false, V4SImode
, tmp
, vals
);
14489 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
14492 gcc_unreachable ();
14496 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14497 instructions unless MMX_OK is true. */
14500 ix86_expand_vector_init (bool mmx_ok
, rtx target
, rtx vals
)
14502 machine_mode mode
= GET_MODE (target
);
14503 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14504 int n_elts
= GET_MODE_NUNITS (mode
);
14505 int n_var
= 0, one_var
= -1;
14506 bool all_same
= true, all_const_zero
= true;
14510 /* Handle first initialization from vector elts. */
14511 if (n_elts
!= XVECLEN (vals
, 0))
14513 rtx subtarget
= target
;
14514 x
= XVECEXP (vals
, 0, 0);
14515 gcc_assert (GET_MODE_INNER (GET_MODE (x
)) == inner_mode
);
14516 if (GET_MODE_NUNITS (GET_MODE (x
)) * 2 == n_elts
)
14518 rtx ops
[2] = { XVECEXP (vals
, 0, 0), XVECEXP (vals
, 0, 1) };
14519 if (inner_mode
== QImode
|| inner_mode
== HImode
)
14521 unsigned int n_bits
= n_elts
* GET_MODE_SIZE (inner_mode
);
14522 mode
= mode_for_vector (SImode
, n_bits
/ 4).require ();
14523 inner_mode
= mode_for_vector (SImode
, n_bits
/ 8).require ();
14524 ops
[0] = gen_lowpart (inner_mode
, ops
[0]);
14525 ops
[1] = gen_lowpart (inner_mode
, ops
[1]);
14526 subtarget
= gen_reg_rtx (mode
);
14528 ix86_expand_vector_init_concat (mode
, subtarget
, ops
, 2);
14529 if (subtarget
!= target
)
14530 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), subtarget
));
14533 gcc_unreachable ();
14536 for (i
= 0; i
< n_elts
; ++i
)
14538 x
= XVECEXP (vals
, 0, i
);
14539 if (!(CONST_SCALAR_INT_P (x
)
14540 || CONST_DOUBLE_P (x
)
14541 || CONST_FIXED_P (x
)))
14542 n_var
++, one_var
= i
;
14543 else if (x
!= CONST0_RTX (inner_mode
))
14544 all_const_zero
= false;
14545 if (i
> 0 && !rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
14549 /* Constants are best loaded from the constant pool. */
14552 emit_move_insn (target
, gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0)));
14556 /* If all values are identical, broadcast the value. */
14558 && ix86_expand_vector_init_duplicate (mmx_ok
, mode
, target
,
14559 XVECEXP (vals
, 0, 0)))
14562 /* Values where only one field is non-constant are best loaded from
14563 the pool and overwritten via move later. */
14567 && ix86_expand_vector_init_one_nonzero (mmx_ok
, mode
, target
,
14568 XVECEXP (vals
, 0, one_var
),
14572 if (ix86_expand_vector_init_one_var (mmx_ok
, mode
, target
, vals
, one_var
))
14576 ix86_expand_vector_init_general (mmx_ok
, mode
, target
, vals
);
14580 V setg (V v, int idx, T val)
14582 V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
14583 V valv = (V){val, val, val, val, val, val, val, val};
14584 V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
14585 v = (v & ~mask) | (valv & mask);
14589 ix86_expand_vector_set_var (rtx target
, rtx val
, rtx idx
)
14592 machine_mode mode
= GET_MODE (target
);
14593 machine_mode cmp_mode
= mode
;
14594 int n_elts
= GET_MODE_NUNITS (mode
);
14595 rtx valv
,idxv
,constv
,idx_tmp
;
14598 /* 512-bit vector byte/word broadcast and comparison are only available
14599 under TARGET_AVX512BW; break a 512-bit vector into two 256-bit vectors
14600 when TARGET_AVX512BW is not available. */
14601 if ((mode
== V32HImode
|| mode
== V64QImode
) && !TARGET_AVX512BW
)
14603 gcc_assert (TARGET_AVX512F
);
14604 rtx vhi
, vlo
, idx_hi
;
14605 machine_mode half_mode
;
14606 rtx (*extract_hi
)(rtx
, rtx
);
14607 rtx (*extract_lo
)(rtx
, rtx
);
14609 if (mode
== V32HImode
)
14611 half_mode
= V16HImode
;
14612 extract_hi
= gen_vec_extract_hi_v32hi
;
14613 extract_lo
= gen_vec_extract_lo_v32hi
;
14617 half_mode
= V32QImode
;
14618 extract_hi
= gen_vec_extract_hi_v64qi
;
14619 extract_lo
= gen_vec_extract_lo_v64qi
;
14622 vhi
= gen_reg_rtx (half_mode
);
14623 vlo
= gen_reg_rtx (half_mode
);
14624 idx_hi
= gen_reg_rtx (GET_MODE (idx
));
14625 emit_insn (extract_hi (vhi
, target
));
14626 emit_insn (extract_lo (vlo
, target
));
14629 vec
[2] = GEN_INT (n_elts
/2);
14630 ix86_expand_binary_operator (MINUS
, GET_MODE (idx
), vec
);
14631 ix86_expand_vector_set_var (vhi
, val
, idx_hi
);
14632 ix86_expand_vector_set_var (vlo
, val
, idx
);
14633 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, vlo
, vhi
)));
14637 if (FLOAT_MODE_P (GET_MODE_INNER (mode
)))
14642 cmp_mode
= V2DImode
;
14645 cmp_mode
= V4DImode
;
14648 cmp_mode
= V8DImode
;
14651 cmp_mode
= V4SImode
;
14654 cmp_mode
= V8SImode
;
14657 cmp_mode
= V16SImode
;
14660 gcc_unreachable ();
14664 for (int i
= 0; i
!= n_elts
; i
++)
14665 vec
[i
] = GEN_INT (i
);
14666 constv
= gen_rtx_CONST_VECTOR (cmp_mode
, gen_rtvec_v (n_elts
, vec
));
14667 valv
= gen_reg_rtx (mode
);
14668 idxv
= gen_reg_rtx (cmp_mode
);
14669 idx_tmp
= convert_to_mode (GET_MODE_INNER (cmp_mode
), idx
, 1);
14671 ok
= ix86_expand_vector_init_duplicate (false, mode
, valv
, val
);
14673 ok
= ix86_expand_vector_init_duplicate (false, cmp_mode
, idxv
, idx_tmp
);
14678 vec
[3] = gen_rtx_EQ (mode
, idxv
, constv
);
14681 ok
= ix86_expand_int_vcond (vec
);
14686 ix86_expand_vector_set (bool mmx_ok
, rtx target
, rtx val
, int elt
)
14688 machine_mode mode
= GET_MODE (target
);
14689 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14690 machine_mode half_mode
;
14691 bool use_vec_merge
= false;
14693 static rtx (*gen_extract
[6][2]) (rtx
, rtx
)
14695 { gen_vec_extract_lo_v32qi
, gen_vec_extract_hi_v32qi
},
14696 { gen_vec_extract_lo_v16hi
, gen_vec_extract_hi_v16hi
},
14697 { gen_vec_extract_lo_v8si
, gen_vec_extract_hi_v8si
},
14698 { gen_vec_extract_lo_v4di
, gen_vec_extract_hi_v4di
},
14699 { gen_vec_extract_lo_v8sf
, gen_vec_extract_hi_v8sf
},
14700 { gen_vec_extract_lo_v4df
, gen_vec_extract_hi_v4df
}
14702 static rtx (*gen_insert
[6][2]) (rtx
, rtx
, rtx
)
14704 { gen_vec_set_lo_v32qi
, gen_vec_set_hi_v32qi
},
14705 { gen_vec_set_lo_v16hi
, gen_vec_set_hi_v16hi
},
14706 { gen_vec_set_lo_v8si
, gen_vec_set_hi_v8si
},
14707 { gen_vec_set_lo_v4di
, gen_vec_set_hi_v4di
},
14708 { gen_vec_set_lo_v8sf
, gen_vec_set_hi_v8sf
},
14709 { gen_vec_set_lo_v4df
, gen_vec_set_hi_v4df
}
14712 machine_mode mmode
= VOIDmode
;
14713 rtx (*gen_blendm
) (rtx
, rtx
, rtx
, rtx
);
14718 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14726 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14727 ix86_expand_vector_extract (true, tmp
, target
, 1 - elt
);
14729 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14731 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14732 emit_insn (gen_rtx_SET (target
, tmp
));
14738 use_vec_merge
= TARGET_SSE4_1
&& TARGET_64BIT
;
14742 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14743 ix86_expand_vector_extract (false, tmp
, target
, 1 - elt
);
14745 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14747 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14748 emit_insn (gen_rtx_SET (target
, tmp
));
14752 /* NB: For ELT == 0, use standard scalar operation patterns which
14753 preserve the rest of the vector for combiner:
14756 (vec_duplicate:V2DF (reg:DF))
14766 /* For the two element vectors, we implement a VEC_CONCAT with
14767 the extraction of the other element. */
14769 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (1 - elt
)));
14770 tmp
= gen_rtx_VEC_SELECT (inner_mode
, target
, tmp
);
14773 op0
= val
, op1
= tmp
;
14775 op0
= tmp
, op1
= val
;
14777 tmp
= gen_rtx_VEC_CONCAT (mode
, op0
, op1
);
14778 emit_insn (gen_rtx_SET (target
, tmp
));
14783 use_vec_merge
= TARGET_SSE4_1
;
14790 use_vec_merge
= true;
14794 /* tmp = target = A B C D */
14795 tmp
= copy_to_reg (target
);
14796 /* target = A A B B */
14797 emit_insn (gen_vec_interleave_lowv4sf (target
, target
, target
));
14798 /* target = X A B B */
14799 ix86_expand_vector_set (false, target
, val
, 0);
14800 /* target = A X C D */
14801 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14802 const1_rtx
, const0_rtx
,
14803 GEN_INT (2+4), GEN_INT (3+4)));
14807 /* tmp = target = A B C D */
14808 tmp
= copy_to_reg (target
);
14809 /* tmp = X B C D */
14810 ix86_expand_vector_set (false, tmp
, val
, 0);
14811 /* target = A B X D */
14812 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14813 const0_rtx
, const1_rtx
,
14814 GEN_INT (0+4), GEN_INT (3+4)));
14818 /* tmp = target = A B C D */
14819 tmp
= copy_to_reg (target
);
14820 /* tmp = X B C D */
14821 ix86_expand_vector_set (false, tmp
, val
, 0);
14822 /* target = A B X D */
14823 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14824 const0_rtx
, const1_rtx
,
14825 GEN_INT (2+4), GEN_INT (0+4)));
14829 gcc_unreachable ();
14834 use_vec_merge
= TARGET_SSE4_1
;
14838 /* Element 0 handled by vec_merge below. */
14841 use_vec_merge
= true;
14847 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14848 store into element 0, then shuffle them back. */
14852 order
[0] = GEN_INT (elt
);
14853 order
[1] = const1_rtx
;
14854 order
[2] = const2_rtx
;
14855 order
[3] = GEN_INT (3);
14856 order
[elt
] = const0_rtx
;
14858 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14859 order
[1], order
[2], order
[3]));
14861 ix86_expand_vector_set (false, target
, val
, 0);
14863 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14864 order
[1], order
[2], order
[3]));
14868 /* For SSE1, we have to reuse the V4SF code. */
14869 rtx t
= gen_reg_rtx (V4SFmode
);
14870 emit_move_insn (t
, gen_lowpart (V4SFmode
, target
));
14871 ix86_expand_vector_set (false, t
, gen_lowpart (SFmode
, val
), elt
);
14872 emit_move_insn (target
, gen_lowpart (mode
, t
));
14877 use_vec_merge
= TARGET_SSE2
;
14880 use_vec_merge
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
14884 use_vec_merge
= TARGET_SSE4_1
;
14888 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14892 half_mode
= V16QImode
;
14898 half_mode
= V8HImode
;
14904 half_mode
= V4SImode
;
14910 half_mode
= V2DImode
;
14916 half_mode
= V4SFmode
;
14922 half_mode
= V2DFmode
;
14928 /* Compute offset. */
14932 gcc_assert (i
<= 1);
14934 /* Extract the half. */
14935 tmp
= gen_reg_rtx (half_mode
);
14936 emit_insn (gen_extract
[j
][i
] (tmp
, target
));
14938 /* Put val in tmp at elt. */
14939 ix86_expand_vector_set (false, tmp
, val
, elt
);
14942 emit_insn (gen_insert
[j
][i
] (target
, target
, tmp
));
14946 if (TARGET_AVX512F
)
14949 gen_blendm
= gen_avx512f_blendmv8df
;
14954 if (TARGET_AVX512F
)
14957 gen_blendm
= gen_avx512f_blendmv8di
;
14962 if (TARGET_AVX512F
)
14965 gen_blendm
= gen_avx512f_blendmv16sf
;
14970 if (TARGET_AVX512F
)
14973 gen_blendm
= gen_avx512f_blendmv16si
;
14978 if (TARGET_AVX512BW
)
14981 gen_blendm
= gen_avx512bw_blendmv32hi
;
14983 else if (TARGET_AVX512F
)
14985 half_mode
= E_V8HImode
;
14992 if (TARGET_AVX512BW
)
14995 gen_blendm
= gen_avx512bw_blendmv64qi
;
14997 else if (TARGET_AVX512F
)
14999 half_mode
= E_V16QImode
;
15006 /* Compute offset. */
15010 gcc_assert (i
<= 3);
15013 /* Extract the quarter. */
15014 tmp
= gen_reg_rtx (V4SImode
);
15015 rtx tmp2
= gen_lowpart (V16SImode
, target
);
15016 rtx mask
= gen_reg_rtx (QImode
);
15018 emit_move_insn (mask
, constm1_rtx
);
15019 emit_insn (gen_avx512f_vextracti32x4_mask (tmp
, tmp2
, GEN_INT (i
),
15022 tmp2
= gen_reg_rtx (half_mode
);
15023 emit_move_insn (tmp2
, gen_lowpart (half_mode
, tmp
));
15026 /* Put val in tmp at elt. */
15027 ix86_expand_vector_set (false, tmp
, val
, elt
);
15030 tmp2
= gen_reg_rtx (V16SImode
);
15031 rtx tmp3
= gen_lowpart (V16SImode
, target
);
15032 mask
= gen_reg_rtx (HImode
);
15033 emit_move_insn (mask
, constm1_rtx
);
15034 tmp
= gen_lowpart (V4SImode
, tmp
);
15035 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2
, tmp3
, tmp
, GEN_INT (i
),
15037 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
15045 if (mmode
!= VOIDmode
)
15047 tmp
= gen_reg_rtx (mode
);
15048 emit_insn (gen_rtx_SET (tmp
, gen_rtx_VEC_DUPLICATE (mode
, val
)));
15049 /* The avx512*_blendm<mode> expanders have different operand order
15050 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
15051 elements where the mask is set and second input operand otherwise;
15052 in {sse,avx}*_*blend* the first input operand is used for elements
15053 where the mask is clear and second input operand otherwise. */
15054 emit_insn (gen_blendm (target
, target
, tmp
,
15056 gen_int_mode (HOST_WIDE_INT_1U
<< elt
,
15059 else if (use_vec_merge
)
15062 tmp
= gen_rtx_VEC_DUPLICATE (mode
, val
);
15063 tmp
= gen_rtx_VEC_MERGE (mode
, tmp
, target
,
15064 GEN_INT (HOST_WIDE_INT_1U
<< elt
));
15065 emit_insn (gen_rtx_SET (target
, tmp
));
15069 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
15071 emit_move_insn (mem
, target
);
15073 tmp
= adjust_address (mem
, inner_mode
, elt
* GET_MODE_SIZE (inner_mode
));
15074 emit_move_insn (tmp
, val
);
15076 emit_move_insn (target
, mem
);
15081 ix86_expand_vector_extract (bool mmx_ok
, rtx target
, rtx vec
, int elt
)
15083 machine_mode mode
= GET_MODE (vec
);
15084 machine_mode inner_mode
= GET_MODE_INNER (mode
);
15085 bool use_vec_extr
= false;
15091 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
15105 use_vec_extr
= true;
15109 use_vec_extr
= TARGET_SSE4_1
;
15121 tmp
= gen_reg_rtx (mode
);
15122 emit_insn (gen_sse_shufps_v4sf (tmp
, vec
, vec
,
15123 GEN_INT (elt
), GEN_INT (elt
),
15124 GEN_INT (elt
+4), GEN_INT (elt
+4)));
15128 tmp
= gen_reg_rtx (mode
);
15129 emit_insn (gen_vec_interleave_highv4sf (tmp
, vec
, vec
));
15133 gcc_unreachable ();
15136 use_vec_extr
= true;
15141 use_vec_extr
= TARGET_SSE4_1
;
15155 tmp
= gen_reg_rtx (mode
);
15156 emit_insn (gen_sse2_pshufd_1 (tmp
, vec
,
15157 GEN_INT (elt
), GEN_INT (elt
),
15158 GEN_INT (elt
), GEN_INT (elt
)));
15162 tmp
= gen_reg_rtx (mode
);
15163 emit_insn (gen_vec_interleave_highv4si (tmp
, vec
, vec
));
15167 gcc_unreachable ();
15170 use_vec_extr
= true;
15175 /* For SSE1, we have to reuse the V4SF code. */
15176 ix86_expand_vector_extract (false, gen_lowpart (SFmode
, target
),
15177 gen_lowpart (V4SFmode
, vec
), elt
);
15183 use_vec_extr
= TARGET_SSE2
;
15186 use_vec_extr
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
15190 use_vec_extr
= TARGET_SSE4_1
;
15194 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC
))
15196 tmp
= gen_reg_rtx (SImode
);
15197 ix86_expand_vector_extract (false, tmp
, gen_lowpart (V4SImode
, vec
),
15199 emit_insn (gen_rtx_SET (target
, gen_lowpart (QImode
, tmp
)));
15207 tmp
= gen_reg_rtx (V4SFmode
);
15209 emit_insn (gen_vec_extract_lo_v8sf (tmp
, vec
));
15211 emit_insn (gen_vec_extract_hi_v8sf (tmp
, vec
));
15212 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15220 tmp
= gen_reg_rtx (V2DFmode
);
15222 emit_insn (gen_vec_extract_lo_v4df (tmp
, vec
));
15224 emit_insn (gen_vec_extract_hi_v4df (tmp
, vec
));
15225 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
15233 tmp
= gen_reg_rtx (V16QImode
);
15235 emit_insn (gen_vec_extract_lo_v32qi (tmp
, vec
));
15237 emit_insn (gen_vec_extract_hi_v32qi (tmp
, vec
));
15238 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
15246 tmp
= gen_reg_rtx (V8HImode
);
15248 emit_insn (gen_vec_extract_lo_v16hi (tmp
, vec
));
15250 emit_insn (gen_vec_extract_hi_v16hi (tmp
, vec
));
15251 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
15259 tmp
= gen_reg_rtx (V4SImode
);
15261 emit_insn (gen_vec_extract_lo_v8si (tmp
, vec
));
15263 emit_insn (gen_vec_extract_hi_v8si (tmp
, vec
));
15264 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15272 tmp
= gen_reg_rtx (V2DImode
);
15274 emit_insn (gen_vec_extract_lo_v4di (tmp
, vec
));
15276 emit_insn (gen_vec_extract_hi_v4di (tmp
, vec
));
15277 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
15283 if (TARGET_AVX512BW
)
15285 tmp
= gen_reg_rtx (V16HImode
);
15287 emit_insn (gen_vec_extract_lo_v32hi (tmp
, vec
));
15289 emit_insn (gen_vec_extract_hi_v32hi (tmp
, vec
));
15290 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
15296 if (TARGET_AVX512BW
)
15298 tmp
= gen_reg_rtx (V32QImode
);
15300 emit_insn (gen_vec_extract_lo_v64qi (tmp
, vec
));
15302 emit_insn (gen_vec_extract_hi_v64qi (tmp
, vec
));
15303 ix86_expand_vector_extract (false, target
, tmp
, elt
& 31);
15309 tmp
= gen_reg_rtx (V8SFmode
);
15311 emit_insn (gen_vec_extract_lo_v16sf (tmp
, vec
));
15313 emit_insn (gen_vec_extract_hi_v16sf (tmp
, vec
));
15314 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
15318 tmp
= gen_reg_rtx (V4DFmode
);
15320 emit_insn (gen_vec_extract_lo_v8df (tmp
, vec
));
15322 emit_insn (gen_vec_extract_hi_v8df (tmp
, vec
));
15323 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15327 tmp
= gen_reg_rtx (V8SImode
);
15329 emit_insn (gen_vec_extract_lo_v16si (tmp
, vec
));
15331 emit_insn (gen_vec_extract_hi_v16si (tmp
, vec
));
15332 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
15336 tmp
= gen_reg_rtx (V4DImode
);
15338 emit_insn (gen_vec_extract_lo_v8di (tmp
, vec
));
15340 emit_insn (gen_vec_extract_hi_v8di (tmp
, vec
));
15341 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
15345 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
15346 /* ??? Could extract the appropriate HImode element and shift. */
15355 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (elt
)));
15356 tmp
= gen_rtx_VEC_SELECT (inner_mode
, vec
, tmp
);
15358 /* Let the rtl optimizers know about the zero extension performed. */
15359 if (inner_mode
== QImode
|| inner_mode
== HImode
)
15361 tmp
= gen_rtx_ZERO_EXTEND (SImode
, tmp
);
15362 target
= gen_lowpart (SImode
, target
);
15365 emit_insn (gen_rtx_SET (target
, tmp
));
15369 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
15371 emit_move_insn (mem
, vec
);
15373 tmp
= adjust_address (mem
, inner_mode
, elt
*GET_MODE_SIZE (inner_mode
));
15374 emit_move_insn (target
, tmp
);
15378 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
15379 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
15380 The upper bits of DEST are undefined, though they shouldn't cause
15381 exceptions (some bits from src or all zeros are ok). */
15384 emit_reduc_half (rtx dest
, rtx src
, int i
)
15387 switch (GET_MODE (src
))
15391 tem
= gen_sse_movhlps (dest
, src
, src
);
15393 tem
= gen_sse_shufps_v4sf (dest
, src
, src
, const1_rtx
, const1_rtx
,
15394 GEN_INT (1 + 4), GEN_INT (1 + 4));
15397 tem
= gen_vec_interleave_highv2df (dest
, src
, src
);
15403 d
= gen_reg_rtx (V1TImode
);
15404 tem
= gen_sse2_lshrv1ti3 (d
, gen_lowpart (V1TImode
, src
),
15409 tem
= gen_avx_vperm2f128v8sf3 (dest
, src
, src
, const1_rtx
);
15411 tem
= gen_avx_shufps256 (dest
, src
, src
,
15412 GEN_INT (i
== 128 ? 2 + (3 << 2) : 1));
15416 tem
= gen_avx_vperm2f128v4df3 (dest
, src
, src
, const1_rtx
);
15418 tem
= gen_avx_shufpd256 (dest
, src
, src
, const1_rtx
);
15426 if (GET_MODE (dest
) != V4DImode
)
15427 d
= gen_reg_rtx (V4DImode
);
15428 tem
= gen_avx2_permv2ti (d
, gen_lowpart (V4DImode
, src
),
15429 gen_lowpart (V4DImode
, src
),
15434 d
= gen_reg_rtx (V2TImode
);
15435 tem
= gen_avx2_lshrv2ti3 (d
, gen_lowpart (V2TImode
, src
),
15443 d
= gen_reg_rtx (V4TImode
);
15444 tem
= gen_avx512bw_lshrv4ti3 (d
, gen_lowpart (V4TImode
, src
),
15454 tem
= gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode
, dest
),
15455 gen_lowpart (V16SImode
, src
),
15456 gen_lowpart (V16SImode
, src
),
15457 GEN_INT (0x4 + (i
== 512 ? 4 : 0)),
15458 GEN_INT (0x5 + (i
== 512 ? 4 : 0)),
15459 GEN_INT (0x6 + (i
== 512 ? 4 : 0)),
15460 GEN_INT (0x7 + (i
== 512 ? 4 : 0)),
15461 GEN_INT (0xC), GEN_INT (0xD),
15462 GEN_INT (0xE), GEN_INT (0xF),
15463 GEN_INT (0x10), GEN_INT (0x11),
15464 GEN_INT (0x12), GEN_INT (0x13),
15465 GEN_INT (0x14), GEN_INT (0x15),
15466 GEN_INT (0x16), GEN_INT (0x17));
15468 tem
= gen_avx512f_pshufd_1 (gen_lowpart (V16SImode
, dest
),
15469 gen_lowpart (V16SImode
, src
),
15470 GEN_INT (i
== 128 ? 0x2 : 0x1),
15474 GEN_INT (i
== 128 ? 0x6 : 0x5),
15478 GEN_INT (i
== 128 ? 0xA : 0x9),
15482 GEN_INT (i
== 128 ? 0xE : 0xD),
15488 gcc_unreachable ();
15492 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
15495 /* Expand a vector reduction. FN is the binary pattern to reduce;
15496 DEST is the destination; IN is the input vector. */
15499 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
15501 rtx half, dst, vec = in;
15502 machine_mode mode = GET_MODE (in);
15505 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15507     && mode == V8HImode
15508     && fn == gen_uminv8hi3)
15510 emit_insn (gen_sse4_1_phminposuw (dest, in));
15514 for (i = GET_MODE_BITSIZE (mode);
15515      i > GET_MODE_UNIT_BITSIZE (mode);
15518 half = gen_reg_rtx (mode);
15519 emit_reduc_half (half, vec, i);
15520 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
15523 dst = gen_reg_rtx (mode);
15524 emit_insn (fn (dst, half, vec));
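/* A rough scalar model of the halving loop above, assuming a MAX reduction
   over an 8-element array v[] (names are illustrative only); each
   emit_reduc_half step corresponds to folding the upper half onto the
   lower half:

     for (int width = 8; width > 1; width /= 2)
       for (int k = 0; k < width / 2; k++)
         v[k] = MAX (v[k], v[k + width / 2]);

   The actual expansion keeps whole vectors; only the low element of the
   final DEST is meaningful and callers extract it.  */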
15529 /* Output code to perform a conditional jump to LABEL, if C2 flag in
15530 FP status register is set. */
15533 ix86_emit_fp_unordered_jump (rtx label)
15535 rtx reg = gen_reg_rtx (HImode);
15539 emit_insn (gen_x86_fnstsw_1 (reg));
15541 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
15543 emit_insn (gen_x86_sahf_1 (reg));
15545 temp = gen_rtx_REG (CCmode, FLAGS_REG);
15546 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
15550 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
15552 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15553 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
15556 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
15557                              gen_rtx_LABEL_REF (VOIDmode, label),
15559 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
15560 predict_jump (REG_BR_PROB_BASE * 10 / 100);
15561 JUMP_LABEL (insn) = label;
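/* At the instruction level the two branches above correspond roughly to

     fnstsw  %ax                 fnstsw  %ax
     sahf                        testb   $0x04, %ah
     jp      label               jne     label

   i.e. with SAHF the C2 bit of the FP status word lands in PF and an
   unordered/parity jump is used; otherwise C2 (bit 2 of the high status
   byte) is tested directly.  This is a sketch of the usual register
   allocation, not literal output.  */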
15564 /* Output code to perform a sinh XFmode calculation. */
15566 void ix86_emit_i387_sinh (rtx op0, rtx op1)
15568 rtx e1 = gen_reg_rtx (XFmode);
15569 rtx e2 = gen_reg_rtx (XFmode);
15570 rtx scratch = gen_reg_rtx (HImode);
15571 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15572 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15574 rtx_code_label *jump_label = gen_label_rtx ();
15577 /* scratch = fxam (op1) */
15578 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15580 /* e1 = expm1 (|op1|) */
15581 emit_insn (gen_absxf2 (e2, op1));
15582 emit_insn (gen_expm1xf2 (e1, e2));
15584 /* e2 = e1 / (e1 + 1.0) + e1 */
15585 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15586 emit_insn (gen_addxf3 (e2, e1, cst1));
15587 emit_insn (gen_divxf3 (e2, e1, e2));
15588 emit_insn (gen_addxf3 (e2, e2, e1));
15590 /* flags = signbit (op1) */
15591 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15593 /* if (flags) then e2 = -e2 */
15594 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15595                             gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15596                             gen_rtx_LABEL_REF (VOIDmode, jump_label),
15598 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15599 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15600 JUMP_LABEL (insn) = jump_label;
15602 emit_insn (gen_negxf2 (e2, e2));
15604 emit_label (jump_label);
15605 LABEL_NUSES (jump_label) = 1;
15607 /* op0 = 0.5 * e2 */
15608 half = force_reg (XFmode, half);
15609 emit_insn (gen_mulxf3 (op0, e2, half));
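/* A rough C model of the sequence above (my_sinh is only an illustrative
   name):

     double my_sinh (double x)
     {
       double e = expm1 (fabs (x));             /* e1 */
       double r = 0.5 * (e / (e + 1.0) + e);    /* 0.5 * e2 */
       return signbit (x) ? -r : r;
     }

   Using expm1 keeps the result accurate for small |x|, where computing
   exp(|x|) - 1 directly would cancel.  */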
15612 /* Output code to perform a cosh XFmode calculation. */
15614 void ix86_emit_i387_cosh (rtx op0, rtx op1)
15616 rtx e1 = gen_reg_rtx (XFmode);
15617 rtx e2 = gen_reg_rtx (XFmode);
15618 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15621 /* e1 = exp (op1) */
15622 emit_insn (gen_expxf2 (e1, op1));
15624 /* e2 = e1 + 1.0 / e1 */
15625 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15626 emit_insn (gen_divxf3 (e2, cst1, e1));
15627 emit_insn (gen_addxf3 (e2, e1, e2));
15629 /* op0 = 0.5 * e2 */
15630 half = force_reg (XFmode, half);
15631 emit_insn (gen_mulxf3 (op0, e2, half));
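/* Equivalent C model of the above (my_cosh is only an illustrative name):

     double my_cosh (double x)
     {
       double e = exp (x);            /* e1 */
       return 0.5 * (e + 1.0 / e);    /* 0.5 * e2 */
     }

   cosh is even, so no sign fixup is needed.  */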
15634 /* Output code to perform a tanh XFmode calculation. */
15636 void ix86_emit_i387_tanh (rtx op0, rtx op1)
15638 rtx e1 = gen_reg_rtx (XFmode);
15639 rtx e2 = gen_reg_rtx (XFmode);
15640 rtx scratch = gen_reg_rtx (HImode);
15641 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15643 rtx_code_label *jump_label = gen_label_rtx ();
15646 /* scratch = fxam (op1) */
15647 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15649 /* e1 = expm1 (-|2 * op1|) */
15650 emit_insn (gen_addxf3 (e2, op1, op1));
15651 emit_insn (gen_absxf2 (e2, e2));
15652 emit_insn (gen_negxf2 (e2, e2));
15653 emit_insn (gen_expm1xf2 (e1, e2));
15655 /* e2 = e1 / (e1 + 2.0) */
15656 cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
15657 emit_insn (gen_addxf3 (e2, e1, cst2));
15658 emit_insn (gen_divxf3 (e2, e1, e2));
15660 /* flags = signbit (op1) */
15661 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15663 /* if (!flags) then e2 = -e2 */
15664 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15665                             gen_rtx_NE (VOIDmode, flags, const0_rtx),
15666                             gen_rtx_LABEL_REF (VOIDmode, jump_label),
15668 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15669 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15670 JUMP_LABEL (insn) = jump_label;
15672 emit_insn (gen_negxf2 (e2, e2));
15674 emit_label (jump_label);
15675 LABEL_NUSES (jump_label) = 1;
15677 emit_move_insn (op0, e2);
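/* A rough C model of the above (my_tanh is only an illustrative name):

     double my_tanh (double x)
     {
       double e = expm1 (-2.0 * fabs (x));   /* e1, in (-1, 0] */
       double r = e / (e + 2.0);             /* e2 = -tanh(|x|) */
       return signbit (x) ? r : -r;
     }

   For x >= 0, -e / (e + 2) = (1 - exp(-2x)) / (1 + exp(-2x)) = tanh(x),
   computed via expm1 to avoid cancellation near zero.  */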
15680 /* Output code to perform an asinh XFmode calculation. */
15682 void ix86_emit_i387_asinh (rtx op0, rtx op1)
15684 rtx e1 = gen_reg_rtx (XFmode);
15685 rtx e2 = gen_reg_rtx (XFmode);
15686 rtx scratch = gen_reg_rtx (HImode);
15687 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15689 rtx_code_label *jump_label = gen_label_rtx ();
15692 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15693 emit_insn (gen_mulxf3 (e1, op1, op1));
15694 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15695 emit_insn (gen_addxf3 (e2, e1, cst1));
15696 emit_insn (gen_sqrtxf2 (e2, e2));
15697 emit_insn (gen_addxf3 (e2, e2, cst1));
15700 emit_insn (gen_divxf3 (e1, e1, e2));
15702 /* scratch = fxam (op1) */
15703 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15705 /* e1 = e1 + |op1| */
15706 emit_insn (gen_absxf2 (e2, op1));
15707 emit_insn (gen_addxf3 (e1, e1, e2));
15709 /* e2 = log1p (e1) */
15710 ix86_emit_i387_log1p (e2, e1);
15712 /* flags = signbit (op1) */
15713 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15715 /* if (flags) then e2 = -e2 */
15716 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15717                             gen_rtx_EQ (VOIDmode, flags, const0_rtx),
15718                             gen_rtx_LABEL_REF (VOIDmode, jump_label),
15720 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15721 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15722 JUMP_LABEL (insn) = jump_label;
15724 emit_insn (gen_negxf2 (e2, e2));
15726 emit_label (jump_label);
15727 LABEL_NUSES (jump_label) = 1;
15729 emit_move_insn (op0, e2);
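/* A rough C model of the above (my_asinh is only an illustrative name):

     double my_asinh (double x)
     {
       double x2 = x * x;
       double t = fabs (x) + x2 / (sqrt (x2 + 1.0) + 1.0);
       double r = log1p (t);
       return signbit (x) ? -r : r;
     }

   This relies on sqrt(x^2 + 1) - 1 == x^2 / (sqrt(x^2 + 1) + 1), so log1p
   can be applied and the result stays accurate for small |x|.  */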
15732 /* Output code to perform an acosh XFmode calculation. */
15734 void ix86_emit_i387_acosh (rtx op0, rtx op1)
15736 rtx e1 = gen_reg_rtx (XFmode);
15737 rtx e2 = gen_reg_rtx (XFmode);
15738 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15740 /* e2 = sqrt (op1 + 1.0) */
15741 emit_insn (gen_addxf3 (e2, op1, cst1));
15742 emit_insn (gen_sqrtxf2 (e2, e2));
15744 /* e1 = sqrt (op1 - 1.0) */
15745 emit_insn (gen_subxf3 (e1, op1, cst1));
15746 emit_insn (gen_sqrtxf2 (e1, e1));
15749 emit_insn (gen_mulxf3 (e1, e1, e2));
15751 /* e1 = e1 + op1 */
15752 emit_insn (gen_addxf3 (e1, e1, op1));
15754 /* op0 = log (e1) */
15755 emit_insn (gen_logxf2 (op0, e1));
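/* Equivalent C model of the above, valid for op1 >= 1.0 (my_acosh is only
   an illustrative name):

     double my_acosh (double x)
     {
       return log (x + sqrt (x - 1.0) * sqrt (x + 1.0));
     }

   sqrt(x - 1) * sqrt(x + 1) equals sqrt(x*x - 1) but avoids squaring a
   potentially large x.  */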
15758 /* Output code to perform an atanh XFmode calculation. */
15760 void ix86_emit_i387_atanh (rtx op0, rtx op1)
15762 rtx e1 = gen_reg_rtx (XFmode);
15763 rtx e2 = gen_reg_rtx (XFmode);
15764 rtx scratch = gen_reg_rtx (HImode);
15765 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
15766 rtx half = const_double_from_real_value (dconsthalf, XFmode);
15768 rtx_code_label *jump_label = gen_label_rtx ();
15771 /* scratch = fxam (op1) */
15772 emit_insn (gen_fxamxf2_i387 (scratch, op1));
15775 emit_insn (gen_absxf2 (e2, op1));
15777 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15778 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
15779 emit_insn (gen_addxf3 (e1, e2, cst1));
15780 emit_insn (gen_addxf3 (e2, e2, e2));
15781 emit_insn (gen_negxf2 (e2, e2));
15782 emit_insn (gen_divxf3 (e1, e2, e1));
15784 /* e2 = log1p (e1) */
15785 ix86_emit_i387_log1p (e2, e1);
15787 /* flags = signbit (op1) */
15788 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
15790 /* if (!flags) then e2 = -e2 */
15791 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
15792                             gen_rtx_NE (VOIDmode, flags, const0_rtx),
15793                             gen_rtx_LABEL_REF (VOIDmode, jump_label),
15795 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
15796 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15797 JUMP_LABEL (insn) = jump_label;
15799 emit_insn (gen_negxf2 (e2, e2));
15801 emit_label (jump_label);
15802 LABEL_NUSES (jump_label) = 1;
15804 /* op0 = 0.5 * e2 */
15805 half = force_reg (XFmode, half);
15806 emit_insn (gen_mulxf3 (op0, e2, half));
/* Output code to perform a log1p XFmode calculation.  */
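/* The cutoff constant below is 1 - sqrt(2)/2 ~= 0.29289.  fyl2xp1 is
   only specified for |x| smaller than that, where computing
   log2 (1 + x) directly avoids the cancellation of forming 1.0 + x
   first.  With the fldln2 constant as the Y operand both paths end up
   producing ln rather than log2, so, roughly:

	if (fabs (op1) < 1 - sqrt (2.0) / 2)
	  res = fyl2xp1 (op1, ln2);	   ln2 * log2 (1 + op1)
	else
	  res = fyl2x (1.0 + op1, ln2);	   ln2 * log2 (1 + op1)  */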
void ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
/* Emit code for round calculation.  */
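/* What the emitted sequence computes, roughly:

	round (a) = copysign (floor (fabs (a) + 0.5), a)

   fxam is used to capture the sign of the (possibly widened) input up
   front, so the result can simply be negated afterwards; that also
   works when the output mode is an integer mode, where a copysign on
   the result itself would not be available.  */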
void ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  switch (outmode)
    {
    case E_SFmode:
    case E_DFmode:
      {
	tmp = gen_reg_rtx (XFmode);

	emit_insn (floor_insn (tmp, e2));
	emit_insn (gen_rtx_SET (res,
				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
						UNSPEC_TRUNC_NOOP)));
      }
      break;
    default:
      emit_insn (floor_insn (res, e2));
    }

  /* flags = signbit(a) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
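/* Roughly: one Newton-Raphson step refines a reciprocal estimate
   x0 ~ 1/b as x1 = x0 * (2 - b * x0), which the sequence below writes
   as (x0 + x0) - (b * x0 * x0) so it maps onto plain mul/add/sub.
   A scalar sketch (illustrative only):

	x0 = rcp (b);			   hardware estimate, ~12 bits
	x1 = (x0 + x0) - (b * x0) * x0;	   one NR step, roughly doubling
	res = a * x1;			   the number of good bits  */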
void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      if (TARGET_AVX512ER)
	{
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						      UNSPEC_RCP28)));
	  /* res = a * x0 */
	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
	  return;
	}
      else
	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						    UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  /* e0 = x0 * b */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

  /* e0 = x0 * e0 */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

  /* e1 = x0 + x0 */
  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

  /* x1 = e1 - e0 */
  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

  /* res = a * x1 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
}
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
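/* Roughly: starting from x0 ~ rsqrt (a), one Newton-Raphson step is
   x1 = x0 * (3 - a * x0 * x0) / 2, which the code factors as
   -0.5 * x0 * (a * x0 * x0 - 3.0) so the inner term can use an FMA
   when available.  Multiplying the refined rsqrt by a gives sqrt (a).
   A scalar sketch (illustrative only):

	x0 = rsqrt (a);
	e  = a * x0 * x0 - 3.0;
	res = recip ? -0.5 * x0 * e : -0.5 * (a * x0) * e;  */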
void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX (mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    emit_insn (gen_rtx_SET (e2,
			    gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3.0 */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));

  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */
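/* fabs here is just a bitwise AND with a constant whose every bit is
   set except the sign bit; for scalar SFmode/DFmode the constant is
   built in the corresponding vector mode and a single element is then
   selected out of it.  Informally:  xa = op0 & ~SIGN_BIT.  */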
static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
			      bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
		   gen_rtx_fmt_ee (code, mode, op0, op1)));

  return mask;
}
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign bit.  */
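/* The usual copysign bit trick, informally:

	result = abs_value | (sign & SIGN_BIT);

   When MASK is passed in it is the inverted mask produced by
   ix86_expand_sse_fabs above (all bits set except the sign bit), hence
   the NOT below before it is applied to SIGN.  */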
static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else
	vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    mask = gen_rtx_NOT (mode, mask);

  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */
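/* Why nextafter (0.5, 0.0) rather than a plain 0.5: for the largest
   double just below 0.5, adding 0.5 rounds the sum up to exactly 1.0,
   and the final truncation would then yield 1 instead of 0.  Adding
   the predecessor of 0.5 keeps such values below 1.0, while 0.5 itself
   still reaches 1.0, so halfway cases round away from zero as lround
   requires.  */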
void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1);
	op0 = (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
   storing into OPERAND0.  */
void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
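/* 2**(p-1) is the magnitude at which the spacing between adjacent
   floating point values becomes exactly 1.0 (2**52 for DFmode, 2**23
   for SFmode).  Adding it to a smaller nonnegative value and then
   subtracting it again therefore rounds that value to an integer in
   the current rounding mode, e.g. in DFmode with round-to-nearest:

	4.3 + 2**52  ->  4503599627370500.0
	     - 2**52 ->  4.0

   which is the trick the expanders below rely on.  */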
static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  const struct real_format *fmt;
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&TWO52r, fmt->p - 1, mode);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */
void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
16385 /* Expand SSE2 sequence for computing floor or ceil
16386 from OPERAND1 storing into OPERAND0. */
16388 ix86_expand_floorceil (rtx operand0
, rtx operand1
, bool do_floor
)
16390 /* C code for the stuff we expand below.
16391 double xa = fabs (x), x2;
16392 if (!isless (xa, TWO52))
16394 x2 = (double)(long)x;
16403 if (HONOR_SIGNED_ZEROS (mode))
16404 return copysign (x2, x);
16407 machine_mode mode
= GET_MODE (operand0
);
16408 rtx xa
, xi
, TWO52
, tmp
, one
, res
, mask
;
16409 rtx_code_label
*label
;
16411 TWO52
= ix86_gen_TWO52 (mode
);
16413 /* Temporary for holding the result, initialized to the input
16414 operand to ease control flow. */
16415 res
= copy_to_reg (operand1
);
16417 /* xa = abs (operand1) */
16418 xa
= ix86_expand_sse_fabs (res
, &mask
);
16420 /* if (!isless (xa, TWO52)) goto label; */
16421 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16423 /* xa = (double)(long)x */
16424 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
16425 expand_fix (xi
, res
, 0);
16426 expand_float (xa
, xi
, 0);
16429 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
16431 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16432 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
16433 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
16434 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
16435 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16436 if (HONOR_SIGNED_ZEROS (mode
))
16438 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16439 if (do_floor
&& flag_rounding_math
)
16440 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
16442 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
16444 emit_move_insn (res
, tmp
);
16446 emit_label (label
);
16447 LABEL_NUSES (label
) = 1;
16449 emit_move_insn (operand0
, res
);
16452 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
16453 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16454 that is only available on 64bit targets. */
16456 ix86_expand_floorceildf_32 (rtx operand0
, rtx operand1
, bool do_floor
)
16458 /* C code for the stuff we expand below.
16459 double xa = fabs (x), x2;
16460 if (!isless (xa, TWO52))
16462 xa = xa + TWO52 - TWO52;
16463 x2 = copysign (xa, x);
16472 if (HONOR_SIGNED_ZEROS (mode))
16473 x2 = copysign (x2, x);
16476 machine_mode mode
= GET_MODE (operand0
);
16477 rtx xa
, TWO52
, tmp
, one
, res
, mask
;
16478 rtx_code_label
*label
;
16480 TWO52
= ix86_gen_TWO52 (mode
);
16482 /* Temporary for holding the result, initialized to the input
16483 operand to ease control flow. */
16484 res
= copy_to_reg (operand1
);
16486 /* xa = abs (operand1) */
16487 xa
= ix86_expand_sse_fabs (res
, &mask
);
16489 /* if (!isless (xa, TWO52)) goto label; */
16490 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16492 /* xa = xa + TWO52 - TWO52; */
16493 xa
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16494 xa
= expand_simple_binop (mode
, MINUS
, xa
, TWO52
, xa
, 0, OPTAB_DIRECT
);
16496 /* xa = copysign (xa, operand1) */
16497 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
16500 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
16502 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
16503 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
16504 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
16505 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
16506 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16507 if (HONOR_SIGNED_ZEROS (mode
))
16509 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16510 if (do_floor
&& flag_rounding_math
)
16511 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
16513 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
16515 emit_move_insn (res
, tmp
);
16517 emit_label (label
);
16518 LABEL_NUSES (label
) = 1;
16520 emit_move_insn (operand0
, res
);
16523 /* Expand SSE sequence for computing trunc
16524 from OPERAND1 storing into OPERAND0. */
16526 ix86_expand_trunc (rtx operand0
, rtx operand1
)
16528 /* C code for SSE variant we expand below.
16529 double xa = fabs (x), x2;
16530 if (!isless (xa, TWO52))
16532 x2 = (double)(long)x;
16533 if (HONOR_SIGNED_ZEROS (mode))
16534 return copysign (x2, x);
16537 machine_mode mode
= GET_MODE (operand0
);
16538 rtx xa
, xi
, TWO52
, res
, mask
;
16539 rtx_code_label
*label
;
16541 TWO52
= ix86_gen_TWO52 (mode
);
16543 /* Temporary for holding the result, initialized to the input
16544 operand to ease control flow. */
16545 res
= copy_to_reg (operand1
);
16547 /* xa = abs (operand1) */
16548 xa
= ix86_expand_sse_fabs (res
, &mask
);
16550 /* if (!isless (xa, TWO52)) goto label; */
16551 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16553 /* xa = (double)(long)x */
16554 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
16555 expand_fix (xi
, res
, 0);
16556 expand_float (xa
, xi
, 0);
16558 if (HONOR_SIGNED_ZEROS (mode
))
16559 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
16561 emit_move_insn (res
, xa
);
16563 emit_label (label
);
16564 LABEL_NUSES (label
) = 1;
16566 emit_move_insn (operand0
, res
);
16569 /* Expand SSE sequence for computing trunc from OPERAND1 storing
16570 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16571 that is only available on 64bit targets. */
16573 ix86_expand_truncdf_32 (rtx operand0
, rtx operand1
)
16575 machine_mode mode
= GET_MODE (operand0
);
16576 rtx xa
, xa2
, TWO52
, tmp
, one
, res
, mask
;
16577 rtx_code_label
*label
;
16579 /* C code for SSE variant we expand below.
16580 double xa = fabs (x), x2;
16581 if (!isless (xa, TWO52))
16583 xa2 = xa + TWO52 - TWO52;
16587 x2 = copysign (xa2, x);
16591 TWO52
= ix86_gen_TWO52 (mode
);
16593 /* Temporary for holding the result, initialized to the input
16594 operand to ease control flow. */
16595 res
=copy_to_reg (operand1
);
16597 /* xa = abs (operand1) */
16598 xa
= ix86_expand_sse_fabs (res
, &mask
);
16600 /* if (!isless (xa, TWO52)) goto label; */
16601 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16603 /* xa2 = xa + TWO52 - TWO52; */
16604 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16605 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
16608 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
16610 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
16611 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa2
, xa
, false);
16612 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
16613 tmp
= expand_simple_binop (mode
, MINUS
,
16614 xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16615 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
16616 if (HONOR_SIGNED_ZEROS (mode
) && flag_rounding_math
)
16617 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
16619 /* res = copysign (xa2, operand1) */
16620 ix86_sse_copysign_to_positive (res
, tmp
, res
, mask
);
16622 emit_label (label
);
16623 LABEL_NUSES (label
) = 1;
16625 emit_move_insn (operand0
, res
);
16628 /* Expand SSE sequence for computing round
16629 from OPERAND1 storing into OPERAND0. */
16631 ix86_expand_round (rtx operand0
, rtx operand1
)
16633 /* C code for the stuff we're doing below:
16634 double xa = fabs (x);
16635 if (!isless (xa, TWO52))
16637 xa = (double)(long)(xa + nextafter (0.5, 0.0));
16638 return copysign (xa, x);
16640 machine_mode mode
= GET_MODE (operand0
);
16641 rtx res
, TWO52
, xa
, xi
, half
, mask
;
16642 rtx_code_label
*label
;
16643 const struct real_format
*fmt
;
16644 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
16646 /* Temporary for holding the result, initialized to the input
16647 operand to ease control flow. */
16648 res
= copy_to_reg (operand1
);
16650 TWO52
= ix86_gen_TWO52 (mode
);
16651 xa
= ix86_expand_sse_fabs (res
, &mask
);
16652 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16654 /* load nextafter (0.5, 0.0) */
16655 fmt
= REAL_MODE_FORMAT (mode
);
16656 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
16657 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
16659 /* xa = xa + 0.5 */
16660 half
= force_reg (mode
, const_double_from_real_value (pred_half
, mode
));
16661 xa
= expand_simple_binop (mode
, PLUS
, xa
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
16663 /* xa = (double)(int64_t)xa */
16664 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
16665 expand_fix (xi
, xa
, 0);
16666 expand_float (xa
, xi
, 0);
16668 /* res = copysign (xa, operand1) */
16669 ix86_sse_copysign_to_positive (res
, xa
, res
, mask
);
16671 emit_label (label
);
16672 LABEL_NUSES (label
) = 1;
16674 emit_move_insn (operand0
, res
);
16677 /* Expand SSE sequence for computing round from OPERAND1 storing
16678 into OPERAND0 without relying on DImode truncation via cvttsd2siq
16679 that is only available on 64bit targets. */
16681 ix86_expand_rounddf_32 (rtx operand0
, rtx operand1
)
16683 /* C code for the stuff we expand below.
16684 double xa = fabs (x), xa2, x2;
16685 if (!isless (xa, TWO52))
16687 Using the absolute value and copying back sign makes
16688 -0.0 -> -0.0 correct.
16689 xa2 = xa + TWO52 - TWO52;
16694 else if (dxa > 0.5)
16696 x2 = copysign (xa2, x);
16699 machine_mode mode
= GET_MODE (operand0
);
16700 rtx xa
, xa2
, dxa
, TWO52
, tmp
, half
, mhalf
, one
, res
, mask
;
16701 rtx_code_label
*label
;
16703 TWO52
= ix86_gen_TWO52 (mode
);
16705 /* Temporary for holding the result, initialized to the input
16706 operand to ease control flow. */
16707 res
= copy_to_reg (operand1
);
16709 /* xa = abs (operand1) */
16710 xa
= ix86_expand_sse_fabs (res
, &mask
);
16712 /* if (!isless (xa, TWO52)) goto label; */
16713 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
16715 /* xa2 = xa + TWO52 - TWO52; */
16716 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
16717 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
16719 /* dxa = xa2 - xa; */
16720 dxa
= expand_simple_binop (mode
, MINUS
, xa2
, xa
, NULL_RTX
, 0, OPTAB_DIRECT
);
16722 /* generate 0.5, 1.0 and -0.5 */
16723 half
= force_reg (mode
, const_double_from_real_value (dconsthalf
, mode
));
16724 one
= expand_simple_binop (mode
, PLUS
, half
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
16725 mhalf
= expand_simple_binop (mode
, MINUS
, half
, one
, NULL_RTX
,
16729 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
16730 tmp
= ix86_expand_sse_compare_mask (UNGT
, dxa
, half
, false);
16731 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
16732 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16733 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
16734 tmp
= ix86_expand_sse_compare_mask (UNGE
, mhalf
, dxa
, false);
16735 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
16736 xa2
= expand_simple_binop (mode
, PLUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
16738 /* res = copysign (xa2, operand1) */
16739 ix86_sse_copysign_to_positive (res
, xa2
, res
, mask
);
16741 emit_label (label
);
16742 LABEL_NUSES (label
) = 1;
16744 emit_move_insn (operand0
, res
);
16747 /* Expand SSE sequence for computing round
16748 from OP1 storing into OP0 using sse4 round insn. */
16750 ix86_expand_round_sse4 (rtx op0
, rtx op1
)
16752 machine_mode mode
= GET_MODE (op0
);
16753 rtx e1
, e2
, res
, half
;
16754 const struct real_format
*fmt
;
16755 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
16756 rtx (*gen_copysign
) (rtx
, rtx
, rtx
);
16757 rtx (*gen_round
) (rtx
, rtx
, rtx
);
16762 gen_copysign
= gen_copysignsf3
;
16763 gen_round
= gen_sse4_1_roundsf2
;
16766 gen_copysign
= gen_copysigndf3
;
16767 gen_round
= gen_sse4_1_rounddf2
;
16770 gcc_unreachable ();
16773 /* round (a) = trunc (a + copysign (0.5, a)) */
16775 /* load nextafter (0.5, 0.0) */
16776 fmt
= REAL_MODE_FORMAT (mode
);
16777 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
16778 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
16779 half
= const_double_from_real_value (pred_half
, mode
);
16781 /* e1 = copysign (0.5, op1) */
16782 e1
= gen_reg_rtx (mode
);
16783 emit_insn (gen_copysign (e1
, half
, op1
));
16785 /* e2 = op1 + e1 */
16786 e2
= expand_simple_binop (mode
, PLUS
, op1
, e1
, NULL_RTX
, 0, OPTAB_DIRECT
);
16788 /* res = trunc (e2) */
16789 res
= gen_reg_rtx (mode
);
16790 emit_insn (gen_round (res
, e2
, GEN_INT (ROUND_TRUNC
)));
16792 emit_move_insn (op0
, res
);
16795 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
16796 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
16797 insn every time. */
16799 static GTY(()) rtx_insn
*vselect_insn
;
16801 /* Initialize vselect_insn. */
16804 init_vselect_insn (void)
16809 x
= gen_rtx_PARALLEL (VOIDmode
, rtvec_alloc (MAX_VECT_LEN
));
16810 for (i
= 0; i
< MAX_VECT_LEN
; ++i
)
16811 XVECEXP (x
, 0, i
) = const0_rtx
;
16812 x
= gen_rtx_VEC_SELECT (V2DFmode
, gen_rtx_VEC_CONCAT (V4DFmode
, const0_rtx
,
16814 x
= gen_rtx_SET (const0_rtx
, x
);
16816 vselect_insn
= emit_insn (x
);
16820 /* Construct (set target (vec_select op0 (parallel perm))) and
16821 return true if that's a valid instruction in the active ISA. */
16824 expand_vselect (rtx target
, rtx op0
, const unsigned char *perm
,
16825 unsigned nelt
, bool testing_p
)
16828 rtx x
, save_vconcat
;
16831 if (vselect_insn
== NULL_RTX
)
16832 init_vselect_insn ();
16834 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 1);
16835 PUT_NUM_ELEM (XVEC (x
, 0), nelt
);
16836 for (i
= 0; i
< nelt
; ++i
)
16837 XVECEXP (x
, 0, i
) = GEN_INT (perm
[i
]);
16838 save_vconcat
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
16839 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = op0
;
16840 PUT_MODE (SET_SRC (PATTERN (vselect_insn
)), GET_MODE (target
));
16841 SET_DEST (PATTERN (vselect_insn
)) = target
;
16842 icode
= recog_memoized (vselect_insn
);
16844 if (icode
>= 0 && !testing_p
)
16845 emit_insn (copy_rtx (PATTERN (vselect_insn
)));
16847 SET_DEST (PATTERN (vselect_insn
)) = const0_rtx
;
16848 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = save_vconcat
;
16849 INSN_CODE (vselect_insn
) = -1;
16854 /* Similar, but generate a vec_concat from op0 and op1 as well. */
16857 expand_vselect_vconcat (rtx target
, rtx op0
, rtx op1
,
16858 const unsigned char *perm
, unsigned nelt
,
16861 machine_mode v2mode
;
16865 if (vselect_insn
== NULL_RTX
)
16866 init_vselect_insn ();
16868 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0
)).exists (&v2mode
))
16870 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
16871 PUT_MODE (x
, v2mode
);
16874 ok
= expand_vselect (target
, x
, perm
, nelt
, testing_p
);
16875 XEXP (x
, 0) = const0_rtx
;
16876 XEXP (x
, 1) = const0_rtx
;
16880 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16881 using movss or movsd. */
16883 expand_vec_perm_movs (struct expand_vec_perm_d
*d
)
16885 machine_mode vmode
= d
->vmode
;
16886 unsigned i
, nelt
= d
->nelt
;
16889 if (d
->one_operand_p
)
16892 if (!(TARGET_SSE
&& vmode
== V4SFmode
)
16893 && !(TARGET_MMX_WITH_SSE
&& vmode
== V2SFmode
)
16894 && !(TARGET_SSE2
&& vmode
== V2DFmode
))
16897 /* Only the first element is changed. */
16898 if (d
->perm
[0] != nelt
&& d
->perm
[0] != 0)
16900 for (i
= 1; i
< nelt
; ++i
)
16901 if (d
->perm
[i
] != i
+ nelt
- d
->perm
[0])
16907 if (d
->perm
[0] == nelt
)
16908 x
= gen_rtx_VEC_MERGE (vmode
, d
->op1
, d
->op0
, GEN_INT (1));
16910 x
= gen_rtx_VEC_MERGE (vmode
, d
->op0
, d
->op1
, GEN_INT (1));
16912 emit_insn (gen_rtx_SET (d
->target
, x
));
16917 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
16918 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16921 expand_vec_perm_blend (struct expand_vec_perm_d
*d
)
16923 machine_mode mmode
, vmode
= d
->vmode
;
16924 unsigned i
, nelt
= d
->nelt
;
16925 unsigned HOST_WIDE_INT mask
;
16926 rtx target
, op0
, op1
, maskop
, x
;
16927 rtx rperm
[32], vperm
;
16929 if (d
->one_operand_p
)
16931 if (TARGET_AVX512F
&& GET_MODE_SIZE (vmode
) == 64
16932 && (TARGET_AVX512BW
16933 || GET_MODE_UNIT_SIZE (vmode
) >= 4))
16935 else if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
16937 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
16939 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
16944 /* This is a blend, not a permute. Elements must stay in their
16945 respective lanes. */
16946 for (i
= 0; i
< nelt
; ++i
)
16948 unsigned e
= d
->perm
[i
];
16949 if (!(e
== i
|| e
== i
+ nelt
))
16956 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16957 decision should be extracted elsewhere, so that we only try that
16958 sequence once all budget==3 options have been tried. */
16959 target
= d
->target
;
16978 for (i
= 0; i
< nelt
; ++i
)
16979 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
16983 for (i
= 0; i
< 2; ++i
)
16984 mask
|= (d
->perm
[i
] >= 2 ? 15 : 0) << (i
* 4);
16989 for (i
= 0; i
< 4; ++i
)
16990 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
16995 /* See if bytes move in pairs so we can use pblendw with
16996 an immediate argument, rather than pblendvb with a vector
16998 for (i
= 0; i
< 16; i
+= 2)
16999 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
17002 for (i
= 0; i
< nelt
; ++i
)
17003 rperm
[i
] = (d
->perm
[i
] < nelt
? const0_rtx
: constm1_rtx
);
17006 vperm
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
17007 vperm
= force_reg (vmode
, vperm
);
17009 if (GET_MODE_SIZE (vmode
) == 16)
17010 emit_insn (gen_sse4_1_pblendvb (target
, op0
, op1
, vperm
));
17012 emit_insn (gen_avx2_pblendvb (target
, op0
, op1
, vperm
));
17013 if (target
!= d
->target
)
17014 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17018 for (i
= 0; i
< 8; ++i
)
17019 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
17024 target
= gen_reg_rtx (vmode
);
17025 op0
= gen_lowpart (vmode
, op0
);
17026 op1
= gen_lowpart (vmode
, op1
);
17030 /* See if bytes move in pairs. If not, vpblendvb must be used. */
17031 for (i
= 0; i
< 32; i
+= 2)
17032 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
17034 /* See if bytes move in quadruplets. If yes, vpblendd
17035 with immediate can be used. */
17036 for (i
= 0; i
< 32; i
+= 4)
17037 if (d
->perm
[i
] + 2 != d
->perm
[i
+ 2])
17041 /* See if bytes move the same in both lanes. If yes,
17042 vpblendw with immediate can be used. */
17043 for (i
= 0; i
< 16; i
+= 2)
17044 if (d
->perm
[i
] + 16 != d
->perm
[i
+ 16])
17047 /* Use vpblendw. */
17048 for (i
= 0; i
< 16; ++i
)
17049 mask
|= (d
->perm
[i
* 2] >= 32) << i
;
17054 /* Use vpblendd. */
17055 for (i
= 0; i
< 8; ++i
)
17056 mask
|= (d
->perm
[i
* 4] >= 32) << i
;
17061 /* See if words move in pairs. If yes, vpblendd can be used. */
17062 for (i
= 0; i
< 16; i
+= 2)
17063 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
17067 /* See if words move the same in both lanes. If not,
17068 vpblendvb must be used. */
17069 for (i
= 0; i
< 8; i
++)
17070 if (d
->perm
[i
] + 8 != d
->perm
[i
+ 8])
17072 /* Use vpblendvb. */
17073 for (i
= 0; i
< 32; ++i
)
17074 rperm
[i
] = (d
->perm
[i
/ 2] < 16 ? const0_rtx
: constm1_rtx
);
17078 target
= gen_reg_rtx (vmode
);
17079 op0
= gen_lowpart (vmode
, op0
);
17080 op1
= gen_lowpart (vmode
, op1
);
17081 goto finish_pblendvb
;
17084 /* Use vpblendw. */
17085 for (i
= 0; i
< 16; ++i
)
17086 mask
|= (d
->perm
[i
] >= 16) << i
;
17090 /* Use vpblendd. */
17091 for (i
= 0; i
< 8; ++i
)
17092 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
17097 /* Use vpblendd. */
17098 for (i
= 0; i
< 4; ++i
)
17099 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
17104 gcc_unreachable ();
17127 if (mmode
!= VOIDmode
)
17128 maskop
= force_reg (mmode
, gen_int_mode (mask
, mmode
));
17130 maskop
= GEN_INT (mask
);
17132 /* This matches five different patterns with the different modes. */
17133 x
= gen_rtx_VEC_MERGE (vmode
, op1
, op0
, maskop
);
17134 x
= gen_rtx_SET (target
, x
);
17136 if (target
!= d
->target
)
17137 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17142 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17143 in terms of the variable form of vpermilps.
17145 Note that we will have already failed the immediate input vpermilps,
17146 which requires that the high and low part shuffle be identical; the
17147 variable form doesn't require that. */
17150 expand_vec_perm_vpermil (struct expand_vec_perm_d
*d
)
17152 rtx rperm
[8], vperm
;
17155 if (!TARGET_AVX
|| d
->vmode
!= V8SFmode
|| !d
->one_operand_p
)
17158 /* We can only permute within the 128-bit lane. */
17159 for (i
= 0; i
< 8; ++i
)
17161 unsigned e
= d
->perm
[i
];
17162 if (i
< 4 ? e
>= 4 : e
< 4)
17169 for (i
= 0; i
< 8; ++i
)
17171 unsigned e
= d
->perm
[i
];
17173 /* Within each 128-bit lane, the elements of op0 are numbered
17174 from 0 and the elements of op1 are numbered from 4. */
17180 rperm
[i
] = GEN_INT (e
);
17183 vperm
= gen_rtx_CONST_VECTOR (V8SImode
, gen_rtvec_v (8, rperm
));
17184 vperm
= force_reg (V8SImode
, vperm
);
17185 emit_insn (gen_avx_vpermilvarv8sf3 (d
->target
, d
->op0
, vperm
));
17190 /* Return true if permutation D can be performed as VMODE permutation
17194 valid_perm_using_mode_p (machine_mode vmode
, struct expand_vec_perm_d
*d
)
17196 unsigned int i
, j
, chunk
;
17198 if (GET_MODE_CLASS (vmode
) != MODE_VECTOR_INT
17199 || GET_MODE_CLASS (d
->vmode
) != MODE_VECTOR_INT
17200 || GET_MODE_SIZE (vmode
) != GET_MODE_SIZE (d
->vmode
))
17203 if (GET_MODE_NUNITS (vmode
) >= d
->nelt
)
17206 chunk
= d
->nelt
/ GET_MODE_NUNITS (vmode
);
17207 for (i
= 0; i
< d
->nelt
; i
+= chunk
)
17208 if (d
->perm
[i
] & (chunk
- 1))
17211 for (j
= 1; j
< chunk
; ++j
)
17212 if (d
->perm
[i
] + j
!= d
->perm
[i
+ j
])
17218 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17219 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
17222 expand_vec_perm_pshufb (struct expand_vec_perm_d
*d
)
17224 unsigned i
, nelt
, eltsz
, mask
;
17225 unsigned char perm
[64];
17226 machine_mode vmode
= V16QImode
;
17227 rtx rperm
[64], vperm
, target
, op0
, op1
;
17231 if (!d
->one_operand_p
)
17233 if (!TARGET_XOP
|| GET_MODE_SIZE (d
->vmode
) != 16)
17236 && valid_perm_using_mode_p (V2TImode
, d
))
17241 /* Use vperm2i128 insn. The pattern uses
17242 V4DImode instead of V2TImode. */
17243 target
= d
->target
;
17244 if (d
->vmode
!= V4DImode
)
17245 target
= gen_reg_rtx (V4DImode
);
17246 op0
= gen_lowpart (V4DImode
, d
->op0
);
17247 op1
= gen_lowpart (V4DImode
, d
->op1
);
17249 = GEN_INT ((d
->perm
[0] / (nelt
/ 2))
17250 | ((d
->perm
[nelt
/ 2] / (nelt
/ 2)) * 16));
17251 emit_insn (gen_avx2_permv2ti (target
, op0
, op1
, rperm
[0]));
17252 if (target
!= d
->target
)
17253 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17261 if (GET_MODE_SIZE (d
->vmode
) == 16)
17266 else if (GET_MODE_SIZE (d
->vmode
) == 32)
17271 /* V4DImode should be already handled through
17272 expand_vselect by vpermq instruction. */
17273 gcc_assert (d
->vmode
!= V4DImode
);
17276 if (d
->vmode
== V8SImode
17277 || d
->vmode
== V16HImode
17278 || d
->vmode
== V32QImode
)
17280 /* First see if vpermq can be used for
17281 V8SImode/V16HImode/V32QImode. */
17282 if (valid_perm_using_mode_p (V4DImode
, d
))
17284 for (i
= 0; i
< 4; i
++)
17285 perm
[i
] = (d
->perm
[i
* nelt
/ 4] * 4 / nelt
) & 3;
17288 target
= gen_reg_rtx (V4DImode
);
17289 if (expand_vselect (target
, gen_lowpart (V4DImode
, d
->op0
),
17292 emit_move_insn (d
->target
,
17293 gen_lowpart (d
->vmode
, target
));
17299 /* Next see if vpermd can be used. */
17300 if (valid_perm_using_mode_p (V8SImode
, d
))
17303 /* Or if vpermps can be used. */
17304 else if (d
->vmode
== V8SFmode
)
17307 if (vmode
== V32QImode
)
17309 /* vpshufb only works intra lanes, it is not
17310 possible to shuffle bytes in between the lanes. */
17311 for (i
= 0; i
< nelt
; ++i
)
17312 if ((d
->perm
[i
] ^ i
) & (nelt
/ 2))
17316 else if (GET_MODE_SIZE (d
->vmode
) == 64)
17318 if (!TARGET_AVX512BW
)
17321 /* If vpermq didn't work, vpshufb won't work either. */
17322 if (d
->vmode
== V8DFmode
|| d
->vmode
== V8DImode
)
17326 if (d
->vmode
== V16SImode
17327 || d
->vmode
== V32HImode
17328 || d
->vmode
== V64QImode
)
17330 /* First see if vpermq can be used for
17331 V16SImode/V32HImode/V64QImode. */
17332 if (valid_perm_using_mode_p (V8DImode
, d
))
17334 for (i
= 0; i
< 8; i
++)
17335 perm
[i
] = (d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7;
17338 target
= gen_reg_rtx (V8DImode
);
17339 if (expand_vselect (target
, gen_lowpart (V8DImode
, d
->op0
),
17342 emit_move_insn (d
->target
,
17343 gen_lowpart (d
->vmode
, target
));
17349 /* Next see if vpermd can be used. */
17350 if (valid_perm_using_mode_p (V16SImode
, d
))
17353 /* Or if vpermps can be used. */
17354 else if (d
->vmode
== V16SFmode
)
17356 if (vmode
== V64QImode
)
17358 /* vpshufb only works intra lanes, it is not
17359 possible to shuffle bytes in between the lanes. */
17360 for (i
= 0; i
< nelt
; ++i
)
17361 if ((d
->perm
[i
] ^ i
) & (3 * nelt
/ 4))
17372 if (vmode
== V8SImode
)
17373 for (i
= 0; i
< 8; ++i
)
17374 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7);
17375 else if (vmode
== V16SImode
)
17376 for (i
= 0; i
< 16; ++i
)
17377 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 16] * 16 / nelt
) & 15);
17380 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
17381 if (!d
->one_operand_p
)
17382 mask
= 2 * nelt
- 1;
17383 else if (vmode
== V16QImode
)
17385 else if (vmode
== V64QImode
)
17386 mask
= nelt
/ 4 - 1;
17388 mask
= nelt
/ 2 - 1;
17390 for (i
= 0; i
< nelt
; ++i
)
17392 unsigned j
, e
= d
->perm
[i
] & mask
;
17393 for (j
= 0; j
< eltsz
; ++j
)
17394 rperm
[i
* eltsz
+ j
] = GEN_INT (e
* eltsz
+ j
);
17398 vperm
= gen_rtx_CONST_VECTOR (vmode
,
17399 gen_rtvec_v (GET_MODE_NUNITS (vmode
), rperm
));
17400 vperm
= force_reg (vmode
, vperm
);
17402 target
= d
->target
;
17403 if (d
->vmode
!= vmode
)
17404 target
= gen_reg_rtx (vmode
);
17405 op0
= gen_lowpart (vmode
, d
->op0
);
17406 if (d
->one_operand_p
)
17408 if (vmode
== V16QImode
)
17409 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, vperm
));
17410 else if (vmode
== V32QImode
)
17411 emit_insn (gen_avx2_pshufbv32qi3 (target
, op0
, vperm
));
17412 else if (vmode
== V64QImode
)
17413 emit_insn (gen_avx512bw_pshufbv64qi3 (target
, op0
, vperm
));
17414 else if (vmode
== V8SFmode
)
17415 emit_insn (gen_avx2_permvarv8sf (target
, op0
, vperm
));
17416 else if (vmode
== V8SImode
)
17417 emit_insn (gen_avx2_permvarv8si (target
, op0
, vperm
));
17418 else if (vmode
== V16SFmode
)
17419 emit_insn (gen_avx512f_permvarv16sf (target
, op0
, vperm
));
17420 else if (vmode
== V16SImode
)
17421 emit_insn (gen_avx512f_permvarv16si (target
, op0
, vperm
));
17423 gcc_unreachable ();
17427 op1
= gen_lowpart (vmode
, d
->op1
);
17428 emit_insn (gen_xop_pperm (target
, op0
, op1
, vperm
));
17430 if (target
!= d
->target
)
17431 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
17436 /* For V*[QHS]Imode permutations, check if the same permutation
17437 can't be performed in a 2x, 4x or 8x wider inner mode. */
17440 canonicalize_vector_int_perm (const struct expand_vec_perm_d
*d
,
17441 struct expand_vec_perm_d
*nd
)
17444 machine_mode mode
= VOIDmode
;
17448 case E_V16QImode
: mode
= V8HImode
; break;
17449 case E_V32QImode
: mode
= V16HImode
; break;
17450 case E_V64QImode
: mode
= V32HImode
; break;
17451 case E_V8HImode
: mode
= V4SImode
; break;
17452 case E_V16HImode
: mode
= V8SImode
; break;
17453 case E_V32HImode
: mode
= V16SImode
; break;
17454 case E_V4SImode
: mode
= V2DImode
; break;
17455 case E_V8SImode
: mode
= V4DImode
; break;
17456 case E_V16SImode
: mode
= V8DImode
; break;
17457 default: return false;
17459 for (i
= 0; i
< d
->nelt
; i
+= 2)
17460 if ((d
->perm
[i
] & 1) || d
->perm
[i
+ 1] != d
->perm
[i
] + 1)
17463 nd
->nelt
= d
->nelt
/ 2;
17464 for (i
= 0; i
< nd
->nelt
; i
++)
17465 nd
->perm
[i
] = d
->perm
[2 * i
] / 2;
17466 if (GET_MODE_INNER (mode
) != DImode
)
17467 canonicalize_vector_int_perm (nd
, nd
);
17470 nd
->one_operand_p
= d
->one_operand_p
;
17471 nd
->testing_p
= d
->testing_p
;
17472 if (d
->op0
== d
->op1
)
17473 nd
->op0
= nd
->op1
= gen_lowpart (nd
->vmode
, d
->op0
);
17476 nd
->op0
= gen_lowpart (nd
->vmode
, d
->op0
);
17477 nd
->op1
= gen_lowpart (nd
->vmode
, d
->op1
);
17480 nd
->target
= gen_raw_REG (nd
->vmode
, LAST_VIRTUAL_REGISTER
+ 1);
17482 nd
->target
= gen_reg_rtx (nd
->vmode
);
17487 /* Try to expand one-operand permutation with constant mask. */
17490 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d
*d
)
17492 machine_mode mode
= GET_MODE (d
->op0
);
17493 machine_mode maskmode
= mode
;
17494 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
17495 rtx target
, op0
, mask
;
17498 if (!rtx_equal_p (d
->op0
, d
->op1
))
17501 if (!TARGET_AVX512F
)
17507 gen
= gen_avx512f_permvarv16si
;
17510 gen
= gen_avx512f_permvarv16sf
;
17511 maskmode
= V16SImode
;
17514 gen
= gen_avx512f_permvarv8di
;
17517 gen
= gen_avx512f_permvarv8df
;
17518 maskmode
= V8DImode
;
17524 target
= d
->target
;
17526 for (int i
= 0; i
< d
->nelt
; ++i
)
17527 vec
[i
] = GEN_INT (d
->perm
[i
]);
17528 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
17529 emit_insn (gen (target
, op0
, force_reg (maskmode
, mask
)));
17533 static bool expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool);
17535 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
17536 in a single instruction. */
17539 expand_vec_perm_1 (struct expand_vec_perm_d
*d
)
17541 unsigned i
, nelt
= d
->nelt
;
17542 struct expand_vec_perm_d nd
;
17544 /* Check plain VEC_SELECT first, because AVX has instructions that could
17545 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17546 input where SEL+CONCAT may not. */
17547 if (d
->one_operand_p
)
17549 int mask
= nelt
- 1;
17550 bool identity_perm
= true;
17551 bool broadcast_perm
= true;
17553 for (i
= 0; i
< nelt
; i
++)
17555 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17556 if (nd
.perm
[i
] != i
)
17557 identity_perm
= false;
17559 broadcast_perm
= false;
17565 emit_move_insn (d
->target
, d
->op0
);
17568 else if (broadcast_perm
&& TARGET_AVX2
)
17570 /* Use vpbroadcast{b,w,d}. */
17571 rtx (*gen
) (rtx
, rtx
) = NULL
;
17575 if (TARGET_AVX512BW
)
17576 gen
= gen_avx512bw_vec_dupv64qi_1
;
17579 gen
= gen_avx2_pbroadcastv32qi_1
;
17582 if (TARGET_AVX512BW
)
17583 gen
= gen_avx512bw_vec_dupv32hi_1
;
17586 gen
= gen_avx2_pbroadcastv16hi_1
;
17589 if (TARGET_AVX512F
)
17590 gen
= gen_avx512f_vec_dupv16si_1
;
17593 gen
= gen_avx2_pbroadcastv8si_1
;
17596 gen
= gen_avx2_pbroadcastv16qi
;
17599 gen
= gen_avx2_pbroadcastv8hi
;
17602 if (TARGET_AVX512F
)
17603 gen
= gen_avx512f_vec_dupv16sf_1
;
17606 gen
= gen_avx2_vec_dupv8sf_1
;
17609 if (TARGET_AVX512F
)
17610 gen
= gen_avx512f_vec_dupv8df_1
;
17613 if (TARGET_AVX512F
)
17614 gen
= gen_avx512f_vec_dupv8di_1
;
17616 /* For other modes prefer other shuffles this function creates. */
17622 emit_insn (gen (d
->target
, d
->op0
));
17627 if (expand_vselect (d
->target
, d
->op0
, nd
.perm
, nelt
, d
->testing_p
))
17630 /* There are plenty of patterns in sse.md that are written for
17631 SEL+CONCAT and are not replicated for a single op. Perhaps
17632 that should be changed, to avoid the nastiness here. */
17634 /* Recognize interleave style patterns, which means incrementing
17635 every other permutation operand. */
17636 for (i
= 0; i
< nelt
; i
+= 2)
17638 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17639 nd
.perm
[i
+ 1] = (d
->perm
[i
+ 1] & mask
) + nelt
;
17641 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17645 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17648 for (i
= 0; i
< nelt
; i
+= 4)
17650 nd
.perm
[i
+ 0] = d
->perm
[i
+ 0] & mask
;
17651 nd
.perm
[i
+ 1] = d
->perm
[i
+ 1] & mask
;
17652 nd
.perm
[i
+ 2] = (d
->perm
[i
+ 2] & mask
) + nelt
;
17653 nd
.perm
[i
+ 3] = (d
->perm
[i
+ 3] & mask
) + nelt
;
17656 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17662 /* Try movss/movsd instructions. */
17663 if (expand_vec_perm_movs (d
))
17666 /* Finally, try the fully general two operand permute. */
17667 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op1
, d
->perm
, nelt
,
17671 /* Recognize interleave style patterns with reversed operands. */
17672 if (!d
->one_operand_p
)
17674 for (i
= 0; i
< nelt
; ++i
)
17676 unsigned e
= d
->perm
[i
];
17684 if (expand_vselect_vconcat (d
->target
, d
->op1
, d
->op0
, nd
.perm
, nelt
,
17689 /* Try the SSE4.1 blend variable merge instructions. */
17690 if (expand_vec_perm_blend (d
))
17693 /* Try one of the AVX vpermil variable permutations. */
17694 if (expand_vec_perm_vpermil (d
))
17697 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17698 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17699 if (expand_vec_perm_pshufb (d
))
17702 /* Try the AVX2 vpalignr instruction. */
17703 if (expand_vec_perm_palignr (d
, true))
17706 /* Try the AVX512F vperm{s,d} instructions. */
17707 if (ix86_expand_vec_one_operand_perm_avx512 (d
))
17710 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17711 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX
, NULL_RTX
, NULL_RTX
, NULL_RTX
, d
))
17714 /* See if we can get the same permutation in different vector integer
17716 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
17719 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
17725 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
17726 in terms of a pair of pshuflw + pshufhw instructions. */
17729 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d
*d
)
17731 unsigned char perm2
[MAX_VECT_LEN
];
17735 if (d
->vmode
!= V8HImode
|| !d
->one_operand_p
)
17738 /* The two permutations only operate in 64-bit lanes. */
17739 for (i
= 0; i
< 4; ++i
)
17740 if (d
->perm
[i
] >= 4)
17742 for (i
= 4; i
< 8; ++i
)
17743 if (d
->perm
[i
] < 4)
17749 /* Emit the pshuflw. */
17750 memcpy (perm2
, d
->perm
, 4);
17751 for (i
= 4; i
< 8; ++i
)
17753 ok
= expand_vselect (d
->target
, d
->op0
, perm2
, 8, d
->testing_p
);
17756 /* Emit the pshufhw. */
17757 memcpy (perm2
+ 4, d
->perm
+ 4, 4);
17758 for (i
= 0; i
< 4; ++i
)
17760 ok
= expand_vselect (d
->target
, d
->target
, perm2
, 8, d
->testing_p
);
17766 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17767 the permutation using the SSSE3 palignr instruction. This succeeds
17768 when all of the elements in PERM fit within one vector and we merely
17769 need to shift them down so that a single vector permutation has a
17770 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
17771 the vpalignr instruction itself can perform the requested permutation. */
17774 expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool single_insn_only_p
)
17776 unsigned i
, nelt
= d
->nelt
;
17777 unsigned min
, max
, minswap
, maxswap
;
17778 bool in_order
, ok
, swap
= false;
17780 struct expand_vec_perm_d dcopy
;
17782 /* Even with AVX, palignr only operates on 128-bit vectors,
17783 in AVX2 palignr operates on both 128-bit lanes. */
17784 if ((!TARGET_SSSE3
|| GET_MODE_SIZE (d
->vmode
) != 16)
17785 && (!TARGET_AVX2
|| GET_MODE_SIZE (d
->vmode
) != 32))
17790 minswap
= 2 * nelt
;
17792 for (i
= 0; i
< nelt
; ++i
)
17794 unsigned e
= d
->perm
[i
];
17795 unsigned eswap
= d
->perm
[i
] ^ nelt
;
17796 if (GET_MODE_SIZE (d
->vmode
) == 32)
17798 e
= (e
& ((nelt
/ 2) - 1)) | ((e
& nelt
) >> 1);
17799 eswap
= e
^ (nelt
/ 2);
17805 if (eswap
< minswap
)
17807 if (eswap
> maxswap
)
17811 || max
- min
>= (GET_MODE_SIZE (d
->vmode
) == 32 ? nelt
/ 2 : nelt
))
17813 if (d
->one_operand_p
17815 || maxswap
- minswap
>= (GET_MODE_SIZE (d
->vmode
) == 32
17816 ? nelt
/ 2 : nelt
))
17823 /* Given that we have SSSE3, we know we'll be able to implement the
17824 single operand permutation after the palignr with pshufb for
17825 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17827 if (d
->testing_p
&& GET_MODE_SIZE (d
->vmode
) == 16 && !single_insn_only_p
)
17833 dcopy
.op0
= d
->op1
;
17834 dcopy
.op1
= d
->op0
;
17835 for (i
= 0; i
< nelt
; ++i
)
17836 dcopy
.perm
[i
] ^= nelt
;
17840 for (i
= 0; i
< nelt
; ++i
)
17842 unsigned e
= dcopy
.perm
[i
];
17843 if (GET_MODE_SIZE (d
->vmode
) == 32
17845 && (e
& (nelt
/ 2 - 1)) < min
)
17846 e
= e
- min
- (nelt
/ 2);
17853 dcopy
.one_operand_p
= true;
17855 if (single_insn_only_p
&& !in_order
)
17858 /* For AVX2, test whether we can permute the result in one instruction. */
17863 dcopy
.op1
= dcopy
.op0
;
17864 return expand_vec_perm_1 (&dcopy
);
17867 shift
= GEN_INT (min
* GET_MODE_UNIT_BITSIZE (d
->vmode
));
17868 if (GET_MODE_SIZE (d
->vmode
) == 16)
17870 target
= gen_reg_rtx (TImode
);
17871 emit_insn (gen_ssse3_palignrti (target
, gen_lowpart (TImode
, dcopy
.op1
),
17872 gen_lowpart (TImode
, dcopy
.op0
), shift
));
17876 target
= gen_reg_rtx (V2TImode
);
17877 emit_insn (gen_avx2_palignrv2ti (target
,
17878 gen_lowpart (V2TImode
, dcopy
.op1
),
17879 gen_lowpart (V2TImode
, dcopy
.op0
),
17883 dcopy
.op0
= dcopy
.op1
= gen_lowpart (d
->vmode
, target
);
17885 /* Test for the degenerate case where the alignment by itself
17886 produces the desired permutation. */
17889 emit_move_insn (d
->target
, dcopy
.op0
);
17893 ok
= expand_vec_perm_1 (&dcopy
);
17894 gcc_assert (ok
|| GET_MODE_SIZE (d
->vmode
) == 32);
17899 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17900 the permutation using the SSE4_1 pblendv instruction. Potentially
17901 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
17904 expand_vec_perm_pblendv (struct expand_vec_perm_d
*d
)
17906 unsigned i
, which
, nelt
= d
->nelt
;
17907 struct expand_vec_perm_d dcopy
, dcopy1
;
17908 machine_mode vmode
= d
->vmode
;
17911 /* Use the same checks as in expand_vec_perm_blend. */
17912 if (d
->one_operand_p
)
17914 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
17916 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
17918 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
17923 /* Figure out where permutation elements stay not in their
17924 respective lanes. */
17925 for (i
= 0, which
= 0; i
< nelt
; ++i
)
17927 unsigned e
= d
->perm
[i
];
17929 which
|= (e
< nelt
? 1 : 2);
17931 /* We can pblend the part where elements stay not in their
17932 respective lanes only when these elements are all in one
17933 half of a permutation.
17934 {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
17935 lanes, but both 8 and 9 >= 8
17936 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
17937 respective lanes and 8 >= 8, but 2 not. */
17938 if (which
!= 1 && which
!= 2)
17940 if (d
->testing_p
&& GET_MODE_SIZE (vmode
) == 16)
17943 /* First we apply one operand permutation to the part where
17944 elements stay not in their respective lanes. */
17947 dcopy
.op0
= dcopy
.op1
= d
->op1
;
17949 dcopy
.op0
= dcopy
.op1
= d
->op0
;
17951 dcopy
.target
= gen_reg_rtx (vmode
);
17952 dcopy
.one_operand_p
= true;
17954 for (i
= 0; i
< nelt
; ++i
)
17955 dcopy
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
17957 ok
= expand_vec_perm_1 (&dcopy
);
17958 if (GET_MODE_SIZE (vmode
) != 16 && !ok
)
17965 /* Next we put permuted elements into their positions. */
17968 dcopy1
.op1
= dcopy
.target
;
17970 dcopy1
.op0
= dcopy
.target
;
17972 for (i
= 0; i
< nelt
; ++i
)
17973 dcopy1
.perm
[i
] = ((d
->perm
[i
] >= nelt
) ? (nelt
+ i
) : i
);
17975 ok
= expand_vec_perm_blend (&dcopy1
);
17981 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
);
17983 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17984 a two vector permutation into a single vector permutation by using
17985 an interleave operation to merge the vectors. */
17988 expand_vec_perm_interleave2 (struct expand_vec_perm_d
*d
)
17990 struct expand_vec_perm_d dremap
, dfinal
;
17991 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
17992 unsigned HOST_WIDE_INT contents
;
17993 unsigned char remap
[2 * MAX_VECT_LEN
];
17995 bool ok
, same_halves
= false;
17997 if (GET_MODE_SIZE (d
->vmode
) == 16)
17999 if (d
->one_operand_p
)
18002 else if (GET_MODE_SIZE (d
->vmode
) == 32)
18006 /* For 32-byte modes allow even d->one_operand_p.
18007 The lack of cross-lane shuffling in some instructions
18008 might prevent a single insn shuffle. */
18010 dfinal
.testing_p
= true;
18011 /* If expand_vec_perm_interleave3 can expand this into
18012 a 3 insn sequence, give up and let it be expanded as
18013 3 insn sequence. While that is one insn longer,
18014 it doesn't need a memory operand and in the common
18015 case that both interleave low and high permutations
18016 with the same operands are adjacent needs 4 insns
18017 for both after CSE. */
18018 if (expand_vec_perm_interleave3 (&dfinal
))
18024 /* Examine from whence the elements come. */
18026 for (i
= 0; i
< nelt
; ++i
)
18027 contents
|= HOST_WIDE_INT_1U
<< d
->perm
[i
];
18029 memset (remap
, 0xff, sizeof (remap
));
18032 if (GET_MODE_SIZE (d
->vmode
) == 16)
18034 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
18036 /* Split the two input vectors into 4 halves. */
18037 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
18042 /* If the elements from the low halves use interleave low, and similarly
18043 for interleave high. If the elements are from mis-matched halves, we
18044 can use shufps for V4SF/V4SI or do a DImode shuffle. */
18045 if ((contents
& (h1
| h3
)) == contents
)
18048 for (i
= 0; i
< nelt2
; ++i
)
18051 remap
[i
+ nelt
] = i
* 2 + 1;
18052 dremap
.perm
[i
* 2] = i
;
18053 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
18055 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
18056 dremap
.vmode
= V4SFmode
;
18058 else if ((contents
& (h2
| h4
)) == contents
)
18061 for (i
= 0; i
< nelt2
; ++i
)
18063 remap
[i
+ nelt2
] = i
* 2;
18064 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
18065 dremap
.perm
[i
* 2] = i
+ nelt2
;
18066 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
18068 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
18069 dremap
.vmode
= V4SFmode
;
18071 else if ((contents
& (h1
| h4
)) == contents
)
18074 for (i
= 0; i
< nelt2
; ++i
)
18077 remap
[i
+ nelt
+ nelt2
] = i
+ nelt2
;
18078 dremap
.perm
[i
] = i
;
18079 dremap
.perm
[i
+ nelt2
] = i
+ nelt
+ nelt2
;
18084 dremap
.vmode
= V2DImode
;
18086 dremap
.perm
[0] = 0;
18087 dremap
.perm
[1] = 3;
18090 else if ((contents
& (h2
| h3
)) == contents
)
18093 for (i
= 0; i
< nelt2
; ++i
)
18095 remap
[i
+ nelt2
] = i
;
18096 remap
[i
+ nelt
] = i
+ nelt2
;
18097 dremap
.perm
[i
] = i
+ nelt2
;
18098 dremap
.perm
[i
+ nelt2
] = i
+ nelt
;
18103 dremap
.vmode
= V2DImode
;
18105 dremap
.perm
[0] = 1;
18106 dremap
.perm
[1] = 2;
18114 unsigned int nelt4
= nelt
/ 4, nzcnt
= 0;
18115 unsigned HOST_WIDE_INT q
[8];
18116 unsigned int nonzero_halves
[4];
18118 /* Split the two input vectors into 8 quarters. */
18119 q
[0] = (HOST_WIDE_INT_1U
<< nelt4
) - 1;
18120 for (i
= 1; i
< 8; ++i
)
18121 q
[i
] = q
[0] << (nelt4
* i
);
18122 for (i
= 0; i
< 4; ++i
)
18123 if (((q
[2 * i
] | q
[2 * i
+ 1]) & contents
) != 0)
18125 nonzero_halves
[nzcnt
] = i
;
18131 gcc_assert (d
->one_operand_p
);
18132 nonzero_halves
[1] = nonzero_halves
[0];
18133 same_halves
= true;
18135 else if (d
->one_operand_p
)
18137 gcc_assert (nonzero_halves
[0] == 0);
18138 gcc_assert (nonzero_halves
[1] == 1);
18143 if (d
->perm
[0] / nelt2
== nonzero_halves
[1])
18145 /* Attempt to increase the likelihood that dfinal
18146 shuffle will be intra-lane. */
18147 std::swap (nonzero_halves
[0], nonzero_halves
[1]);
18150 /* vperm2f128 or vperm2i128. */
18151 for (i
= 0; i
< nelt2
; ++i
)
18153 remap
[i
+ nonzero_halves
[1] * nelt2
] = i
+ nelt2
;
18154 remap
[i
+ nonzero_halves
[0] * nelt2
] = i
;
18155 dremap
.perm
[i
+ nelt2
] = i
+ nonzero_halves
[1] * nelt2
;
18156 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * nelt2
;
18159 if (d
->vmode
!= V8SFmode
18160 && d
->vmode
!= V4DFmode
18161 && d
->vmode
!= V8SImode
)
18163 dremap
.vmode
= V8SImode
;
18165 for (i
= 0; i
< 4; ++i
)
18167 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * 4;
18168 dremap
.perm
[i
+ 4] = i
+ nonzero_halves
[1] * 4;
18172 else if (d
->one_operand_p
)
18174 else if (TARGET_AVX2
18175 && (contents
& (q
[0] | q
[2] | q
[4] | q
[6])) == contents
)
18178 for (i
= 0; i
< nelt4
; ++i
)
18181 remap
[i
+ nelt
] = i
* 2 + 1;
18182 remap
[i
+ nelt2
] = i
* 2 + nelt2
;
18183 remap
[i
+ nelt
+ nelt2
] = i
* 2 + nelt2
+ 1;
18184 dremap
.perm
[i
* 2] = i
;
18185 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
18186 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
;
18187 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
;
18190 else if (TARGET_AVX2
18191 && (contents
& (q
[1] | q
[3] | q
[5] | q
[7])) == contents
)
18194 for (i
= 0; i
< nelt4
; ++i
)
18196 remap
[i
+ nelt4
] = i
* 2;
18197 remap
[i
+ nelt
+ nelt4
] = i
* 2 + 1;
18198 remap
[i
+ nelt2
+ nelt4
] = i
* 2 + nelt2
;
18199 remap
[i
+ nelt
+ nelt2
+ nelt4
] = i
* 2 + nelt2
+ 1;
18200 dremap
.perm
[i
* 2] = i
+ nelt4
;
18201 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt4
;
18202 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
+ nelt4
;
18203 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
+ nelt4
;
18210 /* Use the remapping array set up above to move the elements from their
18211 swizzled locations into their final destinations. */
18213 for (i
= 0; i
< nelt
; ++i
)
18215 unsigned e
= remap
[d
->perm
[i
]];
18216 gcc_assert (e
< nelt
);
18217 /* If same_halves is true, both halves of the remapped vector are the
18218 same. Avoid cross-lane accesses if possible. */
18219 if (same_halves
&& i
>= nelt2
)
18221 gcc_assert (e
< nelt2
);
18222 dfinal
.perm
[i
] = e
+ nelt2
;
18225 dfinal
.perm
[i
] = e
;
18229 dremap
.target
= gen_reg_rtx (dremap
.vmode
);
18230 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
18232 dfinal
.op1
= dfinal
.op0
;
18233 dfinal
.one_operand_p
= true;
18235 /* Test if the final remap can be done with a single insn. For V4SFmode or
18236 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
18238 ok
= expand_vec_perm_1 (&dfinal
);
18239 seq
= get_insns ();
18248 if (dremap
.vmode
!= dfinal
.vmode
)
18250 dremap
.op0
= gen_lowpart (dremap
.vmode
, dremap
.op0
);
18251 dremap
.op1
= gen_lowpart (dremap
.vmode
, dremap
.op1
);
18254 ok
= expand_vec_perm_1 (&dremap
);
18261 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18262 a single vector cross-lane permutation into vpermq followed
18263 by any of the single insn permutations. */
18266 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d
*d
)
18268 struct expand_vec_perm_d dremap
, dfinal
;
18269 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, nelt4
= nelt
/ 4;
18270 unsigned contents
[2];
18274 && (d
->vmode
== V32QImode
|| d
->vmode
== V16HImode
)
18275 && d
->one_operand_p
))
18280 for (i
= 0; i
< nelt2
; ++i
)
18282 contents
[0] |= 1u << (d
->perm
[i
] / nelt4
);
18283 contents
[1] |= 1u << (d
->perm
[i
+ nelt2
] / nelt4
);
18286 for (i
= 0; i
< 2; ++i
)
18288 unsigned int cnt
= 0;
18289 for (j
= 0; j
< 4; ++j
)
18290 if ((contents
[i
] & (1u << j
)) != 0 && ++cnt
> 2)
18298 dremap
.vmode
= V4DImode
;
18300 dremap
.target
= gen_reg_rtx (V4DImode
);
18301 dremap
.op0
= gen_lowpart (V4DImode
, d
->op0
);
18302 dremap
.op1
= dremap
.op0
;
18303 dremap
.one_operand_p
= true;
18304 for (i
= 0; i
< 2; ++i
)
18306 unsigned int cnt
= 0;
18307 for (j
= 0; j
< 4; ++j
)
18308 if ((contents
[i
] & (1u << j
)) != 0)
18309 dremap
.perm
[2 * i
+ cnt
++] = j
;
18310 for (; cnt
< 2; ++cnt
)
18311 dremap
.perm
[2 * i
+ cnt
] = 0;
18315 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
18316 dfinal
.op1
= dfinal
.op0
;
18317 dfinal
.one_operand_p
= true;
18318 for (i
= 0, j
= 0; i
< nelt
; ++i
)
18322 dfinal
.perm
[i
] = (d
->perm
[i
] & (nelt4
- 1)) | (j
? nelt2
: 0);
18323 if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
])
18325 else if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
+ 1])
18326 dfinal
.perm
[i
] |= nelt4
;
18328 gcc_unreachable ();
18331 ok
= expand_vec_perm_1 (&dremap
);
18334 ok
= expand_vec_perm_1 (&dfinal
);
18340 static bool canonicalize_perm (struct expand_vec_perm_d
*d
);
18342 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
18343 a vector permutation using two instructions, vperm2f128 resp.
18344 vperm2i128 followed by any single in-lane permutation. */
18347 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d
*d
)
18349 struct expand_vec_perm_d dfirst
, dsecond
;
18350 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, perm
;
18354 || GET_MODE_SIZE (d
->vmode
) != 32
18355 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
))
18359 dsecond
.one_operand_p
= false;
18360 dsecond
.testing_p
= true;
18362 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
18363 immediate. For perm < 16 the second permutation uses
18364 d->op0 as first operand, for perm >= 16 it uses d->op1
18365 as first operand. The second operand is the result of
18367 for (perm
= 0; perm
< 32; perm
++)
18369 /* Ignore permutations which do not move anything cross-lane. */
18372 /* The second shuffle for e.g. V4DFmode has
18373 0123 and ABCD operands.
18374 Ignore AB23, as 23 is already in the second lane
18375 of the first operand. */
18376 if ((perm
& 0xc) == (1 << 2)) continue;
18377 /* And 01CD, as 01 is in the first lane of the first
18379 if ((perm
& 3) == 0) continue;
18380 /* And 4567, as then the vperm2[fi]128 doesn't change
18381 anything on the original 4567 second operand. */
18382 if ((perm
& 0xf) == ((3 << 2) | 2)) continue;
18386 /* The second shuffle for e.g. V4DFmode has
18387 4567 and ABCD operands.
18388 Ignore AB67, as 67 is already in the second lane
18389 of the first operand. */
18390 if ((perm
& 0xc) == (3 << 2)) continue;
18391 /* And 45CD, as 45 is in the first lane of the first
18393 if ((perm
& 3) == 2) continue;
18394 /* And 0123, as then the vperm2[fi]128 doesn't change
18395 anything on the original 0123 first operand. */
18396 if ((perm
& 0xf) == (1 << 2)) continue;
18399 for (i
= 0; i
< nelt
; i
++)
18401 j
= d
->perm
[i
] / nelt2
;
18402 if (j
== ((perm
>> (2 * (i
>= nelt2
))) & 3))
18403 dsecond
.perm
[i
] = nelt
+ (i
& nelt2
) + (d
->perm
[i
] & (nelt2
- 1));
18404 else if (j
== (unsigned) (i
>= nelt2
) + 2 * (perm
>= 16))
18405 dsecond
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
18413 ok
= expand_vec_perm_1 (&dsecond
);
18424 /* Found a usable second shuffle. dfirst will be
18425 vperm2f128 on d->op0 and d->op1. */
18426 dsecond
.testing_p
= false;
18428 dfirst
.target
= gen_reg_rtx (d
->vmode
);
18429 for (i
= 0; i
< nelt
; i
++)
18430 dfirst
.perm
[i
] = (i
& (nelt2
- 1))
18431 + ((perm
>> (2 * (i
>= nelt2
))) & 3) * nelt2
;
18433 canonicalize_perm (&dfirst
);
18434 ok
= expand_vec_perm_1 (&dfirst
);
18437 /* And dsecond is some single insn shuffle, taking
18438 d->op0 and result of vperm2f128 (if perm < 16) or
18439 d->op1 and result of vperm2f128 (otherwise). */
18441 dsecond
.op0
= dsecond
.op1
;
18442 dsecond
.op1
= dfirst
.target
;
18444 ok
= expand_vec_perm_1 (&dsecond
);
  /* For one operand, the only useful vperm2f128 permutation is 0x01
     aka lanes swap.  */
  if (d->one_operand_p)
    return expand_vec_perm_vperm2f128_vblend (d);

  return false;
}
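
/* Illustrative sketch (not part of GCC): the encoding searched by the loop
   above.  A vperm2[fi]128 control byte picks one of the four 128-bit lanes
   { op0.lo = 0, op0.hi = 1, op1.lo = 2, op1.hi = 3 } for each result half;
   the driver derives it from the 5-bit counter as ((perm << 2) | perm) & 0x33.
   The helper name is hypothetical.  */

static unsigned char
vperm2f128_imm_sketch (unsigned lo_lane, unsigned hi_lane)
{
  /* lo_lane selects the low 128 bits of the result, hi_lane the high 128
     bits; both are in 0..3.  */
  return (unsigned char) ((lo_lane & 3) | ((hi_lane & 3) << 4));
}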
18459 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
18460 a two vector permutation using 2 intra-lane interleave insns
18461 and cross-lane shuffle for 32-byte vectors. */
18464 expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
)
18467 rtx (*gen
) (rtx
, rtx
, rtx
);
18469 if (d
->one_operand_p
)
18471 if (TARGET_AVX2
&& GET_MODE_SIZE (d
->vmode
) == 32)
18473 else if (TARGET_AVX
&& (d
->vmode
== V8SFmode
|| d
->vmode
== V4DFmode
))
18479 if (d
->perm
[0] != 0 && d
->perm
[0] != nelt
/ 2)
18481 for (i
= 0; i
< nelt
; i
+= 2)
18482 if (d
->perm
[i
] != d
->perm
[0] + i
/ 2
18483 || d
->perm
[i
+ 1] != d
->perm
[0] + i
/ 2 + nelt
)
18493 gen
= gen_vec_interleave_highv32qi
;
18495 gen
= gen_vec_interleave_lowv32qi
;
18499 gen
= gen_vec_interleave_highv16hi
;
18501 gen
= gen_vec_interleave_lowv16hi
;
18505 gen
= gen_vec_interleave_highv8si
;
18507 gen
= gen_vec_interleave_lowv8si
;
18511 gen
= gen_vec_interleave_highv4di
;
18513 gen
= gen_vec_interleave_lowv4di
;
18517 gen
= gen_vec_interleave_highv8sf
;
18519 gen
= gen_vec_interleave_lowv8sf
;
18523 gen
= gen_vec_interleave_highv4df
;
18525 gen
= gen_vec_interleave_lowv4df
;
18528 gcc_unreachable ();
18531 emit_insn (gen (d
->target
, d
->op0
, d
->op1
));
18535 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18536 a single vector permutation using a single intra-lane vector
18537 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18538 the non-swapped and swapped vectors together. */
18541 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
18543 struct expand_vec_perm_d dfirst
, dsecond
;
18544 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
18547 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
18551 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
18552 || !d
->one_operand_p
)
18556 for (i
= 0; i
< nelt
; i
++)
18557 dfirst
.perm
[i
] = 0xff;
18558 for (i
= 0, msk
= 0; i
< nelt
; i
++)
18560 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
18561 if (dfirst
.perm
[j
] != 0xff && dfirst
.perm
[j
] != d
->perm
[i
])
18563 dfirst
.perm
[j
] = d
->perm
[i
];
18567 for (i
= 0; i
< nelt
; i
++)
18568 if (dfirst
.perm
[i
] == 0xff)
18569 dfirst
.perm
[i
] = i
;
18572 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18575 ok
= expand_vec_perm_1 (&dfirst
);
18576 seq
= get_insns ();
18588 dsecond
.op0
= dfirst
.target
;
18589 dsecond
.op1
= dfirst
.target
;
18590 dsecond
.one_operand_p
= true;
18591 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18592 for (i
= 0; i
< nelt
; i
++)
18593 dsecond
.perm
[i
] = i
^ nelt2
;
18595 ok
= expand_vec_perm_1 (&dsecond
);
18598 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
18599 emit_insn (blend (d
->target
, dfirst
.target
, dsecond
.target
, GEN_INT (msk
)));
18603 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
18604 permutation using two vperm2f128, followed by a vshufpd insn blending
18605 the two vectors together. */
18608 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d
*d
)
18610 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
18613 if (!TARGET_AVX
|| (d
->vmode
!= V4DFmode
))
18623 dfirst
.perm
[0] = (d
->perm
[0] & ~1);
18624 dfirst
.perm
[1] = (d
->perm
[0] & ~1) + 1;
18625 dfirst
.perm
[2] = (d
->perm
[2] & ~1);
18626 dfirst
.perm
[3] = (d
->perm
[2] & ~1) + 1;
18627 dsecond
.perm
[0] = (d
->perm
[1] & ~1);
18628 dsecond
.perm
[1] = (d
->perm
[1] & ~1) + 1;
18629 dsecond
.perm
[2] = (d
->perm
[3] & ~1);
18630 dsecond
.perm
[3] = (d
->perm
[3] & ~1) + 1;
18631 dthird
.perm
[0] = (d
->perm
[0] % 2);
18632 dthird
.perm
[1] = (d
->perm
[1] % 2) + 4;
18633 dthird
.perm
[2] = (d
->perm
[2] % 2) + 2;
18634 dthird
.perm
[3] = (d
->perm
[3] % 2) + 6;
18636 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18637 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18638 dthird
.op0
= dfirst
.target
;
18639 dthird
.op1
= dsecond
.target
;
18640 dthird
.one_operand_p
= false;
18642 canonicalize_perm (&dfirst
);
18643 canonicalize_perm (&dsecond
);
18645 ok
= expand_vec_perm_1 (&dfirst
)
18646 && expand_vec_perm_1 (&dsecond
)
18647 && expand_vec_perm_1 (&dthird
);
18654 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d
*);
18656 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
18657 a two vector permutation using two intra-lane vector
18658 permutations, vperm2f128 swapping the lanes and vblend* insn blending
18659 the non-swapped and swapped vectors together. */
18662 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
18664 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
18665 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, which1
= 0, which2
= 0;
18666 rtx_insn
*seq1
, *seq2
;
18668 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
18672 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
18673 || d
->one_operand_p
)
18678 for (i
= 0; i
< nelt
; i
++)
18680 dfirst
.perm
[i
] = 0xff;
18681 dsecond
.perm
[i
] = 0xff;
18683 for (i
= 0, msk
= 0; i
< nelt
; i
++)
18685 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
18688 dfirst
.perm
[j
] = d
->perm
[i
];
18689 which1
|= (d
->perm
[i
] < nelt
? 1 : 2);
18693 dsecond
.perm
[j
] = d
->perm
[i
];
18694 which2
|= (d
->perm
[i
] < nelt
? 1 : 2);
18698 if (msk
== 0 || msk
== (1U << nelt
) - 1)
18703 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18704 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18707 for (i
= 0; i
< nelt
; i
++)
18709 if (dfirst
.perm
[i
] == 0xff)
18710 dfirst
.perm
[i
] = (which1
== 2 ? i
+ nelt
: i
);
18711 if (dsecond
.perm
[i
] == 0xff)
18712 dsecond
.perm
[i
] = (which2
== 2 ? i
+ nelt
: i
);
18714 canonicalize_perm (&dfirst
);
18716 ok
= ix86_expand_vec_perm_const_1 (&dfirst
);
18717 seq1
= get_insns ();
18723 canonicalize_perm (&dsecond
);
18725 ok
= ix86_expand_vec_perm_const_1 (&dsecond
);
18726 seq2
= get_insns ();
18739 dthird
.op0
= dsecond
.target
;
18740 dthird
.op1
= dsecond
.target
;
18741 dthird
.one_operand_p
= true;
18742 dthird
.target
= gen_reg_rtx (dthird
.vmode
);
18743 for (i
= 0; i
< nelt
; i
++)
18744 dthird
.perm
[i
] = i
^ nelt2
;
18746 ok
= expand_vec_perm_1 (&dthird
);
18749 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
18750 emit_insn (blend (d
->target
, dfirst
.target
, dthird
.target
, GEN_INT (msk
)));
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op0);
  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op1);
  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));

  op = d->target;
  if (d->vmode != V16QImode)
    op = gen_reg_rtx (V16QImode);
  emit_insn (gen_iorv16qi3 (op, l, h));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
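
/* Illustrative sketch (not part of GCC): scalar model of the two-pshufb-
   plus-ior scheme above.  Each mask either selects a byte from its own
   operand or carries -128 (bit 7 set), which pshufb turns into a zero byte,
   so OR-ing the two shuffled vectors merges them.  The helper name is
   hypothetical.  */

static void
pshufb2_merge_sketch (const unsigned char *op0, const unsigned char *op1,
		      const unsigned char *perm /* 16 indices in 0..31 */,
		      unsigned char *dst)
{
  unsigned i;
  for (i = 0; i < 16; ++i)
    {
      unsigned e = perm[i];
      unsigned char lo = e < 16 ? op0[e] : 0;	    /* pshufb with rperm[0] */
      unsigned char hi = e >= 16 ? op1[e - 16] : 0; /* pshufb with rperm[1] */
      dst[i] = lo | hi;				    /* por */
    }
}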
18817 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
18818 with two vpshufb insns, vpermq and vpor. We should have already failed
18819 all two or three instruction sequences. */
18822 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d
*d
)
18824 rtx rperm
[2][32], vperm
, l
, h
, hp
, op
, m128
;
18825 unsigned int i
, nelt
, eltsz
;
18828 || !d
->one_operand_p
18829 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
18836 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
18838 /* Generate two permutation masks. If the required element is within
18839 the same lane, it is shuffled in. If the required element from the
18840 other lane, force a zero by setting bit 7 in the permutation mask.
18841 In the other mask the mask has non-negative elements if element
18842 is requested from the other lane, but also moved to the other lane,
18843 so that the result of vpshufb can have the two V2TImode halves
18845 m128
= GEN_INT (-128);
18846 for (i
= 0; i
< nelt
; ++i
)
18848 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
18849 unsigned which
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
18851 for (j
= 0; j
< eltsz
; ++j
)
18853 rperm
[!!which
][(i
* eltsz
+ j
) ^ which
] = GEN_INT (e
* eltsz
+ j
);
18854 rperm
[!which
][(i
* eltsz
+ j
) ^ (which
^ 16)] = m128
;
18858 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
18859 vperm
= force_reg (V32QImode
, vperm
);
18861 h
= gen_reg_rtx (V32QImode
);
18862 op
= gen_lowpart (V32QImode
, d
->op0
);
18863 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
18865 /* Swap the 128-byte lanes of h into hp. */
18866 hp
= gen_reg_rtx (V4DImode
);
18867 op
= gen_lowpart (V4DImode
, h
);
18868 emit_insn (gen_avx2_permv4di_1 (hp
, op
, const2_rtx
, GEN_INT (3), const0_rtx
,
18871 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
18872 vperm
= force_reg (V32QImode
, vperm
);
18874 l
= gen_reg_rtx (V32QImode
);
18875 op
= gen_lowpart (V32QImode
, d
->op0
);
18876 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
18879 if (d
->vmode
!= V32QImode
)
18880 op
= gen_reg_rtx (V32QImode
);
18881 emit_insn (gen_iorv32qi3 (op
, l
, gen_lowpart (V32QImode
, hp
)));
18882 if (op
!= d
->target
)
18883 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
18888 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18889 and extract-odd permutations of two V32QImode and V16QImode operand
18890 with two vpshufb insns, vpor and vpermq. We should have already
18891 failed all two or three instruction sequences. */
18894 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d
*d
)
18896 rtx rperm
[2][32], vperm
, l
, h
, ior
, op
, m128
;
18897 unsigned int i
, nelt
, eltsz
;
18900 || d
->one_operand_p
18901 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
18904 for (i
= 0; i
< d
->nelt
; ++i
)
18905 if ((d
->perm
[i
] ^ (i
* 2)) & (3 * d
->nelt
/ 2))
18912 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
18914 /* Generate two permutation masks. In the first permutation mask
18915 the first quarter will contain indexes for the first half
18916 of the op0, the second quarter will contain bit 7 set, third quarter
18917 will contain indexes for the second half of the op0 and the
18918 last quarter bit 7 set. In the second permutation mask
18919 the first quarter will contain bit 7 set, the second quarter
18920 indexes for the first half of the op1, the third quarter bit 7 set
18921 and last quarter indexes for the second half of the op1.
18922 I.e. the first mask e.g. for V32QImode extract even will be:
18923 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
18924 (all values masked with 0xf except for -128) and second mask
18925 for extract even will be
18926 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
18927 m128
= GEN_INT (-128);
18928 for (i
= 0; i
< nelt
; ++i
)
18930 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
18931 unsigned which
= d
->perm
[i
] >= nelt
;
18932 unsigned xorv
= (i
>= nelt
/ 4 && i
< 3 * nelt
/ 4) ? 24 : 0;
18934 for (j
= 0; j
< eltsz
; ++j
)
18936 rperm
[which
][(i
* eltsz
+ j
) ^ xorv
] = GEN_INT (e
* eltsz
+ j
);
18937 rperm
[1 - which
][(i
* eltsz
+ j
) ^ xorv
] = m128
;
18941 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
18942 vperm
= force_reg (V32QImode
, vperm
);
18944 l
= gen_reg_rtx (V32QImode
);
18945 op
= gen_lowpart (V32QImode
, d
->op0
);
18946 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
18948 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
18949 vperm
= force_reg (V32QImode
, vperm
);
18951 h
= gen_reg_rtx (V32QImode
);
18952 op
= gen_lowpart (V32QImode
, d
->op1
);
18953 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
18955 ior
= gen_reg_rtx (V32QImode
);
18956 emit_insn (gen_iorv32qi3 (ior
, l
, h
));
18958 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
18959 op
= gen_reg_rtx (V4DImode
);
18960 ior
= gen_lowpart (V4DImode
, ior
);
18961 emit_insn (gen_avx2_permv4di_1 (op
, ior
, const0_rtx
, const2_rtx
,
18962 const1_rtx
, GEN_INT (3)));
18963 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
18968 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
18969 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
18970 with two "and" and "pack" or two "shift" and "pack" insns. We should
18971 have already failed all two instruction sequences. */
18974 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d
*d
)
18976 rtx op
, dop0
, dop1
, t
;
18977 unsigned i
, odd
, c
, s
, nelt
= d
->nelt
;
18978 bool end_perm
= false;
18979 machine_mode half_mode
;
18980 rtx (*gen_and
) (rtx
, rtx
, rtx
);
18981 rtx (*gen_pack
) (rtx
, rtx
, rtx
);
18982 rtx (*gen_shift
) (rtx
, rtx
, rtx
);
18984 if (d
->one_operand_p
)
18990 /* Required for "pack". */
18991 if (!TARGET_SSE4_1
)
18995 half_mode
= V4SImode
;
18996 gen_and
= gen_andv4si3
;
18997 gen_pack
= gen_sse4_1_packusdw
;
18998 gen_shift
= gen_lshrv4si3
;
19001 /* No check as all instructions are SSE2. */
19004 half_mode
= V8HImode
;
19005 gen_and
= gen_andv8hi3
;
19006 gen_pack
= gen_sse2_packuswb
;
19007 gen_shift
= gen_lshrv8hi3
;
19014 half_mode
= V8SImode
;
19015 gen_and
= gen_andv8si3
;
19016 gen_pack
= gen_avx2_packusdw
;
19017 gen_shift
= gen_lshrv8si3
;
19025 half_mode
= V16HImode
;
19026 gen_and
= gen_andv16hi3
;
19027 gen_pack
= gen_avx2_packuswb
;
19028 gen_shift
= gen_lshrv16hi3
;
19032 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
19033 general shuffles. */
19037 /* Check that permutation is even or odd. */
19042 for (i
= 1; i
< nelt
; ++i
)
19043 if (d
->perm
[i
] != 2 * i
+ odd
)
19049 dop0
= gen_reg_rtx (half_mode
);
19050 dop1
= gen_reg_rtx (half_mode
);
19053 t
= gen_const_vec_duplicate (half_mode
, GEN_INT (c
));
19054 t
= force_reg (half_mode
, t
);
19055 emit_insn (gen_and (dop0
, t
, gen_lowpart (half_mode
, d
->op0
)));
19056 emit_insn (gen_and (dop1
, t
, gen_lowpart (half_mode
, d
->op1
)));
19060 emit_insn (gen_shift (dop0
,
19061 gen_lowpart (half_mode
, d
->op0
),
19063 emit_insn (gen_shift (dop1
,
19064 gen_lowpart (half_mode
, d
->op1
),
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx,
				      const2_rtx,
				      const1_rtx,
				      GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
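
/* Illustrative sketch (not part of GCC): word-level model of the and/shift
   plus pack sequence above, for extracting the even or odd bytes of two
   8-word (128-bit) inputs.  After masking (even) or shifting (odd) every
   word is at most 0xff, so the unsigned-saturating pack cannot clip.  The
   helper name is hypothetical.  */

static void
even_odd_pack_sketch (const unsigned short *a, const unsigned short *b,
		      int odd, unsigned char *dst /* 16 bytes */)
{
  unsigned short ta[8], tb[8];
  unsigned i;
  for (i = 0; i < 8; ++i)
    {
      ta[i] = odd ? (unsigned short) (a[i] >> 8) : (unsigned short) (a[i] & 0xff);
      tb[i] = odd ? (unsigned short) (b[i] >> 8) : (unsigned short) (b[i] & 0xff);
    }
  /* packuswb: low result half from the first source, high half from the
     second.  */
  for (i = 0; i < 8; ++i)
    {
      dst[i] = (unsigned char) ta[i];
      dst[i + 8] = (unsigned char) tb[i];
    }
}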
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insns for "odd"
   and two "truncs" and one concat insn for "even."
   Have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
19143 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
19144 and extract-odd permutations. */
19147 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d
*d
, unsigned odd
)
19149 rtx t1
, t2
, t3
, t4
, t5
;
19156 t1
= gen_reg_rtx (V4DFmode
);
19157 t2
= gen_reg_rtx (V4DFmode
);
19159 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19160 emit_insn (gen_avx_vperm2f128v4df3 (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
19161 emit_insn (gen_avx_vperm2f128v4df3 (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
19163 /* Now an unpck[lh]pd will produce the result required. */
19165 t3
= gen_avx_unpckhpd256 (d
->target
, t1
, t2
);
19167 t3
= gen_avx_unpcklpd256 (d
->target
, t1
, t2
);
19173 int mask
= odd
? 0xdd : 0x88;
19177 t1
= gen_reg_rtx (V8SFmode
);
19178 t2
= gen_reg_rtx (V8SFmode
);
19179 t3
= gen_reg_rtx (V8SFmode
);
19181 /* Shuffle within the 128-bit lanes to produce:
19182 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
19183 emit_insn (gen_avx_shufps256 (t1
, d
->op0
, d
->op1
,
19186 /* Shuffle the lanes around to produce:
19187 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
19188 emit_insn (gen_avx_vperm2f128v8sf3 (t2
, t1
, t1
,
19191 /* Shuffle within the 128-bit lanes to produce:
19192 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
19193 emit_insn (gen_avx_shufps256 (t3
, t1
, t2
, GEN_INT (0x44)));
19195 /* Shuffle within the 128-bit lanes to produce:
19196 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
19197 emit_insn (gen_avx_shufps256 (t2
, t1
, t2
, GEN_INT (0xee)));
19199 /* Shuffle the lanes around to produce:
19200 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
19201 emit_insn (gen_avx_vperm2f128v8sf3 (d
->target
, t3
, t2
,
19211 /* These are always directly implementable by expand_vec_perm_1. */
19212 gcc_unreachable ();
19215 gcc_assert (TARGET_MMX_WITH_SSE
);
19216 /* We have no suitable instructions. */
19224 /* We need 2*log2(N)-1 operations to achieve odd/even
19225 with interleave. */
19226 t1
= gen_reg_rtx (V4HImode
);
19227 emit_insn (gen_mmx_punpckhwd (t1
, d
->op0
, d
->op1
));
19228 emit_insn (gen_mmx_punpcklwd (d
->target
, d
->op0
, d
->op1
));
19230 t2
= gen_mmx_punpckhwd (d
->target
, d
->target
, t1
);
19232 t2
= gen_mmx_punpcklwd (d
->target
, d
->target
, t1
);
19238 return expand_vec_perm_even_odd_pack (d
);
19239 else if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
19240 return expand_vec_perm_pshufb2 (d
);
19245 /* We need 2*log2(N)-1 operations to achieve odd/even
19246 with interleave. */
19247 t1
= gen_reg_rtx (V8HImode
);
19248 t2
= gen_reg_rtx (V8HImode
);
19249 emit_insn (gen_vec_interleave_highv8hi (t1
, d
->op0
, d
->op1
));
19250 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->op0
, d
->op1
));
19251 emit_insn (gen_vec_interleave_highv8hi (t2
, d
->target
, t1
));
19252 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t1
));
19254 t3
= gen_vec_interleave_highv8hi (d
->target
, d
->target
, t2
);
19256 t3
= gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t2
);
19262 return expand_vec_perm_even_odd_pack (d
);
19266 return expand_vec_perm_even_odd_pack (d
);
19269 return expand_vec_perm_even_odd_trunc (d
);
19274 struct expand_vec_perm_d d_copy
= *d
;
19275 d_copy
.vmode
= V4DFmode
;
19277 d_copy
.target
= gen_raw_REG (V4DFmode
, LAST_VIRTUAL_REGISTER
+ 1);
19279 d_copy
.target
= gen_reg_rtx (V4DFmode
);
19280 d_copy
.op0
= gen_lowpart (V4DFmode
, d
->op0
);
19281 d_copy
.op1
= gen_lowpart (V4DFmode
, d
->op1
);
19282 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
19285 emit_move_insn (d
->target
,
19286 gen_lowpart (V4DImode
, d_copy
.target
));
19295 t1
= gen_reg_rtx (V4DImode
);
19296 t2
= gen_reg_rtx (V4DImode
);
19298 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
19299 emit_insn (gen_avx2_permv2ti (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
19300 emit_insn (gen_avx2_permv2ti (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
19302 /* Now an vpunpck[lh]qdq will produce the result required. */
19304 t3
= gen_avx2_interleave_highv4di (d
->target
, t1
, t2
);
19306 t3
= gen_avx2_interleave_lowv4di (d
->target
, t1
, t2
);
19313 struct expand_vec_perm_d d_copy
= *d
;
19314 d_copy
.vmode
= V8SFmode
;
19316 d_copy
.target
= gen_raw_REG (V8SFmode
, LAST_VIRTUAL_REGISTER
+ 1);
19318 d_copy
.target
= gen_reg_rtx (V8SFmode
);
19319 d_copy
.op0
= gen_lowpart (V8SFmode
, d
->op0
);
19320 d_copy
.op1
= gen_lowpart (V8SFmode
, d
->op1
);
19321 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
19324 emit_move_insn (d
->target
,
19325 gen_lowpart (V8SImode
, d_copy
.target
));
19334 t1
= gen_reg_rtx (V8SImode
);
19335 t2
= gen_reg_rtx (V8SImode
);
19336 t3
= gen_reg_rtx (V4DImode
);
19337 t4
= gen_reg_rtx (V4DImode
);
19338 t5
= gen_reg_rtx (V4DImode
);
19340 /* Shuffle the lanes around into
19341 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
19342 emit_insn (gen_avx2_permv2ti (t3
, gen_lowpart (V4DImode
, d
->op0
),
19343 gen_lowpart (V4DImode
, d
->op1
),
19345 emit_insn (gen_avx2_permv2ti (t4
, gen_lowpart (V4DImode
, d
->op0
),
19346 gen_lowpart (V4DImode
, d
->op1
),
19349 /* Swap the 2nd and 3rd position in each lane into
19350 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
19351 emit_insn (gen_avx2_pshufdv3 (t1
, gen_lowpart (V8SImode
, t3
),
19352 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19353 emit_insn (gen_avx2_pshufdv3 (t2
, gen_lowpart (V8SImode
, t4
),
19354 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
19356 /* Now an vpunpck[lh]qdq will produce
19357 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
19359 t3
= gen_avx2_interleave_highv4di (t5
, gen_lowpart (V4DImode
, t1
),
19360 gen_lowpart (V4DImode
, t2
));
19362 t3
= gen_avx2_interleave_lowv4di (t5
, gen_lowpart (V4DImode
, t1
),
19363 gen_lowpart (V4DImode
, t2
));
19365 emit_move_insn (d
->target
, gen_lowpart (V8SImode
, t5
));
19369 gcc_unreachable ();
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
19394 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
19395 permutations. We assume that expand_vec_perm_1 has already failed. */
19398 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d
*d
)
19400 unsigned elt
= d
->perm
[0], nelt2
= d
->nelt
/ 2;
19401 machine_mode vmode
= d
->vmode
;
19402 unsigned char perm2
[4];
19403 rtx op0
= d
->op0
, dest
;
19410 /* These are special-cased in sse.md so that we can optionally
19411 use the vbroadcast instruction. They expand to two insns
19412 if the input happens to be in a register. */
19413 gcc_unreachable ();
19421 /* These are always implementable using standard shuffle patterns. */
19422 gcc_unreachable ();
19426 /* These can be implemented via interleave. We save one insn by
19427 stopping once we have promoted to V4SImode and then use pshufd. */
19433 rtx (*gen
) (rtx
, rtx
, rtx
)
19434 = vmode
== V16QImode
? gen_vec_interleave_lowv16qi
19435 : gen_vec_interleave_lowv8hi
;
19439 gen
= vmode
== V16QImode
? gen_vec_interleave_highv16qi
19440 : gen_vec_interleave_highv8hi
;
19445 dest
= gen_reg_rtx (vmode
);
19446 emit_insn (gen (dest
, op0
, op0
));
19447 vmode
= get_mode_wider_vector (vmode
);
19448 op0
= gen_lowpart (vmode
, dest
);
19450 while (vmode
!= V4SImode
);
19452 memset (perm2
, elt
, 4);
19453 dest
= gen_reg_rtx (V4SImode
);
19454 ok
= expand_vselect (dest
, op0
, perm2
, 4, d
->testing_p
);
19457 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
19465 /* For AVX2 broadcasts of the first element vpbroadcast* or
19466 vpermq should be used by expand_vec_perm_1. */
19467 gcc_assert (!TARGET_AVX2
|| d
->perm
[0]);
19471 gcc_unreachable ();
/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
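
/* Illustrative sketch (not part of GCC): index-level model of the
   interleave-based broadcast used by expand_vec_perm_broadcast_1 for
   QImode elements.  Interleaving the vector with itself doubles the run
   of the wanted byte (bytes -> words -> dwords); a final pshufd-style
   replication of the surviving dword finishes the splat.  The helper name
   is hypothetical.  */

static void
broadcast_by_interleave_sketch (const unsigned char *src, unsigned elt,
				unsigned char *dst /* 16 bytes */)
{
  unsigned char cur[16], nxt[16];
  unsigned unit, idx = elt, i, k, b;

  for (i = 0; i < 16; ++i)
    cur[i] = src[i];

  /* Two interleave steps: element size 1 -> 2 -> 4 bytes.  */
  for (unit = 1; unit <= 2; unit *= 2)
    {
      unsigned units = 16 / unit, half = units / 2;
      unsigned base = idx < half ? 0 : half;	/* punpckl* vs. punpckh* */
      for (k = 0; k < half; ++k)
	for (b = 0; b < unit; ++b)
	  {
	    nxt[(2 * k) * unit + b] = cur[(base + k) * unit + b];
	    nxt[(2 * k + 1) * unit + b] = cur[(base + k) * unit + b];
	  }
      for (i = 0; i < 16; ++i)
	cur[i] = nxt[i];
      idx -= base;	/* element index at the doubled width */
    }

  /* pshufd: replicate the dword that now holds four copies of the byte.  */
  for (k = 0; k < 4; ++k)
    for (b = 0; b < 4; ++b)
      dst[4 * k + b] = cur[4 * idx + b];
}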
19494 /* Implement arbitrary permutations of two V64QImode operands
19495 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
19497 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d
*d
)
19499 if (!TARGET_AVX512BW
|| !(d
->vmode
== V64QImode
))
19505 struct expand_vec_perm_d ds
[2];
19506 rtx rperm
[128], vperm
, target0
, target1
;
19507 unsigned int i
, nelt
;
19508 machine_mode vmode
;
19513 for (i
= 0; i
< 2; i
++)
19516 ds
[i
].vmode
= V32HImode
;
19518 ds
[i
].target
= gen_reg_rtx (V32HImode
);
19519 ds
[i
].op0
= gen_lowpart (V32HImode
, d
->op0
);
19520 ds
[i
].op1
= gen_lowpart (V32HImode
, d
->op1
);
19523 /* Prepare permutations such that the first one takes care of
19524 putting the even bytes into the right positions or one higher
19525 positions (ds[0]) and the second one takes care of
19526 putting the odd bytes into the right positions or one below
19529 for (i
= 0; i
< nelt
; i
++)
19531 ds
[i
& 1].perm
[i
/ 2] = d
->perm
[i
] / 2;
19534 rperm
[i
] = constm1_rtx
;
19535 rperm
[i
+ 64] = GEN_INT ((i
& 14) + (d
->perm
[i
] & 1));
19539 rperm
[i
] = GEN_INT ((i
& 14) + (d
->perm
[i
] & 1));
19540 rperm
[i
+ 64] = constm1_rtx
;
19544 bool ok
= expand_vec_perm_1 (&ds
[0]);
19546 ds
[0].target
= gen_lowpart (V64QImode
, ds
[0].target
);
19548 ok
= expand_vec_perm_1 (&ds
[1]);
19550 ds
[1].target
= gen_lowpart (V64QImode
, ds
[1].target
);
19552 vperm
= gen_rtx_CONST_VECTOR (V64QImode
, gen_rtvec_v (64, rperm
));
19553 vperm
= force_reg (vmode
, vperm
);
19554 target0
= gen_reg_rtx (V64QImode
);
19555 emit_insn (gen_avx512bw_pshufbv64qi3 (target0
, ds
[0].target
, vperm
));
19557 vperm
= gen_rtx_CONST_VECTOR (V64QImode
, gen_rtvec_v (64, rperm
+ 64));
19558 vperm
= force_reg (vmode
, vperm
);
19559 target1
= gen_reg_rtx (V64QImode
);
19560 emit_insn (gen_avx512bw_pshufbv64qi3 (target1
, ds
[1].target
, vperm
));
19562 emit_insn (gen_iorv64qi3 (d
->target
, target0
, target1
));
19566 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
19567 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
19568 all the shorter instruction sequences. */
19571 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d
*d
)
19573 rtx rperm
[4][32], vperm
, l
[2], h
[2], op
, m128
;
19574 unsigned int i
, nelt
, eltsz
;
19578 || d
->one_operand_p
19579 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
19586 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
19588 /* Generate 4 permutation masks. If the required element is within
19589 the same lane, it is shuffled in. If the required element from the
19590 other lane, force a zero by setting bit 7 in the permutation mask.
19591 In the other mask the mask has non-negative elements if element
19592 is requested from the other lane, but also moved to the other lane,
19593 so that the result of vpshufb can have the two V2TImode halves
19595 m128
= GEN_INT (-128);
19596 for (i
= 0; i
< 32; ++i
)
19598 rperm
[0][i
] = m128
;
19599 rperm
[1][i
] = m128
;
19600 rperm
[2][i
] = m128
;
19601 rperm
[3][i
] = m128
;
19607 for (i
= 0; i
< nelt
; ++i
)
19609 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
19610 unsigned xlane
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
19611 unsigned int which
= ((d
->perm
[i
] & nelt
) ? 2 : 0) + (xlane
? 1 : 0);
19613 for (j
= 0; j
< eltsz
; ++j
)
19614 rperm
[which
][(i
* eltsz
+ j
) ^ xlane
] = GEN_INT (e
* eltsz
+ j
);
19615 used
[which
] = true;
19618 for (i
= 0; i
< 2; ++i
)
19620 if (!used
[2 * i
+ 1])
19625 vperm
= gen_rtx_CONST_VECTOR (V32QImode
,
19626 gen_rtvec_v (32, rperm
[2 * i
+ 1]));
19627 vperm
= force_reg (V32QImode
, vperm
);
19628 h
[i
] = gen_reg_rtx (V32QImode
);
19629 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
19630 emit_insn (gen_avx2_pshufbv32qi3 (h
[i
], op
, vperm
));
19633 /* Swap the 128-byte lanes of h[X]. */
19634 for (i
= 0; i
< 2; ++i
)
19636 if (h
[i
] == NULL_RTX
)
19638 op
= gen_reg_rtx (V4DImode
);
19639 emit_insn (gen_avx2_permv4di_1 (op
, gen_lowpart (V4DImode
, h
[i
]),
19640 const2_rtx
, GEN_INT (3), const0_rtx
,
19642 h
[i
] = gen_lowpart (V32QImode
, op
);
19645 for (i
= 0; i
< 2; ++i
)
19652 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[2 * i
]));
19653 vperm
= force_reg (V32QImode
, vperm
);
19654 l
[i
] = gen_reg_rtx (V32QImode
);
19655 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
19656 emit_insn (gen_avx2_pshufbv32qi3 (l
[i
], op
, vperm
));
19659 for (i
= 0; i
< 2; ++i
)
19663 op
= gen_reg_rtx (V32QImode
);
19664 emit_insn (gen_iorv32qi3 (op
, l
[i
], h
[i
]));
19671 gcc_assert (l
[0] && l
[1]);
19673 if (d
->vmode
!= V32QImode
)
19674 op
= gen_reg_rtx (V32QImode
);
19675 emit_insn (gen_iorv32qi3 (op
, l
[0], l
[1]));
19676 if (op
!= d
->target
)
19677 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
	{
	  d->one_operand_p = false;
	  break;
	}
      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
	d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
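
/* Worked example for canonicalize_perm (illustrative, not part of GCC):
   with nelt == 4, the selector { 4, 5, 6, 7 } only references the second
   operand (which == 2); it is folded to { 0, 1, 2, 3 }, op0 is replaced by
   op1, and the function returns false.  The selector { 0, 5, 2, 7 } names
   both operands (which == 3); if op0 and op1 are the same register it is
   likewise folded to the one-operand form { 0, 1, 2, 3 }, but the return
   value stays true because the original selector referenced both inputs.  */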
19821 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19824 ix86_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
19825 rtx op1
, const vec_perm_indices
&sel
)
19827 struct expand_vec_perm_d d
;
19828 unsigned char perm
[MAX_VECT_LEN
];
19829 unsigned int i
, nelt
, which
;
19837 gcc_assert (VECTOR_MODE_P (d
.vmode
));
19838 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
19839 d
.testing_p
= !target
;
19841 gcc_assert (sel
.length () == nelt
);
19842 gcc_checking_assert (sizeof (d
.perm
) == sizeof (perm
));
19844 /* Given sufficient ISA support we can just return true here
19845 for selected vector modes. */
19852 if (!TARGET_AVX512F
)
19854 /* All implementable with a single vperm[it]2 insn. */
19859 if (!TARGET_AVX512BW
)
19862 /* All implementable with a single vperm[it]2 insn. */
19866 if (!TARGET_AVX512BW
)
19869 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
19878 if (d
.testing_p
&& TARGET_AVX512VL
)
19879 /* All implementable with a single vperm[it]2 insn. */
19885 if (d
.testing_p
&& TARGET_AVX2
)
19886 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19892 if (d
.testing_p
&& TARGET_AVX2
)
19893 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
19900 /* Fall through. */
19905 /* All implementable with a single vpperm insn. */
19906 if (d
.testing_p
&& TARGET_XOP
)
19908 /* All implementable with 2 pshufb + 1 ior. */
19909 if (d
.testing_p
&& TARGET_SSSE3
)
19915 if (!TARGET_MMX_WITH_SSE
)
19922 /* All implementable with shufpd or unpck[lh]pd. */
19930 for (i
= which
= 0; i
< nelt
; ++i
)
19932 unsigned char e
= sel
[i
];
19933 gcc_assert (e
< 2 * nelt
);
19936 which
|= (e
< nelt
? 1 : 2);
19941 /* For all elements from second vector, fold the elements to first. */
19943 for (i
= 0; i
< nelt
; ++i
)
19946 /* Check whether the mask can be applied to the vector type. */
19947 d
.one_operand_p
= (which
!= 3);
19949 /* Implementable with shufps or pshufd. */
19950 if (d
.one_operand_p
19951 && (d
.vmode
== V4SFmode
|| d
.vmode
== V2SFmode
19952 || d
.vmode
== V4SImode
|| d
.vmode
== V2SImode
))
19955 /* Otherwise we have to go through the motions and see if we can
19956 figure out how to generate the requested permutation. */
19957 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
19958 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
19959 if (!d
.one_operand_p
)
19960 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
19963 bool ret
= ix86_expand_vec_perm_const_1 (&d
);
19969 two_args
= canonicalize_perm (&d
);
19971 /* If one of the operands is a zero vector, try to match pmovzx. */
19972 if (two_args
&& (d
.op0
== CONST0_RTX (vmode
) || d
.op1
== CONST0_RTX (vmode
)))
19974 struct expand_vec_perm_d dzero
= d
;
19975 if (d
.op0
== CONST0_RTX (vmode
))
19977 d
.op1
= dzero
.op1
= force_reg (vmode
, d
.op1
);
19978 std::swap (dzero
.op0
, dzero
.op1
);
19979 for (i
= 0; i
< nelt
; ++i
)
19980 dzero
.perm
[i
] ^= nelt
;
19983 d
.op0
= dzero
.op0
= force_reg (vmode
, d
.op0
);
19985 if (expand_vselect_vconcat (dzero
.target
, dzero
.op0
, dzero
.op1
,
19986 dzero
.perm
, nelt
, dzero
.testing_p
))
19990 /* Force operands into registers. */
19991 rtx nop0
= force_reg (vmode
, d
.op0
);
19992 if (d
.op0
== d
.op1
)
19995 d
.op1
= force_reg (vmode
, d
.op1
);
19997 if (ix86_expand_vec_perm_const_1 (&d
))
20000 /* If the selector says both arguments are needed, but the operands are the
20001 same, the above tried to expand with one_operand_p and flattened selector.
20002 If that didn't work, retry without one_operand_p; we succeeded with that
20004 if (two_args
&& d
.one_operand_p
)
20006 d
.one_operand_p
= false;
20007 memcpy (d
.perm
, perm
, sizeof (perm
));
20008 return ix86_expand_vec_perm_const_1 (&d
);
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}

void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
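
/* Worked example for ix86_expand_vec_interleave (illustrative, not part of
   GCC): for a 4-element mode, high_p == false builds the selector
   { 0, 4, 1, 5 } (low halves of op0 and op1 interleaved) and
   high_p == true builds { 2, 6, 3, 7 }.  */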
20066 /* Optimize vector MUL generation for V8QI, V16QI and V32QI
20067 under TARGET_AVX512BW. i.e. for v16qi a * b, it has
20069 vpmovzxbw ymm2, xmm0
20070 vpmovzxbw ymm3, xmm1
20071 vpmullw ymm4, ymm2, ymm3
20074 it would take less instructions than ix86_expand_vecop_qihi.
20075 Return true if success. */
20078 ix86_expand_vecmul_qihi (rtx dest
, rtx op1
, rtx op2
)
20080 machine_mode himode
, qimode
= GET_MODE (dest
);
20081 rtx hop1
, hop2
, hdest
;
20082 rtx (*gen_extend
)(rtx
, rtx
);
20083 rtx (*gen_truncate
)(rtx
, rtx
);
20085 /* There's no V64HImode multiplication instruction. */
20086 if (qimode
== E_V64QImode
)
20089 /* vpmovwb only available under AVX512BW. */
20090 if (!TARGET_AVX512BW
)
20092 if ((qimode
== V8QImode
|| qimode
== V16QImode
)
20093 && !TARGET_AVX512VL
)
20095 /* Not generate zmm instruction when prefer 128/256 bit vector width. */
20096 if (qimode
== V32QImode
20097 && (TARGET_PREFER_AVX128
|| TARGET_PREFER_AVX256
))
20104 gen_extend
= gen_zero_extendv8qiv8hi2
;
20105 gen_truncate
= gen_truncv8hiv8qi2
;
20108 himode
= V16HImode
;
20109 gen_extend
= gen_zero_extendv16qiv16hi2
;
20110 gen_truncate
= gen_truncv16hiv16qi2
;
20113 himode
= V32HImode
;
20114 gen_extend
= gen_zero_extendv32qiv32hi2
;
20115 gen_truncate
= gen_truncv32hiv32qi2
;
20118 gcc_unreachable ();
20121 hop1
= gen_reg_rtx (himode
);
20122 hop2
= gen_reg_rtx (himode
);
20123 hdest
= gen_reg_rtx (himode
);
20124 emit_insn (gen_extend (hop1
, op1
));
20125 emit_insn (gen_extend (hop2
, op2
));
20126 emit_insn (gen_rtx_SET (hdest
, simplify_gen_binary (MULT
, himode
,
  emit_insn (gen_truncate (dest, hdest));

  return true;
}
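
/* Illustrative sketch (not part of GCC): scalar model of the zero-extend /
   16-bit multiply / truncate sequence emitted above.  Only the low 8 bits
   of each product are kept, so the signedness of the extension does not
   matter.  The helper name is hypothetical.  */

static void
mul_qi_via_hi_sketch (const unsigned char *a, const unsigned char *b,
		      unsigned char *dst, unsigned n)
{
  unsigned i;
  for (i = 0; i < n; ++i)
    {
      unsigned short wa = a[i];				/* vpmovzxbw */
      unsigned short wb = b[i];
      unsigned short prod = (unsigned short) (wa * wb);	/* vpmullw */
      dst[i] = (unsigned char) prod;			/* vpmovwb */
    }
}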
20132 /* Expand a vector operation shift by constant for a V*QImode in terms of the
20133 same operation on V*HImode. Return true if success. */
20135 ix86_expand_vec_shift_qihi_constant (enum rtx_code code
, rtx dest
, rtx op1
, rtx op2
)
20137 machine_mode qimode
, himode
;
20138 HOST_WIDE_INT and_constant
, xor_constant
;
20139 HOST_WIDE_INT shift_amount
;
20140 rtx vec_const_and
, vec_const_xor
;
20141 rtx tmp
, op1_subreg
;
20142 rtx (*gen_shift
) (rtx
, rtx
, rtx
);
20143 rtx (*gen_and
) (rtx
, rtx
, rtx
);
20144 rtx (*gen_xor
) (rtx
, rtx
, rtx
);
20145 rtx (*gen_sub
) (rtx
, rtx
, rtx
);
20147 /* Only optimize shift by constant. */
20148 if (!CONST_INT_P (op2
))
20151 qimode
= GET_MODE (dest
);
20152 shift_amount
= INTVAL (op2
);
20153 /* Do nothing when shift amount greater equal 8. */
20154 if (shift_amount
> 7)
20157 gcc_assert (code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
);
20158 /* Record sign bit. */
20159 xor_constant
= 1 << (8 - shift_amount
- 1);
20161 /* Zero upper/lower bits shift from left/right element. */
20163 = (code
== ASHIFT
? 256 - (1 << shift_amount
)
20164 : (1 << (8 - shift_amount
)) - 1);
20173 : (code
== ASHIFTRT
) ? gen_ashrv8hi3
: gen_lshrv8hi3
);
20174 gen_and
= gen_andv16qi3
;
20175 gen_xor
= gen_xorv16qi3
;
20176 gen_sub
= gen_subv16qi3
;
20179 himode
= V16HImode
;
20183 : (code
== ASHIFTRT
) ? gen_ashrv16hi3
: gen_lshrv16hi3
);
20184 gen_and
= gen_andv32qi3
;
20185 gen_xor
= gen_xorv32qi3
;
20186 gen_sub
= gen_subv32qi3
;
20189 himode
= V32HImode
;
20193 : (code
== ASHIFTRT
) ? gen_ashrv32hi3
: gen_lshrv32hi3
);
20194 gen_and
= gen_andv64qi3
;
20195 gen_xor
= gen_xorv64qi3
;
20196 gen_sub
= gen_subv64qi3
;
20199 gcc_unreachable ();
20202 tmp
= gen_reg_rtx (himode
);
20203 vec_const_and
= gen_reg_rtx (qimode
);
20204 op1_subreg
= lowpart_subreg (himode
, op1
, qimode
);
20206 /* For ASHIFT and LSHIFTRT, perform operation like
20207 vpsllw/vpsrlw $shift_amount, %op1, %dest.
20208 vpand %vec_const_and, %dest. */
20209 emit_insn (gen_shift (tmp
, op1_subreg
, op2
));
20210 emit_move_insn (dest
, simplify_gen_subreg (qimode
, tmp
, himode
, 0));
20211 emit_move_insn (vec_const_and
,
20212 ix86_build_const_vector (qimode
, true,
20213 gen_int_mode (and_constant
, QImode
)));
20214 emit_insn (gen_and (dest
, dest
, vec_const_and
));
  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
		      ix86_build_const_vector (qimode, true,
					       gen_int_mode (xor_constant,
							     QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }

  return true;
}
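
/* Illustrative sketch (not part of GCC): scalar model of the constant
   byte-shift-via-word-shift trick above.  The word shift lets bits leak
   across byte boundaries; the AND mask discards them, and for arithmetic
   right shifts the xor/sub pair re-extends the sign from bit
   (7 - shift amount).  The helper name is hypothetical.  */

static unsigned char
byte_shift_via_word_sketch (unsigned char v, unsigned amount,
			    int is_ashiftrt, int is_left)
{
  unsigned and_constant = is_left ? (256 - (1u << amount)) & 0xff
				  : (1u << (8 - amount)) - 1;
  unsigned xor_constant = 1u << (8 - amount - 1);
  unsigned x = (is_left ? (unsigned) v << amount : (unsigned) v >> amount)
	       & and_constant;
  if (is_ashiftrt)
    /* (x ^ t) - t sign-extends x from bit position 7 - amount.  */
    x = ((x ^ xor_constant) - xor_constant) & 0xff;
  return (unsigned char) x;
}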
20231 /* Expand a vector operation CODE for a V*QImode in terms of the
20232 same operation on V*HImode. */
20235 ix86_expand_vecop_qihi (enum rtx_code code
, rtx dest
, rtx op1
, rtx op2
)
20237 machine_mode qimode
= GET_MODE (dest
);
20238 machine_mode himode
;
20239 rtx (*gen_il
) (rtx
, rtx
, rtx
);
20240 rtx (*gen_ih
) (rtx
, rtx
, rtx
);
20241 rtx op1_l
, op1_h
, op2_l
, op2_h
, res_l
, res_h
;
20242 struct expand_vec_perm_d d
;
20243 bool ok
, full_interleave
;
20244 bool uns_p
= false;
20251 gen_il
= gen_vec_interleave_lowv16qi
;
20252 gen_ih
= gen_vec_interleave_highv16qi
;
20255 himode
= V16HImode
;
20256 gen_il
= gen_avx2_interleave_lowv32qi
;
20257 gen_ih
= gen_avx2_interleave_highv32qi
;
20260 himode
= V32HImode
;
20261 gen_il
= gen_avx512bw_interleave_lowv64qi
;
20262 gen_ih
= gen_avx512bw_interleave_highv64qi
;
20265 gcc_unreachable ();
20268 op2_l
= op2_h
= op2
;
20272 /* Unpack data such that we've got a source byte in each low byte of
20273 each word. We don't care what goes into the high byte of each word.
20274 Rather than trying to get zero in there, most convenient is to let
20275 it be a copy of the low byte. */
20276 op2_l
= gen_reg_rtx (qimode
);
20277 op2_h
= gen_reg_rtx (qimode
);
20278 emit_insn (gen_il (op2_l
, op2
, op2
));
20279 emit_insn (gen_ih (op2_h
, op2
, op2
));
20281 op1_l
= gen_reg_rtx (qimode
);
20282 op1_h
= gen_reg_rtx (qimode
);
20283 emit_insn (gen_il (op1_l
, op1
, op1
));
20284 emit_insn (gen_ih (op1_h
, op1
, op1
));
20285 full_interleave
= qimode
== V16QImode
;
20293 op1_l
= gen_reg_rtx (himode
);
20294 op1_h
= gen_reg_rtx (himode
);
20295 ix86_expand_sse_unpack (op1_l
, op1
, uns_p
, false);
20296 ix86_expand_sse_unpack (op1_h
, op1
, uns_p
, true);
20297 full_interleave
= true;
20300 gcc_unreachable ();
20303 /* Perform the operation. */
20304 res_l
= expand_simple_binop (himode
, code
, op1_l
, op2_l
, NULL_RTX
,
20306 res_h
= expand_simple_binop (himode
, code
, op1_h
, op2_h
, NULL_RTX
,
20308 gcc_assert (res_l
&& res_h
);
20310 /* Merge the data back into the right place. */
20312 d
.op0
= gen_lowpart (qimode
, res_l
);
20313 d
.op1
= gen_lowpart (qimode
, res_h
);
20315 d
.nelt
= GET_MODE_NUNITS (qimode
);
20316 d
.one_operand_p
= false;
20317 d
.testing_p
= false;
20319 if (full_interleave
)
20321 /* For SSE2, we used an full interleave, so the desired
20322 results are in the even elements. */
20323 for (i
= 0; i
< d
.nelt
; ++i
)
20328 /* For AVX, the interleave used above was not cross-lane. So the
20329 extraction is evens but with the second and third quarter swapped.
20330 Happily, that is even one insn shorter than even extraction.
20331 For AVX512BW we have 4 lanes. We extract evens from within a lane,
20332 always first from the first and then from the second source operand,
20333 the index bits above the low 4 bits remains the same.
20334 Thus, for d.nelt == 32 we want permutation
20335 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
20336 and for d.nelt == 64 we want permutation
20337 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
20338 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
20339 for (i
= 0; i
< d
.nelt
; ++i
)
20340 d
.perm
[i
] = ((i
* 2) & 14) + ((i
& 8) ? d
.nelt
: 0) + (i
& ~15);
20343 ok
= ix86_expand_vec_perm_const_1 (&d
);
20346 set_unique_reg_note (get_last_insn (), REG_EQUAL
,
20347 gen_rtx_fmt_ee (code
, qimode
, op1
, op2
));
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
20369 ix86_expand_mul_widen_evenodd (rtx dest
, rtx op1
, rtx op2
,
20370 bool uns_p
, bool odd_p
)
20372 machine_mode mode
= GET_MODE (op1
);
20373 machine_mode wmode
= GET_MODE (dest
);
20375 rtx orig_op1
= op1
, orig_op2
= op2
;
20377 if (!nonimmediate_operand (op1
, mode
))
20378 op1
= force_reg (mode
, op1
);
20379 if (!nonimmediate_operand (op2
, mode
))
20380 op2
= force_reg (mode
, op2
);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only a
	 signed widening multiply.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */
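
      /* Clarifying sketch (added note, not from the original sources): the
	 64-bit sign extension of a 32-bit element A is a + (sa << 32),
	 where a is A taken as unsigned and sa is 0xffffffff for negative A
	 and 0 otherwise.  Hence, modulo 2^64,
	   A * B == a*b + ((sa*b + sb*a) << 32),
	 which is exactly t0 + ((t1 + t2) << 32) as computed below.  */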

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }

  emit_insn (x);
}

void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
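      /* Clarifying note: those two selectors evaluate to 0xfa (elements
	 2,2,3,3) for the high halves and 0x50 (elements 0,0,1,1) for the
	 low halves.  */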
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}

void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
	 us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
	 put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */
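
  /* Clarifying note: after the two calls above, res_3 holds the products
     of the even SImode elements, { A0*B0, A2*B2 }, and res_4 those of the
     odd elements, { A1*B1, A3*B3 }, each as a 64-bit lane of which only
     the low 32 bits are kept.  */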

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}

void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all.  */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();
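
      /* Outline (clarifying note): writing each 64-bit element of op1 as
	 h1 * 2^32 + l1 and of op2 as h2 * 2^32 + l2, the low 64 bits of
	 the product are l1*l2 + ((h1*l2 + h2*l1) << 32).  Below, t1 holds
	 l1*l2, t4 and t5 hold h1*l2 and h2*l1, and the final shift and
	 additions combine them.  */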

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}

/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }

  return false;
}

/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W
	 instead of 32 and use logical instead of arithmetic right shift
	 (which is unimplemented) and subtract.  */
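      /* Equivalently (clarifying note): form the mask m = (X < 0 ? -1 : 0)
	 and compute abs(X) = (X ^ m) - m; the two branches below differ
	 only in how the mask is produced.  */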
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
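      /* Worked example (illustrative): for X == -5 and W == 32 the shift
	 gives -1, (-5 ^ -1) == 4 and 4 - (-1) == 5; for non-negative X the
	 shift gives 0 and the expression reduces to X.  */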
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}

/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
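
	/* For example (illustrative): extracting a 16-bit field at bit
	   position 48 selects element 48 / 16 == 3 of the V8HImode view,
	   i.e. a pextrw with immediate 3.  */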

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}

/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));
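
	/* E.g. (illustrative): inserting a 32-bit field at bit position 64
	   targets element 64 / 32 == 2, so the selector passed to the pinsr
	   pattern is 1 << 2 == 4.  */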
	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));

	return true;
      }

    default:
      return false;
    }
}

/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
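  /* For instance (clarifying note): a V8SImode reduction is first split
     into V4SImode lowpart/highpart halves, and only the final V4SImode
     reduction works within a single 128-bit register.  */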
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V32HImode:
    case E_V16HImode:
      return V8HImode;
    case E_V64QImode:
    case E_V32QImode:
      return V16QImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      return mode;
    }
}

/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);
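
  /* Note (clarifying): the quotient is the libcall's return value, while
     the remainder is stored by the callee through the address of the
     stack temporary passed as the last argument.  */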
  *quot_p = quot;
  *rem_p = rem;
}

#include "gt-i386-expand.h"