/* Copyright (C) 1988-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */
split_double_mode (machine_mode mode, rtx operands[],
                   int num, rtx lo_half[], rtx hi_half[])
  machine_mode half_mode;

  byte = GET_MODE_SIZE (half_mode);

  rtx op = operands[num];

  /* simplify_subreg refuses to split volatile memory addresses,
     but we still have to handle it.  */
  lo_half[num] = adjust_address (op, half_mode, 0);
  hi_half[num] = adjust_address (op, half_mode, byte);

  lo_half[num] = simplify_gen_subreg (half_mode, op,
                                      GET_MODE (op) == VOIDmode
                                      ? mode : GET_MODE (op), 0);
  hi_half[num] = simplify_gen_subreg (half_mode, op,
                                      GET_MODE (op) == VOIDmode
                                      ? mode : GET_MODE (op), byte);
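
/* Illustrative usage sketch (editor's addition, not part of the original
   source).  On a 32-bit target, splitting one DImode register operand
   into its SImode halves might look like:

     rtx ops[1] = { operand };          e.g. (reg:DI 100)
     rtx lo[1], hi[1];
     split_double_mode (DImode, ops, 1, lo, hi);

   after which lo[0] holds the low SImode word and hi[0] the high word
   (the high half lives at byte offset GET_MODE_SIZE (SImode) == 4).  */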
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the current mode.  */
ix86_expand_clear (rtx dest)
  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
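
/* Editor's illustration (not in the original source): when TARGET_USE_MOV0
   is off or we are optimizing for size, the pattern emitted above is an
   SImode clear with a flags clobber, roughly

     (parallel [(set (reg:SI ax) (const_int 0))
                (clobber (reg:CC flags))])

   which assembles to "xor %eax, %eax" instead of "mov $0, %eax".  */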
ix86_expand_move (machine_mode mode, rtx operands[])
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  switch (GET_CODE (op1))

      if (GET_CODE (tmp) != PLUS
          || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)

      addend = XEXP (tmp, 1);

      model = SYMBOL_REF_TLS_MODEL (op1);

        op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
          /* Load the external function address via GOT slot to avoid PLT.  */
          op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
          op1 = gen_rtx_CONST (Pmode, op1);
          op1 = gen_const_mem (Pmode, op1);
          set_mem_alias_set (op1, ix86_GOT_alias_set ());

          tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);

              op1 = force_operand (op1, NULL_RTX);
              op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
                                         op0, 1, OPTAB_DIRECT);
              op1 = force_operand (op1, op0);

      op1 = convert_to_mode (mode, op1, 1);

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
      if (TARGET_MACHO && !TARGET_64BIT)
          if (MACHOPIC_INDIRECT)
              rtx temp = (op0 && REG_P (op0) && mode == Pmode)
                         ? op0 : gen_reg_rtx (Pmode);
              op1 = machopic_indirect_data_reference (op1, temp);
              op1 = machopic_legitimize_pic_address (op1, mode,
                                                     temp == op1 ? 0 : temp);
          if (op0 != op1 && GET_CODE (op0) != MEM)
              rtx insn = gen_rtx_SET (op0, op1);
          if (GET_CODE (op0) == MEM)
            op1 = force_reg (Pmode, op1);
              if (GET_CODE (temp) != REG)
                temp = gen_reg_rtx (Pmode);
              temp = legitimize_pic_address (op1, temp);
            op1 = force_reg (mode, op1);
      else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
          rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
          op1 = legitimize_pic_address (op1, reg);
          op1 = convert_to_mode (mode, op1, 1);

          && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
              || !push_operand (op0, mode))
        op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
          && ! general_no_elim_operand (op1, mode))
        op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
         to get them CSEed.  */
      if (can_create_pseudo_p ()
          && (mode == DImode) && TARGET_64BIT
          && immediate_operand (op1, mode)
          && !x86_64_zext_immediate_operand (op1, VOIDmode)
          && !register_operand (op0, mode)
        op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ()
          && CONST_DOUBLE_P (op1))
          /* If we are loading a floating point constant to a register,
             force the value to memory now, since we'll get better code
          op1 = validize_mem (force_const_mem (mode, op1));
          if (!register_operand (op0, mode))
              rtx temp = gen_reg_rtx (mode);
              emit_insn (gen_rtx_SET (temp, op1));
              emit_move_insn (op0, temp);

  emit_insn (gen_rtx_SET (op0, op1));
ix86_expand_vector_move (machine_mode mode, rtx operands[])
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
                        ? GET_MODE_BITSIZE (mode)
                        : GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
          && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
           && !standard_sse_constant_p (op1, mode))
          /* ix86_expand_vector_move_misalign() does not like constants.  */
          || (SSE_REG_MODE_P (mode)
              && MEM_ALIGN (op0) < align)))

          machine_mode imode = GET_MODE (SUBREG_REG (op1));
          rtx r = force_const_mem (imode, SUBREG_REG (op1));
            r = validize_mem (r);
            r = force_reg (imode, SUBREG_REG (op1));
          op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
        op1 = validize_mem (force_const_mem (mode, op1));

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
          || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))

      /* ix86_expand_vector_move_misalign() does not like both
         arguments in memory.  */
      if (!register_operand (op0, mode)
          && !register_operand (op1, mode))
        op1 = force_reg (mode, op1);

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));

  emit_insn (gen_rtx_SET (op0, op1));
/* Split 32-byte AVX unaligned load and store if needed.  */
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
  rtx (*extract) (rtx, rtx, rtx);

  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
      emit_insn (gen_rtx_SET (op0, op1));

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    case MODE_VECTOR_INT:
      if (mode != V32QImode)
          op0 = gen_reg_rtx (V32QImode);
          op0 = gen_lowpart (V32QImode, op0);
          op1 = gen_lowpart (V32QImode, op1);
    case MODE_VECTOR_FLOAT:

      extract = gen_avx_vextractf128v32qi;
      extract = gen_avx_vextractf128v8sf;
      extract = gen_avx_vextractf128v4df;

      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
  else if (MEM_P (op0))
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));

  emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
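
/* Editor's sketch (not part of the original source): for an unaligned
   32-byte load the code above emits a 16-byte move into a fresh register
   followed by a VEC_CONCAT with the upper 16 bytes of the memory operand
   (typically an unaligned 128-bit move plus a vinsertf128), while the
   store direction writes the two halves with the selected vextractf128
   pattern at offsets 0 and 16.  */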
/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
   straight to ix86_expand_vector_move.  */
/* Code generation for scalar reg-reg moves of single and double precision data:
     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
     if (x86_sse_partial_reg_dependency == true)

   Code generation for scalar loads of double precision data:
     if (x86_sse_split_regs == true)
       movlpd mem, reg      (gas syntax)

   Code generation for unaligned packed loads of single precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
     if (x86_sse_unaligned_move_optimal)
     if (x86_sse_partial_reg_dependency == true)

   Code generation for unaligned packed loads of double precision data
   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
     if (x86_sse_unaligned_move_optimal)
     if (x86_sse_split_regs == true)
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
      emit_insn (gen_rtx_SET (op0, op1));

      if (GET_MODE_SIZE (mode) == 32)
        ix86_avx256_split_vector_move_misalign (op0, op1);
        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
        emit_insn (gen_rtx_SET (op0, op1));

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
      emit_insn (gen_rtx_SET (op0, op1));

      /* ??? If we have typed data, then it would appear that using
         movdqu is the only way to get unaligned data loaded with
      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
          emit_insn (gen_rtx_SET (op0, op1));

      if (TARGET_SSE2 && mode == V2DFmode)
          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)

              /* ??? Not sure about the best option for the Intel chips.
                 The following would seem to satisfy; the register is
                 entirely cleared, breaking the dependency chain.  We
                 then store to the upper half, with a dependency depth
                 of one.  A rumor has it that Intel recommends two movsd
                 followed by an unpacklpd, but this is unconfirmed.  And
                 given that the dependency depth of the unpacklpd would
                 still be one, I'm not sure why this would be better.  */
              zero = CONST0_RTX (V2DFmode);

          m = adjust_address (op1, DFmode, 0);
          emit_insn (gen_sse2_loadlpd (op0, zero, m));
          m = adjust_address (op1, DFmode, 8);
          emit_insn (gen_sse2_loadhpd (op0, op0, m));

          if (mode != V4SFmode)
            t = gen_reg_rtx (V4SFmode);

          if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
            emit_move_insn (t, CONST0_RTX (V4SFmode));

          m = adjust_address (op1, V2SFmode, 0);
          emit_insn (gen_sse_loadlps (t, t, m));
          m = adjust_address (op1, V2SFmode, 8);
          emit_insn (gen_sse_loadhps (t, t, m));
          if (mode != V4SFmode)
            emit_move_insn (op0, gen_lowpart (mode, t));
  else if (MEM_P (op0))
      if (TARGET_SSE2 && mode == V2DFmode)
          m = adjust_address (op0, DFmode, 0);
          emit_insn (gen_sse2_storelpd (m, op1));
          m = adjust_address (op0, DFmode, 8);
          emit_insn (gen_sse2_storehpd (m, op1));

          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

          m = adjust_address (op0, V2SFmode, 0);
          emit_insn (gen_sse_storelps (m, op1));
          m = adjust_address (op0, V2SFmode, 8);
          emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
/* Move bits 64:95 to bits 32:63.  */
ix86_move_vector_high_sse_to_mmx (rtx op)
  rtx mask = gen_rtx_PARALLEL (VOIDmode,
                               gen_rtvec (4, GEN_INT (0), GEN_INT (2),
                                          GEN_INT (0), GEN_INT (0)));
  rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
  op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  rtx insn = gen_rtx_SET (dest, op);
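
/* Editor's example (not in the original source): with the selection
   parallel (0 2 0 0) above, a V4SI value { a, b, c, d } becomes
   { a, c, a, a }, i.e. element 2 (bits 64:95) is copied into element 1
   (bits 32:63), so the 64-bit low half of DEST then holds { a, c }.  */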
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
                                                 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,

  ix86_move_vector_high_sse_to_mmx (op0);
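
/* Editor's sketch (not part of the original source): for a V4HI -> V8QI
   MMX pack, the operands are viewed in the SSE modes V8HI/V16QI and a
   single SSE pack of the rough form

     (set (reg:V16QI dest)
          (vec_concat:V16QI (ss_truncate:V8QI (reg:V8HI op1))
                            (ss_truncate:V8QI (reg:V8HI op2))))

   is emitted; the final call above then moves the half coming from op2
   down so the whole 64-bit MMX result sits in the low part of dest.  */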
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
ix86_split_mmx_punpck (rtx operands[], bool high_p)
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);

  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               GEN_INT (0), GEN_INT (16),
                               GEN_INT (1), GEN_INT (17),
                               GEN_INT (2), GEN_INT (18),
                               GEN_INT (3), GEN_INT (19),
                               GEN_INT (4), GEN_INT (20),
                               GEN_INT (5), GEN_INT (21),
                               GEN_INT (6), GEN_INT (22),
                               GEN_INT (7), GEN_INT (23)));

      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               GEN_INT (0), GEN_INT (8),
                               GEN_INT (1), GEN_INT (9),
                               GEN_INT (2), GEN_INT (10),
                               GEN_INT (3), GEN_INT (11)));

      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
                               GEN_INT (0), GEN_INT (4),
                               GEN_INT (1), GEN_INT (5)));

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);

  /* Move bits 64:127 to bits 0:63.  */
  mask = gen_rtx_PARALLEL (VOIDmode,
                           gen_rtvec (4, GEN_INT (2), GEN_INT (3),
                                      GEN_INT (0), GEN_INT (0)));
  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
  insn = gen_rtx_SET (dest, op1);
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */
ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* If the operation is not commutative, we can't do anything.  */
  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
      && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)

  /* Highest priority is that src1 should match dst.  */
  if (rtx_equal_p (dst, src1))
  if (rtx_equal_p (dst, src2))

  /* Next highest priority is that immediate constants come second.  */
  if (immediate_operand (src2, mode))
  if (immediate_operand (src1, mode))

  /* Lowest priority is that memory references should come second.  */
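
/* Editor's example (not in the original source): for a commutative
   operation written as "dst = src2 + dst" the predicate above returns
   true, so callers swap the sources into the "dst = dst + src2" shape
   that matches the destructive two-operand x86 instructions; an
   immediate or memory src1 is likewise pushed into the second slot.  */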
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
          src2 = force_reg (mode, src2);
      else if (rtx_equal_p (dst, src1))
        src2 = force_reg (mode, src2);
        src1 = force_reg (mode, src1);

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
      && GET_MODE_CLASS (mode) == MODE_INT
    src2 = force_reg (mode, src2);
/* Similarly, but assume that the destination has already been
ix86_fixup_binary_operands_no_copy (enum rtx_code code,
                                    machine_mode mode, rtx operands[])
  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
  gcc_assert (dst == operands[0]);

/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
  rtx src1, src2, dst, op, clob;

  dst = ix86_fixup_binary_operands (code, mode, operands);

  /* Emit the instruction.  */
  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

      && !rtx_equal_p (dst, src1))
    /* This is going to be an LEA; avoid splitting it later.  */

      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
  else if (SUBREG_P (operands[2]))
      /* Optimize (__m128i) d | (__m128i) e and similar code
         when d and e are float vectors into float vector logical
         insn.  In C/C++ without using intrinsics there is no other way
         to express vector logical operation on float vectors than
         to cast them temporarily to integer vectors.  */
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
          || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
              && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
      switch (GET_MODE (SUBREG_REG (op1)))

      dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
      if (GET_CODE (op2) == CONST_VECTOR)
          op2 = gen_lowpart (GET_MODE (dst), op2);
          op2 = force_reg (GET_MODE (dst), op2);
            op2 = SUBREG_REG (operands[2]);
          if (!vector_operand (op2, GET_MODE (dst)))
            op2 = force_reg (GET_MODE (dst), op2);
      op1 = SUBREG_REG (op1);
      if (!vector_operand (op1, GET_MODE (dst)))
        op1 = force_reg (GET_MODE (dst), op1);
      emit_insn (gen_rtx_SET (dst,
                              gen_rtx_fmt_ee (code, GET_MODE (dst),
      emit_move_insn (operands[0], gen_lowpart (mode, dst));

  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
                          gen_rtx_fmt_ee (code, mode, operands[1],
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))

  /* Support "andhi/andsi/anddi" as a zero-extending move.  */
           || (TARGET_64BIT && mode == DImode))
          && satisfies_constraint_L (src2));
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine, than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */
ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
  bool matching_memory = false;
  rtx src, dst, op, clob;

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
      if (rtx_equal_p (dst, src))
        matching_memory = true;
        dst = gen_reg_rtx (mode);

  /* When source operand is memory, destination must match.  */
  if (MEM_P (src) && !matching_memory)
    src = force_reg (mode, src);

  /* Emit the instruction.  */
  op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));

      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);

/* Predict the just-emitted jump instruction to be taken with probability PROB.  */
predict_jump (int prob)
  rtx_insn *insn = get_last_insn ();
  gcc_assert (JUMP_P (insn));
  add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
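
/* Editor's note (not part of the original source): a typical use is

     emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
     predict_jump (REG_BR_PROB_BASE * 50 / 100);

   which attaches a 50% branch-probability note to the jump emitted just
   before, exactly as done in ix86_split_idivmod below.  */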
1113 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1114 divisor are within the range [0-255]. */
1117 ix86_split_idivmod (machine_mode mode
, rtx operands
[],
1120 rtx_code_label
*end_label
, *qimode_label
;
1123 rtx scratch
, tmp0
, tmp1
, tmp2
;
1124 rtx (*gen_divmod4_1
) (rtx
, rtx
, rtx
, rtx
);
1129 if (GET_MODE (operands
[0]) == SImode
)
1131 if (GET_MODE (operands
[1]) == SImode
)
1132 gen_divmod4_1
= unsigned_p
? gen_udivmodsi4_1
: gen_divmodsi4_1
;
1135 = unsigned_p
? gen_udivmodsi4_zext_2
: gen_divmodsi4_zext_2
;
1139 = unsigned_p
? gen_udivmodsi4_zext_1
: gen_divmodsi4_zext_1
;
1143 gen_divmod4_1
= unsigned_p
? gen_udivmoddi4_1
: gen_divmoddi4_1
;
1150 end_label
= gen_label_rtx ();
1151 qimode_label
= gen_label_rtx ();
1153 scratch
= gen_reg_rtx (mode
);
1155 /* Use 8bit unsigned divimod if dividend and divisor are within
1156 the range [0-255]. */
1157 emit_move_insn (scratch
, operands
[2]);
1158 scratch
= expand_simple_binop (mode
, IOR
, scratch
, operands
[3],
1159 scratch
, 1, OPTAB_DIRECT
);
1160 emit_insn (gen_test_ccno_1 (mode
, scratch
, GEN_INT (-0x100)));
1161 tmp0
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
1162 tmp0
= gen_rtx_EQ (VOIDmode
, tmp0
, const0_rtx
);
1163 tmp0
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp0
,
1164 gen_rtx_LABEL_REF (VOIDmode
, qimode_label
),
1166 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp0
));
1167 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
1168 JUMP_LABEL (insn
) = qimode_label
;
1170 /* Generate original signed/unsigned divimod. */
1171 div
= gen_divmod4_1 (operands
[0], operands
[1],
1172 operands
[2], operands
[3]);
1175 /* Branch to the end. */
1176 emit_jump_insn (gen_jump (end_label
));
1179 /* Generate 8bit unsigned divide. */
1180 emit_label (qimode_label
);
1181 /* Don't use operands[0] for result of 8bit divide since not all
1182 registers support QImode ZERO_EXTRACT. */
1183 tmp0
= lowpart_subreg (HImode
, scratch
, mode
);
1184 tmp1
= lowpart_subreg (HImode
, operands
[2], mode
);
1185 tmp2
= lowpart_subreg (QImode
, operands
[3], mode
);
1186 emit_insn (gen_udivmodhiqi3 (tmp0
, tmp1
, tmp2
));
1190 div
= gen_rtx_UDIV (mode
, operands
[2], operands
[3]);
1191 mod
= gen_rtx_UMOD (mode
, operands
[2], operands
[3]);
1195 div
= gen_rtx_DIV (mode
, operands
[2], operands
[3]);
1196 mod
= gen_rtx_MOD (mode
, operands
[2], operands
[3]);
1200 if (GET_MODE (operands
[0]) != SImode
)
1201 div
= gen_rtx_ZERO_EXTEND (DImode
, div
);
1202 if (GET_MODE (operands
[1]) != SImode
)
1203 mod
= gen_rtx_ZERO_EXTEND (DImode
, mod
);
1206 /* Extract remainder from AH. */
1207 tmp1
= gen_rtx_ZERO_EXTRACT (GET_MODE (operands
[1]),
1208 tmp0
, GEN_INT (8), GEN_INT (8));
1209 if (REG_P (operands
[1]))
1210 insn
= emit_move_insn (operands
[1], tmp1
);
/* Need a new scratch register since the old one has result
   of 8bit divide.  */
1215 scratch
= gen_reg_rtx (GET_MODE (operands
[1]));
1216 emit_move_insn (scratch
, tmp1
);
1217 insn
= emit_move_insn (operands
[1], scratch
);
1219 set_unique_reg_note (insn
, REG_EQUAL
, mod
);
1221 /* Zero extend quotient from AL. */
1222 tmp1
= gen_lowpart (QImode
, tmp0
);
1223 insn
= emit_insn (gen_extend_insn
1225 GET_MODE (operands
[0]), QImode
, 1));
1226 set_unique_reg_note (insn
, REG_EQUAL
, div
);
1228 emit_label (end_label
);
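
/* Editor's sketch (not in the original source): the split above guards the
   expensive wide division with a test of (dividend | divisor) against
   ~0xff; when both values fit in 8 bits it branches to the cheap
   udivmodhiqi3 ("divb") path, taking the quotient from AL and the
   remainder from AH, otherwise the original signed/unsigned divmod is
   executed and control jumps to the common end label.  */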
1231 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1232 matches destination. RTX includes clobber of FLAGS_REG. */
1235 ix86_emit_binop (enum rtx_code code
, machine_mode mode
,
1240 op
= gen_rtx_SET (dst
, gen_rtx_fmt_ee (code
, mode
, dst
, src
));
1241 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1243 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
1246 /* Return true if regno1 def is nearest to the insn. */
1249 find_nearest_reg_def (rtx_insn
*insn
, int regno1
, int regno2
)
1251 rtx_insn
*prev
= insn
;
1252 rtx_insn
*start
= BB_HEAD (BLOCK_FOR_INSN (insn
));
1256 while (prev
&& prev
!= start
)
1258 if (!INSN_P (prev
) || !NONDEBUG_INSN_P (prev
))
1260 prev
= PREV_INSN (prev
);
1263 if (insn_defines_reg (regno1
, INVALID_REGNUM
, prev
))
1265 else if (insn_defines_reg (regno2
, INVALID_REGNUM
, prev
))
1267 prev
= PREV_INSN (prev
);
1270 /* None of the regs is defined in the bb. */
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  */
1280 ix86_split_lea_for_addr (rtx_insn
*insn
, rtx operands
[], machine_mode mode
)
1282 unsigned int regno0
, regno1
, regno2
;
1283 struct ix86_address parts
;
1287 ok
= ix86_decompose_address (operands
[1], &parts
);
1290 target
= gen_lowpart (mode
, operands
[0]);
1292 regno0
= true_regnum (target
);
1293 regno1
= INVALID_REGNUM
;
1294 regno2
= INVALID_REGNUM
;
1298 parts
.base
= gen_lowpart (mode
, parts
.base
);
1299 regno1
= true_regnum (parts
.base
);
1304 parts
.index
= gen_lowpart (mode
, parts
.index
);
1305 regno2
= true_regnum (parts
.index
);
1309 parts
.disp
= gen_lowpart (mode
, parts
.disp
);
1311 if (parts
.scale
> 1)
1313 /* Case r1 = r1 + ... */
1314 if (regno1
== regno0
)
1316 /* If we have a case r1 = r1 + C * r2 then we
1317 should use multiplication which is very
1318 expensive. Assume cost model is wrong if we
1319 have such case here. */
1320 gcc_assert (regno2
!= regno0
);
1322 for (adds
= parts
.scale
; adds
> 0; adds
--)
1323 ix86_emit_binop (PLUS
, mode
, target
, parts
.index
);
1327 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1328 if (regno0
!= regno2
)
1329 emit_insn (gen_rtx_SET (target
, parts
.index
));
1331 /* Use shift for scaling. */
1332 ix86_emit_binop (ASHIFT
, mode
, target
,
1333 GEN_INT (exact_log2 (parts
.scale
)));
1336 ix86_emit_binop (PLUS
, mode
, target
, parts
.base
);
1338 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1339 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1342 else if (!parts
.base
&& !parts
.index
)
1344 gcc_assert(parts
.disp
);
1345 emit_insn (gen_rtx_SET (target
, parts
.disp
));
1351 if (regno0
!= regno2
)
1352 emit_insn (gen_rtx_SET (target
, parts
.index
));
1354 else if (!parts
.index
)
1356 if (regno0
!= regno1
)
1357 emit_insn (gen_rtx_SET (target
, parts
.base
));
1361 if (regno0
== regno1
)
1363 else if (regno0
== regno2
)
1369 /* Find better operand for SET instruction, depending
1370 on which definition is farther from the insn. */
1371 if (find_nearest_reg_def (insn
, regno1
, regno2
))
1372 tmp
= parts
.index
, tmp1
= parts
.base
;
1374 tmp
= parts
.base
, tmp1
= parts
.index
;
1376 emit_insn (gen_rtx_SET (target
, tmp
));
1378 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1379 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1381 ix86_emit_binop (PLUS
, mode
, target
, tmp1
);
1385 ix86_emit_binop (PLUS
, mode
, target
, tmp
);
1388 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1389 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
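
/* Editor's example (not part of the original source): an AGU-bound

     lea 0x4(%rbx,%rcx,4), %rax

   can be replaced by the splitter above with ALU-only instructions such as

     mov %rcx, %rax
     shl $2, %rax
     add %rbx, %rax
     add $4, %rax

   (scale by a shift or repeated adds, then add the base, then the
   displacement), clobbering the flags register as the comment permits.  */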
1393 /* Post-reload splitter for converting an SF or DFmode value in an
1394 SSE register into an unsigned SImode. */
1397 ix86_split_convert_uns_si_sse (rtx operands
[])
1399 machine_mode vecmode
;
1400 rtx value
, large
, zero_or_two31
, input
, two31
, x
;
1402 large
= operands
[1];
1403 zero_or_two31
= operands
[2];
1404 input
= operands
[3];
1405 two31
= operands
[4];
1406 vecmode
= GET_MODE (large
);
1407 value
= gen_rtx_REG (vecmode
, REGNO (operands
[0]));
1409 /* Load up the value into the low element. We must ensure that the other
1410 elements are valid floats -- zero is the easiest such value. */
1413 if (vecmode
== V4SFmode
)
1414 emit_insn (gen_vec_setv4sf_0 (value
, CONST0_RTX (V4SFmode
), input
));
1416 emit_insn (gen_sse2_loadlpd (value
, CONST0_RTX (V2DFmode
), input
));
1420 input
= gen_rtx_REG (vecmode
, REGNO (input
));
1421 emit_move_insn (value
, CONST0_RTX (vecmode
));
1422 if (vecmode
== V4SFmode
)
1423 emit_insn (gen_sse_movss (value
, value
, input
));
1425 emit_insn (gen_sse2_movsd (value
, value
, input
));
1428 emit_move_insn (large
, two31
);
1429 emit_move_insn (zero_or_two31
, MEM_P (two31
) ? large
: two31
);
1431 x
= gen_rtx_fmt_ee (LE
, vecmode
, large
, value
);
1432 emit_insn (gen_rtx_SET (large
, x
));
1434 x
= gen_rtx_AND (vecmode
, zero_or_two31
, large
);
1435 emit_insn (gen_rtx_SET (zero_or_two31
, x
));
1437 x
= gen_rtx_MINUS (vecmode
, value
, zero_or_two31
);
1438 emit_insn (gen_rtx_SET (value
, x
));
1440 large
= gen_rtx_REG (V4SImode
, REGNO (large
));
1441 emit_insn (gen_ashlv4si3 (large
, large
, GEN_INT (31)));
1443 x
= gen_rtx_REG (V4SImode
, REGNO (value
));
1444 if (vecmode
== V4SFmode
)
1445 emit_insn (gen_fix_truncv4sfv4si2 (x
, value
));
1447 emit_insn (gen_sse2_cvttpd2dq (x
, value
));
1450 emit_insn (gen_xorv4si3 (value
, value
, large
));
1453 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok
,
1454 machine_mode mode
, rtx target
,
1455 rtx var
, int one_var
);
1457 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1458 Expects the 64-bit DImode to be supplied in a pair of integral
1459 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1460 -mfpmath=sse, !optimize_size only. */
1463 ix86_expand_convert_uns_didf_sse (rtx target
, rtx input
)
1465 REAL_VALUE_TYPE bias_lo_rvt
, bias_hi_rvt
;
1466 rtx int_xmm
, fp_xmm
;
1467 rtx biases
, exponents
;
1470 int_xmm
= gen_reg_rtx (V4SImode
);
1471 if (TARGET_INTER_UNIT_MOVES_TO_VEC
)
1472 emit_insn (gen_movdi_to_sse (int_xmm
, input
));
1473 else if (TARGET_SSE_SPLIT_REGS
)
1475 emit_clobber (int_xmm
);
1476 emit_move_insn (gen_lowpart (DImode
, int_xmm
), input
);
1480 x
= gen_reg_rtx (V2DImode
);
1481 ix86_expand_vector_init_one_nonzero (false, V2DImode
, x
, input
, 0);
1482 emit_move_insn (int_xmm
, gen_lowpart (V4SImode
, x
));
1485 x
= gen_rtx_CONST_VECTOR (V4SImode
,
1486 gen_rtvec (4, GEN_INT (0x43300000UL
),
1487 GEN_INT (0x45300000UL
),
1488 const0_rtx
, const0_rtx
));
1489 exponents
= validize_mem (force_const_mem (V4SImode
, x
));
1491 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1492 emit_insn (gen_vec_interleave_lowv4si (int_xmm
, int_xmm
, exponents
));
1494 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1495 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1496 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1497 (0x1.0p84 + double(fp_value_hi_xmm)).
1498 Note these exponents differ by 32. */
1500 fp_xmm
= copy_to_mode_reg (V2DFmode
, gen_lowpart (V2DFmode
, int_xmm
));
1502 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1503 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1504 real_ldexp (&bias_lo_rvt
, &dconst1
, 52);
1505 real_ldexp (&bias_hi_rvt
, &dconst1
, 84);
1506 biases
= const_double_from_real_value (bias_lo_rvt
, DFmode
);
1507 x
= const_double_from_real_value (bias_hi_rvt
, DFmode
);
1508 biases
= gen_rtx_CONST_VECTOR (V2DFmode
, gen_rtvec (2, biases
, x
));
1509 biases
= validize_mem (force_const_mem (V2DFmode
, biases
));
1510 emit_insn (gen_subv2df3 (fp_xmm
, fp_xmm
, biases
));
1512 /* Add the upper and lower DFmode values together. */
1514 emit_insn (gen_sse3_haddv2df3 (fp_xmm
, fp_xmm
, fp_xmm
));
1517 x
= copy_to_mode_reg (V2DFmode
, fp_xmm
);
1518 emit_insn (gen_vec_interleave_highv2df (fp_xmm
, fp_xmm
, fp_xmm
));
1519 emit_insn (gen_addv2df3 (fp_xmm
, fp_xmm
, x
));
1522 ix86_expand_vector_extract (false, target
, fp_xmm
, 0);
1525 /* Not used, but eases macroization of patterns. */
1527 ix86_expand_convert_uns_sixf_sse (rtx
, rtx
)
1532 /* Convert an unsigned SImode value into a DFmode. Only currently used
1533 for SSE, but applicable anywhere. */
1536 ix86_expand_convert_uns_sidf_sse (rtx target
, rtx input
)
1538 REAL_VALUE_TYPE TWO31r
;
1541 x
= expand_simple_binop (SImode
, PLUS
, input
, GEN_INT (-2147483647 - 1),
1542 NULL
, 1, OPTAB_DIRECT
);
1544 fp
= gen_reg_rtx (DFmode
);
1545 emit_insn (gen_floatsidf2 (fp
, x
));
1547 real_ldexp (&TWO31r
, &dconst1
, 31);
1548 x
= const_double_from_real_value (TWO31r
, DFmode
);
1550 x
= expand_simple_binop (DFmode
, PLUS
, fp
, x
, target
, 0, OPTAB_DIRECT
);
1552 emit_move_insn (target
, x
);
1555 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1556 32-bit mode; otherwise we have a direct convert instruction. */
1559 ix86_expand_convert_sign_didf_sse (rtx target
, rtx input
)
1561 REAL_VALUE_TYPE TWO32r
;
1562 rtx fp_lo
, fp_hi
, x
;
1564 fp_lo
= gen_reg_rtx (DFmode
);
1565 fp_hi
= gen_reg_rtx (DFmode
);
1567 emit_insn (gen_floatsidf2 (fp_hi
, gen_highpart (SImode
, input
)));
1569 real_ldexp (&TWO32r
, &dconst1
, 32);
1570 x
= const_double_from_real_value (TWO32r
, DFmode
);
1571 fp_hi
= expand_simple_binop (DFmode
, MULT
, fp_hi
, x
, fp_hi
, 0, OPTAB_DIRECT
);
1573 ix86_expand_convert_uns_sidf_sse (fp_lo
, gen_lowpart (SImode
, input
));
1575 x
= expand_simple_binop (DFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1578 emit_move_insn (target
, x
);
1581 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1582 For x86_32, -mfpmath=sse, !optimize_size only. */
1584 ix86_expand_convert_uns_sisf_sse (rtx target
, rtx input
)
1586 REAL_VALUE_TYPE ONE16r
;
1587 rtx fp_hi
, fp_lo
, int_hi
, int_lo
, x
;
1589 real_ldexp (&ONE16r
, &dconst1
, 16);
1590 x
= const_double_from_real_value (ONE16r
, SFmode
);
1591 int_lo
= expand_simple_binop (SImode
, AND
, input
, GEN_INT(0xffff),
1592 NULL
, 0, OPTAB_DIRECT
);
1593 int_hi
= expand_simple_binop (SImode
, LSHIFTRT
, input
, GEN_INT(16),
1594 NULL
, 0, OPTAB_DIRECT
);
1595 fp_hi
= gen_reg_rtx (SFmode
);
1596 fp_lo
= gen_reg_rtx (SFmode
);
1597 emit_insn (gen_floatsisf2 (fp_hi
, int_hi
));
1598 emit_insn (gen_floatsisf2 (fp_lo
, int_lo
));
1599 fp_hi
= expand_simple_binop (SFmode
, MULT
, fp_hi
, x
, fp_hi
,
1601 fp_hi
= expand_simple_binop (SFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1603 if (!rtx_equal_p (target
, fp_hi
))
1604 emit_move_insn (target
, fp_hi
);
1607 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1608 a vector of unsigned ints VAL to vector of floats TARGET. */
1611 ix86_expand_vector_convert_uns_vsivsf (rtx target
, rtx val
)
1614 REAL_VALUE_TYPE TWO16r
;
1615 machine_mode intmode
= GET_MODE (val
);
1616 machine_mode fltmode
= GET_MODE (target
);
1617 rtx (*cvt
) (rtx
, rtx
);
1619 if (intmode
== V4SImode
)
1620 cvt
= gen_floatv4siv4sf2
;
1622 cvt
= gen_floatv8siv8sf2
;
1623 tmp
[0] = ix86_build_const_vector (intmode
, 1, GEN_INT (0xffff));
1624 tmp
[0] = force_reg (intmode
, tmp
[0]);
1625 tmp
[1] = expand_simple_binop (intmode
, AND
, val
, tmp
[0], NULL_RTX
, 1,
1627 tmp
[2] = expand_simple_binop (intmode
, LSHIFTRT
, val
, GEN_INT (16),
1628 NULL_RTX
, 1, OPTAB_DIRECT
);
1629 tmp
[3] = gen_reg_rtx (fltmode
);
1630 emit_insn (cvt (tmp
[3], tmp
[1]));
1631 tmp
[4] = gen_reg_rtx (fltmode
);
1632 emit_insn (cvt (tmp
[4], tmp
[2]));
1633 real_ldexp (&TWO16r
, &dconst1
, 16);
1634 tmp
[5] = const_double_from_real_value (TWO16r
, SFmode
);
1635 tmp
[5] = force_reg (fltmode
, ix86_build_const_vector (fltmode
, 1, tmp
[5]));
1636 tmp
[6] = expand_simple_binop (fltmode
, MULT
, tmp
[4], tmp
[5], NULL_RTX
, 1,
1638 tmp
[7] = expand_simple_binop (fltmode
, PLUS
, tmp
[3], tmp
[6], target
, 1,
1640 if (tmp
[7] != target
)
1641 emit_move_insn (target
, tmp
[7]);
1644 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1645 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1646 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1647 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1650 ix86_expand_adjust_ufix_to_sfix_si (rtx val
, rtx
*xorp
)
1652 REAL_VALUE_TYPE TWO31r
;
1654 machine_mode mode
= GET_MODE (val
);
1655 machine_mode scalarmode
= GET_MODE_INNER (mode
);
1656 machine_mode intmode
= GET_MODE_SIZE (mode
) == 32 ? V8SImode
: V4SImode
;
1657 rtx (*cmp
) (rtx
, rtx
, rtx
, rtx
);
1660 for (i
= 0; i
< 3; i
++)
1661 tmp
[i
] = gen_reg_rtx (mode
);
1662 real_ldexp (&TWO31r
, &dconst1
, 31);
1663 two31r
= const_double_from_real_value (TWO31r
, scalarmode
);
1664 two31r
= ix86_build_const_vector (mode
, 1, two31r
);
1665 two31r
= force_reg (mode
, two31r
);
1668 case E_V8SFmode
: cmp
= gen_avx_maskcmpv8sf3
; break;
1669 case E_V4SFmode
: cmp
= gen_sse_maskcmpv4sf3
; break;
1670 case E_V4DFmode
: cmp
= gen_avx_maskcmpv4df3
; break;
1671 case E_V2DFmode
: cmp
= gen_sse2_maskcmpv2df3
; break;
1672 default: gcc_unreachable ();
1674 tmp
[3] = gen_rtx_LE (mode
, two31r
, val
);
1675 emit_insn (cmp (tmp
[0], two31r
, val
, tmp
[3]));
1676 tmp
[1] = expand_simple_binop (mode
, AND
, tmp
[0], two31r
, tmp
[1],
1678 if (intmode
== V4SImode
|| TARGET_AVX2
)
1679 *xorp
= expand_simple_binop (intmode
, ASHIFT
,
1680 gen_lowpart (intmode
, tmp
[0]),
1681 GEN_INT (31), NULL_RTX
, 0,
1685 rtx two31
= GEN_INT (HOST_WIDE_INT_1U
<< 31);
1686 two31
= ix86_build_const_vector (intmode
, 1, two31
);
1687 *xorp
= expand_simple_binop (intmode
, AND
,
1688 gen_lowpart (intmode
, tmp
[0]),
1692 return expand_simple_binop (mode
, MINUS
, val
, tmp
[1], tmp
[2],
1696 /* Generate code for floating point ABS or NEG. */
1699 ix86_expand_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
1703 bool use_sse
= false;
1704 bool vector_mode
= VECTOR_MODE_P (mode
);
1705 machine_mode vmode
= mode
;
1710 else if (mode
== TFmode
)
1712 else if (TARGET_SSE_MATH
)
1714 use_sse
= SSE_FLOAT_MODE_P (mode
);
1717 else if (mode
== DFmode
)
1724 set
= gen_rtx_fmt_e (code
, mode
, src
);
1725 set
= gen_rtx_SET (dst
, set
);
1729 rtx mask
, use
, clob
;
1731 /* NEG and ABS performed with SSE use bitwise mask operations.
1732 Create the appropriate mask now. */
1733 mask
= ix86_build_signbit_mask (vmode
, vector_mode
, code
== ABS
);
1734 use
= gen_rtx_USE (VOIDmode
, mask
);
1736 par
= gen_rtvec (2, set
, use
);
1739 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1740 par
= gen_rtvec (3, set
, use
, clob
);
1747 /* Changing of sign for FP values is doable using integer unit too. */
1748 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1749 par
= gen_rtvec (2, set
, clob
);
1752 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
1755 /* Deconstruct a floating point ABS or NEG operation
1756 with integer registers into integer operations. */
1759 ix86_split_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
1762 enum rtx_code absneg_op
;
1765 gcc_assert (operands_match_p (operands
[0], operands
[1]));
1770 dst
= gen_lowpart (SImode
, operands
[0]);
1774 set
= gen_int_mode (0x7fffffff, SImode
);
1779 set
= gen_int_mode (0x80000000, SImode
);
1782 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1788 dst
= gen_lowpart (DImode
, operands
[0]);
1789 dst
= gen_rtx_ZERO_EXTRACT (DImode
, dst
, const1_rtx
, GEN_INT (63));
1794 set
= gen_rtx_NOT (DImode
, dst
);
1798 dst
= gen_highpart (SImode
, operands
[0]);
1802 set
= gen_int_mode (0x7fffffff, SImode
);
1807 set
= gen_int_mode (0x80000000, SImode
);
1810 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1815 dst
= gen_rtx_REG (SImode
,
1816 REGNO (operands
[0]) + (TARGET_64BIT
? 1 : 2));
1819 set
= GEN_INT (0x7fff);
1824 set
= GEN_INT (0x8000);
1827 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
1834 set
= gen_rtx_SET (dst
, set
);
1836 rtx clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1837 rtvec par
= gen_rtvec (2, set
, clob
);
1839 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
1842 /* Expand a copysign operation. Special case operand 0 being a constant. */
1845 ix86_expand_copysign (rtx operands
[])
1847 machine_mode mode
, vmode
;
1848 rtx dest
, op0
, op1
, mask
;
1854 mode
= GET_MODE (dest
);
1858 else if (mode
== DFmode
)
1860 else if (mode
== TFmode
)
1865 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
1867 if (CONST_DOUBLE_P (op0
))
1869 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0
)))
1870 op0
= simplify_unary_operation (ABS
, mode
, op0
, mode
);
1872 if (mode
== SFmode
|| mode
== DFmode
)
1874 if (op0
== CONST0_RTX (mode
))
1875 op0
= CONST0_RTX (vmode
);
1878 rtx v
= ix86_build_const_vector (vmode
, false, op0
);
1880 op0
= force_reg (vmode
, v
);
1883 else if (op0
!= CONST0_RTX (mode
))
1884 op0
= force_reg (mode
, op0
);
1886 emit_insn (gen_copysign3_const (mode
, dest
, op0
, op1
, mask
));
1890 rtx nmask
= ix86_build_signbit_mask (vmode
, 0, 1);
1892 emit_insn (gen_copysign3_var
1893 (mode
, dest
, NULL_RTX
, op0
, op1
, nmask
, mask
));
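
/* Editor's illustration (not part of the original source): copysign is
   expanded with the sign-bit masks built by ix86_build_signbit_mask,
   conceptually

     dest = (op1 & SIGN_MASK) | (abs_or_zero (op0) & ~SIGN_MASK)

   where for DFmode SIGN_MASK has only bit 63 set in each element, so
   only the sign bit of op1 survives into the result.  */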
1897 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
1898 be a constant, and so has already been expanded into a vector constant. */
1901 ix86_split_copysign_const (rtx operands
[])
1903 machine_mode mode
, vmode
;
1904 rtx dest
, op0
, mask
, x
;
1910 mode
= GET_MODE (dest
);
1911 vmode
= GET_MODE (mask
);
1913 dest
= lowpart_subreg (vmode
, dest
, mode
);
1914 x
= gen_rtx_AND (vmode
, dest
, mask
);
1915 emit_insn (gen_rtx_SET (dest
, x
));
1917 if (op0
!= CONST0_RTX (vmode
))
1919 x
= gen_rtx_IOR (vmode
, dest
, op0
);
1920 emit_insn (gen_rtx_SET (dest
, x
));
1924 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
1925 so we have to do two masks. */
1928 ix86_split_copysign_var (rtx operands
[])
1930 machine_mode mode
, vmode
;
1931 rtx dest
, scratch
, op0
, op1
, mask
, nmask
, x
;
1934 scratch
= operands
[1];
1937 nmask
= operands
[4];
1940 mode
= GET_MODE (dest
);
1941 vmode
= GET_MODE (mask
);
1943 if (rtx_equal_p (op0
, op1
))
1945 /* Shouldn't happen often (it's useless, obviously), but when it does
1946 we'd generate incorrect code if we continue below. */
1947 emit_move_insn (dest
, op0
);
1951 if (REG_P (mask
) && REGNO (dest
) == REGNO (mask
)) /* alternative 0 */
1953 gcc_assert (REGNO (op1
) == REGNO (scratch
));
1955 x
= gen_rtx_AND (vmode
, scratch
, mask
);
1956 emit_insn (gen_rtx_SET (scratch
, x
));
1959 op0
= lowpart_subreg (vmode
, op0
, mode
);
1960 x
= gen_rtx_NOT (vmode
, dest
);
1961 x
= gen_rtx_AND (vmode
, x
, op0
);
1962 emit_insn (gen_rtx_SET (dest
, x
));
1966 if (REGNO (op1
) == REGNO (scratch
)) /* alternative 1,3 */
1968 x
= gen_rtx_AND (vmode
, scratch
, mask
);
1970 else /* alternative 2,4 */
1972 gcc_assert (REGNO (mask
) == REGNO (scratch
));
1973 op1
= lowpart_subreg (vmode
, op1
, mode
);
1974 x
= gen_rtx_AND (vmode
, scratch
, op1
);
1976 emit_insn (gen_rtx_SET (scratch
, x
));
1978 if (REGNO (op0
) == REGNO (dest
)) /* alternative 1,2 */
1980 dest
= lowpart_subreg (vmode
, op0
, mode
);
1981 x
= gen_rtx_AND (vmode
, dest
, nmask
);
1983 else /* alternative 3,4 */
1985 gcc_assert (REGNO (nmask
) == REGNO (dest
));
1987 op0
= lowpart_subreg (vmode
, op0
, mode
);
1988 x
= gen_rtx_AND (vmode
, dest
, op0
);
1990 emit_insn (gen_rtx_SET (dest
, x
));
1993 x
= gen_rtx_IOR (vmode
, dest
, scratch
);
1994 emit_insn (gen_rtx_SET (dest
, x
));
1997 /* Expand an xorsign operation. */
2000 ix86_expand_xorsign (rtx operands
[])
2002 machine_mode mode
, vmode
;
2003 rtx dest
, op0
, op1
, mask
;
2009 mode
= GET_MODE (dest
);
2013 else if (mode
== DFmode
)
2018 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
2020 emit_insn (gen_xorsign3_1 (mode
, dest
, op0
, op1
, mask
));
2023 /* Deconstruct an xorsign operation into bit masks. */
2026 ix86_split_xorsign (rtx operands
[])
2028 machine_mode mode
, vmode
;
2029 rtx dest
, op0
, mask
, x
;
2035 mode
= GET_MODE (dest
);
2036 vmode
= GET_MODE (mask
);
2038 dest
= lowpart_subreg (vmode
, dest
, mode
);
2039 x
= gen_rtx_AND (vmode
, dest
, mask
);
2040 emit_insn (gen_rtx_SET (dest
, x
));
2042 op0
= lowpart_subreg (vmode
, op0
, mode
);
2043 x
= gen_rtx_XOR (vmode
, dest
, op0
);
2044 emit_insn (gen_rtx_SET (dest
, x
));
2047 static rtx
ix86_expand_compare (enum rtx_code code
, rtx op0
, rtx op1
);
2050 ix86_expand_branch (enum rtx_code code
, rtx op0
, rtx op1
, rtx label
)
2052 machine_mode mode
= GET_MODE (op0
);
/* Handle special case - vector comparison with boolean result, transform
   it using ptest instruction.  */
2057 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
)
2059 rtx flag
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
2060 machine_mode p_mode
= GET_MODE_SIZE (mode
) == 32 ? V4DImode
: V2DImode
;
2062 gcc_assert (code
== EQ
|| code
== NE
);
2063 /* Generate XOR since we can't check that one operand is zero vector. */
2064 tmp
= gen_reg_rtx (mode
);
2065 emit_insn (gen_rtx_SET (tmp
, gen_rtx_XOR (mode
, op0
, op1
)));
2066 tmp
= gen_lowpart (p_mode
, tmp
);
2067 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode
, FLAGS_REG
),
2068 gen_rtx_UNSPEC (CCmode
,
2069 gen_rtvec (2, tmp
, tmp
),
2071 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, flag
, const0_rtx
);
2072 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2073 gen_rtx_LABEL_REF (VOIDmode
, label
),
2075 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
2088 tmp
= ix86_expand_compare (code
, op0
, op1
);
2089 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2090 gen_rtx_LABEL_REF (VOIDmode
, label
),
2092 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
/* For 32-bit target DI comparison may be performed on
   SSE registers.  To allow this we should avoid split
   to SI mode which is achieved by doing xor in DI mode
   and then comparing with zero (which is recognized by
   STV pass).  We don't compare using xor when optimizing
   for size.  */
2104 if (!optimize_insn_for_size_p ()
2106 && (code
== EQ
|| code
== NE
))
2108 op0
= force_reg (mode
, gen_rtx_XOR (mode
, op0
, op1
));
2113 /* Expand DImode branch into multiple compare+branch. */
2116 rtx_code_label
*label2
;
2117 enum rtx_code code1
, code2
, code3
;
2118 machine_mode submode
;
2120 if (CONSTANT_P (op0
) && !CONSTANT_P (op1
))
2122 std::swap (op0
, op1
);
2123 code
= swap_condition (code
);
2126 split_double_mode (mode
, &op0
, 1, lo
+0, hi
+0);
2127 split_double_mode (mode
, &op1
, 1, lo
+1, hi
+1);
2129 submode
= mode
== DImode
? SImode
: DImode
;
2131 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
2132 avoid two branches. This costs one extra insn, so disable when
2133 optimizing for size. */
2135 if ((code
== EQ
|| code
== NE
)
2136 && (!optimize_insn_for_size_p ()
2137 || hi
[1] == const0_rtx
|| lo
[1] == const0_rtx
))
2142 if (hi
[1] != const0_rtx
)
2143 xor1
= expand_binop (submode
, xor_optab
, xor1
, hi
[1],
2144 NULL_RTX
, 0, OPTAB_WIDEN
);
2147 if (lo
[1] != const0_rtx
)
2148 xor0
= expand_binop (submode
, xor_optab
, xor0
, lo
[1],
2149 NULL_RTX
, 0, OPTAB_WIDEN
);
2151 tmp
= expand_binop (submode
, ior_optab
, xor1
, xor0
,
2152 NULL_RTX
, 0, OPTAB_WIDEN
);
2154 ix86_expand_branch (code
, tmp
, const0_rtx
, label
);
2158 /* Otherwise, if we are doing less-than or greater-or-equal-than,
2159 op1 is a constant and the low word is zero, then we can just
2160 examine the high word. Similarly for low word -1 and
2161 less-or-equal-than or greater-than. */
2163 if (CONST_INT_P (hi
[1]))
2166 case LT
: case LTU
: case GE
: case GEU
:
2167 if (lo
[1] == const0_rtx
)
2169 ix86_expand_branch (code
, hi
[0], hi
[1], label
);
2173 case LE
: case LEU
: case GT
: case GTU
:
2174 if (lo
[1] == constm1_rtx
)
2176 ix86_expand_branch (code
, hi
[0], hi
[1], label
);
2184 /* Emulate comparisons that do not depend on Zero flag with
2185 double-word subtraction. Note that only Overflow, Sign
2186 and Carry flags are valid, so swap arguments and condition
2187 of comparisons that would otherwise test Zero flag. */
2191 case LE
: case LEU
: case GT
: case GTU
:
2192 std::swap (lo
[0], lo
[1]);
2193 std::swap (hi
[0], hi
[1]);
2194 code
= swap_condition (code
);
2197 case LT
: case LTU
: case GE
: case GEU
:
2199 bool uns
= (code
== LTU
|| code
== GEU
);
2200 rtx (*sbb_insn
) (machine_mode
, rtx
, rtx
, rtx
)
2201 = uns
? gen_sub3_carry_ccc
: gen_sub3_carry_ccgz
;
2203 if (!nonimmediate_operand (lo
[0], submode
))
2204 lo
[0] = force_reg (submode
, lo
[0]);
2205 if (!x86_64_general_operand (lo
[1], submode
))
2206 lo
[1] = force_reg (submode
, lo
[1]);
2208 if (!register_operand (hi
[0], submode
))
2209 hi
[0] = force_reg (submode
, hi
[0]);
2210 if ((uns
&& !nonimmediate_operand (hi
[1], submode
))
2211 || (!uns
&& !x86_64_general_operand (hi
[1], submode
)))
2212 hi
[1] = force_reg (submode
, hi
[1]);
2214 emit_insn (gen_cmp_1 (submode
, lo
[0], lo
[1]));
2216 tmp
= gen_rtx_SCRATCH (submode
);
2217 emit_insn (sbb_insn (submode
, tmp
, hi
[0], hi
[1]));
2219 tmp
= gen_rtx_REG (uns
? CCCmode
: CCGZmode
, FLAGS_REG
);
2220 ix86_expand_branch (code
, tmp
, const0_rtx
, label
);
2228 /* Otherwise, we need two or three jumps. */
2230 label2
= gen_label_rtx ();
2233 code2
= swap_condition (code
);
2234 code3
= unsigned_condition (code
);
2238 case LT
: case GT
: case LTU
: case GTU
:
2241 case LE
: code1
= LT
; code2
= GT
; break;
2242 case GE
: code1
= GT
; code2
= LT
; break;
2243 case LEU
: code1
= LTU
; code2
= GTU
; break;
2244 case GEU
: code1
= GTU
; code2
= LTU
; break;
2246 case EQ
: code1
= UNKNOWN
; code2
= NE
; break;
2247 case NE
: code2
= UNKNOWN
; break;
2255 * if (hi(a) < hi(b)) goto true;
2256 * if (hi(a) > hi(b)) goto false;
2257 * if (lo(a) < lo(b)) goto true;
2261 if (code1
!= UNKNOWN
)
2262 ix86_expand_branch (code1
, hi
[0], hi
[1], label
);
2263 if (code2
!= UNKNOWN
)
2264 ix86_expand_branch (code2
, hi
[0], hi
[1], label2
);
2266 ix86_expand_branch (code3
, lo
[0], lo
[1], label
);
2268 if (code2
!= UNKNOWN
)
2269 emit_label (label2
);
2274 gcc_assert (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_CC
);
2279 /* Figure out whether to use unordered fp comparisons. */
2282 ix86_unordered_fp_compare (enum rtx_code code
)
2284 if (!TARGET_IEEE_FP
)
2313 /* Return a comparison we can do and that it is equivalent to
2314 swap_condition (code) apart possibly from orderedness.
2315 But, never change orderedness if TARGET_IEEE_FP, returning
2316 UNKNOWN in that case if necessary. */
2318 static enum rtx_code
2319 ix86_fp_swap_condition (enum rtx_code code
)
2323 case GT
: /* GTU - CF=0 & ZF=0 */
2324 return TARGET_IEEE_FP
? UNKNOWN
: UNLT
;
2325 case GE
: /* GEU - CF=0 */
2326 return TARGET_IEEE_FP
? UNKNOWN
: UNLE
;
2327 case UNLT
: /* LTU - CF=1 */
2328 return TARGET_IEEE_FP
? UNKNOWN
: GT
;
2329 case UNLE
: /* LEU - CF=1 | ZF=1 */
2330 return TARGET_IEEE_FP
? UNKNOWN
: GE
;
2332 return swap_condition (code
);
2336 /* Return cost of comparison CODE using the best strategy for performance.
2337 All following functions do use number of instructions as a cost metrics.
2338 In future this should be tweaked to compute bytes for optimize_size and
2339 take into account performance of various instructions on various CPUs. */
2342 ix86_fp_comparison_cost (enum rtx_code code
)
2346 /* The cost of code using bit-twiddling on %ah. */
2363 arith_cost
= TARGET_IEEE_FP
? 5 : 4;
2367 arith_cost
= TARGET_IEEE_FP
? 6 : 4;
2373 switch (ix86_fp_comparison_strategy (code
))
2375 case IX86_FPCMP_COMI
:
2376 return arith_cost
> 4 ? 3 : 2;
2377 case IX86_FPCMP_SAHF
:
2378 return arith_cost
> 4 ? 4 : 3;
2384 /* Swap, force into registers, or otherwise massage the two operands
2385 to a fp comparison. The operands are updated in place; the new
2386 comparison code is returned. */
2388 static enum rtx_code
2389 ix86_prepare_fp_compare_args (enum rtx_code code
, rtx
*pop0
, rtx
*pop1
)
2391 bool unordered_compare
= ix86_unordered_fp_compare (code
);
2392 rtx op0
= *pop0
, op1
= *pop1
;
2393 machine_mode op_mode
= GET_MODE (op0
);
2394 bool is_sse
= TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (op_mode
);
/* All of the unordered compare instructions only work on registers.
   The same is true of the fcomi compare instructions.  The XFmode
   compare instructions require registers except when comparing
   against zero or when converting operand 1 from fixed point to
   floating point.  */
2403 && (unordered_compare
2404 || (op_mode
== XFmode
2405 && ! (standard_80387_constant_p (op0
) == 1
2406 || standard_80387_constant_p (op1
) == 1)
2407 && GET_CODE (op1
) != FLOAT
)
2408 || ix86_fp_comparison_strategy (code
) == IX86_FPCMP_COMI
))
2410 op0
= force_reg (op_mode
, op0
);
2411 op1
= force_reg (op_mode
, op1
);
2415 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2416 things around if they appear profitable, otherwise force op0
2419 if (standard_80387_constant_p (op0
) == 0
2421 && ! (standard_80387_constant_p (op1
) == 0
2424 enum rtx_code new_code
= ix86_fp_swap_condition (code
);
2425 if (new_code
!= UNKNOWN
)
2427 std::swap (op0
, op1
);
2433 op0
= force_reg (op_mode
, op0
);
2435 if (CONSTANT_P (op1
))
2437 int tmp
= standard_80387_constant_p (op1
);
2439 op1
= validize_mem (force_const_mem (op_mode
, op1
));
2443 op1
= force_reg (op_mode
, op1
);
2446 op1
= force_reg (op_mode
, op1
);
2450 /* Try to rearrange the comparison to make it cheaper. */
2451 if (ix86_fp_comparison_cost (code
)
2452 > ix86_fp_comparison_cost (swap_condition (code
))
2453 && (REG_P (op1
) || can_create_pseudo_p ()))
2455 std::swap (op0
, op1
);
2456 code
= swap_condition (code
);
2458 op0
= force_reg (op_mode
, op0
);
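/* Rough usage sketch (illustrative only, not from the original source):
   a caller rewrites its comparison through this helper before emitting
   the compare, e.g.

     code = ix86_prepare_fp_compare_args (code, &op0, &op1);

   after which op0 is usable as the first operand of the chosen x87/SSE
   compare pattern and CODE may have been swapped to a cheaper or legal
   form.  */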
/* Generate insn patterns to do a floating point compare of OPERANDS.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */
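      /* For reference (illustrative, not from the original source): after
	 fnstsw the x87 condition bits land in AH as C0 -> 0x01,
	 C2 -> 0x04 and C3 -> 0x40.  fcom sets them to

	     ST(0) > src    C3=0 C2=0 C0=0
	     ST(0) < src    C3=0 C2=0 C0=1
	     ST(0) = src    C3=1 C2=0 C0=0
	     unordered      C3=1 C2=1 C0=1

	 so the masks used below are 0x45 = C3|C2|C0, 0x44 = C3|C2,
	 0x40 = C3, 0x05 = C2|C0 and 0x04 = C2.  */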
      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;

	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;

	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;

	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;

	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;

	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable ();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
/* Generate insn patterns to do an integer compare of OPERANDS.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
rtx
ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);

  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
    {
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    }
  else
    ret = ix86_expand_int_compare (code, op0, op1);

  return ret;
}
void
ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
{
  rtx ret;

  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));
}
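/* Illustrative sketch of the RTL this produces (not copied from actual
   dumps): for "dest = (a > b)" the flags are set by the compare and DEST
   receives the QImode flag test directly, roughly

     (set (reg:CCGC flags) (compare:CCGC (reg:SI a) (reg:SI b)))
     (set (reg:QI dest) (gt:QI (reg:CCGC flags) (const_int 0)))

   which a setcc pattern then matches.  */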
/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */

static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1,
				rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)
	return false;

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  && !TARGET_IEEE_FP)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();
      end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)
	return false;

      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }

  if (!INTEGRAL_MODE_P (mode))
    return false;

  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
	return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b-1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
	{
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	    return false;
	  code = (code == GTU ? GEU : LTU);
	}
      else
	{
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);
	}
      break;

    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }

  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
	return false;
      op0 = force_reg (mode, op0);
    }

  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}
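/* A few of the rewrites above, spelled out (illustrative only):

     a == 0            ->  (unsigned) a <  1             LTU, carry set
     a != 0            ->  (unsigned) a >= 1             GEU, carry clear
     a >  b  (GTU)     ->  (unsigned) a >= b + 1   or   b < a
     a <  0  (signed)  ->  (unsigned) a >= 0x80000000
     a >= 0  (signed)  ->  (unsigned) a <  0x80000000

   all of which can be tested with the carry flag alone.  */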
/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */

bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
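/* Rough illustration of the output (not necessarily the exact insns
   chosen): for "if (a < b) x++;" with unsigned operands this expands to

       cmp{l}  b, a        ; CF = (a < b)
       adc{l}  $0, x       ; x += CF

   and a conditional decrement uses sbb with the carry in the same way,
   avoiding both a branch and a setcc/cmov pair.  */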
bool
ix86_expand_int_movcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  rtx compare_op;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  start_sequence ();
  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();
  end_sequence ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (operands[2])
      && CONST_INT_P (operands[3]))
    {
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (operands[2]);
      HOST_WIDE_INT cf = INTVAL (operands[3]);
      HOST_WIDE_INT diff;
2899 /* Sign bit compares are better done using shifts than we do by using
2901 if (sign_bit_compare_p
2902 || ix86_expand_carry_flag_compare (code
, op0
, op1
, &compare_op
))
2904 /* Detect overlap between destination and compare sources. */
2907 if (!sign_bit_compare_p
)
2912 compare_code
= GET_CODE (compare_op
);
2914 flags
= XEXP (compare_op
, 0);
2916 if (GET_MODE (flags
) == CCFPmode
)
2920 = ix86_fp_compare_code_to_integer (compare_code
);
2923 /* To simplify rest of code, restrict to the GEU case. */
2924 if (compare_code
== LTU
)
2927 compare_code
= reverse_condition (compare_code
);
2928 code
= reverse_condition (code
);
2933 PUT_CODE (compare_op
,
2934 reverse_condition_maybe_unordered
2935 (GET_CODE (compare_op
)));
2937 PUT_CODE (compare_op
,
2938 reverse_condition (GET_CODE (compare_op
)));
2942 if (reg_overlap_mentioned_p (out
, op0
)
2943 || reg_overlap_mentioned_p (out
, op1
))
2944 tmp
= gen_reg_rtx (mode
);
2947 emit_insn (gen_x86_movdicc_0_m1 (tmp
, flags
, compare_op
));
2949 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode
, tmp
),
2950 flags
, compare_op
));
2954 if (code
== GT
|| code
== GE
)
2955 code
= reverse_condition (code
);
2961 tmp
= emit_store_flag (tmp
, code
, op0
, op1
, VOIDmode
, 0, -1);
2974 tmp
= expand_simple_binop (mode
, PLUS
,
2976 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
2987 tmp
= expand_simple_binop (mode
, IOR
,
2989 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
2991 else if (diff
== -1 && ct
)
3001 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3003 tmp
= expand_simple_binop (mode
, PLUS
,
3004 copy_rtx (tmp
), GEN_INT (cf
),
3005 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3013 * andl cf - ct, dest
3023 tmp
= expand_simple_unop (mode
, NOT
, tmp
, copy_rtx (tmp
), 1);
3026 tmp
= expand_simple_binop (mode
, AND
,
3028 gen_int_mode (cf
- ct
, mode
),
3029 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3031 tmp
= expand_simple_binop (mode
, PLUS
,
3032 copy_rtx (tmp
), GEN_INT (ct
),
3033 copy_rtx (tmp
), 1, OPTAB_DIRECT
);
3036 if (!rtx_equal_p (tmp
, out
))
3037 emit_move_insn (copy_rtx (out
), copy_rtx (tmp
));
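	  /* The sequence built above is the classic branch-free select
	     (rough sketch in AT&T syntax, not the literal output):

		 cmp{l}  op1, op0        ; CF = (op0 < op1) unsigned
		 sbb{l}  %eax, %eax      ; eax = CF ? -1 : 0
		 and{l}  $(cf - ct), %eax
		 add{l}  $ct, %eax       ; eax = CF ? cf : ct  */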
3044 machine_mode cmp_mode
= GET_MODE (op0
);
3045 enum rtx_code new_code
;
3047 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3049 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3051 /* We may be reversing unordered compare to normal compare, that
3052 is not valid in general (we may convert non-trapping condition
3053 to trapping one), however on i386 we currently emit all
3054 comparisons unordered. */
3055 new_code
= reverse_condition_maybe_unordered (code
);
3058 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3059 if (new_code
!= UNKNOWN
)
3067 compare_code
= UNKNOWN
;
3068 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
3069 && CONST_INT_P (op1
))
3071 if (op1
== const0_rtx
3072 && (code
== LT
|| code
== GE
))
3073 compare_code
= code
;
3074 else if (op1
== constm1_rtx
)
3078 else if (code
== GT
)
3083 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3084 if (compare_code
!= UNKNOWN
3085 && GET_MODE (op0
) == GET_MODE (out
)
3086 && (cf
== -1 || ct
== -1))
3088 /* If lea code below could be used, only optimize
3089 if it results in a 2 insn sequence. */
3091 if (! (diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3092 || diff
== 3 || diff
== 5 || diff
== 9)
3093 || (compare_code
== LT
&& ct
== -1)
3094 || (compare_code
== GE
&& cf
== -1))
3097 * notl op1 (if necessary)
3105 code
= reverse_condition (code
);
3108 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3110 out
= expand_simple_binop (mode
, IOR
,
3112 out
, 1, OPTAB_DIRECT
);
3113 if (out
!= operands
[0])
3114 emit_move_insn (operands
[0], out
);
3121 if ((diff
== 1 || diff
== 2 || diff
== 4 || diff
== 8
3122 || diff
== 3 || diff
== 5 || diff
== 9)
3123 && ((mode
!= QImode
&& mode
!= HImode
) || !TARGET_PARTIAL_REG_STALL
)
3125 || x86_64_immediate_operand (GEN_INT (cf
), VOIDmode
)))
3131 * lea cf(dest*(ct-cf)),dest
3135 * This also catches the degenerate setcc-only case.
3141 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3144 /* On x86_64 the lea instruction operates on Pmode, so we need
3145 to get arithmetics done in proper mode to match. */
3147 tmp
= copy_rtx (out
);
3151 out1
= copy_rtx (out
);
3152 tmp
= gen_rtx_MULT (mode
, out1
, GEN_INT (diff
& ~1));
3156 tmp
= gen_rtx_PLUS (mode
, tmp
, out1
);
3162 tmp
= gen_rtx_PLUS (mode
, tmp
, GEN_INT (cf
));
3165 if (!rtx_equal_p (tmp
, out
))
3168 out
= force_operand (tmp
, copy_rtx (out
));
3170 emit_insn (gen_rtx_SET (copy_rtx (out
), copy_rtx (tmp
)));
3172 if (!rtx_equal_p (out
, operands
[0]))
3173 emit_move_insn (operands
[0], copy_rtx (out
));
3179 * General case: Jumpful:
3180 * xorl dest,dest cmpl op1, op2
3181 * cmpl op1, op2 movl ct, dest
3183 * decl dest movl cf, dest
3184 * andl (cf-ct),dest 1:
3189 * This is reasonably steep, but branch mispredict costs are
3190 * high on modern cpus, so consider failing only if optimizing
3194 if ((!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3195 && BRANCH_COST (optimize_insn_for_speed_p (),
3200 machine_mode cmp_mode
= GET_MODE (op0
);
3201 enum rtx_code new_code
;
3203 if (SCALAR_FLOAT_MODE_P (cmp_mode
))
3205 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode
));
3207 /* We may be reversing unordered compare to normal compare,
3208 that is not valid in general (we may convert non-trapping
3209 condition to trapping one), however on i386 we currently
3210 emit all comparisons unordered. */
3211 new_code
= reverse_condition_maybe_unordered (code
);
3215 new_code
= ix86_reverse_condition (code
, cmp_mode
);
3216 if (compare_code
!= UNKNOWN
&& new_code
!= UNKNOWN
)
3217 compare_code
= reverse_condition (compare_code
);
3220 if (new_code
!= UNKNOWN
)
3228 if (compare_code
!= UNKNOWN
)
3230 /* notl op1 (if needed)
3235 For x < 0 (resp. x <= -1) there will be no notl,
3236 so if possible swap the constants to get rid of the
3238 True/false will be -1/0 while code below (store flag
3239 followed by decrement) is 0/-1, so the constants need
3240 to be exchanged once more. */
3242 if (compare_code
== GE
|| !cf
)
3244 code
= reverse_condition (code
);
3250 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, -1);
3254 out
= emit_store_flag (out
, code
, op0
, op1
, VOIDmode
, 0, 1);
3256 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
),
3258 copy_rtx (out
), 1, OPTAB_DIRECT
);
3261 out
= expand_simple_binop (mode
, AND
, copy_rtx (out
),
3262 gen_int_mode (cf
- ct
, mode
),
3263 copy_rtx (out
), 1, OPTAB_DIRECT
);
3265 out
= expand_simple_binop (mode
, PLUS
, copy_rtx (out
), GEN_INT (ct
),
3266 copy_rtx (out
), 1, OPTAB_DIRECT
);
3267 if (!rtx_equal_p (out
, operands
[0]))
3268 emit_move_insn (operands
[0], copy_rtx (out
));
3274 if (!TARGET_CMOVE
|| (mode
== QImode
&& TARGET_PARTIAL_REG_STALL
))
3276 /* Try a few things more with specific constants and a variable. */
3279 rtx var
, orig_out
, out
, tmp
;
3281 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3284 /* If one of the two operands is an interesting constant, load a
3285 constant with the above and mask it in with a logical operation. */
3287 if (CONST_INT_P (operands
[2]))
3290 if (INTVAL (operands
[2]) == 0 && operands
[3] != constm1_rtx
)
3291 operands
[3] = constm1_rtx
, op
= and_optab
;
3292 else if (INTVAL (operands
[2]) == -1 && operands
[3] != const0_rtx
)
3293 operands
[3] = const0_rtx
, op
= ior_optab
;
3297 else if (CONST_INT_P (operands
[3]))
3300 if (INTVAL (operands
[3]) == 0 && operands
[2] != constm1_rtx
)
3301 operands
[2] = constm1_rtx
, op
= and_optab
;
3302 else if (INTVAL (operands
[3]) == -1 && operands
[3] != const0_rtx
)
3303 operands
[2] = const0_rtx
, op
= ior_optab
;
3310 orig_out
= operands
[0];
3311 tmp
= gen_reg_rtx (mode
);
3314 /* Recurse to get the constant loaded. */
3315 if (!ix86_expand_int_movcc (operands
))
3318 /* Mask in the interesting variable. */
3319 out
= expand_binop (mode
, op
, var
, tmp
, orig_out
, 0,
3321 if (!rtx_equal_p (out
, orig_out
))
3322 emit_move_insn (copy_rtx (orig_out
), copy_rtx (out
));
3328 * For comparison with above,
3338 if (! nonimmediate_operand (operands
[2], mode
))
3339 operands
[2] = force_reg (mode
, operands
[2]);
3340 if (! nonimmediate_operand (operands
[3], mode
))
3341 operands
[3] = force_reg (mode
, operands
[3]);
3343 if (! register_operand (operands
[2], VOIDmode
)
3345 || ! register_operand (operands
[3], VOIDmode
)))
3346 operands
[2] = force_reg (mode
, operands
[2]);
3349 && ! register_operand (operands
[3], VOIDmode
))
3350 operands
[3] = force_reg (mode
, operands
[3]);
3352 emit_insn (compare_seq
);
3353 emit_insn (gen_rtx_SET (operands
[0],
3354 gen_rtx_IF_THEN_ELSE (mode
,
3355 compare_op
, operands
[2],
/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode;
  bool is_min;
  rtx tmp;

  if (code == LT)
    ;
  else if (code == UNGE)
    std::swap (if_true, if_false);
  else
    return false;

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
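/* Background note (illustrative): minps/minss compute "a < b ? a : b" and
   return the *second* operand when the operands are unordered or are
   zeros of opposite sign, so the operand order is semantically
   significant.  That is why only the LT / UNGE shapes are accepted above
   and why the operands are never interchanged when NaNs or signed zeros
   must be honored.  */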
3414 /* Expand an SSE comparison. Return the register with the result. */
3417 ix86_expand_sse_cmp (rtx dest
, enum rtx_code code
, rtx cmp_op0
, rtx cmp_op1
,
3418 rtx op_true
, rtx op_false
)
3420 machine_mode mode
= GET_MODE (dest
);
3421 machine_mode cmp_ops_mode
= GET_MODE (cmp_op0
);
3423 /* In general case result of comparison can differ from operands' type. */
3424 machine_mode cmp_mode
;
3426 /* In AVX512F the result of comparison is an integer mask. */
3427 bool maskcmp
= false;
3430 if (GET_MODE_SIZE (cmp_ops_mode
) == 64)
3432 unsigned int nbits
= GET_MODE_NUNITS (cmp_ops_mode
);
3433 cmp_mode
= int_mode_for_size (nbits
, 0).require ();
3437 cmp_mode
= cmp_ops_mode
;
3439 cmp_op0
= force_reg (cmp_ops_mode
, cmp_op0
);
3441 int (*op1_predicate
)(rtx
, machine_mode
)
3442 = VECTOR_MODE_P (cmp_ops_mode
) ? vector_operand
: nonimmediate_operand
;
3444 if (!op1_predicate (cmp_op1
, cmp_ops_mode
))
3445 cmp_op1
= force_reg (cmp_ops_mode
, cmp_op1
);
3448 || (maskcmp
&& cmp_mode
!= mode
)
3449 || (op_true
&& reg_overlap_mentioned_p (dest
, op_true
))
3450 || (op_false
&& reg_overlap_mentioned_p (dest
, op_false
)))
3451 dest
= gen_reg_rtx (maskcmp
? cmp_mode
: mode
);
3453 /* Compare patterns for int modes are unspec in AVX512F only. */
3454 if (maskcmp
&& (code
== GT
|| code
== EQ
))
3456 rtx (*gen
)(rtx
, rtx
, rtx
);
3458 switch (cmp_ops_mode
)
3461 gcc_assert (TARGET_AVX512BW
);
3462 gen
= code
== GT
? gen_avx512bw_gtv64qi3
: gen_avx512bw_eqv64qi3_1
;
3465 gcc_assert (TARGET_AVX512BW
);
3466 gen
= code
== GT
? gen_avx512bw_gtv32hi3
: gen_avx512bw_eqv32hi3_1
;
3469 gen
= code
== GT
? gen_avx512f_gtv16si3
: gen_avx512f_eqv16si3_1
;
3472 gen
= code
== GT
? gen_avx512f_gtv8di3
: gen_avx512f_eqv8di3_1
;
3480 emit_insn (gen (dest
, cmp_op0
, cmp_op1
));
3484 x
= gen_rtx_fmt_ee (code
, cmp_mode
, cmp_op0
, cmp_op1
);
3486 if (cmp_mode
!= mode
&& !maskcmp
)
3488 x
= force_reg (cmp_ops_mode
, x
);
3489 convert_move (dest
, x
, false);
3492 emit_insn (gen_rtx_SET (dest
, x
));
3497 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3498 operations. This is used for both scalar and vector conditional moves. */
3501 ix86_expand_sse_movcc (rtx dest
, rtx cmp
, rtx op_true
, rtx op_false
)
3503 machine_mode mode
= GET_MODE (dest
);
3504 machine_mode cmpmode
= GET_MODE (cmp
);
3506 /* In AVX512F the result of comparison is an integer mask. */
3507 bool maskcmp
= (mode
!= cmpmode
&& TARGET_AVX512F
);
3511 /* If we have an integer mask and FP value then we need
3512 to cast mask to FP mode. */
3513 if (mode
!= cmpmode
&& VECTOR_MODE_P (cmpmode
))
3515 cmp
= force_reg (cmpmode
, cmp
);
3516 cmp
= gen_rtx_SUBREG (mode
, cmp
, 0);
3521 rtx (*gen
) (rtx
, rtx
) = NULL
;
3522 if ((op_true
== CONST0_RTX (mode
)
3523 && vector_all_ones_operand (op_false
, mode
))
3524 || (op_false
== CONST0_RTX (mode
)
3525 && vector_all_ones_operand (op_true
, mode
)))
3529 if (TARGET_AVX512BW
)
3530 gen
= gen_avx512bw_cvtmask2bv64qi
;
3533 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3534 gen
= gen_avx512vl_cvtmask2bv32qi
;
3537 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3538 gen
= gen_avx512vl_cvtmask2bv16qi
;
3541 if (TARGET_AVX512BW
)
3542 gen
= gen_avx512bw_cvtmask2wv32hi
;
3545 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3546 gen
= gen_avx512vl_cvtmask2wv16hi
;
3549 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
3550 gen
= gen_avx512vl_cvtmask2wv8hi
;
3553 if (TARGET_AVX512DQ
)
3554 gen
= gen_avx512f_cvtmask2dv16si
;
3557 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3558 gen
= gen_avx512vl_cvtmask2dv8si
;
3561 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3562 gen
= gen_avx512vl_cvtmask2dv4si
;
3565 if (TARGET_AVX512DQ
)
3566 gen
= gen_avx512f_cvtmask2qv8di
;
3569 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3570 gen
= gen_avx512vl_cvtmask2qv4di
;
3573 if (TARGET_AVX512VL
&& TARGET_AVX512DQ
)
3574 gen
= gen_avx512vl_cvtmask2qv2di
;
3579 if (gen
&& SCALAR_INT_MODE_P (cmpmode
))
3581 cmp
= force_reg (cmpmode
, cmp
);
3582 if (op_true
== CONST0_RTX (mode
))
3584 rtx (*gen_not
) (rtx
, rtx
);
3587 case E_QImode
: gen_not
= gen_knotqi
; break;
3588 case E_HImode
: gen_not
= gen_knothi
; break;
3589 case E_SImode
: gen_not
= gen_knotsi
; break;
3590 case E_DImode
: gen_not
= gen_knotdi
; break;
3591 default: gcc_unreachable ();
3593 rtx n
= gen_reg_rtx (cmpmode
);
3594 emit_insn (gen_not (n
, cmp
));
3597 emit_insn (gen (dest
, cmp
));
3601 else if (vector_all_ones_operand (op_true
, mode
)
3602 && op_false
== CONST0_RTX (mode
))
3604 emit_insn (gen_rtx_SET (dest
, cmp
));
3607 else if (op_false
== CONST0_RTX (mode
))
3609 op_true
= force_reg (mode
, op_true
);
3610 x
= gen_rtx_AND (mode
, cmp
, op_true
);
3611 emit_insn (gen_rtx_SET (dest
, x
));
3614 else if (op_true
== CONST0_RTX (mode
))
3616 op_false
= force_reg (mode
, op_false
);
3617 x
= gen_rtx_NOT (mode
, cmp
);
3618 x
= gen_rtx_AND (mode
, x
, op_false
);
3619 emit_insn (gen_rtx_SET (dest
, x
));
3622 else if (INTEGRAL_MODE_P (mode
) && op_true
== CONSTM1_RTX (mode
))
3624 op_false
= force_reg (mode
, op_false
);
3625 x
= gen_rtx_IOR (mode
, cmp
, op_false
);
3626 emit_insn (gen_rtx_SET (dest
, x
));
3629 else if (TARGET_XOP
)
3631 op_true
= force_reg (mode
, op_true
);
3633 if (!nonimmediate_operand (op_false
, mode
))
3634 op_false
= force_reg (mode
, op_false
);
3636 emit_insn (gen_rtx_SET (dest
, gen_rtx_IF_THEN_ELSE (mode
, cmp
,
3642 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
3645 if (!vector_operand (op_true
, mode
))
3646 op_true
= force_reg (mode
, op_true
);
3648 op_false
= force_reg (mode
, op_false
);
3654 gen
= gen_sse4_1_blendvps
;
3658 gen
= gen_sse4_1_blendvpd
;
3663 gen
= gen_sse4_1_blendvss
;
3664 op_true
= force_reg (mode
, op_true
);
3670 gen
= gen_sse4_1_blendvsd
;
3671 op_true
= force_reg (mode
, op_true
);
3680 gen
= gen_sse4_1_pblendvb
;
3681 if (mode
!= V16QImode
)
3682 d
= gen_reg_rtx (V16QImode
);
3683 op_false
= gen_lowpart (V16QImode
, op_false
);
3684 op_true
= gen_lowpart (V16QImode
, op_true
);
3685 cmp
= gen_lowpart (V16QImode
, cmp
);
3690 gen
= gen_avx_blendvps256
;
3694 gen
= gen_avx_blendvpd256
;
3702 gen
= gen_avx2_pblendvb
;
3703 if (mode
!= V32QImode
)
3704 d
= gen_reg_rtx (V32QImode
);
3705 op_false
= gen_lowpart (V32QImode
, op_false
);
3706 op_true
= gen_lowpart (V32QImode
, op_true
);
3707 cmp
= gen_lowpart (V32QImode
, cmp
);
3712 gen
= gen_avx512bw_blendmv64qi
;
3715 gen
= gen_avx512bw_blendmv32hi
;
3718 gen
= gen_avx512f_blendmv16si
;
3721 gen
= gen_avx512f_blendmv8di
;
3724 gen
= gen_avx512f_blendmv8df
;
3727 gen
= gen_avx512f_blendmv16sf
;
3736 emit_insn (gen (d
, op_false
, op_true
, cmp
));
3738 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
3742 op_true
= force_reg (mode
, op_true
);
3744 t2
= gen_reg_rtx (mode
);
3746 t3
= gen_reg_rtx (mode
);
3750 x
= gen_rtx_AND (mode
, op_true
, cmp
);
3751 emit_insn (gen_rtx_SET (t2
, x
));
3753 x
= gen_rtx_NOT (mode
, cmp
);
3754 x
= gen_rtx_AND (mode
, x
, op_false
);
3755 emit_insn (gen_rtx_SET (t3
, x
));
3757 x
= gen_rtx_IOR (mode
, t3
, t2
);
3758 emit_insn (gen_rtx_SET (dest
, x
));
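  /* The generic fallback above computes a bitwise select (sketch):

	 dest = (cmp & op_true) | (~cmp & op_false);

     which is correct because CMP is an all-ones / all-zeros mask in every
     element at this point.  */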
/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute commutative
   operators.  The POP0/POP1 operands are updated in place.  The new
   comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
  switch (code)
    {
    case LTGT:
    case UNEQ:
      /* AVX supports all the needed comparisons.  */
      if (TARGET_AVX)
	break;
      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */
      return UNKNOWN;

    case LT:
    case LE:
    case UNGT:
    case UNGE:
      /* These are supported directly.  */
      break;

    case EQ:
    case NE:
    case UNORDERED:
    case ORDERED:
      /* AVX has 3 operand comparisons, no need to swap anything.  */
      if (TARGET_AVX)
	break;
      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))
	break;
      /* FALLTHRU */

    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
      break;
    }

  return code;
}
3826 /* Expand a floating-point conditional move. Return true if successful. */
3829 ix86_expand_fp_movcc (rtx operands
[])
3831 machine_mode mode
= GET_MODE (operands
[0]);
3832 enum rtx_code code
= GET_CODE (operands
[1]);
3833 rtx tmp
, compare_op
;
3834 rtx op0
= XEXP (operands
[1], 0);
3835 rtx op1
= XEXP (operands
[1], 1);
3837 if (TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (mode
))
3841 /* Since we've no cmove for sse registers, don't force bad register
3842 allocation just to gain access to it. Deny movcc when the
3843 comparison mode doesn't match the move mode. */
3844 cmode
= GET_MODE (op0
);
3845 if (cmode
== VOIDmode
)
3846 cmode
= GET_MODE (op1
);
3850 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
, &op0
, &op1
);
3851 if (code
== UNKNOWN
)
3854 if (ix86_expand_sse_fp_minmax (operands
[0], code
, op0
, op1
,
3855 operands
[2], operands
[3]))
3858 tmp
= ix86_expand_sse_cmp (operands
[0], code
, op0
, op1
,
3859 operands
[2], operands
[3]);
3860 ix86_expand_sse_movcc (operands
[0], tmp
, operands
[2], operands
[3]);
3864 if (GET_MODE (op0
) == TImode
3865 || (GET_MODE (op0
) == DImode
3869 /* The floating point conditional move instructions don't directly
3870 support conditions resulting from a signed integer comparison. */
3872 compare_op
= ix86_expand_compare (code
, op0
, op1
);
3873 if (!fcmov_comparison_operator (compare_op
, VOIDmode
))
3875 tmp
= gen_reg_rtx (QImode
);
3876 ix86_expand_setcc (tmp
, code
, op0
, op1
);
3878 compare_op
= ix86_expand_compare (NE
, tmp
, const0_rtx
);
3881 emit_insn (gen_rtx_SET (operands
[0],
3882 gen_rtx_IF_THEN_ELSE (mode
, compare_op
,
3883 operands
[2], operands
[3])));
3888 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
3891 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code
)
3916 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
3919 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code
)
3956 /* Return immediate value to be used in UNSPEC_PCMP
3957 for comparison CODE in MODE. */
3960 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code
, machine_mode mode
)
3962 if (FLOAT_MODE_P (mode
))
3963 return ix86_fp_cmp_code_to_pcmp_immediate (code
);
3964 return ix86_int_cmp_code_to_pcmp_immediate (code
);
3967 /* Expand AVX-512 vector comparison. */
3970 ix86_expand_mask_vec_cmp (rtx operands
[])
3972 machine_mode mask_mode
= GET_MODE (operands
[0]);
3973 machine_mode cmp_mode
= GET_MODE (operands
[2]);
3974 enum rtx_code code
= GET_CODE (operands
[1]);
3975 rtx imm
= GEN_INT (ix86_cmp_code_to_pcmp_immediate (code
, cmp_mode
));
3985 unspec_code
= UNSPEC_UNSIGNED_PCMP
;
3989 unspec_code
= UNSPEC_PCMP
;
3992 unspec
= gen_rtx_UNSPEC (mask_mode
, gen_rtvec (3, operands
[2],
3995 emit_insn (gen_rtx_SET (operands
[0], unspec
));
4000 /* Expand fp vector comparison. */
4003 ix86_expand_fp_vec_cmp (rtx operands
[])
4005 enum rtx_code code
= GET_CODE (operands
[1]);
4008 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4009 &operands
[2], &operands
[3]);
4010 if (code
== UNKNOWN
)
4013 switch (GET_CODE (operands
[1]))
4016 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[2],
4017 operands
[3], NULL
, NULL
);
4018 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[2],
4019 operands
[3], NULL
, NULL
);
4023 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[2],
4024 operands
[3], NULL
, NULL
);
4025 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[2],
4026 operands
[3], NULL
, NULL
);
4032 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4036 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[2], operands
[3],
4037 operands
[1], operands
[2]);
4039 if (operands
[0] != cmp
)
4040 emit_move_insn (operands
[0], cmp
);
4046 ix86_expand_int_sse_cmp (rtx dest
, enum rtx_code code
, rtx cop0
, rtx cop1
,
4047 rtx op_true
, rtx op_false
, bool *negate
)
4049 machine_mode data_mode
= GET_MODE (dest
);
4050 machine_mode mode
= GET_MODE (cop0
);
4055 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4057 && (mode
== V16QImode
|| mode
== V8HImode
4058 || mode
== V4SImode
|| mode
== V2DImode
))
4062 /* Canonicalize the comparison to EQ, GT, GTU. */
4073 code
= reverse_condition (code
);
4079 code
= reverse_condition (code
);
4085 std::swap (cop0
, cop1
);
4086 code
= swap_condition (code
);
4093 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4094 if (mode
== V2DImode
)
4099 /* SSE4.1 supports EQ. */
4106 /* SSE4.2 supports GT/GTU. */
4116 rtx optrue
= op_true
? op_true
: CONSTM1_RTX (data_mode
);
4117 rtx opfalse
= op_false
? op_false
: CONST0_RTX (data_mode
);
4119 std::swap (optrue
, opfalse
);
4121 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4122 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4123 min (x, y) == x). While we add one instruction (the minimum),
4124 we remove the need for two instructions in the negation, as the
4125 result is done this way.
4126 When using masks, do it for SI/DImode element types, as it is shorter
4127 than the two subtractions. */
4129 && GET_MODE_SIZE (mode
) != 64
4130 && vector_all_ones_operand (opfalse
, data_mode
)
4131 && optrue
== CONST0_RTX (data_mode
))
4133 && GET_MODE_SIZE (GET_MODE_INNER (mode
)) >= 4
4134 /* Don't do it if not using integer masks and we'd end up with
4135 the right values in the registers though. */
4136 && (GET_MODE_SIZE (mode
) == 64
4137 || !vector_all_ones_operand (optrue
, data_mode
)
4138 || opfalse
!= CONST0_RTX (data_mode
))))
4140 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4145 gen
= (code
== GTU
) ? gen_uminv16si3
: gen_sminv16si3
;
4148 gen
= (code
== GTU
) ? gen_uminv8di3
: gen_sminv8di3
;
4149 cop0
= force_reg (mode
, cop0
);
4150 cop1
= force_reg (mode
, cop1
);
4154 gen
= (code
== GTU
) ? gen_uminv32qi3
: gen_sminv32qi3
;
4158 gen
= (code
== GTU
) ? gen_uminv16hi3
: gen_sminv16hi3
;
4162 gen
= (code
== GTU
) ? gen_uminv8si3
: gen_sminv8si3
;
4165 if (TARGET_AVX512VL
)
4167 gen
= (code
== GTU
) ? gen_uminv4di3
: gen_sminv4di3
;
4168 cop0
= force_reg (mode
, cop0
);
4169 cop1
= force_reg (mode
, cop1
);
4173 if (code
== GTU
&& TARGET_SSE2
)
4174 gen
= gen_uminv16qi3
;
4175 else if (code
== GT
&& TARGET_SSE4_1
)
4176 gen
= gen_sminv16qi3
;
4179 if (code
== GTU
&& TARGET_SSE4_1
)
4180 gen
= gen_uminv8hi3
;
4181 else if (code
== GT
&& TARGET_SSE2
)
4182 gen
= gen_sminv8hi3
;
4186 gen
= (code
== GTU
) ? gen_uminv4si3
: gen_sminv4si3
;
4189 if (TARGET_AVX512VL
)
4191 gen
= (code
== GTU
) ? gen_uminv2di3
: gen_sminv2di3
;
4192 cop0
= force_reg (mode
, cop0
);
4193 cop1
= force_reg (mode
, cop1
);
4202 rtx tem
= gen_reg_rtx (mode
);
4203 if (!vector_operand (cop0
, mode
))
4204 cop0
= force_reg (mode
, cop0
);
4205 if (!vector_operand (cop1
, mode
))
4206 cop1
= force_reg (mode
, cop1
);
4208 emit_insn (gen (tem
, cop0
, cop1
));
4214 /* Unsigned parallel compare is not supported by the hardware.
4215 Play some tricks to turn this into a signed comparison
4219 cop0
= force_reg (mode
, cop0
);
4232 /* Subtract (-(INT MAX) - 1) from both operands to make
4234 mask
= ix86_build_signbit_mask (mode
, true, false);
4235 t1
= gen_reg_rtx (mode
);
4236 emit_insn (gen_sub3_insn (t1
, cop0
, mask
));
4238 t2
= gen_reg_rtx (mode
);
4239 emit_insn (gen_sub3_insn (t2
, cop1
, mask
));
4253 /* Perform a parallel unsigned saturating subtraction. */
4254 x
= gen_reg_rtx (mode
);
4255 emit_insn (gen_rtx_SET
4256 (x
, gen_rtx_US_MINUS (mode
, cop0
, cop1
)));
4258 cop1
= CONST0_RTX (mode
);
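	  /* Two unsigned-compare rewrites used here, in short (illustrative):

	       a >u b   <->   (a ^ 0x80..0) >s (b ^ 0x80..0)
	       a >u b   <->   (a -us b) != 0      (unsigned saturating sub)

	     the first flips the sign bit of both operands so a signed
	     pcmpgt can be used; the second feeds a psubus result into an
	     equality test against zero, which is then negated.  */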
4270 std::swap (op_true
, op_false
);
4272 /* Allow the comparison to be done in one mode, but the movcc to
4273 happen in another mode. */
4274 if (data_mode
== mode
)
4276 x
= ix86_expand_sse_cmp (dest
, code
, cop0
, cop1
,
4281 gcc_assert (GET_MODE_SIZE (data_mode
) == GET_MODE_SIZE (mode
));
4282 x
= ix86_expand_sse_cmp (gen_reg_rtx (mode
), code
, cop0
, cop1
,
4284 if (GET_MODE (x
) == mode
)
4285 x
= gen_lowpart (data_mode
, x
);
/* Expand integer vector comparison.  */

bool
ix86_expand_int_vec_cmp (rtx operands[])
{
  rtx_code code = GET_CODE (operands[1]);
  bool negate = false;
  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
				     operands[3], NULL, NULL, &negate);

  if (!cmp)
    return false;

  if (negate)
    cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
				   CONST0_RTX (GET_MODE (cmp)),
				   NULL, NULL, &negate);

  gcc_assert (!negate);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
4317 /* Expand a floating-point vector conditional move; a vcond operation
4318 rather than a movcc operation. */
4321 ix86_expand_fp_vcond (rtx operands
[])
4323 enum rtx_code code
= GET_CODE (operands
[3]);
4326 code
= ix86_prepare_sse_fp_compare_args (operands
[0], code
,
4327 &operands
[4], &operands
[5]);
4328 if (code
== UNKNOWN
)
4331 switch (GET_CODE (operands
[3]))
4334 temp
= ix86_expand_sse_cmp (operands
[0], ORDERED
, operands
[4],
4335 operands
[5], operands
[0], operands
[0]);
4336 cmp
= ix86_expand_sse_cmp (operands
[0], NE
, operands
[4],
4337 operands
[5], operands
[1], operands
[2]);
4341 temp
= ix86_expand_sse_cmp (operands
[0], UNORDERED
, operands
[4],
4342 operands
[5], operands
[0], operands
[0]);
4343 cmp
= ix86_expand_sse_cmp (operands
[0], EQ
, operands
[4],
4344 operands
[5], operands
[1], operands
[2]);
4350 cmp
= expand_simple_binop (GET_MODE (cmp
), code
, temp
, cmp
, cmp
, 1,
4352 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4356 if (ix86_expand_sse_fp_minmax (operands
[0], code
, operands
[4],
4357 operands
[5], operands
[1], operands
[2]))
4360 cmp
= ix86_expand_sse_cmp (operands
[0], code
, operands
[4], operands
[5],
4361 operands
[1], operands
[2]);
4362 ix86_expand_sse_movcc (operands
[0], cmp
, operands
[1], operands
[2]);
4366 /* Expand a signed/unsigned integral vector conditional move. */
4369 ix86_expand_int_vcond (rtx operands
[])
4371 machine_mode data_mode
= GET_MODE (operands
[0]);
4372 machine_mode mode
= GET_MODE (operands
[4]);
4373 enum rtx_code code
= GET_CODE (operands
[3]);
4374 bool negate
= false;
4380 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
4381 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
4382 if ((code
== LT
|| code
== GE
)
4383 && data_mode
== mode
4384 && cop1
== CONST0_RTX (mode
)
4385 && operands
[1 + (code
== LT
)] == CONST0_RTX (data_mode
)
4386 && GET_MODE_UNIT_SIZE (data_mode
) > 1
4387 && GET_MODE_UNIT_SIZE (data_mode
) <= 8
4388 && (GET_MODE_SIZE (data_mode
) == 16
4389 || (TARGET_AVX2
&& GET_MODE_SIZE (data_mode
) == 32)))
4391 rtx negop
= operands
[2 - (code
== LT
)];
4392 int shift
= GET_MODE_UNIT_BITSIZE (data_mode
) - 1;
4393 if (negop
== CONST1_RTX (data_mode
))
4395 rtx res
= expand_simple_binop (mode
, LSHIFTRT
, cop0
, GEN_INT (shift
),
4396 operands
[0], 1, OPTAB_DIRECT
);
4397 if (res
!= operands
[0])
4398 emit_move_insn (operands
[0], res
);
4401 else if (GET_MODE_INNER (data_mode
) != DImode
4402 && vector_all_ones_operand (negop
, data_mode
))
4404 rtx res
= expand_simple_binop (mode
, ASHIFTRT
, cop0
, GEN_INT (shift
),
4405 operands
[0], 0, OPTAB_DIRECT
);
4406 if (res
!= operands
[0])
4407 emit_move_insn (operands
[0], res
);
4412 if (!nonimmediate_operand (cop1
, mode
))
4413 cop1
= force_reg (mode
, cop1
);
4414 if (!general_operand (operands
[1], data_mode
))
4415 operands
[1] = force_reg (data_mode
, operands
[1]);
4416 if (!general_operand (operands
[2], data_mode
))
4417 operands
[2] = force_reg (data_mode
, operands
[2]);
4419 x
= ix86_expand_int_sse_cmp (operands
[0], code
, cop0
, cop1
,
4420 operands
[1], operands
[2], &negate
);
4425 ix86_expand_sse_movcc (operands
[0], x
, operands
[1+negate
],
4426 operands
[2-negate
]);
4431 ix86_expand_vec_perm_vpermt2 (rtx target
, rtx mask
, rtx op0
, rtx op1
,
4432 struct expand_vec_perm_d
*d
)
4434 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4435 expander, so args are either in d, or in op0, op1 etc. */
4436 machine_mode mode
= GET_MODE (d
? d
->op0
: op0
);
4437 machine_mode maskmode
= mode
;
4438 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
4443 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4444 gen
= gen_avx512vl_vpermt2varv8hi3
;
4447 if (TARGET_AVX512VL
&& TARGET_AVX512BW
)
4448 gen
= gen_avx512vl_vpermt2varv16hi3
;
4451 if (TARGET_AVX512VBMI
)
4452 gen
= gen_avx512bw_vpermt2varv64qi3
;
4455 if (TARGET_AVX512BW
)
4456 gen
= gen_avx512bw_vpermt2varv32hi3
;
4459 if (TARGET_AVX512VL
)
4460 gen
= gen_avx512vl_vpermt2varv4si3
;
4463 if (TARGET_AVX512VL
)
4464 gen
= gen_avx512vl_vpermt2varv8si3
;
4468 gen
= gen_avx512f_vpermt2varv16si3
;
4471 if (TARGET_AVX512VL
)
4473 gen
= gen_avx512vl_vpermt2varv4sf3
;
4474 maskmode
= V4SImode
;
4478 if (TARGET_AVX512VL
)
4480 gen
= gen_avx512vl_vpermt2varv8sf3
;
4481 maskmode
= V8SImode
;
4487 gen
= gen_avx512f_vpermt2varv16sf3
;
4488 maskmode
= V16SImode
;
4492 if (TARGET_AVX512VL
)
4493 gen
= gen_avx512vl_vpermt2varv2di3
;
4496 if (TARGET_AVX512VL
)
4497 gen
= gen_avx512vl_vpermt2varv4di3
;
4501 gen
= gen_avx512f_vpermt2varv8di3
;
4504 if (TARGET_AVX512VL
)
4506 gen
= gen_avx512vl_vpermt2varv2df3
;
4507 maskmode
= V2DImode
;
4511 if (TARGET_AVX512VL
)
4513 gen
= gen_avx512vl_vpermt2varv4df3
;
4514 maskmode
= V4DImode
;
4520 gen
= gen_avx512f_vpermt2varv8df3
;
4521 maskmode
= V8DImode
;
4531 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
4532 expander, so args are either in d, or in op0, op1 etc. */
4539 for (int i
= 0; i
< d
->nelt
; ++i
)
4540 vec
[i
] = GEN_INT (d
->perm
[i
]);
4541 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
4544 emit_insn (gen (target
, force_reg (maskmode
, mask
), op0
, op1
));
4548 /* Expand a variable vector permutation. */
4551 ix86_expand_vec_perm (rtx operands
[])
4553 rtx target
= operands
[0];
4554 rtx op0
= operands
[1];
4555 rtx op1
= operands
[2];
4556 rtx mask
= operands
[3];
4557 rtx t1
, t2
, t3
, t4
, t5
, t6
, t7
, t8
, vt
, vt2
, vec
[32];
4558 machine_mode mode
= GET_MODE (op0
);
4559 machine_mode maskmode
= GET_MODE (mask
);
4561 bool one_operand_shuffle
= rtx_equal_p (op0
, op1
);
4563 /* Number of elements in the vector. */
4564 w
= GET_MODE_NUNITS (mode
);
4565 e
= GET_MODE_UNIT_SIZE (mode
);
4566 gcc_assert (w
<= 64);
4568 if (TARGET_AVX512F
&& one_operand_shuffle
)
4570 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
4574 gen
=gen_avx512f_permvarv16si
;
4577 gen
= gen_avx512f_permvarv16sf
;
4580 gen
= gen_avx512f_permvarv8di
;
4583 gen
= gen_avx512f_permvarv8df
;
4590 emit_insn (gen (target
, op0
, mask
));
4595 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
4600 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
4602 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
4603 an constant shuffle operand. With a tiny bit of effort we can
4604 use VPERMD instead. A re-interpretation stall for V4DFmode is
4605 unfortunate but there's no avoiding it.
4606 Similarly for V16HImode we don't have instructions for variable
4607 shuffling, while for V32QImode we can use after preparing suitable
4608 masks vpshufb; vpshufb; vpermq; vpor. */
4610 if (mode
== V16HImode
)
4612 maskmode
= mode
= V32QImode
;
4618 maskmode
= mode
= V8SImode
;
4622 t1
= gen_reg_rtx (maskmode
);
4624 /* Replicate the low bits of the V4DImode mask into V8SImode:
4626 t1 = { A A B B C C D D }. */
4627 for (i
= 0; i
< w
/ 2; ++i
)
4628 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
4629 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4630 vt
= force_reg (maskmode
, vt
);
4631 mask
= gen_lowpart (maskmode
, mask
);
4632 if (maskmode
== V8SImode
)
4633 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
4635 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
4637 /* Multiply the shuffle indicies by two. */
4638 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
4641 /* Add one to the odd shuffle indicies:
4642 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
4643 for (i
= 0; i
< w
/ 2; ++i
)
4645 vec
[i
* 2] = const0_rtx
;
4646 vec
[i
* 2 + 1] = const1_rtx
;
4648 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
4649 vt
= validize_mem (force_const_mem (maskmode
, vt
));
4650 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
4653 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
4654 operands
[3] = mask
= t1
;
4655 target
= gen_reg_rtx (mode
);
4656 op0
= gen_lowpart (mode
, op0
);
4657 op1
= gen_lowpart (mode
, op1
);
4663 /* The VPERMD and VPERMPS instructions already properly ignore
4664 the high bits of the shuffle elements. No need for us to
4665 perform an AND ourselves. */
4666 if (one_operand_shuffle
)
4668 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
4669 if (target
!= operands
[0])
4670 emit_move_insn (operands
[0],
4671 gen_lowpart (GET_MODE (operands
[0]), target
));
4675 t1
= gen_reg_rtx (V8SImode
);
4676 t2
= gen_reg_rtx (V8SImode
);
4677 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
4678 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
4684 mask
= gen_lowpart (V8SImode
, mask
);
4685 if (one_operand_shuffle
)
4686 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
4689 t1
= gen_reg_rtx (V8SFmode
);
4690 t2
= gen_reg_rtx (V8SFmode
);
4691 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
4692 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
4698 /* By combining the two 128-bit input vectors into one 256-bit
4699 input vector, we can use VPERMD and VPERMPS for the full
4700 two-operand shuffle. */
4701 t1
= gen_reg_rtx (V8SImode
);
4702 t2
= gen_reg_rtx (V8SImode
);
4703 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
4704 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4705 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
4706 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
4710 t1
= gen_reg_rtx (V8SFmode
);
4711 t2
= gen_reg_rtx (V8SImode
);
4712 mask
= gen_lowpart (V4SImode
, mask
);
4713 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
4714 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
4715 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
4716 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
4720 t1
= gen_reg_rtx (V32QImode
);
4721 t2
= gen_reg_rtx (V32QImode
);
4722 t3
= gen_reg_rtx (V32QImode
);
4723 vt2
= GEN_INT (-128);
4724 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
4725 vt
= force_reg (V32QImode
, vt
);
4726 for (i
= 0; i
< 32; i
++)
4727 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
4728 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
4729 vt2
= force_reg (V32QImode
, vt2
);
4730 /* From mask create two adjusted masks, which contain the same
4731 bits as mask in the low 7 bits of each vector element.
4732 The first mask will have the most significant bit clear
4733 if it requests element from the same 128-bit lane
4734 and MSB set if it requests element from the other 128-bit lane.
4735 The second mask will have the opposite values of the MSB,
4736 and additionally will have its 128-bit lanes swapped.
4737 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
4738 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
4739 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
4740 stands for other 12 bytes. */
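	 Put differently (rough sketch, one-operand case): the code below
	 performs two vpshufb shuffles, one with each adjusted mask (one of
	 them on a lane-swapped copy), so that every result byte is
	 produced by exactly one of the shuffles and is zero in the other;
	 a final vpor merges them:

	     result = pshufb (op0, maskA) | swap_lanes (pshufb (op0, maskB))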
4741 /* The bit whether element is from the same lane or the other
4742 lane is bit 4, so shift it up by 3 to the MSB position. */
4743 t5
= gen_reg_rtx (V4DImode
);
4744 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
4746 /* Clear MSB bits from the mask just in case it had them set. */
4747 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
4748 /* After this t1 will have MSB set for elements from other lane. */
4749 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
4750 /* Clear bits other than MSB. */
4751 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
4752 /* Or in the lower bits from mask into t3. */
4753 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
4754 /* And invert MSB bits in t1, so MSB is set for elements from the same
4756 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
4757 /* Swap 128-bit lanes in t3. */
4758 t6
= gen_reg_rtx (V4DImode
);
4759 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
4760 const2_rtx
, GEN_INT (3),
4761 const0_rtx
, const1_rtx
));
4762 /* And or in the lower bits from mask into t1. */
4763 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
4764 if (one_operand_shuffle
)
4766 /* Each of these shuffles will put 0s in places where
4767 element from the other 128-bit lane is needed, otherwise
4768 will shuffle in the requested value. */
4769 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
4770 gen_lowpart (V32QImode
, t6
)));
4771 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
4772 /* For t3 the 128-bit lanes are swapped again. */
4773 t7
= gen_reg_rtx (V4DImode
);
4774 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
4775 const2_rtx
, GEN_INT (3),
4776 const0_rtx
, const1_rtx
));
4777 /* And oring both together leads to the result. */
4778 emit_insn (gen_iorv32qi3 (target
, t1
,
4779 gen_lowpart (V32QImode
, t7
)));
4780 if (target
!= operands
[0])
4781 emit_move_insn (operands
[0],
4782 gen_lowpart (GET_MODE (operands
[0]), target
));
4786 t4
= gen_reg_rtx (V32QImode
);
4787 /* Similarly to the above one_operand_shuffle code,
4788 just for repeated twice for each operand. merge_two:
4789 code will merge the two results together. */
4790 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
4791 gen_lowpart (V32QImode
, t6
)));
4792 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
4793 gen_lowpart (V32QImode
, t6
)));
4794 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
4795 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
4796 t7
= gen_reg_rtx (V4DImode
);
4797 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
4798 const2_rtx
, GEN_INT (3),
4799 const0_rtx
, const1_rtx
));
4800 t8
= gen_reg_rtx (V4DImode
);
4801 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
4802 const2_rtx
, GEN_INT (3),
4803 const0_rtx
, const1_rtx
));
4804 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
4805 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
4811 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
4818 /* The XOP VPPERM insn supports three inputs. By ignoring the
4819 one_operand_shuffle special case, we avoid creating another
4820 set of constant vectors in memory. */
4821 one_operand_shuffle
= false;
4823 /* mask = mask & {2*w-1, ...} */
4824 vt
= GEN_INT (2*w
- 1);
4828 /* mask = mask & {w-1, ...} */
4829 vt
= GEN_INT (w
- 1);
4832 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4833 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4834 NULL_RTX
, 0, OPTAB_DIRECT
);
4836 /* For non-QImode operations, convert the word permutation control
4837 into a byte permutation control. */
4838 if (mode
!= V16QImode
)
4840 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
4841 GEN_INT (exact_log2 (e
)),
4842 NULL_RTX
, 0, OPTAB_DIRECT
);
4844 /* Convert mask to vector of chars. */
4845 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
4847 /* Replicate each of the input bytes into byte positions:
4848 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
4849 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
4850 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
4851 for (i
= 0; i
< 16; ++i
)
4852 vec
[i
] = GEN_INT (i
/e
* e
);
4853 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4854 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4856 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
4858 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
4860 /* Convert it into the byte positions by doing
4861 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
4862 for (i
= 0; i
< 16; ++i
)
4863 vec
[i
] = GEN_INT (i
% e
);
4864 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
4865 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
4866 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
4869 /* The actual shuffle operations all operate on V16QImode. */
4870 op0
= gen_lowpart (V16QImode
, op0
);
4871 op1
= gen_lowpart (V16QImode
, op1
);
4875 if (GET_MODE (target
) != V16QImode
)
4876 target
= gen_reg_rtx (V16QImode
);
4877 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
4878 if (target
!= operands
[0])
4879 emit_move_insn (operands
[0],
4880 gen_lowpart (GET_MODE (operands
[0]), target
));
4882 else if (one_operand_shuffle
)
4884 if (GET_MODE (target
) != V16QImode
)
4885 target
= gen_reg_rtx (V16QImode
);
4886 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
4887 if (target
!= operands
[0])
4888 emit_move_insn (operands
[0],
4889 gen_lowpart (GET_MODE (operands
[0]), target
));
4896 /* Shuffle the two input vectors independently. */
4897 t1
= gen_reg_rtx (V16QImode
);
4898 t2
= gen_reg_rtx (V16QImode
);
4899 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
4900 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
4903 /* Then merge them together. The key is whether any given control
4904 element contained a bit set that indicates the second word. */
4907 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
4909 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
4910 more shuffle to convert the V2DI input mask into a V4SI
4911 input mask. At which point the masking that expand_int_vcond
4912 will work as desired. */
4913 rtx t3
= gen_reg_rtx (V4SImode
);
4914 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
4915 const0_rtx
, const0_rtx
,
4916 const2_rtx
, const2_rtx
));
4918 maskmode
= V4SImode
;
4922 vt
= gen_const_vec_duplicate (maskmode
, vt
);
4923 vt
= force_reg (maskmode
, vt
);
4924 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
4925 NULL_RTX
, 0, OPTAB_DIRECT
);
4927 if (GET_MODE (target
) != mode
)
4928 target
= gen_reg_rtx (mode
);
4930 xops
[1] = gen_lowpart (mode
, t2
);
4931 xops
[2] = gen_lowpart (mode
, t1
);
4932 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
4935 ok
= ix86_expand_int_vcond (xops
);
4937 if (target
!= operands
[0])
4938 emit_move_insn (operands
[0],
4939 gen_lowpart (GET_MODE (operands
[0]), target
));
4943 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
4944 true if we should do zero extension, else sign extension. HIGH_P is
4945 true if we want the N/2 high elements, else the low elements. */
4948 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
4950 machine_mode imode
= GET_MODE (src
);
4955 rtx (*unpack
)(rtx
, rtx
);
4956 rtx (*extract
)(rtx
, rtx
) = NULL
;
4957 machine_mode halfmode
= BLKmode
;
4963 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
4965 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
4966 halfmode
= V32QImode
;
4968 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
4972 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
4974 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
4975 halfmode
= V16QImode
;
4977 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
4981 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
4983 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
4984 halfmode
= V16HImode
;
4986 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
4990 unpack
= gen_avx2_zero_extendv8hiv8si2
;
4992 unpack
= gen_avx2_sign_extendv8hiv8si2
;
4993 halfmode
= V8HImode
;
4995 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
4999 unpack
= gen_avx512f_zero_extendv8siv8di2
;
5001 unpack
= gen_avx512f_sign_extendv8siv8di2
;
5002 halfmode
= V8SImode
;
5004 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
5008 unpack
= gen_avx2_zero_extendv4siv4di2
;
5010 unpack
= gen_avx2_sign_extendv4siv4di2
;
5011 halfmode
= V4SImode
;
5013 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
5017 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
5019 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
5023 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
5025 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5029 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5031 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5037 if (GET_MODE_SIZE (imode
) >= 32)
5039 tmp
= gen_reg_rtx (halfmode
);
5040 emit_insn (extract (tmp
, src
));
5044 /* Shift higher 8 bytes to lower 8 bytes. */
5045 tmp
= gen_reg_rtx (V1TImode
);
5046 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5048 tmp
= gen_lowpart (imode
, tmp
);
5053 emit_insn (unpack (dest
, tmp
));
5057 rtx (*unpack
)(rtx
, rtx
, rtx
);
5063 unpack
= gen_vec_interleave_highv16qi
;
5065 unpack
= gen_vec_interleave_lowv16qi
;
5069 unpack
= gen_vec_interleave_highv8hi
;
5071 unpack
= gen_vec_interleave_lowv8hi
;
5075 unpack
= gen_vec_interleave_highv4si
;
5077 unpack
= gen_vec_interleave_lowv4si
;
5084 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5086 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5087 src
, pc_rtx
, pc_rtx
);
5089 rtx tmp2
= gen_reg_rtx (imode
);
5090 emit_insn (unpack (tmp2
, src
, tmp
));
5091 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
5095 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
5096 but works for floating pointer parameters and nonoffsetable memories.
5097 For pushes, it returns just stack offsets; the values will be saved
5098 in the right order. Maximally three parts are generated. */
5101 ix86_split_to_parts (rtx operand
, rtx
*parts
, machine_mode mode
)
5106 size
= mode
==XFmode
? 3 : GET_MODE_SIZE (mode
) / 4;
5108 size
= (GET_MODE_SIZE (mode
) + 4) / 8;
5110 gcc_assert (!REG_P (operand
) || !MMX_REGNO_P (REGNO (operand
)));
5111 gcc_assert (size
>= 2 && size
<= 4);
5113 /* Optimize constant pool reference to immediates. This is used by fp
5114 moves, that force all constants to memory to allow combining. */
5115 if (MEM_P (operand
) && MEM_READONLY_P (operand
))
5116 operand
= avoid_constant_pool_reference (operand
);
5118 if (MEM_P (operand
) && !offsettable_memref_p (operand
))
5120 /* The only non-offsetable memories we handle are pushes. */
5121 int ok
= push_operand (operand
, VOIDmode
);
5125 operand
= copy_rtx (operand
);
5126 PUT_MODE (operand
, word_mode
);
5127 parts
[0] = parts
[1] = parts
[2] = parts
[3] = operand
;
5131 if (GET_CODE (operand
) == CONST_VECTOR
)
5133 scalar_int_mode imode
= int_mode_for_mode (mode
).require ();
5134 /* Caution: if we looked through a constant pool memory above,
5135 the operand may actually have a different mode now. That's
5136 ok, since we want to pun this all the way back to an integer. */
5137 operand
= simplify_subreg (imode
, operand
, GET_MODE (operand
), 0);
5138 gcc_assert (operand
!= NULL
);
5145 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5150 if (REG_P (operand
))
5152 gcc_assert (reload_completed
);
5153 for (i
= 0; i
< size
; i
++)
5154 parts
[i
] = gen_rtx_REG (SImode
, REGNO (operand
) + i
);
5156 else if (offsettable_memref_p (operand
))
5158 operand
= adjust_address (operand
, SImode
, 0);
5160 for (i
= 1; i
< size
; i
++)
5161 parts
[i
] = adjust_address (operand
, SImode
, 4 * i
);
5163 else if (CONST_DOUBLE_P (operand
))
5165 const REAL_VALUE_TYPE
*r
;
5168 r
= CONST_DOUBLE_REAL_VALUE (operand
);
5172 real_to_target (l
, r
, mode
);
5173 parts
[3] = gen_int_mode (l
[3], SImode
);
5174 parts
[2] = gen_int_mode (l
[2], SImode
);
5177 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5178 long double may not be 80-bit. */
5179 real_to_target (l
, r
, mode
);
5180 parts
[2] = gen_int_mode (l
[2], SImode
);
5183 REAL_VALUE_TO_TARGET_DOUBLE (*r
, l
);
5188 parts
[1] = gen_int_mode (l
[1], SImode
);
5189 parts
[0] = gen_int_mode (l
[0], SImode
);
5198 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5199 if (mode
== XFmode
|| mode
== TFmode
)
5201 machine_mode upper_mode
= mode
==XFmode
? SImode
: DImode
;
5202 if (REG_P (operand
))
5204 gcc_assert (reload_completed
);
5205 parts
[0] = gen_rtx_REG (DImode
, REGNO (operand
) + 0);
5206 parts
[1] = gen_rtx_REG (upper_mode
, REGNO (operand
) + 1);
5208 else if (offsettable_memref_p (operand
))
5210 operand
= adjust_address (operand
, DImode
, 0);
5212 parts
[1] = adjust_address (operand
, upper_mode
, 8);
5214 else if (CONST_DOUBLE_P (operand
))
5218 real_to_target (l
, CONST_DOUBLE_REAL_VALUE (operand
), mode
);
5220 /* real_to_target puts 32-bit pieces in each long. */
5221 parts
[0] = gen_int_mode ((l
[0] & HOST_WIDE_INT_C (0xffffffff))
5222 | ((l
[1] & HOST_WIDE_INT_C (0xffffffff))
5225 if (upper_mode
== SImode
)
5226 parts
[1] = gen_int_mode (l
[2], SImode
);
5229 = gen_int_mode ((l
[2] & HOST_WIDE_INT_C (0xffffffff))
5230 | ((l
[3] & HOST_WIDE_INT_C (0xffffffff))
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */
5247 ix86_split_long_move (rtx operands
[])
5253 machine_mode mode
= GET_MODE (operands
[0]);
5254 bool collisionparts
[4];
5256 /* The DFmode expanders may ask us to move double.
5257 For 64bit target this is single move. By hiding the fact
5258 here we simplify i386.md splitters. */
5259 if (TARGET_64BIT
&& GET_MODE_SIZE (GET_MODE (operands
[0])) == 8)
5261 /* Optimize constant pool reference to immediates. This is used by
5262 fp moves, that force all constants to memory to allow combining. */
5264 if (MEM_P (operands
[1])
5265 && GET_CODE (XEXP (operands
[1], 0)) == SYMBOL_REF
5266 && CONSTANT_POOL_ADDRESS_P (XEXP (operands
[1], 0)))
5267 operands
[1] = get_pool_constant (XEXP (operands
[1], 0));
5268 if (push_operand (operands
[0], VOIDmode
))
5270 operands
[0] = copy_rtx (operands
[0]);
5271 PUT_MODE (operands
[0], word_mode
);
5274 operands
[0] = gen_lowpart (DImode
, operands
[0]);
5275 operands
[1] = gen_lowpart (DImode
, operands
[1]);
5276 emit_move_insn (operands
[0], operands
[1]);
5280 /* The only non-offsettable memory we handle is push. */
5281 if (push_operand (operands
[0], VOIDmode
))
5284 gcc_assert (!MEM_P (operands
[0])
5285 || offsettable_memref_p (operands
[0]));
5287 nparts
= ix86_split_to_parts (operands
[1], part
[1], GET_MODE (operands
[0]));
5288 ix86_split_to_parts (operands
[0], part
[0], GET_MODE (operands
[0]));
5290 /* When emitting push, take care for source operands on the stack. */
5291 if (push
&& MEM_P (operands
[1])
5292 && reg_overlap_mentioned_p (stack_pointer_rtx
, operands
[1]))
5294 rtx src_base
= XEXP (part
[1][nparts
- 1], 0);
5296 /* Compensate for the stack decrement by 4. */
5297 if (!TARGET_64BIT
&& nparts
== 3
5298 && mode
== XFmode
&& TARGET_128BIT_LONG_DOUBLE
)
5299 src_base
= plus_constant (Pmode
, src_base
, 4);
5301 /* src_base refers to the stack pointer and is
5302 automatically decreased by emitted push. */
5303 for (i
= 0; i
< nparts
; i
++)
5304 part
[1][i
] = change_address (part
[1][i
],
5305 GET_MODE (part
[1][i
]), src_base
);
5308 /* We need to do copy in the right order in case an address register
5309 of the source overlaps the destination. */
5310 if (REG_P (part
[0][0]) && MEM_P (part
[1][0]))
5314 for (i
= 0; i
< nparts
; i
++)
5317 = reg_overlap_mentioned_p (part
[0][i
], XEXP (part
[1][0], 0));
5318 if (collisionparts
[i
])
5322 /* Collision in the middle part can be handled by reordering. */
5323 if (collisions
== 1 && nparts
== 3 && collisionparts
[1])
5325 std::swap (part
[0][1], part
[0][2]);
5326 std::swap (part
[1][1], part
[1][2]);
5328 else if (collisions
== 1
5330 && (collisionparts
[1] || collisionparts
[2]))
5332 if (collisionparts
[1])
5334 std::swap (part
[0][1], part
[0][2]);
5335 std::swap (part
[1][1], part
[1][2]);
5339 std::swap (part
[0][2], part
[0][3]);
5340 std::swap (part
[1][2], part
[1][3]);
5344 /* If there are more collisions, we can't handle it by reordering.
5345 Do an lea to the last part and use only one colliding move. */
5346 else if (collisions
> 1)
5352 base
= part
[0][nparts
- 1];
5354 /* Handle the case when the last part isn't valid for lea.
5355 Happens in 64-bit mode storing the 12-byte XFmode. */
5356 if (GET_MODE (base
) != Pmode
)
5357 base
= gen_rtx_REG (Pmode
, REGNO (base
));
5359 addr
= XEXP (part
[1][0], 0);
5360 if (TARGET_TLS_DIRECT_SEG_REFS
)
5362 struct ix86_address parts
;
5363 int ok
= ix86_decompose_address (addr
, &parts
);
5365 /* It is not valid to use %gs: or %fs: in lea. */
5366 gcc_assert (parts
.seg
== ADDR_SPACE_GENERIC
);
5368 emit_insn (gen_rtx_SET (base
, addr
));
5369 part
[1][0] = replace_equiv_address (part
[1][0], base
);
5370 for (i
= 1; i
< nparts
; i
++)
5372 tmp
= plus_constant (Pmode
, base
, UNITS_PER_WORD
* i
);
5373 part
[1][i
] = replace_equiv_address (part
[1][i
], tmp
);
5384 if (TARGET_128BIT_LONG_DOUBLE
&& mode
== XFmode
)
5385 emit_insn (gen_add2_insn (stack_pointer_rtx
, GEN_INT (-4)));
5386 emit_move_insn (part
[0][2], part
[1][2]);
5388 else if (nparts
== 4)
5390 emit_move_insn (part
[0][3], part
[1][3]);
5391 emit_move_insn (part
[0][2], part
[1][2]);
5396 /* In 64bit mode we don't have 32bit push available. In case this is
5397 register, it is OK - we will just use larger counterpart. We also
5398 retype memory - these comes from attempt to avoid REX prefix on
5399 moving of second half of TFmode value. */
5400 if (GET_MODE (part
[1][1]) == SImode
)
5402 switch (GET_CODE (part
[1][1]))
5405 part
[1][1] = adjust_address (part
[1][1], DImode
, 0);
5409 part
[1][1] = gen_rtx_REG (DImode
, REGNO (part
[1][1]));
5416 if (GET_MODE (part
[1][0]) == SImode
)
5417 part
[1][0] = part
[1][1];
5420 emit_move_insn (part
[0][1], part
[1][1]);
5421 emit_move_insn (part
[0][0], part
[1][0]);
5425 /* Choose correct order to not overwrite the source before it is copied. */
5426 if ((REG_P (part
[0][0])
5427 && REG_P (part
[1][1])
5428 && (REGNO (part
[0][0]) == REGNO (part
[1][1])
5430 && REGNO (part
[0][0]) == REGNO (part
[1][2]))
5432 && REGNO (part
[0][0]) == REGNO (part
[1][3]))))
5434 && reg_overlap_mentioned_p (part
[0][0], XEXP (part
[1][0], 0))))
5436 for (i
= 0, j
= nparts
- 1; i
< nparts
; i
++, j
--)
5438 operands
[2 + i
] = part
[0][j
];
5439 operands
[6 + i
] = part
[1][j
];
5444 for (i
= 0; i
< nparts
; i
++)
5446 operands
[2 + i
] = part
[0][i
];
5447 operands
[6 + i
] = part
[1][i
];
5451 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
5452 if (optimize_insn_for_size_p ())
5454 for (j
= 0; j
< nparts
- 1; j
++)
5455 if (CONST_INT_P (operands
[6 + j
])
5456 && operands
[6 + j
] != const0_rtx
5457 && REG_P (operands
[2 + j
]))
5458 for (i
= j
; i
< nparts
- 1; i
++)
5459 if (CONST_INT_P (operands
[7 + i
])
5460 && INTVAL (operands
[7 + i
]) == INTVAL (operands
[6 + j
]))
5461 operands
[7 + i
] = operands
[2 + j
];
5464 for (i
= 0; i
< nparts
; i
++)
5465 emit_move_insn (operands
[2 + i
], operands
[6 + i
]);
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
	  && !optimize_insn_for_size_p ()))
    {
      while (count-- > 0)
	emit_insn (gen_add2_insn (operand, operand));
    }
  else
    {
      rtx (*insn)(rtx, rtx, rtx);

      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}
5494 ix86_split_ashl (rtx
*operands
, rtx scratch
, machine_mode mode
)
5496 rtx (*gen_ashl3
)(rtx
, rtx
, rtx
);
5497 rtx (*gen_shld
)(rtx
, rtx
, rtx
);
5498 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
5499 machine_mode half_mode
;
5501 rtx low
[2], high
[2];
5504 if (CONST_INT_P (operands
[2]))
5506 split_double_mode (mode
, operands
, 2, low
, high
);
5507 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
5509 if (count
>= half_width
)
5511 emit_move_insn (high
[0], low
[1]);
5512 emit_move_insn (low
[0], const0_rtx
);
5514 if (count
> half_width
)
5515 ix86_expand_ashl_const (high
[0], count
- half_width
, mode
);
5519 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
5521 if (!rtx_equal_p (operands
[0], operands
[1]))
5522 emit_move_insn (operands
[0], operands
[1]);
5524 emit_insn (gen_shld (high
[0], low
[0], GEN_INT (count
)));
5525 ix86_expand_ashl_const (low
[0], count
, mode
);
5530 split_double_mode (mode
, operands
, 1, low
, high
);
5531 half_mode
= mode
== DImode
? SImode
: DImode
;
5533 gen_ashl3
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
5535 if (operands
[1] == const1_rtx
)
5537 /* Assuming we've chosen a QImode capable registers, then 1 << N
5538 can be done with two 32/64-bit shifts, no branches, no cmoves. */
5539 if (ANY_QI_REG_P (low
[0]) && ANY_QI_REG_P (high
[0]))
5541 rtx s
, d
, flags
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
5543 ix86_expand_clear (low
[0]);
5544 ix86_expand_clear (high
[0]);
5545 emit_insn (gen_testqi_ccz_1 (operands
[2], GEN_INT (half_width
)));
5547 d
= gen_lowpart (QImode
, low
[0]);
5548 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
5549 s
= gen_rtx_EQ (QImode
, flags
, const0_rtx
);
5550 emit_insn (gen_rtx_SET (d
, s
));
5552 d
= gen_lowpart (QImode
, high
[0]);
5553 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
5554 s
= gen_rtx_NE (QImode
, flags
, const0_rtx
);
5555 emit_insn (gen_rtx_SET (d
, s
));
5558 /* Otherwise, we can get the same results by manually performing
5559 a bit extract operation on bit 5/6, and then performing the two
5560 shifts. The two methods of getting 0/1 into low/high are exactly
5561 the same size. Avoiding the shift in the bit extract case helps
5562 pentium4 a bit; no one else seems to care much either way. */
5565 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
);
5566 rtx (*gen_and3
)(rtx
, rtx
, rtx
);
5567 rtx (*gen_xor3
)(rtx
, rtx
, rtx
);
5573 gen_lshr3
= gen_lshrsi3
;
5574 gen_and3
= gen_andsi3
;
5575 gen_xor3
= gen_xorsi3
;
5580 gen_lshr3
= gen_lshrdi3
;
5581 gen_and3
= gen_anddi3
;
5582 gen_xor3
= gen_xordi3
;
5586 if (TARGET_PARTIAL_REG_STALL
&& !optimize_insn_for_size_p ())
5587 x
= gen_rtx_ZERO_EXTEND (half_mode
, operands
[2]);
5589 x
= gen_lowpart (half_mode
, operands
[2]);
5590 emit_insn (gen_rtx_SET (high
[0], x
));
5592 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (bits
)));
5593 emit_insn (gen_and3 (high
[0], high
[0], const1_rtx
));
5594 emit_move_insn (low
[0], high
[0]);
5595 emit_insn (gen_xor3 (low
[0], low
[0], const1_rtx
));
5598 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
5599 emit_insn (gen_ashl3 (high
[0], high
[0], operands
[2]));
5603 if (operands
[1] == constm1_rtx
)
5605 /* For -1 << N, we can avoid the shld instruction, because we
5606 know that we're shifting 0...31/63 ones into a -1. */
5607 emit_move_insn (low
[0], constm1_rtx
);
5608 if (optimize_insn_for_size_p ())
5609 emit_move_insn (high
[0], low
[0]);
5611 emit_move_insn (high
[0], constm1_rtx
);
5615 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
5617 if (!rtx_equal_p (operands
[0], operands
[1]))
5618 emit_move_insn (operands
[0], operands
[1]);
5620 split_double_mode (mode
, operands
, 1, low
, high
);
5621 emit_insn (gen_shld (high
[0], low
[0], operands
[2]));
5624 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
5626 if (TARGET_CMOVE
&& scratch
)
5628 ix86_expand_clear (scratch
);
5629 emit_insn (gen_x86_shift_adj_1
5630 (half_mode
, high
[0], low
[0], operands
[2], scratch
));
5633 emit_insn (gen_x86_shift_adj_2 (half_mode
, high
[0], low
[0], operands
[2]));
5637 ix86_split_ashr (rtx
*operands
, rtx scratch
, machine_mode mode
)
5639 rtx (*gen_ashr3
)(rtx
, rtx
, rtx
)
5640 = mode
== DImode
? gen_ashrsi3
: gen_ashrdi3
;
5641 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
5642 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
5644 rtx low
[2], high
[2];
5647 if (CONST_INT_P (operands
[2]))
5649 split_double_mode (mode
, operands
, 2, low
, high
);
5650 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
5652 if (count
== GET_MODE_BITSIZE (mode
) - 1)
5654 emit_move_insn (high
[0], high
[1]);
5655 emit_insn (gen_ashr3 (high
[0], high
[0],
5656 GEN_INT (half_width
- 1)));
5657 emit_move_insn (low
[0], high
[0]);
5660 else if (count
>= half_width
)
5662 emit_move_insn (low
[0], high
[1]);
5663 emit_move_insn (high
[0], low
[0]);
5664 emit_insn (gen_ashr3 (high
[0], high
[0],
5665 GEN_INT (half_width
- 1)));
5667 if (count
> half_width
)
5668 emit_insn (gen_ashr3 (low
[0], low
[0],
5669 GEN_INT (count
- half_width
)));
5673 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5675 if (!rtx_equal_p (operands
[0], operands
[1]))
5676 emit_move_insn (operands
[0], operands
[1]);
5678 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
5679 emit_insn (gen_ashr3 (high
[0], high
[0], GEN_INT (count
)));
5684 machine_mode half_mode
;
5686 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5688 if (!rtx_equal_p (operands
[0], operands
[1]))
5689 emit_move_insn (operands
[0], operands
[1]);
5691 split_double_mode (mode
, operands
, 1, low
, high
);
5692 half_mode
= mode
== DImode
? SImode
: DImode
;
5694 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
5695 emit_insn (gen_ashr3 (high
[0], high
[0], operands
[2]));
5697 if (TARGET_CMOVE
&& scratch
)
5699 emit_move_insn (scratch
, high
[0]);
5700 emit_insn (gen_ashr3 (scratch
, scratch
,
5701 GEN_INT (half_width
- 1)));
5702 emit_insn (gen_x86_shift_adj_1
5703 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
5706 emit_insn (gen_x86_shift_adj_3
5707 (half_mode
, low
[0], high
[0], operands
[2]));
5712 ix86_split_lshr (rtx
*operands
, rtx scratch
, machine_mode mode
)
5714 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
)
5715 = mode
== DImode
? gen_lshrsi3
: gen_lshrdi3
;
5716 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
5717 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
5719 rtx low
[2], high
[2];
5722 if (CONST_INT_P (operands
[2]))
5724 split_double_mode (mode
, operands
, 2, low
, high
);
5725 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
5727 if (count
>= half_width
)
5729 emit_move_insn (low
[0], high
[1]);
5730 ix86_expand_clear (high
[0]);
5732 if (count
> half_width
)
5733 emit_insn (gen_lshr3 (low
[0], low
[0],
5734 GEN_INT (count
- half_width
)));
5738 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5740 if (!rtx_equal_p (operands
[0], operands
[1]))
5741 emit_move_insn (operands
[0], operands
[1]);
5743 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
5744 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (count
)));
5749 machine_mode half_mode
;
5751 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
5753 if (!rtx_equal_p (operands
[0], operands
[1]))
5754 emit_move_insn (operands
[0], operands
[1]);
5756 split_double_mode (mode
, operands
, 1, low
, high
);
5757 half_mode
= mode
== DImode
? SImode
: DImode
;
5759 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
5760 emit_insn (gen_lshr3 (high
[0], high
[0], operands
[2]));
5762 if (TARGET_CMOVE
&& scratch
)
5764 ix86_expand_clear (scratch
);
5765 emit_insn (gen_x86_shift_adj_1
5766 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
5769 emit_insn (gen_x86_shift_adj_2
5770 (half_mode
, low
[0], high
[0], operands
[2]));
/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */

static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
/* When ISSETMEM is FALSE, output a simple loop to move memory from SRCPTR
   to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size is
   COUNT, specified in bytes.  When ISSETMEM is TRUE, output the equivalent
   loop to set memory by VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
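/* A rough sketch of the emitted control flow (illustrative only, pseudocode):

     size = count & ~(GET_MODE_SIZE (mode) * unroll - 1);
     if (size == 0) goto out;
     iter = 0;
   top:
     <copy, or store VALUE to, UNROLL chunks of MODE at DESTPTR + iter>
     iter += GET_MODE_SIZE (mode) * unroll;
     if (iter < size) goto top;
     destptr += iter;   (and srcptr += iter for the copy case)
   out:  */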
5799 expand_set_or_cpymem_via_loop (rtx destmem
, rtx srcmem
,
5800 rtx destptr
, rtx srcptr
, rtx value
,
5801 rtx count
, machine_mode mode
, int unroll
,
5802 int expected_size
, bool issetmem
)
5804 rtx_code_label
*out_label
, *top_label
;
5806 machine_mode iter_mode
= counter_mode (count
);
5807 int piece_size_n
= GET_MODE_SIZE (mode
) * unroll
;
5808 rtx piece_size
= GEN_INT (piece_size_n
);
5809 rtx piece_size_mask
= GEN_INT (~((GET_MODE_SIZE (mode
) * unroll
) - 1));
5813 top_label
= gen_label_rtx ();
5814 out_label
= gen_label_rtx ();
5815 iter
= gen_reg_rtx (iter_mode
);
5817 size
= expand_simple_binop (iter_mode
, AND
, count
, piece_size_mask
,
5818 NULL
, 1, OPTAB_DIRECT
);
5819 /* Those two should combine. */
5820 if (piece_size
== const1_rtx
)
5822 emit_cmp_and_jump_insns (size
, const0_rtx
, EQ
, NULL_RTX
, iter_mode
,
5824 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
5826 emit_move_insn (iter
, const0_rtx
);
5828 emit_label (top_label
);
5830 tmp
= convert_modes (Pmode
, iter_mode
, iter
, true);
5832 /* This assert could be relaxed - in this case we'll need to compute
5833 smallest power of two, containing in PIECE_SIZE_N and pass it to
5835 gcc_assert ((piece_size_n
& (piece_size_n
- 1)) == 0);
5836 destmem
= offset_address (destmem
, tmp
, piece_size_n
);
5837 destmem
= adjust_address (destmem
, mode
, 0);
5841 srcmem
= offset_address (srcmem
, copy_rtx (tmp
), piece_size_n
);
5842 srcmem
= adjust_address (srcmem
, mode
, 0);
5844 /* When unrolling for chips that reorder memory reads and writes,
5845 we can save registers by using single temporary.
5846 Also using 4 temporaries is overkill in 32bit mode. */
5847 if (!TARGET_64BIT
&& 0)
5849 for (i
= 0; i
< unroll
; i
++)
5853 destmem
= adjust_address (copy_rtx (destmem
), mode
,
5854 GET_MODE_SIZE (mode
));
5855 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
5856 GET_MODE_SIZE (mode
));
5858 emit_move_insn (destmem
, srcmem
);
5864 gcc_assert (unroll
<= 4);
5865 for (i
= 0; i
< unroll
; i
++)
5867 tmpreg
[i
] = gen_reg_rtx (mode
);
5869 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
5870 GET_MODE_SIZE (mode
));
5871 emit_move_insn (tmpreg
[i
], srcmem
);
5873 for (i
= 0; i
< unroll
; i
++)
5876 destmem
= adjust_address (copy_rtx (destmem
), mode
,
5877 GET_MODE_SIZE (mode
));
5878 emit_move_insn (destmem
, tmpreg
[i
]);
5883 for (i
= 0; i
< unroll
; i
++)
5886 destmem
= adjust_address (copy_rtx (destmem
), mode
,
5887 GET_MODE_SIZE (mode
));
5888 emit_move_insn (destmem
, value
);
5891 tmp
= expand_simple_binop (iter_mode
, PLUS
, iter
, piece_size
, iter
,
5892 true, OPTAB_LIB_WIDEN
);
5894 emit_move_insn (iter
, tmp
);
5896 emit_cmp_and_jump_insns (iter
, size
, LT
, NULL_RTX
, iter_mode
,
5898 if (expected_size
!= -1)
5900 expected_size
/= GET_MODE_SIZE (mode
) * unroll
;
5901 if (expected_size
== 0)
5903 else if (expected_size
> REG_BR_PROB_BASE
)
5904 predict_jump (REG_BR_PROB_BASE
- 1);
5906 predict_jump (REG_BR_PROB_BASE
- (REG_BR_PROB_BASE
+ expected_size
/ 2)
5910 predict_jump (REG_BR_PROB_BASE
* 80 / 100);
5911 iter
= ix86_zero_extend_to_Pmode (iter
);
5912 tmp
= expand_simple_binop (Pmode
, PLUS
, destptr
, iter
, destptr
,
5913 true, OPTAB_LIB_WIDEN
);
5915 emit_move_insn (destptr
, tmp
);
5918 tmp
= expand_simple_binop (Pmode
, PLUS
, srcptr
, iter
, srcptr
,
5919 true, OPTAB_LIB_WIDEN
);
5921 emit_move_insn (srcptr
, tmp
);
5923 emit_label (out_label
);
/* Divide COUNTREG by SCALE.  */

static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
			    GEN_INT (exact_log2 (scale)),
			    NULL, 1, OPTAB_DIRECT);
  return sc;
}
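/* For example, a constant count of 64 scaled by 8 simply becomes GEN_INT (8),
   while a register count is shifted right by exact_log2 (8) == 3 at run time;
   SCALE is expected to be a power of two here.  */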
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */
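/* As an illustration: clearing a block with MODE == DImode on a 64-bit target
   scales the byte count down by 8, builds DESTEXP as
   (destptr + (countreg << 3)) to describe the final destination pointer, and
   emits a single "rep stosq"; the memcpy case does the same with "rep movsq"
   and a matching SRCEXP.  */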
5952 expand_set_or_cpymem_via_rep (rtx destmem
, rtx srcmem
,
5953 rtx destptr
, rtx srcptr
, rtx value
, rtx orig_value
,
5955 machine_mode mode
, bool issetmem
)
5960 HOST_WIDE_INT rounded_count
;
5962 /* If possible, it is shorter to use rep movs.
5963 TODO: Maybe it is better to move this logic to decide_alg. */
5964 if (mode
== QImode
&& CONST_INT_P (count
) && !(INTVAL (count
) & 3)
5965 && (!issetmem
|| orig_value
== const0_rtx
))
5968 if (destptr
!= XEXP (destmem
, 0) || GET_MODE (destmem
) != BLKmode
)
5969 destmem
= adjust_automodify_address_nv (destmem
, BLKmode
, destptr
, 0);
5971 countreg
= ix86_zero_extend_to_Pmode (scale_counter (count
,
5972 GET_MODE_SIZE (mode
)));
5975 destexp
= gen_rtx_ASHIFT (Pmode
, countreg
,
5976 GEN_INT (exact_log2 (GET_MODE_SIZE (mode
))));
5977 destexp
= gen_rtx_PLUS (Pmode
, destexp
, destptr
);
5980 destexp
= gen_rtx_PLUS (Pmode
, destptr
, countreg
);
5981 if ((!issetmem
|| orig_value
== const0_rtx
) && CONST_INT_P (count
))
5984 = ROUND_DOWN (INTVAL (count
), (HOST_WIDE_INT
) GET_MODE_SIZE (mode
));
5985 destmem
= shallow_copy_rtx (destmem
);
5986 set_mem_size (destmem
, rounded_count
);
5988 else if (MEM_SIZE_KNOWN_P (destmem
))
5989 clear_mem_size (destmem
);
5993 value
= force_reg (mode
, gen_lowpart (mode
, value
));
5994 emit_insn (gen_rep_stos (destptr
, countreg
, destmem
, value
, destexp
));
5998 if (srcptr
!= XEXP (srcmem
, 0) || GET_MODE (srcmem
) != BLKmode
)
5999 srcmem
= adjust_automodify_address_nv (srcmem
, BLKmode
, srcptr
, 0);
6002 srcexp
= gen_rtx_ASHIFT (Pmode
, countreg
,
6003 GEN_INT (exact_log2 (GET_MODE_SIZE (mode
))));
6004 srcexp
= gen_rtx_PLUS (Pmode
, srcexp
, srcptr
);
6007 srcexp
= gen_rtx_PLUS (Pmode
, srcptr
, countreg
);
6008 if (CONST_INT_P (count
))
6011 = ROUND_DOWN (INTVAL (count
), (HOST_WIDE_INT
) GET_MODE_SIZE (mode
));
6012 srcmem
= shallow_copy_rtx (srcmem
);
6013 set_mem_size (srcmem
, rounded_count
);
6017 if (MEM_SIZE_KNOWN_P (srcmem
))
6018 clear_mem_size (srcmem
);
6020 emit_insn (gen_rep_mov (destptr
, destmem
, srcptr
, srcmem
, countreg
,
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
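/* Illustrative example: with SIZE_TO_MOVE == 8 and SImode ending up as the
   widest supported move mode, two SImode load/store pairs through a temporary
   register are emitted, and DESTPTR/SRCPTR each advance by 4 after every
   pair.  */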
6030 emit_memmov (rtx destmem
, rtx
*srcmem
, rtx destptr
, rtx srcptr
,
6031 HOST_WIDE_INT size_to_move
)
6033 rtx dst
= destmem
, src
= *srcmem
, adjust
, tempreg
;
6034 enum insn_code code
;
6035 machine_mode move_mode
;
6038 /* Find the widest mode in which we could perform moves.
6039 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
6040 it until move of such size is supported. */
6041 piece_size
= 1 << floor_log2 (size_to_move
);
6042 while (!int_mode_for_size (piece_size
* BITS_PER_UNIT
, 0).exists (&move_mode
)
6043 || (code
= optab_handler (mov_optab
, move_mode
)) == CODE_FOR_nothing
)
6045 gcc_assert (piece_size
> 1);
6049 /* Find the corresponding vector mode with the same size as MOVE_MODE.
6050 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
6051 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
6053 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
6054 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
6055 || (code
= optab_handler (mov_optab
, move_mode
)) == CODE_FOR_nothing
)
6057 move_mode
= word_mode
;
6058 piece_size
= GET_MODE_SIZE (move_mode
);
6059 code
= optab_handler (mov_optab
, move_mode
);
6062 gcc_assert (code
!= CODE_FOR_nothing
);
6064 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
, 0);
6065 src
= adjust_automodify_address_nv (src
, move_mode
, srcptr
, 0);
6067 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
6068 gcc_assert (size_to_move
% piece_size
== 0);
6069 adjust
= GEN_INT (piece_size
);
6070 for (i
= 0; i
< size_to_move
; i
+= piece_size
)
6072 /* We move from memory to memory, so we'll need to do it via
6073 a temporary register. */
6074 tempreg
= gen_reg_rtx (move_mode
);
6075 emit_insn (GEN_FCN (code
) (tempreg
, src
));
6076 emit_insn (GEN_FCN (code
) (dst
, tempreg
));
6078 emit_move_insn (destptr
,
6079 gen_rtx_PLUS (Pmode
, copy_rtx (destptr
), adjust
));
6080 emit_move_insn (srcptr
,
6081 gen_rtx_PLUS (Pmode
, copy_rtx (srcptr
), adjust
));
6083 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
6085 src
= adjust_automodify_address_nv (src
, move_mode
, srcptr
,
6089 /* Update DST and SRC rtx. */
/* Helper function for the string operations below.  Test whether VARIABLE
   is aligned to VALUE bytes.  If true, jump to the label.  */

static rtx_code_label *
ix86_expand_aligntest (rtx variable, int value, bool epilogue)
{
  rtx_code_label *label = gen_label_rtx ();
  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
  if (GET_MODE (variable) == DImode)
    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
  else
    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
			   1, label);
  if (epilogue)
    predict_jump (REG_BR_PROB_BASE * 50 / 100);
  else
    predict_jump (REG_BR_PROB_BASE * 90 / 100);
  return label;
}
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
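/* For a constant count the epilogue is a branch-free decomposition of
   count % max_size: e.g. with max_size == 8 and count == 13, the remaining
   13 % 8 == 5 bytes are copied as one 4-byte and one 1-byte move.  For a
   variable count, bit tests on COUNT guard the individual 4/2/1-byte copies
   (and a byte loop handles larger max_size values).  */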
6119 expand_cpymem_epilogue (rtx destmem
, rtx srcmem
,
6120 rtx destptr
, rtx srcptr
, rtx count
, int max_size
)
6123 if (CONST_INT_P (count
))
6125 HOST_WIDE_INT countval
= INTVAL (count
);
6126 HOST_WIDE_INT epilogue_size
= countval
% max_size
;
6129 /* For now MAX_SIZE should be a power of 2. This assert could be
6130 relaxed, but it'll require a bit more complicated epilogue
6132 gcc_assert ((max_size
& (max_size
- 1)) == 0);
6133 for (i
= max_size
; i
>= 1; i
>>= 1)
6135 if (epilogue_size
& i
)
6136 destmem
= emit_memmov (destmem
, &srcmem
, destptr
, srcptr
, i
);
6142 count
= expand_simple_binop (GET_MODE (count
), AND
, count
, GEN_INT (max_size
- 1),
6143 count
, 1, OPTAB_DIRECT
);
6144 expand_set_or_cpymem_via_loop (destmem
, srcmem
, destptr
, srcptr
, NULL
,
6145 count
, QImode
, 1, 4, false);
6149 /* When there are stringops, we can cheaply increase dest and src pointers.
6150 Otherwise we save code size by maintaining offset (zero is readily
6151 available from preceding rep operation) and using x86 addressing modes.
6153 if (TARGET_SINGLE_STRINGOP
)
6157 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
6158 src
= change_address (srcmem
, SImode
, srcptr
);
6159 dest
= change_address (destmem
, SImode
, destptr
);
6160 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
6162 LABEL_NUSES (label
) = 1;
6166 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
6167 src
= change_address (srcmem
, HImode
, srcptr
);
6168 dest
= change_address (destmem
, HImode
, destptr
);
6169 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
6171 LABEL_NUSES (label
) = 1;
6175 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
6176 src
= change_address (srcmem
, QImode
, srcptr
);
6177 dest
= change_address (destmem
, QImode
, destptr
);
6178 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
6180 LABEL_NUSES (label
) = 1;
6185 rtx offset
= force_reg (Pmode
, const0_rtx
);
6190 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
6191 src
= change_address (srcmem
, SImode
, srcptr
);
6192 dest
= change_address (destmem
, SImode
, destptr
);
6193 emit_move_insn (dest
, src
);
6194 tmp
= expand_simple_binop (Pmode
, PLUS
, offset
, GEN_INT (4), NULL
,
6195 true, OPTAB_LIB_WIDEN
);
6197 emit_move_insn (offset
, tmp
);
6199 LABEL_NUSES (label
) = 1;
6203 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
6204 tmp
= gen_rtx_PLUS (Pmode
, srcptr
, offset
);
6205 src
= change_address (srcmem
, HImode
, tmp
);
6206 tmp
= gen_rtx_PLUS (Pmode
, destptr
, offset
);
6207 dest
= change_address (destmem
, HImode
, tmp
);
6208 emit_move_insn (dest
, src
);
6209 tmp
= expand_simple_binop (Pmode
, PLUS
, offset
, GEN_INT (2), tmp
,
6210 true, OPTAB_LIB_WIDEN
);
6212 emit_move_insn (offset
, tmp
);
6214 LABEL_NUSES (label
) = 1;
6218 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
6219 tmp
= gen_rtx_PLUS (Pmode
, srcptr
, offset
);
6220 src
= change_address (srcmem
, QImode
, tmp
);
6221 tmp
= gen_rtx_PLUS (Pmode
, destptr
, offset
);
6222 dest
= change_address (destmem
, QImode
, tmp
);
6223 emit_move_insn (dest
, src
);
6225 LABEL_NUSES (label
) = 1;
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST.  */
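/* Illustrative example: with an SImode PROMOTED_VAL and SIZE_TO_MOVE == 12,
   three 4-byte stores are emitted via the strset pattern, each of which also
   advances DESTPTR by 4.  */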
6235 emit_memset (rtx destmem
, rtx destptr
, rtx promoted_val
,
6236 HOST_WIDE_INT size_to_move
)
6238 rtx dst
= destmem
, adjust
;
6239 enum insn_code code
;
6240 machine_mode move_mode
;
6243 /* Find the widest mode in which we could perform moves.
6244 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
6245 it until move of such size is supported. */
6246 move_mode
= GET_MODE (promoted_val
);
6247 if (move_mode
== VOIDmode
)
6249 if (size_to_move
< GET_MODE_SIZE (move_mode
))
6251 unsigned int move_bits
= size_to_move
* BITS_PER_UNIT
;
6252 move_mode
= int_mode_for_size (move_bits
, 0).require ();
6253 promoted_val
= gen_lowpart (move_mode
, promoted_val
);
6255 piece_size
= GET_MODE_SIZE (move_mode
);
6256 code
= optab_handler (mov_optab
, move_mode
);
6257 gcc_assert (code
!= CODE_FOR_nothing
&& promoted_val
!= NULL_RTX
);
6259 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
, 0);
6261 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
6262 gcc_assert (size_to_move
% piece_size
== 0);
6263 adjust
= GEN_INT (piece_size
);
6264 for (i
= 0; i
< size_to_move
; i
+= piece_size
)
6266 if (piece_size
<= GET_MODE_SIZE (word_mode
))
6268 emit_insn (gen_strset (destptr
, dst
, promoted_val
));
6269 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
6274 emit_insn (GEN_FCN (code
) (dst
, promoted_val
));
6276 emit_move_insn (destptr
,
6277 gen_rtx_PLUS (Pmode
, copy_rtx (destptr
), adjust
));
6279 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
6283 /* Update DST rtx. */
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */

static void
expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
				 rtx count, int max_size)
{
  count = expand_simple_binop (counter_mode (count), AND, count,
			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
  expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
				 gen_lowpart (QImode, value), count, QImode,
				 1, max_size / 2, true);
}
6298 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
6300 expand_setmem_epilogue (rtx destmem
, rtx destptr
, rtx value
, rtx vec_value
,
6301 rtx count
, int max_size
)
6305 if (CONST_INT_P (count
))
6307 HOST_WIDE_INT countval
= INTVAL (count
);
6308 HOST_WIDE_INT epilogue_size
= countval
% max_size
;
6311 /* For now MAX_SIZE should be a power of 2. This assert could be
6312 relaxed, but it'll require a bit more complicated epilogue
6314 gcc_assert ((max_size
& (max_size
- 1)) == 0);
6315 for (i
= max_size
; i
>= 1; i
>>= 1)
6317 if (epilogue_size
& i
)
6319 if (vec_value
&& i
> GET_MODE_SIZE (GET_MODE (value
)))
6320 destmem
= emit_memset (destmem
, destptr
, vec_value
, i
);
6322 destmem
= emit_memset (destmem
, destptr
, value
, i
);
6329 expand_setmem_epilogue_via_loop (destmem
, destptr
, value
, count
, max_size
);
6334 rtx_code_label
*label
= ix86_expand_aligntest (count
, 16, true);
6337 dest
= change_address (destmem
, DImode
, destptr
);
6338 emit_insn (gen_strset (destptr
, dest
, value
));
6339 dest
= adjust_automodify_address_nv (dest
, DImode
, destptr
, 8);
6340 emit_insn (gen_strset (destptr
, dest
, value
));
6344 dest
= change_address (destmem
, SImode
, destptr
);
6345 emit_insn (gen_strset (destptr
, dest
, value
));
6346 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 4);
6347 emit_insn (gen_strset (destptr
, dest
, value
));
6348 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 8);
6349 emit_insn (gen_strset (destptr
, dest
, value
));
6350 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 12);
6351 emit_insn (gen_strset (destptr
, dest
, value
));
6354 LABEL_NUSES (label
) = 1;
6358 rtx_code_label
*label
= ix86_expand_aligntest (count
, 8, true);
6361 dest
= change_address (destmem
, DImode
, destptr
);
6362 emit_insn (gen_strset (destptr
, dest
, value
));
6366 dest
= change_address (destmem
, SImode
, destptr
);
6367 emit_insn (gen_strset (destptr
, dest
, value
));
6368 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 4);
6369 emit_insn (gen_strset (destptr
, dest
, value
));
6372 LABEL_NUSES (label
) = 1;
6376 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
6377 dest
= change_address (destmem
, SImode
, destptr
);
6378 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (SImode
, value
)));
6380 LABEL_NUSES (label
) = 1;
6384 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
6385 dest
= change_address (destmem
, HImode
, destptr
);
6386 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (HImode
, value
)));
6388 LABEL_NUSES (label
) = 1;
6392 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
6393 dest
= change_address (destmem
, QImode
, destptr
);
6394 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (QImode
, value
)));
6396 LABEL_NUSES (label
) = 1;
/* Adjust COUNTER by the VALUE.  */

static void
ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
{
  emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
}
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   Return value is updated DESTMEM.  */
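/* For instance, with ALIGN == 1 and DESIRED_ALIGNMENT == 8 the loop below
   emits three guarded blocks testing bits 1, 2 and 4 of DESTPTR, copying
   (or storing) 1, 2 and 4 bytes respectively, so that DESTPTR is 8-byte
   aligned when the main loop starts; COUNT is decreased accordingly.  */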
static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx vec_value, rtx count, int align,
			       int desired_alignment, bool issetmem)
{
  int i;
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
	{
	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
	  if (issetmem)
	    {
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	  else
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	  ix86_adjust_counter (count, i);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
	}
    }
  return destmem;
}
/* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
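/* For instance, with SIZE == 4 a block of 4..7 bytes is handled by one 4-byte
   move from the start of the region plus one 4-byte move ending exactly at
   its last byte; the two moves may overlap, which is harmless here.  */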
6447 expand_small_cpymem_or_setmem (rtx destmem
, rtx srcmem
,
6448 rtx destptr
, rtx srcptr
,
6449 rtx value
, rtx vec_value
,
6450 rtx count
, int size
,
6451 rtx done_label
, bool issetmem
)
6453 rtx_code_label
*label
= ix86_expand_aligntest (count
, size
, false);
6454 machine_mode mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 1).else_blk ();
6458 /* If we do not have vector value to copy, we must reduce size. */
6463 if (GET_MODE (value
) == VOIDmode
&& size
> 8)
6465 else if (GET_MODE_SIZE (mode
) > GET_MODE_SIZE (GET_MODE (value
)))
6466 mode
= GET_MODE (value
);
6469 mode
= GET_MODE (vec_value
), value
= vec_value
;
6473 /* Choose appropriate vector mode. */
6475 mode
= TARGET_AVX
? V32QImode
: TARGET_SSE
? V16QImode
: DImode
;
6476 else if (size
>= 16)
6477 mode
= TARGET_SSE
? V16QImode
: DImode
;
6478 srcmem
= change_address (srcmem
, mode
, srcptr
);
6480 destmem
= change_address (destmem
, mode
, destptr
);
6481 modesize
= GEN_INT (GET_MODE_SIZE (mode
));
6482 gcc_assert (GET_MODE_SIZE (mode
) <= size
);
6483 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
6486 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
6489 emit_move_insn (destmem
, srcmem
);
6490 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
6492 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
6495 destmem
= offset_address (destmem
, count
, 1);
6496 destmem
= offset_address (destmem
, GEN_INT (-2 * size
),
6497 GET_MODE_SIZE (mode
));
6500 srcmem
= offset_address (srcmem
, count
, 1);
6501 srcmem
= offset_address (srcmem
, GEN_INT (-2 * size
),
6502 GET_MODE_SIZE (mode
));
6504 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
6507 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
6510 emit_move_insn (destmem
, srcmem
);
6511 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
6513 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
6515 emit_jump_insn (gen_jump (done_label
));
6519 LABEL_NUSES (label
) = 1;
/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
   and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT so we can proceed with a loop copying SIZE bytes
   at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is created
6527 on demand if *DONE_LABEL is NULL.
6528 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
6529 bounds after the initial copies.
   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
6532 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
6533 we will dispatch to a library call for large blocks.
6535 In pseudocode we do:
6539 Assume that SIZE is 4. Bigger sizes are handled analogously
6542 copy 4 bytes from SRCPTR to DESTPTR
6543 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
6548 copy 1 byte from SRCPTR to DESTPTR
6551 copy 2 bytes from SRCPTR to DESTPTR
6552 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
6557 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
6558 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
     OLD_DESTPTR = DESTPTR;
6561 Align DESTPTR up to DESIRED_ALIGN
6562 SRCPTR += DESTPTR - OLD_DESTPTR
6563 COUNT -= DEST_PTR - OLD_DESTPTR
6565 Round COUNT down to multiple of SIZE
6566 << optional caller supplied zero size guard is here >>
6567 << optional caller supplied dynamic check is here >>
6568 << caller supplied main copy loop is here >>
6573 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem
, rtx srcmem
,
6574 rtx
*destptr
, rtx
*srcptr
,
6576 rtx value
, rtx vec_value
,
6578 rtx_code_label
**done_label
,
6582 unsigned HOST_WIDE_INT
*min_size
,
6586 rtx_code_label
*loop_label
= NULL
, *label
;
6589 int prolog_size
= 0;
6592 /* Chose proper value to copy. */
6593 if (issetmem
&& VECTOR_MODE_P (mode
))
6594 mode_value
= vec_value
;
6597 gcc_assert (GET_MODE_SIZE (mode
) <= size
);
6599 /* See if block is big or small, handle small blocks. */
6600 if (!CONST_INT_P (*count
) && *min_size
< (unsigned HOST_WIDE_INT
)size
)
6603 loop_label
= gen_label_rtx ();
6606 *done_label
= gen_label_rtx ();
6608 emit_cmp_and_jump_insns (*count
, GEN_INT (size2
), GE
, 0, GET_MODE (*count
),
6612 /* Handle sizes > 3. */
6613 for (;size2
> 2; size2
>>= 1)
6614 expand_small_cpymem_or_setmem (destmem
, srcmem
,
6618 size2
, *done_label
, issetmem
);
6619 /* Nothing to copy? Jump to DONE_LABEL if so */
6620 emit_cmp_and_jump_insns (*count
, const0_rtx
, EQ
, 0, GET_MODE (*count
),
6623 /* Do a byte copy. */
6624 destmem
= change_address (destmem
, QImode
, *destptr
);
6626 emit_move_insn (destmem
, gen_lowpart (QImode
, value
));
6629 srcmem
= change_address (srcmem
, QImode
, *srcptr
);
6630 emit_move_insn (destmem
, srcmem
);
6633 /* Handle sizes 2 and 3. */
6634 label
= ix86_expand_aligntest (*count
, 2, false);
6635 destmem
= change_address (destmem
, HImode
, *destptr
);
6636 destmem
= offset_address (destmem
, *count
, 1);
6637 destmem
= offset_address (destmem
, GEN_INT (-2), 2);
6639 emit_move_insn (destmem
, gen_lowpart (HImode
, value
));
6642 srcmem
= change_address (srcmem
, HImode
, *srcptr
);
6643 srcmem
= offset_address (srcmem
, *count
, 1);
6644 srcmem
= offset_address (srcmem
, GEN_INT (-2), 2);
6645 emit_move_insn (destmem
, srcmem
);
6649 LABEL_NUSES (label
) = 1;
6650 emit_jump_insn (gen_jump (*done_label
));
6654 gcc_assert (*min_size
>= (unsigned HOST_WIDE_INT
)size
6655 || UINTVAL (*count
) >= (unsigned HOST_WIDE_INT
)size
);
6657 /* Start memcpy for COUNT >= SIZE. */
6660 emit_label (loop_label
);
6661 LABEL_NUSES (loop_label
) = 1;
6664 /* Copy first desired_align bytes. */
6666 srcmem
= change_address (srcmem
, mode
, *srcptr
);
6667 destmem
= change_address (destmem
, mode
, *destptr
);
6668 modesize
= GEN_INT (GET_MODE_SIZE (mode
));
6669 for (n
= 0; prolog_size
< desired_align
- align
; n
++)
6672 emit_move_insn (destmem
, mode_value
);
6675 emit_move_insn (destmem
, srcmem
);
6676 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
6678 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
6679 prolog_size
+= GET_MODE_SIZE (mode
);
6683 /* Copy last SIZE bytes. */
6684 destmem
= offset_address (destmem
, *count
, 1);
6685 destmem
= offset_address (destmem
,
6686 GEN_INT (-size
- prolog_size
),
6689 emit_move_insn (destmem
, mode_value
);
6692 srcmem
= offset_address (srcmem
, *count
, 1);
6693 srcmem
= offset_address (srcmem
,
6694 GEN_INT (-size
- prolog_size
),
6696 emit_move_insn (destmem
, srcmem
);
6698 for (n
= 1; n
* GET_MODE_SIZE (mode
) < size
; n
++)
6700 destmem
= offset_address (destmem
, modesize
, 1);
6702 emit_move_insn (destmem
, mode_value
);
6705 srcmem
= offset_address (srcmem
, modesize
, 1);
6706 emit_move_insn (destmem
, srcmem
);
6710 /* Align destination. */
6711 if (desired_align
> 1 && desired_align
> align
)
6713 rtx saveddest
= *destptr
;
6715 gcc_assert (desired_align
<= size
);
6716 /* Align destptr up, place it to new register. */
6717 *destptr
= expand_simple_binop (GET_MODE (*destptr
), PLUS
, *destptr
,
6718 GEN_INT (prolog_size
),
6719 NULL_RTX
, 1, OPTAB_DIRECT
);
6720 if (REG_P (*destptr
) && REG_P (saveddest
) && REG_POINTER (saveddest
))
6721 REG_POINTER (*destptr
) = 1;
6722 *destptr
= expand_simple_binop (GET_MODE (*destptr
), AND
, *destptr
,
6723 GEN_INT (-desired_align
),
6724 *destptr
, 1, OPTAB_DIRECT
);
6725 /* See how many bytes we skipped. */
6726 saveddest
= expand_simple_binop (GET_MODE (*destptr
), MINUS
, saveddest
,
6728 saveddest
, 1, OPTAB_DIRECT
);
6729 /* Adjust srcptr and count. */
6731 *srcptr
= expand_simple_binop (GET_MODE (*srcptr
), MINUS
, *srcptr
,
6732 saveddest
, *srcptr
, 1, OPTAB_DIRECT
);
6733 *count
= expand_simple_binop (GET_MODE (*count
), PLUS
, *count
,
6734 saveddest
, *count
, 1, OPTAB_DIRECT
);
6735 /* We copied at most size + prolog_size. */
6736 if (*min_size
> (unsigned HOST_WIDE_INT
)(size
+ prolog_size
))
6738 = ROUND_DOWN (*min_size
- size
, (unsigned HOST_WIDE_INT
)size
);
6742 /* Our loops always round down the block size, but for dispatch to
6743 library we need precise value. */
6745 *count
= expand_simple_binop (GET_MODE (*count
), AND
, *count
,
6746 GEN_INT (-size
), *count
, 1, OPTAB_DIRECT
);
6750 gcc_assert (prolog_size
== 0);
6751 /* Decrease count, so we won't end up copying last word twice. */
6752 if (!CONST_INT_P (*count
))
6753 *count
= expand_simple_binop (GET_MODE (*count
), PLUS
, *count
,
6754 constm1_rtx
, *count
, 1, OPTAB_DIRECT
);
6756 *count
= GEN_INT (ROUND_DOWN (UINTVAL (*count
) - 1,
6757 (unsigned HOST_WIDE_INT
)size
));
6759 *min_size
= ROUND_DOWN (*min_size
- 1, (unsigned HOST_WIDE_INT
)size
);
6764 /* This function is like the previous one, except here we know how many bytes
6765 need to be copied. That allows us to update alignment not only of DST, which
6766 is returned, but also of SRC, which is passed as a pointer for that
6769 expand_set_or_cpymem_constant_prologue (rtx dst
, rtx
*srcp
, rtx destreg
,
6770 rtx srcreg
, rtx value
, rtx vec_value
,
6771 int desired_align
, int align_bytes
,
6776 rtx orig_src
= NULL
;
6778 int copied_bytes
= 0;
6782 gcc_assert (srcp
!= NULL
);
6787 for (piece_size
= 1;
6788 piece_size
<= desired_align
&& copied_bytes
< align_bytes
;
6791 if (align_bytes
& piece_size
)
6795 if (vec_value
&& piece_size
> GET_MODE_SIZE (GET_MODE (value
)))
6796 dst
= emit_memset (dst
, destreg
, vec_value
, piece_size
);
6798 dst
= emit_memset (dst
, destreg
, value
, piece_size
);
6801 dst
= emit_memmov (dst
, &src
, destreg
, srcreg
, piece_size
);
6802 copied_bytes
+= piece_size
;
6805 if (MEM_ALIGN (dst
) < (unsigned int) desired_align
* BITS_PER_UNIT
)
6806 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
6807 if (MEM_SIZE_KNOWN_P (orig_dst
))
6808 set_mem_size (dst
, MEM_SIZE (orig_dst
) - align_bytes
);
6812 int src_align_bytes
= get_mem_align_offset (src
, desired_align
6814 if (src_align_bytes
>= 0)
6815 src_align_bytes
= desired_align
- src_align_bytes
;
6816 if (src_align_bytes
>= 0)
6818 unsigned int src_align
;
6819 for (src_align
= desired_align
; src_align
>= 2; src_align
>>= 1)
6821 if ((src_align_bytes
& (src_align
- 1))
6822 == (align_bytes
& (src_align
- 1)))
6825 if (src_align
> (unsigned int) desired_align
)
6826 src_align
= desired_align
;
6827 if (MEM_ALIGN (src
) < src_align
* BITS_PER_UNIT
)
6828 set_mem_align (src
, src_align
* BITS_PER_UNIT
);
6830 if (MEM_SIZE_KNOWN_P (orig_src
))
6831 set_mem_size (src
, MEM_SIZE (orig_src
) - align_bytes
);
/* Return true if ALG can be used in current context.
   Assume we expand memset if MEMSET is true.  */

static bool
alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
{
  if (alg == no_stringop)
    return false;
  if (alg == vector_loop)
    return TARGET_SSE || TARGET_AVX;
  /* Algorithms using the rep prefix want at least edi and ecx;
     additionally, memset wants eax and memcpy wants esi.  Don't
     consider such algorithms if the user has appropriated those
     registers for their own purposes, or if we have a non-default
     address space, since some string insns cannot override the segment.  */
  if (alg == rep_prefix_1_byte
      || alg == rep_prefix_4_byte
      || alg == rep_prefix_8_byte)
    {
      if (have_as)
	return false;
      if (fixed_regs[CX_REG]
	  || fixed_regs[DI_REG]
	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
	return false;
    }
  return true;
}
6866 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
6867 static enum stringop_alg
6868 decide_alg (HOST_WIDE_INT count
, HOST_WIDE_INT expected_size
,
6869 unsigned HOST_WIDE_INT min_size
, unsigned HOST_WIDE_INT max_size
,
6870 bool memset
, bool zero_memset
, bool have_as
,
6871 int *dynamic_check
, bool *noalign
, bool recur
)
6873 const struct stringop_algs
*algs
;
6874 bool optimize_for_speed
;
6876 const struct processor_costs
*cost
;
6878 bool any_alg_usable_p
= false;
6881 *dynamic_check
= -1;
6883 /* Even if the string operation call is cold, we still might spend a lot
6884 of time processing large blocks. */
6885 if (optimize_function_for_size_p (cfun
)
6886 || (optimize_insn_for_size_p ()
6888 || (expected_size
!= -1 && expected_size
< 256))))
6889 optimize_for_speed
= false;
6891 optimize_for_speed
= true;
6893 cost
= optimize_for_speed
? ix86_cost
: &ix86_size_cost
;
6895 algs
= &cost
->memset
[TARGET_64BIT
!= 0];
6897 algs
= &cost
->memcpy
[TARGET_64BIT
!= 0];
6899 /* See maximal size for user defined algorithm. */
6900 for (i
= 0; i
< MAX_STRINGOP_ALGS
; i
++)
6902 enum stringop_alg candidate
= algs
->size
[i
].alg
;
6903 bool usable
= alg_usable_p (candidate
, memset
, have_as
);
6904 any_alg_usable_p
|= usable
;
6906 if (candidate
!= libcall
&& candidate
&& usable
)
6907 max
= algs
->size
[i
].max
;
6910 /* If expected size is not known but max size is small enough
6911 so inline version is a win, set expected size into
6913 if (((max
> 1 && (unsigned HOST_WIDE_INT
) max
>= max_size
) || max
== -1)
6914 && expected_size
== -1)
6915 expected_size
= min_size
/ 2 + max_size
/ 2;
6917 /* If user specified the algorithm, honor it if possible. */
6918 if (ix86_stringop_alg
!= no_stringop
6919 && alg_usable_p (ix86_stringop_alg
, memset
, have_as
))
6920 return ix86_stringop_alg
;
6921 /* rep; movq or rep; movl is the smallest variant. */
6922 else if (!optimize_for_speed
)
6925 if (!count
|| (count
& 3) || (memset
&& !zero_memset
))
6926 return alg_usable_p (rep_prefix_1_byte
, memset
, have_as
)
6927 ? rep_prefix_1_byte
: loop_1_byte
;
6929 return alg_usable_p (rep_prefix_4_byte
, memset
, have_as
)
6930 ? rep_prefix_4_byte
: loop
;
6932 /* Very tiny blocks are best handled via the loop, REP is expensive to
6934 else if (expected_size
!= -1 && expected_size
< 4)
6936 else if (expected_size
!= -1)
6938 enum stringop_alg alg
= libcall
;
6939 bool alg_noalign
= false;
6940 for (i
= 0; i
< MAX_STRINGOP_ALGS
; i
++)
6942 /* We get here if the algorithms that were not libcall-based
6943 were rep-prefix based and we are unable to use rep prefixes
6944 based on global register usage. Break out of the loop and
6945 use the heuristic below. */
6946 if (algs
->size
[i
].max
== 0)
6948 if (algs
->size
[i
].max
>= expected_size
|| algs
->size
[i
].max
== -1)
6950 enum stringop_alg candidate
= algs
->size
[i
].alg
;
6952 if (candidate
!= libcall
6953 && alg_usable_p (candidate
, memset
, have_as
))
6956 alg_noalign
= algs
->size
[i
].noalign
;
6958 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
6959 last non-libcall inline algorithm. */
6960 if (TARGET_INLINE_ALL_STRINGOPS
)
6962 /* When the current size is best to be copied by a libcall,
6963 but we are still forced to inline, run the heuristic below
6964 that will pick code for medium sized blocks. */
6967 *noalign
= alg_noalign
;
6970 else if (!any_alg_usable_p
)
6973 else if (alg_usable_p (candidate
, memset
, have_as
))
6975 *noalign
= algs
->size
[i
].noalign
;
6981 /* When asked to inline the call anyway, try to pick meaningful choice.
6982 We look for maximal size of block that is faster to copy by hand and
6983 take blocks of at most of that size guessing that average size will
6984 be roughly half of the block.
6986 If this turns out to be bad, we might simply specify the preferred
6987 choice in ix86_costs. */
6988 if ((TARGET_INLINE_ALL_STRINGOPS
|| TARGET_INLINE_STRINGOPS_DYNAMICALLY
)
6989 && (algs
->unknown_size
== libcall
6990 || !alg_usable_p (algs
->unknown_size
, memset
, have_as
)))
6992 enum stringop_alg alg
;
6993 HOST_WIDE_INT new_expected_size
= (max
> 0 ? max
: 4096) / 2;
6995 /* If there aren't any usable algorithms or if recursing already,
6996 then recursing on smaller sizes or same size isn't going to
6997 find anything. Just return the simple byte-at-a-time copy loop. */
6998 if (!any_alg_usable_p
|| recur
)
7000 /* Pick something reasonable. */
7001 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY
&& !recur
)
7002 *dynamic_check
= 128;
7005 alg
= decide_alg (count
, new_expected_size
, min_size
, max_size
, memset
,
7006 zero_memset
, have_as
, dynamic_check
, noalign
, true);
7007 gcc_assert (*dynamic_check
== -1);
7008 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY
)
7009 *dynamic_check
= max
;
7011 gcc_assert (alg
!= libcall
);
7014 return (alg_usable_p (algs
->unknown_size
, memset
, have_as
)
7015 ? algs
->unknown_size
: libcall
);
/* Decide on alignment.  We know that the operand is already aligned to ALIGN
   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */

static int
decide_alignment (int align,
		  enum stringop_alg alg,
		  int expected_size,
		  machine_mode move_mode)
{
  int desired_align = 0;

  gcc_assert (alg != no_stringop);

  if (move_mode == VOIDmode)
    return 0;

  desired_align = GET_MODE_SIZE (move_mode);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks,
     copying whole cacheline at once.  */
  if (TARGET_PENTIUMPRO
      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
    desired_align = 8;

  if (desired_align < align)
    desired_align = align;
  if (expected_size != -1 && expected_size < 4)
    desired_align = align;

  return desired_align;
}
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   a * 0x10101010, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
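/* Worked example: promoting the QImode constant 0xAB to SImode yields
   0xABABABAB.  For a non-constant value the same pattern is built either by
   multiplying by 0x01010101 (when multiplication is cheap) or by the shift
   sequence v |= v << 8; v |= v << 16; (plus v |= v << 32 for DImode).  */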
7059 promote_duplicated_reg (machine_mode mode
, rtx val
)
7061 machine_mode valmode
= GET_MODE (val
);
7063 int nops
= mode
== DImode
? 3 : 2;
7065 gcc_assert (mode
== SImode
|| mode
== DImode
|| val
== const0_rtx
);
7066 if (val
== const0_rtx
)
7067 return copy_to_mode_reg (mode
, CONST0_RTX (mode
));
7068 if (CONST_INT_P (val
))
7070 HOST_WIDE_INT v
= INTVAL (val
) & 255;
7075 v
|= (v
<< 16) << 16;
7076 return copy_to_mode_reg (mode
, gen_int_mode (v
, mode
));
7079 if (valmode
== VOIDmode
)
7081 if (valmode
!= QImode
)
7082 val
= gen_lowpart (QImode
, val
);
7085 if (!TARGET_PARTIAL_REG_STALL
)
7087 if (ix86_cost
->mult_init
[mode
== DImode
? 3 : 2]
7088 + ix86_cost
->mult_bit
* (mode
== DImode
? 8 : 4)
7089 <= (ix86_cost
->shift_const
+ ix86_cost
->add
) * nops
7090 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL
== 0)))
7092 rtx reg
= convert_modes (mode
, QImode
, val
, true);
7093 tmp
= promote_duplicated_reg (mode
, const1_rtx
);
7094 return expand_simple_binop (mode
, MULT
, reg
, tmp
, NULL
, 1,
7099 rtx reg
= convert_modes (mode
, QImode
, val
, true);
7101 if (!TARGET_PARTIAL_REG_STALL
)
7103 emit_insn (gen_insvsi_1 (reg
, reg
));
7105 emit_insn (gen_insvdi_1 (reg
, reg
));
7108 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (8),
7109 NULL
, 1, OPTAB_DIRECT
);
7110 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1,
7113 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (16),
7114 NULL
, 1, OPTAB_DIRECT
);
7115 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1, OPTAB_DIRECT
);
7118 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (32),
7119 NULL
, 1, OPTAB_DIRECT
);
7120 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1, OPTAB_DIRECT
);
/* Duplicate value VAL using promote_duplicated_reg into the maximal size that
   will be needed by the main loop copying SIZE_NEEDED chunks and by the
   prologue getting alignment from ALIGN to DESIRED_ALIGN.  */
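/* For example, a memset whose main loop stores SIZE_NEEDED == 8 byte chunks
   on a 64-bit target gets VAL promoted into a DImode register
   (0xXY -> 0xXYXYXYXYXYXYXYXY), while SIZE_NEEDED == 2 only needs an HImode
   copy; the cascade below picks the widest mode actually required.  */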
7129 promote_duplicated_reg_to_size (rtx val
, int size_needed
, int desired_align
,
7135 && (size_needed
> 4 || (desired_align
> align
&& desired_align
> 4)))
7136 promoted_val
= promote_duplicated_reg (DImode
, val
);
7137 else if (size_needed
> 2 || (desired_align
> align
&& desired_align
> 2))
7138 promoted_val
= promote_duplicated_reg (SImode
, val
);
7139 else if (size_needed
> 1 || (desired_align
> align
&& desired_align
> 1))
7140 promoted_val
= promote_duplicated_reg (HImode
, val
);
7144 return promoted_val
;
7147 /* Copy the address to a Pmode register. This is used for x32 to
7148 truncate DImode TLS address to a SImode register. */
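/* On x32, Pmode is SImode but some addresses (e.g. TLS) are computed in
   DImode; in that case the code below copies the DImode value into a fresh
   register and returns its low SImode SUBREG, which is the Pmode view of the
   same address.  */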
7151 ix86_copy_addr_to_reg (rtx addr
)
7154 if (GET_MODE (addr
) == Pmode
|| GET_MODE (addr
) == VOIDmode
)
7156 reg
= copy_addr_to_reg (addr
);
7157 REG_POINTER (reg
) = 1;
7162 gcc_assert (GET_MODE (addr
) == DImode
&& Pmode
== SImode
);
7163 reg
= copy_to_mode_reg (DImode
, addr
);
7164 REG_POINTER (reg
) = 1;
7165 return gen_rtx_SUBREG (SImode
, reg
, 0);
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   operations when profitable.  The code depends upon architecture, block size
   and alignment, but always has one of the following overall structures:

   Aligned move sequence:

     1) Prologue guard: Conditional that jumps up to epilogues for small
	blocks that can be handled by the epilogue alone.  This is faster
	but also needed for correctness, since the prologue assumes the block
	is larger than the desired alignment.

	Optional dynamic check for size and libcall for large
	blocks is emitted here too, with -minline-stringops-dynamically.

     2) Prologue: copy first few bytes in order to get destination
	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
	copied.  We emit either a jump tree for power-of-two sized
	blocks, or a byte loop.

     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with the specified algorithm.

     4) Epilogue: code copying the tail of the block that is too small to be
	handled by the main body (or up to size guarded by the prologue guard).

   Misaligned move sequence:

     1) Misaligned move prologue/epilogue containing:
	a) Prologue handling small memory blocks and jumping to done_label
	   (skipped if blocks are known to be large enough)
	b) Single move copying the first DESIRED_ALIGN - ALIGN bytes if
	   alignment is needed, done by a single possibly misaligned move
	   (skipped if alignment is not needed)
	c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves

     2) Zero size guard dispatching to done_label, if needed

     3) Dispatch to library call, if needed

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
	with the specified algorithm.  */
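/* A compact illustration of the aligned sequence above in C-like pseudocode
   (a sketch only; the expander below emits RTL and additionally handles value
   promotion, dynamic library-call dispatch and many corner cases):

     if (n < epilogue_size_needed)                   // 1) prologue guard
       goto epilogue;
     while (dst not aligned to desired_align)        // 2) alignment prologue
       copy one byte, n--;
     while (n >= size_needed)                        // 3) main body
       copy size_needed bytes, n -= size_needed;
   epilogue:                                         // 4) epilogue
     copy the remaining n bytes;  */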
7212 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
7213 rtx align_exp
, rtx expected_align_exp
,
7214 rtx expected_size_exp
, rtx min_size_exp
,
7215 rtx max_size_exp
, rtx probable_max_size_exp
,
7220 rtx_code_label
*label
= NULL
;
7222 rtx_code_label
*jump_around_label
= NULL
;
7223 HOST_WIDE_INT align
= 1;
7224 unsigned HOST_WIDE_INT count
= 0;
7225 HOST_WIDE_INT expected_size
= -1;
7226 int size_needed
= 0, epilogue_size_needed
;
7227 int desired_align
= 0, align_bytes
= 0;
7228 enum stringop_alg alg
;
7229 rtx promoted_val
= NULL
;
7230 rtx vec_promoted_val
= NULL
;
7231 bool force_loopy_epilogue
= false;
7233 bool need_zero_guard
= false;
7235 machine_mode move_mode
= VOIDmode
;
7236 machine_mode wider_mode
;
7237 int unroll_factor
= 1;
7238 /* TODO: Once value ranges are available, fill in proper data. */
7239 unsigned HOST_WIDE_INT min_size
= 0;
7240 unsigned HOST_WIDE_INT max_size
= -1;
7241 unsigned HOST_WIDE_INT probable_max_size
= -1;
7242 bool misaligned_prologue_used
= false;
7245 if (CONST_INT_P (align_exp
))
7246 align
= INTVAL (align_exp
);
/* i386 can do misaligned accesses at a reasonable extra cost.  */
7248 if (CONST_INT_P (expected_align_exp
)
7249 && INTVAL (expected_align_exp
) > align
)
7250 align
= INTVAL (expected_align_exp
);
7251 /* ALIGN is the minimum of destination and source alignment, but we care here
7252 just about destination alignment. */
7254 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
7255 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
7257 if (CONST_INT_P (count_exp
))
7259 min_size
= max_size
= probable_max_size
= count
= expected_size
7260 = INTVAL (count_exp
);
7261 /* When COUNT is 0, there is nothing to do. */
7268 min_size
= INTVAL (min_size_exp
);
7270 max_size
= INTVAL (max_size_exp
);
7271 if (probable_max_size_exp
)
7272 probable_max_size
= INTVAL (probable_max_size_exp
);
7273 if (CONST_INT_P (expected_size_exp
))
7274 expected_size
= INTVAL (expected_size_exp
);
7277 /* Make sure we don't need to care about overflow later on. */
7278 if (count
> (HOST_WIDE_INT_1U
<< 30))
7281 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
7283 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
7285 /* Step 0: Decide on preferred algorithm, desired alignment and
7286 size of chunks to be copied by main loop. */
7287 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
7289 issetmem
&& val_exp
== const0_rtx
, have_as
,
7290 &dynamic_check
, &noalign
, false);
7293 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
7294 stringop_alg_names
[alg
]);
7298 gcc_assert (alg
!= no_stringop
);
/* For now the vector version of memset is generated only for memory zeroing,
   as creating the promoted vector value is very cheap in this case.  */
7302 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
7303 alg
= unrolled_loop
;
7306 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
7307 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
7309 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
7312 move_mode
= word_mode
;
7320 need_zero_guard
= true;
7324 need_zero_guard
= true;
7327 need_zero_guard
= true;
7328 unroll_factor
= (TARGET_64BIT
? 4 : 2);
7331 need_zero_guard
= true;
7333 /* Find the widest supported mode. */
7334 move_mode
= word_mode
;
7335 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
7336 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
7337 move_mode
= wider_mode
;
7339 if (TARGET_AVX128_OPTIMAL
&& GET_MODE_BITSIZE (move_mode
) > 128)
7342 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7343 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7344 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
7346 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
7347 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
7348 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
7349 move_mode
= word_mode
;
7351 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
7353 case rep_prefix_8_byte
:
7356 case rep_prefix_4_byte
:
7359 case rep_prefix_1_byte
:
7363 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
7364 epilogue_size_needed
= size_needed
;
/* If we are going to emit any library calls conditionally, make sure any
   pending stack adjustments happen before the first conditional branch,
   otherwise they will be emitted before the library call only and won't
   happen on the other branches.  */
7370 if (dynamic_check
!= -1)
7371 do_pending_stack_adjust ();
7373 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
7374 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
7375 align
= desired_align
;
7377 /* Step 1: Prologue guard. */
7379 /* Alignment code needs count to be in register. */
7380 if (CONST_INT_P (count_exp
) && desired_align
> align
)
7382 if (INTVAL (count_exp
) > desired_align
7383 && INTVAL (count_exp
) > size_needed
)
7386 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
7387 if (align_bytes
<= 0)
7390 align_bytes
= desired_align
- align_bytes
;
7392 if (align_bytes
== 0)
7393 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
7395 gcc_assert (desired_align
>= 1 && align
>= 1);
/* Misaligned move sequences handle both prologue and epilogue at once.
   Default code generation results in smaller code for large alignments
   and also avoids redundant work when sizes are known precisely.  */
7400 misaligned_prologue_used
7401 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
7402 && MAX (desired_align
, epilogue_size_needed
) <= 32
7403 && desired_align
<= epilogue_size_needed
7404 && ((desired_align
> align
&& !align_bytes
)
7405 || (!count
&& epilogue_size_needed
> 1)));
/* Do the cheap promotion to allow better CSE across the
   main loop and epilogue (i.e. one load of the big constant in the
   front of the loop).
   For now the misaligned move sequences do not have a fast path
   without broadcasting.  */
7412 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
7414 if (alg
== vector_loop
)
7416 gcc_assert (val_exp
== const0_rtx
);
7417 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
7418 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
7419 GET_MODE_SIZE (word_mode
),
7420 desired_align
, align
);
7424 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7425 desired_align
, align
);
/* Misaligned move sequences handle both prologues and epilogues at once.
   Default code generation results in smaller code for large alignments and
   also avoids redundant work when sizes are known precisely.  */
7431 if (misaligned_prologue_used
)
7433 /* Misaligned move prologue handled small blocks by itself. */
7434 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
7435 (dst
, src
, &destreg
, &srcreg
,
7436 move_mode
, promoted_val
, vec_promoted_val
,
7439 desired_align
< align
7440 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
7441 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
7443 src
= change_address (src
, BLKmode
, srcreg
);
7444 dst
= change_address (dst
, BLKmode
, destreg
);
7445 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
7446 epilogue_size_needed
= 0;
7448 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
/* It is possible that we copied enough so the main loop will not
   execute.  */
7452 gcc_assert (size_needed
> 1);
7453 if (jump_around_label
== NULL_RTX
)
7454 jump_around_label
= gen_label_rtx ();
7455 emit_cmp_and_jump_insns (count_exp
,
7456 GEN_INT (size_needed
),
7457 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
7458 if (expected_size
== -1
7459 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7460 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7462 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7465 /* Ensure that alignment prologue won't copy past end of block. */
7466 else if (size_needed
> 1 || (desired_align
> 1 && desired_align
> align
))
7468 epilogue_size_needed
= MAX (size_needed
- 1, desired_align
- align
);
/* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
   Make sure EPILOGUE_SIZE_NEEDED is a power of 2.  */
7471 epilogue_size_needed
= 1 << (floor_log2 (epilogue_size_needed
) + 1);
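/* E.g. with SIZE_NEEDED == 16 and at most 15 alignment bytes the MAX above
   yields 15, and this expression rounds it up to 1 << (floor_log2 (15) + 1)
   == 16, i.e. the smallest power of two strictly greater than its input.  */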
/* To improve performance of small blocks, we jump around the VAL
   promoting code.  This means that if the promoted VAL is not constant,
   we might not use it in the epilogue and have to use a byte
   loop instead.  */
7477 if (issetmem
&& epilogue_size_needed
> 2 && !promoted_val
)
7478 force_loopy_epilogue
= true;
7479 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7480 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
/* If the main algorithm works on QImode, no epilogue is needed.
   For small sizes just don't align anything.  */
7484 if (size_needed
== 1)
7485 desired_align
= align
;
7490 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
7492 label
= gen_label_rtx ();
7493 emit_cmp_and_jump_insns (count_exp
,
7494 GEN_INT (epilogue_size_needed
),
7495 LTU
, 0, counter_mode (count_exp
), 1, label
);
7496 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
7497 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7499 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
/* Emit code to decide at runtime whether a library call or inline code
   should be used.  */
7505 if (dynamic_check
!= -1)
7507 if (!issetmem
&& CONST_INT_P (count_exp
))
7509 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
7511 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7512 count_exp
= const0_rtx
;
7518 rtx_code_label
*hot_label
= gen_label_rtx ();
7519 if (jump_around_label
== NULL_RTX
)
7520 jump_around_label
= gen_label_rtx ();
7521 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
7522 LEU
, 0, counter_mode (count_exp
),
7524 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
7526 set_storage_via_libcall (dst
, count_exp
, val_exp
);
7528 emit_block_copy_via_libcall (dst
, src
, count_exp
);
7529 emit_jump (jump_around_label
);
7530 emit_label (hot_label
);
7534 /* Step 2: Alignment prologue. */
7535 /* Do the expensive promotion once we branched off the small blocks. */
7536 if (issetmem
&& !promoted_val
)
7537 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
7538 desired_align
, align
);
7540 if (desired_align
> align
&& !misaligned_prologue_used
)
7542 if (align_bytes
== 0)
/* Except for the first move in the prologue, we no longer know
   the constant offset in the aliasing info.  It doesn't seem worth
   the pain to maintain it for the first move, so throw away
   the info early.  */
7548 dst
= change_address (dst
, BLKmode
, destreg
);
7550 src
= change_address (src
, BLKmode
, srcreg
);
7551 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
7552 promoted_val
, vec_promoted_val
,
7553 count_exp
, align
, desired_align
,
7555 /* At most desired_align - align bytes are copied. */
7556 if (min_size
< (unsigned)(desired_align
- align
))
7559 min_size
-= desired_align
- align
;
7563 /* If we know how many bytes need to be stored before dst is
7564 sufficiently aligned, maintain aliasing info accurately. */
7565 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
7573 count_exp
= plus_constant (counter_mode (count_exp
),
7574 count_exp
, -align_bytes
);
7575 count
-= align_bytes
;
7576 min_size
-= align_bytes
;
7577 max_size
-= align_bytes
;
7580 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
7581 && (count
< (unsigned HOST_WIDE_INT
) size_needed
7582 || (align_bytes
== 0
7583 && count
< ((unsigned HOST_WIDE_INT
) size_needed
7584 + desired_align
- align
))))
/* It is possible that we copied enough so the main loop will not
   execute.  */
7588 gcc_assert (size_needed
> 1);
7589 if (label
== NULL_RTX
)
7590 label
= gen_label_rtx ();
7591 emit_cmp_and_jump_insns (count_exp
,
7592 GEN_INT (size_needed
),
7593 LTU
, 0, counter_mode (count_exp
), 1, label
);
7594 if (expected_size
== -1
7595 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
7596 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
7598 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
7601 if (label
&& size_needed
== 1)
7604 LABEL_NUSES (label
) = 1;
7606 epilogue_size_needed
= 1;
7608 promoted_val
= val_exp
;
7610 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
7611 epilogue_size_needed
= size_needed
;
7613 /* Step 3: Main loop. */
7624 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
, promoted_val
,
7625 count_exp
, move_mode
, unroll_factor
,
7626 expected_size
, issetmem
);
7629 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
,
7630 vec_promoted_val
, count_exp
, move_mode
,
7631 unroll_factor
, expected_size
, issetmem
);
7633 case rep_prefix_8_byte
:
7634 case rep_prefix_4_byte
:
7635 case rep_prefix_1_byte
:
7636 expand_set_or_cpymem_via_rep (dst
, src
, destreg
, srcreg
, promoted_val
,
7637 val_exp
, count_exp
, move_mode
, issetmem
);
/* Properly adjust the offsets of src and dest memory for aliasing.  */
7641 if (CONST_INT_P (count_exp
))
7644 src
= adjust_automodify_address_nv (src
, BLKmode
, srcreg
,
7645 (count
/ size_needed
) * size_needed
);
7646 dst
= adjust_automodify_address_nv (dst
, BLKmode
, destreg
,
7647 (count
/ size_needed
) * size_needed
);
7652 src
= change_address (src
, BLKmode
, srcreg
);
7653 dst
= change_address (dst
, BLKmode
, destreg
);
7656 /* Step 4: Epilogue to copy the remaining bytes. */
/* When the main loop is done, COUNT_EXP might hold the original count,
   while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
   Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
   bytes.  Compensate if needed.  */
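/* For example, copying 23 bytes with SIZE_NEEDED == 8 leaves 23 & 7 == 7
   bytes after the main loop has handled 16; the AND below masks COUNT_EXP
   down to exactly that remainder before the epilogue runs.  */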
7665 if (size_needed
< epilogue_size_needed
)
7667 tmp
= expand_simple_binop (counter_mode (count_exp
), AND
, count_exp
,
7668 GEN_INT (size_needed
- 1), count_exp
, 1,
7670 if (tmp
!= count_exp
)
7671 emit_move_insn (count_exp
, tmp
);
7674 LABEL_NUSES (label
) = 1;
7677 if (count_exp
!= const0_rtx
&& epilogue_size_needed
> 1)
7679 if (force_loopy_epilogue
)
7680 expand_setmem_epilogue_via_loop (dst
, destreg
, val_exp
, count_exp
,
7681 epilogue_size_needed
);
7685 expand_setmem_epilogue (dst
, destreg
, promoted_val
,
7686 vec_promoted_val
, count_exp
,
7687 epilogue_size_needed
);
7689 expand_cpymem_epilogue (dst
, src
, destreg
, srcreg
, count_exp
,
7690 epilogue_size_needed
);
7693 if (jump_around_label
)
7694 emit_label (jump_around_label
);
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the start address when
	not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */
7711 ix86_expand_strlensi_unroll_1 (rtx out
, rtx src
, rtx align_rtx
)
7715 rtx_code_label
*align_2_label
= NULL
;
7716 rtx_code_label
*align_3_label
= NULL
;
7717 rtx_code_label
*align_4_label
= gen_label_rtx ();
7718 rtx_code_label
*end_0_label
= gen_label_rtx ();
7720 rtx tmpreg
= gen_reg_rtx (SImode
);
7721 rtx scratch
= gen_reg_rtx (SImode
);
7725 if (CONST_INT_P (align_rtx
))
7726 align
= INTVAL (align_rtx
);
7728 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
7730 /* Is there a known alignment and is it less than 4? */
7733 rtx scratch1
= gen_reg_rtx (Pmode
);
7734 emit_move_insn (scratch1
, out
);
7735 /* Is there a known alignment and is it not 2? */
7738 align_3_label
= gen_label_rtx (); /* Label when aligned to 3-byte */
7739 align_2_label
= gen_label_rtx (); /* Label when aligned to 2-byte */
7741 /* Leave just the 3 lower bits. */
7742 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, GEN_INT (3),
7743 NULL_RTX
, 0, OPTAB_WIDEN
);
7745 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7746 Pmode
, 1, align_4_label
);
7747 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, EQ
, NULL
,
7748 Pmode
, 1, align_2_label
);
7749 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, GTU
, NULL
,
7750 Pmode
, 1, align_3_label
);
/* Since the alignment is 2, we have to check 2 or 0 bytes;
   check whether the address is aligned to a 4-byte boundary.  */
7757 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, const2_rtx
,
7758 NULL_RTX
, 0, OPTAB_WIDEN
);
7760 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
7761 Pmode
, 1, align_4_label
);
7764 mem
= change_address (src
, QImode
, out
);
7766 /* Now compare the bytes. */
/* Compare the first n unaligned bytes on a byte-by-byte basis.  */
7769 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
,
7770 QImode
, 1, end_0_label
);
7772 /* Increment the address. */
7773 emit_insn (gen_add2_insn (out
, const1_rtx
));
7775 /* Not needed with an alignment of 2 */
7778 emit_label (align_2_label
);
7780 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7783 emit_insn (gen_add2_insn (out
, const1_rtx
));
7785 emit_label (align_3_label
);
7788 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
7791 emit_insn (gen_add2_insn (out
, const1_rtx
));
/* Generate a loop to check 4 bytes at a time.  It is not a good idea to
   align this loop; doing so only bloats the code and does not make it
   run any faster.  */
7797 emit_label (align_4_label
);
7799 mem
= change_address (src
, SImode
, out
);
7800 emit_move_insn (scratch
, mem
);
7801 emit_insn (gen_add2_insn (out
, GEN_INT (4)));
/* This formula yields a nonzero result iff one of the bytes is zero.
   This saves three branches inside the loop and many cycles.  */
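/* In plain C the value computed below is equivalent to

     (v - 0x01010101) & ~v & 0x80808080

   which is nonzero iff some byte of v is zero: the subtraction sets the high
   bit of a byte that was zero, the ~v term filters out bytes whose high bit
   was already set, and the final mask keeps only the per-byte indicator bits.
   (An illustrative restatement of the standard trick; TMPREG below holds the
   same value.)  */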
7806 emit_insn (gen_addsi3 (tmpreg
, scratch
, GEN_INT (-0x01010101)));
7807 emit_insn (gen_one_cmplsi2 (scratch
, scratch
));
7808 emit_insn (gen_andsi3 (tmpreg
, tmpreg
, scratch
));
7809 emit_insn (gen_andsi3 (tmpreg
, tmpreg
,
7810 gen_int_mode (0x80808080, SImode
)));
7811 emit_cmp_and_jump_insns (tmpreg
, const0_rtx
, EQ
, 0, SImode
, 1,
7816 rtx reg
= gen_reg_rtx (SImode
);
7817 rtx reg2
= gen_reg_rtx (Pmode
);
7818 emit_move_insn (reg
, tmpreg
);
7819 emit_insn (gen_lshrsi3 (reg
, reg
, GEN_INT (16)));
7821 /* If zero is not in the first two bytes, move two bytes forward. */
7822 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7823 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7824 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7825 emit_insn (gen_rtx_SET (tmpreg
,
7826 gen_rtx_IF_THEN_ELSE (SImode
, tmp
,
7829 /* Emit lea manually to avoid clobbering of flags. */
7830 emit_insn (gen_rtx_SET (reg2
, gen_rtx_PLUS (Pmode
, out
, const2_rtx
)));
7832 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7833 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
7834 emit_insn (gen_rtx_SET (out
,
7835 gen_rtx_IF_THEN_ELSE (Pmode
, tmp
,
7841 rtx_code_label
*end_2_label
= gen_label_rtx ();
7842 /* Is zero in the first two bytes? */
7844 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
7845 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
7846 tmp
= gen_rtx_NE (VOIDmode
, tmp
, const0_rtx
);
7847 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
7848 gen_rtx_LABEL_REF (VOIDmode
, end_2_label
),
7850 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
7851 JUMP_LABEL (tmp
) = end_2_label
;
7853 /* Not in the first two. Move two bytes forward. */
7854 emit_insn (gen_lshrsi3 (tmpreg
, tmpreg
, GEN_INT (16)));
7855 emit_insn (gen_add2_insn (out
, const2_rtx
));
7857 emit_label (end_2_label
);
7861 /* Avoid branch in fixing the byte. */
7862 tmpreg
= gen_lowpart (QImode
, tmpreg
);
7863 emit_insn (gen_addqi3_cconly_overflow (tmpreg
, tmpreg
));
7864 tmp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
7865 cmp
= gen_rtx_LTU (VOIDmode
, tmp
, const0_rtx
);
7866 emit_insn (gen_sub3_carry (Pmode
, out
, out
, GEN_INT (3), tmp
, cmp
));
7868 emit_label (end_0_label
);
7871 /* Expand strlen. */
7874 ix86_expand_strlen (rtx out
, rtx src
, rtx eoschar
, rtx align
)
7876 if (TARGET_UNROLL_STRLEN
7877 && TARGET_INLINE_ALL_STRINGOPS
7878 && eoschar
== const0_rtx
/* The generic case of the strlen expander is long.  Avoid expanding it
   unless TARGET_INLINE_ALL_STRINGOPS.  */
7883 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
/* It seems that some optimizers do not combine a call like
     foo (strlen (bar), strlen (bar));
   when the move and the subtraction are done here.  They do calculate
   the length just once when these instructions are done inside of
   output_strlen_unroll ().  But since &bar[strlen (bar)] is often used,
   and this uses one fewer register for the lifetime of
   output_strlen_unroll (), this is better.  */
7892 emit_move_insn (out
, addr
);
7894 ix86_expand_strlensi_unroll_1 (out
, src
, align
);
7896 /* strlensi_unroll_1 returns the address of the zero at the end of
7897 the string, like memchr(), so compute the length by subtracting
7898 the start address. */
7899 emit_insn (gen_sub2_insn (out
, addr
));
/* For a given symbol (function), construct code to compute the address of its
   PLT entry in the large x86-64 PIC model.  */
7910 construct_plt_address (rtx symbol
)
7914 gcc_assert (GET_CODE (symbol
) == SYMBOL_REF
);
7915 gcc_assert (ix86_cmodel
== CM_LARGE_PIC
&& !TARGET_PECOFF
);
7916 gcc_assert (Pmode
== DImode
);
7918 tmp
= gen_reg_rtx (Pmode
);
7919 unspec
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, symbol
), UNSPEC_PLTOFF
);
7921 emit_move_insn (tmp
, gen_rtx_CONST (Pmode
, unspec
));
7922 emit_insn (gen_add2_insn (tmp
, pic_offset_table_rtx
));
7926 /* Additional registers that are clobbered by SYSV calls. */
7928 static int const x86_64_ms_sysv_extra_clobbered_registers
7929 [NUM_X86_64_MS_CLOBBERED_REGS
] =
7933 XMM8_REG
, XMM9_REG
, XMM10_REG
, XMM11_REG
,
7934 XMM12_REG
, XMM13_REG
, XMM14_REG
, XMM15_REG
7938 ix86_expand_call (rtx retval
, rtx fnaddr
, rtx callarg1
,
7940 rtx pop
, bool sibcall
)
7943 rtx use
= NULL
, call
;
7944 unsigned int vec_len
= 0;
7947 if (GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
7949 fndecl
= SYMBOL_REF_DECL (XEXP (fnaddr
, 0));
7951 && (lookup_attribute ("interrupt",
7952 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
)))))
7953 error ("interrupt service routine cannot be called directly");
7958 if (pop
== const0_rtx
)
7960 gcc_assert (!TARGET_64BIT
|| !pop
);
7962 if (TARGET_MACHO
&& !TARGET_64BIT
)
7965 if (flag_pic
&& GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
7966 fnaddr
= machopic_indirect_call_target (fnaddr
);
7971 /* Static functions and indirect calls don't need the pic register. Also,
7972 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
7973 it an indirect call. */
7974 rtx addr
= XEXP (fnaddr
, 0);
7976 && GET_CODE (addr
) == SYMBOL_REF
7977 && !SYMBOL_REF_LOCAL_P (addr
))
7980 && (SYMBOL_REF_DECL (addr
) == NULL_TREE
7981 || !lookup_attribute ("noplt",
7982 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr
)))))
7985 || (ix86_cmodel
== CM_LARGE_PIC
7986 && DEFAULT_ABI
!= MS_ABI
))
7988 use_reg (&use
, gen_rtx_REG (Pmode
,
7989 REAL_PIC_OFFSET_TABLE_REGNUM
));
7990 if (ix86_use_pseudo_pic_reg ())
7991 emit_move_insn (gen_rtx_REG (Pmode
,
7992 REAL_PIC_OFFSET_TABLE_REGNUM
),
7993 pic_offset_table_rtx
);
7996 else if (!TARGET_PECOFF
&& !TARGET_MACHO
)
8000 fnaddr
= gen_rtx_UNSPEC (Pmode
,
8001 gen_rtvec (1, addr
),
8003 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8007 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
8009 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
8010 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
,
8013 fnaddr
= gen_const_mem (Pmode
, fnaddr
);
8014 /* Pmode may not be the same as word_mode for x32, which
8015 doesn't support indirect branch via 32-bit memory slot.
8016 Since x32 GOT slot is 64 bit with zero upper 32 bits,
8017 indirect branch via x32 GOT slot is OK. */
8018 if (GET_MODE (fnaddr
) != word_mode
)
8019 fnaddr
= gen_rtx_ZERO_EXTEND (word_mode
, fnaddr
);
8020 fnaddr
= gen_rtx_MEM (QImode
, fnaddr
);
8025 /* Skip setting up RAX register for -mskip-rax-setup when there are no
8026 parameters passed in vector registers. */
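/* Under the SysV x86-64 calling convention %al carries an upper bound on the
   number of vector registers used by a variadic call (e.g. printf ("%f", d)
   needs %al >= 1); the move below materializes that count from CALLARG2.  */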
8028 && (INTVAL (callarg2
) > 0
8029 || (INTVAL (callarg2
) == 0
8030 && (TARGET_SSE
|| !flag_skip_rax_setup
))))
8032 rtx al
= gen_rtx_REG (QImode
, AX_REG
);
8033 emit_move_insn (al
, callarg2
);
8037 if (ix86_cmodel
== CM_LARGE_PIC
8040 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
8041 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
8042 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
8043 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
8044 branch via x32 GOT slot is OK. */
8045 else if (!(TARGET_X32
8047 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
8048 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
8050 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
8051 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
8053 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
8054 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
8057 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
8060 call
= gen_rtx_SET (retval
, call
);
8061 vec
[vec_len
++] = call
;
8065 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
8066 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
8067 vec
[vec_len
++] = pop
;
8070 if (cfun
->machine
->no_caller_saved_registers
8072 || (!TREE_THIS_VOLATILE (fndecl
)
8073 && !lookup_attribute ("no_caller_saved_registers",
8074 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
8076 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
8077 bool is_64bit_ms_abi
= (TARGET_64BIT
8078 && ix86_function_abi (fndecl
) == MS_ABI
);
8079 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
8081 /* If there are no caller-saved registers, add all registers
8082 that are clobbered by the call which returns. */
8083 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
8085 && (ix86_call_used_regs
[i
] == 1
8086 || (ix86_call_used_regs
[i
] & c_mask
))
8087 && !STACK_REGNO_P (i
)
8088 && !MMX_REGNO_P (i
))
8090 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
8092 else if (TARGET_64BIT_MS_ABI
8093 && (!callarg2
|| INTVAL (callarg2
) != -2))
8097 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
8099 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
8100 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
8102 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
8105 /* Set here, but it may get cleared later. */
8106 if (TARGET_CALL_MS2SYSV_XLOGUES
)
8111 /* Don't break hot-patched functions. */
8112 else if (ix86_function_ms_hook_prologue (current_function_decl
))
8115 /* TODO: Cases not yet examined. */
8116 else if (flag_split_stack
)
8117 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
8121 gcc_assert (!reload_completed
);
8122 cfun
->machine
->call_ms2sysv
= true;
8128 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
8129 rtx_insn
*call_insn
= emit_call_insn (call
);
8131 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;
/* Split a simple return that pops POPC bytes from the stack into an indirect
   branch with a stack adjustment.  */
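/* Roughly, for a 32-bit "ret $POPC" this emits the equivalent of

     popl  %ecx          # return address -> %ecx
     addl  $POPC, %esp   # pop the POPC bytes of arguments
     jmp   *%ecx

   (an illustrative sketch; the code below also records the CFA adjustments
   needed for unwinding).  */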
8140 ix86_split_simple_return_pop_internal (rtx popc
)
8142 struct machine_function
*m
= cfun
->machine
;
8143 rtx ecx
= gen_rtx_REG (SImode
, CX_REG
);
8146 /* There is no "pascal" calling convention in any 64bit ABI. */
8147 gcc_assert (!TARGET_64BIT
);
8149 insn
= emit_insn (gen_pop (ecx
));
8150 m
->fs
.cfa_offset
-= UNITS_PER_WORD
;
8151 m
->fs
.sp_offset
-= UNITS_PER_WORD
;
8153 rtx x
= plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
);
8154 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
8155 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
8156 add_reg_note (insn
, REG_CFA_REGISTER
, gen_rtx_SET (ecx
, pc_rtx
));
8157 RTX_FRAME_RELATED_P (insn
) = 1;
8159 x
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, popc
);
8160 x
= gen_rtx_SET (stack_pointer_rtx
, x
);
8161 insn
= emit_insn (x
);
8162 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, x
);
8163 RTX_FRAME_RELATED_P (insn
) = 1;
/* Now the return address is in ECX.  */
8166 emit_jump_insn (gen_simple_return_indirect_internal (ecx
));
8169 /* Errors in the source file can cause expand_expr to return const0_rtx
8170 where we expect a vector. To avoid crashing, use one of the vector
8171 clear instructions. */
8174 safe_vector_operand (rtx x
, machine_mode mode
)
8176 if (x
== const0_rtx
)
8177 x
= CONST0_RTX (mode
);
8181 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
8184 ix86_expand_binop_builtin (enum insn_code icode
, tree exp
, rtx target
)
8187 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8188 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8189 rtx op0
= expand_normal (arg0
);
8190 rtx op1
= expand_normal (arg1
);
8191 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8192 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8193 machine_mode mode1
= insn_data
[icode
].operand
[2].mode
;
8195 if (VECTOR_MODE_P (mode0
))
8196 op0
= safe_vector_operand (op0
, mode0
);
8197 if (VECTOR_MODE_P (mode1
))
8198 op1
= safe_vector_operand (op1
, mode1
);
8200 if (optimize
|| !target
8201 || GET_MODE (target
) != tmode
8202 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8203 target
= gen_reg_rtx (tmode
);
8205 if (GET_MODE (op1
) == SImode
&& mode1
== TImode
)
8207 rtx x
= gen_reg_rtx (V4SImode
);
8208 emit_insn (gen_sse2_loadd (x
, op1
));
8209 op1
= gen_lowpart (TImode
, x
);
8212 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8213 op0
= copy_to_mode_reg (mode0
, op0
);
8214 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode1
))
8215 op1
= copy_to_mode_reg (mode1
, op1
);
8217 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8226 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
8229 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
8230 enum ix86_builtin_func_type m_type
,
8231 enum rtx_code sub_code
)
8236 bool comparison_p
= false;
8238 bool last_arg_constant
= false;
8245 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8249 case MULTI_ARG_4_DF2_DI_I
:
8250 case MULTI_ARG_4_DF2_DI_I1
:
8251 case MULTI_ARG_4_SF2_SI_I
:
8252 case MULTI_ARG_4_SF2_SI_I1
:
8254 last_arg_constant
= true;
8257 case MULTI_ARG_3_SF
:
8258 case MULTI_ARG_3_DF
:
8259 case MULTI_ARG_3_SF2
:
8260 case MULTI_ARG_3_DF2
:
8261 case MULTI_ARG_3_DI
:
8262 case MULTI_ARG_3_SI
:
8263 case MULTI_ARG_3_SI_DI
:
8264 case MULTI_ARG_3_HI
:
8265 case MULTI_ARG_3_HI_SI
:
8266 case MULTI_ARG_3_QI
:
8267 case MULTI_ARG_3_DI2
:
8268 case MULTI_ARG_3_SI2
:
8269 case MULTI_ARG_3_HI2
:
8270 case MULTI_ARG_3_QI2
:
8274 case MULTI_ARG_2_SF
:
8275 case MULTI_ARG_2_DF
:
8276 case MULTI_ARG_2_DI
:
8277 case MULTI_ARG_2_SI
:
8278 case MULTI_ARG_2_HI
:
8279 case MULTI_ARG_2_QI
:
8283 case MULTI_ARG_2_DI_IMM
:
8284 case MULTI_ARG_2_SI_IMM
:
8285 case MULTI_ARG_2_HI_IMM
:
8286 case MULTI_ARG_2_QI_IMM
:
8288 last_arg_constant
= true;
8291 case MULTI_ARG_1_SF
:
8292 case MULTI_ARG_1_DF
:
8293 case MULTI_ARG_1_SF2
:
8294 case MULTI_ARG_1_DF2
:
8295 case MULTI_ARG_1_DI
:
8296 case MULTI_ARG_1_SI
:
8297 case MULTI_ARG_1_HI
:
8298 case MULTI_ARG_1_QI
:
8299 case MULTI_ARG_1_SI_DI
:
8300 case MULTI_ARG_1_HI_DI
:
8301 case MULTI_ARG_1_HI_SI
:
8302 case MULTI_ARG_1_QI_DI
:
8303 case MULTI_ARG_1_QI_SI
:
8304 case MULTI_ARG_1_QI_HI
:
8308 case MULTI_ARG_2_DI_CMP
:
8309 case MULTI_ARG_2_SI_CMP
:
8310 case MULTI_ARG_2_HI_CMP
:
8311 case MULTI_ARG_2_QI_CMP
:
8313 comparison_p
= true;
8316 case MULTI_ARG_2_SF_TF
:
8317 case MULTI_ARG_2_DF_TF
:
8318 case MULTI_ARG_2_DI_TF
:
8319 case MULTI_ARG_2_SI_TF
:
8320 case MULTI_ARG_2_HI_TF
:
8321 case MULTI_ARG_2_QI_TF
:
8330 if (optimize
|| !target
8331 || GET_MODE (target
) != tmode
8332 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8333 target
= gen_reg_rtx (tmode
);
8334 else if (memory_operand (target
, tmode
))
8337 gcc_assert (nargs
<= 4);
8339 for (i
= 0; i
< nargs
; i
++)
8341 tree arg
= CALL_EXPR_ARG (exp
, i
);
8342 rtx op
= expand_normal (arg
);
8343 int adjust
= (comparison_p
) ? 1 : 0;
8344 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
8346 if (last_arg_constant
&& i
== nargs
- 1)
8348 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
8350 enum insn_code new_icode
= icode
;
8353 case CODE_FOR_xop_vpermil2v2df3
:
8354 case CODE_FOR_xop_vpermil2v4sf3
:
8355 case CODE_FOR_xop_vpermil2v4df3
:
8356 case CODE_FOR_xop_vpermil2v8sf3
:
8357 error ("the last argument must be a 2-bit immediate");
8358 return gen_reg_rtx (tmode
);
8359 case CODE_FOR_xop_rotlv2di3
:
8360 new_icode
= CODE_FOR_rotlv2di3
;
8362 case CODE_FOR_xop_rotlv4si3
:
8363 new_icode
= CODE_FOR_rotlv4si3
;
8365 case CODE_FOR_xop_rotlv8hi3
:
8366 new_icode
= CODE_FOR_rotlv8hi3
;
8368 case CODE_FOR_xop_rotlv16qi3
:
8369 new_icode
= CODE_FOR_rotlv16qi3
;
8371 if (CONST_INT_P (op
))
8373 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
8374 op
= GEN_INT (INTVAL (op
) & mask
);
8376 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
8382 && insn_data
[new_icode
].operand
[0].mode
== tmode
8383 && insn_data
[new_icode
].operand
[1].mode
== tmode
8384 && insn_data
[new_icode
].operand
[2].mode
== mode
8385 && insn_data
[new_icode
].operand
[0].predicate
8386 == insn_data
[icode
].operand
[0].predicate
8387 && insn_data
[new_icode
].operand
[1].predicate
8388 == insn_data
[icode
].operand
[1].predicate
);
8401 if (VECTOR_MODE_P (mode
))
8402 op
= safe_vector_operand (op
, mode
);
/* If we aren't optimizing, only allow one memory operand to be
   generated.  */
8406 if (memory_operand (op
, mode
))
8409 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
8412 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
8414 op
= force_reg (mode
, op
);
8418 args
[i
].mode
= mode
;
8424 pat
= GEN_FCN (icode
) (target
, args
[0].op
);
8429 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
,
8430 GEN_INT ((int)sub_code
));
8431 else if (! comparison_p
)
8432 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
);
8435 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
8439 pat
= GEN_FCN (icode
) (target
, cmp_op
, args
[0].op
, args
[1].op
);
8444 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
);
8448 pat
= GEN_FCN (icode
) (target
, args
[0].op
, args
[1].op
, args
[2].op
, args
[3].op
);
8462 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
8463 insns with vec_merge. */
8466 ix86_expand_unop_vec_merge_builtin (enum insn_code icode
, tree exp
,
8470 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8471 rtx op1
, op0
= expand_normal (arg0
);
8472 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
8473 machine_mode mode0
= insn_data
[icode
].operand
[1].mode
;
8475 if (optimize
|| !target
8476 || GET_MODE (target
) != tmode
8477 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
8478 target
= gen_reg_rtx (tmode
);
8480 if (VECTOR_MODE_P (mode0
))
8481 op0
= safe_vector_operand (op0
, mode0
);
8483 if ((optimize
&& !register_operand (op0
, mode0
))
8484 || !insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
8485 op0
= copy_to_mode_reg (mode0
, op0
);
8488 if (!insn_data
[icode
].operand
[2].predicate (op1
, mode0
))
8489 op1
= copy_to_mode_reg (mode0
, op1
);
8491 pat
= GEN_FCN (icode
) (target
, op0
, op1
);
8498 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
8501 ix86_expand_sse_compare (const struct builtin_description
*d
,
8502 tree exp
, rtx target
, bool swap
)
8505 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8506 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8507 rtx op0
= expand_normal (arg0
);
8508 rtx op1
= expand_normal (arg1
);
8510 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8511 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8512 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8513 enum rtx_code comparison
= d
->comparison
;
8515 if (VECTOR_MODE_P (mode0
))
8516 op0
= safe_vector_operand (op0
, mode0
);
8517 if (VECTOR_MODE_P (mode1
))
8518 op1
= safe_vector_operand (op1
, mode1
);
/* Swap operands if we have a comparison that isn't available in
   hardware.  */
8523 std::swap (op0
, op1
);
8525 if (optimize
|| !target
8526 || GET_MODE (target
) != tmode
8527 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8528 target
= gen_reg_rtx (tmode
);
8530 if ((optimize
&& !register_operand (op0
, mode0
))
8531 || !insn_data
[d
->icode
].operand
[1].predicate (op0
, mode0
))
8532 op0
= copy_to_mode_reg (mode0
, op0
);
8533 if ((optimize
&& !register_operand (op1
, mode1
))
8534 || !insn_data
[d
->icode
].operand
[2].predicate (op1
, mode1
))
8535 op1
= copy_to_mode_reg (mode1
, op1
);
8537 op2
= gen_rtx_fmt_ee (comparison
, mode0
, op0
, op1
);
8538 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8545 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
8548 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
8552 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8553 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8554 rtx op0
= expand_normal (arg0
);
8555 rtx op1
= expand_normal (arg1
);
8556 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8557 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8558 enum rtx_code comparison
= d
->comparison
;
8560 if (VECTOR_MODE_P (mode0
))
8561 op0
= safe_vector_operand (op0
, mode0
);
8562 if (VECTOR_MODE_P (mode1
))
8563 op1
= safe_vector_operand (op1
, mode1
);
/* Swap operands if we have a comparison that isn't available in
   hardware.  */
8567 if (d
->flag
& BUILTIN_DESC_SWAP_OPERANDS
)
8568 std::swap (op0
, op1
);
8570 target
= gen_reg_rtx (SImode
);
8571 emit_move_insn (target
, const0_rtx
);
8572 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8574 if ((optimize
&& !register_operand (op0
, mode0
))
8575 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8576 op0
= copy_to_mode_reg (mode0
, op0
);
8577 if ((optimize
&& !register_operand (op1
, mode1
))
8578 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8579 op1
= copy_to_mode_reg (mode1
, op1
);
8581 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8585 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8586 gen_rtx_fmt_ee (comparison
, QImode
,
8590 return SUBREG_REG (target
);
8593 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
8596 ix86_expand_sse_round (const struct builtin_description
*d
, tree exp
,
8600 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8601 rtx op1
, op0
= expand_normal (arg0
);
8602 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8603 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8605 if (optimize
|| target
== 0
8606 || GET_MODE (target
) != tmode
8607 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8608 target
= gen_reg_rtx (tmode
);
8610 if (VECTOR_MODE_P (mode0
))
8611 op0
= safe_vector_operand (op0
, mode0
);
8613 if ((optimize
&& !register_operand (op0
, mode0
))
8614 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8615 op0
= copy_to_mode_reg (mode0
, op0
);
8617 op1
= GEN_INT (d
->comparison
);
8619 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
);
8627 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
8628 tree exp
, rtx target
)
8631 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8632 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8633 rtx op0
= expand_normal (arg0
);
8634 rtx op1
= expand_normal (arg1
);
8636 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
8637 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
8638 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
8640 if (optimize
|| target
== 0
8641 || GET_MODE (target
) != tmode
8642 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
8643 target
= gen_reg_rtx (tmode
);
8645 op0
= safe_vector_operand (op0
, mode0
);
8646 op1
= safe_vector_operand (op1
, mode1
);
8648 if ((optimize
&& !register_operand (op0
, mode0
))
8649 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8650 op0
= copy_to_mode_reg (mode0
, op0
);
8651 if ((optimize
&& !register_operand (op1
, mode1
))
8652 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8653 op1
= copy_to_mode_reg (mode1
, op1
);
8655 op2
= GEN_INT (d
->comparison
);
8657 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
8664 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
8667 ix86_expand_sse_ptest (const struct builtin_description
*d
, tree exp
,
8671 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8672 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8673 rtx op0
= expand_normal (arg0
);
8674 rtx op1
= expand_normal (arg1
);
8675 machine_mode mode0
= insn_data
[d
->icode
].operand
[0].mode
;
8676 machine_mode mode1
= insn_data
[d
->icode
].operand
[1].mode
;
8677 enum rtx_code comparison
= d
->comparison
;
8679 if (VECTOR_MODE_P (mode0
))
8680 op0
= safe_vector_operand (op0
, mode0
);
8681 if (VECTOR_MODE_P (mode1
))
8682 op1
= safe_vector_operand (op1
, mode1
);
8684 target
= gen_reg_rtx (SImode
);
8685 emit_move_insn (target
, const0_rtx
);
8686 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8688 if ((optimize
&& !register_operand (op0
, mode0
))
8689 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
8690 op0
= copy_to_mode_reg (mode0
, op0
);
8691 if ((optimize
&& !register_operand (op1
, mode1
))
8692 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
8693 op1
= copy_to_mode_reg (mode1
, op1
);
8695 pat
= GEN_FCN (d
->icode
) (op0
, op1
);
8699 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8700 gen_rtx_fmt_ee (comparison
, QImode
,
8704 return SUBREG_REG (target
);
8707 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
8710 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
8711 tree exp
, rtx target
)
8714 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8715 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8716 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8717 tree arg3
= CALL_EXPR_ARG (exp
, 3);
8718 tree arg4
= CALL_EXPR_ARG (exp
, 4);
8719 rtx scratch0
, scratch1
;
8720 rtx op0
= expand_normal (arg0
);
8721 rtx op1
= expand_normal (arg1
);
8722 rtx op2
= expand_normal (arg2
);
8723 rtx op3
= expand_normal (arg3
);
8724 rtx op4
= expand_normal (arg4
);
8725 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
8727 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8728 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8729 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8730 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
8731 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
8732 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
8733 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
8735 if (VECTOR_MODE_P (modev2
))
8736 op0
= safe_vector_operand (op0
, modev2
);
8737 if (VECTOR_MODE_P (modev4
))
8738 op2
= safe_vector_operand (op2
, modev4
);
8740 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8741 op0
= copy_to_mode_reg (modev2
, op0
);
8742 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
8743 op1
= copy_to_mode_reg (modei3
, op1
);
8744 if ((optimize
&& !register_operand (op2
, modev4
))
8745 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
8746 op2
= copy_to_mode_reg (modev4
, op2
);
8747 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
8748 op3
= copy_to_mode_reg (modei5
, op3
);
8750 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
8752 error ("the fifth argument must be an 8-bit immediate");
8756 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
8758 if (optimize
|| !target
8759 || GET_MODE (target
) != tmode0
8760 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8761 target
= gen_reg_rtx (tmode0
);
8763 scratch1
= gen_reg_rtx (tmode1
);
8765 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8767 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
8769 if (optimize
|| !target
8770 || GET_MODE (target
) != tmode1
8771 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8772 target
= gen_reg_rtx (tmode1
);
8774 scratch0
= gen_reg_rtx (tmode0
);
8776 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
8780 gcc_assert (d
->flag
);
8782 scratch0
= gen_reg_rtx (tmode0
);
8783 scratch1
= gen_reg_rtx (tmode1
);
8785 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
8795 target
= gen_reg_rtx (SImode
);
8796 emit_move_insn (target
, const0_rtx
);
8797 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8800 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8801 gen_rtx_fmt_ee (EQ
, QImode
,
8802 gen_rtx_REG ((machine_mode
) d
->flag
,
8805 return SUBREG_REG (target
);
8812 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
8815 ix86_expand_sse_pcmpistr (const struct builtin_description
*d
,
8816 tree exp
, rtx target
)
8819 tree arg0
= CALL_EXPR_ARG (exp
, 0);
8820 tree arg1
= CALL_EXPR_ARG (exp
, 1);
8821 tree arg2
= CALL_EXPR_ARG (exp
, 2);
8822 rtx scratch0
, scratch1
;
8823 rtx op0
= expand_normal (arg0
);
8824 rtx op1
= expand_normal (arg1
);
8825 rtx op2
= expand_normal (arg2
);
8826 machine_mode tmode0
, tmode1
, modev2
, modev3
, modeimm
;
8828 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
8829 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
8830 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
8831 modev3
= insn_data
[d
->icode
].operand
[3].mode
;
8832 modeimm
= insn_data
[d
->icode
].operand
[4].mode
;
8834 if (VECTOR_MODE_P (modev2
))
8835 op0
= safe_vector_operand (op0
, modev2
);
8836 if (VECTOR_MODE_P (modev3
))
8837 op1
= safe_vector_operand (op1
, modev3
);
8839 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
8840 op0
= copy_to_mode_reg (modev2
, op0
);
8841 if ((optimize
&& !register_operand (op1
, modev3
))
8842 || !insn_data
[d
->icode
].operand
[3].predicate (op1
, modev3
))
8843 op1
= copy_to_mode_reg (modev3
, op1
);
8845 if (!insn_data
[d
->icode
].operand
[4].predicate (op2
, modeimm
))
8847 error ("the third argument must be an 8-bit immediate");
8851 if (d
->code
== IX86_BUILTIN_PCMPISTRI128
)
8853 if (optimize
|| !target
8854 || GET_MODE (target
) != tmode0
8855 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
8856 target
= gen_reg_rtx (tmode0
);
8858 scratch1
= gen_reg_rtx (tmode1
);
8860 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
);
8862 else if (d
->code
== IX86_BUILTIN_PCMPISTRM128
)
8864 if (optimize
|| !target
8865 || GET_MODE (target
) != tmode1
8866 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
8867 target
= gen_reg_rtx (tmode1
);
8869 scratch0
= gen_reg_rtx (tmode0
);
8871 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
);
8875 gcc_assert (d
->flag
);
8877 scratch0
= gen_reg_rtx (tmode0
);
8878 scratch1
= gen_reg_rtx (tmode1
);
8880 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
);
8890 target
= gen_reg_rtx (SImode
);
8891 emit_move_insn (target
, const0_rtx
);
8892 target
= gen_rtx_SUBREG (QImode
, target
, 0);
8895 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
8896 gen_rtx_fmt_ee (EQ
, QImode
,
8897 gen_rtx_REG ((machine_mode
) d
->flag
,
8900 return SUBREG_REG (target
);
/* Fix up modeless constants to fit the required mode.  */
8909 fixup_modeless_constant (rtx x
, machine_mode mode
)
8911 if (GET_MODE (x
) == VOIDmode
)
8912 x
= convert_to_mode (mode
, x
, 1);
/* Subroutine of ix86_expand_builtin to take care of insns with a
   variable number of operands.  */
8920 ix86_expand_args_builtin (const struct builtin_description
*d
,
8921 tree exp
, rtx target
)
8923 rtx pat
, real_target
;
8924 unsigned int i
, nargs
;
8925 unsigned int nargs_constant
= 0;
8926 unsigned int mask_pos
= 0;
8933 bool second_arg_count
= false;
8934 enum insn_code icode
= d
->icode
;
8935 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
8936 machine_mode tmode
= insn_p
->operand
[0].mode
;
8937 machine_mode rmode
= VOIDmode
;
8939 enum rtx_code comparison
= d
->comparison
;
8941 switch ((enum ix86_builtin_func_type
) d
->flag
)
8943 case V2DF_FTYPE_V2DF_ROUND
:
8944 case V4DF_FTYPE_V4DF_ROUND
:
8945 case V8DF_FTYPE_V8DF_ROUND
:
8946 case V4SF_FTYPE_V4SF_ROUND
:
8947 case V8SF_FTYPE_V8SF_ROUND
:
8948 case V16SF_FTYPE_V16SF_ROUND
:
8949 case V4SI_FTYPE_V4SF_ROUND
:
8950 case V8SI_FTYPE_V8SF_ROUND
:
8951 case V16SI_FTYPE_V16SF_ROUND
:
8952 return ix86_expand_sse_round (d
, exp
, target
);
8953 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
8954 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
8955 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
8956 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
8957 case INT_FTYPE_V8SF_V8SF_PTEST
:
8958 case INT_FTYPE_V4DI_V4DI_PTEST
:
8959 case INT_FTYPE_V4DF_V4DF_PTEST
:
8960 case INT_FTYPE_V4SF_V4SF_PTEST
:
8961 case INT_FTYPE_V2DI_V2DI_PTEST
:
8962 case INT_FTYPE_V2DF_V2DF_PTEST
:
8963 return ix86_expand_sse_ptest (d
, exp
, target
);
8964 case FLOAT128_FTYPE_FLOAT128
:
8965 case FLOAT_FTYPE_FLOAT
:
8967 case UINT_FTYPE_UINT
:
8968 case UINT16_FTYPE_UINT16
:
8969 case UINT64_FTYPE_INT
:
8970 case UINT64_FTYPE_UINT64
:
8971 case INT64_FTYPE_INT64
:
8972 case INT64_FTYPE_V4SF
:
8973 case INT64_FTYPE_V2DF
:
8974 case INT_FTYPE_V16QI
:
8975 case INT_FTYPE_V8QI
:
8976 case INT_FTYPE_V8SF
:
8977 case INT_FTYPE_V4DF
:
8978 case INT_FTYPE_V4SF
:
8979 case INT_FTYPE_V2DF
:
8980 case INT_FTYPE_V32QI
:
8981 case V16QI_FTYPE_V16QI
:
8982 case V8SI_FTYPE_V8SF
:
8983 case V8SI_FTYPE_V4SI
:
8984 case V8HI_FTYPE_V8HI
:
8985 case V8HI_FTYPE_V16QI
:
8986 case V8QI_FTYPE_V8QI
:
8987 case V8SF_FTYPE_V8SF
:
8988 case V8SF_FTYPE_V8SI
:
8989 case V8SF_FTYPE_V4SF
:
8990 case V8SF_FTYPE_V8HI
:
8991 case V4SI_FTYPE_V4SI
:
8992 case V4SI_FTYPE_V16QI
:
8993 case V4SI_FTYPE_V4SF
:
8994 case V4SI_FTYPE_V8SI
:
8995 case V4SI_FTYPE_V8HI
:
8996 case V4SI_FTYPE_V4DF
:
8997 case V4SI_FTYPE_V2DF
:
8998 case V4HI_FTYPE_V4HI
:
8999 case V4DF_FTYPE_V4DF
:
9000 case V4DF_FTYPE_V4SI
:
9001 case V4DF_FTYPE_V4SF
:
9002 case V4DF_FTYPE_V2DF
:
9003 case V4SF_FTYPE_V4SF
:
9004 case V4SF_FTYPE_V4SI
:
9005 case V4SF_FTYPE_V8SF
:
9006 case V4SF_FTYPE_V4DF
:
9007 case V4SF_FTYPE_V8HI
:
9008 case V4SF_FTYPE_V2DF
:
9009 case V2DI_FTYPE_V2DI
:
9010 case V2DI_FTYPE_V16QI
:
9011 case V2DI_FTYPE_V8HI
:
9012 case V2DI_FTYPE_V4SI
:
9013 case V2DF_FTYPE_V2DF
:
9014 case V2DF_FTYPE_V4SI
:
9015 case V2DF_FTYPE_V4DF
:
9016 case V2DF_FTYPE_V4SF
:
9017 case V2DF_FTYPE_V2SI
:
9018 case V2SI_FTYPE_V2SI
:
9019 case V2SI_FTYPE_V4SF
:
9020 case V2SI_FTYPE_V2SF
:
9021 case V2SI_FTYPE_V2DF
:
9022 case V2SF_FTYPE_V2SF
:
9023 case V2SF_FTYPE_V2SI
:
9024 case V32QI_FTYPE_V32QI
:
9025 case V32QI_FTYPE_V16QI
:
9026 case V16HI_FTYPE_V16HI
:
9027 case V16HI_FTYPE_V8HI
:
9028 case V8SI_FTYPE_V8SI
:
9029 case V16HI_FTYPE_V16QI
:
9030 case V8SI_FTYPE_V16QI
:
9031 case V4DI_FTYPE_V16QI
:
9032 case V8SI_FTYPE_V8HI
:
9033 case V4DI_FTYPE_V8HI
:
9034 case V4DI_FTYPE_V4SI
:
9035 case V4DI_FTYPE_V2DI
:
9042 case UHI_FTYPE_V16QI
:
9043 case USI_FTYPE_V32QI
:
9044 case UDI_FTYPE_V64QI
:
9045 case V16QI_FTYPE_UHI
:
9046 case V32QI_FTYPE_USI
:
9047 case V64QI_FTYPE_UDI
:
9048 case V8HI_FTYPE_UQI
:
9049 case V16HI_FTYPE_UHI
:
9050 case V32HI_FTYPE_USI
:
9051 case V4SI_FTYPE_UQI
:
9052 case V8SI_FTYPE_UQI
:
9053 case V4SI_FTYPE_UHI
:
9054 case V8SI_FTYPE_UHI
:
9055 case UQI_FTYPE_V8HI
:
9056 case UHI_FTYPE_V16HI
:
9057 case USI_FTYPE_V32HI
:
9058 case UQI_FTYPE_V4SI
:
9059 case UQI_FTYPE_V8SI
:
9060 case UHI_FTYPE_V16SI
:
9061 case UQI_FTYPE_V2DI
:
9062 case UQI_FTYPE_V4DI
:
9063 case UQI_FTYPE_V8DI
:
9064 case V16SI_FTYPE_UHI
:
9065 case V2DI_FTYPE_UQI
:
9066 case V4DI_FTYPE_UQI
:
9067 case V16SI_FTYPE_INT
:
9068 case V16SF_FTYPE_V8SF
:
9069 case V16SI_FTYPE_V8SI
:
9070 case V16SF_FTYPE_V4SF
:
9071 case V16SI_FTYPE_V4SI
:
9072 case V16SI_FTYPE_V16SF
:
9073 case V16SI_FTYPE_V16SI
:
9074 case V64QI_FTYPE_V64QI
:
9075 case V32HI_FTYPE_V32HI
:
9076 case V16SF_FTYPE_V16SF
:
9077 case V8DI_FTYPE_UQI
:
9078 case V8DI_FTYPE_V8DI
:
9079 case V8DF_FTYPE_V4DF
:
9080 case V8DF_FTYPE_V2DF
:
9081 case V8DF_FTYPE_V8DF
:
9082 case V4DI_FTYPE_V4DI
:
9083 case V16HI_FTYPE_V16SF
:
9084 case V8HI_FTYPE_V8SF
:
9085 case V8HI_FTYPE_V4SF
:
9088 case V4SF_FTYPE_V4SF_VEC_MERGE
:
9089 case V2DF_FTYPE_V2DF_VEC_MERGE
:
9090 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
    case FLOAT128_FTYPE_FLOAT128_FLOAT128:
    case V16QI_FTYPE_V16QI_V16QI:
    case V16QI_FTYPE_V8HI_V8HI:
    case V16SF_FTYPE_V16SF_V16SF:
    case V8QI_FTYPE_V8QI_V8QI:
    case V8QI_FTYPE_V4HI_V4HI:
    case V8HI_FTYPE_V8HI_V8HI:
    case V8HI_FTYPE_V16QI_V16QI:
    case V8HI_FTYPE_V4SI_V4SI:
    case V8SF_FTYPE_V8SF_V8SF:
    case V8SF_FTYPE_V8SF_V8SI:
    case V8DF_FTYPE_V8DF_V8DF:
    case V4SI_FTYPE_V4SI_V4SI:
    case V4SI_FTYPE_V8HI_V8HI:
    case V4SI_FTYPE_V2DF_V2DF:
    case V4HI_FTYPE_V4HI_V4HI:
    case V4HI_FTYPE_V8QI_V8QI:
    case V4HI_FTYPE_V2SI_V2SI:
    case V4DF_FTYPE_V4DF_V4DF:
    case V4DF_FTYPE_V4DF_V4DI:
    case V4SF_FTYPE_V4SF_V4SF:
    case V4SF_FTYPE_V4SF_V4SI:
    case V4SF_FTYPE_V4SF_V2SI:
    case V4SF_FTYPE_V4SF_V2DF:
    case V4SF_FTYPE_V4SF_UINT:
    case V4SF_FTYPE_V4SF_DI:
    case V4SF_FTYPE_V4SF_SI:
    case V2DI_FTYPE_V2DI_V2DI:
    case V2DI_FTYPE_V16QI_V16QI:
    case V2DI_FTYPE_V4SI_V4SI:
    case V2DI_FTYPE_V2DI_V16QI:
    case V2SI_FTYPE_V2SI_V2SI:
    case V2SI_FTYPE_V4HI_V4HI:
    case V2SI_FTYPE_V2SF_V2SF:
    case V2DF_FTYPE_V2DF_V2DF:
    case V2DF_FTYPE_V2DF_V4SF:
    case V2DF_FTYPE_V2DF_V2DI:
    case V2DF_FTYPE_V2DF_DI:
    case V2DF_FTYPE_V2DF_SI:
    case V2DF_FTYPE_V2DF_UINT:
    case V2SF_FTYPE_V2SF_V2SF:
    case V1DI_FTYPE_V1DI_V1DI:
    case V1DI_FTYPE_V8QI_V8QI:
    case V1DI_FTYPE_V2SI_V2SI:
    case V32QI_FTYPE_V16HI_V16HI:
    case V16HI_FTYPE_V8SI_V8SI:
    case V64QI_FTYPE_V64QI_V64QI:
    case V32QI_FTYPE_V32QI_V32QI:
    case V16HI_FTYPE_V32QI_V32QI:
    case V16HI_FTYPE_V16HI_V16HI:
    case V8SI_FTYPE_V4DF_V4DF:
    case V8SI_FTYPE_V8SI_V8SI:
    case V8SI_FTYPE_V16HI_V16HI:
    case V4DI_FTYPE_V4DI_V4DI:
    case V4DI_FTYPE_V8SI_V8SI:
    case V8DI_FTYPE_V64QI_V64QI:
      if (comparison == UNKNOWN)
        return ix86_expand_binop_builtin (icode, exp, target);

    case V4SF_FTYPE_V4SF_V4SF_SWAP:
    case V2DF_FTYPE_V2DF_V2DF_SWAP:
      gcc_assert (comparison != UNKNOWN);
    case V16HI_FTYPE_V16HI_V8HI_COUNT:
    case V16HI_FTYPE_V16HI_SI_COUNT:
    case V8SI_FTYPE_V8SI_V4SI_COUNT:
    case V8SI_FTYPE_V8SI_SI_COUNT:
    case V4DI_FTYPE_V4DI_V2DI_COUNT:
    case V4DI_FTYPE_V4DI_INT_COUNT:
    case V8HI_FTYPE_V8HI_V8HI_COUNT:
    case V8HI_FTYPE_V8HI_SI_COUNT:
    case V4SI_FTYPE_V4SI_V4SI_COUNT:
    case V4SI_FTYPE_V4SI_SI_COUNT:
    case V4HI_FTYPE_V4HI_V4HI_COUNT:
    case V4HI_FTYPE_V4HI_SI_COUNT:
    case V2DI_FTYPE_V2DI_V2DI_COUNT:
    case V2DI_FTYPE_V2DI_SI_COUNT:
    case V2SI_FTYPE_V2SI_V2SI_COUNT:
    case V2SI_FTYPE_V2SI_SI_COUNT:
    case V1DI_FTYPE_V1DI_V1DI_COUNT:
    case V1DI_FTYPE_V1DI_SI_COUNT:
      second_arg_count = true;

    case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
    case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
    case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
    case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
    case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
    case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
    case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
    case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
    case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
    case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
    case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
    case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
    case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
    case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
    case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
    case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
    case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
      second_arg_count = true;
    case UINT64_FTYPE_UINT64_UINT64:
    case UINT_FTYPE_UINT_UINT:
    case UINT_FTYPE_UINT_USHORT:
    case UINT_FTYPE_UINT_UCHAR:
    case UINT16_FTYPE_UINT16_INT:
    case UINT8_FTYPE_UINT8_INT:
    case UQI_FTYPE_UQI_UQI:
    case UHI_FTYPE_UHI_UHI:
    case USI_FTYPE_USI_USI:
    case UDI_FTYPE_UDI_UDI:
    case V16SI_FTYPE_V8DF_V8DF:
    case V32HI_FTYPE_V16SF_V16SF:
    case V16HI_FTYPE_V8SF_V8SF:
    case V8HI_FTYPE_V4SF_V4SF:
    case V16HI_FTYPE_V16SF_UHI:
    case V8HI_FTYPE_V8SF_UQI:
    case V8HI_FTYPE_V4SF_UQI:

    case V2DI_FTYPE_V2DI_INT_CONVERT:

    case V4DI_FTYPE_V4DI_INT_CONVERT:

    case V8DI_FTYPE_V8DI_INT_CONVERT:

    case V8HI_FTYPE_V8HI_INT:
    case V8HI_FTYPE_V8SF_INT:
    case V16HI_FTYPE_V16SF_INT:
    case V8HI_FTYPE_V4SF_INT:
    case V8SF_FTYPE_V8SF_INT:
    case V4SF_FTYPE_V16SF_INT:
    case V16SF_FTYPE_V16SF_INT:
    case V4SI_FTYPE_V4SI_INT:
    case V4SI_FTYPE_V8SI_INT:
    case V4HI_FTYPE_V4HI_INT:
    case V4DF_FTYPE_V4DF_INT:
    case V4DF_FTYPE_V8DF_INT:
    case V4SF_FTYPE_V4SF_INT:
    case V4SF_FTYPE_V8SF_INT:
    case V2DI_FTYPE_V2DI_INT:
    case V2DF_FTYPE_V2DF_INT:
    case V2DF_FTYPE_V4DF_INT:
    case V16HI_FTYPE_V16HI_INT:
    case V8SI_FTYPE_V8SI_INT:
    case V16SI_FTYPE_V16SI_INT:
    case V4SI_FTYPE_V16SI_INT:
    case V4DI_FTYPE_V4DI_INT:
    case V2DI_FTYPE_V4DI_INT:
    case V4DI_FTYPE_V8DI_INT:
    case UQI_FTYPE_UQI_UQI_CONST:
    case UHI_FTYPE_UHI_UQI:
    case USI_FTYPE_USI_UQI:
    case UDI_FTYPE_UDI_UQI:
    case V16QI_FTYPE_V16QI_V16QI_V16QI:
    case V8SF_FTYPE_V8SF_V8SF_V8SF:
    case V4DF_FTYPE_V4DF_V4DF_V4DF:
    case V4SF_FTYPE_V4SF_V4SF_V4SF:
    case V2DF_FTYPE_V2DF_V2DF_V2DF:
    case V32QI_FTYPE_V32QI_V32QI_V32QI:
    case UHI_FTYPE_V16SI_V16SI_UHI:
    case UQI_FTYPE_V8DI_V8DI_UQI:
    case V16HI_FTYPE_V16SI_V16HI_UHI:
    case V16QI_FTYPE_V16SI_V16QI_UHI:
    case V16QI_FTYPE_V8DI_V16QI_UQI:
    case V16SF_FTYPE_V16SF_V16SF_UHI:
    case V16SF_FTYPE_V4SF_V16SF_UHI:
    case V16SI_FTYPE_SI_V16SI_UHI:
    case V16SI_FTYPE_V16HI_V16SI_UHI:
    case V16SI_FTYPE_V16QI_V16SI_UHI:
    case V8SF_FTYPE_V4SF_V8SF_UQI:
    case V4DF_FTYPE_V2DF_V4DF_UQI:
    case V8SI_FTYPE_V4SI_V8SI_UQI:
    case V8SI_FTYPE_SI_V8SI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_UQI:
    case V4SI_FTYPE_SI_V4SI_UQI:
    case V4DI_FTYPE_V2DI_V4DI_UQI:
    case V4DI_FTYPE_DI_V4DI_UQI:
    case V2DI_FTYPE_V2DI_V2DI_UQI:
    case V2DI_FTYPE_DI_V2DI_UQI:
    case V64QI_FTYPE_V64QI_V64QI_UDI:
    case V64QI_FTYPE_V16QI_V64QI_UDI:
    case V64QI_FTYPE_QI_V64QI_UDI:
    case V32QI_FTYPE_V32QI_V32QI_USI:
    case V32QI_FTYPE_V16QI_V32QI_USI:
    case V32QI_FTYPE_QI_V32QI_USI:
    case V16QI_FTYPE_V16QI_V16QI_UHI:
    case V16QI_FTYPE_QI_V16QI_UHI:
    case V32HI_FTYPE_V8HI_V32HI_USI:
    case V32HI_FTYPE_HI_V32HI_USI:
    case V16HI_FTYPE_V8HI_V16HI_UHI:
    case V16HI_FTYPE_HI_V16HI_UHI:
    case V8HI_FTYPE_V8HI_V8HI_UQI:
    case V8HI_FTYPE_HI_V8HI_UQI:
    case V8SF_FTYPE_V8HI_V8SF_UQI:
    case V4SF_FTYPE_V8HI_V4SF_UQI:
    case V8SI_FTYPE_V8SF_V8SI_UQI:
    case V4SI_FTYPE_V4SF_V4SI_UQI:
    case V4DI_FTYPE_V4SF_V4DI_UQI:
    case V2DI_FTYPE_V4SF_V2DI_UQI:
    case V4SF_FTYPE_V4DI_V4SF_UQI:
    case V4SF_FTYPE_V2DI_V4SF_UQI:
    case V4DF_FTYPE_V4DI_V4DF_UQI:
    case V2DF_FTYPE_V2DI_V2DF_UQI:
    case V16QI_FTYPE_V8HI_V16QI_UQI:
    case V16QI_FTYPE_V16HI_V16QI_UHI:
    case V16QI_FTYPE_V4SI_V16QI_UQI:
    case V16QI_FTYPE_V8SI_V16QI_UQI:
    case V8HI_FTYPE_V4SI_V8HI_UQI:
    case V8HI_FTYPE_V8SI_V8HI_UQI:
    case V16QI_FTYPE_V2DI_V16QI_UQI:
    case V16QI_FTYPE_V4DI_V16QI_UQI:
    case V8HI_FTYPE_V2DI_V8HI_UQI:
    case V8HI_FTYPE_V4DI_V8HI_UQI:
    case V4SI_FTYPE_V2DI_V4SI_UQI:
    case V4SI_FTYPE_V4DI_V4SI_UQI:
    case V32QI_FTYPE_V32HI_V32QI_USI:
    case UHI_FTYPE_V16QI_V16QI_UHI:
    case USI_FTYPE_V32QI_V32QI_USI:
    case UDI_FTYPE_V64QI_V64QI_UDI:
    case UQI_FTYPE_V8HI_V8HI_UQI:
    case UHI_FTYPE_V16HI_V16HI_UHI:
    case USI_FTYPE_V32HI_V32HI_USI:
    case UQI_FTYPE_V4SI_V4SI_UQI:
    case UQI_FTYPE_V8SI_V8SI_UQI:
    case UQI_FTYPE_V2DI_V2DI_UQI:
    case UQI_FTYPE_V4DI_V4DI_UQI:
    case V4SF_FTYPE_V2DF_V4SF_UQI:
    case V4SF_FTYPE_V4DF_V4SF_UQI:
    case V16SI_FTYPE_V16SI_V16SI_UHI:
    case V16SI_FTYPE_V4SI_V16SI_UHI:
    case V2DI_FTYPE_V4SI_V2DI_UQI:
    case V2DI_FTYPE_V8HI_V2DI_UQI:
    case V2DI_FTYPE_V16QI_V2DI_UQI:
    case V4DI_FTYPE_V4DI_V4DI_UQI:
    case V4DI_FTYPE_V4SI_V4DI_UQI:
    case V4DI_FTYPE_V8HI_V4DI_UQI:
    case V4DI_FTYPE_V16QI_V4DI_UQI:
    case V4DI_FTYPE_V4DF_V4DI_UQI:
    case V2DI_FTYPE_V2DF_V2DI_UQI:
    case V4SI_FTYPE_V4DF_V4SI_UQI:
    case V4SI_FTYPE_V2DF_V4SI_UQI:
    case V4SI_FTYPE_V8HI_V4SI_UQI:
    case V4SI_FTYPE_V16QI_V4SI_UQI:
    case V4DI_FTYPE_V4DI_V4DI_V4DI:
    case V8DF_FTYPE_V2DF_V8DF_UQI:
    case V8DF_FTYPE_V4DF_V8DF_UQI:
    case V8DF_FTYPE_V8DF_V8DF_UQI:
    case V8SF_FTYPE_V8SF_V8SF_UQI:
    case V8SF_FTYPE_V8SI_V8SF_UQI:
    case V4DF_FTYPE_V4DF_V4DF_UQI:
    case V4SF_FTYPE_V4SF_V4SF_UQI:
    case V2DF_FTYPE_V2DF_V2DF_UQI:
    case V2DF_FTYPE_V4SF_V2DF_UQI:
    case V2DF_FTYPE_V4SI_V2DF_UQI:
    case V4SF_FTYPE_V4SI_V4SF_UQI:
    case V4DF_FTYPE_V4SF_V4DF_UQI:
    case V4DF_FTYPE_V4SI_V4DF_UQI:
    case V8SI_FTYPE_V8SI_V8SI_UQI:
    case V8SI_FTYPE_V8HI_V8SI_UQI:
    case V8SI_FTYPE_V16QI_V8SI_UQI:
    case V8DF_FTYPE_V8SI_V8DF_UQI:
    case V8DI_FTYPE_DI_V8DI_UQI:
    case V16SF_FTYPE_V8SF_V16SF_UHI:
    case V16SI_FTYPE_V8SI_V16SI_UHI:
    case V16HI_FTYPE_V16HI_V16HI_UHI:
    case V8HI_FTYPE_V16QI_V8HI_UQI:
    case V16HI_FTYPE_V16QI_V16HI_UHI:
    case V32HI_FTYPE_V32HI_V32HI_USI:
    case V32HI_FTYPE_V32QI_V32HI_USI:
    case V8DI_FTYPE_V16QI_V8DI_UQI:
    case V8DI_FTYPE_V2DI_V8DI_UQI:
    case V8DI_FTYPE_V4DI_V8DI_UQI:
    case V8DI_FTYPE_V8DI_V8DI_UQI:
    case V8DI_FTYPE_V8HI_V8DI_UQI:
    case V8DI_FTYPE_V8SI_V8DI_UQI:
    case V8HI_FTYPE_V8DI_V8HI_UQI:
    case V8SI_FTYPE_V8DI_V8SI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_V4SI:
    case V16SI_FTYPE_V16SI_V16SI_V16SI:
    case V8DI_FTYPE_V8DI_V8DI_V8DI:
    case V32HI_FTYPE_V32HI_V32HI_V32HI:
    case V2DI_FTYPE_V2DI_V2DI_V2DI:
    case V16HI_FTYPE_V16HI_V16HI_V16HI:
    case V8SI_FTYPE_V8SI_V8SI_V8SI:
    case V8HI_FTYPE_V8HI_V8HI_V8HI:
    case V32HI_FTYPE_V16SF_V16SF_USI:
    case V16HI_FTYPE_V8SF_V8SF_UHI:
    case V8HI_FTYPE_V4SF_V4SF_UQI:
    case V16HI_FTYPE_V16SF_V16HI_UHI:
    case V8HI_FTYPE_V8SF_V8HI_UQI:
    case V8HI_FTYPE_V4SF_V8HI_UQI:
    case V16SF_FTYPE_V16SF_V32HI_V32HI:
    case V8SF_FTYPE_V8SF_V16HI_V16HI:
    case V4SF_FTYPE_V4SF_V8HI_V8HI:
    case V32QI_FTYPE_V32QI_V32QI_INT:
    case V16HI_FTYPE_V16HI_V16HI_INT:
    case V16QI_FTYPE_V16QI_V16QI_INT:
    case V4DI_FTYPE_V4DI_V4DI_INT:
    case V8HI_FTYPE_V8HI_V8HI_INT:
    case V8SI_FTYPE_V8SI_V8SI_INT:
    case V8SI_FTYPE_V8SI_V4SI_INT:
    case V8SF_FTYPE_V8SF_V8SF_INT:
    case V8SF_FTYPE_V8SF_V4SF_INT:
    case V4SI_FTYPE_V4SI_V4SI_INT:
    case V4DF_FTYPE_V4DF_V4DF_INT:
    case V16SF_FTYPE_V16SF_V16SF_INT:
    case V16SF_FTYPE_V16SF_V4SF_INT:
    case V16SI_FTYPE_V16SI_V4SI_INT:
    case V4DF_FTYPE_V4DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DI_FTYPE_V2DI_V2DI_INT:
    case V4DI_FTYPE_V4DI_V2DI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case UQI_FTYPE_V8DI_V8UDI_INT:
    case UQI_FTYPE_V8DF_V8DF_INT:
    case UQI_FTYPE_V2DF_V2DF_INT:
    case UQI_FTYPE_V4SF_V4SF_INT:
    case UHI_FTYPE_V16SI_V16SI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT:
    case V64QI_FTYPE_V64QI_V64QI_INT:
    case V32HI_FTYPE_V32HI_V32HI_INT:
    case V16SI_FTYPE_V16SI_V16SI_INT:
    case V8DI_FTYPE_V8DI_V8DI_INT:

    case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:

    case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:

    case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:

    case V2DI_FTYPE_V2DI_UINT_UINT:

    case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:

    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:

    case QI_FTYPE_V8DF_INT_UQI:
    case QI_FTYPE_V4DF_INT_UQI:
    case QI_FTYPE_V2DF_INT_UQI:
    case HI_FTYPE_V16SF_INT_UHI:
    case QI_FTYPE_V8SF_INT_UQI:
    case QI_FTYPE_V4SF_INT_UQI:
    case V4SI_FTYPE_V4SI_V4SI_UHI:
    case V8SI_FTYPE_V8SI_V8SI_UHI:

    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:

    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
    case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
    case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
    case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
    case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
    case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
    case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
    case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
    case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
    case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
    case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
    case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
    case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
    case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
    case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
    case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
    case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
    case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
    case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
    case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
    case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
    case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
    case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
    case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
    case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
    case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
    case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
    case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
    case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
    case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
    case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
    case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
    case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
    case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
    case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
    case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
    case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
    case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
    case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
    case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
    case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
    case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
    case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
    case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
    case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
    case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
    case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
    case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
    case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
    case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
    case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:

    case UQI_FTYPE_V4DI_V4DI_INT_UQI:
    case UQI_FTYPE_V8SI_V8SI_INT_UQI:
    case QI_FTYPE_V4DF_V4DF_INT_UQI:
    case QI_FTYPE_V8SF_V8SF_INT_UQI:
    case UQI_FTYPE_V2DI_V2DI_INT_UQI:
    case UQI_FTYPE_V4SI_V4SI_INT_UQI:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI:
    case UDI_FTYPE_V64QI_V64QI_INT_UDI:
    case USI_FTYPE_V32QI_V32QI_INT_USI:
    case UHI_FTYPE_V16QI_V16QI_INT_UHI:
    case USI_FTYPE_V32HI_V32HI_INT_USI:
    case UHI_FTYPE_V16HI_V16HI_INT_UHI:
    case UQI_FTYPE_V8HI_V8HI_INT_UQI:

    case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:

    case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
    case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
    case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
    case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
    case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:

    case UQI_FTYPE_V8DI_V8DI_INT_UQI:
    case UHI_FTYPE_V16SI_V16SI_INT_UHI:
    case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
    case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
    case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
    case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
    case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
    case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
    case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
    case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
    case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
    case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
    case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
    case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
    case V32HI_FTYPE_V32HI_INT_V32HI_USI:
    case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
    case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
    case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
    case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
    case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
    case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
    case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
    case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
    case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
    case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
    case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
    case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
    case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
    case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
    case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
    case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
    case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
    case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
    case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
    case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
    case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
    case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
    case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
    case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
    case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
    case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
    case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
    case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
    case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
    case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
    case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
    case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
    case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
    case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
    case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:

    case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
    case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
    case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
    case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
    case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
    case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
    case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
    case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:

    case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
    case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
    case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
    case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
    case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
    case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
    case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
    case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
    case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
    case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
    case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
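
    /* Illustrative note: each FTYPE value reads as
       <return type>_FTYPE_<argument types>; e.g.
       V4SF_FTYPE_V4SF_V4SF_INT describes a builtin returning a V4SF
       from two V4SF operands plus an integer immediate, which is why
       the groups ending in _INT bump nargs_constant for the
       immediate-range checks performed below.  */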
  gcc_assert (nargs <= ARRAY_SIZE (args));

  if (comparison != UNKNOWN)
    {
      gcc_assert (nargs == 2);
      return ix86_expand_sse_compare (d, exp, target, swap);
    }

  if (rmode == VOIDmode || rmode == tmode)
    {
      if (optimize
          || target == 0
          || GET_MODE (target) != tmode
          || !insn_p->operand[0].predicate (target, tmode))
        target = gen_reg_rtx (tmode);
      else if (memory_operand (target, tmode))
        num_memory++;
      real_target = target;
    }
  else
    {
      real_target = gen_reg_rtx (tmode);
      target = lowpart_subreg (rmode, real_target, tmode);
    }
  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (second_arg_count && i == 1)
        {
          /* SIMD shift insns take either an 8-bit immediate or
             register as count.  But builtin functions take int as
             count.  If count doesn't match, we put it in register.
             The instructions are using 64-bit count, if op is just
             32-bit, zero-extend it, as negative shift counts
             are undefined behavior and zero-extension is more
             efficient.  */
          if (SCALAR_INT_MODE_P (GET_MODE (op)))
            op = convert_modes (mode, GET_MODE (op), op, 1);
          else
            op = lowpart_subreg (mode, op, GET_MODE (op));
          if (!insn_p->operand[i + 1].predicate (op, mode))
            op = copy_to_reg (op);
        }
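
      /* Illustrative note: a shift builtin such as
         __builtin_ia32_psllwi128 receives its count as a plain int, so
         the conversion above widens (or narrows) it to the count mode
         the insn pattern expects and, if the predicate still rejects
         it, forces it into a register.  */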
      else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
               || (!mask_pos && (nargs - i) <= nargs_constant))

            case CODE_FOR_avx_vinsertf128v4di:
            case CODE_FOR_avx_vextractf128v4di:
              error ("the last argument must be a 1-bit immediate");

            case CODE_FOR_avx512f_cmpv8di3_mask:
            case CODE_FOR_avx512f_cmpv16si3_mask:
            case CODE_FOR_avx512f_ucmpv8di3_mask:
            case CODE_FOR_avx512f_ucmpv16si3_mask:
            case CODE_FOR_avx512vl_cmpv4di3_mask:
            case CODE_FOR_avx512vl_cmpv8si3_mask:
            case CODE_FOR_avx512vl_ucmpv4di3_mask:
            case CODE_FOR_avx512vl_ucmpv8si3_mask:
            case CODE_FOR_avx512vl_cmpv2di3_mask:
            case CODE_FOR_avx512vl_cmpv4si3_mask:
            case CODE_FOR_avx512vl_ucmpv2di3_mask:
            case CODE_FOR_avx512vl_ucmpv4si3_mask:
              error ("the last argument must be a 3-bit immediate");
            case CODE_FOR_sse4_1_roundsd:
            case CODE_FOR_sse4_1_roundss:

            case CODE_FOR_sse4_1_roundpd:
            case CODE_FOR_sse4_1_roundps:
            case CODE_FOR_avx_roundpd256:
            case CODE_FOR_avx_roundps256:

            case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
            case CODE_FOR_sse4_1_roundps_sfix:
            case CODE_FOR_avx_roundpd_vec_pack_sfix256:
            case CODE_FOR_avx_roundps_sfix256:

            case CODE_FOR_sse4_1_blendps:
            case CODE_FOR_avx_blendpd256:
            case CODE_FOR_avx_vpermilv4df:
            case CODE_FOR_avx_vpermilv4df_mask:
            case CODE_FOR_avx512f_getmantv8df_mask:
            case CODE_FOR_avx512f_getmantv16sf_mask:
            case CODE_FOR_avx512vl_getmantv8sf_mask:
            case CODE_FOR_avx512vl_getmantv4df_mask:
            case CODE_FOR_avx512vl_getmantv4sf_mask:
            case CODE_FOR_avx512vl_getmantv2df_mask:
            case CODE_FOR_avx512dq_rangepv8df_mask_round:
            case CODE_FOR_avx512dq_rangepv16sf_mask_round:
            case CODE_FOR_avx512dq_rangepv4df_mask:
            case CODE_FOR_avx512dq_rangepv8sf_mask:
            case CODE_FOR_avx512dq_rangepv2df_mask:
            case CODE_FOR_avx512dq_rangepv4sf_mask:
            case CODE_FOR_avx_shufpd256_mask:
              error ("the last argument must be a 4-bit immediate");

            case CODE_FOR_sha1rnds4:
            case CODE_FOR_sse4_1_blendpd:
            case CODE_FOR_avx_vpermilv2df:
            case CODE_FOR_avx_vpermilv2df_mask:
            case CODE_FOR_xop_vpermil2v2df3:
            case CODE_FOR_xop_vpermil2v4sf3:
            case CODE_FOR_xop_vpermil2v4df3:
            case CODE_FOR_xop_vpermil2v8sf3:
            case CODE_FOR_avx512f_vinsertf32x4_mask:
            case CODE_FOR_avx512f_vinserti32x4_mask:
            case CODE_FOR_avx512f_vextractf32x4_mask:
            case CODE_FOR_avx512f_vextracti32x4_mask:
            case CODE_FOR_sse2_shufpd:
            case CODE_FOR_sse2_shufpd_mask:
            case CODE_FOR_avx512dq_shuf_f64x2_mask:
            case CODE_FOR_avx512dq_shuf_i64x2_mask:
            case CODE_FOR_avx512vl_shuf_i32x4_mask:
            case CODE_FOR_avx512vl_shuf_f32x4_mask:
              error ("the last argument must be a 2-bit immediate");
            case CODE_FOR_avx_vextractf128v4df:
            case CODE_FOR_avx_vextractf128v8sf:
            case CODE_FOR_avx_vextractf128v8si:
            case CODE_FOR_avx_vinsertf128v4df:
            case CODE_FOR_avx_vinsertf128v8sf:
            case CODE_FOR_avx_vinsertf128v8si:
            case CODE_FOR_avx512f_vinsertf64x4_mask:
            case CODE_FOR_avx512f_vinserti64x4_mask:
            case CODE_FOR_avx512f_vextractf64x4_mask:
            case CODE_FOR_avx512f_vextracti64x4_mask:
            case CODE_FOR_avx512dq_vinsertf32x8_mask:
            case CODE_FOR_avx512dq_vinserti32x8_mask:
            case CODE_FOR_avx512vl_vinsertv4df:
            case CODE_FOR_avx512vl_vinsertv4di:
            case CODE_FOR_avx512vl_vinsertv8sf:
            case CODE_FOR_avx512vl_vinsertv8si:
              error ("the last argument must be a 1-bit immediate");

            case CODE_FOR_avx_vmcmpv2df3:
            case CODE_FOR_avx_vmcmpv4sf3:
            case CODE_FOR_avx_cmpv2df3:
            case CODE_FOR_avx_cmpv4sf3:
            case CODE_FOR_avx_cmpv4df3:
            case CODE_FOR_avx_cmpv8sf3:
            case CODE_FOR_avx512f_cmpv8df3_mask:
            case CODE_FOR_avx512f_cmpv16sf3_mask:
            case CODE_FOR_avx512f_vmcmpv2df3_mask:
            case CODE_FOR_avx512f_vmcmpv4sf3_mask:
              error ("the last argument must be a 5-bit immediate");
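
            /* Illustrative note: the widths checked here follow the
               instruction encodings, e.g. vblendpd on 256-bit operands
               selects among four elements (a 4-bit mask), while the AVX
               vcmpps/vcmppd predicates use the 5-bit 0..31 encoding.  */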
              switch (nargs_constant)

                if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
                    || (!mask_pos && (nargs - i) == nargs_constant))
                  error ("the next to last argument must be an 8-bit immediate");

                error ("the last argument must be an 8-bit immediate");
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          /* If we aren't optimizing, only allow one memory operand to
             be generated.  */
          if (memory_operand (op, mode))
            num_memory++;

          op = fixup_modeless_constant (op, mode);

          if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            {
              if (optimize || !match || num_memory > 1)
                op = copy_to_mode_reg (mode, op);
            }
          else
            {
              op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
            }

      args[i].mode = mode;
    }
      pat = GEN_FCN (icode) (real_target, args[0].op);

      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);

      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
                             args[2].op);

      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
                             args[2].op, args[3].op);

      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
                             args[2].op, args[3].op, args[4].op);

      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
                             args[2].op, args[3].op, args[4].op,
                             args[5].op);
/* Transform pattern of following layout:
     (set A
       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set (A B)).  */

ix86_erase_embedded_rounding (rtx pat)
{
  if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);

  gcc_assert (GET_CODE (pat) == SET);
  rtx src = SET_SRC (pat);
  gcc_assert (XVECLEN (src, 0) == 2);
  rtx p0 = XVECEXP (src, 0, 0);
  gcc_assert (GET_CODE (src) == UNSPEC
              && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
  return res;
}
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with rounding.  */

ix86_expand_sse_comi_round (const struct builtin_description *d,
                            tree exp, rtx target)
{
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* See avxintrin.h for values.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false,
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false
    };
  static const bool non_signalings[32] =
    {
      true, false, false, true, true, false, false, true,
      true, false, false, true, true, false, false, true,
      false, true, true, false, false, true, true, false,
      false, true, true, false, false, true, true, false
    };
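
  /* Illustrative note: the tables are indexed by the _CMP_* predicate
     value from avxintrin.h; e.g. _CMP_LT_OS (value 1) maps to LT,
     ordered and signaling, while _CMP_NEQ_UQ (value 4) maps to NE,
     unordered and non-signaling.  */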
  if (!CONST_INT_P (op2))
    error ("the third argument must be a comparison constant");

  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    error ("incorrect comparison mode");

  if (!insn_p->operand[2].predicate (op3, SImode))
    error ("incorrect rounding operand");
  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)

      /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
      if (!non_signaling)

      /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */

      /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */

      /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
      if (!non_signaling)

    case LE:   /* -> GE  */
    case LT:   /* -> GT  */
    case UNGE: /* -> UNLE  */
    case UNGT: /* -> UNLT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);

      /* These are supported by CCFPmode.  NB: Use ordered/signaling
         COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
         with NAN operands.  */
      if (ordered == non_signaling)
        ordered = !ordered;

      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
         _CMP_EQ_OQ/_CMP_EQ_OS.  */
      check_unordered = true;

      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
         _CMP_NEQ_UQ/_CMP_NEQ_US.  */
      gcc_assert (!ordered);
      check_unordered = true;

      const_val = const1_rtx;

      gcc_unreachable ();
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);
  /*
     1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.
   */
  if (non_signaling)
    icode = (icode == CODE_FOR_sse_comi_round
             ? CODE_FOR_sse_ucomi_round
             : CODE_FOR_sse2_ucomi_round);

  pat = GEN_FCN (icode) (op0, op1, op3);
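
  /* Illustrative note: comiss/comisd raise the invalid exception on
     any NaN operand, whereas ucomiss/ucomisd raise it only on SNaN,
     which is why the quiet (non-signaling) predicates are routed to
     the ucomi patterns above.  */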
  rtx set_dst;
  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      pat = ix86_erase_embedded_rounding (pat);

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }
  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                                  gen_rtx_LABEL_REF (VOIDmode, label),
                                  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
                  || mode == CCOmode || mode == CCPmode
                  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                          gen_rtx_fmt_ee (comparison, QImode,
                                          set_dst, const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
ix86_expand_round_builtin (const struct builtin_description *d,
                           tree exp, rtx target)
{
  unsigned int i, nargs;

  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  unsigned int nargs_constant = 0;
  unsigned int redundant_embed_rnd = 0;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case UINT64_FTYPE_V2DF_INT:
    case UINT64_FTYPE_V4SF_INT:
    case UINT_FTYPE_V2DF_INT:
    case UINT_FTYPE_V4SF_INT:
    case INT64_FTYPE_V2DF_INT:
    case INT64_FTYPE_V4SF_INT:
    case INT_FTYPE_V2DF_INT:
    case INT_FTYPE_V4SF_INT:

    case V4SF_FTYPE_V4SF_UINT_INT:
    case V4SF_FTYPE_V4SF_UINT64_INT:
    case V2DF_FTYPE_V2DF_UINT64_INT:
    case V4SF_FTYPE_V4SF_INT_INT:
    case V4SF_FTYPE_V4SF_INT64_INT:
    case V2DF_FTYPE_V2DF_INT64_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V2DF_INT:
    case V2DF_FTYPE_V2DF_V4SF_INT:

    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
    case V8DI_FTYPE_V8DF_V8DI_QI_INT:
    case V8SF_FTYPE_V8DI_V8SF_QI_INT:
    case V8DF_FTYPE_V8DI_V8DF_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
    case V8DI_FTYPE_V8SF_V8DI_QI_INT:
    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:

    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
      nargs_constant = 2;

    case INT_FTYPE_V4SF_V4SF_INT_INT:
    case INT_FTYPE_V2DF_V2DF_INT_INT:
      return ix86_expand_sse_comi_round (d, exp, target);
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:

    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
      nargs_constant = 4;

    case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
      nargs_constant = 3;

    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
      nargs_constant = 4;

    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
      nargs_constant = 3;

      gcc_unreachable ();
    }
  gcc_assert (nargs <= ARRAY_SIZE (args));

  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
        {

            case CODE_FOR_avx512f_getmantv8df_mask_round:
            case CODE_FOR_avx512f_getmantv16sf_mask_round:
            case CODE_FOR_avx512f_vgetmantv2df_round:
            case CODE_FOR_avx512f_vgetmantv2df_mask_round:
            case CODE_FOR_avx512f_vgetmantv4sf_round:
            case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
              error ("the immediate argument must be a 4-bit immediate");

            case CODE_FOR_avx512f_cmpv8df3_mask_round:
            case CODE_FOR_avx512f_cmpv16sf3_mask_round:
            case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
            case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
              error ("the immediate argument must be a 5-bit immediate");

              error ("the immediate argument must be an 8-bit immediate");
        }
      else if (i == nargs - 1)
        {
          if (!insn_p->operand[nargs].predicate (op, SImode))
            error ("incorrect rounding operand");

          /* If there is no rounding use normal version of the pattern.  */
          if (INTVAL (op) == NO_ROUND)
            redundant_embed_rnd = 1;
        }
      else
        {
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          op = fixup_modeless_constant (op, mode);

          if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            {
              if (optimize || !match)
                op = copy_to_mode_reg (mode, op);
            }
          else
            {
              op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
            }
        }

      args[i].mode = mode;
    }
      pat = GEN_FCN (icode) (target, args[0].op);

      pat = GEN_FCN (icode) (target, args[0].op, args[1].op);

      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
                             args[2].op);

      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
                             args[2].op, args[3].op);

      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
                             args[2].op, args[3].op, args[4].op);

      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
                             args[2].op, args[3].op, args[4].op,
                             args[5].op);

      gcc_unreachable ();

  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);
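
  /* Illustrative note: when the rounding argument is
     _MM_FROUND_CUR_DIRECTION (NO_ROUND), e.g.
     _mm512_add_round_ps (a, b, _MM_FROUND_CUR_DIRECTION), the embedded
     rounding unspec adds nothing, so it is erased here and the plain
     pattern is emitted instead.  */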
/* Subroutine of ix86_expand_builtin to take care of special insns
   with variable number of operands.  */

ix86_expand_special_args_builtin (const struct builtin_description *d,
                                  tree exp, rtx target)
{
  unsigned int i, nargs, arg_adjust, memory;
  bool aligned_mem = false;

  enum insn_code icode = d->icode;
  bool last_arg_constant = false;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  enum { load, store } klass;

  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case VOID_FTYPE_VOID:
      emit_insn (GEN_FCN (icode) (target));

    case VOID_FTYPE_UINT64:
    case VOID_FTYPE_UNSIGNED:

    case INT_FTYPE_VOID:
    case USHORT_FTYPE_VOID:
    case UINT64_FTYPE_VOID:
    case UINT_FTYPE_VOID:
    case UNSIGNED_FTYPE_VOID:

    case UINT64_FTYPE_PUNSIGNED:
    case V2DI_FTYPE_PV2DI:
    case V4DI_FTYPE_PV4DI:
    case V32QI_FTYPE_PCCHAR:
    case V16QI_FTYPE_PCCHAR:
    case V8SF_FTYPE_PCV4SF:
    case V8SF_FTYPE_PCFLOAT:
    case V4SF_FTYPE_PCFLOAT:
    case V4DF_FTYPE_PCV2DF:
    case V4DF_FTYPE_PCDOUBLE:
    case V2DF_FTYPE_PCDOUBLE:
    case VOID_FTYPE_PVOID:
    case V8DI_FTYPE_PV8DI:

        case CODE_FOR_sse4_1_movntdqa:
        case CODE_FOR_avx2_movntdqa:
        case CODE_FOR_avx512f_movntdqa:
          aligned_mem = true;
    case VOID_FTYPE_PV2SF_V4SF:
    case VOID_FTYPE_PV8DI_V8DI:
    case VOID_FTYPE_PV4DI_V4DI:
    case VOID_FTYPE_PV2DI_V2DI:
    case VOID_FTYPE_PCHAR_V32QI:
    case VOID_FTYPE_PCHAR_V16QI:
    case VOID_FTYPE_PFLOAT_V16SF:
    case VOID_FTYPE_PFLOAT_V8SF:
    case VOID_FTYPE_PFLOAT_V4SF:
    case VOID_FTYPE_PDOUBLE_V8DF:
    case VOID_FTYPE_PDOUBLE_V4DF:
    case VOID_FTYPE_PDOUBLE_V2DF:
    case VOID_FTYPE_PLONGLONG_LONGLONG:
    case VOID_FTYPE_PULONGLONG_ULONGLONG:
    case VOID_FTYPE_PUNSIGNED_UNSIGNED:
    case VOID_FTYPE_PINT_INT:

      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (args);

        /* These builtins and instructions require the memory
           to be properly aligned.  */
        case CODE_FOR_avx_movntv4di:
        case CODE_FOR_sse2_movntv2di:
        case CODE_FOR_avx_movntv8sf:
        case CODE_FOR_sse_movntv4sf:
        case CODE_FOR_sse4a_vmmovntv4sf:
        case CODE_FOR_avx_movntv4df:
        case CODE_FOR_sse2_movntv2df:
        case CODE_FOR_sse4a_vmmovntv2df:
        case CODE_FOR_sse2_movntidi:
        case CODE_FOR_sse_movntq:
        case CODE_FOR_sse2_movntisi:
        case CODE_FOR_avx512f_movntv16sf:
        case CODE_FOR_avx512f_movntv8df:
        case CODE_FOR_avx512f_movntv8di:
          aligned_mem = true;
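
          /* Illustrative note: the non-temporal stores above require a
             naturally aligned destination, e.g. _mm256_stream_ps needs
             a 32-byte aligned pointer, so aligned_mem lets the MEM be
             given the full mode alignment later on.  */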
    case VOID_FTYPE_PVOID_PCVOID:

    case V4SF_FTYPE_V4SF_PCV2SF:
    case V2DF_FTYPE_V2DF_PCDOUBLE:

    case V8SF_FTYPE_PCV8SF_V8SI:
    case V4DF_FTYPE_PCV4DF_V4DI:
    case V4SF_FTYPE_PCV4SF_V4SI:
    case V2DF_FTYPE_PCV2DF_V2DI:
    case V8SI_FTYPE_PCV8SI_V8SI:
    case V4DI_FTYPE_PCV4DI_V4DI:
    case V4SI_FTYPE_PCV4SI_V4SI:
    case V2DI_FTYPE_PCV2DI_V2DI:
    case VOID_FTYPE_INT_INT64:

    case VOID_FTYPE_PV8DF_V8DF_UQI:
    case VOID_FTYPE_PV4DF_V4DF_UQI:
    case VOID_FTYPE_PV2DF_V2DF_UQI:
    case VOID_FTYPE_PV16SF_V16SF_UHI:
    case VOID_FTYPE_PV8SF_V8SF_UQI:
    case VOID_FTYPE_PV4SF_V4SF_UQI:
    case VOID_FTYPE_PV8DI_V8DI_UQI:
    case VOID_FTYPE_PV4DI_V4DI_UQI:
    case VOID_FTYPE_PV2DI_V2DI_UQI:
    case VOID_FTYPE_PV16SI_V16SI_UHI:
    case VOID_FTYPE_PV8SI_V8SI_UQI:
    case VOID_FTYPE_PV4SI_V4SI_UQI:
    case VOID_FTYPE_PV64QI_V64QI_UDI:
    case VOID_FTYPE_PV32HI_V32HI_USI:
    case VOID_FTYPE_PV32QI_V32QI_USI:
    case VOID_FTYPE_PV16QI_V16QI_UHI:
    case VOID_FTYPE_PV16HI_V16HI_UHI:
    case VOID_FTYPE_PV8HI_V8HI_UQI:

        /* These builtins and instructions require the memory
           to be properly aligned.  */
        case CODE_FOR_avx512f_storev16sf_mask:
        case CODE_FOR_avx512f_storev16si_mask:
        case CODE_FOR_avx512f_storev8df_mask:
        case CODE_FOR_avx512f_storev8di_mask:
        case CODE_FOR_avx512vl_storev8sf_mask:
        case CODE_FOR_avx512vl_storev8si_mask:
        case CODE_FOR_avx512vl_storev4df_mask:
        case CODE_FOR_avx512vl_storev4di_mask:
        case CODE_FOR_avx512vl_storev4sf_mask:
        case CODE_FOR_avx512vl_storev4si_mask:
        case CODE_FOR_avx512vl_storev2df_mask:
        case CODE_FOR_avx512vl_storev2di_mask:
          aligned_mem = true;
    case VOID_FTYPE_PV8SF_V8SI_V8SF:
    case VOID_FTYPE_PV4DF_V4DI_V4DF:
    case VOID_FTYPE_PV4SF_V4SI_V4SF:
    case VOID_FTYPE_PV2DF_V2DI_V2DF:
    case VOID_FTYPE_PV8SI_V8SI_V8SI:
    case VOID_FTYPE_PV4DI_V4DI_V4DI:
    case VOID_FTYPE_PV4SI_V4SI_V4SI:
    case VOID_FTYPE_PV2DI_V2DI_V2DI:
    case VOID_FTYPE_PV8SI_V8DI_UQI:
    case VOID_FTYPE_PV8HI_V8DI_UQI:
    case VOID_FTYPE_PV16HI_V16SI_UHI:
    case VOID_FTYPE_PV16QI_V8DI_UQI:
    case VOID_FTYPE_PV16QI_V16SI_UHI:
    case VOID_FTYPE_PV4SI_V4DI_UQI:
    case VOID_FTYPE_PV4SI_V2DI_UQI:
    case VOID_FTYPE_PV8HI_V4DI_UQI:
    case VOID_FTYPE_PV8HI_V2DI_UQI:
    case VOID_FTYPE_PV8HI_V8SI_UQI:
    case VOID_FTYPE_PV8HI_V4SI_UQI:
    case VOID_FTYPE_PV16QI_V4DI_UQI:
    case VOID_FTYPE_PV16QI_V2DI_UQI:
    case VOID_FTYPE_PV16QI_V8SI_UQI:
    case VOID_FTYPE_PV16QI_V4SI_UQI:
    case VOID_FTYPE_PCHAR_V64QI_UDI:
    case VOID_FTYPE_PCHAR_V32QI_USI:
    case VOID_FTYPE_PCHAR_V16QI_UHI:
    case VOID_FTYPE_PSHORT_V32HI_USI:
    case VOID_FTYPE_PSHORT_V16HI_UHI:
    case VOID_FTYPE_PSHORT_V8HI_UQI:
    case VOID_FTYPE_PINT_V16SI_UHI:
    case VOID_FTYPE_PINT_V8SI_UQI:
    case VOID_FTYPE_PINT_V4SI_UQI:
    case VOID_FTYPE_PINT64_V8DI_UQI:
    case VOID_FTYPE_PINT64_V4DI_UQI:
    case VOID_FTYPE_PINT64_V2DI_UQI:
    case VOID_FTYPE_PDOUBLE_V8DF_UQI:
    case VOID_FTYPE_PDOUBLE_V4DF_UQI:
    case VOID_FTYPE_PDOUBLE_V2DF_UQI:
    case VOID_FTYPE_PFLOAT_V16SF_UHI:
    case VOID_FTYPE_PFLOAT_V8SF_UQI:
    case VOID_FTYPE_PFLOAT_V4SF_UQI:
    case VOID_FTYPE_PV32QI_V32HI_USI:
    case VOID_FTYPE_PV16QI_V16HI_UHI:
    case VOID_FTYPE_PV8QI_V8HI_UQI:

      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (args);
    case V4SF_FTYPE_PCV4SF_V4SF_UQI:
    case V8SF_FTYPE_PCV8SF_V8SF_UQI:
    case V16SF_FTYPE_PCV16SF_V16SF_UHI:
    case V4SI_FTYPE_PCV4SI_V4SI_UQI:
    case V8SI_FTYPE_PCV8SI_V8SI_UQI:
    case V16SI_FTYPE_PCV16SI_V16SI_UHI:
    case V2DF_FTYPE_PCV2DF_V2DF_UQI:
    case V4DF_FTYPE_PCV4DF_V4DF_UQI:
    case V8DF_FTYPE_PCV8DF_V8DF_UQI:
    case V2DI_FTYPE_PCV2DI_V2DI_UQI:
    case V4DI_FTYPE_PCV4DI_V4DI_UQI:
    case V8DI_FTYPE_PCV8DI_V8DI_UQI:
    case V64QI_FTYPE_PCV64QI_V64QI_UDI:
    case V32HI_FTYPE_PCV32HI_V32HI_USI:
    case V32QI_FTYPE_PCV32QI_V32QI_USI:
    case V16QI_FTYPE_PCV16QI_V16QI_UHI:
    case V16HI_FTYPE_PCV16HI_V16HI_UHI:
    case V8HI_FTYPE_PCV8HI_V8HI_UQI:

        /* These builtins and instructions require the memory
           to be properly aligned.  */
        case CODE_FOR_avx512f_loadv16sf_mask:
        case CODE_FOR_avx512f_loadv16si_mask:
        case CODE_FOR_avx512f_loadv8df_mask:
        case CODE_FOR_avx512f_loadv8di_mask:
        case CODE_FOR_avx512vl_loadv8sf_mask:
        case CODE_FOR_avx512vl_loadv8si_mask:
        case CODE_FOR_avx512vl_loadv4df_mask:
        case CODE_FOR_avx512vl_loadv4di_mask:
        case CODE_FOR_avx512vl_loadv4sf_mask:
        case CODE_FOR_avx512vl_loadv4si_mask:
        case CODE_FOR_avx512vl_loadv2df_mask:
        case CODE_FOR_avx512vl_loadv2di_mask:
        case CODE_FOR_avx512bw_loadv64qi_mask:
        case CODE_FOR_avx512vl_loadv32qi_mask:
        case CODE_FOR_avx512vl_loadv16qi_mask:
        case CODE_FOR_avx512bw_loadv32hi_mask:
        case CODE_FOR_avx512vl_loadv16hi_mask:
        case CODE_FOR_avx512vl_loadv8hi_mask:
          aligned_mem = true;
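
          /* Illustrative note: these are the masked aligned loads,
             e.g. _mm512_mask_load_ps, as opposed to the unaligned
             _mm512_mask_loadu_ps forms handled in the next group, which
             is why aligned_mem is set only here.  */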
    case V64QI_FTYPE_PCCHAR_V64QI_UDI:
    case V32QI_FTYPE_PCCHAR_V32QI_USI:
    case V16QI_FTYPE_PCCHAR_V16QI_UHI:
    case V32HI_FTYPE_PCSHORT_V32HI_USI:
    case V16HI_FTYPE_PCSHORT_V16HI_UHI:
    case V8HI_FTYPE_PCSHORT_V8HI_UQI:
    case V16SI_FTYPE_PCINT_V16SI_UHI:
    case V8SI_FTYPE_PCINT_V8SI_UQI:
    case V4SI_FTYPE_PCINT_V4SI_UQI:
    case V8DI_FTYPE_PCINT64_V8DI_UQI:
    case V4DI_FTYPE_PCINT64_V4DI_UQI:
    case V2DI_FTYPE_PCINT64_V2DI_UQI:
    case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
    case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
    case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
    case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
    case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
    case V4SF_FTYPE_PCFLOAT_V4SF_UQI:

    case VOID_FTYPE_UINT_UINT_UINT:
    case VOID_FTYPE_UINT64_UINT_UINT:
    case UCHAR_FTYPE_UINT_UINT_UINT:
    case UCHAR_FTYPE_UINT64_UINT_UINT:

      memory = ARRAY_SIZE (args);
      last_arg_constant = true;

      gcc_unreachable ();
    }
  gcc_assert (nargs <= ARRAY_SIZE (args));

  if (klass == store)
    {
      arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
      gcc_assert (target == 0);
      if (memory)
        {
          op = ix86_zero_extend_to_Pmode (op);
          target = gen_rtx_MEM (tmode, op);
          /* target at this point has just BITS_PER_UNIT MEM_ALIGN
             on it.  Try to improve it using get_pointer_alignment,
             and if the special builtin is one that requires strict
             mode alignment, also from its GET_MODE_ALIGNMENT.
             Failure to do so could lead to ix86_legitimate_combined_insn
             rejecting all changes to such insns.  */
          unsigned int align = get_pointer_alignment (arg);
          if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
            align = GET_MODE_ALIGNMENT (tmode);
          if (MEM_ALIGN (target) < align)
            set_mem_align (target, align);
        }
      else
        target = force_reg (tmode, op);
    }
  else
    {
      if (optimize
          || target == 0
          || !register_operand (target, tmode)
          || GET_MODE (target) != tmode)
        target = gen_reg_rtx (tmode);
    }
  for (i = 0; i < nargs; i++)
    {
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match;

      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = expand_normal (arg);
      match = insn_p->operand[i + 1].predicate (op, mode);

      if (last_arg_constant && (i + 1) == nargs)
        {
          if (!match)
            {
              if (icode == CODE_FOR_lwp_lwpvalsi3
                  || icode == CODE_FOR_lwp_lwpinssi3
                  || icode == CODE_FOR_lwp_lwpvaldi3
                  || icode == CODE_FOR_lwp_lwpinsdi3)
                error ("the last argument must be a 32-bit immediate");
              else
                error ("the last argument must be an 8-bit immediate");
            }
        }
          /* This must be the memory operand.  */
          op = ix86_zero_extend_to_Pmode (op);
          op = gen_rtx_MEM (mode, op);
          /* op at this point has just BITS_PER_UNIT MEM_ALIGN
             on it.  Try to improve it using get_pointer_alignment,
             and if the special builtin is one that requires strict
             mode alignment, also from its GET_MODE_ALIGNMENT.
             Failure to do so could lead to ix86_legitimate_combined_insn
             rejecting all changes to such insns.  */
          unsigned int align = get_pointer_alignment (arg);
          if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
            align = GET_MODE_ALIGNMENT (mode);
          if (MEM_ALIGN (op) < align)
            set_mem_align (op, align);
          /* This must be a register.  */
          if (VECTOR_MODE_P (mode))
            op = safe_vector_operand (op, mode);

          op = fixup_modeless_constant (op, mode);

          if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            op = copy_to_mode_reg (mode, op);
          else
            {
              op = copy_to_reg (op);
              op = lowpart_subreg (mode, op, GET_MODE (op));
            }

      args[i].mode = mode;
    }
      pat = GEN_FCN (icode) (target);

      pat = GEN_FCN (icode) (target, args[0].op);

      pat = GEN_FCN (icode) (target, args[0].op, args[1].op);

      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);

      gcc_unreachable ();

  return klass == store ? 0 : target;
}
/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.  */

get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!tree_fits_uhwi_p (arg)
      || (elt = tree_to_uhwi (arg), elt > max))
    error ("selector must be an integer constant in the range "
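
/* Illustrative note: for a V4SF vector type, max is 3, so a selector
   of 5 passed to one of the vec_ext/vec_set builtins below is
   diagnosed here rather than silently truncated.  */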
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that give us license to emit
   these sorts of instructions.  */

ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  machine_mode tmode = TYPE_MODE (type);
  machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   had a language-level syntax for referencing vector elements.  */

ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */

ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);

  return target;
}
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
                     machine_mode mode, int ignore)
{
  enum insn_code icode, icode2;
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  tree arg0, arg1, arg2, arg3, arg4;
  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
  machine_mode mode0, mode1, mode2, mode3, mode4;
  unsigned int fcode = DECL_FUNCTION_CODE (fndecl);

  /* For CPU builtins that can be folded, fold first and expand the fold.  */
  switch (fcode)
    {
    case IX86_BUILTIN_CPU_INIT:
      {
        /* Make it call __cpu_indicator_init in libgcc.  */
        tree call_expr, fndecl, type;
        type = build_function_type_list (integer_type_node, NULL_TREE);
        fndecl = build_fn_decl ("__cpu_indicator_init", type);
        call_expr = build_call_expr (fndecl, 0);
        return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
      }
    case IX86_BUILTIN_CPU_IS:
    case IX86_BUILTIN_CPU_SUPPORTS:
      {
        tree arg0 = CALL_EXPR_ARG (exp, 0);
        tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
        gcc_assert (fold_expr != NULL_TREE);
        return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
      }
    }
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;

  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.  */
  if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
       == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
      && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
    isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);

  if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
       == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
      && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
    isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);

  if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
       == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
      && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
    isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);

  /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
     MMX is disabled.  NB: Since MMX intrinsics are marked with
     SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
     enabled.  */
  if (TARGET_MMX || TARGET_MMX_WITH_SSE)
    {
      if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
	   == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
	  && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX)) != 0)
	isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX);

      if (((bisa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
	   == (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
	  && (isa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX)) != 0)
	isa |= (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX);

      if (((bisa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
	   == (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
	  && (isa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX)) != 0)
	isa |= (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX);
    }

  if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
    {
      bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
      if (TARGET_ABI_X32)
	bisa |= OPTION_MASK_ABI_X32;
      else
	bisa |= OPTION_MASK_ABI_64;
      char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
				       (enum fpmath_unit) 0, false, add_abi_p);
      if (!opts)
	error ("%qE needs unknown isa option", fndecl);
      else
	{
	  gcc_assert (opts != NULL);
	  error ("%qE needs isa option %s", fndecl, opts);
	  free (opts);
	}
      return expand_call (exp, target, ignore);
    }

  switch (fcode)
    {
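  /* Worked example (added commentary): a builtin whose descriptor carries
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 in bisa is accepted when
     only one of the two ISAs is enabled on the command line: the code above
     widens isa by the whole pair, so (bisa & isa) == bisa still holds and
     the "needs isa option" error below is not triggered.  */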
    case IX86_BUILTIN_MASKMOVQ:
    case IX86_BUILTIN_MASKMOVDQU:
      icode = (fcode == IX86_BUILTIN_MASKMOVQ
	       ? CODE_FOR_mmx_maskmovq
	       : CODE_FOR_sse2_maskmovdqu);
      /* Note the arg order is different from the operand order.  */
      arg1 = CALL_EXPR_ARG (exp, 0);
      arg2 = CALL_EXPR_ARG (exp, 1);
      arg0 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;

      op0 = ix86_zero_extend_to_Pmode (op0);
      op0 = gen_rtx_MEM (mode1, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);
      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      pat = GEN_FCN (icode) (op0, op1, op2);
      if (!pat)
	return 0;
      emit_insn (pat);
      return 0;
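      /* Added commentary (illustrative): for _mm_maskmoveu_si128 (data,
	 mask, addr) the call arguments arrive as (data, mask, addr), while
	 the maskmov insn pattern orders its operands as (mem, data, mask);
	 that is why arg0/arg1/arg2 are pulled from call positions 2/0/1
	 above.  */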
    case IX86_BUILTIN_LDMXCSR:
      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_move_insn (target, op0);
      emit_insn (gen_sse_ldmxcsr (target));
      return 0;

    case IX86_BUILTIN_STMXCSR:
      target = assign_386_stack_local (SImode, SLOT_TEMP);
      emit_insn (gen_sse_stmxcsr (target));
      return copy_to_mode_reg (SImode, target);

    case IX86_BUILTIN_CLFLUSH:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_sse2_clflush;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_sse2_clflush (op0));
      return 0;

    case IX86_BUILTIN_CLWB:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clwb;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clwb (op0));
      return 0;

    case IX86_BUILTIN_CLFLUSHOPT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_clflushopt;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_clflushopt (op0));
      return 0;
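      /* Added commentary: ldmxcsr/stmxcsr only accept memory operands,
	 which is why the MXCSR value is bounced through a stack slot
	 (assign_386_stack_local) above rather than used directly; e.g.
	 _mm_setcsr (x) becomes roughly "store x to slot; ldmxcsr slot".  */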
    case IX86_BUILTIN_MONITOR:
    case IX86_BUILTIN_MONITORX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);

      emit_insn (fcode == IX86_BUILTIN_MONITOR
		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
		 : gen_monitorx (Pmode, op0, op1, op2));
      return 0;

    case IX86_BUILTIN_MWAIT:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      emit_insn (gen_sse3_mwait (op0, op1));
      return 0;

    case IX86_BUILTIN_MWAITX:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);
      if (!REG_P (op1))
	op1 = copy_to_mode_reg (SImode, op1);
      if (!REG_P (op2))
	op2 = copy_to_mode_reg (SImode, op2);
      emit_insn (gen_mwaitx (op0, op1, op2));
      return 0;

    case IX86_BUILTIN_UMONITOR:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_umonitor (Pmode, op0));
      return 0;
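      /* Added commentary: for monitor/monitorx the address argument is
	 zero-extended to Pmode (it ends up in %rax/%eax), while the
	 extension and hint arguments stay SImode (%ecx and %edx), matching
	 the hardware operands of the MONITOR instruction.  */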
    case IX86_BUILTIN_UMWAIT:
    case IX86_BUILTIN_TPAUSE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      if (!REG_P (op0))
	op0 = copy_to_mode_reg (SImode, op0);

      op1 = force_reg (DImode, op1);

      if (TARGET_64BIT)
	{
	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
				     NULL, 1, OPTAB_DIRECT);
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait_rex64;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause_rex64;
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  op2 = gen_lowpart (SImode, op2);
	  op1 = gen_lowpart (SImode, op1);
	  pat = GEN_FCN (icode) (op0, op1, op2);
	}
      else
	{
	  switch (fcode)
	    {
	    case IX86_BUILTIN_UMWAIT:
	      icode = CODE_FOR_umwait;
	      break;
	    case IX86_BUILTIN_TPAUSE:
	      icode = CODE_FOR_tpause;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  pat = GEN_FCN (icode) (op0, op1);
	}

      if (!pat)
	return 0;

      emit_insn (pat);

      if (target == 0
	  || !register_operand (target, QImode))
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
			const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));
      return target;
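      /* Added commentary: umwait/tpause take a 64-bit deadline in EDX:EAX.
	 In 64-bit mode the DImode value is split above roughly as

	    edx = (unsigned int) (op1 >> 32);   eax = (unsigned int) op1;

	 via expand_simple_binop (LSHIFTRT) and gen_lowpart, and the carry
	 flag set by the instruction is read back with setcc below.  */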
    case IX86_BUILTIN_CLZERO:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!REG_P (op0))
	op0 = ix86_zero_extend_to_Pmode (op0);
      emit_insn (gen_clzero (Pmode, op0));
      return 0;

    case IX86_BUILTIN_CLDEMOTE:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = CODE_FOR_cldemote;
      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = ix86_zero_extend_to_Pmode (op0);

      emit_insn (gen_cldemote (op0));
      return 0;
    case IX86_BUILTIN_VEC_INIT_V2SI:
    case IX86_BUILTIN_VEC_INIT_V4HI:
    case IX86_BUILTIN_VEC_INIT_V8QI:
      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);

    case IX86_BUILTIN_VEC_EXT_V2DF:
    case IX86_BUILTIN_VEC_EXT_V2DI:
    case IX86_BUILTIN_VEC_EXT_V4SF:
    case IX86_BUILTIN_VEC_EXT_V4SI:
    case IX86_BUILTIN_VEC_EXT_V8HI:
    case IX86_BUILTIN_VEC_EXT_V2SI:
    case IX86_BUILTIN_VEC_EXT_V4HI:
    case IX86_BUILTIN_VEC_EXT_V16QI:
      return ix86_expand_vec_ext_builtin (exp, target);

    case IX86_BUILTIN_VEC_SET_V2DI:
    case IX86_BUILTIN_VEC_SET_V4SF:
    case IX86_BUILTIN_VEC_SET_V4SI:
    case IX86_BUILTIN_VEC_SET_V8HI:
    case IX86_BUILTIN_VEC_SET_V4HI:
    case IX86_BUILTIN_VEC_SET_V16QI:
      return ix86_expand_vec_set_builtin (exp);

    case IX86_BUILTIN_NANQ:
    case IX86_BUILTIN_NANSQ:
      return expand_call (exp, target, ignore);
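      /* Added commentary: __builtin_nanq/__builtin_nansq have no insn
	 pattern of their own; when they were not already folded to a
	 constant earlier, expand_call simply emits them as an ordinary
	 call.  */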
    case IX86_BUILTIN_RDPID:

      op0 = gen_reg_rtx (word_mode);

      if (TARGET_64BIT)
	{
	  insn = gen_rdpid_rex64 (op0);
	  op0 = convert_to_mode (SImode, op0, 1);
	}
      else
	insn = gen_rdpid (op0);

      emit_insn (insn);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_move_insn (target, op0);
      return target;
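      /* Added commentary: rdpid writes a full 64-bit register in 64-bit
	 mode even though the builtin returns unsigned int, so the value is
	 produced in word_mode and then truncated to SImode with
	 convert_to_mode above.  */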
11328 case IX86_BUILTIN_2INTERSECTD512
:
11329 case IX86_BUILTIN_2INTERSECTQ512
:
11330 case IX86_BUILTIN_2INTERSECTD256
:
11331 case IX86_BUILTIN_2INTERSECTQ256
:
11332 case IX86_BUILTIN_2INTERSECTD128
:
11333 case IX86_BUILTIN_2INTERSECTQ128
:
11334 arg0
= CALL_EXPR_ARG (exp
, 0);
11335 arg1
= CALL_EXPR_ARG (exp
, 1);
11336 arg2
= CALL_EXPR_ARG (exp
, 2);
11337 arg3
= CALL_EXPR_ARG (exp
, 3);
11338 op0
= expand_normal (arg0
);
11339 op1
= expand_normal (arg1
);
11340 op2
= expand_normal (arg2
);
11341 op3
= expand_normal (arg3
);
11343 if (!address_operand (op0
, VOIDmode
))
11345 op0
= convert_memory_address (Pmode
, op0
);
11346 op0
= copy_addr_to_reg (op0
);
11348 if (!address_operand (op1
, VOIDmode
))
11350 op1
= convert_memory_address (Pmode
, op1
);
11351 op1
= copy_addr_to_reg (op1
);
11356 case IX86_BUILTIN_2INTERSECTD512
:
11358 icode
= CODE_FOR_avx512vp2intersect_2intersectv16si
;
11360 case IX86_BUILTIN_2INTERSECTQ512
:
11362 icode
= CODE_FOR_avx512vp2intersect_2intersectv8di
;
11364 case IX86_BUILTIN_2INTERSECTD256
:
11366 icode
= CODE_FOR_avx512vp2intersect_2intersectv8si
;
11368 case IX86_BUILTIN_2INTERSECTQ256
:
11370 icode
= CODE_FOR_avx512vp2intersect_2intersectv4di
;
11372 case IX86_BUILTIN_2INTERSECTD128
:
11374 icode
= CODE_FOR_avx512vp2intersect_2intersectv4si
;
11376 case IX86_BUILTIN_2INTERSECTQ128
:
11378 icode
= CODE_FOR_avx512vp2intersect_2intersectv2di
;
11381 gcc_unreachable ();
11384 mode2
= insn_data
[icode
].operand
[1].mode
;
11385 mode3
= insn_data
[icode
].operand
[2].mode
;
11386 if (!insn_data
[icode
].operand
[1].predicate (op2
, mode2
))
11387 op2
= copy_to_mode_reg (mode2
, op2
);
11388 if (!insn_data
[icode
].operand
[2].predicate (op3
, mode3
))
11389 op3
= copy_to_mode_reg (mode3
, op3
);
11391 op4
= gen_reg_rtx (mode4
);
11392 emit_insn (GEN_FCN (icode
) (op4
, op2
, op3
));
11393 mode0
= mode4
== P2HImode
? HImode
: QImode
;
11394 emit_move_insn (gen_rtx_MEM (mode0
, op0
),
11395 gen_lowpart (mode0
, op4
));
11396 emit_move_insn (gen_rtx_MEM (mode0
, op1
),
11397 gen_highpart (mode0
, op4
));
    case IX86_BUILTIN_RDPMC:
    case IX86_BUILTIN_RDTSC:
    case IX86_BUILTIN_RDTSCP:
    case IX86_BUILTIN_XGETBV:

      op0 = gen_reg_rtx (DImode);
      op1 = gen_reg_rtx (DImode);

      if (fcode == IX86_BUILTIN_RDPMC)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_rdpmc_rex64 (op0, op1, op2)
		  : gen_rdpmc (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_XGETBV)
	{
	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op2 = expand_normal (arg0);
	  if (!register_operand (op2, SImode))
	    op2 = copy_to_mode_reg (SImode, op2);

	  insn = (TARGET_64BIT
		  ? gen_xgetbv_rex64 (op0, op1, op2)
		  : gen_xgetbv (op0, op2));
	  emit_insn (insn);
	}
      else if (fcode == IX86_BUILTIN_RDTSC)
	{
	  insn = (TARGET_64BIT
		  ? gen_rdtsc_rex64 (op0, op1)
		  : gen_rdtsc (op0));
	  emit_insn (insn);
	}
      else
	{
	  op2 = gen_reg_rtx (SImode);

	  insn = (TARGET_64BIT
		  ? gen_rdtscp_rex64 (op0, op1, op2)
		  : gen_rdtscp (op0, op2));
	  emit_insn (insn);

	  arg0 = CALL_EXPR_ARG (exp, 0);
	  op4 = expand_normal (arg0);
	  if (!address_operand (op4, VOIDmode))
	    {
	      op4 = convert_memory_address (Pmode, op4);
	      op4 = copy_addr_to_reg (op4);
	    }
	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
	}

      if (target == 0
	  || !register_operand (target, DImode))
	target = gen_reg_rtx (DImode);

      if (TARGET_64BIT)
	{
	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
				     op1, 1, OPTAB_DIRECT);
	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
				     op0, 1, OPTAB_DIRECT);
	}

      emit_move_insn (target, op0);
      return target;
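      /* Added commentary: these builtins return a 64-bit value whose halves
	 the hardware leaves in two registers in 64-bit mode, so the code
	 above recombines them essentially as

	    result = ((uint64_t) hi << 32) | lo;

	 using ASHIFT and IOR; in 32-bit mode the insn already produces the
	 DImode register pair directly.  */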
11474 case IX86_BUILTIN_ENQCMD
:
11475 case IX86_BUILTIN_ENQCMDS
:
11476 case IX86_BUILTIN_MOVDIR64B
:
11478 arg0
= CALL_EXPR_ARG (exp
, 0);
11479 arg1
= CALL_EXPR_ARG (exp
, 1);
11480 op0
= expand_normal (arg0
);
11481 op1
= expand_normal (arg1
);
11483 op0
= ix86_zero_extend_to_Pmode (op0
);
11484 if (!address_operand (op1
, VOIDmode
))
11486 op1
= convert_memory_address (Pmode
, op1
);
11487 op1
= copy_addr_to_reg (op1
);
11489 op1
= gen_rtx_MEM (XImode
, op1
);
11491 if (fcode
== IX86_BUILTIN_MOVDIR64B
)
11493 emit_insn (gen_movdir64b (Pmode
, op0
, op1
));
11500 target
= gen_reg_rtx (SImode
);
11501 emit_move_insn (target
, const0_rtx
);
11502 target
= gen_rtx_SUBREG (QImode
, target
, 0);
11504 if (fcode
== IX86_BUILTIN_ENQCMD
)
11505 pat
= gen_enqcmd (UNSPECV_ENQCMD
, Pmode
, op0
, op1
);
11507 pat
= gen_enqcmd (UNSPECV_ENQCMDS
, Pmode
, op0
, op1
);
11511 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
11512 gen_rtx_fmt_ee (EQ
, QImode
,
11516 return SUBREG_REG (target
);
11519 case IX86_BUILTIN_FXSAVE
:
11520 case IX86_BUILTIN_FXRSTOR
:
11521 case IX86_BUILTIN_FXSAVE64
:
11522 case IX86_BUILTIN_FXRSTOR64
:
11523 case IX86_BUILTIN_FNSTENV
:
11524 case IX86_BUILTIN_FLDENV
:
11528 case IX86_BUILTIN_FXSAVE
:
11529 icode
= CODE_FOR_fxsave
;
11531 case IX86_BUILTIN_FXRSTOR
:
11532 icode
= CODE_FOR_fxrstor
;
11534 case IX86_BUILTIN_FXSAVE64
:
11535 icode
= CODE_FOR_fxsave64
;
11537 case IX86_BUILTIN_FXRSTOR64
:
11538 icode
= CODE_FOR_fxrstor64
;
11540 case IX86_BUILTIN_FNSTENV
:
11541 icode
= CODE_FOR_fnstenv
;
11543 case IX86_BUILTIN_FLDENV
:
11544 icode
= CODE_FOR_fldenv
;
11547 gcc_unreachable ();
11550 arg0
= CALL_EXPR_ARG (exp
, 0);
11551 op0
= expand_normal (arg0
);
11553 if (!address_operand (op0
, VOIDmode
))
11555 op0
= convert_memory_address (Pmode
, op0
);
11556 op0
= copy_addr_to_reg (op0
);
11558 op0
= gen_rtx_MEM (mode0
, op0
);
11560 pat
= GEN_FCN (icode
) (op0
);
11565 case IX86_BUILTIN_XSETBV
:
11566 arg0
= CALL_EXPR_ARG (exp
, 0);
11567 arg1
= CALL_EXPR_ARG (exp
, 1);
11568 op0
= expand_normal (arg0
);
11569 op1
= expand_normal (arg1
);
11572 op0
= copy_to_mode_reg (SImode
, op0
);
11574 op1
= force_reg (DImode
, op1
);
11578 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11579 NULL
, 1, OPTAB_DIRECT
);
11581 icode
= CODE_FOR_xsetbv_rex64
;
11583 op2
= gen_lowpart (SImode
, op2
);
11584 op1
= gen_lowpart (SImode
, op1
);
11585 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11589 icode
= CODE_FOR_xsetbv
;
11591 pat
= GEN_FCN (icode
) (op0
, op1
);
11597 case IX86_BUILTIN_XSAVE
:
11598 case IX86_BUILTIN_XRSTOR
:
11599 case IX86_BUILTIN_XSAVE64
:
11600 case IX86_BUILTIN_XRSTOR64
:
11601 case IX86_BUILTIN_XSAVEOPT
:
11602 case IX86_BUILTIN_XSAVEOPT64
:
11603 case IX86_BUILTIN_XSAVES
:
11604 case IX86_BUILTIN_XRSTORS
:
11605 case IX86_BUILTIN_XSAVES64
:
11606 case IX86_BUILTIN_XRSTORS64
:
11607 case IX86_BUILTIN_XSAVEC
:
11608 case IX86_BUILTIN_XSAVEC64
:
11609 arg0
= CALL_EXPR_ARG (exp
, 0);
11610 arg1
= CALL_EXPR_ARG (exp
, 1);
11611 op0
= expand_normal (arg0
);
11612 op1
= expand_normal (arg1
);
11614 if (!address_operand (op0
, VOIDmode
))
11616 op0
= convert_memory_address (Pmode
, op0
);
11617 op0
= copy_addr_to_reg (op0
);
11619 op0
= gen_rtx_MEM (BLKmode
, op0
);
11621 op1
= force_reg (DImode
, op1
);
11625 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
11626 NULL
, 1, OPTAB_DIRECT
);
11629 case IX86_BUILTIN_XSAVE
:
11630 icode
= CODE_FOR_xsave_rex64
;
11632 case IX86_BUILTIN_XRSTOR
:
11633 icode
= CODE_FOR_xrstor_rex64
;
11635 case IX86_BUILTIN_XSAVE64
:
11636 icode
= CODE_FOR_xsave64
;
11638 case IX86_BUILTIN_XRSTOR64
:
11639 icode
= CODE_FOR_xrstor64
;
11641 case IX86_BUILTIN_XSAVEOPT
:
11642 icode
= CODE_FOR_xsaveopt_rex64
;
11644 case IX86_BUILTIN_XSAVEOPT64
:
11645 icode
= CODE_FOR_xsaveopt64
;
11647 case IX86_BUILTIN_XSAVES
:
11648 icode
= CODE_FOR_xsaves_rex64
;
11650 case IX86_BUILTIN_XRSTORS
:
11651 icode
= CODE_FOR_xrstors_rex64
;
11653 case IX86_BUILTIN_XSAVES64
:
11654 icode
= CODE_FOR_xsaves64
;
11656 case IX86_BUILTIN_XRSTORS64
:
11657 icode
= CODE_FOR_xrstors64
;
11659 case IX86_BUILTIN_XSAVEC
:
11660 icode
= CODE_FOR_xsavec_rex64
;
11662 case IX86_BUILTIN_XSAVEC64
:
11663 icode
= CODE_FOR_xsavec64
;
11666 gcc_unreachable ();
11669 op2
= gen_lowpart (SImode
, op2
);
11670 op1
= gen_lowpart (SImode
, op1
);
11671 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
11677 case IX86_BUILTIN_XSAVE
:
11678 icode
= CODE_FOR_xsave
;
11680 case IX86_BUILTIN_XRSTOR
:
11681 icode
= CODE_FOR_xrstor
;
11683 case IX86_BUILTIN_XSAVEOPT
:
11684 icode
= CODE_FOR_xsaveopt
;
11686 case IX86_BUILTIN_XSAVES
:
11687 icode
= CODE_FOR_xsaves
;
11689 case IX86_BUILTIN_XRSTORS
:
11690 icode
= CODE_FOR_xrstors
;
11692 case IX86_BUILTIN_XSAVEC
:
11693 icode
= CODE_FOR_xsavec
;
11696 gcc_unreachable ();
11698 pat
= GEN_FCN (icode
) (op0
, op1
);
11705 case IX86_BUILTIN_LLWPCB
:
11706 arg0
= CALL_EXPR_ARG (exp
, 0);
11707 op0
= expand_normal (arg0
);
11708 icode
= CODE_FOR_lwp_llwpcb
;
11709 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
11710 op0
= ix86_zero_extend_to_Pmode (op0
);
11711 emit_insn (gen_lwp_llwpcb (op0
));
11714 case IX86_BUILTIN_SLWPCB
:
11715 icode
= CODE_FOR_lwp_slwpcb
;
11717 || !insn_data
[icode
].operand
[0].predicate (target
, Pmode
))
11718 target
= gen_reg_rtx (Pmode
);
11719 emit_insn (gen_lwp_slwpcb (target
));
11722 case IX86_BUILTIN_BEXTRI32
:
11723 case IX86_BUILTIN_BEXTRI64
:
11724 arg0
= CALL_EXPR_ARG (exp
, 0);
11725 arg1
= CALL_EXPR_ARG (exp
, 1);
11726 op0
= expand_normal (arg0
);
11727 op1
= expand_normal (arg1
);
11728 icode
= (fcode
== IX86_BUILTIN_BEXTRI32
11729 ? CODE_FOR_tbm_bextri_si
11730 : CODE_FOR_tbm_bextri_di
);
11731 if (!CONST_INT_P (op1
))
11733 error ("last argument must be an immediate");
11738 unsigned char length
= (INTVAL (op1
) >> 8) & 0xFF;
11739 unsigned char lsb_index
= INTVAL (op1
) & 0xFF;
11740 op1
= GEN_INT (length
);
11741 op2
= GEN_INT (lsb_index
);
11743 mode1
= insn_data
[icode
].operand
[1].mode
;
11744 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode1
))
11745 op0
= copy_to_mode_reg (mode1
, op0
);
11747 mode0
= insn_data
[icode
].operand
[0].mode
;
11749 || !register_operand (target
, mode0
))
11750 target
= gen_reg_rtx (mode0
);
11752 pat
= GEN_FCN (icode
) (target
, op0
, op1
, op2
);
11758 case IX86_BUILTIN_RDRAND16_STEP
:
11759 icode
= CODE_FOR_rdrandhi_1
;
11763 case IX86_BUILTIN_RDRAND32_STEP
:
11764 icode
= CODE_FOR_rdrandsi_1
;
11768 case IX86_BUILTIN_RDRAND64_STEP
:
11769 icode
= CODE_FOR_rdranddi_1
;
11773 arg0
= CALL_EXPR_ARG (exp
, 0);
11774 op1
= expand_normal (arg0
);
11775 if (!address_operand (op1
, VOIDmode
))
11777 op1
= convert_memory_address (Pmode
, op1
);
11778 op1
= copy_addr_to_reg (op1
);
11781 op0
= gen_reg_rtx (mode0
);
11782 emit_insn (GEN_FCN (icode
) (op0
));
11784 emit_move_insn (gen_rtx_MEM (mode0
, op1
), op0
);
11786 op1
= gen_reg_rtx (SImode
);
11787 emit_move_insn (op1
, CONST1_RTX (SImode
));
11789 /* Emit SImode conditional move. */
11790 if (mode0
== HImode
)
11792 if (TARGET_ZERO_EXTEND_WITH_AND
11793 && optimize_function_for_speed_p (cfun
))
11795 op2
= force_reg (SImode
, const0_rtx
);
11797 emit_insn (gen_movstricthi
11798 (gen_lowpart (HImode
, op2
), op0
));
11802 op2
= gen_reg_rtx (SImode
);
11804 emit_insn (gen_zero_extendhisi2 (op2
, op0
));
11807 else if (mode0
== SImode
)
11810 op2
= gen_rtx_SUBREG (SImode
, op0
, 0);
11813 || !register_operand (target
, SImode
))
11814 target
= gen_reg_rtx (SImode
);
11816 pat
= gen_rtx_GEU (VOIDmode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11818 emit_insn (gen_rtx_SET (target
,
11819 gen_rtx_IF_THEN_ELSE (SImode
, pat
, op2
, op1
)));
11822 case IX86_BUILTIN_RDSEED16_STEP
:
11823 icode
= CODE_FOR_rdseedhi_1
;
11827 case IX86_BUILTIN_RDSEED32_STEP
:
11828 icode
= CODE_FOR_rdseedsi_1
;
11832 case IX86_BUILTIN_RDSEED64_STEP
:
11833 icode
= CODE_FOR_rdseeddi_1
;
11837 arg0
= CALL_EXPR_ARG (exp
, 0);
11838 op1
= expand_normal (arg0
);
11839 if (!address_operand (op1
, VOIDmode
))
11841 op1
= convert_memory_address (Pmode
, op1
);
11842 op1
= copy_addr_to_reg (op1
);
11845 op0
= gen_reg_rtx (mode0
);
11846 emit_insn (GEN_FCN (icode
) (op0
));
11848 emit_move_insn (gen_rtx_MEM (mode0
, op1
), op0
);
11850 op2
= gen_reg_rtx (QImode
);
11852 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
11854 emit_insn (gen_rtx_SET (op2
, pat
));
11857 || !register_operand (target
, SImode
))
11858 target
= gen_reg_rtx (SImode
);
11860 emit_insn (gen_zero_extendqisi2 (target
, op2
));
11863 case IX86_BUILTIN_SBB32
:
11864 icode
= CODE_FOR_subborrowsi
;
11865 icode2
= CODE_FOR_subborrowsi_0
;
11871 case IX86_BUILTIN_SBB64
:
11872 icode
= CODE_FOR_subborrowdi
;
11873 icode2
= CODE_FOR_subborrowdi_0
;
11879 case IX86_BUILTIN_ADDCARRYX32
:
11880 icode
= CODE_FOR_addcarrysi
;
11881 icode2
= CODE_FOR_addcarrysi_0
;
11887 case IX86_BUILTIN_ADDCARRYX64
:
11888 icode
= CODE_FOR_addcarrydi
;
11889 icode2
= CODE_FOR_addcarrydi_0
;
11895 arg0
= CALL_EXPR_ARG (exp
, 0); /* unsigned char c_in. */
11896 arg1
= CALL_EXPR_ARG (exp
, 1); /* unsigned int src1. */
11897 arg2
= CALL_EXPR_ARG (exp
, 2); /* unsigned int src2. */
11898 arg3
= CALL_EXPR_ARG (exp
, 3); /* unsigned int *sum_out. */
11900 op1
= expand_normal (arg0
);
11901 if (!integer_zerop (arg0
))
11902 op1
= copy_to_mode_reg (QImode
, convert_to_mode (QImode
, op1
, 1));
11904 op2
= expand_normal (arg1
);
11905 if (!register_operand (op2
, mode0
))
11906 op2
= copy_to_mode_reg (mode0
, op2
);
11908 op3
= expand_normal (arg2
);
11909 if (!register_operand (op3
, mode0
))
11910 op3
= copy_to_mode_reg (mode0
, op3
);
11912 op4
= expand_normal (arg3
);
11913 if (!address_operand (op4
, VOIDmode
))
11915 op4
= convert_memory_address (Pmode
, op4
);
11916 op4
= copy_addr_to_reg (op4
);
11919 op0
= gen_reg_rtx (mode0
);
11920 if (integer_zerop (arg0
))
11922 /* If arg0 is 0, optimize right away into add or sub
11923 instruction that sets CCCmode flags. */
11924 op1
= gen_rtx_REG (mode2
, FLAGS_REG
);
11925 emit_insn (GEN_FCN (icode2
) (op0
, op2
, op3
));
11929 /* Generate CF from input operand. */
11930 emit_insn (gen_addqi3_cconly_overflow (op1
, constm1_rtx
));
11932 /* Generate instruction that consumes CF. */
11933 op1
= gen_rtx_REG (CCCmode
, FLAGS_REG
);
11934 pat
= gen_rtx_LTU (mode1
, op1
, const0_rtx
);
11935 pat2
= gen_rtx_LTU (mode0
, op1
, const0_rtx
);
11936 emit_insn (GEN_FCN (icode
) (op0
, op2
, op3
, op1
, pat
, pat2
));
11939 /* Return current CF value. */
11941 target
= gen_reg_rtx (QImode
);
11943 pat
= gen_rtx_LTU (QImode
, op1
, const0_rtx
);
11944 emit_insn (gen_rtx_SET (target
, pat
));
11946 /* Store the result. */
11947 emit_move_insn (gen_rtx_MEM (mode0
, op4
), op0
);
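      /* Added commentary: for _addcarry_u32 (c_in, a, b, &out) the carry-in
	 is first materialized in the flags with addqi3_cconly_overflow
	 (adding 0xff to c_in sets CF iff c_in != 0), the adc/sbb pattern
	 then consumes CF, the sum is stored through the pointer argument,
	 and the resulting CF is returned via setcc.  */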
    case IX86_BUILTIN_READ_FLAGS:
      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));

      if (optimize
	  || target == NULL_RTX
	  || !nonimmediate_operand (target, word_mode)
	  || GET_MODE (target) != word_mode)
	target = gen_reg_rtx (word_mode);

      emit_insn (gen_pop (target));
      return target;

    case IX86_BUILTIN_WRITE_FLAGS:

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      if (!general_no_elim_operand (op0, word_mode))
	op0 = copy_to_mode_reg (word_mode, op0);

      emit_insn (gen_push (op0));
      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
      return 0;
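      /* Added commentary: the read/write flags builtins have no arithmetic
	 pattern of their own; they are implemented above literally as
	 "pushf; pop reg" and "push reg; popf" on the flags register.  */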
11974 case IX86_BUILTIN_KTESTC8
:
11975 icode
= CODE_FOR_ktestqi
;
11979 case IX86_BUILTIN_KTESTZ8
:
11980 icode
= CODE_FOR_ktestqi
;
11984 case IX86_BUILTIN_KTESTC16
:
11985 icode
= CODE_FOR_ktesthi
;
11989 case IX86_BUILTIN_KTESTZ16
:
11990 icode
= CODE_FOR_ktesthi
;
11994 case IX86_BUILTIN_KTESTC32
:
11995 icode
= CODE_FOR_ktestsi
;
11999 case IX86_BUILTIN_KTESTZ32
:
12000 icode
= CODE_FOR_ktestsi
;
12004 case IX86_BUILTIN_KTESTC64
:
12005 icode
= CODE_FOR_ktestdi
;
12009 case IX86_BUILTIN_KTESTZ64
:
12010 icode
= CODE_FOR_ktestdi
;
12014 case IX86_BUILTIN_KORTESTC8
:
12015 icode
= CODE_FOR_kortestqi
;
12019 case IX86_BUILTIN_KORTESTZ8
:
12020 icode
= CODE_FOR_kortestqi
;
12024 case IX86_BUILTIN_KORTESTC16
:
12025 icode
= CODE_FOR_kortesthi
;
12029 case IX86_BUILTIN_KORTESTZ16
:
12030 icode
= CODE_FOR_kortesthi
;
12034 case IX86_BUILTIN_KORTESTC32
:
12035 icode
= CODE_FOR_kortestsi
;
12039 case IX86_BUILTIN_KORTESTZ32
:
12040 icode
= CODE_FOR_kortestsi
;
12044 case IX86_BUILTIN_KORTESTC64
:
12045 icode
= CODE_FOR_kortestdi
;
12049 case IX86_BUILTIN_KORTESTZ64
:
12050 icode
= CODE_FOR_kortestdi
;
12054 arg0
= CALL_EXPR_ARG (exp
, 0); /* Mask reg src1. */
12055 arg1
= CALL_EXPR_ARG (exp
, 1); /* Mask reg src2. */
12056 op0
= expand_normal (arg0
);
12057 op1
= expand_normal (arg1
);
12059 mode0
= insn_data
[icode
].operand
[0].mode
;
12060 mode1
= insn_data
[icode
].operand
[1].mode
;
12062 if (GET_MODE (op0
) != VOIDmode
)
12063 op0
= force_reg (GET_MODE (op0
), op0
);
12065 op0
= gen_lowpart (mode0
, op0
);
12067 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12068 op0
= copy_to_mode_reg (mode0
, op0
);
12070 if (GET_MODE (op1
) != VOIDmode
)
12071 op1
= force_reg (GET_MODE (op1
), op1
);
12073 op1
= gen_lowpart (mode1
, op1
);
12075 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12076 op1
= copy_to_mode_reg (mode1
, op1
);
12078 target
= gen_reg_rtx (QImode
);
12080 /* Emit kortest. */
12081 emit_insn (GEN_FCN (icode
) (op0
, op1
));
12082 /* And use setcc to return result from flags. */
12083 ix86_expand_setcc (target
, EQ
,
12084 gen_rtx_REG (mode3
, FLAGS_REG
), const0_rtx
);
12087 case IX86_BUILTIN_GATHERSIV2DF
:
12088 icode
= CODE_FOR_avx2_gathersiv2df
;
12090 case IX86_BUILTIN_GATHERSIV4DF
:
12091 icode
= CODE_FOR_avx2_gathersiv4df
;
12093 case IX86_BUILTIN_GATHERDIV2DF
:
12094 icode
= CODE_FOR_avx2_gatherdiv2df
;
12096 case IX86_BUILTIN_GATHERDIV4DF
:
12097 icode
= CODE_FOR_avx2_gatherdiv4df
;
12099 case IX86_BUILTIN_GATHERSIV4SF
:
12100 icode
= CODE_FOR_avx2_gathersiv4sf
;
12102 case IX86_BUILTIN_GATHERSIV8SF
:
12103 icode
= CODE_FOR_avx2_gathersiv8sf
;
12105 case IX86_BUILTIN_GATHERDIV4SF
:
12106 icode
= CODE_FOR_avx2_gatherdiv4sf
;
12108 case IX86_BUILTIN_GATHERDIV8SF
:
12109 icode
= CODE_FOR_avx2_gatherdiv8sf
;
12111 case IX86_BUILTIN_GATHERSIV2DI
:
12112 icode
= CODE_FOR_avx2_gathersiv2di
;
12114 case IX86_BUILTIN_GATHERSIV4DI
:
12115 icode
= CODE_FOR_avx2_gathersiv4di
;
12117 case IX86_BUILTIN_GATHERDIV2DI
:
12118 icode
= CODE_FOR_avx2_gatherdiv2di
;
12120 case IX86_BUILTIN_GATHERDIV4DI
:
12121 icode
= CODE_FOR_avx2_gatherdiv4di
;
12123 case IX86_BUILTIN_GATHERSIV4SI
:
12124 icode
= CODE_FOR_avx2_gathersiv4si
;
12126 case IX86_BUILTIN_GATHERSIV8SI
:
12127 icode
= CODE_FOR_avx2_gathersiv8si
;
12129 case IX86_BUILTIN_GATHERDIV4SI
:
12130 icode
= CODE_FOR_avx2_gatherdiv4si
;
12132 case IX86_BUILTIN_GATHERDIV8SI
:
12133 icode
= CODE_FOR_avx2_gatherdiv8si
;
12135 case IX86_BUILTIN_GATHERALTSIV4DF
:
12136 icode
= CODE_FOR_avx2_gathersiv4df
;
12138 case IX86_BUILTIN_GATHERALTDIV8SF
:
12139 icode
= CODE_FOR_avx2_gatherdiv8sf
;
12141 case IX86_BUILTIN_GATHERALTSIV4DI
:
12142 icode
= CODE_FOR_avx2_gathersiv4di
;
12144 case IX86_BUILTIN_GATHERALTDIV8SI
:
12145 icode
= CODE_FOR_avx2_gatherdiv8si
;
12147 case IX86_BUILTIN_GATHER3SIV16SF
:
12148 icode
= CODE_FOR_avx512f_gathersiv16sf
;
12150 case IX86_BUILTIN_GATHER3SIV8DF
:
12151 icode
= CODE_FOR_avx512f_gathersiv8df
;
12153 case IX86_BUILTIN_GATHER3DIV16SF
:
12154 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
12156 case IX86_BUILTIN_GATHER3DIV8DF
:
12157 icode
= CODE_FOR_avx512f_gatherdiv8df
;
12159 case IX86_BUILTIN_GATHER3SIV16SI
:
12160 icode
= CODE_FOR_avx512f_gathersiv16si
;
12162 case IX86_BUILTIN_GATHER3SIV8DI
:
12163 icode
= CODE_FOR_avx512f_gathersiv8di
;
12165 case IX86_BUILTIN_GATHER3DIV16SI
:
12166 icode
= CODE_FOR_avx512f_gatherdiv16si
;
12168 case IX86_BUILTIN_GATHER3DIV8DI
:
12169 icode
= CODE_FOR_avx512f_gatherdiv8di
;
12171 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12172 icode
= CODE_FOR_avx512f_gathersiv8df
;
12174 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12175 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
12177 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12178 icode
= CODE_FOR_avx512f_gathersiv8di
;
12180 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12181 icode
= CODE_FOR_avx512f_gatherdiv16si
;
12183 case IX86_BUILTIN_GATHER3SIV2DF
:
12184 icode
= CODE_FOR_avx512vl_gathersiv2df
;
12186 case IX86_BUILTIN_GATHER3SIV4DF
:
12187 icode
= CODE_FOR_avx512vl_gathersiv4df
;
12189 case IX86_BUILTIN_GATHER3DIV2DF
:
12190 icode
= CODE_FOR_avx512vl_gatherdiv2df
;
12192 case IX86_BUILTIN_GATHER3DIV4DF
:
12193 icode
= CODE_FOR_avx512vl_gatherdiv4df
;
12195 case IX86_BUILTIN_GATHER3SIV4SF
:
12196 icode
= CODE_FOR_avx512vl_gathersiv4sf
;
12198 case IX86_BUILTIN_GATHER3SIV8SF
:
12199 icode
= CODE_FOR_avx512vl_gathersiv8sf
;
12201 case IX86_BUILTIN_GATHER3DIV4SF
:
12202 icode
= CODE_FOR_avx512vl_gatherdiv4sf
;
12204 case IX86_BUILTIN_GATHER3DIV8SF
:
12205 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
12207 case IX86_BUILTIN_GATHER3SIV2DI
:
12208 icode
= CODE_FOR_avx512vl_gathersiv2di
;
12210 case IX86_BUILTIN_GATHER3SIV4DI
:
12211 icode
= CODE_FOR_avx512vl_gathersiv4di
;
12213 case IX86_BUILTIN_GATHER3DIV2DI
:
12214 icode
= CODE_FOR_avx512vl_gatherdiv2di
;
12216 case IX86_BUILTIN_GATHER3DIV4DI
:
12217 icode
= CODE_FOR_avx512vl_gatherdiv4di
;
12219 case IX86_BUILTIN_GATHER3SIV4SI
:
12220 icode
= CODE_FOR_avx512vl_gathersiv4si
;
12222 case IX86_BUILTIN_GATHER3SIV8SI
:
12223 icode
= CODE_FOR_avx512vl_gathersiv8si
;
12225 case IX86_BUILTIN_GATHER3DIV4SI
:
12226 icode
= CODE_FOR_avx512vl_gatherdiv4si
;
12228 case IX86_BUILTIN_GATHER3DIV8SI
:
12229 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
12231 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12232 icode
= CODE_FOR_avx512vl_gathersiv4df
;
12234 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12235 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
12237 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12238 icode
= CODE_FOR_avx512vl_gathersiv4di
;
12240 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12241 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
12243 case IX86_BUILTIN_SCATTERSIV16SF
:
12244 icode
= CODE_FOR_avx512f_scattersiv16sf
;
12246 case IX86_BUILTIN_SCATTERSIV8DF
:
12247 icode
= CODE_FOR_avx512f_scattersiv8df
;
12249 case IX86_BUILTIN_SCATTERDIV16SF
:
12250 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
12252 case IX86_BUILTIN_SCATTERDIV8DF
:
12253 icode
= CODE_FOR_avx512f_scatterdiv8df
;
12255 case IX86_BUILTIN_SCATTERSIV16SI
:
12256 icode
= CODE_FOR_avx512f_scattersiv16si
;
12258 case IX86_BUILTIN_SCATTERSIV8DI
:
12259 icode
= CODE_FOR_avx512f_scattersiv8di
;
12261 case IX86_BUILTIN_SCATTERDIV16SI
:
12262 icode
= CODE_FOR_avx512f_scatterdiv16si
;
12264 case IX86_BUILTIN_SCATTERDIV8DI
:
12265 icode
= CODE_FOR_avx512f_scatterdiv8di
;
12267 case IX86_BUILTIN_SCATTERSIV8SF
:
12268 icode
= CODE_FOR_avx512vl_scattersiv8sf
;
12270 case IX86_BUILTIN_SCATTERSIV4SF
:
12271 icode
= CODE_FOR_avx512vl_scattersiv4sf
;
12273 case IX86_BUILTIN_SCATTERSIV4DF
:
12274 icode
= CODE_FOR_avx512vl_scattersiv4df
;
12276 case IX86_BUILTIN_SCATTERSIV2DF
:
12277 icode
= CODE_FOR_avx512vl_scattersiv2df
;
12279 case IX86_BUILTIN_SCATTERDIV8SF
:
12280 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
12282 case IX86_BUILTIN_SCATTERDIV4SF
:
12283 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
12285 case IX86_BUILTIN_SCATTERDIV4DF
:
12286 icode
= CODE_FOR_avx512vl_scatterdiv4df
;
12288 case IX86_BUILTIN_SCATTERDIV2DF
:
12289 icode
= CODE_FOR_avx512vl_scatterdiv2df
;
12291 case IX86_BUILTIN_SCATTERSIV8SI
:
12292 icode
= CODE_FOR_avx512vl_scattersiv8si
;
12294 case IX86_BUILTIN_SCATTERSIV4SI
:
12295 icode
= CODE_FOR_avx512vl_scattersiv4si
;
12297 case IX86_BUILTIN_SCATTERSIV4DI
:
12298 icode
= CODE_FOR_avx512vl_scattersiv4di
;
12300 case IX86_BUILTIN_SCATTERSIV2DI
:
12301 icode
= CODE_FOR_avx512vl_scattersiv2di
;
12303 case IX86_BUILTIN_SCATTERDIV8SI
:
12304 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
12306 case IX86_BUILTIN_SCATTERDIV4SI
:
12307 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
12309 case IX86_BUILTIN_SCATTERDIV4DI
:
12310 icode
= CODE_FOR_avx512vl_scatterdiv4di
;
12312 case IX86_BUILTIN_SCATTERDIV2DI
:
12313 icode
= CODE_FOR_avx512vl_scatterdiv2di
;
12315 case IX86_BUILTIN_GATHERPFDPD
:
12316 icode
= CODE_FOR_avx512pf_gatherpfv8sidf
;
12317 goto vec_prefetch_gen
;
12318 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12319 icode
= CODE_FOR_avx512f_scattersiv8df
;
12321 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12322 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
12324 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12325 icode
= CODE_FOR_avx512f_scattersiv8di
;
12327 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12328 icode
= CODE_FOR_avx512f_scatterdiv16si
;
12330 case IX86_BUILTIN_SCATTERALTSIV4DF
:
12331 icode
= CODE_FOR_avx512vl_scattersiv4df
;
12333 case IX86_BUILTIN_SCATTERALTDIV8SF
:
12334 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
12336 case IX86_BUILTIN_SCATTERALTSIV4DI
:
12337 icode
= CODE_FOR_avx512vl_scattersiv4di
;
12339 case IX86_BUILTIN_SCATTERALTDIV8SI
:
12340 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
12342 case IX86_BUILTIN_SCATTERALTSIV2DF
:
12343 icode
= CODE_FOR_avx512vl_scattersiv2df
;
12345 case IX86_BUILTIN_SCATTERALTDIV4SF
:
12346 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
12348 case IX86_BUILTIN_SCATTERALTSIV2DI
:
12349 icode
= CODE_FOR_avx512vl_scattersiv2di
;
12351 case IX86_BUILTIN_SCATTERALTDIV4SI
:
12352 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
12354 case IX86_BUILTIN_GATHERPFDPS
:
12355 icode
= CODE_FOR_avx512pf_gatherpfv16sisf
;
12356 goto vec_prefetch_gen
;
12357 case IX86_BUILTIN_GATHERPFQPD
:
12358 icode
= CODE_FOR_avx512pf_gatherpfv8didf
;
12359 goto vec_prefetch_gen
;
12360 case IX86_BUILTIN_GATHERPFQPS
:
12361 icode
= CODE_FOR_avx512pf_gatherpfv8disf
;
12362 goto vec_prefetch_gen
;
12363 case IX86_BUILTIN_SCATTERPFDPD
:
12364 icode
= CODE_FOR_avx512pf_scatterpfv8sidf
;
12365 goto vec_prefetch_gen
;
12366 case IX86_BUILTIN_SCATTERPFDPS
:
12367 icode
= CODE_FOR_avx512pf_scatterpfv16sisf
;
12368 goto vec_prefetch_gen
;
12369 case IX86_BUILTIN_SCATTERPFQPD
:
12370 icode
= CODE_FOR_avx512pf_scatterpfv8didf
;
12371 goto vec_prefetch_gen
;
12372 case IX86_BUILTIN_SCATTERPFQPS
:
12373 icode
= CODE_FOR_avx512pf_scatterpfv8disf
;
12374 goto vec_prefetch_gen
;
12378 rtx (*gen
) (rtx
, rtx
);
12380 arg0
= CALL_EXPR_ARG (exp
, 0);
12381 arg1
= CALL_EXPR_ARG (exp
, 1);
12382 arg2
= CALL_EXPR_ARG (exp
, 2);
12383 arg3
= CALL_EXPR_ARG (exp
, 3);
12384 arg4
= CALL_EXPR_ARG (exp
, 4);
12385 op0
= expand_normal (arg0
);
12386 op1
= expand_normal (arg1
);
12387 op2
= expand_normal (arg2
);
12388 op3
= expand_normal (arg3
);
12389 op4
= expand_normal (arg4
);
12390 /* Note the arg order is different from the operand order. */
12391 mode0
= insn_data
[icode
].operand
[1].mode
;
12392 mode2
= insn_data
[icode
].operand
[3].mode
;
12393 mode3
= insn_data
[icode
].operand
[4].mode
;
12394 mode4
= insn_data
[icode
].operand
[5].mode
;
12396 if (target
== NULL_RTX
12397 || GET_MODE (target
) != insn_data
[icode
].operand
[0].mode
12398 || !insn_data
[icode
].operand
[0].predicate (target
,
12399 GET_MODE (target
)))
12400 subtarget
= gen_reg_rtx (insn_data
[icode
].operand
[0].mode
);
12402 subtarget
= target
;
12406 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
12407 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
12408 half
= gen_reg_rtx (V8SImode
);
12409 if (!nonimmediate_operand (op2
, V16SImode
))
12410 op2
= copy_to_mode_reg (V16SImode
, op2
);
12411 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12414 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
12415 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
12416 case IX86_BUILTIN_GATHERALTSIV4DF
:
12417 case IX86_BUILTIN_GATHERALTSIV4DI
:
12418 half
= gen_reg_rtx (V4SImode
);
12419 if (!nonimmediate_operand (op2
, V8SImode
))
12420 op2
= copy_to_mode_reg (V8SImode
, op2
);
12421 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
12424 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
12425 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
12426 half
= gen_reg_rtx (mode0
);
12427 if (mode0
== V8SFmode
)
12428 gen
= gen_vec_extract_lo_v16sf
;
12430 gen
= gen_vec_extract_lo_v16si
;
12431 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12432 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12433 emit_insn (gen (half
, op0
));
12435 op3
= lowpart_subreg (QImode
, op3
, HImode
);
12437 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
12438 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
12439 case IX86_BUILTIN_GATHERALTDIV8SF
:
12440 case IX86_BUILTIN_GATHERALTDIV8SI
:
12441 half
= gen_reg_rtx (mode0
);
12442 if (mode0
== V4SFmode
)
12443 gen
= gen_vec_extract_lo_v8sf
;
12445 gen
= gen_vec_extract_lo_v8si
;
12446 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
12447 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
12448 emit_insn (gen (half
, op0
));
12450 if (VECTOR_MODE_P (GET_MODE (op3
)))
12452 half
= gen_reg_rtx (mode0
);
12453 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12454 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12455 emit_insn (gen (half
, op3
));
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
12466 op1
= ix86_zero_extend_to_Pmode (op1
);
12468 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
12469 op0
= copy_to_mode_reg (mode0
, op0
);
12470 if (!insn_data
[icode
].operand
[2].predicate (op1
, Pmode
))
12471 op1
= copy_to_mode_reg (Pmode
, op1
);
12472 if (!insn_data
[icode
].operand
[3].predicate (op2
, mode2
))
12473 op2
= copy_to_mode_reg (mode2
, op2
);
12475 op3
= fixup_modeless_constant (op3
, mode3
);
12477 if (GET_MODE (op3
) == mode3
|| GET_MODE (op3
) == VOIDmode
)
12479 if (!insn_data
[icode
].operand
[4].predicate (op3
, mode3
))
12480 op3
= copy_to_mode_reg (mode3
, op3
);
12484 op3
= copy_to_reg (op3
);
12485 op3
= lowpart_subreg (mode3
, op3
, GET_MODE (op3
));
12487 if (!insn_data
[icode
].operand
[5].predicate (op4
, mode4
))
12489 error ("the last argument must be scale 1, 2, 4, 8");
12493 /* Optimize. If mask is known to have all high bits set,
12494 replace op0 with pc_rtx to signal that the instruction
12495 overwrites the whole destination and doesn't use its
12496 previous contents. */
12499 if (TREE_CODE (arg3
) == INTEGER_CST
)
12501 if (integer_all_onesp (arg3
))
12504 else if (TREE_CODE (arg3
) == VECTOR_CST
)
12506 unsigned int negative
= 0;
12507 for (i
= 0; i
< VECTOR_CST_NELTS (arg3
); ++i
)
12509 tree cst
= VECTOR_CST_ELT (arg3
, i
);
12510 if (TREE_CODE (cst
) == INTEGER_CST
12511 && tree_int_cst_sign_bit (cst
))
12513 else if (TREE_CODE (cst
) == REAL_CST
12514 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst
)))
12517 if (negative
== TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3
)))
12520 else if (TREE_CODE (arg3
) == SSA_NAME
12521 && TREE_CODE (TREE_TYPE (arg3
)) == VECTOR_TYPE
)
	      /* Recognize also when mask is like:
		 __v2df src = _mm_setzero_pd ();
		 __v2df mask = _mm_cmpeq_pd (src, src);
		 or
		 __v8sf src = _mm256_setzero_ps ();
		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
		 as that is a cheaper way to load all ones into
		 a register than having to load a constant from
		 memory.  */
12532 gimple
*def_stmt
= SSA_NAME_DEF_STMT (arg3
);
12533 if (is_gimple_call (def_stmt
))
12535 tree fndecl
= gimple_call_fndecl (def_stmt
);
12537 && fndecl_built_in_p (fndecl
, BUILT_IN_MD
))
12538 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl
))
12540 case IX86_BUILTIN_CMPPD
:
12541 case IX86_BUILTIN_CMPPS
:
12542 case IX86_BUILTIN_CMPPD256
:
12543 case IX86_BUILTIN_CMPPS256
:
12544 if (!integer_zerop (gimple_call_arg (def_stmt
, 2)))
12547 case IX86_BUILTIN_CMPEQPD
:
12548 case IX86_BUILTIN_CMPEQPS
:
12549 if (initializer_zerop (gimple_call_arg (def_stmt
, 0))
12550 && initializer_zerop (gimple_call_arg (def_stmt
,
12561 pat
= GEN_FCN (icode
) (subtarget
, op0
, op1
, op2
, op3
, op4
);
12568 case IX86_BUILTIN_GATHER3DIV16SF
:
12569 if (target
== NULL_RTX
)
12570 target
= gen_reg_rtx (V8SFmode
);
12571 emit_insn (gen_vec_extract_lo_v16sf (target
, subtarget
));
12573 case IX86_BUILTIN_GATHER3DIV16SI
:
12574 if (target
== NULL_RTX
)
12575 target
= gen_reg_rtx (V8SImode
);
12576 emit_insn (gen_vec_extract_lo_v16si (target
, subtarget
));
12578 case IX86_BUILTIN_GATHER3DIV8SF
:
12579 case IX86_BUILTIN_GATHERDIV8SF
:
12580 if (target
== NULL_RTX
)
12581 target
= gen_reg_rtx (V4SFmode
);
12582 emit_insn (gen_vec_extract_lo_v8sf (target
, subtarget
));
12584 case IX86_BUILTIN_GATHER3DIV8SI
:
12585 case IX86_BUILTIN_GATHERDIV8SI
:
12586 if (target
== NULL_RTX
)
12587 target
= gen_reg_rtx (V4SImode
);
12588 emit_insn (gen_vec_extract_lo_v8si (target
, subtarget
));
12591 target
= subtarget
;
12597 arg0
= CALL_EXPR_ARG (exp
, 0);
12598 arg1
= CALL_EXPR_ARG (exp
, 1);
12599 arg2
= CALL_EXPR_ARG (exp
, 2);
12600 arg3
= CALL_EXPR_ARG (exp
, 3);
12601 arg4
= CALL_EXPR_ARG (exp
, 4);
12602 op0
= expand_normal (arg0
);
12603 op1
= expand_normal (arg1
);
12604 op2
= expand_normal (arg2
);
12605 op3
= expand_normal (arg3
);
12606 op4
= expand_normal (arg4
);
12607 mode1
= insn_data
[icode
].operand
[1].mode
;
12608 mode2
= insn_data
[icode
].operand
[2].mode
;
12609 mode3
= insn_data
[icode
].operand
[3].mode
;
12610 mode4
= insn_data
[icode
].operand
[4].mode
;
      /* Scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements than source operand
	 op3, one needs to use only its low half.  And vice versa.  */
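      /* Added commentary: e.g. for IX86_BUILTIN_SCATTERALTSIV8DF the V8DF
	 source has 8 elements while the V16SI index operand has 16, so only
	 the low half of the indices is extracted (vec_extract_lo) in the
	 cases below.  */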
12618 case IX86_BUILTIN_SCATTERALTSIV8DF
:
12619 case IX86_BUILTIN_SCATTERALTSIV8DI
:
12620 half
= gen_reg_rtx (V8SImode
);
12621 if (!nonimmediate_operand (op2
, V16SImode
))
12622 op2
= copy_to_mode_reg (V16SImode
, op2
);
12623 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
12626 case IX86_BUILTIN_SCATTERALTDIV16SF
:
12627 case IX86_BUILTIN_SCATTERALTDIV16SI
:
12628 half
= gen_reg_rtx (mode3
);
12629 if (mode3
== V8SFmode
)
12630 gen
= gen_vec_extract_lo_v16sf
;
12632 gen
= gen_vec_extract_lo_v16si
;
12633 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12634 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12635 emit_insn (gen (half
, op3
));
12638 case IX86_BUILTIN_SCATTERALTSIV4DF
:
12639 case IX86_BUILTIN_SCATTERALTSIV4DI
:
12640 half
= gen_reg_rtx (V4SImode
);
12641 if (!nonimmediate_operand (op2
, V8SImode
))
12642 op2
= copy_to_mode_reg (V8SImode
, op2
);
12643 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
12646 case IX86_BUILTIN_SCATTERALTDIV8SF
:
12647 case IX86_BUILTIN_SCATTERALTDIV8SI
:
12648 half
= gen_reg_rtx (mode3
);
12649 if (mode3
== V4SFmode
)
12650 gen
= gen_vec_extract_lo_v8sf
;
12652 gen
= gen_vec_extract_lo_v8si
;
12653 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12654 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
12655 emit_insn (gen (half
, op3
));
12658 case IX86_BUILTIN_SCATTERALTSIV2DF
:
12659 case IX86_BUILTIN_SCATTERALTSIV2DI
:
12660 if (!nonimmediate_operand (op2
, V4SImode
))
12661 op2
= copy_to_mode_reg (V4SImode
, op2
);
12663 case IX86_BUILTIN_SCATTERALTDIV4SF
:
12664 case IX86_BUILTIN_SCATTERALTDIV4SI
:
12665 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
12666 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
12675 op0
= force_reg (Pmode
, convert_to_mode (Pmode
, op0
, 1));
12677 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12678 op0
= copy_to_mode_reg (Pmode
, op0
);
12680 op1
= fixup_modeless_constant (op1
, mode1
);
12682 if (GET_MODE (op1
) == mode1
|| GET_MODE (op1
) == VOIDmode
)
12684 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12685 op1
= copy_to_mode_reg (mode1
, op1
);
12689 op1
= copy_to_reg (op1
);
12690 op1
= lowpart_subreg (mode1
, op1
, GET_MODE (op1
));
12693 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
12694 op2
= copy_to_mode_reg (mode2
, op2
);
12696 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
12697 op3
= copy_to_mode_reg (mode3
, op3
);
12699 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
12701 error ("the last argument must be scale 1, 2, 4, 8");
12705 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
12713 arg0
= CALL_EXPR_ARG (exp
, 0);
12714 arg1
= CALL_EXPR_ARG (exp
, 1);
12715 arg2
= CALL_EXPR_ARG (exp
, 2);
12716 arg3
= CALL_EXPR_ARG (exp
, 3);
12717 arg4
= CALL_EXPR_ARG (exp
, 4);
12718 op0
= expand_normal (arg0
);
12719 op1
= expand_normal (arg1
);
12720 op2
= expand_normal (arg2
);
12721 op3
= expand_normal (arg3
);
12722 op4
= expand_normal (arg4
);
12723 mode0
= insn_data
[icode
].operand
[0].mode
;
12724 mode1
= insn_data
[icode
].operand
[1].mode
;
12725 mode3
= insn_data
[icode
].operand
[3].mode
;
12726 mode4
= insn_data
[icode
].operand
[4].mode
;
12728 op0
= fixup_modeless_constant (op0
, mode0
);
12730 if (GET_MODE (op0
) == mode0
|| GET_MODE (op0
) == VOIDmode
)
12732 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12733 op0
= copy_to_mode_reg (mode0
, op0
);
12737 op0
= copy_to_reg (op0
);
12738 op0
= lowpart_subreg (mode0
, op0
, GET_MODE (op0
));
12741 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12742 op1
= copy_to_mode_reg (mode1
, op1
);
      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
12747 op2
= force_reg (Pmode
, convert_to_mode (Pmode
, op2
, 1));
12749 if (!insn_data
[icode
].operand
[2].predicate (op2
, Pmode
))
12750 op2
= copy_to_mode_reg (Pmode
, op2
);
12752 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
	  error ("the fourth argument must be scale 1, 2, 4, 8");
12758 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
12760 error ("incorrect hint operand");
12764 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
12772 case IX86_BUILTIN_XABORT
:
12773 icode
= CODE_FOR_xabort
;
12774 arg0
= CALL_EXPR_ARG (exp
, 0);
12775 op0
= expand_normal (arg0
);
12776 mode0
= insn_data
[icode
].operand
[0].mode
;
12777 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12779 error ("the argument to %<xabort%> intrinsic must "
12780 "be an 8-bit immediate");
12783 emit_insn (gen_xabort (op0
));
12786 case IX86_BUILTIN_RSTORSSP
:
12787 case IX86_BUILTIN_CLRSSBSY
:
12788 arg0
= CALL_EXPR_ARG (exp
, 0);
12789 op0
= expand_normal (arg0
);
12790 icode
= (fcode
== IX86_BUILTIN_RSTORSSP
12791 ? CODE_FOR_rstorssp
12792 : CODE_FOR_clrssbsy
);
12793 if (!address_operand (op0
, VOIDmode
))
12795 op1
= convert_memory_address (Pmode
, op0
);
12796 op0
= copy_addr_to_reg (op1
);
12798 emit_insn (GEN_FCN (icode
) (gen_rtx_MEM (Pmode
, op0
)));
12801 case IX86_BUILTIN_WRSSD
:
12802 case IX86_BUILTIN_WRSSQ
:
12803 case IX86_BUILTIN_WRUSSD
:
12804 case IX86_BUILTIN_WRUSSQ
:
12805 arg0
= CALL_EXPR_ARG (exp
, 0);
12806 op0
= expand_normal (arg0
);
12807 arg1
= CALL_EXPR_ARG (exp
, 1);
12808 op1
= expand_normal (arg1
);
12811 case IX86_BUILTIN_WRSSD
:
12812 icode
= CODE_FOR_wrsssi
;
12815 case IX86_BUILTIN_WRSSQ
:
12816 icode
= CODE_FOR_wrssdi
;
12819 case IX86_BUILTIN_WRUSSD
:
12820 icode
= CODE_FOR_wrusssi
;
12823 case IX86_BUILTIN_WRUSSQ
:
12824 icode
= CODE_FOR_wrussdi
;
12828 op0
= force_reg (mode
, op0
);
12829 if (!address_operand (op1
, VOIDmode
))
12831 op2
= convert_memory_address (Pmode
, op1
);
12832 op1
= copy_addr_to_reg (op2
);
12834 emit_insn (GEN_FCN (icode
) (op0
, gen_rtx_MEM (mode
, op1
)));
12841 if (fcode
>= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
12842 && fcode
<= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST
)
12844 i
= fcode
- IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
;
12845 return ix86_expand_special_args_builtin (bdesc_special_args
+ i
, exp
,
12849 if (fcode
>= IX86_BUILTIN__BDESC_ARGS_FIRST
12850 && fcode
<= IX86_BUILTIN__BDESC_ARGS_LAST
)
12852 i
= fcode
- IX86_BUILTIN__BDESC_ARGS_FIRST
;
12853 rtx (*fcn
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
12854 rtx (*fcn_mask
) (rtx
, rtx
, rtx
, rtx
, rtx
);
12855 rtx (*fcn_maskz
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
12857 machine_mode mode
, wide_mode
, nar_mode
;
12859 nar_mode
= V4SFmode
;
12861 wide_mode
= V64SFmode
;
12862 fcn_mask
= gen_avx5124fmaddps_4fmaddps_mask
;
12863 fcn_maskz
= gen_avx5124fmaddps_4fmaddps_maskz
;
12867 case IX86_BUILTIN_4FMAPS
:
12868 fcn
= gen_avx5124fmaddps_4fmaddps
;
12872 case IX86_BUILTIN_4DPWSSD
:
12873 nar_mode
= V4SImode
;
12875 wide_mode
= V64SImode
;
12876 fcn
= gen_avx5124vnniw_vp4dpwssd
;
12880 case IX86_BUILTIN_4DPWSSDS
:
12881 nar_mode
= V4SImode
;
12883 wide_mode
= V64SImode
;
12884 fcn
= gen_avx5124vnniw_vp4dpwssds
;
12888 case IX86_BUILTIN_4FNMAPS
:
12889 fcn
= gen_avx5124fmaddps_4fnmaddps
;
12893 case IX86_BUILTIN_4FNMAPS_MASK
:
12894 fcn_mask
= gen_avx5124fmaddps_4fnmaddps_mask
;
12895 fcn_maskz
= gen_avx5124fmaddps_4fnmaddps_maskz
;
12898 case IX86_BUILTIN_4DPWSSD_MASK
:
12899 nar_mode
= V4SImode
;
12901 wide_mode
= V64SImode
;
12902 fcn_mask
= gen_avx5124vnniw_vp4dpwssd_mask
;
12903 fcn_maskz
= gen_avx5124vnniw_vp4dpwssd_maskz
;
12906 case IX86_BUILTIN_4DPWSSDS_MASK
:
12907 nar_mode
= V4SImode
;
12909 wide_mode
= V64SImode
;
12910 fcn_mask
= gen_avx5124vnniw_vp4dpwssds_mask
;
12911 fcn_maskz
= gen_avx5124vnniw_vp4dpwssds_maskz
;
12914 case IX86_BUILTIN_4FMAPS_MASK
:
12924 wide_reg
= gen_reg_rtx (wide_mode
);
12925 for (i
= 0; i
< 4; i
++)
12927 args
[i
] = CALL_EXPR_ARG (exp
, i
);
12928 ops
[i
] = expand_normal (args
[i
]);
12930 emit_move_insn (gen_rtx_SUBREG (mode
, wide_reg
, i
* 64),
12934 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
12935 accum
= force_reg (mode
, accum
);
12937 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
12938 addr
= force_reg (Pmode
, addr
);
12940 mem
= gen_rtx_MEM (nar_mode
, addr
);
12942 target
= gen_reg_rtx (mode
);
12944 emit_move_insn (target
, accum
);
12947 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
12951 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
12953 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
12955 if (CONST_INT_P (mask
))
12956 mask
= fixup_modeless_constant (mask
, HImode
);
12958 mask
= force_reg (HImode
, mask
);
12960 if (GET_MODE (mask
) != HImode
)
12961 mask
= gen_rtx_SUBREG (HImode
, mask
, 0);
12963 /* If merge is 0 then we're about to emit z-masked variant. */
12964 if (const0_operand (merge
, mode
))
12965 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
12966 /* If merge is the same as accum then emit merge-masked variant. */
12967 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
12969 merge
= force_reg (mode
, merge
);
12970 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
12972 /* Merge with something unknown might happen if we z-mask w/ -O0. */
12975 target
= gen_reg_rtx (mode
);
12976 emit_move_insn (target
, merge
);
12977 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
12983 case IX86_BUILTIN_4FNMASS
:
12984 fcn
= gen_avx5124fmaddps_4fnmaddss
;
12988 case IX86_BUILTIN_4FMASS
:
12989 fcn
= gen_avx5124fmaddps_4fmaddss
;
12993 case IX86_BUILTIN_4FNMASS_MASK
:
12994 fcn_mask
= gen_avx5124fmaddps_4fnmaddss_mask
;
12995 fcn_maskz
= gen_avx5124fmaddps_4fnmaddss_maskz
;
12998 case IX86_BUILTIN_4FMASS_MASK
:
13007 fcn_mask
= gen_avx5124fmaddps_4fmaddss_mask
;
13008 fcn_maskz
= gen_avx5124fmaddps_4fmaddss_maskz
;
13012 wide_reg
= gen_reg_rtx (V64SFmode
);
13013 for (i
= 0; i
< 4; i
++)
13016 args
[i
] = CALL_EXPR_ARG (exp
, i
);
13017 ops
[i
] = expand_normal (args
[i
]);
13019 tmp
= gen_reg_rtx (SFmode
);
13020 emit_move_insn (tmp
, gen_rtx_SUBREG (SFmode
, ops
[i
], 0));
13022 emit_move_insn (gen_rtx_SUBREG (V16SFmode
, wide_reg
, i
* 64),
13023 gen_rtx_SUBREG (V16SFmode
, tmp
, 0));
13026 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
13027 accum
= force_reg (V4SFmode
, accum
);
13029 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
13030 addr
= force_reg (Pmode
, addr
);
13032 mem
= gen_rtx_MEM (V4SFmode
, addr
);
13034 target
= gen_reg_rtx (V4SFmode
);
13036 emit_move_insn (target
, accum
);
13039 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
13043 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
13045 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
13047 if (CONST_INT_P (mask
))
13048 mask
= fixup_modeless_constant (mask
, QImode
);
13050 mask
= force_reg (QImode
, mask
);
13052 if (GET_MODE (mask
) != QImode
)
13053 mask
= gen_rtx_SUBREG (QImode
, mask
, 0);
13055 /* If merge is 0 then we're about to emit z-masked variant. */
13056 if (const0_operand (merge
, mode
))
13057 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
13058 /* If merge is the same as accum then emit merge-masked
13060 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
13062 merge
= force_reg (mode
, merge
);
13063 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
13065 /* Merge with something unknown might happen if we z-mask
13069 target
= gen_reg_rtx (mode
);
13070 emit_move_insn (target
, merge
);
13071 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
13076 case IX86_BUILTIN_RDPID
:
13077 return ix86_expand_special_args_builtin (bdesc_args
+ i
, exp
,
13079 case IX86_BUILTIN_FABSQ
:
13080 case IX86_BUILTIN_COPYSIGNQ
:
13082 /* Emit a normal call if SSE isn't available. */
13083 return expand_call (exp
, target
, ignore
);
13086 return ix86_expand_args_builtin (bdesc_args
+ i
, exp
, target
);
13090 if (fcode
>= IX86_BUILTIN__BDESC_COMI_FIRST
13091 && fcode
<= IX86_BUILTIN__BDESC_COMI_LAST
)
13093 i
= fcode
- IX86_BUILTIN__BDESC_COMI_FIRST
;
13094 return ix86_expand_sse_comi (bdesc_comi
+ i
, exp
, target
);
13097 if (fcode
>= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
13098 && fcode
<= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST
)
13100 i
= fcode
- IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
;
13101 return ix86_expand_round_builtin (bdesc_round_args
+ i
, exp
, target
);
13104 if (fcode
>= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
13105 && fcode
<= IX86_BUILTIN__BDESC_PCMPESTR_LAST
)
13107 i
= fcode
- IX86_BUILTIN__BDESC_PCMPESTR_FIRST
;
13108 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr
+ i
, exp
, target
);
13111 if (fcode
>= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
13112 && fcode
<= IX86_BUILTIN__BDESC_PCMPISTR_LAST
)
13114 i
= fcode
- IX86_BUILTIN__BDESC_PCMPISTR_FIRST
;
13115 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr
+ i
, exp
, target
);
13118 if (fcode
>= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
13119 && fcode
<= IX86_BUILTIN__BDESC_MULTI_ARG_LAST
)
13121 i
= fcode
- IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
;
13122 const struct builtin_description
*d
= bdesc_multi_arg
+ i
;
13123 return ix86_expand_multi_arg_builtin (d
->icode
, exp
, target
,
13124 (enum ix86_builtin_func_type
)
13125 d
->flag
, d
->comparison
);
13128 if (fcode
>= IX86_BUILTIN__BDESC_CET_FIRST
13129 && fcode
<= IX86_BUILTIN__BDESC_CET_LAST
)
13131 i
= fcode
- IX86_BUILTIN__BDESC_CET_FIRST
;
13132 return ix86_expand_special_args_builtin (bdesc_cet
+ i
, exp
,
13136 if (fcode
>= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
13137 && fcode
<= IX86_BUILTIN__BDESC_CET_NORMAL_LAST
)
13139 i
= fcode
- IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
;
13140 return ix86_expand_special_args_builtin (bdesc_cet_rdssp
+ i
, exp
,
13144 gcc_unreachable ();
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */
      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  return true;
}
/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}
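/* Added commentary: e.g. get_mode_wider_vector (V16QImode) yields V8HImode
   and get_mode_wider_vector (V8HImode) yields V4SImode; the byte size stays
   the same while the element width doubles and the element count halves.  */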
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
13229 return ix86_vector_duplicate_value (mode
, target
, val
);
13234 if (TARGET_SSE
|| TARGET_3DNOW_A
)
13238 val
= gen_lowpart (SImode
, val
);
13239 x
= gen_rtx_TRUNCATE (HImode
, val
);
13240 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
13241 emit_insn (gen_rtx_SET (target
, x
));
13253 return ix86_vector_duplicate_value (mode
, target
, val
);
13257 struct expand_vec_perm_d dperm
;
13261 memset (&dperm
, 0, sizeof (dperm
));
13262 dperm
.target
= target
;
13263 dperm
.vmode
= mode
;
13264 dperm
.nelt
= GET_MODE_NUNITS (mode
);
13265 dperm
.op0
= dperm
.op1
= gen_reg_rtx (mode
);
13266 dperm
.one_operand_p
= true;
13268 /* Extend to SImode using a paradoxical SUBREG. */
13269 tmp1
= gen_reg_rtx (SImode
);
13270 emit_move_insn (tmp1
, gen_lowpart (SImode
, val
));
13272 /* Insert the SImode value as low element of a V4SImode vector. */
13273 tmp2
= gen_reg_rtx (V4SImode
);
13274 emit_insn (gen_vec_setv4si_0 (tmp2
, CONST0_RTX (V4SImode
), tmp1
));
13275 emit_move_insn (dperm
.op0
, gen_lowpart (mode
, tmp2
));
13277 ok
= (expand_vec_perm_1 (&dperm
)
13278 || expand_vec_perm_broadcast_1 (&dperm
));
13286 return ix86_vector_duplicate_value (mode
, target
, val
);
13293 /* Replicate the value once into the next wider mode and recurse. */
13295 machine_mode smode
, wsmode
, wvmode
;
13298 smode
= GET_MODE_INNER (mode
);
13299 wvmode
= get_mode_wider_vector (mode
);
13300 wsmode
= GET_MODE_INNER (wvmode
);
13302 val
= convert_modes (wsmode
, smode
, val
, true);
13303 x
= expand_simple_binop (wsmode
, ASHIFT
, val
,
13304 GEN_INT (GET_MODE_BITSIZE (smode
)),
13305 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13306 val
= expand_simple_binop (wsmode
, IOR
, val
, x
, x
, 1, OPTAB_LIB_WIDEN
);
13308 x
= gen_reg_rtx (wvmode
);
13309 ok
= ix86_expand_vector_init_duplicate (mmx_ok
, wvmode
, x
, val
);
13311 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), x
));
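      /* For reference, the widening step above as plain C (an illustrative
	 sketch only, not compiler code): a narrow element is turned into a
	 double-width element holding two copies of itself, so the broadcast
	 can be retried in the wider vector mode.

	   unsigned short two_copies (unsigned char v)
	   {
	     return ((unsigned short) v << 8) | v;   // two copies of V
	   }

	 Repeating this until a supported broadcast width is reached is what
	 the recursive call implements.  */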
13318 return ix86_vector_duplicate_value (mode
, target
, val
);
13321 machine_mode hvmode
= (mode
== V16HImode
? V8HImode
: V16QImode
);
13322 rtx x
= gen_reg_rtx (hvmode
);
13324 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13327 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13328 emit_insn (gen_rtx_SET (target
, x
));
13334 if (TARGET_AVX512BW
)
13335 return ix86_vector_duplicate_value (mode
, target
, val
);
13338 machine_mode hvmode
= (mode
== V32HImode
? V16HImode
: V32QImode
);
13339 rtx x
= gen_reg_rtx (hvmode
);
13341 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
13344 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
13345 emit_insn (gen_rtx_SET (target
, x
));
13354 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13355 whose ONE_VAR element is VAR, and other elements are zero. Return true
13359 ix86_expand_vector_init_one_nonzero (bool mmx_ok
, machine_mode mode
,
13360 rtx target
, rtx var
, int one_var
)
13362 machine_mode vsimode
;
13365 bool use_vector_set
= false;
13366 rtx (*gen_vec_set_0
) (rtx
, rtx
, rtx
) = NULL
;
13371 /* For SSE4.1, we normally use vector set. But if the second
13372 element is zero and inter-unit moves are OK, we use movq
13374 use_vector_set
= (TARGET_64BIT
&& TARGET_SSE4_1
13375 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
13381 use_vector_set
= TARGET_SSE4_1
;
13384 use_vector_set
= TARGET_SSE2
;
13387 use_vector_set
= TARGET_SSE
|| TARGET_3DNOW_A
;
13391 use_vector_set
= TARGET_AVX
;
13394 use_vector_set
= TARGET_AVX
;
13395 gen_vec_set_0
= gen_vec_setv8si_0
;
13398 use_vector_set
= TARGET_AVX
;
13399 gen_vec_set_0
= gen_vec_setv8sf_0
;
13402 use_vector_set
= TARGET_AVX
;
13403 gen_vec_set_0
= gen_vec_setv4df_0
;
13406 /* Use ix86_expand_vector_set in 64bit mode only. */
13407 use_vector_set
= TARGET_AVX
&& TARGET_64BIT
;
13408 gen_vec_set_0
= gen_vec_setv4di_0
;
13411 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13412 gen_vec_set_0
= gen_vec_setv16si_0
;
13415 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13416 gen_vec_set_0
= gen_vec_setv16sf_0
;
13419 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
13420 gen_vec_set_0
= gen_vec_setv8df_0
;
13423 /* Use ix86_expand_vector_set in 64bit mode only. */
13424 use_vector_set
= TARGET_AVX512F
&& TARGET_64BIT
&& one_var
== 0;
13425 gen_vec_set_0
= gen_vec_setv8di_0
;
13431 if (use_vector_set
)
13433 if (gen_vec_set_0
&& one_var
== 0)
13435 var
= force_reg (GET_MODE_INNER (mode
), var
);
13436 emit_insn (gen_vec_set_0 (target
, CONST0_RTX (mode
), var
));
13439 emit_insn (gen_rtx_SET (target
, CONST0_RTX (mode
)));
13440 var
= force_reg (GET_MODE_INNER (mode
), var
);
13441 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
13457 var
= force_reg (GET_MODE_INNER (mode
), var
);
13458 x
= gen_rtx_VEC_CONCAT (mode
, var
, CONST0_RTX (GET_MODE_INNER (mode
)));
13459 emit_insn (gen_rtx_SET (target
, x
));
13464 if (!REG_P (target
) || REGNO (target
) < FIRST_PSEUDO_REGISTER
)
13465 new_target
= gen_reg_rtx (mode
);
13467 new_target
= target
;
13468 var
= force_reg (GET_MODE_INNER (mode
), var
);
13469 x
= gen_rtx_VEC_DUPLICATE (mode
, var
);
13470 x
= gen_rtx_VEC_MERGE (mode
, x
, CONST0_RTX (mode
), const1_rtx
);
13471 emit_insn (gen_rtx_SET (new_target
, x
));
13474 /* We need to shuffle the value to the correct position, so
13475 create a new pseudo to store the intermediate result. */
13477 /* With SSE2, we can use the integer shuffle insns. */
13478 if (mode
!= V4SFmode
&& TARGET_SSE2
)
13480 emit_insn (gen_sse2_pshufd_1 (new_target
, new_target
,
13482 GEN_INT (one_var
== 1 ? 0 : 1),
13483 GEN_INT (one_var
== 2 ? 0 : 1),
13484 GEN_INT (one_var
== 3 ? 0 : 1)));
13485 if (target
!= new_target
)
13486 emit_move_insn (target
, new_target
);
13490 /* Otherwise convert the intermediate result to V4SFmode and
13491 use the SSE1 shuffle instructions. */
13492 if (mode
!= V4SFmode
)
13494 tmp
= gen_reg_rtx (V4SFmode
);
13495 emit_move_insn (tmp
, gen_lowpart (V4SFmode
, new_target
));
13500 emit_insn (gen_sse_shufps_v4sf (tmp
, tmp
, tmp
,
13502 GEN_INT (one_var
== 1 ? 0 : 1),
13503 GEN_INT (one_var
== 2 ? 0+4 : 1+4),
13504 GEN_INT (one_var
== 3 ? 0+4 : 1+4)));
13506 if (mode
!= V4SFmode
)
13507 emit_move_insn (target
, gen_lowpart (V4SImode
, tmp
));
13508 else if (tmp
!= target
)
13509 emit_move_insn (target
, tmp
);
13511 else if (target
!= new_target
)
13512 emit_move_insn (target
, new_target
);
13517 vsimode
= V4SImode
;
13523 vsimode
= V2SImode
;
13529 /* Zero extend the variable element to SImode and recurse. */
13530 var
= convert_modes (SImode
, GET_MODE_INNER (mode
), var
, true);
13532 x
= gen_reg_rtx (vsimode
);
13533 if (!ix86_expand_vector_init_one_nonzero (mmx_ok
, vsimode
, x
,
13535 gcc_unreachable ();
13537 emit_move_insn (target
, gen_lowpart (mode
, x
));
13545 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
13546 consisting of the values in VALS. It is known that all elements
13547 except ONE_VAR are constants. Return true if successful. */
13550 ix86_expand_vector_init_one_var (bool mmx_ok
, machine_mode mode
,
13551 rtx target
, rtx vals
, int one_var
)
13553 rtx var
= XVECEXP (vals
, 0, one_var
);
13554 machine_mode wmode
;
13557 const_vec
= copy_rtx (vals
);
13558 XVECEXP (const_vec
, 0, one_var
) = CONST0_RTX (GET_MODE_INNER (mode
));
13559 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (const_vec
, 0));
13567 /* For the two element vectors, it's just as easy to use
13568 the general case. */
13572 /* Use ix86_expand_vector_set in 64bit mode only. */
13596 /* There's no way to set one QImode entry easily. Combine
13597 the variable value with its adjacent constant value, and
13598 promote to an HImode set. */
13599 x
= XVECEXP (vals
, 0, one_var
^ 1);
13602 var
= convert_modes (HImode
, QImode
, var
, true);
13603 var
= expand_simple_binop (HImode
, ASHIFT
, var
, GEN_INT (8),
13604 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
13605 x
= GEN_INT (INTVAL (x
) & 0xff);
13609 var
= convert_modes (HImode
, QImode
, var
, true);
13610 x
= gen_int_mode (UINTVAL (x
) << 8, HImode
);
13612 if (x
!= const0_rtx
)
13613 var
= expand_simple_binop (HImode
, IOR
, var
, x
, var
,
13614 1, OPTAB_LIB_WIDEN
);
13616 x
= gen_reg_rtx (wmode
);
13617 emit_move_insn (x
, gen_lowpart (wmode
, const_vec
));
13618 ix86_expand_vector_set (mmx_ok
, x
, var
, one_var
>> 1);
13620 emit_move_insn (target
, gen_lowpart (mode
, x
));
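      /* The pairing above, as a plain C sketch (illustrative only; VAR is
	 the variable byte, CST the adjacent constant byte from VALS):

	   unsigned short pair (unsigned char var, unsigned char cst,
				int var_is_odd)
	   {
	     return var_is_odd ? ((unsigned short) var << 8) | cst
			       : ((unsigned short) cst << 8) | var;
	   }

	 The combined 16-bit value is then stored with one HImode vector-set
	 at element ONE_VAR / 2, avoiding a QImode insert.  */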
13627 emit_move_insn (target
, const_vec
);
13628 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
13632 /* A subroutine of ix86_expand_vector_init_general. Use vector
13633 concatenate to handle the most general case: all values variable,
13634 and none identical. */
13637 ix86_expand_vector_init_concat (machine_mode mode
,
13638 rtx target
, rtx
*ops
, int n
)
13640 machine_mode cmode
, hmode
= VOIDmode
, gmode
= VOIDmode
;
13641 rtx first
[16], second
[8], third
[4];
13693 gcc_unreachable ();
13696 if (!register_operand (ops
[1], cmode
))
13697 ops
[1] = force_reg (cmode
, ops
[1]);
13698 if (!register_operand (ops
[0], cmode
))
13699 ops
[0] = force_reg (cmode
, ops
[0]);
13700 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, ops
[0],
13720 gcc_unreachable ();
13744 gcc_unreachable ();
13762 gcc_unreachable ();
13767 /* FIXME: We process inputs backward to help RA. PR 36222. */
13770 for (; i
> 0; i
-= 2, j
--)
13772 first
[j
] = gen_reg_rtx (cmode
);
13773 v
= gen_rtvec (2, ops
[i
- 1], ops
[i
]);
13774 ix86_expand_vector_init (false, first
[j
],
13775 gen_rtx_PARALLEL (cmode
, v
));
13781 gcc_assert (hmode
!= VOIDmode
);
13782 gcc_assert (gmode
!= VOIDmode
);
13783 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
13785 second
[j
] = gen_reg_rtx (hmode
);
13786 ix86_expand_vector_init_concat (hmode
, second
[j
],
13790 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
13792 third
[j
] = gen_reg_rtx (gmode
);
13793 ix86_expand_vector_init_concat (gmode
, third
[j
],
13797 ix86_expand_vector_init_concat (mode
, target
, third
, n
);
13801 gcc_assert (hmode
!= VOIDmode
);
13802 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
13804 second
[j
] = gen_reg_rtx (hmode
);
13805 ix86_expand_vector_init_concat (hmode
, second
[j
],
13809 ix86_expand_vector_init_concat (mode
, target
, second
, n
);
13812 ix86_expand_vector_init_concat (mode
, target
, first
, n
);
13816 gcc_unreachable ();
13820 /* A subroutine of ix86_expand_vector_init_general. Use vector
13821 interleave to handle the most general case: all values variable,
13822 and none identical. */
13825 ix86_expand_vector_init_interleave (machine_mode mode
,
13826 rtx target
, rtx
*ops
, int n
)
13828 machine_mode first_imode
, second_imode
, third_imode
, inner_mode
;
13831 rtx (*gen_load_even
) (rtx
, rtx
, rtx
);
13832 rtx (*gen_interleave_first_low
) (rtx
, rtx
, rtx
);
13833 rtx (*gen_interleave_second_low
) (rtx
, rtx
, rtx
);
13838 gen_load_even
= gen_vec_setv8hi
;
13839 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
13840 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
13841 inner_mode
= HImode
;
13842 first_imode
= V4SImode
;
13843 second_imode
= V2DImode
;
13844 third_imode
= VOIDmode
;
13847 gen_load_even
= gen_vec_setv16qi
;
13848 gen_interleave_first_low
= gen_vec_interleave_lowv8hi
;
13849 gen_interleave_second_low
= gen_vec_interleave_lowv4si
;
13850 inner_mode
= QImode
;
13851 first_imode
= V8HImode
;
13852 second_imode
= V4SImode
;
13853 third_imode
= V2DImode
;
13856 gcc_unreachable ();
13859 for (i
= 0; i
< n
; i
++)
      /* Extend the odd element to SImode using a paradoxical SUBREG.  */
13862 op0
= gen_reg_rtx (SImode
);
13863 emit_move_insn (op0
, gen_lowpart (SImode
, ops
[i
+ i
]));
13865 /* Insert the SImode value as low element of V4SImode vector. */
13866 op1
= gen_reg_rtx (V4SImode
);
13867 op0
= gen_rtx_VEC_MERGE (V4SImode
,
13868 gen_rtx_VEC_DUPLICATE (V4SImode
,
13870 CONST0_RTX (V4SImode
),
13872 emit_insn (gen_rtx_SET (op1
, op0
));
      /* Cast the V4SImode vector back to a vector in the original mode.  */
13875 op0
= gen_reg_rtx (mode
);
13876 emit_move_insn (op0
, gen_lowpart (mode
, op1
));
13878 /* Load even elements into the second position. */
13879 emit_insn (gen_load_even (op0
,
13880 force_reg (inner_mode
,
13884 /* Cast vector to FIRST_IMODE vector. */
13885 ops
[i
] = gen_reg_rtx (first_imode
);
13886 emit_move_insn (ops
[i
], gen_lowpart (first_imode
, op0
));
13889 /* Interleave low FIRST_IMODE vectors. */
13890 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
13892 op0
= gen_reg_rtx (first_imode
);
13893 emit_insn (gen_interleave_first_low (op0
, ops
[i
], ops
[i
+ 1]));
13895 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
13896 ops
[j
] = gen_reg_rtx (second_imode
);
13897 emit_move_insn (ops
[j
], gen_lowpart (second_imode
, op0
));
13900 /* Interleave low SECOND_IMODE vectors. */
13901 switch (second_imode
)
13904 for (i
= j
= 0; i
< n
/ 2; i
+= 2, j
++)
13906 op0
= gen_reg_rtx (second_imode
);
13907 emit_insn (gen_interleave_second_low (op0
, ops
[i
],
13910 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
13912 ops
[j
] = gen_reg_rtx (third_imode
);
13913 emit_move_insn (ops
[j
], gen_lowpart (third_imode
, op0
));
13915 second_imode
= V2DImode
;
13916 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
13920 op0
= gen_reg_rtx (second_imode
);
13921 emit_insn (gen_interleave_second_low (op0
, ops
[0],
13924 /* Cast the SECOND_IMODE vector back to a vector on original
13926 emit_insn (gen_rtx_SET (target
, gen_lowpart (mode
, op0
)));
13930 gcc_unreachable ();
13934 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
13935 all values variable, and none identical. */
13938 ix86_expand_vector_init_general (bool mmx_ok
, machine_mode mode
,
13939 rtx target
, rtx vals
)
13941 rtx ops
[64], op0
, op1
, op2
, op3
, op4
, op5
;
13942 machine_mode half_mode
= VOIDmode
;
13943 machine_mode quarter_mode
= VOIDmode
;
13950 if (!mmx_ok
&& !TARGET_SSE
)
13966 n
= GET_MODE_NUNITS (mode
);
13967 for (i
= 0; i
< n
; i
++)
13968 ops
[i
] = XVECEXP (vals
, 0, i
);
13969 ix86_expand_vector_init_concat (mode
, target
, ops
, n
);
13973 for (i
= 0; i
< 2; i
++)
13974 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
13975 op0
= gen_reg_rtx (V4DImode
);
13976 ix86_expand_vector_init_concat (V4DImode
, op0
, ops
, 2);
13977 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
13981 for (i
= 0; i
< 4; i
++)
13982 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
13983 ops
[4] = gen_reg_rtx (V4DImode
);
13984 ix86_expand_vector_init_concat (V4DImode
, ops
[4], ops
, 2);
13985 ops
[5] = gen_reg_rtx (V4DImode
);
13986 ix86_expand_vector_init_concat (V4DImode
, ops
[5], ops
+ 2, 2);
13987 op0
= gen_reg_rtx (V8DImode
);
13988 ix86_expand_vector_init_concat (V8DImode
, op0
, ops
+ 4, 2);
13989 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
13993 half_mode
= V16QImode
;
13997 half_mode
= V8HImode
;
14001 n
= GET_MODE_NUNITS (mode
);
14002 for (i
= 0; i
< n
; i
++)
14003 ops
[i
] = XVECEXP (vals
, 0, i
);
14004 op0
= gen_reg_rtx (half_mode
);
14005 op1
= gen_reg_rtx (half_mode
);
14006 ix86_expand_vector_init_interleave (half_mode
, op0
, ops
,
14008 ix86_expand_vector_init_interleave (half_mode
, op1
,
14009 &ops
[n
>> 1], n
>> 2);
14010 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op0
, op1
)));
14014 quarter_mode
= V16QImode
;
14015 half_mode
= V32QImode
;
14019 quarter_mode
= V8HImode
;
14020 half_mode
= V16HImode
;
14024 n
= GET_MODE_NUNITS (mode
);
14025 for (i
= 0; i
< n
; i
++)
14026 ops
[i
] = XVECEXP (vals
, 0, i
);
14027 op0
= gen_reg_rtx (quarter_mode
);
14028 op1
= gen_reg_rtx (quarter_mode
);
14029 op2
= gen_reg_rtx (quarter_mode
);
14030 op3
= gen_reg_rtx (quarter_mode
);
14031 op4
= gen_reg_rtx (half_mode
);
14032 op5
= gen_reg_rtx (half_mode
);
14033 ix86_expand_vector_init_interleave (quarter_mode
, op0
, ops
,
14035 ix86_expand_vector_init_interleave (quarter_mode
, op1
,
14036 &ops
[n
>> 2], n
>> 3);
14037 ix86_expand_vector_init_interleave (quarter_mode
, op2
,
14038 &ops
[n
>> 1], n
>> 3);
14039 ix86_expand_vector_init_interleave (quarter_mode
, op3
,
14040 &ops
[(n
>> 1) | (n
>> 2)], n
>> 3);
14041 emit_insn (gen_rtx_SET (op4
, gen_rtx_VEC_CONCAT (half_mode
, op0
, op1
)));
14042 emit_insn (gen_rtx_SET (op5
, gen_rtx_VEC_CONCAT (half_mode
, op2
, op3
)));
14043 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op4
, op5
)));
14047 if (!TARGET_SSE4_1
)
14055 /* Don't use ix86_expand_vector_init_interleave if we can't
14056 move from GPR to SSE register directly. */
14057 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
14060 n
= GET_MODE_NUNITS (mode
);
14061 for (i
= 0; i
< n
; i
++)
14062 ops
[i
] = XVECEXP (vals
, 0, i
);
14063 ix86_expand_vector_init_interleave (mode
, target
, ops
, n
>> 1);
14071 gcc_unreachable ();
14075 int i
, j
, n_elts
, n_words
, n_elt_per_word
;
14076 machine_mode inner_mode
;
14077 rtx words
[4], shift
;
14079 inner_mode
= GET_MODE_INNER (mode
);
14080 n_elts
= GET_MODE_NUNITS (mode
);
14081 n_words
= GET_MODE_SIZE (mode
) / UNITS_PER_WORD
;
14082 n_elt_per_word
= n_elts
/ n_words
;
14083 shift
= GEN_INT (GET_MODE_BITSIZE (inner_mode
));
14085 for (i
= 0; i
< n_words
; ++i
)
14087 rtx word
= NULL_RTX
;
14089 for (j
= 0; j
< n_elt_per_word
; ++j
)
14091 rtx elt
= XVECEXP (vals
, 0, (i
+1)*n_elt_per_word
- j
- 1);
14092 elt
= convert_modes (word_mode
, inner_mode
, elt
, true);
14098 word
= expand_simple_binop (word_mode
, ASHIFT
, word
, shift
,
14099 word
, 1, OPTAB_LIB_WIDEN
);
14100 word
= expand_simple_binop (word_mode
, IOR
, word
, elt
,
14101 word
, 1, OPTAB_LIB_WIDEN
);
14109 emit_move_insn (target
, gen_lowpart (mode
, words
[0]));
14110 else if (n_words
== 2)
14112 rtx tmp
= gen_reg_rtx (mode
);
14113 emit_clobber (tmp
);
14114 emit_move_insn (gen_lowpart (word_mode
, tmp
), words
[0]);
14115 emit_move_insn (gen_highpart (word_mode
, tmp
), words
[1]);
14116 emit_move_insn (target
, tmp
);
14118 else if (n_words
== 4)
14120 rtx tmp
= gen_reg_rtx (V4SImode
);
14121 gcc_assert (word_mode
== SImode
);
14122 vals
= gen_rtx_PARALLEL (V4SImode
, gen_rtvec_v (4, words
));
14123 ix86_expand_vector_init_general (false, V4SImode
, tmp
, vals
);
14124 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
14127 gcc_unreachable ();
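      /* The word-building loop above, as a plain C sketch (illustrative
	 only): elements of each word are folded in from the highest index
	 down, so each shift makes room for the next lower element and the
	 lowest-indexed element ends up in the low bits.

	   // e.g. four 16-bit elements packed into one 64-bit word
	   unsigned long long pack4 (unsigned short e[4])
	   {
	     unsigned long long w = e[3];
	     for (int j = 2; j >= 0; j--)
	       w = (w << 16) | e[j];
	     return w;
	   }

	 The resulting word(s) are then moved into the vector register via
	 gen_lowpart/gen_highpart, or re-expanded as a V4SImode initializer
	 when there are four words.  */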
14131 /* Initialize vector TARGET via VALS. Suppress the use of MMX
14132 instructions unless MMX_OK is true. */
14135 ix86_expand_vector_init (bool mmx_ok
, rtx target
, rtx vals
)
14137 machine_mode mode
= GET_MODE (target
);
14138 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14139 int n_elts
= GET_MODE_NUNITS (mode
);
14140 int n_var
= 0, one_var
= -1;
14141 bool all_same
= true, all_const_zero
= true;
14145 /* Handle first initialization from vector elts. */
14146 if (n_elts
!= XVECLEN (vals
, 0))
14148 rtx subtarget
= target
;
14149 x
= XVECEXP (vals
, 0, 0);
14150 gcc_assert (GET_MODE_INNER (GET_MODE (x
)) == inner_mode
);
14151 if (GET_MODE_NUNITS (GET_MODE (x
)) * 2 == n_elts
)
14153 rtx ops
[2] = { XVECEXP (vals
, 0, 0), XVECEXP (vals
, 0, 1) };
14154 if (inner_mode
== QImode
|| inner_mode
== HImode
)
14156 unsigned int n_bits
= n_elts
* GET_MODE_SIZE (inner_mode
);
14157 mode
= mode_for_vector (SImode
, n_bits
/ 4).require ();
14158 inner_mode
= mode_for_vector (SImode
, n_bits
/ 8).require ();
14159 ops
[0] = gen_lowpart (inner_mode
, ops
[0]);
14160 ops
[1] = gen_lowpart (inner_mode
, ops
[1]);
14161 subtarget
= gen_reg_rtx (mode
);
14163 ix86_expand_vector_init_concat (mode
, subtarget
, ops
, 2);
14164 if (subtarget
!= target
)
14165 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), subtarget
));
14168 gcc_unreachable ();
14171 for (i
= 0; i
< n_elts
; ++i
)
14173 x
= XVECEXP (vals
, 0, i
);
14174 if (!(CONST_SCALAR_INT_P (x
)
14175 || CONST_DOUBLE_P (x
)
14176 || CONST_FIXED_P (x
)))
14177 n_var
++, one_var
= i
;
14178 else if (x
!= CONST0_RTX (inner_mode
))
14179 all_const_zero
= false;
14180 if (i
> 0 && !rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
14184 /* Constants are best loaded from the constant pool. */
14187 emit_move_insn (target
, gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0)));
14191 /* If all values are identical, broadcast the value. */
14193 && ix86_expand_vector_init_duplicate (mmx_ok
, mode
, target
,
14194 XVECEXP (vals
, 0, 0)))
14197 /* Values where only one field is non-constant are best loaded from
14198 the pool and overwritten via move later. */
14202 && ix86_expand_vector_init_one_nonzero (mmx_ok
, mode
, target
,
14203 XVECEXP (vals
, 0, one_var
),
14207 if (ix86_expand_vector_init_one_var (mmx_ok
, mode
, target
, vals
, one_var
))
14211 ix86_expand_vector_init_general (mmx_ok
, mode
, target
, vals
);
14215 ix86_expand_vector_set (bool mmx_ok
, rtx target
, rtx val
, int elt
)
14217 machine_mode mode
= GET_MODE (target
);
14218 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14219 machine_mode half_mode
;
14220 bool use_vec_merge
= false;
14222 static rtx (*gen_extract
[6][2]) (rtx
, rtx
)
14224 { gen_vec_extract_lo_v32qi
, gen_vec_extract_hi_v32qi
},
14225 { gen_vec_extract_lo_v16hi
, gen_vec_extract_hi_v16hi
},
14226 { gen_vec_extract_lo_v8si
, gen_vec_extract_hi_v8si
},
14227 { gen_vec_extract_lo_v4di
, gen_vec_extract_hi_v4di
},
14228 { gen_vec_extract_lo_v8sf
, gen_vec_extract_hi_v8sf
},
14229 { gen_vec_extract_lo_v4df
, gen_vec_extract_hi_v4df
}
14231 static rtx (*gen_insert
[6][2]) (rtx
, rtx
, rtx
)
14233 { gen_vec_set_lo_v32qi
, gen_vec_set_hi_v32qi
},
14234 { gen_vec_set_lo_v16hi
, gen_vec_set_hi_v16hi
},
14235 { gen_vec_set_lo_v8si
, gen_vec_set_hi_v8si
},
14236 { gen_vec_set_lo_v4di
, gen_vec_set_hi_v4di
},
14237 { gen_vec_set_lo_v8sf
, gen_vec_set_hi_v8sf
},
14238 { gen_vec_set_lo_v4df
, gen_vec_set_hi_v4df
}
14241 machine_mode mmode
= VOIDmode
;
14242 rtx (*gen_blendm
) (rtx
, rtx
, rtx
, rtx
);
14247 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14255 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14256 ix86_expand_vector_extract (true, tmp
, target
, 1 - elt
);
14258 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14260 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14261 emit_insn (gen_rtx_SET (target
, tmp
));
14267 use_vec_merge
= TARGET_SSE4_1
&& TARGET_64BIT
;
14271 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
14272 ix86_expand_vector_extract (false, tmp
, target
, 1 - elt
);
14274 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
14276 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
14277 emit_insn (gen_rtx_SET (target
, tmp
));
14281 /* NB: For ELT == 0, use standard scalar operation patterns which
14282 preserve the rest of the vector for combiner:
14285 (vec_duplicate:V2DF (reg:DF))
14295 /* For the two element vectors, we implement a VEC_CONCAT with
14296 the extraction of the other element. */
14298 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (1 - elt
)));
14299 tmp
= gen_rtx_VEC_SELECT (inner_mode
, target
, tmp
);
14302 op0
= val
, op1
= tmp
;
14304 op0
= tmp
, op1
= val
;
14306 tmp
= gen_rtx_VEC_CONCAT (mode
, op0
, op1
);
14307 emit_insn (gen_rtx_SET (target
, tmp
));
14312 use_vec_merge
= TARGET_SSE4_1
;
14319 use_vec_merge
= true;
14323 /* tmp = target = A B C D */
14324 tmp
= copy_to_reg (target
);
14325 /* target = A A B B */
14326 emit_insn (gen_vec_interleave_lowv4sf (target
, target
, target
));
14327 /* target = X A B B */
14328 ix86_expand_vector_set (false, target
, val
, 0);
14329 /* target = A X C D */
14330 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14331 const1_rtx
, const0_rtx
,
14332 GEN_INT (2+4), GEN_INT (3+4)));
14336 /* tmp = target = A B C D */
14337 tmp
= copy_to_reg (target
);
14338 /* tmp = X B C D */
14339 ix86_expand_vector_set (false, tmp
, val
, 0);
14340 /* target = A B X D */
14341 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14342 const0_rtx
, const1_rtx
,
14343 GEN_INT (0+4), GEN_INT (3+4)));
14347 /* tmp = target = A B C D */
14348 tmp
= copy_to_reg (target
);
14349 /* tmp = X B C D */
14350 ix86_expand_vector_set (false, tmp
, val
, 0);
14351 /* target = A B X D */
14352 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
14353 const0_rtx
, const1_rtx
,
14354 GEN_INT (2+4), GEN_INT (0+4)));
14358 gcc_unreachable ();
14363 use_vec_merge
= TARGET_SSE4_1
;
14367 /* Element 0 handled by vec_merge below. */
14370 use_vec_merge
= true;
14376 /* With SSE2, use integer shuffles to swap element 0 and ELT,
14377 store into element 0, then shuffle them back. */
14381 order
[0] = GEN_INT (elt
);
14382 order
[1] = const1_rtx
;
14383 order
[2] = const2_rtx
;
14384 order
[3] = GEN_INT (3);
14385 order
[elt
] = const0_rtx
;
14387 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14388 order
[1], order
[2], order
[3]));
14390 ix86_expand_vector_set (false, target
, val
, 0);
14392 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
14393 order
[1], order
[2], order
[3]));
14397 /* For SSE1, we have to reuse the V4SF code. */
14398 rtx t
= gen_reg_rtx (V4SFmode
);
14399 emit_move_insn (t
, gen_lowpart (V4SFmode
, target
));
14400 ix86_expand_vector_set (false, t
, gen_lowpart (SFmode
, val
), elt
);
14401 emit_move_insn (target
, gen_lowpart (mode
, t
));
14406 use_vec_merge
= TARGET_SSE2
;
14409 use_vec_merge
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
14413 use_vec_merge
= TARGET_SSE4_1
;
14417 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
14421 half_mode
= V16QImode
;
14427 half_mode
= V8HImode
;
14433 half_mode
= V4SImode
;
14439 half_mode
= V2DImode
;
14445 half_mode
= V4SFmode
;
14451 half_mode
= V2DFmode
;
14457 /* Compute offset. */
14461 gcc_assert (i
<= 1);
14463 /* Extract the half. */
14464 tmp
= gen_reg_rtx (half_mode
);
14465 emit_insn (gen_extract
[j
][i
] (tmp
, target
));
14467 /* Put val in tmp at elt. */
14468 ix86_expand_vector_set (false, tmp
, val
, elt
);
14471 emit_insn (gen_insert
[j
][i
] (target
, target
, tmp
));
14475 if (TARGET_AVX512F
)
14478 gen_blendm
= gen_avx512f_blendmv8df
;
14483 if (TARGET_AVX512F
)
14486 gen_blendm
= gen_avx512f_blendmv8di
;
14491 if (TARGET_AVX512F
)
14494 gen_blendm
= gen_avx512f_blendmv16sf
;
14499 if (TARGET_AVX512F
)
14502 gen_blendm
= gen_avx512f_blendmv16si
;
14507 if (TARGET_AVX512BW
)
14510 gen_blendm
= gen_avx512bw_blendmv32hi
;
14512 else if (TARGET_AVX512F
)
14514 half_mode
= E_V8HImode
;
14521 if (TARGET_AVX512BW
)
14524 gen_blendm
= gen_avx512bw_blendmv64qi
;
14526 else if (TARGET_AVX512F
)
14528 half_mode
= E_V16QImode
;
14535 /* Compute offset. */
14539 gcc_assert (i
<= 3);
14542 /* Extract the quarter. */
14543 tmp
= gen_reg_rtx (V4SImode
);
14544 rtx tmp2
= gen_lowpart (V16SImode
, target
);
14545 rtx mask
= gen_reg_rtx (QImode
);
14547 emit_move_insn (mask
, constm1_rtx
);
14548 emit_insn (gen_avx512f_vextracti32x4_mask (tmp
, tmp2
, GEN_INT (i
),
14551 tmp2
= gen_reg_rtx (half_mode
);
14552 emit_move_insn (tmp2
, gen_lowpart (half_mode
, tmp
));
14555 /* Put val in tmp at elt. */
14556 ix86_expand_vector_set (false, tmp
, val
, elt
);
14559 tmp2
= gen_reg_rtx (V16SImode
);
14560 rtx tmp3
= gen_lowpart (V16SImode
, target
);
14561 mask
= gen_reg_rtx (HImode
);
14562 emit_move_insn (mask
, constm1_rtx
);
14563 tmp
= gen_lowpart (V4SImode
, tmp
);
14564 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2
, tmp3
, tmp
, GEN_INT (i
),
14566 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
14574 if (mmode
!= VOIDmode
)
14576 tmp
= gen_reg_rtx (mode
);
14577 emit_insn (gen_rtx_SET (tmp
, gen_rtx_VEC_DUPLICATE (mode
, val
)));
14578 /* The avx512*_blendm<mode> expanders have different operand order
14579 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
14580 elements where the mask is set and second input operand otherwise,
14581 in {sse,avx}*_*blend* the first input operand is used for elements
14582 where the mask is clear and second input operand otherwise. */
14583 emit_insn (gen_blendm (target
, target
, tmp
,
14585 gen_int_mode (HOST_WIDE_INT_1U
<< elt
,
14588 else if (use_vec_merge
)
14591 tmp
= gen_rtx_VEC_DUPLICATE (mode
, val
);
14592 tmp
= gen_rtx_VEC_MERGE (mode
, tmp
, target
,
14593 GEN_INT (HOST_WIDE_INT_1U
<< elt
));
14594 emit_insn (gen_rtx_SET (target
, tmp
));
14598 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
14600 emit_move_insn (mem
, target
);
14602 tmp
= adjust_address (mem
, inner_mode
, elt
* GET_MODE_SIZE (inner_mode
));
14603 emit_move_insn (tmp
, val
);
14605 emit_move_insn (target
, mem
);
14610 ix86_expand_vector_extract (bool mmx_ok
, rtx target
, rtx vec
, int elt
)
14612 machine_mode mode
= GET_MODE (vec
);
14613 machine_mode inner_mode
= GET_MODE_INNER (mode
);
14614 bool use_vec_extr
= false;
14629 use_vec_extr
= true;
14633 use_vec_extr
= TARGET_SSE4_1
;
14645 tmp
= gen_reg_rtx (mode
);
14646 emit_insn (gen_sse_shufps_v4sf (tmp
, vec
, vec
,
14647 GEN_INT (elt
), GEN_INT (elt
),
14648 GEN_INT (elt
+4), GEN_INT (elt
+4)));
14652 tmp
= gen_reg_rtx (mode
);
14653 emit_insn (gen_vec_interleave_highv4sf (tmp
, vec
, vec
));
14657 gcc_unreachable ();
14660 use_vec_extr
= true;
14665 use_vec_extr
= TARGET_SSE4_1
;
14679 tmp
= gen_reg_rtx (mode
);
14680 emit_insn (gen_sse2_pshufd_1 (tmp
, vec
,
14681 GEN_INT (elt
), GEN_INT (elt
),
14682 GEN_INT (elt
), GEN_INT (elt
)));
14686 tmp
= gen_reg_rtx (mode
);
14687 emit_insn (gen_vec_interleave_highv4si (tmp
, vec
, vec
));
14691 gcc_unreachable ();
14694 use_vec_extr
= true;
14699 /* For SSE1, we have to reuse the V4SF code. */
14700 ix86_expand_vector_extract (false, gen_lowpart (SFmode
, target
),
14701 gen_lowpart (V4SFmode
, vec
), elt
);
14707 use_vec_extr
= TARGET_SSE2
;
14710 use_vec_extr
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
14714 use_vec_extr
= TARGET_SSE4_1
;
14718 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC
))
14720 tmp
= gen_reg_rtx (SImode
);
14721 ix86_expand_vector_extract (false, tmp
, gen_lowpart (V4SImode
, vec
),
14723 emit_insn (gen_rtx_SET (target
, gen_lowpart (QImode
, tmp
)));
14731 tmp
= gen_reg_rtx (V4SFmode
);
14733 emit_insn (gen_vec_extract_lo_v8sf (tmp
, vec
));
14735 emit_insn (gen_vec_extract_hi_v8sf (tmp
, vec
));
14736 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14744 tmp
= gen_reg_rtx (V2DFmode
);
14746 emit_insn (gen_vec_extract_lo_v4df (tmp
, vec
));
14748 emit_insn (gen_vec_extract_hi_v4df (tmp
, vec
));
14749 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
14757 tmp
= gen_reg_rtx (V16QImode
);
14759 emit_insn (gen_vec_extract_lo_v32qi (tmp
, vec
));
14761 emit_insn (gen_vec_extract_hi_v32qi (tmp
, vec
));
14762 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
14770 tmp
= gen_reg_rtx (V8HImode
);
14772 emit_insn (gen_vec_extract_lo_v16hi (tmp
, vec
));
14774 emit_insn (gen_vec_extract_hi_v16hi (tmp
, vec
));
14775 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
14783 tmp
= gen_reg_rtx (V4SImode
);
14785 emit_insn (gen_vec_extract_lo_v8si (tmp
, vec
));
14787 emit_insn (gen_vec_extract_hi_v8si (tmp
, vec
));
14788 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14796 tmp
= gen_reg_rtx (V2DImode
);
14798 emit_insn (gen_vec_extract_lo_v4di (tmp
, vec
));
14800 emit_insn (gen_vec_extract_hi_v4di (tmp
, vec
));
14801 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
14807 if (TARGET_AVX512BW
)
14809 tmp
= gen_reg_rtx (V16HImode
);
14811 emit_insn (gen_vec_extract_lo_v32hi (tmp
, vec
));
14813 emit_insn (gen_vec_extract_hi_v32hi (tmp
, vec
));
14814 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
14820 if (TARGET_AVX512BW
)
14822 tmp
= gen_reg_rtx (V32QImode
);
14824 emit_insn (gen_vec_extract_lo_v64qi (tmp
, vec
));
14826 emit_insn (gen_vec_extract_hi_v64qi (tmp
, vec
));
14827 ix86_expand_vector_extract (false, target
, tmp
, elt
& 31);
14833 tmp
= gen_reg_rtx (V8SFmode
);
14835 emit_insn (gen_vec_extract_lo_v16sf (tmp
, vec
));
14837 emit_insn (gen_vec_extract_hi_v16sf (tmp
, vec
));
14838 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
14842 tmp
= gen_reg_rtx (V4DFmode
);
14844 emit_insn (gen_vec_extract_lo_v8df (tmp
, vec
));
14846 emit_insn (gen_vec_extract_hi_v8df (tmp
, vec
));
14847 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14851 tmp
= gen_reg_rtx (V8SImode
);
14853 emit_insn (gen_vec_extract_lo_v16si (tmp
, vec
));
14855 emit_insn (gen_vec_extract_hi_v16si (tmp
, vec
));
14856 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
14860 tmp
= gen_reg_rtx (V4DImode
);
14862 emit_insn (gen_vec_extract_lo_v8di (tmp
, vec
));
14864 emit_insn (gen_vec_extract_hi_v8di (tmp
, vec
));
14865 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
14869 /* ??? Could extract the appropriate HImode element and shift. */
14876 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (elt
)));
14877 tmp
= gen_rtx_VEC_SELECT (inner_mode
, vec
, tmp
);
14879 /* Let the rtl optimizers know about the zero extension performed. */
14880 if (inner_mode
== QImode
|| inner_mode
== HImode
)
14882 tmp
= gen_rtx_ZERO_EXTEND (SImode
, tmp
);
14883 target
= gen_lowpart (SImode
, target
);
14886 emit_insn (gen_rtx_SET (target
, tmp
));
14890 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
14892 emit_move_insn (mem
, vec
);
14894 tmp
= adjust_address (mem
, inner_mode
, elt
*GET_MODE_SIZE (inner_mode
));
14895 emit_move_insn (target
, tmp
);
14899 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
14900 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
14901 The upper bits of DEST are undefined, though they shouldn't cause
14902 exceptions (some bits from src or all zeros are ok). */
14905 emit_reduc_half (rtx dest
, rtx src
, int i
)
14908 switch (GET_MODE (src
))
14912 tem
= gen_sse_movhlps (dest
, src
, src
);
14914 tem
= gen_sse_shufps_v4sf (dest
, src
, src
, const1_rtx
, const1_rtx
,
14915 GEN_INT (1 + 4), GEN_INT (1 + 4));
14918 tem
= gen_vec_interleave_highv2df (dest
, src
, src
);
14924 d
= gen_reg_rtx (V1TImode
);
14925 tem
= gen_sse2_lshrv1ti3 (d
, gen_lowpart (V1TImode
, src
),
14930 tem
= gen_avx_vperm2f128v8sf3 (dest
, src
, src
, const1_rtx
);
14932 tem
= gen_avx_shufps256 (dest
, src
, src
,
14933 GEN_INT (i
== 128 ? 2 + (3 << 2) : 1));
14937 tem
= gen_avx_vperm2f128v4df3 (dest
, src
, src
, const1_rtx
);
14939 tem
= gen_avx_shufpd256 (dest
, src
, src
, const1_rtx
);
14947 if (GET_MODE (dest
) != V4DImode
)
14948 d
= gen_reg_rtx (V4DImode
);
14949 tem
= gen_avx2_permv2ti (d
, gen_lowpart (V4DImode
, src
),
14950 gen_lowpart (V4DImode
, src
),
14955 d
= gen_reg_rtx (V2TImode
);
14956 tem
= gen_avx2_lshrv2ti3 (d
, gen_lowpart (V2TImode
, src
),
14967 tem
= gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode
, dest
),
14968 gen_lowpart (V16SImode
, src
),
14969 gen_lowpart (V16SImode
, src
),
14970 GEN_INT (0x4 + (i
== 512 ? 4 : 0)),
14971 GEN_INT (0x5 + (i
== 512 ? 4 : 0)),
14972 GEN_INT (0x6 + (i
== 512 ? 4 : 0)),
14973 GEN_INT (0x7 + (i
== 512 ? 4 : 0)),
14974 GEN_INT (0xC), GEN_INT (0xD),
14975 GEN_INT (0xE), GEN_INT (0xF),
14976 GEN_INT (0x10), GEN_INT (0x11),
14977 GEN_INT (0x12), GEN_INT (0x13),
14978 GEN_INT (0x14), GEN_INT (0x15),
14979 GEN_INT (0x16), GEN_INT (0x17));
14981 tem
= gen_avx512f_pshufd_1 (gen_lowpart (V16SImode
, dest
),
14982 gen_lowpart (V16SImode
, src
),
14983 GEN_INT (i
== 128 ? 0x2 : 0x1),
14987 GEN_INT (i
== 128 ? 0x6 : 0x5),
14991 GEN_INT (i
== 128 ? 0xA : 0x9),
14995 GEN_INT (i
== 128 ? 0xE : 0xD),
15001 gcc_unreachable ();
15005 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
15008 /* Expand a vector reduction. FN is the binary pattern to reduce;
15009 DEST is the destination; IN is the input vector. */
15012 ix86_expand_reduc (rtx (*fn
) (rtx
, rtx
, rtx
), rtx dest
, rtx in
)
15014 rtx half
, dst
, vec
= in
;
15015 machine_mode mode
= GET_MODE (in
);
15018 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
15020 && mode
== V8HImode
15021 && fn
== gen_uminv8hi3
)
15023 emit_insn (gen_sse4_1_phminposuw (dest
, in
));
15027 for (i
= GET_MODE_BITSIZE (mode
);
15028 i
> GET_MODE_UNIT_BITSIZE (mode
);
15031 half
= gen_reg_rtx (mode
);
15032 emit_reduc_half (half
, vec
, i
);
15033 if (i
== GET_MODE_UNIT_BITSIZE (mode
) * 2)
15036 dst
= gen_reg_rtx (mode
);
15037 emit_insn (fn (dst
, half
, vec
));
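  /* The loop above halves the problem each time, so a vector of N elements
     needs log2(N) applications of FN.  A plain C sketch of the idea
     (illustrative only), e.g. a maximum reduction over 8 ints:

       int reduce_max8 (int v[8])
       {
	 int t4[4], t2[2];
	 for (int i = 0; i < 4; i++) t4[i] = v[i] > v[i + 4] ? v[i] : v[i + 4];
	 for (int i = 0; i < 2; i++) t2[i] = t4[i] > t4[i + 2] ? t4[i] : t4[i + 2];
	 return t2[0] > t2[1] ? t2[0] : t2[1];
       }

     emit_reduc_half supplies the "upper half moved down" operand at each
     step and FN combines it element-wise with the running vector; the
     result is left in element 0.  */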
15042 /* Output code to perform a conditional jump to LABEL, if C2 flag in
15043 FP status register is set. */
15046 ix86_emit_fp_unordered_jump (rtx label
)
15048 rtx reg
= gen_reg_rtx (HImode
);
15052 emit_insn (gen_x86_fnstsw_1 (reg
));
15054 if (TARGET_SAHF
&& (TARGET_USE_SAHF
|| optimize_insn_for_size_p ()))
15056 emit_insn (gen_x86_sahf_1 (reg
));
15058 temp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
15059 temp
= gen_rtx_UNORDERED (VOIDmode
, temp
, const0_rtx
);
15063 emit_insn (gen_testqi_ext_1_ccno (reg
, GEN_INT (0x04)));
15065 temp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15066 temp
= gen_rtx_NE (VOIDmode
, temp
, const0_rtx
);
15069 temp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, temp
,
15070 gen_rtx_LABEL_REF (VOIDmode
, label
),
15072 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, temp
));
15073 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
15074 JUMP_LABEL (insn
) = label
;
/* Output code to perform a sinh XFmode calculation.  */
15079 void ix86_emit_i387_sinh (rtx op0
, rtx op1
)
15081 rtx e1
= gen_reg_rtx (XFmode
);
15082 rtx e2
= gen_reg_rtx (XFmode
);
15083 rtx scratch
= gen_reg_rtx (HImode
);
15084 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15085 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15087 rtx_code_label
*jump_label
= gen_label_rtx ();
15090 /* scratch = fxam (op1) */
15091 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15093 /* e1 = expm1 (|op1|) */
15094 emit_insn (gen_absxf2 (e2
, op1
));
15095 emit_insn (gen_expm1xf2 (e1
, e2
));
15097 /* e2 = e1 / (e1 + 1.0) + e1 */
15098 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15099 emit_insn (gen_addxf3 (e2
, e1
, cst1
));
15100 emit_insn (gen_divxf3 (e2
, e1
, e2
));
15101 emit_insn (gen_addxf3 (e2
, e2
, e1
));
15103 /* flags = signbit (op1) */
15104 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15106 /* if (flags) then e2 = -e2 */
15107 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15108 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
15109 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15111 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15112 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15113 JUMP_LABEL (insn
) = jump_label
;
15115 emit_insn (gen_negxf2 (e2
, e2
));
15117 emit_label (jump_label
);
15118 LABEL_NUSES (jump_label
) = 1;
15120 /* op0 = 0.5 * e2 */
15121 half
= force_reg (XFmode
, half
);
15122 emit_insn (gen_mulxf3 (op0
, e2
, half
));
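/* For reference, the identity used above (plain C sketch, illustrative
   only; expm1l/fabsl/copysignl are the C library functions):

     long double sinh_ref (long double x)
     {
       long double e1 = expm1l (fabsl (x));      // e^|x| - 1
       long double e2 = e1 / (e1 + 1.0L) + e1;   // e^|x| - e^-|x|
       return copysignl (0.5L * e2, x);
     }

   Since e1 / (e1 + 1) == 1 - e^-|x|, adding e1 gives e^|x| - e^-|x|;
   halving and restoring the sign of the argument yields sinh.  Going
   through expm1 keeps precision for small |x|, where e^|x| is near 1.  */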
/* Output code to perform a cosh XFmode calculation.  */

void ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
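/* For reference (plain C sketch, illustrative only):

     long double cosh_ref (long double x)
     {
       long double e1 = expl (x);
       return 0.5L * (e1 + 1.0L / e1);   // (e^x + e^-x) / 2
     }

   cosh is even, so unlike sinh no sign fixup is needed here.  */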
/* Output code to perform a tanh XFmode calculation.  */
15149 void ix86_emit_i387_tanh (rtx op0
, rtx op1
)
15151 rtx e1
= gen_reg_rtx (XFmode
);
15152 rtx e2
= gen_reg_rtx (XFmode
);
15153 rtx scratch
= gen_reg_rtx (HImode
);
15154 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15156 rtx_code_label
*jump_label
= gen_label_rtx ();
15159 /* scratch = fxam (op1) */
15160 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15162 /* e1 = expm1 (-|2 * op1|) */
15163 emit_insn (gen_addxf3 (e2
, op1
, op1
));
15164 emit_insn (gen_absxf2 (e2
, e2
));
15165 emit_insn (gen_negxf2 (e2
, e2
));
15166 emit_insn (gen_expm1xf2 (e1
, e2
));
15168 /* e2 = e1 / (e1 + 2.0) */
15169 cst2
= force_reg (XFmode
, CONST2_RTX (XFmode
));
15170 emit_insn (gen_addxf3 (e2
, e1
, cst2
));
15171 emit_insn (gen_divxf3 (e2
, e1
, e2
));
15173 /* flags = signbit (op1) */
15174 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15176 /* if (!flags) then e2 = -e2 */
15177 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15178 gen_rtx_NE (VOIDmode
, flags
, const0_rtx
),
15179 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15181 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15182 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15183 JUMP_LABEL (insn
) = jump_label
;
15185 emit_insn (gen_negxf2 (e2
, e2
));
15187 emit_label (jump_label
);
15188 LABEL_NUSES (jump_label
) = 1;
15190 emit_move_insn (op0
, e2
);
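/* For reference, the identity used above (plain C sketch, illustrative
   only):

     long double tanh_ref (long double x)
     {
       long double e1 = expm1l (-fabsl (x + x));   // e^-2|x| - 1
       long double e2 = e1 / (e1 + 2.0L);          // == -tanh (|x|)
       return x >= 0.0L ? -e2 : e2;
     }

   e1 / (e1 + 2) == (e^-2|x| - 1) / (e^-2|x| + 1) == -tanh (|x|), which is
   why the result is negated when OP1 is positive (signbit clear).  */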
15193 /* Output code to perform an asinh XFmode calculation. */
15195 void ix86_emit_i387_asinh (rtx op0
, rtx op1
)
15197 rtx e1
= gen_reg_rtx (XFmode
);
15198 rtx e2
= gen_reg_rtx (XFmode
);
15199 rtx scratch
= gen_reg_rtx (HImode
);
15200 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15202 rtx_code_label
*jump_label
= gen_label_rtx ();
15205 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
15206 emit_insn (gen_mulxf3 (e1
, op1
, op1
));
15207 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15208 emit_insn (gen_addxf3 (e2
, e1
, cst1
));
15209 emit_insn (gen_sqrtxf2 (e2
, e2
));
15210 emit_insn (gen_addxf3 (e2
, e2
, cst1
));
15213 emit_insn (gen_divxf3 (e1
, e1
, e2
));
15215 /* scratch = fxam (op1) */
15216 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15218 /* e1 = e1 + |op1| */
15219 emit_insn (gen_absxf2 (e2
, op1
));
15220 emit_insn (gen_addxf3 (e1
, e1
, e2
));
15222 /* e2 = log1p (e1) */
15223 ix86_emit_i387_log1p (e2
, e1
);
15225 /* flags = signbit (op1) */
15226 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15228 /* if (flags) then e2 = -e2 */
15229 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15230 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
15231 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15233 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15234 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15235 JUMP_LABEL (insn
) = jump_label
;
15237 emit_insn (gen_negxf2 (e2
, e2
));
15239 emit_label (jump_label
);
15240 LABEL_NUSES (jump_label
) = 1;
15242 emit_move_insn (op0
, e2
);
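/* For reference, the identity used above (plain C sketch, illustrative
   only):

     long double asinh_ref (long double x)
     {
       long double ax = fabsl (x);
       long double t  = (ax * ax) / (sqrtl (ax * ax + 1.0L) + 1.0L);
       return copysignl (log1pl (t + ax), x);
     }

   t equals sqrt (x*x + 1) - 1, so t + ax is |x| + sqrt (x*x + 1) - 1 and
   log1p of it is log (|x| + sqrt (x*x + 1)) == asinh (|x|); the sign of
   the argument is then restored.  Formulating it through log1p avoids
   cancellation for small |x|.  */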
15245 /* Output code to perform an acosh XFmode calculation. */
15247 void ix86_emit_i387_acosh (rtx op0
, rtx op1
)
15249 rtx e1
= gen_reg_rtx (XFmode
);
15250 rtx e2
= gen_reg_rtx (XFmode
);
15251 rtx cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15253 /* e2 = sqrt (op1 + 1.0) */
15254 emit_insn (gen_addxf3 (e2
, op1
, cst1
));
15255 emit_insn (gen_sqrtxf2 (e2
, e2
));
15257 /* e1 = sqrt (op1 - 1.0) */
15258 emit_insn (gen_subxf3 (e1
, op1
, cst1
));
15259 emit_insn (gen_sqrtxf2 (e1
, e1
));
15262 emit_insn (gen_mulxf3 (e1
, e1
, e2
));
15264 /* e1 = e1 + op1 */
15265 emit_insn (gen_addxf3 (e1
, e1
, op1
));
15267 /* op0 = log (e1) */
15268 emit_insn (gen_logxf2 (op0
, e1
));
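/* For reference (plain C sketch, illustrative only):

     long double acosh_ref (long double x)
     {
       return logl (x + sqrtl (x + 1.0L) * sqrtl (x - 1.0L));
     }

   i.e. log (x + sqrt (x*x - 1)), with the square root split into
   sqrt (x+1) * sqrt (x-1) exactly as in the expansion above.  */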
15271 /* Output code to perform an atanh XFmode calculation. */
15273 void ix86_emit_i387_atanh (rtx op0
, rtx op1
)
15275 rtx e1
= gen_reg_rtx (XFmode
);
15276 rtx e2
= gen_reg_rtx (XFmode
);
15277 rtx scratch
= gen_reg_rtx (HImode
);
15278 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15279 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15281 rtx_code_label
*jump_label
= gen_label_rtx ();
15284 /* scratch = fxam (op1) */
15285 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15288 emit_insn (gen_absxf2 (e2
, op1
));
15290 /* e1 = -(e2 + e2) / (e2 + 1.0) */
15291 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15292 emit_insn (gen_addxf3 (e1
, e2
, cst1
));
15293 emit_insn (gen_addxf3 (e2
, e2
, e2
));
15294 emit_insn (gen_negxf2 (e2
, e2
));
15295 emit_insn (gen_divxf3 (e1
, e2
, e1
));
15297 /* e2 = log1p (e1) */
15298 ix86_emit_i387_log1p (e2
, e1
);
15300 /* flags = signbit (op1) */
15301 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15303 /* if (!flags) then e2 = -e2 */
15304 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15305 gen_rtx_NE (VOIDmode
, flags
, const0_rtx
),
15306 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15308 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15309 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15310 JUMP_LABEL (insn
) = jump_label
;
15312 emit_insn (gen_negxf2 (e2
, e2
));
15314 emit_label (jump_label
);
15315 LABEL_NUSES (jump_label
) = 1;
15317 /* op0 = 0.5 * e2 */
15318 half
= force_reg (XFmode
, half
);
15319 emit_insn (gen_mulxf3 (op0
, e2
, half
));
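/* For reference, the identity used above (plain C sketch, illustrative
   only):

     long double atanh_ref (long double x)
     {
       long double ax = fabsl (x);
       long double e  = log1pl (-(ax + ax) / (ax + 1.0L));
       return x >= 0.0L ? -0.5L * e : 0.5L * e;
     }

   log1p (-2*ax / (1 + ax)) == log ((1 - ax) / (1 + ax)) == -2 * atanh (ax),
   so halving and flipping the sign for positive arguments gives atanh.  */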
15322 /* Output code to perform a log1p XFmode calculation. */
15324 void ix86_emit_i387_log1p (rtx op0
, rtx op1
)
15326 rtx_code_label
*label1
= gen_label_rtx ();
15327 rtx_code_label
*label2
= gen_label_rtx ();
15329 rtx tmp
= gen_reg_rtx (XFmode
);
15330 rtx res
= gen_reg_rtx (XFmode
);
15331 rtx cst
, cstln2
, cst1
;
15334 cst
= const_double_from_real_value
15335 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode
), XFmode
);
15336 cstln2
= force_reg (XFmode
, standard_80387_constant_rtx (4)); /* fldln2 */
15338 emit_insn (gen_absxf2 (tmp
, op1
));
15340 cst
= force_reg (XFmode
, cst
);
15341 ix86_expand_branch (GE
, tmp
, cst
, label1
);
15342 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
15343 insn
= get_last_insn ();
15344 JUMP_LABEL (insn
) = label1
;
15346 emit_insn (gen_fyl2xp1xf3_i387 (res
, op1
, cstln2
));
15347 emit_jump (label2
);
15349 emit_label (label1
);
15350 LABEL_NUSES (label1
) = 1;
15352 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
15353 emit_insn (gen_rtx_SET (tmp
, gen_rtx_PLUS (XFmode
, op1
, cst1
)));
15354 emit_insn (gen_fyl2xxf3_i387 (res
, tmp
, cstln2
));
15356 emit_label (label2
);
15357 LABEL_NUSES (label2
) = 1;
15359 emit_move_insn (op0
, res
);
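/* For reference (plain C sketch, illustrative only): the i387 fyl2xp1
   instruction computes y * log2 (1 + x) but is only specified for
   |x| < 1 - sqrt(1/2) ~= 0.29289..., which is the threshold tested above.

     long double log1p_ref (long double x)
     {
       const long double ln2 = 0.6931471805599453094L;   // fldln2
       if (fabsl (x) >= 0.29289321881345247561810596348408353L)
	 return log2l (x + 1.0L) * ln2;   // fyl2x on (x + 1)
       else
	 return log2l (1.0L + x) * ln2;   // fyl2xp1 path: computed from x
					  // without explicitly forming 1 + x
     }

   Taking the fyl2xp1 path for small |x| avoids the cancellation that
   forming 1 + x would introduce.  */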
15362 /* Emit code for round calculation. */
15363 void ix86_emit_i387_round (rtx op0
, rtx op1
)
15365 machine_mode inmode
= GET_MODE (op1
);
15366 machine_mode outmode
= GET_MODE (op0
);
15367 rtx e1
= gen_reg_rtx (XFmode
);
15368 rtx e2
= gen_reg_rtx (XFmode
);
15369 rtx scratch
= gen_reg_rtx (HImode
);
15370 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
15371 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
15372 rtx res
= gen_reg_rtx (outmode
);
15373 rtx_code_label
*jump_label
= gen_label_rtx ();
15374 rtx (*floor_insn
) (rtx
, rtx
);
15375 rtx (*neg_insn
) (rtx
, rtx
);
15383 tmp
= gen_reg_rtx (XFmode
);
15385 emit_insn (gen_rtx_SET (tmp
, gen_rtx_FLOAT_EXTEND (XFmode
, op1
)));
15391 gcc_unreachable ();
15397 floor_insn
= gen_frndintxf2_floor
;
15398 neg_insn
= gen_negsf2
;
15401 floor_insn
= gen_frndintxf2_floor
;
15402 neg_insn
= gen_negdf2
;
15405 floor_insn
= gen_frndintxf2_floor
;
15406 neg_insn
= gen_negxf2
;
15409 floor_insn
= gen_lfloorxfhi2
;
15410 neg_insn
= gen_neghi2
;
15413 floor_insn
= gen_lfloorxfsi2
;
15414 neg_insn
= gen_negsi2
;
15417 floor_insn
= gen_lfloorxfdi2
;
15418 neg_insn
= gen_negdi2
;
15421 gcc_unreachable ();
15424 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
15426 /* scratch = fxam(op1) */
15427 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
15429 /* e1 = fabs(op1) */
15430 emit_insn (gen_absxf2 (e1
, op1
));
15432 /* e2 = e1 + 0.5 */
15433 half
= force_reg (XFmode
, half
);
15434 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (XFmode
, e1
, half
)));
15436 /* res = floor(e2) */
15442 tmp
= gen_reg_rtx (XFmode
);
15444 emit_insn (floor_insn (tmp
, e2
));
15445 emit_insn (gen_rtx_SET (res
,
15446 gen_rtx_UNSPEC (outmode
, gen_rtvec (1, tmp
),
15447 UNSPEC_TRUNC_NOOP
)));
15451 emit_insn (floor_insn (res
, e2
));
15454 /* flags = signbit(a) */
15455 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
15457 /* if (flags) then res = -res */
15458 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
15459 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
15460 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
15462 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15463 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
15464 JUMP_LABEL (insn
) = jump_label
;
15466 emit_insn (neg_insn (res
, res
));
15468 emit_label (jump_label
);
15469 LABEL_NUSES (jump_label
) = 1;
15471 emit_move_insn (op0
, res
);
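/* For reference (plain C sketch, illustrative only):

     long double round_ref (long double a)
     {
       long double r = floorl (fabsl (a) + 0.5L);
       return copysignl (r, a);
     }

   i.e. round-half-away-from-zero, matching the C round/lround family
   rather than the current x87 rounding mode.  */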
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
15477 void ix86_emit_swdivsf (rtx res
, rtx a
, rtx b
, machine_mode mode
)
15479 rtx x0
, x1
, e0
, e1
;
15481 x0
= gen_reg_rtx (mode
);
15482 e0
= gen_reg_rtx (mode
);
15483 e1
= gen_reg_rtx (mode
);
15484 x1
= gen_reg_rtx (mode
);
15486 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
15488 b
= force_reg (mode
, b
);
15490 /* x0 = rcp(b) estimate */
15491 if (mode
== V16SFmode
|| mode
== V8DFmode
)
15493 if (TARGET_AVX512ER
)
15495 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
15498 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, a
, x0
)));
15502 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
15506 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
15510 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, b
)));
15513 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, e0
)));
15516 emit_insn (gen_rtx_SET (e1
, gen_rtx_PLUS (mode
, x0
, x0
)));
15519 emit_insn (gen_rtx_SET (x1
, gen_rtx_MINUS (mode
, e1
, e0
)));
15522 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, a
, x1
)));
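/* For reference (plain C sketch, illustrative only): one Newton-Raphson
   step refines the hardware reciprocal estimate rcp(b) ~= 1/b, roughly
   doubling its number of correct bits, before the final multiply by a:

     float swdiv_ref (float a, float b, float rcp_b)    // rcp_b from rcpps
     {
       float x1 = (rcp_b + rcp_b) - b * rcp_b * rcp_b;  // x1 = x0*(2 - b*x0)
       return a * x1;
     }

   The expansion above keeps that association, computing e0 = b*x0*x0 and
   e1 = x0 + x0 separately and then subtracting.  */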
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
15528 void ix86_emit_swsqrtsf (rtx res
, rtx a
, machine_mode mode
, bool recip
)
15530 rtx x0
, e0
, e1
, e2
, e3
, mthree
, mhalf
;
15534 x0
= gen_reg_rtx (mode
);
15535 e0
= gen_reg_rtx (mode
);
15536 e1
= gen_reg_rtx (mode
);
15537 e2
= gen_reg_rtx (mode
);
15538 e3
= gen_reg_rtx (mode
);
15540 if (TARGET_AVX512ER
&& mode
== V16SFmode
)
15543 /* res = rsqrt28(a) estimate */
15544 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
15548 /* x0 = rsqrt28(a) estimate */
15549 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
15551 /* res = rcp28(x0) estimate */
15552 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, x0
),
15558 real_from_integer (&r
, VOIDmode
, -3, SIGNED
);
15559 mthree
= const_double_from_real_value (r
, SFmode
);
15561 real_arithmetic (&r
, NEGATE_EXPR
, &dconsthalf
, NULL
);
15562 mhalf
= const_double_from_real_value (r
, SFmode
);
15563 unspec
= UNSPEC_RSQRT
;
15565 if (VECTOR_MODE_P (mode
))
15567 mthree
= ix86_build_const_vector (mode
, true, mthree
);
15568 mhalf
= ix86_build_const_vector (mode
, true, mhalf
);
15569 /* There is no 512-bit rsqrt. There is however rsqrt14. */
15570 if (GET_MODE_SIZE (mode
) == 64)
15571 unspec
= UNSPEC_RSQRT14
;
15574 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
15575 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
15577 a
= force_reg (mode
, a
);
15579 /* x0 = rsqrt(a) estimate */
15580 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
15583 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
15586 rtx zero
= force_reg (mode
, CONST0_RTX(mode
));
15589 /* Handle masked compare. */
15590 if (VECTOR_MODE_P (mode
) && GET_MODE_SIZE (mode
) == 64)
15592 mask
= gen_reg_rtx (HImode
);
15593 /* Imm value 0x4 corresponds to not-equal comparison. */
15594 emit_insn (gen_avx512f_cmpv16sf3 (mask
, zero
, a
, GEN_INT (0x4)));
15595 emit_insn (gen_avx512f_blendmv16sf (x0
, zero
, x0
, mask
));
15599 mask
= gen_reg_rtx (mode
);
15600 emit_insn (gen_rtx_SET (mask
, gen_rtx_NE (mode
, zero
, a
)));
15601 emit_insn (gen_rtx_SET (x0
, gen_rtx_AND (mode
, x0
, mask
)));
15606 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, a
)));
15608 emit_insn (gen_rtx_SET (e1
, gen_rtx_MULT (mode
, e0
, x0
)));
15611 mthree
= force_reg (mode
, mthree
);
15612 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (mode
, e1
, mthree
)));
15614 mhalf
= force_reg (mode
, mhalf
);
15616 /* e3 = -.5 * x0 */
15617 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, x0
, mhalf
)));
15619 /* e3 = -.5 * e0 */
15620 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, e0
, mhalf
)));
15621 /* ret = e2 * e3 */
15622 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, e2
, e3
)));
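/* For reference (plain C sketch, illustrative only): one Newton-Raphson
   step for the reciprocal square root estimate x0 ~= 1/sqrt(a) is
   x1 = 0.5 * x0 * (3 - a*x0*x0), written here with the -0.5 / -3 constants
   used above:

     float rsqrt_ref (float a, float x0)   // x0 from rsqrtps
     {
       float e2 = a * x0 * x0 - 3.0f;      // a*x0^2 - 3
       return (-0.5f * x0) * e2;           // rsqrt: -.5 * x0 * e2
     }

     float sqrt_ref (float a, float x0)
     {
       float e0 = a * x0;                  // ~ sqrt(a)
       float e2 = e0 * x0 - 3.0f;
       return (-0.5f * e0) * e2;           // sqrt: -.5 * (a*x0) * e2
     }

   The a == 0 masking above only exists to keep sqrt(0) from producing a
   NaN through 0 * inf, since rsqrt(0) is infinity.  */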
15625 /* Expand fabs (OP0) and return a new rtx that holds the result. The
15626 mask for masking out the sign-bit is stored in *SMASK, if that is
15630 ix86_expand_sse_fabs (rtx op0
, rtx
*smask
)
15632 machine_mode vmode
, mode
= GET_MODE (op0
);
15635 xa
= gen_reg_rtx (mode
);
15636 if (mode
== SFmode
)
15638 else if (mode
== DFmode
)
15642 mask
= ix86_build_signbit_mask (vmode
, VECTOR_MODE_P (mode
), true);
15643 if (!VECTOR_MODE_P (mode
))
15645 /* We need to generate a scalar mode mask in this case. */
15646 rtx tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, const0_rtx
));
15647 tmp
= gen_rtx_VEC_SELECT (mode
, mask
, tmp
);
15648 mask
= gen_reg_rtx (mode
);
15649 emit_insn (gen_rtx_SET (mask
, tmp
));
15651 emit_insn (gen_rtx_SET (xa
, gen_rtx_AND (mode
, op0
, mask
)));
15659 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
15660 swapping the operands if SWAP_OPERANDS is true. The expanded
15661 code is a forward jump to a newly created label in case the
15662 comparison is true. The generated label rtx is returned. */
15663 static rtx_code_label
*
15664 ix86_expand_sse_compare_and_jump (enum rtx_code code
, rtx op0
, rtx op1
,
15665 bool swap_operands
)
15667 bool unordered_compare
= ix86_unordered_fp_compare (code
);
15668 rtx_code_label
*label
;
15672 std::swap (op0
, op1
);
15674 label
= gen_label_rtx ();
15675 tmp
= gen_rtx_COMPARE (CCFPmode
, op0
, op1
);
15676 if (unordered_compare
)
15677 tmp
= gen_rtx_UNSPEC (CCFPmode
, gen_rtvec (1, tmp
), UNSPEC_NOTRAP
);
15678 reg
= gen_rtx_REG (CCFPmode
, FLAGS_REG
);
15679 emit_insn (gen_rtx_SET (reg
, tmp
));
15680 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, reg
, const0_rtx
);
15681 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
15682 gen_rtx_LABEL_REF (VOIDmode
, label
), pc_rtx
);
15683 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
15684 JUMP_LABEL (tmp
) = label
;
15689 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
15690 using comparison code CODE. Operands are swapped for the comparison if
15691 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
15693 ix86_expand_sse_compare_mask (enum rtx_code code
, rtx op0
, rtx op1
,
15694 bool swap_operands
)
15696 rtx (*insn
)(rtx
, rtx
, rtx
, rtx
);
15697 machine_mode mode
= GET_MODE (op0
);
15698 rtx mask
= gen_reg_rtx (mode
);
15701 std::swap (op0
, op1
);
15703 insn
= mode
== DFmode
? gen_setcc_df_sse
: gen_setcc_sf_sse
;
15705 emit_insn (insn (mask
, op0
, op1
,
15706 gen_rtx_fmt_ee (code
, mode
, op0
, op1
)));
15710 /* Expand copysign from SIGN to the positive value ABS_VALUE
15711 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
15715 ix86_sse_copysign_to_positive (rtx result
, rtx abs_value
, rtx sign
, rtx mask
)
15717 machine_mode mode
= GET_MODE (sign
);
15718 rtx sgn
= gen_reg_rtx (mode
);
15719 if (mask
== NULL_RTX
)
15721 machine_mode vmode
;
15723 if (mode
== SFmode
)
15725 else if (mode
== DFmode
)
15730 mask
= ix86_build_signbit_mask (vmode
, VECTOR_MODE_P (mode
), false);
15731 if (!VECTOR_MODE_P (mode
))
15733 /* We need to generate a scalar mode mask in this case. */
15734 rtx tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, const0_rtx
));
15735 tmp
= gen_rtx_VEC_SELECT (mode
, mask
, tmp
);
15736 mask
= gen_reg_rtx (mode
);
15737 emit_insn (gen_rtx_SET (mask
, tmp
));
15741 mask
= gen_rtx_NOT (mode
, mask
);
15742 emit_insn (gen_rtx_SET (sgn
, gen_rtx_AND (mode
, mask
, sign
)));
15743 emit_insn (gen_rtx_SET (result
, gen_rtx_IOR (mode
, abs_value
, sgn
)));
15746 /* Expand SSE sequence for computing lround from OP1 storing
15750 ix86_expand_lround (rtx op0
, rtx op1
)
15752 /* C code for the stuff we're doing below:
15753 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
15756 machine_mode mode
= GET_MODE (op1
);
15757 const struct real_format
*fmt
;
15758 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
15761 /* load nextafter (0.5, 0.0) */
15762 fmt
= REAL_MODE_FORMAT (mode
);
15763 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
15764 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
15766 /* adj = copysign (0.5, op1) */
15767 adj
= force_reg (mode
, const_double_from_real_value (pred_half
, mode
));
15768 ix86_sse_copysign_to_positive (adj
, adj
, force_reg (mode
, op1
), NULL_RTX
);
15770 /* adj = op1 + adj */
15771 adj
= expand_simple_binop (mode
, PLUS
, adj
, op1
, NULL_RTX
, 0, OPTAB_DIRECT
);
15773 /* op0 = (imode)adj */
15774 expand_fix (op0
, adj
, 0);
15777 /* Expand SSE2 sequence for computing lround from OPERAND1 storing
15781 ix86_expand_lfloorceil (rtx op0
, rtx op1
, bool do_floor
)
15783 /* C code for the stuff we're doing below (for do_floor):
15785 xi -= (double)xi > op1 ? 1 : 0;
15788 machine_mode fmode
= GET_MODE (op1
);
15789 machine_mode imode
= GET_MODE (op0
);
15790 rtx ireg
, freg
, tmp
;
15791 rtx_code_label
*label
;
15793 /* reg = (long)op1 */
15794 ireg
= gen_reg_rtx (imode
);
15795 expand_fix (ireg
, op1
, 0);
15797 /* freg = (double)reg */
15798 freg
= gen_reg_rtx (fmode
);
15799 expand_float (freg
, ireg
, 0);
15801 /* ireg = (freg > op1) ? ireg - 1 : ireg */
15802 label
= ix86_expand_sse_compare_and_jump (UNLE
,
15803 freg
, op1
, !do_floor
);
15804 tmp
= expand_simple_binop (imode
, do_floor
? MINUS
: PLUS
,
15805 ireg
, const1_rtx
, NULL_RTX
, 0, OPTAB_DIRECT
);
15806 emit_move_insn (ireg
, tmp
);
15808 emit_label (label
);
15809 LABEL_NUSES (label
) = 1;
15811 emit_move_insn (op0
, ireg
);
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */

static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}
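
/* The expanders below use this constant for the classic add-and-subtract
   rounding trick: once |x| < 2**52 (DFmode; 2**23 for SFmode), the sum
   x + TWO52 has no fraction bits left, so

     y = (x + 0x1p52) - 0x1p52;

   leaves y equal to x rounded to an integer in the prevailing rounding
   mode, and the subtraction back is exact.  52 and 23 are the mantissa
   widths of double and float.  */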
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	two52 = 2**52;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, two52, mask;
  rtx_code_label *label;

  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  TWO52 = ix86_gen_TWO52 (mode);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  two52 = TWO52;
  if (flag_rounding_math)
    {
      two52 = gen_reg_rtx (mode);
      ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);
     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;
	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (!do_floor && HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
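
/* The UNGT compare mask together with the AND against 1.0 is how the
   "x2 -= 1" / "x2 += 1" compensation from the C sketch above is done
   branchlessly: the mask is all-ones exactly when compensation is needed,
   and AND-ing an all-ones pattern with the bits of 1.0 yields 1.0, so the
   final expand_simple_binop subtracts (floor) or adds (ceil) either 0.0
   or 1.0.  */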
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0.  Sequence that works without relying on DImode truncation
   via cvttsd2siq that is only available on 64bit targets.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
	if (!isless (xa, TWO52))
	  return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
     Compensate.
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* Compensate.  */
  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
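
/* Here dxa = xa2 - xa is the error of the TWO52 trick, which rounds to
   nearest; the two mask-and-add steps nudge xa2 so that halfway values end
   up rounded away from zero, as round () requires, rather than to even
   (presumably the reason both directions are compensated).  */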
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* x = (double)(long)x */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, res, 0);
  expand_float (res, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing trunc from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
  machine_mode mode = GET_MODE (operand0);
  rtx xa, mask, TWO52, one, res, smask, tmp;
  rtx_code_label *label;

  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa2 = xa + TWO52 - TWO52;
     Compensate:
	if (xa2 > xa)
	  xa2 -= 1.0;
	x2 = copysign (xa2, x);
	return x2;
   */

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &smask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* res = xa + TWO52 - TWO52; */
  tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: res = xa2 - (res > xa ? 1 : 0)  */
  mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
  emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
  tmp = expand_simple_binop (mode, MINUS,
			     res, mask, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (res, tmp);

  /* res = copysign (res, operand1) */
  ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0.  */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	double xa = fabs (x);
	if (!isless (xa, TWO52))
	  return x;
	xa = (double)(long)(xa + nextafter (0.5, 0.0));
	return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = gen_reg_rtx (mode);
  emit_move_insn (res, operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
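
/* With SSE4.1 the whole expansion is short; schematically (a sketch, not
   the exact insns emitted):

     e1  = copysign (0.5 - ulp, op1)
     e2  = op1 + e1
     op0 = round (e2, ROUND_TRUNC)

   which is just the identity round (a) = trunc (a + copysign (0.5, a))
   quoted in the comment above.  */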
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  */

static GTY(()) rtx_insn *vselect_insn;

/* Initialize vselect_insn.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
							const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
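
/* expand_vselect deliberately mutates the one cached insn in place, asks
   recog_memoized whether the resulting (set target (vec_select ...)) is
   recognizable for the active ISA, and only emits a copy of the pattern
   when it is and we are not merely testing; the cached insn is then reset
   so the next query starts from a neutral state.  */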
/* Similar, but generate a vec_concat from op0 and op1 as well.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
			const unsigned char *perm, unsigned nelt,
			bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;

  return ok;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   using movss or movsd.  */

static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE && vmode == V4SFmode)
      && !(TARGET_SSE2 && vmode == V2DFmode))
    return false;

  /* Only the first element is changed.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}
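
/* The VEC_MERGE with mask 1 built above is the RTL shape of movss/movsd:
   element 0 is taken from the first VEC_MERGE operand and every other
   element from the second, which is why the permutation is only accepted
   when all elements except perm[0] are an identity selection from a
   single source vector.  */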
16398 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16399 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
16402 expand_vec_perm_blend (struct expand_vec_perm_d
*d
)
16404 machine_mode mmode
, vmode
= d
->vmode
;
16405 unsigned i
, nelt
= d
->nelt
;
16406 unsigned HOST_WIDE_INT mask
;
16407 rtx target
, op0
, op1
, maskop
, x
;
16408 rtx rperm
[32], vperm
;
16410 if (d
->one_operand_p
)
16412 if (TARGET_AVX512F
&& GET_MODE_SIZE (vmode
) == 64
16413 && (TARGET_AVX512BW
16414 || GET_MODE_UNIT_SIZE (vmode
) >= 4))
16416 else if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
16418 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
16420 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
16425 /* This is a blend, not a permute. Elements must stay in their
16426 respective lanes. */
16427 for (i
= 0; i
< nelt
; ++i
)
16429 unsigned e
= d
->perm
[i
];
16430 if (!(e
== i
|| e
== i
+ nelt
))
16437 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
16438 decision should be extracted elsewhere, so that we only try that
16439 sequence once all budget==3 options have been tried. */
16440 target
= d
->target
;
16459 for (i
= 0; i
< nelt
; ++i
)
16460 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
16464 for (i
= 0; i
< 2; ++i
)
16465 mask
|= (d
->perm
[i
] >= 2 ? 15 : 0) << (i
* 4);
16470 for (i
= 0; i
< 4; ++i
)
16471 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
16476 /* See if bytes move in pairs so we can use pblendw with
16477 an immediate argument, rather than pblendvb with a vector
16479 for (i
= 0; i
< 16; i
+= 2)
16480 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16483 for (i
= 0; i
< nelt
; ++i
)
16484 rperm
[i
] = (d
->perm
[i
] < nelt
? const0_rtx
: constm1_rtx
);
16487 vperm
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
16488 vperm
= force_reg (vmode
, vperm
);
16490 if (GET_MODE_SIZE (vmode
) == 16)
16491 emit_insn (gen_sse4_1_pblendvb (target
, op0
, op1
, vperm
));
16493 emit_insn (gen_avx2_pblendvb (target
, op0
, op1
, vperm
));
16494 if (target
!= d
->target
)
16495 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
16499 for (i
= 0; i
< 8; ++i
)
16500 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
16505 target
= gen_reg_rtx (vmode
);
16506 op0
= gen_lowpart (vmode
, op0
);
16507 op1
= gen_lowpart (vmode
, op1
);
16511 /* See if bytes move in pairs. If not, vpblendvb must be used. */
16512 for (i
= 0; i
< 32; i
+= 2)
16513 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16515 /* See if bytes move in quadruplets. If yes, vpblendd
16516 with immediate can be used. */
16517 for (i
= 0; i
< 32; i
+= 4)
16518 if (d
->perm
[i
] + 2 != d
->perm
[i
+ 2])
16522 /* See if bytes move the same in both lanes. If yes,
16523 vpblendw with immediate can be used. */
16524 for (i
= 0; i
< 16; i
+= 2)
16525 if (d
->perm
[i
] + 16 != d
->perm
[i
+ 16])
16528 /* Use vpblendw. */
16529 for (i
= 0; i
< 16; ++i
)
16530 mask
|= (d
->perm
[i
* 2] >= 32) << i
;
16535 /* Use vpblendd. */
16536 for (i
= 0; i
< 8; ++i
)
16537 mask
|= (d
->perm
[i
* 4] >= 32) << i
;
16542 /* See if words move in pairs. If yes, vpblendd can be used. */
16543 for (i
= 0; i
< 16; i
+= 2)
16544 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
16548 /* See if words move the same in both lanes. If not,
16549 vpblendvb must be used. */
16550 for (i
= 0; i
< 8; i
++)
16551 if (d
->perm
[i
] + 8 != d
->perm
[i
+ 8])
16553 /* Use vpblendvb. */
16554 for (i
= 0; i
< 32; ++i
)
16555 rperm
[i
] = (d
->perm
[i
/ 2] < 16 ? const0_rtx
: constm1_rtx
);
16559 target
= gen_reg_rtx (vmode
);
16560 op0
= gen_lowpart (vmode
, op0
);
16561 op1
= gen_lowpart (vmode
, op1
);
16562 goto finish_pblendvb
;
16565 /* Use vpblendw. */
16566 for (i
= 0; i
< 16; ++i
)
16567 mask
|= (d
->perm
[i
] >= 16) << i
;
16571 /* Use vpblendd. */
16572 for (i
= 0; i
< 8; ++i
)
16573 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
16578 /* Use vpblendd. */
16579 for (i
= 0; i
< 4; ++i
)
16580 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
16585 gcc_unreachable ();
16608 if (mmode
!= VOIDmode
)
16609 maskop
= force_reg (mmode
, gen_int_mode (mask
, mmode
));
16611 maskop
= GEN_INT (mask
);
16613 /* This matches five different patterns with the different modes. */
16614 x
= gen_rtx_VEC_MERGE (vmode
, op1
, op0
, maskop
);
16615 x
= gen_rtx_SET (target
, x
);
16617 if (target
!= d
->target
)
16618 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
	return false;
    }

  if (d->testing_p)
    return true;

  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
	 from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
	e -= (8 + 4);
      else if (e >= 4)
	e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));

  return true;
}
/* Return true if permutation D can be performed as VMODE permutation
   instead.  */

static bool
valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
{
  unsigned int i, j, chunk;

  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
    return false;

  if (GET_MODE_NUNITS (vmode) >= d->nelt)
    return true;

  chunk = d->nelt / GET_MODE_NUNITS (vmode);
  for (i = 0; i < d->nelt; i += chunk)
    if (d->perm[i] & (chunk - 1))
      return false;
    else
      for (j = 1; j < chunk; ++j)
	if (d->perm[i] + j != d->perm[i + j])
	  return false;

  return true;
}
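
/* Put differently: the permutation can be done in the wider VMODE when
   every VMODE-sized chunk of elements moves as a whole, i.e. each chunk
   starts at a chunk-aligned source element and its members stay
   consecutive.  */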
16699 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
16700 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
16703 expand_vec_perm_pshufb (struct expand_vec_perm_d
*d
)
16705 unsigned i
, nelt
, eltsz
, mask
;
16706 unsigned char perm
[64];
16707 machine_mode vmode
= V16QImode
;
16708 rtx rperm
[64], vperm
, target
, op0
, op1
;
16712 if (!d
->one_operand_p
)
16714 if (!TARGET_XOP
|| GET_MODE_SIZE (d
->vmode
) != 16)
16717 && valid_perm_using_mode_p (V2TImode
, d
))
16722 /* Use vperm2i128 insn. The pattern uses
16723 V4DImode instead of V2TImode. */
16724 target
= d
->target
;
16725 if (d
->vmode
!= V4DImode
)
16726 target
= gen_reg_rtx (V4DImode
);
16727 op0
= gen_lowpart (V4DImode
, d
->op0
);
16728 op1
= gen_lowpart (V4DImode
, d
->op1
);
16730 = GEN_INT ((d
->perm
[0] / (nelt
/ 2))
16731 | ((d
->perm
[nelt
/ 2] / (nelt
/ 2)) * 16));
16732 emit_insn (gen_avx2_permv2ti (target
, op0
, op1
, rperm
[0]));
16733 if (target
!= d
->target
)
16734 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
16742 if (GET_MODE_SIZE (d
->vmode
) == 16)
16747 else if (GET_MODE_SIZE (d
->vmode
) == 32)
16752 /* V4DImode should be already handled through
16753 expand_vselect by vpermq instruction. */
16754 gcc_assert (d
->vmode
!= V4DImode
);
16757 if (d
->vmode
== V8SImode
16758 || d
->vmode
== V16HImode
16759 || d
->vmode
== V32QImode
)
16761 /* First see if vpermq can be used for
16762 V8SImode/V16HImode/V32QImode. */
16763 if (valid_perm_using_mode_p (V4DImode
, d
))
16765 for (i
= 0; i
< 4; i
++)
16766 perm
[i
] = (d
->perm
[i
* nelt
/ 4] * 4 / nelt
) & 3;
16769 target
= gen_reg_rtx (V4DImode
);
16770 if (expand_vselect (target
, gen_lowpart (V4DImode
, d
->op0
),
16773 emit_move_insn (d
->target
,
16774 gen_lowpart (d
->vmode
, target
));
16780 /* Next see if vpermd can be used. */
16781 if (valid_perm_using_mode_p (V8SImode
, d
))
16784 /* Or if vpermps can be used. */
16785 else if (d
->vmode
== V8SFmode
)
16788 if (vmode
== V32QImode
)
16790 /* vpshufb only works intra lanes, it is not
16791 possible to shuffle bytes in between the lanes. */
16792 for (i
= 0; i
< nelt
; ++i
)
16793 if ((d
->perm
[i
] ^ i
) & (nelt
/ 2))
16797 else if (GET_MODE_SIZE (d
->vmode
) == 64)
16799 if (!TARGET_AVX512BW
)
16802 /* If vpermq didn't work, vpshufb won't work either. */
16803 if (d
->vmode
== V8DFmode
|| d
->vmode
== V8DImode
)
16807 if (d
->vmode
== V16SImode
16808 || d
->vmode
== V32HImode
16809 || d
->vmode
== V64QImode
)
16811 /* First see if vpermq can be used for
16812 V16SImode/V32HImode/V64QImode. */
16813 if (valid_perm_using_mode_p (V8DImode
, d
))
16815 for (i
= 0; i
< 8; i
++)
16816 perm
[i
] = (d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7;
16819 target
= gen_reg_rtx (V8DImode
);
16820 if (expand_vselect (target
, gen_lowpart (V8DImode
, d
->op0
),
16823 emit_move_insn (d
->target
,
16824 gen_lowpart (d
->vmode
, target
));
16830 /* Next see if vpermd can be used. */
16831 if (valid_perm_using_mode_p (V16SImode
, d
))
16834 /* Or if vpermps can be used. */
16835 else if (d
->vmode
== V16SFmode
)
16837 if (vmode
== V64QImode
)
16839 /* vpshufb only works intra lanes, it is not
16840 possible to shuffle bytes in between the lanes. */
16841 for (i
= 0; i
< nelt
; ++i
)
16842 if ((d
->perm
[i
] ^ i
) & (nelt
/ 4))
16853 if (vmode
== V8SImode
)
16854 for (i
= 0; i
< 8; ++i
)
16855 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7);
16856 else if (vmode
== V16SImode
)
16857 for (i
= 0; i
< 16; ++i
)
16858 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 16] * 16 / nelt
) & 15);
16861 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
16862 if (!d
->one_operand_p
)
16863 mask
= 2 * nelt
- 1;
16864 else if (vmode
== V16QImode
)
16866 else if (vmode
== V64QImode
)
16867 mask
= nelt
/ 4 - 1;
16869 mask
= nelt
/ 2 - 1;
16871 for (i
= 0; i
< nelt
; ++i
)
16873 unsigned j
, e
= d
->perm
[i
] & mask
;
16874 for (j
= 0; j
< eltsz
; ++j
)
16875 rperm
[i
* eltsz
+ j
] = GEN_INT (e
* eltsz
+ j
);
16879 vperm
= gen_rtx_CONST_VECTOR (vmode
,
16880 gen_rtvec_v (GET_MODE_NUNITS (vmode
), rperm
));
16881 vperm
= force_reg (vmode
, vperm
);
16883 target
= d
->target
;
16884 if (d
->vmode
!= vmode
)
16885 target
= gen_reg_rtx (vmode
);
16886 op0
= gen_lowpart (vmode
, d
->op0
);
16887 if (d
->one_operand_p
)
16889 if (vmode
== V16QImode
)
16890 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, vperm
));
16891 else if (vmode
== V32QImode
)
16892 emit_insn (gen_avx2_pshufbv32qi3 (target
, op0
, vperm
));
16893 else if (vmode
== V64QImode
)
16894 emit_insn (gen_avx512bw_pshufbv64qi3 (target
, op0
, vperm
));
16895 else if (vmode
== V8SFmode
)
16896 emit_insn (gen_avx2_permvarv8sf (target
, op0
, vperm
));
16897 else if (vmode
== V8SImode
)
16898 emit_insn (gen_avx2_permvarv8si (target
, op0
, vperm
));
16899 else if (vmode
== V16SFmode
)
16900 emit_insn (gen_avx512f_permvarv16sf (target
, op0
, vperm
));
16901 else if (vmode
== V16SImode
)
16902 emit_insn (gen_avx512f_permvarv16si (target
, op0
, vperm
));
16904 gcc_unreachable ();
16908 op1
= gen_lowpart (vmode
, d
->op1
);
16909 emit_insn (gen_xop_pperm (target
, op0
, op1
, vperm
));
16911 if (target
!= d
->target
)
16912 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
16917 /* For V*[QHS]Imode permutations, check if the same permutation
16918 can't be performed in a 2x, 4x or 8x wider inner mode. */
16921 canonicalize_vector_int_perm (const struct expand_vec_perm_d
*d
,
16922 struct expand_vec_perm_d
*nd
)
16925 machine_mode mode
= VOIDmode
;
16929 case E_V16QImode
: mode
= V8HImode
; break;
16930 case E_V32QImode
: mode
= V16HImode
; break;
16931 case E_V64QImode
: mode
= V32HImode
; break;
16932 case E_V8HImode
: mode
= V4SImode
; break;
16933 case E_V16HImode
: mode
= V8SImode
; break;
16934 case E_V32HImode
: mode
= V16SImode
; break;
16935 case E_V4SImode
: mode
= V2DImode
; break;
16936 case E_V8SImode
: mode
= V4DImode
; break;
16937 case E_V16SImode
: mode
= V8DImode
; break;
16938 default: return false;
16940 for (i
= 0; i
< d
->nelt
; i
+= 2)
16941 if ((d
->perm
[i
] & 1) || d
->perm
[i
+ 1] != d
->perm
[i
] + 1)
16944 nd
->nelt
= d
->nelt
/ 2;
16945 for (i
= 0; i
< nd
->nelt
; i
++)
16946 nd
->perm
[i
] = d
->perm
[2 * i
] / 2;
16947 if (GET_MODE_INNER (mode
) != DImode
)
16948 canonicalize_vector_int_perm (nd
, nd
);
16951 nd
->one_operand_p
= d
->one_operand_p
;
16952 nd
->testing_p
= d
->testing_p
;
16953 if (d
->op0
== d
->op1
)
16954 nd
->op0
= nd
->op1
= gen_lowpart (nd
->vmode
, d
->op0
);
16957 nd
->op0
= gen_lowpart (nd
->vmode
, d
->op0
);
16958 nd
->op1
= gen_lowpart (nd
->vmode
, d
->op1
);
16961 nd
->target
= gen_raw_REG (nd
->vmode
, LAST_VIRTUAL_REGISTER
+ 1);
16963 nd
->target
= gen_reg_rtx (nd
->vmode
);
16968 /* Try to expand one-operand permutation with constant mask. */
16971 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d
*d
)
16973 machine_mode mode
= GET_MODE (d
->op0
);
16974 machine_mode maskmode
= mode
;
16975 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
16976 rtx target
, op0
, mask
;
16979 if (!rtx_equal_p (d
->op0
, d
->op1
))
16982 if (!TARGET_AVX512F
)
16988 gen
= gen_avx512f_permvarv16si
;
16991 gen
= gen_avx512f_permvarv16sf
;
16992 maskmode
= V16SImode
;
16995 gen
= gen_avx512f_permvarv8di
;
16998 gen
= gen_avx512f_permvarv8df
;
16999 maskmode
= V8DImode
;
17005 target
= d
->target
;
17007 for (int i
= 0; i
< d
->nelt
; ++i
)
17008 vec
[i
] = GEN_INT (d
->perm
[i
]);
17009 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
17010 emit_insn (gen (target
, op0
, force_reg (maskmode
, mask
)));
17014 static bool expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool);
17016 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
17017 in a single instruction. */
17020 expand_vec_perm_1 (struct expand_vec_perm_d
*d
)
17022 unsigned i
, nelt
= d
->nelt
;
17023 struct expand_vec_perm_d nd
;
17025 /* Check plain VEC_SELECT first, because AVX has instructions that could
17026 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
17027 input where SEL+CONCAT may not. */
17028 if (d
->one_operand_p
)
17030 int mask
= nelt
- 1;
17031 bool identity_perm
= true;
17032 bool broadcast_perm
= true;
17034 for (i
= 0; i
< nelt
; i
++)
17036 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17037 if (nd
.perm
[i
] != i
)
17038 identity_perm
= false;
17040 broadcast_perm
= false;
17046 emit_move_insn (d
->target
, d
->op0
);
17049 else if (broadcast_perm
&& TARGET_AVX2
)
17051 /* Use vpbroadcast{b,w,d}. */
17052 rtx (*gen
) (rtx
, rtx
) = NULL
;
17056 if (TARGET_AVX512BW
)
17057 gen
= gen_avx512bw_vec_dupv64qi_1
;
17060 gen
= gen_avx2_pbroadcastv32qi_1
;
17063 if (TARGET_AVX512BW
)
17064 gen
= gen_avx512bw_vec_dupv32hi_1
;
17067 gen
= gen_avx2_pbroadcastv16hi_1
;
17070 if (TARGET_AVX512F
)
17071 gen
= gen_avx512f_vec_dupv16si_1
;
17074 gen
= gen_avx2_pbroadcastv8si_1
;
17077 gen
= gen_avx2_pbroadcastv16qi
;
17080 gen
= gen_avx2_pbroadcastv8hi
;
17083 if (TARGET_AVX512F
)
17084 gen
= gen_avx512f_vec_dupv16sf_1
;
17087 gen
= gen_avx2_vec_dupv8sf_1
;
17090 if (TARGET_AVX512F
)
17091 gen
= gen_avx512f_vec_dupv8df_1
;
17094 if (TARGET_AVX512F
)
17095 gen
= gen_avx512f_vec_dupv8di_1
;
17097 /* For other modes prefer other shuffles this function creates. */
17103 emit_insn (gen (d
->target
, d
->op0
));
17108 if (expand_vselect (d
->target
, d
->op0
, nd
.perm
, nelt
, d
->testing_p
))
17111 /* There are plenty of patterns in sse.md that are written for
17112 SEL+CONCAT and are not replicated for a single op. Perhaps
17113 that should be changed, to avoid the nastiness here. */
17115 /* Recognize interleave style patterns, which means incrementing
17116 every other permutation operand. */
17117 for (i
= 0; i
< nelt
; i
+= 2)
17119 nd
.perm
[i
] = d
->perm
[i
] & mask
;
17120 nd
.perm
[i
+ 1] = (d
->perm
[i
+ 1] & mask
) + nelt
;
17122 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17126 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
17129 for (i
= 0; i
< nelt
; i
+= 4)
17131 nd
.perm
[i
+ 0] = d
->perm
[i
+ 0] & mask
;
17132 nd
.perm
[i
+ 1] = d
->perm
[i
+ 1] & mask
;
17133 nd
.perm
[i
+ 2] = (d
->perm
[i
+ 2] & mask
) + nelt
;
17134 nd
.perm
[i
+ 3] = (d
->perm
[i
+ 3] & mask
) + nelt
;
17137 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
17143 /* Try movss/movsd instructions. */
17144 if (expand_vec_perm_movs (d
))
17147 /* Finally, try the fully general two operand permute. */
17148 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op1
, d
->perm
, nelt
,
17152 /* Recognize interleave style patterns with reversed operands. */
17153 if (!d
->one_operand_p
)
17155 for (i
= 0; i
< nelt
; ++i
)
17157 unsigned e
= d
->perm
[i
];
17165 if (expand_vselect_vconcat (d
->target
, d
->op1
, d
->op0
, nd
.perm
, nelt
,
17170 /* Try the SSE4.1 blend variable merge instructions. */
17171 if (expand_vec_perm_blend (d
))
17174 /* Try one of the AVX vpermil variable permutations. */
17175 if (expand_vec_perm_vpermil (d
))
17178 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
17179 vpshufb, vpermd, vpermps or vpermq variable permutation. */
17180 if (expand_vec_perm_pshufb (d
))
17183 /* Try the AVX2 vpalignr instruction. */
17184 if (expand_vec_perm_palignr (d
, true))
17187 /* Try the AVX512F vperm{s,d} instructions. */
17188 if (ix86_expand_vec_one_operand_perm_avx512 (d
))
17191 /* Try the AVX512F vpermt2/vpermi2 instructions. */
17192 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX
, NULL_RTX
, NULL_RTX
, NULL_RTX
, d
))
17195 /* See if we can get the same permutation in different vector integer
17197 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
17200 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
   in terms of a pair of pshuflw + pshufhw instructions.  */

static bool
expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
{
  unsigned char perm2[MAX_VECT_LEN];
  unsigned i;
  bool ok;

  if (d->vmode != V8HImode || !d->one_operand_p)
    return false;

  /* The two permutations only operate in 64-bit lanes.  */
  for (i = 0; i < 4; ++i)
    if (d->perm[i] >= 4)
      return false;
  for (i = 4; i < 8; ++i)
    if (d->perm[i] < 4)
      return false;

  if (d->testing_p)
    return true;

  /* Emit the pshuflw.  */
  memcpy (perm2, d->perm, 4);
  for (i = 4; i < 8; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
  gcc_assert (ok);

  /* Emit the pshufhw.  */
  memcpy (perm2 + 4, d->perm + 4, 4);
  for (i = 0; i < 4; ++i)
    perm2[i] = i;
  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
  gcc_assert (ok);

  return true;
}
17247 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17248 the permutation using the SSSE3 palignr instruction. This succeeds
17249 when all of the elements in PERM fit within one vector and we merely
17250 need to shift them down so that a single vector permutation has a
17251 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
17252 the vpalignr instruction itself can perform the requested permutation. */
17255 expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool single_insn_only_p
)
17257 unsigned i
, nelt
= d
->nelt
;
17258 unsigned min
, max
, minswap
, maxswap
;
17259 bool in_order
, ok
, swap
= false;
17261 struct expand_vec_perm_d dcopy
;
17263 /* Even with AVX, palignr only operates on 128-bit vectors,
17264 in AVX2 palignr operates on both 128-bit lanes. */
17265 if ((!TARGET_SSSE3
|| GET_MODE_SIZE (d
->vmode
) != 16)
17266 && (!TARGET_AVX2
|| GET_MODE_SIZE (d
->vmode
) != 32))
17271 minswap
= 2 * nelt
;
17273 for (i
= 0; i
< nelt
; ++i
)
17275 unsigned e
= d
->perm
[i
];
17276 unsigned eswap
= d
->perm
[i
] ^ nelt
;
17277 if (GET_MODE_SIZE (d
->vmode
) == 32)
17279 e
= (e
& ((nelt
/ 2) - 1)) | ((e
& nelt
) >> 1);
17280 eswap
= e
^ (nelt
/ 2);
17286 if (eswap
< minswap
)
17288 if (eswap
> maxswap
)
17292 || max
- min
>= (GET_MODE_SIZE (d
->vmode
) == 32 ? nelt
/ 2 : nelt
))
17294 if (d
->one_operand_p
17296 || maxswap
- minswap
>= (GET_MODE_SIZE (d
->vmode
) == 32
17297 ? nelt
/ 2 : nelt
))
17304 /* Given that we have SSSE3, we know we'll be able to implement the
17305 single operand permutation after the palignr with pshufb for
17306 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
17308 if (d
->testing_p
&& GET_MODE_SIZE (d
->vmode
) == 16 && !single_insn_only_p
)
17314 dcopy
.op0
= d
->op1
;
17315 dcopy
.op1
= d
->op0
;
17316 for (i
= 0; i
< nelt
; ++i
)
17317 dcopy
.perm
[i
] ^= nelt
;
17321 for (i
= 0; i
< nelt
; ++i
)
17323 unsigned e
= dcopy
.perm
[i
];
17324 if (GET_MODE_SIZE (d
->vmode
) == 32
17326 && (e
& (nelt
/ 2 - 1)) < min
)
17327 e
= e
- min
- (nelt
/ 2);
17334 dcopy
.one_operand_p
= true;
17336 if (single_insn_only_p
&& !in_order
)
17339 /* For AVX2, test whether we can permute the result in one instruction. */
17344 dcopy
.op1
= dcopy
.op0
;
17345 return expand_vec_perm_1 (&dcopy
);
17348 shift
= GEN_INT (min
* GET_MODE_UNIT_BITSIZE (d
->vmode
));
17349 if (GET_MODE_SIZE (d
->vmode
) == 16)
17351 target
= gen_reg_rtx (TImode
);
17352 emit_insn (gen_ssse3_palignrti (target
, gen_lowpart (TImode
, dcopy
.op1
),
17353 gen_lowpart (TImode
, dcopy
.op0
), shift
));
17357 target
= gen_reg_rtx (V2TImode
);
17358 emit_insn (gen_avx2_palignrv2ti (target
,
17359 gen_lowpart (V2TImode
, dcopy
.op1
),
17360 gen_lowpart (V2TImode
, dcopy
.op0
),
17364 dcopy
.op0
= dcopy
.op1
= gen_lowpart (d
->vmode
, target
);
17366 /* Test for the degenerate case where the alignment by itself
17367 produces the desired permutation. */
17370 emit_move_insn (d
->target
, dcopy
.op0
);
17374 ok
= expand_vec_perm_1 (&dcopy
);
17375 gcc_assert (ok
|| GET_MODE_SIZE (d
->vmode
) == 32);
17380 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
17381 the permutation using the SSE4_1 pblendv instruction. Potentially
17382 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
17385 expand_vec_perm_pblendv (struct expand_vec_perm_d
*d
)
17387 unsigned i
, which
, nelt
= d
->nelt
;
17388 struct expand_vec_perm_d dcopy
, dcopy1
;
17389 machine_mode vmode
= d
->vmode
;
17392 /* Use the same checks as in expand_vec_perm_blend. */
17393 if (d
->one_operand_p
)
17395 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
17397 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
17399 else if (TARGET_SSE4_1
&& GET_MODE_SIZE (vmode
) == 16)
17404 /* Figure out where permutation elements stay not in their
17405 respective lanes. */
17406 for (i
= 0, which
= 0; i
< nelt
; ++i
)
17408 unsigned e
= d
->perm
[i
];
17410 which
|= (e
< nelt
? 1 : 2);
17412 /* We can pblend the part where elements stay not in their
17413 respective lanes only when these elements are all in one
17414 half of a permutation.
17415 {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
17416 lanes, but both 8 and 9 >= 8
17417 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
17418 respective lanes and 8 >= 8, but 2 not. */
17419 if (which
!= 1 && which
!= 2)
17421 if (d
->testing_p
&& GET_MODE_SIZE (vmode
) == 16)
17424 /* First we apply one operand permutation to the part where
17425 elements stay not in their respective lanes. */
17428 dcopy
.op0
= dcopy
.op1
= d
->op1
;
17430 dcopy
.op0
= dcopy
.op1
= d
->op0
;
17432 dcopy
.target
= gen_reg_rtx (vmode
);
17433 dcopy
.one_operand_p
= true;
17435 for (i
= 0; i
< nelt
; ++i
)
17436 dcopy
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
17438 ok
= expand_vec_perm_1 (&dcopy
);
17439 if (GET_MODE_SIZE (vmode
) != 16 && !ok
)
17446 /* Next we put permuted elements into their positions. */
17449 dcopy1
.op1
= dcopy
.target
;
17451 dcopy1
.op0
= dcopy
.target
;
17453 for (i
= 0; i
< nelt
; ++i
)
17454 dcopy1
.perm
[i
] = ((d
->perm
[i
] >= nelt
) ? (nelt
+ i
) : i
);
17456 ok
= expand_vec_perm_blend (&dcopy1
);
17462 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
);
17464 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17465 a two vector permutation into a single vector permutation by using
17466 an interleave operation to merge the vectors. */
17469 expand_vec_perm_interleave2 (struct expand_vec_perm_d
*d
)
17471 struct expand_vec_perm_d dremap
, dfinal
;
17472 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
17473 unsigned HOST_WIDE_INT contents
;
17474 unsigned char remap
[2 * MAX_VECT_LEN
];
17476 bool ok
, same_halves
= false;
17478 if (GET_MODE_SIZE (d
->vmode
) == 16)
17480 if (d
->one_operand_p
)
17483 else if (GET_MODE_SIZE (d
->vmode
) == 32)
17487 /* For 32-byte modes allow even d->one_operand_p.
17488 The lack of cross-lane shuffling in some instructions
17489 might prevent a single insn shuffle. */
17491 dfinal
.testing_p
= true;
17492 /* If expand_vec_perm_interleave3 can expand this into
17493 a 3 insn sequence, give up and let it be expanded as
17494 3 insn sequence. While that is one insn longer,
17495 it doesn't need a memory operand and in the common
17496 case that both interleave low and high permutations
17497 with the same operands are adjacent needs 4 insns
17498 for both after CSE. */
17499 if (expand_vec_perm_interleave3 (&dfinal
))
17505 /* Examine from whence the elements come. */
17507 for (i
= 0; i
< nelt
; ++i
)
17508 contents
|= HOST_WIDE_INT_1U
<< d
->perm
[i
];
17510 memset (remap
, 0xff, sizeof (remap
));
17513 if (GET_MODE_SIZE (d
->vmode
) == 16)
17515 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
17517 /* Split the two input vectors into 4 halves. */
17518 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
17523 /* If the elements from the low halves use interleave low, and similarly
17524 for interleave high. If the elements are from mis-matched halves, we
17525 can use shufps for V4SF/V4SI or do a DImode shuffle. */
17526 if ((contents
& (h1
| h3
)) == contents
)
17529 for (i
= 0; i
< nelt2
; ++i
)
17532 remap
[i
+ nelt
] = i
* 2 + 1;
17533 dremap
.perm
[i
* 2] = i
;
17534 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
17536 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
17537 dremap
.vmode
= V4SFmode
;
17539 else if ((contents
& (h2
| h4
)) == contents
)
17542 for (i
= 0; i
< nelt2
; ++i
)
17544 remap
[i
+ nelt2
] = i
* 2;
17545 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
17546 dremap
.perm
[i
* 2] = i
+ nelt2
;
17547 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
17549 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
17550 dremap
.vmode
= V4SFmode
;
17552 else if ((contents
& (h1
| h4
)) == contents
)
17555 for (i
= 0; i
< nelt2
; ++i
)
17558 remap
[i
+ nelt
+ nelt2
] = i
+ nelt2
;
17559 dremap
.perm
[i
] = i
;
17560 dremap
.perm
[i
+ nelt2
] = i
+ nelt
+ nelt2
;
17565 dremap
.vmode
= V2DImode
;
17567 dremap
.perm
[0] = 0;
17568 dremap
.perm
[1] = 3;
17571 else if ((contents
& (h2
| h3
)) == contents
)
17574 for (i
= 0; i
< nelt2
; ++i
)
17576 remap
[i
+ nelt2
] = i
;
17577 remap
[i
+ nelt
] = i
+ nelt2
;
17578 dremap
.perm
[i
] = i
+ nelt2
;
17579 dremap
.perm
[i
+ nelt2
] = i
+ nelt
;
17584 dremap
.vmode
= V2DImode
;
17586 dremap
.perm
[0] = 1;
17587 dremap
.perm
[1] = 2;
17595 unsigned int nelt4
= nelt
/ 4, nzcnt
= 0;
17596 unsigned HOST_WIDE_INT q
[8];
17597 unsigned int nonzero_halves
[4];
17599 /* Split the two input vectors into 8 quarters. */
17600 q
[0] = (HOST_WIDE_INT_1U
<< nelt4
) - 1;
17601 for (i
= 1; i
< 8; ++i
)
17602 q
[i
] = q
[0] << (nelt4
* i
);
17603 for (i
= 0; i
< 4; ++i
)
17604 if (((q
[2 * i
] | q
[2 * i
+ 1]) & contents
) != 0)
17606 nonzero_halves
[nzcnt
] = i
;
17612 gcc_assert (d
->one_operand_p
);
17613 nonzero_halves
[1] = nonzero_halves
[0];
17614 same_halves
= true;
17616 else if (d
->one_operand_p
)
17618 gcc_assert (nonzero_halves
[0] == 0);
17619 gcc_assert (nonzero_halves
[1] == 1);
17624 if (d
->perm
[0] / nelt2
== nonzero_halves
[1])
17626 /* Attempt to increase the likelihood that dfinal
17627 shuffle will be intra-lane. */
17628 std::swap (nonzero_halves
[0], nonzero_halves
[1]);
17631 /* vperm2f128 or vperm2i128. */
17632 for (i
= 0; i
< nelt2
; ++i
)
17634 remap
[i
+ nonzero_halves
[1] * nelt2
] = i
+ nelt2
;
17635 remap
[i
+ nonzero_halves
[0] * nelt2
] = i
;
17636 dremap
.perm
[i
+ nelt2
] = i
+ nonzero_halves
[1] * nelt2
;
17637 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * nelt2
;
17640 if (d
->vmode
!= V8SFmode
17641 && d
->vmode
!= V4DFmode
17642 && d
->vmode
!= V8SImode
)
17644 dremap
.vmode
= V8SImode
;
17646 for (i
= 0; i
< 4; ++i
)
17648 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * 4;
17649 dremap
.perm
[i
+ 4] = i
+ nonzero_halves
[1] * 4;
17653 else if (d
->one_operand_p
)
17655 else if (TARGET_AVX2
17656 && (contents
& (q
[0] | q
[2] | q
[4] | q
[6])) == contents
)
17659 for (i
= 0; i
< nelt4
; ++i
)
17662 remap
[i
+ nelt
] = i
* 2 + 1;
17663 remap
[i
+ nelt2
] = i
* 2 + nelt2
;
17664 remap
[i
+ nelt
+ nelt2
] = i
* 2 + nelt2
+ 1;
17665 dremap
.perm
[i
* 2] = i
;
17666 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
17667 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
;
17668 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
;
17671 else if (TARGET_AVX2
17672 && (contents
& (q
[1] | q
[3] | q
[5] | q
[7])) == contents
)
17675 for (i
= 0; i
< nelt4
; ++i
)
17677 remap
[i
+ nelt4
] = i
* 2;
17678 remap
[i
+ nelt
+ nelt4
] = i
* 2 + 1;
17679 remap
[i
+ nelt2
+ nelt4
] = i
* 2 + nelt2
;
17680 remap
[i
+ nelt
+ nelt2
+ nelt4
] = i
* 2 + nelt2
+ 1;
17681 dremap
.perm
[i
* 2] = i
+ nelt4
;
17682 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt4
;
17683 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
+ nelt4
;
17684 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
+ nelt4
;
17691 /* Use the remapping array set up above to move the elements from their
17692 swizzled locations into their final destinations. */
17694 for (i
= 0; i
< nelt
; ++i
)
17696 unsigned e
= remap
[d
->perm
[i
]];
17697 gcc_assert (e
< nelt
);
17698 /* If same_halves is true, both halves of the remapped vector are the
17699 same. Avoid cross-lane accesses if possible. */
17700 if (same_halves
&& i
>= nelt2
)
17702 gcc_assert (e
< nelt2
);
17703 dfinal
.perm
[i
] = e
+ nelt2
;
17706 dfinal
.perm
[i
] = e
;
17710 dremap
.target
= gen_reg_rtx (dremap
.vmode
);
17711 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
17713 dfinal
.op1
= dfinal
.op0
;
17714 dfinal
.one_operand_p
= true;
17716 /* Test if the final remap can be done with a single insn. For V4SFmode or
17717 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
17719 ok
= expand_vec_perm_1 (&dfinal
);
17720 seq
= get_insns ();
17729 if (dremap
.vmode
!= dfinal
.vmode
)
17731 dremap
.op0
= gen_lowpart (dremap
.vmode
, dremap
.op0
);
17732 dremap
.op1
= gen_lowpart (dremap
.vmode
, dremap
.op1
);
17735 ok
= expand_vec_perm_1 (&dremap
);
17742 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17743 a single vector cross-lane permutation into vpermq followed
17744 by any of the single insn permutations. */
17747 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d
*d
)
17749 struct expand_vec_perm_d dremap
, dfinal
;
17750 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, nelt4
= nelt
/ 4;
17751 unsigned contents
[2];
17755 && (d
->vmode
== V32QImode
|| d
->vmode
== V16HImode
)
17756 && d
->one_operand_p
))
17761 for (i
= 0; i
< nelt2
; ++i
)
17763 contents
[0] |= 1u << (d
->perm
[i
] / nelt4
);
17764 contents
[1] |= 1u << (d
->perm
[i
+ nelt2
] / nelt4
);
17767 for (i
= 0; i
< 2; ++i
)
17769 unsigned int cnt
= 0;
17770 for (j
= 0; j
< 4; ++j
)
17771 if ((contents
[i
] & (1u << j
)) != 0 && ++cnt
> 2)
17779 dremap
.vmode
= V4DImode
;
17781 dremap
.target
= gen_reg_rtx (V4DImode
);
17782 dremap
.op0
= gen_lowpart (V4DImode
, d
->op0
);
17783 dremap
.op1
= dremap
.op0
;
17784 dremap
.one_operand_p
= true;
17785 for (i
= 0; i
< 2; ++i
)
17787 unsigned int cnt
= 0;
17788 for (j
= 0; j
< 4; ++j
)
17789 if ((contents
[i
] & (1u << j
)) != 0)
17790 dremap
.perm
[2 * i
+ cnt
++] = j
;
17791 for (; cnt
< 2; ++cnt
)
17792 dremap
.perm
[2 * i
+ cnt
] = 0;
17796 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
17797 dfinal
.op1
= dfinal
.op0
;
17798 dfinal
.one_operand_p
= true;
17799 for (i
= 0, j
= 0; i
< nelt
; ++i
)
17803 dfinal
.perm
[i
] = (d
->perm
[i
] & (nelt4
- 1)) | (j
? nelt2
: 0);
17804 if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
])
17806 else if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
+ 1])
17807 dfinal
.perm
[i
] |= nelt4
;
17809 gcc_unreachable ();
17812 ok
= expand_vec_perm_1 (&dremap
);
17815 ok
= expand_vec_perm_1 (&dfinal
);
17821 static bool canonicalize_perm (struct expand_vec_perm_d
*d
);
17823 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
17824 a vector permutation using two instructions, vperm2f128 resp.
17825 vperm2i128 followed by any single in-lane permutation. */
17828 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d
*d
)
17830 struct expand_vec_perm_d dfirst
, dsecond
;
17831 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, perm
;
17835 || GET_MODE_SIZE (d
->vmode
) != 32
17836 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
))
17840 dsecond
.one_operand_p
= false;
17841 dsecond
.testing_p
= true;
17843 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
17844 immediate. For perm < 16 the second permutation uses
17845 d->op0 as first operand, for perm >= 16 it uses d->op1
17846 as first operand. The second operand is the result of
17848 for (perm
= 0; perm
< 32; perm
++)
17850 /* Ignore permutations which do not move anything cross-lane. */
17853 /* The second shuffle for e.g. V4DFmode has
17854 0123 and ABCD operands.
17855 Ignore AB23, as 23 is already in the second lane
17856 of the first operand. */
17857 if ((perm
& 0xc) == (1 << 2)) continue;
17858 /* And 01CD, as 01 is in the first lane of the first
17860 if ((perm
& 3) == 0) continue;
17861 /* And 4567, as then the vperm2[fi]128 doesn't change
17862 anything on the original 4567 second operand. */
17863 if ((perm
& 0xf) == ((3 << 2) | 2)) continue;
17867 /* The second shuffle for e.g. V4DFmode has
17868 4567 and ABCD operands.
17869 Ignore AB67, as 67 is already in the second lane
17870 of the first operand. */
17871 if ((perm
& 0xc) == (3 << 2)) continue;
17872 /* And 45CD, as 45 is in the first lane of the first
17874 if ((perm
& 3) == 2) continue;
17875 /* And 0123, as then the vperm2[fi]128 doesn't change
17876 anything on the original 0123 first operand. */
17877 if ((perm
& 0xf) == (1 << 2)) continue;
17880 for (i
= 0; i
< nelt
; i
++)
17882 j
= d
->perm
[i
] / nelt2
;
17883 if (j
== ((perm
>> (2 * (i
>= nelt2
))) & 3))
17884 dsecond
.perm
[i
] = nelt
+ (i
& nelt2
) + (d
->perm
[i
] & (nelt2
- 1));
17885 else if (j
== (unsigned) (i
>= nelt2
) + 2 * (perm
>= 16))
17886 dsecond
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
17894 ok
= expand_vec_perm_1 (&dsecond
);
17905 /* Found a usable second shuffle. dfirst will be
17906 vperm2f128 on d->op0 and d->op1. */
17907 dsecond
.testing_p
= false;
17909 dfirst
.target
= gen_reg_rtx (d
->vmode
);
17910 for (i
= 0; i
< nelt
; i
++)
17911 dfirst
.perm
[i
] = (i
& (nelt2
- 1))
17912 + ((perm
>> (2 * (i
>= nelt2
))) & 3) * nelt2
;
17914 canonicalize_perm (&dfirst
);
17915 ok
= expand_vec_perm_1 (&dfirst
);
17918 /* And dsecond is some single insn shuffle, taking
17919 d->op0 and result of vperm2f128 (if perm < 16) or
17920 d->op1 and result of vperm2f128 (otherwise). */
17922 dsecond
.op0
= dsecond
.op1
;
17923 dsecond
.op1
= dfirst
.target
;
17925 ok
= expand_vec_perm_1 (&dsecond
);
17931 /* For one operand, the only useful vperm2f128 permutation is 0x01
17933 if (d
->one_operand_p
)
17940 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
17941 a two vector permutation using 2 intra-lane interleave insns
17942 and cross-lane shuffle for 32-byte vectors. */
17945 expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
)
17948 rtx (*gen
) (rtx
, rtx
, rtx
);
17950 if (d
->one_operand_p
)
17952 if (TARGET_AVX2
&& GET_MODE_SIZE (d
->vmode
) == 32)
17954 else if (TARGET_AVX
&& (d
->vmode
== V8SFmode
|| d
->vmode
== V4DFmode
))
17960 if (d
->perm
[0] != 0 && d
->perm
[0] != nelt
/ 2)
17962 for (i
= 0; i
< nelt
; i
+= 2)
17963 if (d
->perm
[i
] != d
->perm
[0] + i
/ 2
17964 || d
->perm
[i
+ 1] != d
->perm
[0] + i
/ 2 + nelt
)
17974 gen
= gen_vec_interleave_highv32qi
;
17976 gen
= gen_vec_interleave_lowv32qi
;
17980 gen
= gen_vec_interleave_highv16hi
;
17982 gen
= gen_vec_interleave_lowv16hi
;
17986 gen
= gen_vec_interleave_highv8si
;
17988 gen
= gen_vec_interleave_lowv8si
;
17992 gen
= gen_vec_interleave_highv4di
;
17994 gen
= gen_vec_interleave_lowv4di
;
17998 gen
= gen_vec_interleave_highv8sf
;
18000 gen
= gen_vec_interleave_lowv8sf
;
18004 gen
= gen_vec_interleave_highv4df
;
18006 gen
= gen_vec_interleave_lowv4df
;
18009 gcc_unreachable ();
18012 emit_insn (gen (d
->target
, d
->op0
, d
->op1
));
18016 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
18017 a single vector permutation using a single intra-lane vector
18018 permutation, vperm2f128 swapping the lanes and vblend* insn blending
18019 the non-swapped and swapped vectors together. */
18022 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
18024 struct expand_vec_perm_d dfirst
, dsecond
;
18025 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
18028 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
18032 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
18033 || !d
->one_operand_p
)
18037 for (i
= 0; i
< nelt
; i
++)
18038 dfirst
.perm
[i
] = 0xff;
18039 for (i
= 0, msk
= 0; i
< nelt
; i
++)
18041 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
18042 if (dfirst
.perm
[j
] != 0xff && dfirst
.perm
[j
] != d
->perm
[i
])
18044 dfirst
.perm
[j
] = d
->perm
[i
];
18048 for (i
= 0; i
< nelt
; i
++)
18049 if (dfirst
.perm
[i
] == 0xff)
18050 dfirst
.perm
[i
] = i
;
18053 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18056 ok
= expand_vec_perm_1 (&dfirst
);
18057 seq
= get_insns ();
18069 dsecond
.op0
= dfirst
.target
;
18070 dsecond
.op1
= dfirst
.target
;
18071 dsecond
.one_operand_p
= true;
18072 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18073 for (i
= 0; i
< nelt
; i
++)
18074 dsecond
.perm
[i
] = i
^ nelt2
;
18076 ok
= expand_vec_perm_1 (&dsecond
);
18079 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
18080 emit_insn (blend (d
->target
, dfirst
.target
, dsecond
.target
, GEN_INT (msk
)));
18084 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
18085 permutation using two vperm2f128, followed by a vshufpd insn blending
18086 the two vectors together. */
18089 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d
*d
)
18091 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
18094 if (!TARGET_AVX
|| (d
->vmode
!= V4DFmode
))
18104 dfirst
.perm
[0] = (d
->perm
[0] & ~1);
18105 dfirst
.perm
[1] = (d
->perm
[0] & ~1) + 1;
18106 dfirst
.perm
[2] = (d
->perm
[2] & ~1);
18107 dfirst
.perm
[3] = (d
->perm
[2] & ~1) + 1;
18108 dsecond
.perm
[0] = (d
->perm
[1] & ~1);
18109 dsecond
.perm
[1] = (d
->perm
[1] & ~1) + 1;
18110 dsecond
.perm
[2] = (d
->perm
[3] & ~1);
18111 dsecond
.perm
[3] = (d
->perm
[3] & ~1) + 1;
18112 dthird
.perm
[0] = (d
->perm
[0] % 2);
18113 dthird
.perm
[1] = (d
->perm
[1] % 2) + 4;
18114 dthird
.perm
[2] = (d
->perm
[2] % 2) + 2;
18115 dthird
.perm
[3] = (d
->perm
[3] % 2) + 6;
18117 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
18118 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
18119 dthird
.op0
= dfirst
.target
;
18120 dthird
.op1
= dsecond
.target
;
18121 dthird
.one_operand_p
= false;
18123 canonicalize_perm (&dfirst
);
18124 canonicalize_perm (&dsecond
);
18126 ok
= expand_vec_perm_1 (&dfirst
)
18127 && expand_vec_perm_1 (&dsecond
)
18128 && expand_vec_perm_1 (&dthird
);
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i];
      unsigned which = (e >= nelt);

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i * eltsz + j] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][i * eltsz + j] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op0);
  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (V16QImode);
  op = gen_lowpart (V16QImode, d->op1);
  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));

  op = d->target;
  if (d->vmode != V16QImode)
    op = gen_reg_rtx (V16QImode);
  emit_insn (gen_iorv16qi3 (op, l, h));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
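
/* As an illustration of the masks built above: with nelt == 16 and
   d->perm[0] == 18, byte 0 of the mask applied to op0 becomes -128
   (bit 7 set, so pshufb writes a zero) while byte 0 of the mask applied
   to op1 becomes 18; pshufb only inspects bit 7 and the low four bits,
   so 18 selects byte 2 of op1, and the final por merges the two partial
   results.  */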
/* Implement arbitrary permutation of one V32QImode and V16QImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
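
/* The vpermq above presumably uses the quadword order { 2, 3, 0, 1 },
   i.e. it swaps the two 128-bit halves of the vpshufb result so that the
   bytes whose source lies in the other lane land where the mask expects
   them before the final vpor.  */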
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode and V16QImode operand
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}

/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
   with two "and" and "pack" or two "shift" and "pack" insns.  We should
   have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
	 general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (!odd)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
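
/* In expand_vec_perm_even_odd_pack, even extraction keeps the low half of
   every wide element (an AND with the low-half mask C followed by an
   unsigned-saturating pack), while odd extraction instead shifts each wide
   element right by S bits so the high half lands in the low half before
   the same pack.  For the 256-bit AVX2 modes the pack interleaves 128-bit
   lanes, hence the trailing vpermq, presumably with quadword order
   { 0, 2, 1, 3 }.  */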
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insns for "odd"
   and two "truncs" and one concat insn for "even."
   Have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  if (odd)
    {
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
   and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x1)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V4SImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
   extract-even and extract-odd permutations.  */

static bool
expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
{
  unsigned i, odd, nelt = d->nelt;

  odd = d->perm[0];
  if (odd != 0 && odd != 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  return expand_vec_perm_even_odd_1 (d, odd);
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      do
	{
	  rtx (*gen) (rtx, rtx, rtx)
	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				 : gen_vec_interleave_lowv8hi;

	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (V4SImode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);
      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V64QImode:
    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    default:
      gcc_unreachable ();
    }
}
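
/* The V8HImode/V16QImode path above repeatedly interleaves the vector with
   itself, each pass doubling the element width, until the element to
   broadcast fills a whole V4SImode lane; a single pshufd (via
   expand_vselect) then replicates that 32-bit lane across the vector.  */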
/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
   broadcast permutations.  */

static bool
expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{
  unsigned i, elt, nelt = d->nelt;

  if (!d->one_operand_p)
    return false;

  elt = d->perm[0];
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != elt)
      return false;

  return expand_vec_perm_broadcast_1 (d);
}
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.  */

static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
/* Implement arbitrary permutation of two V32QImode and V16QImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4] = { false, false, false, false };

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }

  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X].  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly...  */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  return false;
}
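
/* The helpers above are tried in order of increasing sequence length: a
   single instruction first, then two-, three- and four-instruction
   sequences, and only then the even/odd and multi-vpshufb fall-backs, so
   the first helper that succeeds also yields the cheapest known
   expansion.  */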
/* If a permutation only uses one operand, make it clear.  Returns true
   if the permutation references both operands.  */

static bool
canonicalize_perm (struct expand_vec_perm_d *d)
{
  int i, which, nelt = d->nelt;

  for (i = which = 0; i < nelt; ++i)
    which |= (d->perm[i] < nelt ? 1 : 2);

  d->one_operand_p = true;
  switch (which)
    {
    default:
      gcc_unreachable ();

    case 3:
      if (!rtx_equal_p (d->op0, d->op1))
	{
	  d->one_operand_p = false;
	  break;
	}
      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 operand.  */
      /* FALLTHRU */

    case 2:
      for (i = 0; i < nelt; ++i)
	d->perm[i] &= nelt - 1;
      d->op0 = d->op1;
      break;

    case 1:
      d->op1 = d->op0;
      break;
    }

  return (which == 3);
}
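
/* WHICH accumulates bit 0 when some element selects from the first operand
   and bit 1 when some element selects from the second, so the return value
   (which == 3) tells the caller whether the permutation still references
   both inputs after the folding above.  */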
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
			       rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512BW)
	return false;
      if (d.testing_p)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512BW)
	return false;
      if (d.testing_p)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps or pshufd.  */
      if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  op2_l = op2_h = op2;
  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform the operation.  */
  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
			       uns_p, OPTAB_DIRECT);
  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
			       uns_p, OPTAB_DIRECT);
  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remains the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
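
/* As a quick check of the non-full-interleave formula above for
   d.nelt == 32: i == 8 gives ((16) & 14) + 32 + 0 == 32 and i == 16 gives
   0 + 0 + 16 == 16, matching the permutation
   0,2,..14, 32,34,..46, 16,18,..30, 48,..62 quoted in the comment.  */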
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
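
/* The PMULDQ-less fall-back above relies on the identity
   (a - sa*2^32) * (b - sb*2^32) == a*b - (sa*b + sb*a) * 2^32 (mod 2^64),
   where sa/sb are 1 for negative inputs and 0 otherwise: S1 and S2 are the
   all-ones compare masks, so the two extra unsigned widening multiplies
   produce exactly the correction terms that are summed, shifted into the
   upper 32 bits and added to LO(A)*LO(B).  */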
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
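
/* ix86_expand_sse2_mulv4si3 thus builds a 32x32->32 multiply out of the
   two 32x32->64 even/odd widening multiplies: RES_3 and RES_4 carry the
   products in their low halves, the two pshufd copies move the element-2
   results down to element 1, and the final interleave restores the
   original element order.  */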
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1), GEN_INT (0),
				    GEN_INT (3), GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all.  */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
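
/* The generic path above implements the usual 64x64->64 decomposition
   a * b == lo(a)*lo(b) + ((hi(a)*lo(b) + hi(b)*lo(a)) << 32) (mod 2^64):
   T1 is the unsigned widening product of the low halves, T4/T5 are the
   cross products of the shifted-down high halves with the opposite low
   halves, and their sum is shifted back up by 32 bits before the final
   add.  */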
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }
  return false;
}
/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead
	 of 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
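
/* The shift/xor/subtract identity used for V4SImode works because for
   negative X the arithmetic shift yields all ones (-1), so
   (X ^ -1) - (-1) == ~X + 1 == -X, while for non-negative X the shift is
   zero and the expression reduces to X.  */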
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

static bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
    case E_TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

static machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
  switch (mode)
    {
    case E_V8DImode:
    case E_V4DImode:
      return V2DImode;
    case E_V16SImode:
    case E_V8SImode:
      return V4SImode;
    case E_V32HImode:
    case E_V16HImode:
      return V8HImode;
    case E_V64QImode:
    case E_V32QImode:
      return V16QImode;
    case E_V16SFmode:
    case E_V8SFmode:
      return V4SFmode;
    case E_V8DFmode:
    case E_V4DFmode:
      return V2DFmode;
    default:
      return mode;
    }
}
/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}

#include "gt-i386-expand.h"