/* Copyright (C) 1988-2022 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-options.h"
#include "i386-builtins.h"
#include "i386-expand.h"
/* Split one or more double-mode RTL references into pairs of half-mode
   references.  The RTL can be REG, offsettable MEM, integer constant, or
   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
   split and "num" is its length.  lo_half and hi_half are output arrays
   that parallel "operands".  */
void
split_double_mode (machine_mode mode, rtx operands[],
		   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    case E_P2HImode:
      half_mode = HImode;
      break;
    case E_P2QImode:
      half_mode = QImode;
      break;
    default:
      gcc_unreachable ();
    }

  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
	 but we still have to handle it.  */
      if (MEM_P (op))
	{
	  if (mem_op && rtx_equal_p (op, mem_op))
	    {
	      lo_half[num] = lo_half[mem_num];
	      hi_half[num] = hi_half[mem_num];
	    }
	  else
	    {
	      mem_op = op;
	      mem_num = num;
	      lo_half[num] = adjust_address (op, half_mode, 0);
	      hi_half[num] = adjust_address (op, half_mode, byte);
	    }
	}
      else
	{
	  lo_half[num] = simplify_gen_subreg (half_mode, op,
					      GET_MODE (op) == VOIDmode
					      ? mode : GET_MODE (op), 0);

	  rtx tmp = simplify_gen_subreg (half_mode, op,
					 GET_MODE (op) == VOIDmode
					 ? mode : GET_MODE (op), byte);
	  /* simplify_gen_subreg will return NULL RTX for the
	     high half of the paradoxical subreg.  */
	  hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
	}
    }
}
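
/* For example (an illustrative note, not part of the upstream sources):
   on 32-bit x86, splitting the single DImode operand

       operands[0] = (reg:DI 100)

   with split_double_mode (DImode, operands, 1, lo, hi) would be expected
   to yield the two SImode word halves

       lo[0] = (subreg:SI (reg:DI 100) 0)
       hi[0] = (subreg:SI (reg:DI 100) 4)

   i.e. byte offsets 0 and GET_MODE_SIZE (SImode) of the original value.  */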
/* Emit the double word assignment DST = { LO, HI }.  */

void
split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
{
  rtx dlo, dhi;
  int deleted_move_count = 0;
  split_double_mode (mode, &dst, 1, &dlo, &dhi);
  if (!rtx_equal_p (dlo, hi))
    {
      if (!rtx_equal_p (dlo, lo))
	emit_move_insn (dlo, lo);
      else
	deleted_move_count++;
      if (!rtx_equal_p (dhi, hi))
	emit_move_insn (dhi, hi);
      else
	deleted_move_count++;
    }
  else if (!rtx_equal_p (lo, dhi))
    {
      if (!rtx_equal_p (dhi, hi))
	emit_move_insn (dhi, hi);
      else
	deleted_move_count++;
      if (!rtx_equal_p (dlo, lo))
	emit_move_insn (dlo, lo);
      else
	deleted_move_count++;
    }
  else if (mode == TImode)
    emit_insn (gen_swapdi (dlo, dhi));
  else
    emit_insn (gen_swapsi (dlo, dhi));

  if (deleted_move_count == 2)
    emit_note (NOTE_INSN_DELETED);
}
/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
   for the target.  */

void
ix86_expand_clear (rtx dest)
{
  rtx tmp;

  /* We play register width games, which are only valid after reload.  */
  gcc_assert (reload_completed);

  /* Avoid HImode and its attendant prefix byte.  */
  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    dest = gen_rtx_REG (SImode, REGNO (dest));
  tmp = gen_rtx_SET (dest, const0_rtx);

  if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    {
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    }

  emit_insn (tmp);
}
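
/* Illustrative note (not part of the upstream sources): clearing %eax this
   way normally emits "xorl %eax, %eax", which is why the SET is wrapped in
   a PARALLEL with a flags-register CLOBBER above; only when TARGET_USE_MOV0
   is set and we are not optimizing for size does it fall back to
   "movl $0, %eax", which leaves the flags untouched.  */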
/* Return true if V can be broadcasted from an integer of WIDTH bits
   which is returned in VAL_BROADCAST.  Otherwise, return false.  */

static bool
ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
		HOST_WIDE_INT &val_broadcast)
{
  wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
  val_broadcast = wi::extract_uhwi (val, 0, width);
  for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
    {
      HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
      if (val_broadcast != each)
	return false;
    }
  val_broadcast = sext_hwi (val_broadcast, width);
  return true;
}
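
/* Illustrative example (not part of the upstream sources): for
   v = 0x4242424242424242 and width = 8 every 8-bit chunk equals 0x42, so
   the function returns true with val_broadcast = 0x42, whereas
   v = 0x0000000100000001 only broadcasts from width = 32, with
   val_broadcast = 1.  */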
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */

static rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    return nullptr;

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!TARGET_AVX
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode))
    return nullptr;

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
  if (TARGET_AVX2
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
			 val_broadcast))
    broadcast_mode = QImode;
  else if (TARGET_AVX2
	   && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
			      val_broadcast))
    broadcast_mode = HImode;
  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
			   val_broadcast))
    broadcast_mode = SImode;
  else if (TARGET_64BIT
	   && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
			      val_broadcast))
    broadcast_mode = DImode;
  else
    return nullptr;

  /* Check if OP can be broadcasted from VAL.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))
      return nullptr;

  unsigned int nunits = (GET_MODE_SIZE (mode)
			 / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
    gcc_unreachable ();
  rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
					       target,
					       GEN_INT (val_broadcast));
  gcc_assert (ok);
  target = lowpart_subreg (mode, target, vector_mode);
  return target;
}
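
/* E.g. a V4DImode CONST_WIDE_INT whose four 64-bit elements all equal
   0x0101010101010101 can be materialized as a QImode broadcast of 1
   (vpbroadcastb under AVX2) instead of a 32-byte constant-pool load.
   (Illustrative note, not part of the upstream sources.)  */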
310 ix86_expand_move (machine_mode mode
, rtx operands
[])
313 rtx tmp
, addend
= NULL_RTX
;
314 enum tls_model model
;
319 /* Avoid complex sets of likely spilled hard registers before reload. */
320 if (!ix86_hardreg_mov_ok (op0
, op1
))
322 tmp
= gen_reg_rtx (mode
);
324 ix86_expand_move (mode
, operands
);
330 switch (GET_CODE (op1
))
335 if (GET_CODE (tmp
) != PLUS
336 || GET_CODE (XEXP (tmp
, 0)) != SYMBOL_REF
)
340 addend
= XEXP (tmp
, 1);
344 model
= SYMBOL_REF_TLS_MODEL (op1
);
347 op1
= legitimize_tls_address (op1
, model
, true);
348 else if (ix86_force_load_from_GOT_p (op1
))
350 /* Load the external function address via GOT slot to avoid PLT. */
351 op1
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, op1
),
355 op1
= gen_rtx_CONST (Pmode
, op1
);
356 op1
= gen_const_mem (Pmode
, op1
);
357 set_mem_alias_set (op1
, ix86_GOT_alias_set ());
361 tmp
= legitimize_pe_coff_symbol (op1
, addend
!= NULL_RTX
);
377 op1
= force_operand (op1
, NULL_RTX
);
378 op1
= expand_simple_binop (Pmode
, PLUS
, op1
, addend
,
379 op0
, 1, OPTAB_DIRECT
);
382 op1
= force_operand (op1
, op0
);
387 op1
= convert_to_mode (mode
, op1
, 1);
393 if ((flag_pic
|| MACHOPIC_INDIRECT
)
394 && symbolic_operand (op1
, mode
))
396 if (TARGET_MACHO
&& !TARGET_64BIT
)
400 if (MACHOPIC_INDIRECT
)
402 rtx temp
= (op0
&& REG_P (op0
) && mode
== Pmode
)
403 ? op0
: gen_reg_rtx (Pmode
);
404 op1
= machopic_indirect_data_reference (op1
, temp
);
406 op1
= machopic_legitimize_pic_address (op1
, mode
,
407 temp
== op1
? 0 : temp
);
409 if (op0
!= op1
&& GET_CODE (op0
) != MEM
)
411 rtx insn
= gen_rtx_SET (op0
, op1
);
415 if (GET_CODE (op0
) == MEM
)
416 op1
= force_reg (Pmode
, op1
);
420 if (GET_CODE (temp
) != REG
)
421 temp
= gen_reg_rtx (Pmode
);
422 temp
= legitimize_pic_address (op1
, temp
);
433 op1
= force_reg (mode
, op1
);
434 else if (!(TARGET_64BIT
&& x86_64_movabs_operand (op1
, DImode
)))
436 rtx reg
= can_create_pseudo_p () ? NULL_RTX
: op0
;
437 op1
= legitimize_pic_address (op1
, reg
);
440 op1
= convert_to_mode (mode
, op1
, 1);
447 && (PUSH_ROUNDING (GET_MODE_SIZE (mode
)) != GET_MODE_SIZE (mode
)
448 || !push_operand (op0
, mode
))
450 op1
= force_reg (mode
, op1
);
452 if (push_operand (op0
, mode
)
453 && ! general_no_elim_operand (op1
, mode
))
454 op1
= copy_to_mode_reg (mode
, op1
);
456 /* Force large constants in 64bit compilation into register
457 to get them CSEed. */
458 if (can_create_pseudo_p ()
459 && (mode
== DImode
) && TARGET_64BIT
460 && immediate_operand (op1
, mode
)
461 && !x86_64_zext_immediate_operand (op1
, VOIDmode
)
462 && !register_operand (op0
, mode
)
464 op1
= copy_to_mode_reg (mode
, op1
);
466 if (can_create_pseudo_p ())
468 if (CONST_DOUBLE_P (op1
))
470 /* If we are loading a floating point constant to a
471 register, force the value to memory now, since we'll
472 get better code out the back end. */
474 op1
= validize_mem (force_const_mem (mode
, op1
));
475 if (!register_operand (op0
, mode
))
477 rtx temp
= gen_reg_rtx (mode
);
478 emit_insn (gen_rtx_SET (temp
, op1
));
479 emit_move_insn (op0
, temp
);
483 else if (GET_MODE_SIZE (mode
) >= 16)
485 rtx tmp
= ix86_convert_const_wide_int_to_broadcast
486 (GET_MODE (op0
), op1
);
493 emit_insn (gen_rtx_SET (op0
, op1
));
/* OP is a memref of CONST_VECTOR, return scalar constant mem
   if CONST_VECTOR is a vec_duplicate, else return NULL.  */
499 ix86_broadcast_from_constant (machine_mode mode
, rtx op
)
501 int nunits
= GET_MODE_NUNITS (mode
);
505 /* Don't use integer vector broadcast if we can't move from GPR to SSE
506 register directly. */
507 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
508 && INTEGRAL_MODE_P (mode
))
511 /* Convert CONST_VECTOR to a non-standard SSE constant integer
512 broadcast only if vector broadcast is available. */
515 && (GET_MODE_INNER (mode
) == SImode
516 || GET_MODE_INNER (mode
) == DImode
))
517 || FLOAT_MODE_P (mode
))
518 || standard_sse_constant_p (op
, mode
))
521 /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
522 We can still put 64-bit integer constant in memory when
523 avx512 embed broadcast is available. */
524 if (GET_MODE_INNER (mode
) == DImode
&& !TARGET_64BIT
526 || (GET_MODE_SIZE (mode
) < 64 && !TARGET_AVX512VL
)))
529 if (GET_MODE_INNER (mode
) == TImode
)
532 rtx constant
= get_pool_constant (XEXP (op
, 0));
533 if (GET_CODE (constant
) != CONST_VECTOR
)
536 /* There could be some rtx like
537 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
538 but with "*.LC1" refer to V2DI constant vector. */
539 if (GET_MODE (constant
) != mode
)
541 constant
= simplify_subreg (mode
, constant
, GET_MODE (constant
),
543 if (constant
== nullptr || GET_CODE (constant
) != CONST_VECTOR
)
547 rtx first
= XVECEXP (constant
, 0, 0);
549 for (int i
= 1; i
< nunits
; ++i
)
551 rtx tmp
= XVECEXP (constant
, 0, i
);
552 /* Vector duplicate value. */
553 if (!rtx_equal_p (tmp
, first
))
561 ix86_expand_vector_move (machine_mode mode
, rtx operands
[])
563 rtx op0
= operands
[0], op1
= operands
[1];
564 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
565 psABI since the biggest alignment is 4 byte for IA MCU psABI. */
566 unsigned int align
= (TARGET_IAMCU
567 ? GET_MODE_BITSIZE (mode
)
568 : GET_MODE_ALIGNMENT (mode
));
570 if (push_operand (op0
, VOIDmode
))
571 op0
= emit_move_resolve_push (mode
, op0
);
573 /* Force constants other than zero into memory. We do not know how
574 the instructions used to build constants modify the upper 64 bits
575 of the register, once we have that information we may be able
576 to handle some of them more efficiently. */
577 if (can_create_pseudo_p ()
580 && CONSTANT_P (SUBREG_REG (op1
))))
581 && ((register_operand (op0
, mode
)
582 && !standard_sse_constant_p (op1
, mode
))
583 /* ix86_expand_vector_move_misalign() does not like constants. */
584 || (SSE_REG_MODE_P (mode
)
586 && MEM_ALIGN (op0
) < align
)))
590 machine_mode imode
= GET_MODE (SUBREG_REG (op1
));
591 rtx r
= force_const_mem (imode
, SUBREG_REG (op1
));
593 r
= validize_mem (r
);
595 r
= force_reg (imode
, SUBREG_REG (op1
));
596 op1
= simplify_gen_subreg (mode
, r
, imode
, SUBREG_BYTE (op1
));
600 machine_mode mode
= GET_MODE (op0
);
601 rtx tmp
= ix86_convert_const_wide_int_to_broadcast
604 op1
= validize_mem (force_const_mem (mode
, op1
));
610 if (can_create_pseudo_p ()
611 && GET_MODE_SIZE (mode
) >= 16
612 && VECTOR_MODE_P (mode
)
614 && SYMBOL_REF_P (XEXP (op1
, 0))
615 && CONSTANT_POOL_ADDRESS_P (XEXP (op1
, 0))))
617 rtx first
= ix86_broadcast_from_constant (mode
, op1
);
618 if (first
!= nullptr)
620 /* Broadcast to XMM/YMM/ZMM register from an integer
621 constant or scalar mem. */
622 op1
= gen_reg_rtx (mode
);
623 if (FLOAT_MODE_P (mode
)
624 || (!TARGET_64BIT
&& GET_MODE_INNER (mode
) == DImode
))
625 first
= force_const_mem (GET_MODE_INNER (mode
), first
);
626 bool ok
= ix86_expand_vector_init_duplicate (false, mode
,
629 emit_move_insn (op0
, op1
);
634 /* We need to check memory alignment for SSE mode since attribute
635 can make operands unaligned. */
636 if (can_create_pseudo_p ()
637 && SSE_REG_MODE_P (mode
)
638 && ((MEM_P (op0
) && (MEM_ALIGN (op0
) < align
))
639 || (MEM_P (op1
) && (MEM_ALIGN (op1
) < align
))))
643 /* ix86_expand_vector_move_misalign() does not like both
644 arguments in memory. */
645 if (!register_operand (op0
, mode
)
646 && !register_operand (op1
, mode
))
648 rtx scratch
= ix86_gen_scratch_sse_rtx (mode
);
649 emit_move_insn (scratch
, op1
);
653 tmp
[0] = op0
; tmp
[1] = op1
;
654 ix86_expand_vector_move_misalign (mode
, tmp
);
658 /* Special case TImode to V1TImode conversions, via V2DI. */
661 && GET_MODE (SUBREG_REG (op1
)) == TImode
662 && TARGET_64BIT
&& TARGET_SSE
663 && can_create_pseudo_p ())
665 rtx tmp
= gen_reg_rtx (V2DImode
);
666 rtx lo
= gen_reg_rtx (DImode
);
667 rtx hi
= gen_reg_rtx (DImode
);
668 emit_move_insn (lo
, gen_lowpart (DImode
, SUBREG_REG (op1
)));
669 emit_move_insn (hi
, gen_highpart (DImode
, SUBREG_REG (op1
)));
670 emit_insn (gen_vec_concatv2di (tmp
, lo
, hi
));
671 emit_move_insn (op0
, gen_lowpart (V1TImode
, tmp
));
675 /* If operand0 is a hard register, make operand1 a pseudo. */
676 if (can_create_pseudo_p ()
677 && !ix86_hardreg_mov_ok (op0
, op1
))
679 rtx tmp
= gen_reg_rtx (GET_MODE (op0
));
680 emit_move_insn (tmp
, op1
);
681 emit_move_insn (op0
, tmp
);
685 /* Make operand1 a register if it isn't already. */
686 if (can_create_pseudo_p ()
687 && !register_operand (op0
, mode
)
688 && !register_operand (op1
, mode
))
690 rtx tmp
= ix86_gen_scratch_sse_rtx (GET_MODE (op0
));
691 emit_move_insn (tmp
, op1
);
692 emit_move_insn (op0
, tmp
);
696 emit_insn (gen_rtx_SET (op0
, op1
));
/* Split 32-byte AVX unaligned load and store if needed.  */
702 ix86_avx256_split_vector_move_misalign (rtx op0
, rtx op1
)
705 rtx (*extract
) (rtx
, rtx
, rtx
);
708 if ((MEM_P (op1
) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD
)
709 || (MEM_P (op0
) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE
))
711 emit_insn (gen_rtx_SET (op0
, op1
));
715 rtx orig_op0
= NULL_RTX
;
716 mode
= GET_MODE (op0
);
717 switch (GET_MODE_CLASS (mode
))
719 case MODE_VECTOR_INT
:
721 if (mode
!= V32QImode
)
726 op0
= gen_reg_rtx (V32QImode
);
729 op0
= gen_lowpart (V32QImode
, op0
);
730 op1
= gen_lowpart (V32QImode
, op1
);
734 case MODE_VECTOR_FLOAT
:
745 extract
= gen_avx_vextractf128v32qi
;
749 extract
= gen_avx_vextractf128v16bf
;
753 extract
= gen_avx_vextractf128v16hf
;
757 extract
= gen_avx_vextractf128v8sf
;
761 extract
= gen_avx_vextractf128v4df
;
768 rtx r
= gen_reg_rtx (mode
);
769 m
= adjust_address (op1
, mode
, 0);
770 emit_move_insn (r
, m
);
771 m
= adjust_address (op1
, mode
, 16);
772 r
= gen_rtx_VEC_CONCAT (GET_MODE (op0
), r
, m
);
773 emit_move_insn (op0
, r
);
775 else if (MEM_P (op0
))
777 m
= adjust_address (op0
, mode
, 0);
778 emit_insn (extract (m
, op1
, const0_rtx
));
779 m
= adjust_address (op0
, mode
, 16);
780 emit_insn (extract (m
, copy_rtx (op1
), const1_rtx
));
786 emit_move_insn (orig_op0
, gen_lowpart (GET_MODE (orig_op0
), op0
));
789 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
790 straight to ix86_expand_vector_move. */
791 /* Code generation for scalar reg-reg moves of single and double precision data:
792 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
796 if (x86_sse_partial_reg_dependency == true)
801 Code generation for scalar loads of double precision data:
802 if (x86_sse_split_regs == true)
803 movlpd mem, reg (gas syntax)
807 Code generation for unaligned packed loads of single precision data
808 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
809 if (x86_sse_unaligned_move_optimal)
812 if (x86_sse_partial_reg_dependency == true)
824 Code generation for unaligned packed loads of double precision data
825 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
826 if (x86_sse_unaligned_move_optimal)
829 if (x86_sse_split_regs == true)
842 ix86_expand_vector_move_misalign (machine_mode mode
, rtx operands
[])
849 /* Use unaligned load/store for AVX512 or when optimizing for size. */
850 if (GET_MODE_SIZE (mode
) == 64 || optimize_insn_for_size_p ())
852 emit_insn (gen_rtx_SET (op0
, op1
));
858 if (GET_MODE_SIZE (mode
) == 32)
859 ix86_avx256_split_vector_move_misalign (op0
, op1
);
861 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
862 emit_insn (gen_rtx_SET (op0
, op1
));
866 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
867 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
)
869 emit_insn (gen_rtx_SET (op0
, op1
));
873 /* ??? If we have typed data, then it would appear that using
874 movdqu is the only way to get unaligned data loaded with
876 if (TARGET_SSE2
&& GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
)
878 emit_insn (gen_rtx_SET (op0
, op1
));
884 if (TARGET_SSE2
&& mode
== V2DFmode
)
888 /* When SSE registers are split into halves, we can avoid
889 writing to the top half twice. */
890 if (TARGET_SSE_SPLIT_REGS
)
897 /* ??? Not sure about the best option for the Intel chips.
898 The following would seem to satisfy; the register is
899 entirely cleared, breaking the dependency chain. We
900 then store to the upper half, with a dependency depth
901 of one. A rumor has it that Intel recommends two movsd
902 followed by an unpacklpd, but this is unconfirmed. And
903 given that the dependency depth of the unpacklpd would
904 still be one, I'm not sure why this would be better. */
905 zero
= CONST0_RTX (V2DFmode
);
908 m
= adjust_address (op1
, DFmode
, 0);
909 emit_insn (gen_sse2_loadlpd (op0
, zero
, m
));
910 m
= adjust_address (op1
, DFmode
, 8);
911 emit_insn (gen_sse2_loadhpd (op0
, op0
, m
));
917 if (mode
!= V4SFmode
)
918 t
= gen_reg_rtx (V4SFmode
);
922 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY
)
923 emit_move_insn (t
, CONST0_RTX (V4SFmode
));
927 m
= adjust_address (op1
, V2SFmode
, 0);
928 emit_insn (gen_sse_loadlps (t
, t
, m
));
929 m
= adjust_address (op1
, V2SFmode
, 8);
930 emit_insn (gen_sse_loadhps (t
, t
, m
));
931 if (mode
!= V4SFmode
)
932 emit_move_insn (op0
, gen_lowpart (mode
, t
));
935 else if (MEM_P (op0
))
937 if (TARGET_SSE2
&& mode
== V2DFmode
)
939 m
= adjust_address (op0
, DFmode
, 0);
940 emit_insn (gen_sse2_storelpd (m
, op1
));
941 m
= adjust_address (op0
, DFmode
, 8);
942 emit_insn (gen_sse2_storehpd (m
, op1
));
946 if (mode
!= V4SFmode
)
947 op1
= gen_lowpart (V4SFmode
, op1
);
949 m
= adjust_address (op0
, V2SFmode
, 0);
950 emit_insn (gen_sse_storelps (m
, op1
));
951 m
= adjust_address (op0
, V2SFmode
, 8);
952 emit_insn (gen_sse_storehps (m
, copy_rtx (op1
)));
/* Move bits 64:95 to bits 32:63.  */
962 ix86_move_vector_high_sse_to_mmx (rtx op
)
964 rtx mask
= gen_rtx_PARALLEL (VOIDmode
,
965 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
966 GEN_INT (0), GEN_INT (0)));
967 rtx dest
= lowpart_subreg (V4SImode
, op
, GET_MODE (op
));
968 op
= gen_rtx_VEC_SELECT (V4SImode
, dest
, mask
);
969 rtx insn
= gen_rtx_SET (dest
, op
);
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */
976 ix86_split_mmx_pack (rtx operands
[], enum rtx_code code
)
978 rtx op0
= operands
[0];
979 rtx op1
= operands
[1];
980 rtx op2
= operands
[2];
982 machine_mode dmode
= GET_MODE (op0
);
983 machine_mode smode
= GET_MODE (op1
);
984 machine_mode inner_dmode
= GET_MODE_INNER (dmode
);
985 machine_mode inner_smode
= GET_MODE_INNER (smode
);
987 /* Get the corresponding SSE mode for destination. */
988 int nunits
= 16 / GET_MODE_SIZE (inner_dmode
);
989 machine_mode sse_dmode
= mode_for_vector (GET_MODE_INNER (dmode
),
991 machine_mode sse_half_dmode
= mode_for_vector (GET_MODE_INNER (dmode
),
992 nunits
/ 2).require ();
994 /* Get the corresponding SSE mode for source. */
995 nunits
= 16 / GET_MODE_SIZE (inner_smode
);
996 machine_mode sse_smode
= mode_for_vector (GET_MODE_INNER (smode
),
999 /* Generate SSE pack with signed/unsigned saturation. */
1000 rtx dest
= lowpart_subreg (sse_dmode
, op0
, GET_MODE (op0
));
1001 op1
= lowpart_subreg (sse_smode
, op1
, GET_MODE (op1
));
1002 op2
= lowpart_subreg (sse_smode
, op2
, GET_MODE (op2
));
1004 op1
= gen_rtx_fmt_e (code
, sse_half_dmode
, op1
);
1005 op2
= gen_rtx_fmt_e (code
, sse_half_dmode
, op2
);
1006 rtx insn
= gen_rtx_SET (dest
, gen_rtx_VEC_CONCAT (sse_dmode
,
1010 ix86_move_vector_high_sse_to_mmx (op0
);
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
1016 ix86_split_mmx_punpck (rtx operands
[], bool high_p
)
1018 rtx op0
= operands
[0];
1019 rtx op1
= operands
[1];
1020 rtx op2
= operands
[2];
1021 machine_mode mode
= GET_MODE (op0
);
1023 /* The corresponding SSE mode. */
1024 machine_mode sse_mode
, double_sse_mode
;
1030 sse_mode
= V16QImode
;
1031 double_sse_mode
= V32QImode
;
1032 mask
= gen_rtx_PARALLEL (VOIDmode
,
1034 GEN_INT (0), GEN_INT (16),
1035 GEN_INT (1), GEN_INT (17),
1036 GEN_INT (2), GEN_INT (18),
1037 GEN_INT (3), GEN_INT (19),
1038 GEN_INT (4), GEN_INT (20),
1039 GEN_INT (5), GEN_INT (21),
1040 GEN_INT (6), GEN_INT (22),
1041 GEN_INT (7), GEN_INT (23)));
1046 sse_mode
= V8HImode
;
1047 double_sse_mode
= V16HImode
;
1048 mask
= gen_rtx_PARALLEL (VOIDmode
,
1050 GEN_INT (0), GEN_INT (8),
1051 GEN_INT (1), GEN_INT (9),
1052 GEN_INT (2), GEN_INT (10),
1053 GEN_INT (3), GEN_INT (11)));
1057 sse_mode
= V4SImode
;
1058 double_sse_mode
= V8SImode
;
1059 mask
= gen_rtx_PARALLEL (VOIDmode
,
1061 GEN_INT (0), GEN_INT (4),
1062 GEN_INT (1), GEN_INT (5)));
1066 sse_mode
= V4SFmode
;
1067 double_sse_mode
= V8SFmode
;
1068 mask
= gen_rtx_PARALLEL (VOIDmode
,
1070 GEN_INT (0), GEN_INT (4),
1071 GEN_INT (1), GEN_INT (5)));
1078 /* Generate SSE punpcklXX. */
1079 rtx dest
= lowpart_subreg (sse_mode
, op0
, GET_MODE (op0
));
1080 op1
= lowpart_subreg (sse_mode
, op1
, GET_MODE (op1
));
1081 op2
= lowpart_subreg (sse_mode
, op2
, GET_MODE (op2
));
1083 op1
= gen_rtx_VEC_CONCAT (double_sse_mode
, op1
, op2
);
1084 op2
= gen_rtx_VEC_SELECT (sse_mode
, op1
, mask
);
1085 rtx insn
= gen_rtx_SET (dest
, op2
);
1088 /* Move high bits to low bits. */
1091 if (sse_mode
== V4SFmode
)
1093 mask
= gen_rtx_PARALLEL (VOIDmode
,
1094 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1095 GEN_INT (4), GEN_INT (5)));
1096 op2
= gen_rtx_VEC_CONCAT (V8SFmode
, dest
, dest
);
1097 op1
= gen_rtx_VEC_SELECT (V4SFmode
, op2
, mask
);
1101 int sz
= GET_MODE_SIZE (mode
);
1104 mask
= gen_rtx_PARALLEL (VOIDmode
,
1105 gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1106 GEN_INT (0), GEN_INT (1)));
1108 mask
= gen_rtx_PARALLEL (VOIDmode
,
1109 gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1110 GEN_INT (0), GEN_INT (1)));
1114 dest
= lowpart_subreg (V4SImode
, dest
, GET_MODE (dest
));
1115 op1
= gen_rtx_VEC_SELECT (V4SImode
, dest
, mask
);
1118 insn
= gen_rtx_SET (dest
, op1
);
/* Helper function of ix86_fixup_binary_operands to canonicalize
   operand order.  Returns true if the operands should be swapped.  */
1127 ix86_swap_binary_operands_p (enum rtx_code code
, machine_mode mode
,
1130 rtx dst
= operands
[0];
1131 rtx src1
= operands
[1];
1132 rtx src2
= operands
[2];
1134 /* If the operation is not commutative, we can't do anything. */
1135 if (GET_RTX_CLASS (code
) != RTX_COMM_ARITH
1136 && GET_RTX_CLASS (code
) != RTX_COMM_COMPARE
)
1139 /* Highest priority is that src1 should match dst. */
1140 if (rtx_equal_p (dst
, src1
))
1142 if (rtx_equal_p (dst
, src2
))
1145 /* Next highest priority is that immediate constants come second. */
1146 if (immediate_operand (src2
, mode
))
1148 if (immediate_operand (src1
, mode
))
1151 /* Lowest priority is that memory references should come second. */
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.  */
1166 ix86_fixup_binary_operands (enum rtx_code code
, machine_mode mode
,
1169 rtx dst
= operands
[0];
1170 rtx src1
= operands
[1];
1171 rtx src2
= operands
[2];
1173 /* Canonicalize operand order. */
1174 if (ix86_swap_binary_operands_p (code
, mode
, operands
))
1176 /* It is invalid to swap operands of different modes. */
1177 gcc_assert (GET_MODE (src1
) == GET_MODE (src2
));
1179 std::swap (src1
, src2
);
1182 /* Both source operands cannot be in memory. */
1183 if (MEM_P (src1
) && MEM_P (src2
))
1185 /* Optimization: Only read from memory once. */
1186 if (rtx_equal_p (src1
, src2
))
1188 src2
= force_reg (mode
, src2
);
1191 else if (rtx_equal_p (dst
, src1
))
1192 src2
= force_reg (mode
, src2
);
1194 src1
= force_reg (mode
, src1
);
1197 /* If the destination is memory, and we do not have matching source
1198 operands, do things in registers. */
1199 if (MEM_P (dst
) && !rtx_equal_p (dst
, src1
))
1200 dst
= gen_reg_rtx (mode
);
1202 /* Source 1 cannot be a constant. */
1203 if (CONSTANT_P (src1
))
1204 src1
= force_reg (mode
, src1
);
1206 /* Source 1 cannot be a non-matching memory. */
1207 if (MEM_P (src1
) && !rtx_equal_p (dst
, src1
))
1208 src1
= force_reg (mode
, src1
);
1210 /* Improve address combine. */
1212 && GET_MODE_CLASS (mode
) == MODE_INT
1214 src2
= force_reg (mode
, src2
);
1221 /* Similarly, but assume that the destination has already been
1225 ix86_fixup_binary_operands_no_copy (enum rtx_code code
,
1226 machine_mode mode
, rtx operands
[])
1228 rtx dst
= ix86_fixup_binary_operands (code
, mode
, operands
);
1229 gcc_assert (dst
== operands
[0]);
/* Attempt to expand a binary operator.  Make the expansion closer to the
   actual machine than just general_operand, which will allow 3 separate
   memory references (one output, two input) in a single insn.  */
1237 ix86_expand_binary_operator (enum rtx_code code
, machine_mode mode
,
1240 rtx src1
, src2
, dst
, op
, clob
;
1242 dst
= ix86_fixup_binary_operands (code
, mode
, operands
);
1246 /* Emit the instruction. */
1248 op
= gen_rtx_SET (dst
, gen_rtx_fmt_ee (code
, mode
, src1
, src2
));
1250 if (reload_completed
1252 && !rtx_equal_p (dst
, src1
))
1254 /* This is going to be an LEA; avoid splitting it later. */
1259 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1260 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
1263 /* Fix up the destination if needed. */
1264 if (dst
!= operands
[0])
1265 emit_move_insn (operands
[0], dst
);
1268 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1269 the given OPERANDS. */
1272 ix86_expand_vector_logical_operator (enum rtx_code code
, machine_mode mode
,
1275 rtx op1
= NULL_RTX
, op2
= NULL_RTX
;
1276 if (SUBREG_P (operands
[1]))
1281 else if (SUBREG_P (operands
[2]))
1286 /* Optimize (__m128i) d | (__m128i) e and similar code
1287 when d and e are float vectors into float vector logical
1288 insn. In C/C++ without using intrinsics there is no other way
1289 to express vector logical operation on float vectors than
1290 to cast them temporarily to integer vectors. */
1292 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1293 && (SUBREG_P (op2
) || GET_CODE (op2
) == CONST_VECTOR
)
1294 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1
))) == MODE_VECTOR_FLOAT
1295 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1
))) == GET_MODE_SIZE (mode
)
1296 && SUBREG_BYTE (op1
) == 0
1297 && (GET_CODE (op2
) == CONST_VECTOR
1298 || (GET_MODE (SUBREG_REG (op1
)) == GET_MODE (SUBREG_REG (op2
))
1299 && SUBREG_BYTE (op2
) == 0))
1300 && can_create_pseudo_p ())
1303 switch (GET_MODE (SUBREG_REG (op1
)))
1311 dst
= gen_reg_rtx (GET_MODE (SUBREG_REG (op1
)));
1312 if (GET_CODE (op2
) == CONST_VECTOR
)
1314 op2
= gen_lowpart (GET_MODE (dst
), op2
);
1315 op2
= force_reg (GET_MODE (dst
), op2
);
1320 op2
= SUBREG_REG (operands
[2]);
1321 if (!vector_operand (op2
, GET_MODE (dst
)))
1322 op2
= force_reg (GET_MODE (dst
), op2
);
1324 op1
= SUBREG_REG (op1
);
1325 if (!vector_operand (op1
, GET_MODE (dst
)))
1326 op1
= force_reg (GET_MODE (dst
), op1
);
1327 emit_insn (gen_rtx_SET (dst
,
1328 gen_rtx_fmt_ee (code
, GET_MODE (dst
),
1330 emit_move_insn (operands
[0], gen_lowpart (mode
, dst
));
1336 if (!vector_operand (operands
[1], mode
))
1337 operands
[1] = force_reg (mode
, operands
[1]);
1338 if (!vector_operand (operands
[2], mode
))
1339 operands
[2] = force_reg (mode
, operands
[2]);
1340 ix86_fixup_binary_operands_no_copy (code
, mode
, operands
);
1341 emit_insn (gen_rtx_SET (operands
[0],
1342 gen_rtx_fmt_ee (code
, mode
, operands
[1],
/* Return TRUE or FALSE depending on whether the binary operator meets the
   appropriate constraints.  */
1350 ix86_binary_operator_ok (enum rtx_code code
, machine_mode mode
,
1353 rtx dst
= operands
[0];
1354 rtx src1
= operands
[1];
1355 rtx src2
= operands
[2];
1357 /* Both source operands cannot be in memory. */
1358 if ((MEM_P (src1
) || bcst_mem_operand (src1
, mode
))
1359 && (MEM_P (src2
) || bcst_mem_operand (src2
, mode
)))
1362 /* Canonicalize operand order for commutative operators. */
1363 if (ix86_swap_binary_operands_p (code
, mode
, operands
))
1364 std::swap (src1
, src2
);
1366 /* If the destination is memory, we must have a matching source operand. */
1367 if (MEM_P (dst
) && !rtx_equal_p (dst
, src1
))
1370 /* Source 1 cannot be a constant. */
1371 if (CONSTANT_P (src1
))
1374 /* Source 1 cannot be a non-matching memory. */
1375 if (MEM_P (src1
) && !rtx_equal_p (dst
, src1
))
1376 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1380 || (TARGET_64BIT
&& mode
== DImode
))
1381 && satisfies_constraint_L (src2
));
/* Attempt to expand a unary operator.  Make the expansion closer to the
   actual machine than just general_operand, which will allow 2 separate
   memory references (one output, one input) in a single insn.  */
1391 ix86_expand_unary_operator (enum rtx_code code
, machine_mode mode
,
1394 bool matching_memory
= false;
1395 rtx src
, dst
, op
, clob
;
1400 /* If the destination is memory, and we do not have matching source
1401 operands, do things in registers. */
1404 if (rtx_equal_p (dst
, src
))
1405 matching_memory
= true;
1407 dst
= gen_reg_rtx (mode
);
1410 /* When source operand is memory, destination must match. */
1411 if (MEM_P (src
) && !matching_memory
)
1412 src
= force_reg (mode
, src
);
1414 /* Emit the instruction. */
1416 op
= gen_rtx_SET (dst
, gen_rtx_fmt_e (code
, mode
, src
));
1422 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1423 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
1426 /* Fix up the destination if needed. */
1427 if (dst
!= operands
[0])
1428 emit_move_insn (operands
[0], dst
);
/* Predict the just-emitted jump instruction to be taken with probability PROB.  */
1434 predict_jump (int prob
)
1436 rtx_insn
*insn
= get_last_insn ();
1437 gcc_assert (JUMP_P (insn
));
1438 add_reg_br_prob_note (insn
, profile_probability::from_reg_br_prob_base (prob
));
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  */
1445 ix86_split_idivmod (machine_mode mode
, rtx operands
[],
1448 rtx_code_label
*end_label
, *qimode_label
;
1451 rtx scratch
, tmp0
, tmp1
, tmp2
;
1452 rtx (*gen_divmod4_1
) (rtx
, rtx
, rtx
, rtx
);
1454 operands
[2] = force_reg (mode
, operands
[2]);
1455 operands
[3] = force_reg (mode
, operands
[3]);
1460 if (GET_MODE (operands
[0]) == SImode
)
1462 if (GET_MODE (operands
[1]) == SImode
)
1463 gen_divmod4_1
= unsigned_p
? gen_udivmodsi4_1
: gen_divmodsi4_1
;
1466 = unsigned_p
? gen_udivmodsi4_zext_2
: gen_divmodsi4_zext_2
;
1470 = unsigned_p
? gen_udivmodsi4_zext_1
: gen_divmodsi4_zext_1
;
1474 gen_divmod4_1
= unsigned_p
? gen_udivmoddi4_1
: gen_divmoddi4_1
;
1481 end_label
= gen_label_rtx ();
1482 qimode_label
= gen_label_rtx ();
1484 scratch
= gen_reg_rtx (mode
);
1486 /* Use 8bit unsigned divimod if dividend and divisor are within
1487 the range [0-255]. */
1488 emit_move_insn (scratch
, operands
[2]);
1489 scratch
= expand_simple_binop (mode
, IOR
, scratch
, operands
[3],
1490 scratch
, 1, OPTAB_DIRECT
);
1491 emit_insn (gen_test_ccno_1 (mode
, scratch
, GEN_INT (-0x100)));
1492 tmp0
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
1493 tmp0
= gen_rtx_EQ (VOIDmode
, tmp0
, const0_rtx
);
1494 tmp0
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp0
,
1495 gen_rtx_LABEL_REF (VOIDmode
, qimode_label
),
1497 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp0
));
1498 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
1499 JUMP_LABEL (insn
) = qimode_label
;
1501 /* Generate original signed/unsigned divimod. */
1502 emit_insn (gen_divmod4_1 (operands
[0], operands
[1],
1503 operands
[2], operands
[3]));
1505 /* Branch to the end. */
1506 emit_jump_insn (gen_jump (end_label
));
1509 /* Generate 8bit unsigned divide. */
1510 emit_label (qimode_label
);
1511 /* Don't use operands[0] for result of 8bit divide since not all
1512 registers support QImode ZERO_EXTRACT. */
1513 tmp0
= lowpart_subreg (HImode
, scratch
, mode
);
1514 tmp1
= lowpart_subreg (HImode
, operands
[2], mode
);
1515 tmp2
= lowpart_subreg (QImode
, operands
[3], mode
);
1516 emit_insn (gen_udivmodhiqi3 (tmp0
, tmp1
, tmp2
));
1520 div
= gen_rtx_UDIV (mode
, operands
[2], operands
[3]);
1521 mod
= gen_rtx_UMOD (mode
, operands
[2], operands
[3]);
1525 div
= gen_rtx_DIV (mode
, operands
[2], operands
[3]);
1526 mod
= gen_rtx_MOD (mode
, operands
[2], operands
[3]);
1530 if (GET_MODE (operands
[0]) != SImode
)
1531 div
= gen_rtx_ZERO_EXTEND (DImode
, div
);
1532 if (GET_MODE (operands
[1]) != SImode
)
1533 mod
= gen_rtx_ZERO_EXTEND (DImode
, mod
);
1536 /* Extract remainder from AH. */
1537 scratch
= gen_lowpart (GET_MODE (operands
[1]), scratch
);
1538 tmp1
= gen_rtx_ZERO_EXTRACT (GET_MODE (operands
[1]), scratch
,
1539 GEN_INT (8), GEN_INT (8));
1540 insn
= emit_move_insn (operands
[1], tmp1
);
1541 set_unique_reg_note (insn
, REG_EQUAL
, mod
);
1543 /* Zero extend quotient from AL. */
1544 tmp1
= gen_lowpart (QImode
, tmp0
);
1545 insn
= emit_insn (gen_extend_insn
1547 GET_MODE (operands
[0]), QImode
, 1));
1548 set_unique_reg_note (insn
, REG_EQUAL
, div
);
1550 emit_label (end_label
);
/* Emit x86 binary operand CODE in mode MODE, where the first operand
   matches destination.  RTX includes clobber of FLAGS_REG.  */
1557 ix86_emit_binop (enum rtx_code code
, machine_mode mode
,
1562 op
= gen_rtx_SET (dst
, gen_rtx_fmt_ee (code
, mode
, dst
, src
));
1563 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
1565 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, op
, clob
)));
/* Return true if regno1 def is nearest to the insn.  */
1571 find_nearest_reg_def (rtx_insn
*insn
, int regno1
, int regno2
)
1573 rtx_insn
*prev
= insn
;
1574 rtx_insn
*start
= BB_HEAD (BLOCK_FOR_INSN (insn
));
1578 while (prev
&& prev
!= start
)
1580 if (!INSN_P (prev
) || !NONDEBUG_INSN_P (prev
))
1582 prev
= PREV_INSN (prev
);
1585 if (insn_defines_reg (regno1
, INVALID_REGNUM
, prev
))
1587 else if (insn_defines_reg (regno2
, INVALID_REGNUM
, prev
))
1589 prev
= PREV_INSN (prev
);
1592 /* None of the regs is defined in the bb. */
1596 /* INSN_UID of the last insn emitted by zero store peephole2s. */
1597 int ix86_last_zero_store_uid
;
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea insn position.  */
1605 ix86_split_lea_for_addr (rtx_insn
*insn
, rtx operands
[], machine_mode mode
)
1607 unsigned int regno0
, regno1
, regno2
;
1608 struct ix86_address parts
;
1612 ok
= ix86_decompose_address (operands
[1], &parts
);
1615 target
= gen_lowpart (mode
, operands
[0]);
1617 regno0
= true_regnum (target
);
1618 regno1
= INVALID_REGNUM
;
1619 regno2
= INVALID_REGNUM
;
1623 parts
.base
= gen_lowpart (mode
, parts
.base
);
1624 regno1
= true_regnum (parts
.base
);
1629 parts
.index
= gen_lowpart (mode
, parts
.index
);
1630 regno2
= true_regnum (parts
.index
);
1634 parts
.disp
= gen_lowpart (mode
, parts
.disp
);
1636 if (parts
.scale
> 1)
1638 /* Case r1 = r1 + ... */
1639 if (regno1
== regno0
)
1641 /* If we have a case r1 = r1 + C * r2 then we
1642 should use multiplication which is very
1643 expensive. Assume cost model is wrong if we
1644 have such case here. */
1645 gcc_assert (regno2
!= regno0
);
1647 for (adds
= parts
.scale
; adds
> 0; adds
--)
1648 ix86_emit_binop (PLUS
, mode
, target
, parts
.index
);
1652 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1653 if (regno0
!= regno2
)
1654 emit_insn (gen_rtx_SET (target
, parts
.index
));
1656 /* Use shift for scaling, but emit it as MULT instead
1657 to avoid it being immediately peephole2 optimized back
1659 ix86_emit_binop (MULT
, mode
, target
, GEN_INT (parts
.scale
));
1662 ix86_emit_binop (PLUS
, mode
, target
, parts
.base
);
1664 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1665 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1668 else if (!parts
.base
&& !parts
.index
)
1670 gcc_assert(parts
.disp
);
1671 emit_insn (gen_rtx_SET (target
, parts
.disp
));
1677 if (regno0
!= regno2
)
1678 emit_insn (gen_rtx_SET (target
, parts
.index
));
1680 else if (!parts
.index
)
1682 if (regno0
!= regno1
)
1683 emit_insn (gen_rtx_SET (target
, parts
.base
));
1687 if (regno0
== regno1
)
1689 else if (regno0
== regno2
)
1695 /* Find better operand for SET instruction, depending
1696 on which definition is farther from the insn. */
1697 if (find_nearest_reg_def (insn
, regno1
, regno2
))
1698 tmp
= parts
.index
, tmp1
= parts
.base
;
1700 tmp
= parts
.base
, tmp1
= parts
.index
;
1702 emit_insn (gen_rtx_SET (target
, tmp
));
1704 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1705 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1707 ix86_emit_binop (PLUS
, mode
, target
, tmp1
);
1711 ix86_emit_binop (PLUS
, mode
, target
, tmp
);
1714 if (parts
.disp
&& parts
.disp
!= const0_rtx
)
1715 ix86_emit_binop (PLUS
, mode
, target
, parts
.disp
);
1719 /* Post-reload splitter for converting an SF or DFmode value in an
1720 SSE register into an unsigned SImode. */
1723 ix86_split_convert_uns_si_sse (rtx operands
[])
1725 machine_mode vecmode
;
1726 rtx value
, large
, zero_or_two31
, input
, two31
, x
;
1728 large
= operands
[1];
1729 zero_or_two31
= operands
[2];
1730 input
= operands
[3];
1731 two31
= operands
[4];
1732 vecmode
= GET_MODE (large
);
1733 value
= gen_rtx_REG (vecmode
, REGNO (operands
[0]));
1735 /* Load up the value into the low element. We must ensure that the other
1736 elements are valid floats -- zero is the easiest such value. */
1739 if (vecmode
== V4SFmode
)
1740 emit_insn (gen_vec_setv4sf_0 (value
, CONST0_RTX (V4SFmode
), input
));
1742 emit_insn (gen_sse2_loadlpd (value
, CONST0_RTX (V2DFmode
), input
));
1746 input
= gen_rtx_REG (vecmode
, REGNO (input
));
1747 emit_move_insn (value
, CONST0_RTX (vecmode
));
1748 if (vecmode
== V4SFmode
)
1749 emit_insn (gen_sse_movss (value
, value
, input
));
1751 emit_insn (gen_sse2_movsd (value
, value
, input
));
1754 emit_move_insn (large
, two31
);
1755 emit_move_insn (zero_or_two31
, MEM_P (two31
) ? large
: two31
);
1757 x
= gen_rtx_fmt_ee (LE
, vecmode
, large
, value
);
1758 emit_insn (gen_rtx_SET (large
, x
));
1760 x
= gen_rtx_AND (vecmode
, zero_or_two31
, large
);
1761 emit_insn (gen_rtx_SET (zero_or_two31
, x
));
1763 x
= gen_rtx_MINUS (vecmode
, value
, zero_or_two31
);
1764 emit_insn (gen_rtx_SET (value
, x
));
1766 large
= gen_rtx_REG (V4SImode
, REGNO (large
));
1767 emit_insn (gen_ashlv4si3 (large
, large
, GEN_INT (31)));
1769 x
= gen_rtx_REG (V4SImode
, REGNO (value
));
1770 if (vecmode
== V4SFmode
)
1771 emit_insn (gen_fix_truncv4sfv4si2 (x
, value
));
1773 emit_insn (gen_sse2_cvttpd2dq (x
, value
));
1776 emit_insn (gen_xorv4si3 (value
, value
, large
));
1779 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok
,
1780 machine_mode mode
, rtx target
,
1781 rtx var
, int one_var
);
1783 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1784 Expects the 64-bit DImode to be supplied in a pair of integral
1785 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1786 -mfpmath=sse, !optimize_size only. */
1789 ix86_expand_convert_uns_didf_sse (rtx target
, rtx input
)
1791 REAL_VALUE_TYPE bias_lo_rvt
, bias_hi_rvt
;
1792 rtx int_xmm
, fp_xmm
;
1793 rtx biases
, exponents
;
1796 int_xmm
= gen_reg_rtx (V4SImode
);
1797 if (TARGET_INTER_UNIT_MOVES_TO_VEC
)
1798 emit_insn (gen_movdi_to_sse (int_xmm
, input
));
1799 else if (TARGET_SSE_SPLIT_REGS
)
1801 emit_clobber (int_xmm
);
1802 emit_move_insn (gen_lowpart (DImode
, int_xmm
), input
);
1806 x
= gen_reg_rtx (V2DImode
);
1807 ix86_expand_vector_init_one_nonzero (false, V2DImode
, x
, input
, 0);
1808 emit_move_insn (int_xmm
, gen_lowpart (V4SImode
, x
));
1811 x
= gen_rtx_CONST_VECTOR (V4SImode
,
1812 gen_rtvec (4, GEN_INT (0x43300000UL
),
1813 GEN_INT (0x45300000UL
),
1814 const0_rtx
, const0_rtx
));
1815 exponents
= validize_mem (force_const_mem (V4SImode
, x
));
1817 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1818 emit_insn (gen_vec_interleave_lowv4si (int_xmm
, int_xmm
, exponents
));
1820 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1821 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1822 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1823 (0x1.0p84 + double(fp_value_hi_xmm)).
1824 Note these exponents differ by 32. */
1826 fp_xmm
= copy_to_mode_reg (V2DFmode
, gen_lowpart (V2DFmode
, int_xmm
));
1828 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1829 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1830 real_ldexp (&bias_lo_rvt
, &dconst1
, 52);
1831 real_ldexp (&bias_hi_rvt
, &dconst1
, 84);
1832 biases
= const_double_from_real_value (bias_lo_rvt
, DFmode
);
1833 x
= const_double_from_real_value (bias_hi_rvt
, DFmode
);
1834 biases
= gen_rtx_CONST_VECTOR (V2DFmode
, gen_rtvec (2, biases
, x
));
1835 biases
= validize_mem (force_const_mem (V2DFmode
, biases
));
1836 emit_insn (gen_subv2df3 (fp_xmm
, fp_xmm
, biases
));
1838 /* Add the upper and lower DFmode values together. */
1840 emit_insn (gen_sse3_haddv2df3 (fp_xmm
, fp_xmm
, fp_xmm
));
1843 x
= copy_to_mode_reg (V2DFmode
, fp_xmm
);
1844 emit_insn (gen_vec_interleave_highv2df (fp_xmm
, fp_xmm
, fp_xmm
));
1845 emit_insn (gen_addv2df3 (fp_xmm
, fp_xmm
, x
));
1848 ix86_expand_vector_extract (false, target
, fp_xmm
, 0);
1851 /* Not used, but eases macroization of patterns. */
1853 ix86_expand_convert_uns_sixf_sse (rtx
, rtx
)
1858 static rtx
ix86_expand_sse_fabs (rtx op0
, rtx
*smask
);
1860 /* Convert an unsigned SImode value into a DFmode. Only currently used
1861 for SSE, but applicable anywhere. */
1864 ix86_expand_convert_uns_sidf_sse (rtx target
, rtx input
)
1866 REAL_VALUE_TYPE TWO31r
;
1869 x
= expand_simple_binop (SImode
, PLUS
, input
, GEN_INT (-2147483647 - 1),
1870 NULL
, 1, OPTAB_DIRECT
);
1872 fp
= gen_reg_rtx (DFmode
);
1873 emit_insn (gen_floatsidf2 (fp
, x
));
1875 real_ldexp (&TWO31r
, &dconst1
, 31);
1876 x
= const_double_from_real_value (TWO31r
, DFmode
);
1878 x
= expand_simple_binop (DFmode
, PLUS
, fp
, x
, target
, 0, OPTAB_DIRECT
);
1880 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1881 if (HONOR_SIGNED_ZEROS (DFmode
) && flag_rounding_math
)
1882 x
= ix86_expand_sse_fabs (x
, NULL
);
1885 emit_move_insn (target
, x
);
1888 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1889 32-bit mode; otherwise we have a direct convert instruction. */
1892 ix86_expand_convert_sign_didf_sse (rtx target
, rtx input
)
1894 REAL_VALUE_TYPE TWO32r
;
1895 rtx fp_lo
, fp_hi
, x
;
1897 fp_lo
= gen_reg_rtx (DFmode
);
1898 fp_hi
= gen_reg_rtx (DFmode
);
1900 emit_insn (gen_floatsidf2 (fp_hi
, gen_highpart (SImode
, input
)));
1902 real_ldexp (&TWO32r
, &dconst1
, 32);
1903 x
= const_double_from_real_value (TWO32r
, DFmode
);
1904 fp_hi
= expand_simple_binop (DFmode
, MULT
, fp_hi
, x
, fp_hi
, 0, OPTAB_DIRECT
);
1906 ix86_expand_convert_uns_sidf_sse (fp_lo
, gen_lowpart (SImode
, input
));
1908 x
= expand_simple_binop (DFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1911 emit_move_insn (target
, x
);
1914 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1915 For x86_32, -mfpmath=sse, !optimize_size only. */
1917 ix86_expand_convert_uns_sisf_sse (rtx target
, rtx input
)
1919 REAL_VALUE_TYPE ONE16r
;
1920 rtx fp_hi
, fp_lo
, int_hi
, int_lo
, x
;
1922 real_ldexp (&ONE16r
, &dconst1
, 16);
1923 x
= const_double_from_real_value (ONE16r
, SFmode
);
1924 int_lo
= expand_simple_binop (SImode
, AND
, input
, GEN_INT(0xffff),
1925 NULL
, 0, OPTAB_DIRECT
);
1926 int_hi
= expand_simple_binop (SImode
, LSHIFTRT
, input
, GEN_INT(16),
1927 NULL
, 0, OPTAB_DIRECT
);
1928 fp_hi
= gen_reg_rtx (SFmode
);
1929 fp_lo
= gen_reg_rtx (SFmode
);
1930 emit_insn (gen_floatsisf2 (fp_hi
, int_hi
));
1931 emit_insn (gen_floatsisf2 (fp_lo
, int_lo
));
1934 x
= validize_mem (force_const_mem (SFmode
, x
));
1935 fp_hi
= gen_rtx_FMA (SFmode
, fp_hi
, x
, fp_lo
);
1936 emit_move_insn (target
, fp_hi
);
1940 fp_hi
= expand_simple_binop (SFmode
, MULT
, fp_hi
, x
, fp_hi
,
1942 fp_hi
= expand_simple_binop (SFmode
, PLUS
, fp_hi
, fp_lo
, target
,
1944 if (!rtx_equal_p (target
, fp_hi
))
1945 emit_move_insn (target
, fp_hi
);
1949 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1950 a vector of unsigned ints VAL to vector of floats TARGET. */
1953 ix86_expand_vector_convert_uns_vsivsf (rtx target
, rtx val
)
1956 REAL_VALUE_TYPE TWO16r
;
1957 machine_mode intmode
= GET_MODE (val
);
1958 machine_mode fltmode
= GET_MODE (target
);
1959 rtx (*cvt
) (rtx
, rtx
);
1961 if (intmode
== V4SImode
)
1962 cvt
= gen_floatv4siv4sf2
;
1964 cvt
= gen_floatv8siv8sf2
;
1965 tmp
[0] = ix86_build_const_vector (intmode
, 1, GEN_INT (0xffff));
1966 tmp
[0] = force_reg (intmode
, tmp
[0]);
1967 tmp
[1] = expand_simple_binop (intmode
, AND
, val
, tmp
[0], NULL_RTX
, 1,
1969 tmp
[2] = expand_simple_binop (intmode
, LSHIFTRT
, val
, GEN_INT (16),
1970 NULL_RTX
, 1, OPTAB_DIRECT
);
1971 tmp
[3] = gen_reg_rtx (fltmode
);
1972 emit_insn (cvt (tmp
[3], tmp
[1]));
1973 tmp
[4] = gen_reg_rtx (fltmode
);
1974 emit_insn (cvt (tmp
[4], tmp
[2]));
1975 real_ldexp (&TWO16r
, &dconst1
, 16);
1976 tmp
[5] = const_double_from_real_value (TWO16r
, SFmode
);
1977 tmp
[5] = force_reg (fltmode
, ix86_build_const_vector (fltmode
, 1, tmp
[5]));
1980 tmp
[6] = gen_rtx_FMA (fltmode
, tmp
[4], tmp
[5], tmp
[3]);
1981 emit_move_insn (target
, tmp
[6]);
1985 tmp
[6] = expand_simple_binop (fltmode
, MULT
, tmp
[4], tmp
[5],
1986 NULL_RTX
, 1, OPTAB_DIRECT
);
1987 tmp
[7] = expand_simple_binop (fltmode
, PLUS
, tmp
[3], tmp
[6],
1988 target
, 1, OPTAB_DIRECT
);
1989 if (tmp
[7] != target
)
1990 emit_move_insn (target
, tmp
[7]);
1994 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1995 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1996 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1997 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
2000 ix86_expand_adjust_ufix_to_sfix_si (rtx val
, rtx
*xorp
)
2002 REAL_VALUE_TYPE TWO31r
;
2004 machine_mode mode
= GET_MODE (val
);
2005 machine_mode scalarmode
= GET_MODE_INNER (mode
);
2006 machine_mode intmode
= GET_MODE_SIZE (mode
) == 32 ? V8SImode
: V4SImode
;
2007 rtx (*cmp
) (rtx
, rtx
, rtx
, rtx
);
2010 for (i
= 0; i
< 3; i
++)
2011 tmp
[i
] = gen_reg_rtx (mode
);
2012 real_ldexp (&TWO31r
, &dconst1
, 31);
2013 two31r
= const_double_from_real_value (TWO31r
, scalarmode
);
2014 two31r
= ix86_build_const_vector (mode
, 1, two31r
);
2015 two31r
= force_reg (mode
, two31r
);
2018 case E_V8SFmode
: cmp
= gen_avx_maskcmpv8sf3
; break;
2019 case E_V4SFmode
: cmp
= gen_sse_maskcmpv4sf3
; break;
2020 case E_V4DFmode
: cmp
= gen_avx_maskcmpv4df3
; break;
2021 case E_V2DFmode
: cmp
= gen_sse2_maskcmpv2df3
; break;
2022 default: gcc_unreachable ();
2024 tmp
[3] = gen_rtx_LE (mode
, two31r
, val
);
2025 emit_insn (cmp (tmp
[0], two31r
, val
, tmp
[3]));
2026 tmp
[1] = expand_simple_binop (mode
, AND
, tmp
[0], two31r
, tmp
[1],
2028 if (intmode
== V4SImode
|| TARGET_AVX2
)
2029 *xorp
= expand_simple_binop (intmode
, ASHIFT
,
2030 gen_lowpart (intmode
, tmp
[0]),
2031 GEN_INT (31), NULL_RTX
, 0,
2035 rtx two31
= gen_int_mode (HOST_WIDE_INT_1U
<< 31, SImode
);
2036 two31
= ix86_build_const_vector (intmode
, 1, two31
);
2037 *xorp
= expand_simple_binop (intmode
, AND
,
2038 gen_lowpart (intmode
, tmp
[0]),
2042 return expand_simple_binop (mode
, MINUS
, val
, tmp
[1], tmp
[2],
2046 /* Generate code for floating point ABS or NEG. */
2049 ix86_expand_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
2053 bool use_sse
= false;
2054 bool vector_mode
= VECTOR_MODE_P (mode
);
2055 machine_mode vmode
= mode
;
2058 if (vector_mode
|| mode
== TFmode
|| mode
== HFmode
)
2064 else if (TARGET_SSE_MATH
)
2066 use_sse
= SSE_FLOAT_MODE_P (mode
);
2069 else if (mode
== DFmode
)
2076 set
= gen_rtx_fmt_e (code
, mode
, src
);
2077 set
= gen_rtx_SET (dst
, set
);
2081 rtx mask
, use
, clob
;
2083 /* NEG and ABS performed with SSE use bitwise mask operations.
2084 Create the appropriate mask now. */
2085 mask
= ix86_build_signbit_mask (vmode
, vector_mode
, code
== ABS
);
2086 use
= gen_rtx_USE (VOIDmode
, mask
);
2087 if (vector_mode
|| mode
== TFmode
)
2088 par
= gen_rtvec (2, set
, use
);
2091 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
2092 par
= gen_rtvec (3, set
, use
, clob
);
2099 /* Changing of sign for FP values is doable using integer unit too. */
2100 clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
2101 par
= gen_rtvec (2, set
, clob
);
2104 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
2107 /* Deconstruct a floating point ABS or NEG operation
2108 with integer registers into integer operations. */
2111 ix86_split_fp_absneg_operator (enum rtx_code code
, machine_mode mode
,
2114 enum rtx_code absneg_op
;
2117 gcc_assert (operands_match_p (operands
[0], operands
[1]));
2122 dst
= gen_lowpart (SImode
, operands
[0]);
2126 set
= gen_int_mode (0x7fffffff, SImode
);
2131 set
= gen_int_mode (0x80000000, SImode
);
2134 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
2140 dst
= gen_lowpart (DImode
, operands
[0]);
2141 dst
= gen_rtx_ZERO_EXTRACT (DImode
, dst
, const1_rtx
, GEN_INT (63));
2146 set
= gen_rtx_NOT (DImode
, dst
);
2150 dst
= gen_highpart (SImode
, operands
[0]);
2154 set
= gen_int_mode (0x7fffffff, SImode
);
2159 set
= gen_int_mode (0x80000000, SImode
);
2162 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
2167 dst
= gen_rtx_REG (SImode
,
2168 REGNO (operands
[0]) + (TARGET_64BIT
? 1 : 2));
2171 set
= GEN_INT (0x7fff);
2176 set
= GEN_INT (0x8000);
2179 set
= gen_rtx_fmt_ee (absneg_op
, SImode
, dst
, set
);
2186 set
= gen_rtx_SET (dst
, set
);
2188 rtx clob
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (CCmode
, FLAGS_REG
));
2189 rtvec par
= gen_rtvec (2, set
, clob
);
2191 emit_insn (gen_rtx_PARALLEL (VOIDmode
, par
));
2194 /* Expand a copysign operation. Special case operand 0 being a constant. */
2197 ix86_expand_copysign (rtx operands
[])
2199 machine_mode mode
, vmode
;
2200 rtx dest
, vdest
, op0
, op1
, mask
, op2
, op3
;
2202 mode
= GET_MODE (operands
[0]);
2206 else if (mode
== SFmode
)
2208 else if (mode
== DFmode
)
2210 else if (mode
== TFmode
)
2215 if (rtx_equal_p (operands
[1], operands
[2]))
2217 emit_move_insn (operands
[0], operands
[1]);
2222 vdest
= lowpart_subreg (vmode
, dest
, mode
);
2223 if (vdest
== NULL_RTX
)
2224 vdest
= gen_reg_rtx (vmode
);
2227 op1
= lowpart_subreg (vmode
, force_reg (mode
, operands
[2]), mode
);
2228 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
2230 if (CONST_DOUBLE_P (operands
[1]))
2232 op0
= simplify_unary_operation (ABS
, mode
, operands
[1], mode
);
2233 /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */
2234 if (op0
== CONST0_RTX (mode
))
2236 emit_move_insn (vdest
, gen_rtx_AND (vmode
, mask
, op1
));
2238 emit_move_insn (dest
, lowpart_subreg (mode
, vdest
, vmode
));
2242 if (GET_MODE_SIZE (mode
) < 16)
2243 op0
= ix86_build_const_vector (vmode
, false, op0
);
2244 op0
= force_reg (vmode
, op0
);
2247 op0
= lowpart_subreg (vmode
, force_reg (mode
, operands
[1]), mode
);
2249 op2
= gen_reg_rtx (vmode
);
2250 op3
= gen_reg_rtx (vmode
);
2251 emit_move_insn (op2
, gen_rtx_AND (vmode
,
2252 gen_rtx_NOT (vmode
, mask
),
2254 emit_move_insn (op3
, gen_rtx_AND (vmode
, mask
, op1
));
2255 emit_move_insn (vdest
, gen_rtx_IOR (vmode
, op2
, op3
));
2257 emit_move_insn (dest
, lowpart_subreg (mode
, vdest
, vmode
));
2260 /* Expand an xorsign operation. */
2263 ix86_expand_xorsign (rtx operands
[])
2265 machine_mode mode
, vmode
;
2266 rtx dest
, vdest
, op0
, op1
, mask
, x
, temp
;
2272 mode
= GET_MODE (dest
);
2276 else if (mode
== SFmode
)
2278 else if (mode
== DFmode
)
2283 temp
= gen_reg_rtx (vmode
);
2284 mask
= ix86_build_signbit_mask (vmode
, 0, 0);
2286 op1
= lowpart_subreg (vmode
, force_reg (mode
, op1
), mode
);
2287 x
= gen_rtx_AND (vmode
, op1
, mask
);
2288 emit_insn (gen_rtx_SET (temp
, x
));
2290 op0
= lowpart_subreg (vmode
, force_reg (mode
, op0
), mode
);
2291 x
= gen_rtx_XOR (vmode
, temp
, op0
);
2293 vdest
= lowpart_subreg (vmode
, dest
, mode
);
2294 if (vdest
== NULL_RTX
)
2295 vdest
= gen_reg_rtx (vmode
);
2298 emit_insn (gen_rtx_SET (vdest
, x
));
2301 emit_move_insn (dest
, lowpart_subreg (mode
, vdest
, vmode
));
2304 static rtx
ix86_expand_compare (enum rtx_code code
, rtx op0
, rtx op1
);
2307 ix86_expand_branch (enum rtx_code code
, rtx op0
, rtx op1
, rtx label
)
2309 machine_mode mode
= GET_MODE (op0
);
2312 /* Handle special case - vector comparsion with boolean result, transform
2313 it using ptest instruction. */
2314 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
2317 rtx flag
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
2318 machine_mode p_mode
= GET_MODE_SIZE (mode
) == 32 ? V4DImode
: V2DImode
;
2320 gcc_assert (code
== EQ
|| code
== NE
);
2324 op0
= lowpart_subreg (p_mode
, force_reg (mode
, op0
), mode
);
2325 op1
= lowpart_subreg (p_mode
, force_reg (mode
, op1
), mode
);
2328 /* Generate XOR since we can't check that one operand is zero vector. */
2329 tmp
= gen_reg_rtx (mode
);
2330 emit_insn (gen_rtx_SET (tmp
, gen_rtx_XOR (mode
, op0
, op1
)));
2331 tmp
= gen_lowpart (p_mode
, tmp
);
2332 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode
, FLAGS_REG
),
2333 gen_rtx_UNSPEC (CCmode
,
2334 gen_rtvec (2, tmp
, tmp
),
2336 tmp
= gen_rtx_fmt_ee (code
, VOIDmode
, flag
, const0_rtx
);
2337 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2338 gen_rtx_LABEL_REF (VOIDmode
, label
),
2340 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
2354 tmp
= ix86_expand_compare (code
, op0
, op1
);
2355 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
2356 gen_rtx_LABEL_REF (VOIDmode
, label
),
2358 emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));

      /* DI and TI mode equality/inequality comparisons may be performed
	 on SSE registers.  Avoid splitting them, except when optimizing
      if ((code == EQ || code == NE)
	  && !optimize_insn_for_size_p ())

      /* Expand DImode branch into multiple compare+branch.  */
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	    std::swap (op0, op1);
	    code = swap_condition (code);

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* If we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */

	if (CONST_INT_P (hi[1]))
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		  ix86_expand_branch (code, hi[0], hi[1], label);
	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		  ix86_expand_branch (code, hi[0], hi[1], label);

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);

	  case LT: case LTU: case GE: case GEU:
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);

	/* Otherwise, we need two or three jumps.  */

	label2 = gen_label_rtx ();

	code2 = swap_condition (code);
	code3 = unsigned_condition (code);

	  case LT: case GT: case LTU: case GTU:
	  case LE:  code1 = LT;  code2 = GT;  break;
	  case GE:  code1 = GT;  code2 = LT;  break;
	  case LEU: code1 = LTU; code2 = GTU; break;
	  case GEU: code1 = GTU; code2 = LTU; break;

	  case EQ:  code1 = UNKNOWN; code2 = NE;      break;
	  case NE:  code2 = UNKNOWN;                  break;

	 * if (hi(a) < hi(b)) goto true;
	 * if (hi(a) > hi(b)) goto false;
	 * if (lo(a) < lo(b)) goto true;

	if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	ix86_expand_branch (code3, lo[0], lo[1], label);

	if (code2 != UNKNOWN)
	  emit_label (label2);

      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
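
      /* Rough illustration of the carry-flag path above for a 32-bit
	 DImode "a < b" branch (pseudo, not exact operand syntax):
	     cmp  lo(a), lo(b)     ; low words, sets CF
	     sbb  hi(a), hi(b)     ; double-word subtract via the borrow
	     jl / jb  label        ; signed uses SF/OF, unsigned uses CF
	 Only the Overflow, Sign and Carry flags are meaningful after the
	 sbb, which is why Zero-flag conditions are rewritten first by
	 swapping the operands.  */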

/* Figure out whether to use unordered fp comparisons.  */

ix86_unordered_fp_compare (enum rtx_code code)
  if (!TARGET_IEEE_FP)

/* Return a comparison we can do and that it is equivalent to
   swap_condition (code) apart possibly from orderedness.
   But, never change orderedness if TARGET_IEEE_FP, returning
   UNKNOWN in that case if necessary.  */

static enum rtx_code
ix86_fp_swap_condition (enum rtx_code code)
    case GT:   /* GTU - CF=0 & ZF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
    case GE:   /* GEU - CF=0 */
      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
    case UNLT: /* LTU - CF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GT;
    case UNLE: /* LEU - CF=1 | ZF=1 */
      return TARGET_IEEE_FP ? UNKNOWN : GE;
      return swap_condition (code);

/* Return cost of comparison CODE using the best strategy for performance.
   All following functions do use number of instructions as a cost metrics.
   In future this should be tweaked to compute bytes for optimize_size and
   take into account performance of various instructions on various CPUs.  */

ix86_fp_comparison_cost (enum rtx_code code)
  /* The cost of code using bit-twiddling on %ah.  */
      arith_cost = TARGET_IEEE_FP ? 5 : 4;
      arith_cost = TARGET_IEEE_FP ? 6 : 4;

  switch (ix86_fp_comparison_strategy (code))
    case IX86_FPCMP_COMI:
      return arith_cost > 4 ? 3 : 2;
    case IX86_FPCMP_SAHF:
      return arith_cost > 4 ? 4 : 3;

/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);

  if (op_mode == BFmode)
      rtx op = gen_lowpart (HImode, op0);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
      op = gen_lowpart (HImode, op1);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
      return ix86_prepare_fp_compare_args (code, pop0, pop1);

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);

      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0

      if (standard_80387_constant_p (op0) == 0
	  && ! (standard_80387_constant_p (op1) == 0
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	      std::swap (op0, op1);

	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	  int tmp = standard_80387_constant_p (op1);
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	    op1 = force_reg (op_mode, op1);
	op1 = force_reg (op_mode, op1);

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
      std::swap (op0, op1);
      code = swap_condition (code);
	op0 = force_reg (op_mode, op0);

/* Generate insn patterns to do a floating point compare of OPERANDS.  */

ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    case IX86_FPCMP_COMI:
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));

    case IX86_FPCMP_SAHF:
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));

    case IX86_FPCMP_ARITH:
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */

	  if (code == GT || !TARGET_IEEE_FP)
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));

	  if (code == LT && TARGET_IEEE_FP)
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));

	  if (code == GE || !TARGET_IEEE_FP)
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));

	  if (code == LE && TARGET_IEEE_FP)
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));

	  if (code == EQ && TARGET_IEEE_FP)
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));

	  if (code == NE && TARGET_IEEE_FP)
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));

	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
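
/* Illustrative note on the magic constants above, assuming the usual x87
   status-word layout: once FNSTSW/AH places the status word in %ah, the
   condition bits map to C0 -> 0x01, C2 -> 0x04, C3 -> 0x40.  So 0x45
   selects C0|C2|C3, 0x44 selects C2|C3 and 0x05 selects C0|C2; the
   and/add/cmp/test games combine these bits into a flags pattern that the
   integer jcc/setcc/cmov users can consume directly.  */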

/* Generate insn patterns to do an integer compare of OPERANDS.  */

ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
  machine_mode cmpmode;

  /* Swap operands to emit carry flag comparison.  */
  if ((code == GTU || code == LEU)
      && nonimmediate_operand (op1, VOIDmode))
      std::swap (op0, op1);
      code = swap_condition (code);

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);

ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
      ret = ix86_expand_fp_compare (code, op0, op1);
    ret = ix86_expand_int_compare (code, op0, op1);

ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
  gcc_assert (GET_MODE (dest) == QImode);

  ret = ix86_expand_compare (code, op0, op1);
  PUT_MODE (ret, QImode);
  emit_insn (gen_rtx_SET (dest, ret));

/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2.  */

ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  rtx l0 = gen_label_rtx ();
  rtx l1 = gen_label_rtx ();
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();

      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
				  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::very_unlikely ());

  rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
			   gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
			      gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::unlikely ());
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
			      gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::even ());
  emit_move_insn (dest, constm1_rtx);
  emit_move_insn (dest, const0_rtx);
  emit_move_insn (dest, const1_rtx);
      emit_move_insn (dest, const2_rtx);

/* Expand comparison setting or clearing carry flag.  Return true when
   successful and set pop for the operation.  */

ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))

  if (SCALAR_FLOAT_MODE_P (mode))
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  std::swap (op0, op1);
	  code = swap_condition (code);

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
	code = GET_CODE (compare_op);

      if (code != LTU && code != GEU)

      emit_insn (compare_seq);

  if (!INTEGRAL_MODE_P (mode))

      /* Convert a==0 into (unsigned)a<1.  */
      if (op1 != const0_rtx)
      code = (code == EQ ? LTU : GEU);

      /* Convert a>b into b<a or a>=b-1.  */
      if (CONST_INT_P (op1))
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	  code = (code == GTU ? GEU : LTU);
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);

      /* Convert a>=0 into (unsigned)a<0x80000000.  */
      if (mode == DImode || op1 != const0_rtx)
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      if (mode == DImode || op1 != constm1_rtx)
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);

  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
      if (!can_create_pseudo_p ())
      op0 = force_reg (mode, op0);
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
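
/* Illustrative example of the rewrites above: "a == 0" becomes
   "(unsigned) a < 1", so a single "cmp $1, a" leaves the carry flag set
   iff a was zero, and the caller can consume CF directly with adc/sbb
   instead of materialising a 0/1 value through setcc first.  */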

/* Expand conditional increment or decrement using adc/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */

ix86_expand_int_addcc (rtx operands[])
  enum rtx_code code = GET_CODE (operands[1]);
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx val = const0_rtx;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))

  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
      code = ix86_fp_compare_code_to_integer (code);

	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		  (GET_CODE (compare_op)));
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
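
/* Illustrative example: a conditional increment such as
   "x = (a < b) ? x + 1 : x" is handled here without a branch or cmov;
   the comparison is expanded so that its result lands in the carry flag,
   and a single "adc $0, x" (or "sbb $0, x" for the decrement case) then
   folds that flag into the addition.  */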

ix86_expand_int_movcc (rtx operands[])
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  bool negate_cc_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);
  rtx op2 = operands[2];
  rtx op3 = operands[3];

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode

  if (GET_MODE (op0) == BFmode
      && !ix86_fp_comparison_operator (operands[1], VOIDmode))

  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();

  compare_code = GET_CODE (compare_op);

  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
     but if op1 is a constant, the latter form allows more optimizations,
     either through the last 2 ops being constant handling, or the one
     constant and one variable cases.  On the other side, for cmov the
     former might be better as we don't need to load the constant into
     another register.  */
  if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
  /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
  else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (op2)
      && CONST_INT_P (op3))
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (op2);
      HOST_WIDE_INT cf = INTVAL (op3);

	   || (TARGET_64BIT && mode == DImode))
	  && (GET_MODE (op0) == SImode
	      || (TARGET_64BIT && GET_MODE (op0) == DImode)))
	  /* Special case x != 0 ? -1 : y.  */
	  if (code == NE && op1 == const0_rtx && ct == -1)
	      negate_cc_compare_p = true;
	  else if (code == EQ && op1 == const0_rtx && cf == -1)
	      negate_cc_compare_p = true;

      /* Sign bit compares are better done using shifts than we do by using
      if (sign_bit_compare_p
	  || negate_cc_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
	  /* Detect overlap between destination and compare sources.  */

	  if (negate_cc_compare_p)
	      if (GET_MODE (op0) == DImode)
		emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
		emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
					      gen_lowpart (SImode, op0)));

	      tmp = gen_reg_rtx (mode);
		emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
		emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
	  else if (!sign_bit_compare_p)
	      compare_code = GET_CODE (compare_op);

	      flags = XEXP (compare_op, 0);

	      if (GET_MODE (flags) == CCFPmode)
		  = ix86_fp_compare_code_to_integer (compare_code);

	      /* To simplify rest of code, restrict to the GEU case.  */
	      if (compare_code == LTU)
		  compare_code = reverse_condition (compare_code);
		  code = reverse_condition (code);
		    PUT_CODE (compare_op,
			      reverse_condition_maybe_unordered
			      (GET_CODE (compare_op)));
		    PUT_CODE (compare_op,
			      reverse_condition (GET_CODE (compare_op)));

	      if (reg_overlap_mentioned_p (out, compare_op))
		tmp = gen_reg_rtx (mode);

		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
		emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
						 flags, compare_op));

	      if (code == GT || code == GE)
		code = reverse_condition (code);

	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);

	      tmp = expand_simple_binop (mode, PLUS,
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	      tmp = expand_simple_binop (mode, IOR,
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	  else if (diff == -1 && ct)
		tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (cf),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);

	       *   andl cf - ct, dest

		tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);

	      tmp = expand_simple_binop (mode, AND,
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	      tmp = expand_simple_binop (mode, PLUS,
					 copy_rtx (tmp), GEN_INT (ct),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);

	  if (!rtx_equal_p (tmp, out))
	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));

	      machine_mode cmp_mode = GET_MODE (op0);
	      enum rtx_code new_code;

	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

		  /* We may be reversing a non-trapping
		     comparison to a trapping comparison.  */
		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
		      && code != EQ && code != NE
		      && code != ORDERED && code != UNORDERED)
		    new_code = reverse_condition_maybe_unordered (code);
		new_code = ix86_reverse_condition (code, cmp_mode);
	      if (new_code != UNKNOWN)

	  compare_code = UNKNOWN;
	  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
	      && CONST_INT_P (op1))
	      if (op1 == const0_rtx
		  && (code == LT || code == GE))
		compare_code = code;
	      else if (op1 == constm1_rtx)
		  else if (code == GT)

	  /* Optimize dest = (op0 < 0) ? -1 : cf.  */
	  if (compare_code != UNKNOWN
	      && GET_MODE (op0) == GET_MODE (out)
	      && (cf == -1 || ct == -1))
	      /* If lea code below could be used, only optimize
		 if it results in a 2 insn sequence.  */

	      if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
		     || diff == 3 || diff == 5 || diff == 9)
		  || (compare_code == LT && ct == -1)
		  || (compare_code == GE && cf == -1))
		   * notl op1	(if necessary)
		      code = reverse_condition (code);

		  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

		  out = expand_simple_binop (mode, IOR,
					     out, 1, OPTAB_DIRECT);
		  if (out != operands[0])
		    emit_move_insn (operands[0], out);

	  if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
	       || diff == 3 || diff == 5 || diff == 9)
	      && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
		  || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
	       *   lea cf(dest*(ct-cf)),dest
	       *
	       * This also catches the degenerate setcc-only case.

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	      /* On x86_64 the lea instruction operates on Pmode, so we need
		 to get arithmetics done in proper mode to match.  */
		tmp = copy_rtx (out);
		  out1 = copy_rtx (out);
		  tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
		    tmp = gen_rtx_PLUS (mode, tmp, out1);

		tmp = plus_constant (mode, tmp, cf);

	      if (!rtx_equal_p (tmp, out))
		  out = force_operand (tmp, copy_rtx (out));
		  emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
	      if (!rtx_equal_p (out, operands[0]))
		emit_move_insn (operands[0], copy_rtx (out));

	   * General case:			Jumpful:
	   *   xorl dest,dest		cmpl op1, op2
	   *   cmpl op1, op2		movl ct, dest
	   *   decl dest			movl cf, dest
	   *   andl (cf-ct),dest		 1:
	   *
	   * This is reasonably steep, but branch mispredict costs are
	   * high on modern cpus, so consider failing only if optimizing

	  if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
	      && BRANCH_COST (optimize_insn_for_speed_p (),
		  machine_mode cmp_mode = GET_MODE (op0);
		  enum rtx_code new_code;

		  if (SCALAR_FLOAT_MODE_P (cmp_mode))
		      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

		      /* We may be reversing a non-trapping
			 comparison to a trapping comparison.  */
		      if (HONOR_NANS (cmp_mode) && flag_trapping_math
			  && code != EQ && code != NE
			  && code != ORDERED && code != UNORDERED)
			new_code = reverse_condition_maybe_unordered (code);
		    new_code = ix86_reverse_condition (code, cmp_mode);
		      if (compare_code != UNKNOWN && new_code != UNKNOWN)
			compare_code = reverse_condition (compare_code);

		  if (new_code != UNKNOWN)

	      if (compare_code != UNKNOWN)
		  /* notl op1	(if needed)

		     For x < 0 (resp. x <= -1) there will be no notl,
		     so if possible swap the constants to get rid of the
		     True/false will be -1/0 while code below (store flag
		     followed by decrement) is 0/-1, so the constants need
		     to be exchanged once more.  */

		  if (compare_code == GE || !cf)
		      code = reverse_condition (code);

		  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
		  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

		  out = expand_simple_binop (mode, PLUS, copy_rtx (out),
					     copy_rtx (out), 1, OPTAB_DIRECT);

	      out = expand_simple_binop (mode, AND, copy_rtx (out),
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (out), 1, OPTAB_DIRECT);
		out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
					   copy_rtx (out), 1, OPTAB_DIRECT);
	      if (!rtx_equal_p (out, operands[0]))
		emit_move_insn (operands[0], copy_rtx (out));

  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
      /* Try a few things more with specific constants and a variable.  */
      rtx var, orig_out, out, tmp;

      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)

      /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */

      if (CONST_INT_P (operands[2]))
	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
	    operands[3] = constm1_rtx, op = and_optab;
	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
	    operands[3] = const0_rtx, op = ior_optab;
      else if (CONST_INT_P (operands[3]))
	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
		operands[1] = simplify_gen_relational (LT, VOIDmode,

	      operands[2] = constm1_rtx;
	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
	    operands[2] = const0_rtx, op = ior_optab;

      orig_out = operands[0];
      tmp = gen_reg_rtx (mode);

      /* Recurse to get the constant loaded.  */
      if (!ix86_expand_int_movcc (operands))

      /* Mask in the interesting variable.  */
      out = expand_binop (mode, op, var, tmp, orig_out, 0,
      if (!rtx_equal_p (out, orig_out))
	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));

   * For comparison with above,

  if (! nonimmediate_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  if (! nonimmediate_operand (operands[3], mode))
    operands[3] = force_reg (mode, operands[3]);

  if (! register_operand (operands[2], VOIDmode)
	  || ! register_operand (operands[3], VOIDmode)))
    operands[2] = force_reg (mode, operands[2]);

      && ! register_operand (operands[3], VOIDmode))
    operands[3] = force_reg (mode, operands[3]);

  emit_insn (compare_seq);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode,
						compare_op, operands[2],

/* Detect conditional moves that exactly match min/max operational
   semantics.  Note that this is IEEE safe, as long as we don't
   interchange the operands.

   Returns FALSE if this conditional move doesn't match a MIN/MAX,
   and TRUE if the operation is successful and instructions are emitted.  */

ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
      else if (code == UNGE)
	std::swap (if_true, if_false);

  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);

  emit_insn (gen_rtx_SET (dest, tmp));
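
/* Illustrative note: operand order matters above because the SSE min/max
   instructions (minss/minps and friends) are not commutative in the
   presence of NaNs and signed zeros - they return the second source
   operand when the comparison is unordered - so "a < b ? a : b" maps onto
   them only with the operands in that exact order.  */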

/* Return true if MODE is valid for vector compare to mask register,
   Same result for conditional vector move with mask register.  */

ix86_valid_mask_cmp_mode (machine_mode mode)
  /* XOP has its own vector conditional movement.  */
  if (TARGET_XOP && !TARGET_AVX512F)

  /* HFmode only supports vcmpsh whose dest is mask register.  */
  if (TARGET_AVX512FP16 && mode == HFmode)

  /* AVX512F is needed for mask operation.  */
  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))

  /* AVX512BW is needed for vector QI/HImode,
     AVX512VL is needed for 128/256-bit vector.  */
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int vector_size = GET_MODE_SIZE (mode);
  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)

  return vector_size == 64 || TARGET_AVX512VL;

/* Return true if integer mask comparison should be used.  */

ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
		     rtx op_true, rtx op_false)
  int vector_size = GET_MODE_SIZE (mode);

  if (cmp_mode == HFmode)
  else if (vector_size < 16)
  else if (vector_size == 64)
  else if (GET_MODE_INNER (cmp_mode) == HFmode)

  /* When op_true is NULL, op_false must be NULL, or vice versa.  */
  gcc_assert (!op_true == !op_false);

  /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
     vector dest is required.  */
  if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))

  /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
  if (op_false == CONST0_RTX (mode)
      || op_true == CONST0_RTX (mode)
      || (INTEGRAL_MODE_P (mode)
	  && (op_true == CONSTM1_RTX (mode)
	      || op_false == CONSTM1_RTX (mode))))

/* Expand an SSE comparison.  Return the register with the result.  */

ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  bool (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    emit_insn (gen_rtx_SET (dest, x));

/* Emit x86 binary operand CODE in mode MODE for SSE vector
   instructions that can be performed using GP registers.  */

ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
		     rtx dst, rtx src1, rtx src2)
  tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));

/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.  */

ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))
      emit_move_insn (dest, op_true);

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);

  /* In AVX512F the result of comparison is an integer mask.  */
      && GET_MODE_CLASS (cmpmode) == MODE_INT)
      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using scalar/vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
	  if (cmpmode == E_DImode && !TARGET_64BIT)
	      x = gen_reg_rtx (cmpmode);
	      emit_insn (gen_knotdi (x, cmp));
	    x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);

	  /* Reverse op_true op_false.  */
	  std::swap (op_true, op_false);

	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
	emit_insn (gen_rtx_SET (dest,
				gen_rtx_VEC_MERGE (mode,
						   op_true, op_false, cmp)));

  if (vector_all_ones_operand (op_true, mode)
      && op_false == CONST0_RTX (mode))
      emit_move_insn (dest, cmp);
  else if (op_false == CONST0_RTX (mode))
      x = expand_simple_binop (mode, AND, cmp, op_true,
			       dest, 1, OPTAB_DIRECT);
	emit_move_insn (dest, x);
  else if (op_true == CONST0_RTX (mode))
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
  else if (vector_all_ones_operand (op_true, mode))
      x = expand_simple_binop (mode, IOR, cmp, op_false,
			       dest, 1, OPTAB_DIRECT);
	emit_move_insn (dest, x);

      op_true = force_reg (mode, op_true);

      if (GET_MODE_SIZE (mode) < 16
	  || !nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest,
			      gen_rtx_IF_THEN_ELSE (mode, cmp,
						    op_true, op_false)));

      rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
      machine_mode blend_mode = mode;

      if (GET_MODE_SIZE (mode) < 16
	  || !vector_operand (op_true, mode))
	op_true = force_reg (mode, op_true);

      op_false = force_reg (mode, op_false);

	    gen = gen_mmx_blendvps;
	    gen = gen_sse4_1_blendvps;
	    gen = gen_sse4_1_blendvpd;
	    gen = gen_sse4_1_blendvss;
	    gen = gen_sse4_1_blendvsd;
	      gen = gen_mmx_pblendvb_v8qi;
	      blend_mode = V8QImode;
	      gen = gen_mmx_pblendvb_v4qi;
	      blend_mode = V4QImode;
	    gen = gen_mmx_pblendvb_v2qi;
	      gen = gen_sse4_1_pblendvb;
	      blend_mode = V16QImode;
	    gen = gen_avx_blendvps256;
	    gen = gen_avx_blendvpd256;
	      gen = gen_avx2_pblendvb;
	      blend_mode = V32QImode;
	  gen = gen_avx512bw_blendmv64qi;
	  gen = gen_avx512bw_blendmv32hi;
	  gen = gen_avx512bw_blendmv32hf;
	  gen = gen_avx512bw_blendmv32bf;
	  gen = gen_avx512f_blendmv16si;
	  gen = gen_avx512f_blendmv8di;
	  gen = gen_avx512f_blendmv8df;
	  gen = gen_avx512f_blendmv16sf;

	  if (blend_mode == mode)
	      x = gen_reg_rtx (blend_mode);
	      op_false = gen_lowpart (blend_mode, op_false);
	      op_true = gen_lowpart (blend_mode, op_true);
	      cmp = gen_lowpart (blend_mode, cmp);

	  emit_insn (gen (x, op_false, op_true, cmp));

	    emit_move_insn (dest, gen_lowpart (mode, x));

	  t2 = expand_simple_binop (mode, AND, op_true, cmp,
				    NULL, 1, OPTAB_DIRECT);

	  t3 = gen_reg_rtx (mode);
	  x = gen_rtx_NOT (mode, cmp);
	  ix86_emit_vec_binop (AND, mode, t3, x, op_false);

	  x = expand_simple_binop (mode, IOR, t3, t2,
				   dest, 1, OPTAB_DIRECT);
	    emit_move_insn (dest, x);
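
/* Illustrative note: when no blend instruction applies, the fallback at
   the end of the function is the classic mask merge
       dest = (cmp & op_true) | (~cmp & op_false);
   which relies on the comparison producing an all-ones/all-zeros mask in
   each element.  */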

/* Swap, force into registers, or otherwise massage the two operands
   to an sse comparison with a mask result.  Thus we differ a bit from
   ix86_prepare_fp_compare_args which expects to produce a flags result.

   The DEST operand exists to help determine whether to commute
   commutative operators.  The POP0/POP1 operands are updated in place.
   The new comparison code is returned, or UNKNOWN if not implementable.  */

static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
      /* AVX supports all the needed comparisons.  */

      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */

      /* These are supported directly.  */

      /* AVX has 3 operand comparisons, no need to swap anything.  */

      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))

      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
      std::swap (*pop0, *pop1);
      code = swap_condition (code);

/* Expand a floating-point conditional move.  Return true if successful.  */

ix86_expand_fp_movcc (rtx operands[])
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (GET_MODE (op0) == BFmode
      && !ix86_fp_comparison_operator (operands[1], VOIDmode))

  if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)

      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))

      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);

  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))
      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);

  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));

/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */

ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)

/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */

ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)

/* Return immediate value to be used in UNSPEC_PCMP
   for comparison CODE in MODE.  */

ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
  if (FLOAT_MODE_P (mode))
    return ix86_fp_cmp_code_to_pcmp_immediate (code);
  return ix86_int_cmp_code_to_pcmp_immediate (code);

/* Expand AVX-512 vector comparison.  */

ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
  machine_mode mask_mode = GET_MODE (dest);
  machine_mode cmp_mode = GET_MODE (cmp_op0);
  rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));

    unspec_code = UNSPEC_UNSIGNED_PCMP;
    unspec_code = UNSPEC_PCMP;

  unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
  emit_insn (gen_rtx_SET (dest, unspec));

/* Expand fp vector comparison.  */

ix86_expand_fp_vec_cmp (rtx operands[])
  enum rtx_code code = GET_CODE (operands[1]);

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)

  switch (GET_CODE (operands[1]))
      temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				  operands[3], NULL, NULL);
      cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				 operands[3], NULL, NULL);
      temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				  operands[3], NULL, NULL);
      cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				 operands[3], NULL, NULL);

      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
      cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
			 rtx op_true, rtx op_false, bool *negate)
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      && GET_MODE_SIZE (mode) <= 16)
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))

      /* Canonicalize the comparison to EQ, GT, GTU.  */
	  code = reverse_condition (code);
	  code = reverse_condition (code);
	  std::swap (cop0, cop1);
	  code = swap_condition (code);

	  /* Only SSE4.1/SSE4.2 supports V2DImode.  */
	  if (mode == V2DImode)
		/* SSE4.1 supports EQ.  */
		/* SSE4.2 supports GT/GTU.  */

	  rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
	  rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
	      std::swap (optrue, opfalse);

	  /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
	     not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
	     min (x, y) == x).  While we add one instruction (the minimum),
	     we remove the need for two instructions in the negation, as the
	     result is done this way.
	     When using masks, do it for SI/DImode element types, as it is shorter
	     than the two subtractions.  */
	       && GET_MODE_SIZE (mode) != 64
	       && vector_all_ones_operand (opfalse, data_mode)
	       && optrue == CONST0_RTX (data_mode))
		   && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
		   /* Don't do it if not using integer masks and we'd end up with
		      the right values in the registers though.  */
		   && (GET_MODE_SIZE (mode) == 64
		       || !vector_all_ones_operand (optrue, data_mode)
		       || opfalse != CONST0_RTX (data_mode))))
	      rtx (*gen) (rtx, rtx, rtx) = NULL;

		  gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
		  gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		  gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
		  gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
		  gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
		  if (TARGET_AVX512VL)
		      gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
		      cop0 = force_reg (mode, cop0);
		      cop1 = force_reg (mode, cop1);
		  if (code == GTU && TARGET_SSE2)
		    gen = gen_uminv16qi3;
		  else if (code == GT && TARGET_SSE4_1)
		    gen = gen_sminv16qi3;
		  if (code == GTU && TARGET_SSE2)
		    gen = gen_uminv8qi3;
		  else if (code == GT && TARGET_SSE4_1)
		    gen = gen_sminv8qi3;
		  if (code == GTU && TARGET_SSE2)
		    gen = gen_uminv4qi3;
		  else if (code == GT && TARGET_SSE4_1)
		    gen = gen_sminv4qi3;
		  if (code == GTU && TARGET_SSE2)
		    gen = gen_uminv2qi3;
		  else if (code == GT && TARGET_SSE4_1)
		    gen = gen_sminv2qi3;
		  if (code == GTU && TARGET_SSE4_1)
		    gen = gen_uminv8hi3;
		  else if (code == GT && TARGET_SSE2)
		    gen = gen_sminv8hi3;
		  if (code == GTU && TARGET_SSE4_1)
		    gen = gen_uminv4hi3;
		  else if (code == GT && TARGET_SSE2)
		    gen = gen_sminv4hi3;
		  if (code == GTU && TARGET_SSE4_1)
		    gen = gen_uminv2hi3;
		  else if (code == GT && TARGET_SSE2)
		    gen = gen_sminv2hi3;
		  gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
		  gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
		  if (TARGET_AVX512VL)
		      gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
		      cop0 = force_reg (mode, cop0);
		      cop1 = force_reg (mode, cop1);

		  rtx tem = gen_reg_rtx (mode);
		  if (!vector_operand (cop0, mode))
		    cop0 = force_reg (mode, cop0);
		  if (!vector_operand (cop1, mode))
		    cop1 = force_reg (mode, cop1);
		  emit_insn (gen (tem, cop0, cop1));

	      /* Unsigned parallel compare is not supported by the hardware.
		 Play some tricks to turn this into a signed comparison
	      cop0 = force_reg (mode, cop0);

		  /* Subtract (-(INT MAX) - 1) from both operands to make
		  mask = ix86_build_signbit_mask (mode, true, false);
		  t1 = gen_reg_rtx (mode);
		  emit_insn (gen_sub3_insn (t1, cop0, mask));

		  t2 = gen_reg_rtx (mode);
		  emit_insn (gen_sub3_insn (t2, cop1, mask));

		  /* Perform a parallel unsigned saturating subtraction.  */
		  x = gen_reg_rtx (mode);
		  emit_insn (gen_rtx_SET
			     (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
		  cop1 = CONST0_RTX (mode);

    std::swap (op_true, op_false);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
      x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
      if (GET_MODE (x) == mode)
	x = gen_lowpart (data_mode, x);
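
/* Illustrative note on the unsigned-compare tricks above, in scalar terms:
   - biasing both sides by the sign bit works because
       (x ^ 0x80000000) >signed (y ^ 0x80000000)  ==  x >unsigned y
     (here realised as a subtraction of the sign-bit mask);
   - the unsigned-minimum form works because umin (x, y) == x iff x <= y,
     so "x <=u y" turns into an equality test that pcmpeq can handle.  */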

/* Expand integer vector comparison.  */

ix86_expand_int_vec_cmp (rtx operands[])
  rtx_code code = GET_CODE (operands[1]);
  bool negate = false;
  rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
				     operands[3], NULL, NULL, &negate);

    cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
				   CONST0_RTX (GET_MODE (cmp)),
				   NULL, NULL, &negate);
    gcc_assert (!negate);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

/* Expand a floating-point vector conditional move; a vcond operation
   rather than a movcc operation.  */

ix86_expand_fp_vcond (rtx operands[])
  enum rtx_code code = GET_CODE (operands[3]);

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[4], &operands[5]);
  if (code == UNKNOWN)

  switch (GET_CODE (operands[3]))
      temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
				  operands[5], operands[0], operands[0]);
      cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
				 operands[5], operands[1], operands[2]);
      temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
				  operands[5], operands[0], operands[0]);
      cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
				 operands[5], operands[1], operands[2]);

      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);

  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
				 operands[5], operands[1], operands[2]))

  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
			     operands[1], operands[2]);
  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);

/* Expand a signed/unsigned integral vector conditional move.  */

ix86_expand_int_vcond (rtx operands[])
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);

  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);

ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv16qi3;
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv32qi3;
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      gen = gen_avx512f_vpermt2varv16si3;
      if (TARGET_AVX512VL)
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
      if (TARGET_AVX512VL)
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
      gen = gen_avx512f_vpermt2varv16sf3;
      maskmode = V16SImode;
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      gen = gen_avx512f_vpermt2varv8di3;
      if (TARGET_AVX512VL)
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
      if (TARGET_AVX512VL)
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
      gen = gen_avx512f_vpermt2varv8df3;
      maskmode = V8DImode;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5039 /* Expand a variable vector permutation. */
5042 ix86_expand_vec_perm (rtx operands
[])
5044 rtx target
= operands
[0];
5045 rtx op0
= operands
[1];
5046 rtx op1
= operands
[2];
5047 rtx mask
= operands
[3];
5048 rtx t1
, t2
, t3
, t4
, t5
, t6
, t7
, t8
, vt
, vt2
, vec
[32];
5049 machine_mode mode
= GET_MODE (op0
);
5050 machine_mode maskmode
= GET_MODE (mask
);
5052 bool one_operand_shuffle
= rtx_equal_p (op0
, op1
);
5054 /* Number of elements in the vector. */
5055 w
= GET_MODE_NUNITS (mode
);
5056 e
= GET_MODE_UNIT_SIZE (mode
);
5057 gcc_assert (w
<= 64);
5059 /* For HF mode vector, convert it to HI using subreg. */
5060 if (GET_MODE_INNER (mode
) == HFmode
)
5062 machine_mode orig_mode
= mode
;
5063 mode
= mode_for_vector (HImode
, w
).require ();
5064 target
= lowpart_subreg (mode
, target
, orig_mode
);
5065 op0
= lowpart_subreg (mode
, op0
, orig_mode
);
5066 op1
= lowpart_subreg (mode
, op1
, orig_mode
);
5069 if (TARGET_AVX512F
&& one_operand_shuffle
)
5071 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
5075 gen
=gen_avx512f_permvarv16si
;
5078 gen
= gen_avx512f_permvarv16sf
;
5081 gen
= gen_avx512f_permvarv8di
;
5084 gen
= gen_avx512f_permvarv8df
;
5091 emit_insn (gen (target
, op0
, mask
));
5096 if (ix86_expand_vec_perm_vpermt2 (target
, mask
, op0
, op1
, NULL
))
5101 if (mode
== V4DImode
|| mode
== V4DFmode
|| mode
== V16HImode
)
5103 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5104 an constant shuffle operand. With a tiny bit of effort we can
5105 use VPERMD instead. A re-interpretation stall for V4DFmode is
5106 unfortunate but there's no avoiding it.
5107 Similarly for V16HImode we don't have instructions for variable
5108 shuffling, while for V32QImode we can use after preparing suitable
5109 masks vpshufb; vpshufb; vpermq; vpor. */
5111 if (mode
== V16HImode
)
5113 maskmode
= mode
= V32QImode
;
5119 maskmode
= mode
= V8SImode
;
5123 t1
= gen_reg_rtx (maskmode
);
5125 /* Replicate the low bits of the V4DImode mask into V8SImode:
5127 t1 = { A A B B C C D D }. */
5128 for (i
= 0; i
< w
/ 2; ++i
)
5129 vec
[i
*2 + 1] = vec
[i
*2] = GEN_INT (i
* 2);
5130 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5131 vt
= force_reg (maskmode
, vt
);
5132 mask
= gen_lowpart (maskmode
, mask
);
5133 if (maskmode
== V8SImode
)
5134 emit_insn (gen_avx2_permvarv8si (t1
, mask
, vt
));
5136 emit_insn (gen_avx2_pshufbv32qi3 (t1
, mask
, vt
));
/* Multiply the shuffle indices by two.  */
5139 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, t1
, t1
, 1,
/* Add one to the odd shuffle indices:
   t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
5144 for (i
= 0; i
< w
/ 2; ++i
)
5146 vec
[i
* 2] = const0_rtx
;
5147 vec
[i
* 2 + 1] = const1_rtx
;
5149 vt
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (w
, vec
));
5150 vt
= validize_mem (force_const_mem (maskmode
, vt
));
5151 t1
= expand_simple_binop (maskmode
, PLUS
, t1
, vt
, t1
, 1,
5154 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5155 operands
[3] = mask
= t1
;
5156 target
= gen_reg_rtx (mode
);
5157 op0
= gen_lowpart (mode
, op0
);
5158 op1
= gen_lowpart (mode
, op1
);
5164 /* The VPERMD and VPERMPS instructions already properly ignore
5165 the high bits of the shuffle elements. No need for us to
5166 perform an AND ourselves. */
5167 if (one_operand_shuffle
)
5169 emit_insn (gen_avx2_permvarv8si (target
, op0
, mask
));
5170 if (target
!= operands
[0])
5171 emit_move_insn (operands
[0],
5172 gen_lowpart (GET_MODE (operands
[0]), target
));
5176 t1
= gen_reg_rtx (V8SImode
);
5177 t2
= gen_reg_rtx (V8SImode
);
5178 emit_insn (gen_avx2_permvarv8si (t1
, op0
, mask
));
5179 emit_insn (gen_avx2_permvarv8si (t2
, op1
, mask
));
5185 mask
= gen_lowpart (V8SImode
, mask
);
5186 if (one_operand_shuffle
)
5187 emit_insn (gen_avx2_permvarv8sf (target
, op0
, mask
));
5190 t1
= gen_reg_rtx (V8SFmode
);
5191 t2
= gen_reg_rtx (V8SFmode
);
5192 emit_insn (gen_avx2_permvarv8sf (t1
, op0
, mask
));
5193 emit_insn (gen_avx2_permvarv8sf (t2
, op1
, mask
));
5199 /* By combining the two 128-bit input vectors into one 256-bit
5200 input vector, we can use VPERMD and VPERMPS for the full
5201 two-operand shuffle. */
5202 t1
= gen_reg_rtx (V8SImode
);
5203 t2
= gen_reg_rtx (V8SImode
);
5204 emit_insn (gen_avx_vec_concatv8si (t1
, op0
, op1
));
5205 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
5206 emit_insn (gen_avx2_permvarv8si (t1
, t1
, t2
));
5207 emit_insn (gen_avx_vextractf128v8si (target
, t1
, const0_rtx
));
5211 t1
= gen_reg_rtx (V8SFmode
);
5212 t2
= gen_reg_rtx (V8SImode
);
5213 mask
= gen_lowpart (V4SImode
, mask
);
5214 emit_insn (gen_avx_vec_concatv8sf (t1
, op0
, op1
));
5215 emit_insn (gen_avx_vec_concatv8si (t2
, mask
, mask
));
5216 emit_insn (gen_avx2_permvarv8sf (t1
, t1
, t2
));
5217 emit_insn (gen_avx_vextractf128v8sf (target
, t1
, const0_rtx
));
5221 t1
= gen_reg_rtx (V32QImode
);
5222 t2
= gen_reg_rtx (V32QImode
);
5223 t3
= gen_reg_rtx (V32QImode
);
5224 vt2
= GEN_INT (-128);
5225 vt
= gen_const_vec_duplicate (V32QImode
, vt2
);
5226 vt
= force_reg (V32QImode
, vt
);
5227 for (i
= 0; i
< 32; i
++)
5228 vec
[i
] = i
< 16 ? vt2
: const0_rtx
;
5229 vt2
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, vec
));
5230 vt2
= force_reg (V32QImode
, vt2
);
5231 /* From mask create two adjusted masks, which contain the same
5232 bits as mask in the low 7 bits of each vector element.
5233 The first mask will have the most significant bit clear
5234 if it requests element from the same 128-bit lane
5235 and MSB set if it requests element from the other 128-bit lane.
5236 The second mask will have the opposite values of the MSB,
5237 and additionally will have its 128-bit lanes swapped.
5238 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5239 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5240 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5241 stands for other 12 bytes. */
/* The bit that tells whether an element is from the same lane or the
   other lane is bit 4, so shift it up by 3 to the MSB position.  */
5244 t5
= gen_reg_rtx (V4DImode
);
5245 emit_insn (gen_ashlv4di3 (t5
, gen_lowpart (V4DImode
, mask
),
5247 /* Clear MSB bits from the mask just in case it had them set. */
5248 emit_insn (gen_avx2_andnotv32qi3 (t2
, vt
, mask
));
5249 /* After this t1 will have MSB set for elements from other lane. */
5250 emit_insn (gen_xorv32qi3 (t1
, gen_lowpart (V32QImode
, t5
), vt2
));
5251 /* Clear bits other than MSB. */
5252 emit_insn (gen_andv32qi3 (t1
, t1
, vt
));
5253 /* Or in the lower bits from mask into t3. */
5254 emit_insn (gen_iorv32qi3 (t3
, t1
, t2
));
/* And invert MSB bits in t1, so MSB is set for elements from the same
   lane.  */
5257 emit_insn (gen_xorv32qi3 (t1
, t1
, vt
));
5258 /* Swap 128-bit lanes in t3. */
5259 t6
= gen_reg_rtx (V4DImode
);
5260 emit_insn (gen_avx2_permv4di_1 (t6
, gen_lowpart (V4DImode
, t3
),
5261 const2_rtx
, GEN_INT (3),
5262 const0_rtx
, const1_rtx
));
5263 /* And or in the lower bits from mask into t1. */
5264 emit_insn (gen_iorv32qi3 (t1
, t1
, t2
));
5265 if (one_operand_shuffle
)
5267 /* Each of these shuffles will put 0s in places where
5268 element from the other 128-bit lane is needed, otherwise
5269 will shuffle in the requested value. */
5270 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op0
,
5271 gen_lowpart (V32QImode
, t6
)));
5272 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op0
, t1
));
5273 /* For t3 the 128-bit lanes are swapped again. */
5274 t7
= gen_reg_rtx (V4DImode
);
5275 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t3
),
5276 const2_rtx
, GEN_INT (3),
5277 const0_rtx
, const1_rtx
));
5278 /* And oring both together leads to the result. */
5279 emit_insn (gen_iorv32qi3 (target
, t1
,
5280 gen_lowpart (V32QImode
, t7
)));
5281 if (target
!= operands
[0])
5282 emit_move_insn (operands
[0],
5283 gen_lowpart (GET_MODE (operands
[0]), target
));
5287 t4
= gen_reg_rtx (V32QImode
);
/* Similarly to the above one_operand_shuffle code, just repeated
   twice, once for each operand.  The merge_two: code will merge
   the two results together.  */
5291 emit_insn (gen_avx2_pshufbv32qi3 (t4
, op0
,
5292 gen_lowpart (V32QImode
, t6
)));
5293 emit_insn (gen_avx2_pshufbv32qi3 (t3
, op1
,
5294 gen_lowpart (V32QImode
, t6
)));
5295 emit_insn (gen_avx2_pshufbv32qi3 (t2
, op0
, t1
));
5296 emit_insn (gen_avx2_pshufbv32qi3 (t1
, op1
, t1
));
5297 t7
= gen_reg_rtx (V4DImode
);
5298 emit_insn (gen_avx2_permv4di_1 (t7
, gen_lowpart (V4DImode
, t4
),
5299 const2_rtx
, GEN_INT (3),
5300 const0_rtx
, const1_rtx
));
5301 t8
= gen_reg_rtx (V4DImode
);
5302 emit_insn (gen_avx2_permv4di_1 (t8
, gen_lowpart (V4DImode
, t3
),
5303 const2_rtx
, GEN_INT (3),
5304 const0_rtx
, const1_rtx
));
5305 emit_insn (gen_iorv32qi3 (t4
, t2
, gen_lowpart (V32QImode
, t7
)));
5306 emit_insn (gen_iorv32qi3 (t3
, t1
, gen_lowpart (V32QImode
, t8
)));
5312 gcc_assert (GET_MODE_SIZE (mode
) <= 16);
5319 /* The XOP VPPERM insn supports three inputs. By ignoring the
5320 one_operand_shuffle special case, we avoid creating another
5321 set of constant vectors in memory. */
5322 one_operand_shuffle
= false;
5324 /* mask = mask & {2*w-1, ...} */
5325 vt
= GEN_INT (2*w
- 1);
5329 /* mask = mask & {w-1, ...} */
5330 vt
= GEN_INT (w
- 1);
5333 vt
= gen_const_vec_duplicate (maskmode
, vt
);
5334 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
5335 NULL_RTX
, 0, OPTAB_DIRECT
);
5337 /* For non-QImode operations, convert the word permutation control
5338 into a byte permutation control. */
5339 if (mode
!= V16QImode
)
5341 mask
= expand_simple_binop (maskmode
, ASHIFT
, mask
,
5342 GEN_INT (exact_log2 (e
)),
5343 NULL_RTX
, 0, OPTAB_DIRECT
);
5345 /* Convert mask to vector of chars. */
5346 mask
= force_reg (V16QImode
, gen_lowpart (V16QImode
, mask
));
5348 /* Replicate each of the input bytes into byte positions:
5349 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5350 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5351 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5352 for (i
= 0; i
< 16; ++i
)
5353 vec
[i
] = GEN_INT (i
/e
* e
);
5354 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
5355 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
5357 emit_insn (gen_xop_pperm (mask
, mask
, mask
, vt
));
5359 emit_insn (gen_ssse3_pshufbv16qi3 (mask
, mask
, vt
));
/* Convert it into the byte positions by doing
   mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...}.  */
5363 for (i
= 0; i
< 16; ++i
)
5364 vec
[i
] = GEN_INT (i
% e
);
5365 vt
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, vec
));
5366 vt
= validize_mem (force_const_mem (V16QImode
, vt
));
5367 emit_insn (gen_addv16qi3 (mask
, mask
, vt
));
5370 /* The actual shuffle operations all operate on V16QImode. */
5371 op0
= gen_lowpart (V16QImode
, op0
);
5372 op1
= gen_lowpart (V16QImode
, op1
);
5376 if (GET_MODE (target
) != V16QImode
)
5377 target
= gen_reg_rtx (V16QImode
);
5378 emit_insn (gen_xop_pperm (target
, op0
, op1
, mask
));
5379 if (target
!= operands
[0])
5380 emit_move_insn (operands
[0],
5381 gen_lowpart (GET_MODE (operands
[0]), target
));
5383 else if (one_operand_shuffle
)
5385 if (GET_MODE (target
) != V16QImode
)
5386 target
= gen_reg_rtx (V16QImode
);
5387 emit_insn (gen_ssse3_pshufbv16qi3 (target
, op0
, mask
));
5388 if (target
!= operands
[0])
5389 emit_move_insn (operands
[0],
5390 gen_lowpart (GET_MODE (operands
[0]), target
));
5397 /* Shuffle the two input vectors independently. */
5398 t1
= gen_reg_rtx (V16QImode
);
5399 t2
= gen_reg_rtx (V16QImode
);
5400 emit_insn (gen_ssse3_pshufbv16qi3 (t1
, op0
, mask
));
5401 emit_insn (gen_ssse3_pshufbv16qi3 (t2
, op1
, mask
));
5404 /* Then merge them together. The key is whether any given control
5405 element contained a bit set that indicates the second word. */
5408 if (maskmode
== V2DImode
&& !TARGET_SSE4_1
)
/* Without SSE4.1, we don't have V2DImode EQ.  Perform one
   more shuffle to convert the V2DI input mask into a V4SI
   input mask.  At that point the masking that expand_int_vcond
   does will work as desired.  */
5414 rtx t3
= gen_reg_rtx (V4SImode
);
5415 emit_insn (gen_sse2_pshufd_1 (t3
, gen_lowpart (V4SImode
, mask
),
5416 const0_rtx
, const0_rtx
,
5417 const2_rtx
, const2_rtx
));
5419 maskmode
= V4SImode
;
5423 vt
= gen_const_vec_duplicate (maskmode
, vt
);
5424 vt
= force_reg (maskmode
, vt
);
5425 mask
= expand_simple_binop (maskmode
, AND
, mask
, vt
,
5426 NULL_RTX
, 0, OPTAB_DIRECT
);
5428 if (GET_MODE (target
) != mode
)
5429 target
= gen_reg_rtx (mode
);
5431 xops
[1] = gen_lowpart (mode
, t2
);
5432 xops
[2] = gen_lowpart (mode
, t1
);
5433 xops
[3] = gen_rtx_EQ (maskmode
, mask
, vt
);
5436 ok
= ix86_expand_int_vcond (xops
);
5438 if (target
!= operands
[0])
5439 emit_move_insn (operands
[0],
5440 gen_lowpart (GET_MODE (operands
[0]), target
));
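/* Sketch (not part of GCC) of how the word-level selector is turned into
   the byte-level PSHUFB control built above for the one-operand SSSE3
   path: byte J of the control selects byte (sel[J/E] & (NELT-1)) * E + J%E
   of the source.  Names are hypothetical; E is the element size in bytes,
   NELT the element count, and NELT*E is assumed to be 16.  */
static void
build_pshufb_control (unsigned char *ctrl, const unsigned *sel,
                      unsigned nelt, unsigned e)
{
  for (unsigned j = 0; j < nelt * e; j++)
    ctrl[j] = (unsigned char) ((sel[j / e] & (nelt - 1)) * e + j % e);
}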
5444 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
5445 true if we should do zero extension, else sign extension. HIGH_P is
5446 true if we want the N/2 high elements, else the low elements. */
5449 ix86_expand_sse_unpack (rtx dest
, rtx src
, bool unsigned_p
, bool high_p
)
5451 machine_mode imode
= GET_MODE (src
);
5456 rtx (*unpack
)(rtx
, rtx
);
5457 rtx (*extract
)(rtx
, rtx
) = NULL
;
5458 machine_mode halfmode
= BLKmode
;
5464 unpack
= gen_avx512bw_zero_extendv32qiv32hi2
;
5466 unpack
= gen_avx512bw_sign_extendv32qiv32hi2
;
5467 halfmode
= V32QImode
;
5469 = high_p
? gen_vec_extract_hi_v64qi
: gen_vec_extract_lo_v64qi
;
5473 unpack
= gen_avx2_zero_extendv16qiv16hi2
;
5475 unpack
= gen_avx2_sign_extendv16qiv16hi2
;
5476 halfmode
= V16QImode
;
5478 = high_p
? gen_vec_extract_hi_v32qi
: gen_vec_extract_lo_v32qi
;
5482 unpack
= gen_avx512f_zero_extendv16hiv16si2
;
5484 unpack
= gen_avx512f_sign_extendv16hiv16si2
;
5485 halfmode
= V16HImode
;
5487 = high_p
? gen_vec_extract_hi_v32hi
: gen_vec_extract_lo_v32hi
;
5491 unpack
= gen_avx2_zero_extendv8hiv8si2
;
5493 unpack
= gen_avx2_sign_extendv8hiv8si2
;
5494 halfmode
= V8HImode
;
5496 = high_p
? gen_vec_extract_hi_v16hi
: gen_vec_extract_lo_v16hi
;
5500 unpack
= gen_avx512f_zero_extendv8siv8di2
;
5502 unpack
= gen_avx512f_sign_extendv8siv8di2
;
5503 halfmode
= V8SImode
;
5505 = high_p
? gen_vec_extract_hi_v16si
: gen_vec_extract_lo_v16si
;
5509 unpack
= gen_avx2_zero_extendv4siv4di2
;
5511 unpack
= gen_avx2_sign_extendv4siv4di2
;
5512 halfmode
= V4SImode
;
5514 = high_p
? gen_vec_extract_hi_v8si
: gen_vec_extract_lo_v8si
;
5518 unpack
= gen_sse4_1_zero_extendv8qiv8hi2
;
5520 unpack
= gen_sse4_1_sign_extendv8qiv8hi2
;
5524 unpack
= gen_sse4_1_zero_extendv4hiv4si2
;
5526 unpack
= gen_sse4_1_sign_extendv4hiv4si2
;
5530 unpack
= gen_sse4_1_zero_extendv2siv2di2
;
5532 unpack
= gen_sse4_1_sign_extendv2siv2di2
;
5536 unpack
= gen_sse4_1_zero_extendv4qiv4hi2
;
5538 unpack
= gen_sse4_1_sign_extendv4qiv4hi2
;
5542 unpack
= gen_sse4_1_zero_extendv2hiv2si2
;
5544 unpack
= gen_sse4_1_sign_extendv2hiv2si2
;
5548 unpack
= gen_sse4_1_zero_extendv2qiv2hi2
;
5550 unpack
= gen_sse4_1_sign_extendv2qiv2hi2
;
5556 if (GET_MODE_SIZE (imode
) >= 32)
5558 tmp
= gen_reg_rtx (halfmode
);
5559 emit_insn (extract (tmp
, src
));
5563 switch (GET_MODE_SIZE (imode
))
5566 /* Shift higher 8 bytes to lower 8 bytes. */
5567 tmp
= gen_reg_rtx (V1TImode
);
5568 emit_insn (gen_sse2_lshrv1ti3 (tmp
, gen_lowpart (V1TImode
, src
),
5572 /* Shift higher 4 bytes to lower 4 bytes. */
5573 tmp
= gen_reg_rtx (V1DImode
);
5574 emit_insn (gen_mmx_lshrv1di3 (tmp
, gen_lowpart (V1DImode
, src
),
5578 /* Shift higher 2 bytes to lower 2 bytes. */
5579 tmp
= gen_reg_rtx (V1SImode
);
5580 emit_insn (gen_mmx_lshrv1si3 (tmp
, gen_lowpart (V1SImode
, src
),
5587 tmp
= gen_lowpart (imode
, tmp
);
5592 emit_insn (unpack (dest
, tmp
));
5596 rtx (*unpack
)(rtx
, rtx
, rtx
);
5602 unpack
= gen_vec_interleave_highv16qi
;
5604 unpack
= gen_vec_interleave_lowv16qi
;
5608 unpack
= gen_vec_interleave_highv8hi
;
5610 unpack
= gen_vec_interleave_lowv8hi
;
5614 unpack
= gen_vec_interleave_highv4si
;
5616 unpack
= gen_vec_interleave_lowv4si
;
5620 unpack
= gen_mmx_punpckhbw
;
5622 unpack
= gen_mmx_punpcklbw
;
5626 unpack
= gen_mmx_punpckhwd
;
5628 unpack
= gen_mmx_punpcklwd
;
5632 unpack
= gen_mmx_punpckhbw_low
;
5634 unpack
= gen_mmx_punpcklbw_low
;
5641 tmp
= force_reg (imode
, CONST0_RTX (imode
));
5643 tmp
= ix86_expand_sse_cmp (gen_reg_rtx (imode
), GT
, CONST0_RTX (imode
),
5644 src
, pc_rtx
, pc_rtx
);
5646 rtx tmp2
= gen_reg_rtx (imode
);
5647 emit_insn (unpack (tmp2
, src
, tmp
));
5648 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), tmp2
));
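/* Reference sketch (not part of GCC) of what the unpack expander above
   computes: widen the low or high half of an N-element vector with sign
   or zero extension.  The 8-bit to 16-bit case is shown; names and types
   are hypothetical.  */
static void
sse_unpack_reference (short *dst, const signed char *src, int n,
                      bool unsigned_p, bool high_p)
{
  const signed char *half = src + (high_p ? n / 2 : 0);
  for (int i = 0; i < n / 2; i++)
    dst[i] = unsigned_p ? (short) (unsigned char) half[i] : (short) half[i];
}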
/* Return true if MEM is a constant-pool reference whose constant is a
   CONST_VECTOR permutation index; if so, assign the index to PERM.  */
bool
ix86_extract_perm_from_pool_constant (int *perm, rtx mem)
{
  machine_mode mode = GET_MODE (mem);
  int nelt = GET_MODE_NUNITS (mode);

  if (!INTEGRAL_MODE_P (mode))
    return false;

  /* Needs to be constant pool.  */
  if (!MEM_P (mem)
      || !SYMBOL_REF_P (XEXP (mem, 0))
      || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
    return false;

  rtx constant = get_pool_constant (XEXP (mem, 0));

  if (GET_CODE (constant) != CONST_VECTOR)
    return false;

  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" referring to a V2DI constant vector.  */
  if (GET_MODE (constant) != mode)
    {
      constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
	return false;
    }

  for (int i = 0; i != nelt; i++)
    perm[i] = UINTVAL (XVECEXP (constant, 0, i));

  return true;
}
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating point parameters and non-offsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally three parts are generated.  */
5697 ix86_split_to_parts (rtx operand
, rtx
*parts
, machine_mode mode
)
5702 size
= mode
==XFmode
? 3 : GET_MODE_SIZE (mode
) / 4;
5704 size
= (GET_MODE_SIZE (mode
) + 4) / 8;
5706 gcc_assert (!REG_P (operand
) || !MMX_REGNO_P (REGNO (operand
)));
5707 gcc_assert (size
>= 2 && size
<= 4);
5709 /* Optimize constant pool reference to immediates. This is used by fp
5710 moves, that force all constants to memory to allow combining. */
5711 if (MEM_P (operand
) && MEM_READONLY_P (operand
))
5712 operand
= avoid_constant_pool_reference (operand
);
5714 if (MEM_P (operand
) && !offsettable_memref_p (operand
))
/* The only non-offsettable memories we handle are pushes.  */
5717 int ok
= push_operand (operand
, VOIDmode
);
5721 operand
= copy_rtx (operand
);
5722 PUT_MODE (operand
, word_mode
);
5723 parts
[0] = parts
[1] = parts
[2] = parts
[3] = operand
;
5727 if (GET_CODE (operand
) == CONST_VECTOR
)
5729 scalar_int_mode imode
= int_mode_for_mode (mode
).require ();
5730 /* Caution: if we looked through a constant pool memory above,
5731 the operand may actually have a different mode now. That's
5732 ok, since we want to pun this all the way back to an integer. */
5733 operand
= simplify_subreg (imode
, operand
, GET_MODE (operand
), 0);
5734 gcc_assert (operand
!= NULL
);
5741 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5746 if (REG_P (operand
))
5748 gcc_assert (reload_completed
);
5749 for (i
= 0; i
< size
; i
++)
5750 parts
[i
] = gen_rtx_REG (SImode
, REGNO (operand
) + i
);
5752 else if (offsettable_memref_p (operand
))
5754 operand
= adjust_address (operand
, SImode
, 0);
5756 for (i
= 1; i
< size
; i
++)
5757 parts
[i
] = adjust_address (operand
, SImode
, 4 * i
);
5759 else if (CONST_DOUBLE_P (operand
))
5761 const REAL_VALUE_TYPE
*r
;
5764 r
= CONST_DOUBLE_REAL_VALUE (operand
);
5768 real_to_target (l
, r
, mode
);
5769 parts
[3] = gen_int_mode (l
[3], SImode
);
5770 parts
[2] = gen_int_mode (l
[2], SImode
);
5773 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
5774 long double may not be 80-bit. */
5775 real_to_target (l
, r
, mode
);
5776 parts
[2] = gen_int_mode (l
[2], SImode
);
5779 REAL_VALUE_TO_TARGET_DOUBLE (*r
, l
);
5784 parts
[1] = gen_int_mode (l
[1], SImode
);
5785 parts
[0] = gen_int_mode (l
[0], SImode
);
5794 split_double_mode (mode
, &operand
, 1, &parts
[0], &parts
[1]);
5795 if (mode
== XFmode
|| mode
== TFmode
)
5797 machine_mode upper_mode
= mode
==XFmode
? SImode
: DImode
;
5798 if (REG_P (operand
))
5800 gcc_assert (reload_completed
);
5801 parts
[0] = gen_rtx_REG (DImode
, REGNO (operand
) + 0);
5802 parts
[1] = gen_rtx_REG (upper_mode
, REGNO (operand
) + 1);
5804 else if (offsettable_memref_p (operand
))
5806 operand
= adjust_address (operand
, DImode
, 0);
5808 parts
[1] = adjust_address (operand
, upper_mode
, 8);
5810 else if (CONST_DOUBLE_P (operand
))
5814 real_to_target (l
, CONST_DOUBLE_REAL_VALUE (operand
), mode
);
5816 /* real_to_target puts 32-bit pieces in each long. */
5817 parts
[0] = gen_int_mode ((l
[0] & HOST_WIDE_INT_C (0xffffffff))
5818 | ((l
[1] & HOST_WIDE_INT_C (0xffffffff))
5821 if (upper_mode
== SImode
)
5822 parts
[1] = gen_int_mode (l
[2], SImode
);
5825 = gen_int_mode ((l
[2] & HOST_WIDE_INT_C (0xffffffff))
5826 | ((l
[3] & HOST_WIDE_INT_C (0xffffffff))
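/* Host-side sketch (not part of GCC) of the common DFmode case handled
   above: splitting a double constant amounts to exposing the two 32-bit
   words of its IEEE-754 binary64 representation.  This assumes the host
   'double' is binary64 with little-endian word order, and relies on the
   memcpy declaration already pulled in by this file's includes; the
   function name is hypothetical.  */
static void
split_double_to_words (double value, unsigned int *lo, unsigned int *hi)
{
  unsigned long long bits;
  memcpy (&bits, &value, sizeof (bits));	/* Type-pun via memcpy.  */
  *lo = (unsigned int) (bits & 0xffffffffu);	/* Low 32 bits.  */
  *hi = (unsigned int) (bits >> 32);		/* High 32 bits.  */
}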
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   Return false when normal moves are needed; true when all required
   insns have been emitted.  Operands 2-4 contain the input values
   in the correct order; operands 5-7 contain the output values.  */
5843 ix86_split_long_move (rtx operands
[])
5849 machine_mode mode
= GET_MODE (operands
[0]);
5850 bool collisionparts
[4];
/* The DFmode expanders may ask us to move double.
   For a 64-bit target this is a single move.  By hiding that fact
   here we simplify the i386.md splitters.  */
5855 if (TARGET_64BIT
&& GET_MODE_SIZE (GET_MODE (operands
[0])) == 8)
5857 /* Optimize constant pool reference to immediates. This is used by
5858 fp moves, that force all constants to memory to allow combining. */
5860 if (MEM_P (operands
[1])
5861 && GET_CODE (XEXP (operands
[1], 0)) == SYMBOL_REF
5862 && CONSTANT_POOL_ADDRESS_P (XEXP (operands
[1], 0)))
5863 operands
[1] = get_pool_constant (XEXP (operands
[1], 0));
5864 if (push_operand (operands
[0], VOIDmode
))
5866 operands
[0] = copy_rtx (operands
[0]);
5867 PUT_MODE (operands
[0], word_mode
);
5870 operands
[0] = gen_lowpart (DImode
, operands
[0]);
5871 operands
[1] = gen_lowpart (DImode
, operands
[1]);
5872 emit_move_insn (operands
[0], operands
[1]);
5876 /* The only non-offsettable memory we handle is push. */
5877 if (push_operand (operands
[0], VOIDmode
))
5880 gcc_assert (!MEM_P (operands
[0])
5881 || offsettable_memref_p (operands
[0]));
5883 nparts
= ix86_split_to_parts (operands
[1], part
[1], GET_MODE (operands
[0]));
5884 ix86_split_to_parts (operands
[0], part
[0], GET_MODE (operands
[0]));
5886 /* When emitting push, take care for source operands on the stack. */
5887 if (push
&& MEM_P (operands
[1])
5888 && reg_overlap_mentioned_p (stack_pointer_rtx
, operands
[1]))
5890 rtx src_base
= XEXP (part
[1][nparts
- 1], 0);
5892 /* Compensate for the stack decrement by 4. */
5893 if (!TARGET_64BIT
&& nparts
== 3
5894 && mode
== XFmode
&& TARGET_128BIT_LONG_DOUBLE
)
5895 src_base
= plus_constant (Pmode
, src_base
, 4);
5897 /* src_base refers to the stack pointer and is
5898 automatically decreased by emitted push. */
5899 for (i
= 0; i
< nparts
; i
++)
5900 part
[1][i
] = change_address (part
[1][i
],
5901 GET_MODE (part
[1][i
]), src_base
);
5904 /* We need to do copy in the right order in case an address register
5905 of the source overlaps the destination. */
5906 if (REG_P (part
[0][0]) && MEM_P (part
[1][0]))
5910 for (i
= 0; i
< nparts
; i
++)
5913 = reg_overlap_mentioned_p (part
[0][i
], XEXP (part
[1][0], 0));
5914 if (collisionparts
[i
])
5918 /* Collision in the middle part can be handled by reordering. */
5919 if (collisions
== 1 && nparts
== 3 && collisionparts
[1])
5921 std::swap (part
[0][1], part
[0][2]);
5922 std::swap (part
[1][1], part
[1][2]);
5924 else if (collisions
== 1
5926 && (collisionparts
[1] || collisionparts
[2]))
5928 if (collisionparts
[1])
5930 std::swap (part
[0][1], part
[0][2]);
5931 std::swap (part
[1][1], part
[1][2]);
5935 std::swap (part
[0][2], part
[0][3]);
5936 std::swap (part
[1][2], part
[1][3]);
5940 /* If there are more collisions, we can't handle it by reordering.
5941 Do an lea to the last part and use only one colliding move. */
5942 else if (collisions
> 1)
5948 base
= part
[0][nparts
- 1];
5950 /* Handle the case when the last part isn't valid for lea.
5951 Happens in 64-bit mode storing the 12-byte XFmode. */
5952 if (GET_MODE (base
) != Pmode
)
5953 base
= gen_rtx_REG (Pmode
, REGNO (base
));
5955 addr
= XEXP (part
[1][0], 0);
5956 if (TARGET_TLS_DIRECT_SEG_REFS
)
5958 struct ix86_address parts
;
5959 int ok
= ix86_decompose_address (addr
, &parts
);
5961 /* It is not valid to use %gs: or %fs: in lea. */
5962 gcc_assert (parts
.seg
== ADDR_SPACE_GENERIC
);
5964 emit_insn (gen_rtx_SET (base
, addr
));
5965 part
[1][0] = replace_equiv_address (part
[1][0], base
);
5966 for (i
= 1; i
< nparts
; i
++)
5968 tmp
= plus_constant (Pmode
, base
, UNITS_PER_WORD
* i
);
5969 part
[1][i
] = replace_equiv_address (part
[1][i
], tmp
);
5980 if (TARGET_128BIT_LONG_DOUBLE
&& mode
== XFmode
)
5981 emit_insn (gen_add2_insn (stack_pointer_rtx
, GEN_INT (-4)));
5982 emit_move_insn (part
[0][2], part
[1][2]);
5984 else if (nparts
== 4)
5986 emit_move_insn (part
[0][3], part
[1][3]);
5987 emit_move_insn (part
[0][2], part
[1][2]);
/* In 64-bit mode we don't have a 32-bit push available.  If this is a
   register, that is OK - we will just use the larger counterpart.  We also
   retype memory - these cases come from an attempt to avoid a REX prefix
   on moving the second half of a TFmode value.  */
5996 if (GET_MODE (part
[1][1]) == SImode
)
5998 switch (GET_CODE (part
[1][1]))
6001 part
[1][1] = adjust_address (part
[1][1], DImode
, 0);
6005 part
[1][1] = gen_rtx_REG (DImode
, REGNO (part
[1][1]));
6012 if (GET_MODE (part
[1][0]) == SImode
)
6013 part
[1][0] = part
[1][1];
6016 emit_move_insn (part
[0][1], part
[1][1]);
6017 emit_move_insn (part
[0][0], part
[1][0]);
6021 /* Choose correct order to not overwrite the source before it is copied. */
6022 if ((REG_P (part
[0][0])
6023 && REG_P (part
[1][1])
6024 && (REGNO (part
[0][0]) == REGNO (part
[1][1])
6026 && REGNO (part
[0][0]) == REGNO (part
[1][2]))
6028 && REGNO (part
[0][0]) == REGNO (part
[1][3]))))
6030 && reg_overlap_mentioned_p (part
[0][0], XEXP (part
[1][0], 0))))
6032 for (i
= 0, j
= nparts
- 1; i
< nparts
; i
++, j
--)
6034 operands
[2 + i
] = part
[0][j
];
6035 operands
[6 + i
] = part
[1][j
];
6040 for (i
= 0; i
< nparts
; i
++)
6042 operands
[2 + i
] = part
[0][i
];
6043 operands
[6 + i
] = part
[1][i
];
6047 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
6048 if (optimize_insn_for_size_p ())
6050 for (j
= 0; j
< nparts
- 1; j
++)
6051 if (CONST_INT_P (operands
[6 + j
])
6052 && operands
[6 + j
] != const0_rtx
6053 && REG_P (operands
[2 + j
]))
6054 for (i
= j
; i
< nparts
- 1; i
++)
6055 if (CONST_INT_P (operands
[7 + i
])
6056 && INTVAL (operands
[7 + i
]) == INTVAL (operands
[6 + j
]))
6057 operands
[7 + i
] = operands
[2 + j
];
6060 for (i
= 0; i
< nparts
; i
++)
6061 emit_move_insn (operands
[2 + i
], operands
[6 + i
]);
/* Helper function of ix86_split_ashl used to generate an SImode/DImode
   left shift by a constant, either using a single shift or
   a sequence of add instructions.  */

static void
ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
{
  if (count == 1
      || (count * ix86_cost->add <= ix86_cost->shift_const
	  && !optimize_insn_for_size_p ()))
    {
      while (count-- > 0)
	emit_insn (gen_add2_insn (operand, operand));
    }
  else
    {
      rtx (*insn)(rtx, rtx, rtx);

      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
      emit_insn (insn (operand, operand, GEN_INT (count)));
    }
}
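/* Reference sketch (not part of GCC) of the constant-count double-word
   left shift that ix86_split_ashl, below, open-codes with mov/shld/shift.
   Each half holds W significant bits (W is 32 or 64) and 0 <= COUNT < 2*W;
   the function name is hypothetical.  */
static void
dword_ashl_reference (unsigned long long *hi, unsigned long long *lo,
                      int count, int w)
{
  unsigned long long mask
    = w == 64 ? ~0ULL : (1ULL << w) - 1;	/* Low W bits of a half.  */

  if (count >= w)
    {
      *hi = (*lo << (count - w)) & mask;	/* Low half becomes high half.  */
      *lo = 0;
    }
  else if (count > 0)
    {
      *hi = ((*hi << count) | (*lo >> (w - count))) & mask;	/* shld.  */
      *lo = (*lo << count) & mask;
    }
}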
6090 ix86_split_ashl (rtx
*operands
, rtx scratch
, machine_mode mode
)
6092 rtx (*gen_ashl3
)(rtx
, rtx
, rtx
);
6093 rtx (*gen_shld
)(rtx
, rtx
, rtx
);
6094 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6095 machine_mode half_mode
;
6097 rtx low
[2], high
[2];
6100 if (CONST_INT_P (operands
[2]))
6102 split_double_mode (mode
, operands
, 2, low
, high
);
6103 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6105 if (count
>= half_width
)
6107 emit_move_insn (high
[0], low
[1]);
6108 emit_move_insn (low
[0], const0_rtx
);
6110 if (count
> half_width
)
6111 ix86_expand_ashl_const (high
[0], count
- half_width
, mode
);
6115 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
6117 if (!rtx_equal_p (operands
[0], operands
[1]))
6118 emit_move_insn (operands
[0], operands
[1]);
6120 emit_insn (gen_shld (high
[0], low
[0], GEN_INT (count
)));
6121 ix86_expand_ashl_const (low
[0], count
, mode
);
6126 split_double_mode (mode
, operands
, 1, low
, high
);
6127 half_mode
= mode
== DImode
? SImode
: DImode
;
6129 gen_ashl3
= mode
== DImode
? gen_ashlsi3
: gen_ashldi3
;
6131 if (operands
[1] == const1_rtx
)
/* Assuming we've chosen QImode-capable registers, 1 << N
   can be done with two 32/64-bit shifts, no branches, no cmoves.  */
6135 if (ANY_QI_REG_P (low
[0]) && ANY_QI_REG_P (high
[0]))
6137 rtx s
, d
, flags
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
6139 ix86_expand_clear (low
[0]);
6140 ix86_expand_clear (high
[0]);
6141 emit_insn (gen_testqi_ccz_1 (operands
[2], GEN_INT (half_width
)));
6143 d
= gen_lowpart (QImode
, low
[0]);
6144 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
6145 s
= gen_rtx_EQ (QImode
, flags
, const0_rtx
);
6146 emit_insn (gen_rtx_SET (d
, s
));
6148 d
= gen_lowpart (QImode
, high
[0]);
6149 d
= gen_rtx_STRICT_LOW_PART (VOIDmode
, d
);
6150 s
= gen_rtx_NE (QImode
, flags
, const0_rtx
);
6151 emit_insn (gen_rtx_SET (d
, s
));
6154 /* Otherwise, we can get the same results by manually performing
6155 a bit extract operation on bit 5/6, and then performing the two
6156 shifts. The two methods of getting 0/1 into low/high are exactly
6157 the same size. Avoiding the shift in the bit extract case helps
6158 pentium4 a bit; no one else seems to care much either way. */
6161 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
);
6162 rtx (*gen_and3
)(rtx
, rtx
, rtx
);
6163 rtx (*gen_xor3
)(rtx
, rtx
, rtx
);
6169 gen_lshr3
= gen_lshrsi3
;
6170 gen_and3
= gen_andsi3
;
6171 gen_xor3
= gen_xorsi3
;
6176 gen_lshr3
= gen_lshrdi3
;
6177 gen_and3
= gen_anddi3
;
6178 gen_xor3
= gen_xordi3
;
6182 if (TARGET_PARTIAL_REG_STALL
&& !optimize_insn_for_size_p ())
6183 x
= gen_rtx_ZERO_EXTEND (half_mode
, operands
[2]);
6185 x
= gen_lowpart (half_mode
, operands
[2]);
6186 emit_insn (gen_rtx_SET (high
[0], x
));
6188 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (bits
)));
6189 emit_insn (gen_and3 (high
[0], high
[0], const1_rtx
));
6190 emit_move_insn (low
[0], high
[0]);
6191 emit_insn (gen_xor3 (low
[0], low
[0], const1_rtx
));
6194 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
6195 emit_insn (gen_ashl3 (high
[0], high
[0], operands
[2]));
6199 if (operands
[1] == constm1_rtx
)
6201 /* For -1 << N, we can avoid the shld instruction, because we
6202 know that we're shifting 0...31/63 ones into a -1. */
6203 emit_move_insn (low
[0], constm1_rtx
);
6204 if (optimize_insn_for_size_p ())
6205 emit_move_insn (high
[0], low
[0]);
6207 emit_move_insn (high
[0], constm1_rtx
);
6211 gen_shld
= mode
== DImode
? gen_x86_shld
: gen_x86_64_shld
;
6213 if (!rtx_equal_p (operands
[0], operands
[1]))
6214 emit_move_insn (operands
[0], operands
[1]);
6216 split_double_mode (mode
, operands
, 1, low
, high
);
6217 emit_insn (gen_shld (high
[0], low
[0], operands
[2]));
6220 emit_insn (gen_ashl3 (low
[0], low
[0], operands
[2]));
6222 if (TARGET_CMOVE
&& scratch
)
6224 ix86_expand_clear (scratch
);
6225 emit_insn (gen_x86_shift_adj_1
6226 (half_mode
, high
[0], low
[0], operands
[2], scratch
));
6229 emit_insn (gen_x86_shift_adj_2 (half_mode
, high
[0], low
[0], operands
[2]));
6233 ix86_split_ashr (rtx
*operands
, rtx scratch
, machine_mode mode
)
6235 rtx (*gen_ashr3
)(rtx
, rtx
, rtx
)
6236 = mode
== DImode
? gen_ashrsi3
: gen_ashrdi3
;
6237 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
6238 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6240 rtx low
[2], high
[2];
6243 if (CONST_INT_P (operands
[2]))
6245 split_double_mode (mode
, operands
, 2, low
, high
);
6246 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6248 if (count
== GET_MODE_BITSIZE (mode
) - 1)
6250 emit_move_insn (high
[0], high
[1]);
6251 emit_insn (gen_ashr3 (high
[0], high
[0],
6252 GEN_INT (half_width
- 1)));
6253 emit_move_insn (low
[0], high
[0]);
6256 else if (count
>= half_width
)
6258 emit_move_insn (low
[0], high
[1]);
6259 emit_move_insn (high
[0], low
[0]);
6260 emit_insn (gen_ashr3 (high
[0], high
[0],
6261 GEN_INT (half_width
- 1)));
6263 if (count
> half_width
)
6264 emit_insn (gen_ashr3 (low
[0], low
[0],
6265 GEN_INT (count
- half_width
)));
6269 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6271 if (!rtx_equal_p (operands
[0], operands
[1]))
6272 emit_move_insn (operands
[0], operands
[1]);
6274 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
6275 emit_insn (gen_ashr3 (high
[0], high
[0], GEN_INT (count
)));
6280 machine_mode half_mode
;
6282 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6284 if (!rtx_equal_p (operands
[0], operands
[1]))
6285 emit_move_insn (operands
[0], operands
[1]);
6287 split_double_mode (mode
, operands
, 1, low
, high
);
6288 half_mode
= mode
== DImode
? SImode
: DImode
;
6290 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
6291 emit_insn (gen_ashr3 (high
[0], high
[0], operands
[2]));
6293 if (TARGET_CMOVE
&& scratch
)
6295 emit_move_insn (scratch
, high
[0]);
6296 emit_insn (gen_ashr3 (scratch
, scratch
,
6297 GEN_INT (half_width
- 1)));
6298 emit_insn (gen_x86_shift_adj_1
6299 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
6302 emit_insn (gen_x86_shift_adj_3
6303 (half_mode
, low
[0], high
[0], operands
[2]));
6308 ix86_split_lshr (rtx
*operands
, rtx scratch
, machine_mode mode
)
6310 rtx (*gen_lshr3
)(rtx
, rtx
, rtx
)
6311 = mode
== DImode
? gen_lshrsi3
: gen_lshrdi3
;
6312 rtx (*gen_shrd
)(rtx
, rtx
, rtx
);
6313 int half_width
= GET_MODE_BITSIZE (mode
) >> 1;
6315 rtx low
[2], high
[2];
6318 if (CONST_INT_P (operands
[2]))
6320 split_double_mode (mode
, operands
, 2, low
, high
);
6321 count
= INTVAL (operands
[2]) & (GET_MODE_BITSIZE (mode
) - 1);
6323 if (count
>= half_width
)
6325 emit_move_insn (low
[0], high
[1]);
6326 ix86_expand_clear (high
[0]);
6328 if (count
> half_width
)
6329 emit_insn (gen_lshr3 (low
[0], low
[0],
6330 GEN_INT (count
- half_width
)));
6334 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6336 if (!rtx_equal_p (operands
[0], operands
[1]))
6337 emit_move_insn (operands
[0], operands
[1]);
6339 emit_insn (gen_shrd (low
[0], high
[0], GEN_INT (count
)));
6340 emit_insn (gen_lshr3 (high
[0], high
[0], GEN_INT (count
)));
6345 machine_mode half_mode
;
6347 gen_shrd
= mode
== DImode
? gen_x86_shrd
: gen_x86_64_shrd
;
6349 if (!rtx_equal_p (operands
[0], operands
[1]))
6350 emit_move_insn (operands
[0], operands
[1]);
6352 split_double_mode (mode
, operands
, 1, low
, high
);
6353 half_mode
= mode
== DImode
? SImode
: DImode
;
6355 emit_insn (gen_shrd (low
[0], high
[0], operands
[2]));
6356 emit_insn (gen_lshr3 (high
[0], high
[0], operands
[2]));
6358 if (TARGET_CMOVE
&& scratch
)
6360 ix86_expand_clear (scratch
);
6361 emit_insn (gen_x86_shift_adj_1
6362 (half_mode
, low
[0], high
[0], operands
[2], scratch
));
6365 emit_insn (gen_x86_shift_adj_2
6366 (half_mode
, low
[0], high
[0], operands
[2]));
/* Expand move of V1TI mode register X to a new TI mode register.  */
static rtx
ix86_expand_v1ti_to_ti (rtx x)
{
  rtx result = gen_reg_rtx (TImode);
  if (TARGET_SSE2)
    {
      rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
      rtx lo = gen_lowpart (DImode, result);
      emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
      rtx hi = gen_highpart (DImode, result);
      emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
    }
  else
    emit_move_insn (result, gen_lowpart (TImode, x));
  return result;
}
/* Expand move of TI mode register X to a new V1TI mode register.  */
static rtx
ix86_expand_ti_to_v1ti (rtx x)
{
  if (TARGET_SSE2)
    {
      rtx lo = gen_lowpart (DImode, x);
      rtx hi = gen_highpart (DImode, x);
      rtx tmp = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
    }

  return force_reg (V1TImode, gen_lowpart (V1TImode, x));
}
6404 /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
6406 ix86_expand_v1ti_shift (enum rtx_code code
, rtx operands
[])
6408 rtx op1
= force_reg (V1TImode
, operands
[1]);
6410 if (!CONST_INT_P (operands
[2]))
6412 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6413 rtx tmp2
= gen_reg_rtx (TImode
);
6414 rtx (*shift
) (rtx
, rtx
, rtx
)
6415 = (code
== ASHIFT
) ? gen_ashlti3
: gen_lshrti3
;
6416 emit_insn (shift (tmp2
, tmp1
, operands
[2]));
6417 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6418 emit_move_insn (operands
[0], tmp3
);
6422 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6426 emit_move_insn (operands
[0], op1
);
6430 if ((bits
& 7) == 0)
6432 rtx tmp
= gen_reg_rtx (V1TImode
);
6434 emit_insn (gen_sse2_ashlv1ti3 (tmp
, op1
, GEN_INT (bits
)));
6436 emit_insn (gen_sse2_lshrv1ti3 (tmp
, op1
, GEN_INT (bits
)));
6437 emit_move_insn (operands
[0], tmp
);
6441 rtx tmp1
= gen_reg_rtx (V1TImode
);
6443 emit_insn (gen_sse2_ashlv1ti3 (tmp1
, op1
, GEN_INT (64)));
6445 emit_insn (gen_sse2_lshrv1ti3 (tmp1
, op1
, GEN_INT (64)));
6447 /* tmp2 is operands[1] shifted by 64, in V2DImode. */
6448 rtx tmp2
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6450 /* tmp3 will be the V2DImode result. */
6451 rtx tmp3
= gen_reg_rtx (V2DImode
);
6456 emit_insn (gen_ashlv2di3 (tmp3
, tmp2
, GEN_INT (bits
- 64)));
6458 emit_insn (gen_lshrv2di3 (tmp3
, tmp2
, GEN_INT (bits
- 64)));
6462 /* tmp4 is operands[1], in V2DImode. */
6463 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6465 rtx tmp5
= gen_reg_rtx (V2DImode
);
6467 emit_insn (gen_ashlv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6469 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6471 rtx tmp6
= gen_reg_rtx (V2DImode
);
6473 emit_insn (gen_lshrv2di3 (tmp6
, tmp2
, GEN_INT (64 - bits
)));
6475 emit_insn (gen_ashlv2di3 (tmp6
, tmp2
, GEN_INT (64 - bits
)));
6477 emit_insn (gen_iorv2di3 (tmp3
, tmp5
, tmp6
));
6480 /* Convert the result back to V1TImode and store in operands[0]. */
6481 rtx tmp7
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
6482 emit_move_insn (operands
[0], tmp7
);
6485 /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
6487 ix86_expand_v1ti_rotate (enum rtx_code code
, rtx operands
[])
6489 rtx op1
= force_reg (V1TImode
, operands
[1]);
6491 if (!CONST_INT_P (operands
[2]))
6493 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6494 rtx tmp2
= gen_reg_rtx (TImode
);
6495 rtx (*rotate
) (rtx
, rtx
, rtx
)
6496 = (code
== ROTATE
) ? gen_rotlti3
: gen_rotrti3
;
6497 emit_insn (rotate (tmp2
, tmp1
, operands
[2]));
6498 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6499 emit_move_insn (operands
[0], tmp3
);
6503 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6507 emit_move_insn (operands
[0], op1
);
6511 if (code
== ROTATERT
)
6514 if ((bits
& 31) == 0)
6516 rtx tmp2
= gen_reg_rtx (V4SImode
);
6517 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6519 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x93)));
6520 else if (bits
== 64)
6521 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x4e)));
6523 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0x39)));
6524 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp2
));
6528 if ((bits
& 7) == 0)
6530 rtx tmp1
= gen_reg_rtx (V1TImode
);
6531 rtx tmp2
= gen_reg_rtx (V1TImode
);
6532 rtx tmp3
= gen_reg_rtx (V1TImode
);
6534 emit_insn (gen_sse2_ashlv1ti3 (tmp1
, op1
, GEN_INT (bits
)));
6535 emit_insn (gen_sse2_lshrv1ti3 (tmp2
, op1
, GEN_INT (128 - bits
)));
6536 emit_insn (gen_iorv1ti3 (tmp3
, tmp1
, tmp2
));
6537 emit_move_insn (operands
[0], tmp3
);
6541 rtx op1_v4si
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6550 hibits
= gen_reg_rtx (V4SImode
);
6551 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x93)));
6555 lobits
= gen_reg_rtx (V4SImode
);
6556 hibits
= gen_reg_rtx (V4SImode
);
6557 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x93)));
6558 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x4e)));
6562 lobits
= gen_reg_rtx (V4SImode
);
6563 hibits
= gen_reg_rtx (V4SImode
);
6564 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x4e)));
6565 emit_insn (gen_sse2_pshufd (hibits
, op1_v4si
, GEN_INT (0x39)));
6569 lobits
= gen_reg_rtx (V4SImode
);
6570 emit_insn (gen_sse2_pshufd (lobits
, op1_v4si
, GEN_INT (0x39)));
6575 rtx tmp1
= gen_reg_rtx (V4SImode
);
6576 rtx tmp2
= gen_reg_rtx (V4SImode
);
6577 rtx tmp3
= gen_reg_rtx (V4SImode
);
6579 emit_insn (gen_ashlv4si3 (tmp1
, lobits
, GEN_INT (bits
& 31)));
6580 emit_insn (gen_lshrv4si3 (tmp2
, hibits
, GEN_INT (32 - (bits
& 31))));
6581 emit_insn (gen_iorv4si3 (tmp3
, tmp1
, tmp2
));
6583 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp3
));
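/* Illustrative model (not part of GCC) of the pshufd-based rotations
   emitted above: rotating a 128-bit value left by a multiple of 32 bits
   is just a rotation of its four 32-bit words (immediates 0x93, 0x4e and
   0x39 select the three non-trivial word rotations).  The function name
   is hypothetical; W holds the words in little-endian order.  */
static void
v1ti_rotate_dwords (unsigned int w[4], int bits)
{
  unsigned int tmp[4];
  int shift = (bits / 32) & 3;		/* BITS is a multiple of 32.  */
  for (int i = 0; i < 4; i++)
    tmp[(i + shift) & 3] = w[i];	/* Word I moves up by SHIFT slots.  */
  for (int i = 0; i < 4; i++)
    w[i] = tmp[i];
}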
6586 /* Expand V1TI mode ashiftrt by constant. */
6588 ix86_expand_v1ti_ashiftrt (rtx operands
[])
6590 rtx op1
= force_reg (V1TImode
, operands
[1]);
6592 if (!CONST_INT_P (operands
[2]))
6594 rtx tmp1
= ix86_expand_v1ti_to_ti (op1
);
6595 rtx tmp2
= gen_reg_rtx (TImode
);
6596 emit_insn (gen_ashrti3 (tmp2
, tmp1
, operands
[2]));
6597 rtx tmp3
= ix86_expand_ti_to_v1ti (tmp2
);
6598 emit_move_insn (operands
[0], tmp3
);
6602 HOST_WIDE_INT bits
= INTVAL (operands
[2]) & 127;
6606 emit_move_insn (operands
[0], op1
);
6612 /* Two operations. */
6613 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6614 rtx tmp2
= gen_reg_rtx (V4SImode
);
6615 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6617 rtx tmp3
= gen_reg_rtx (V4SImode
);
6618 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6620 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp3
));
6626 /* Three operations. */
6627 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6628 rtx tmp2
= gen_reg_rtx (V4SImode
);
6629 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6631 rtx tmp3
= gen_reg_rtx (V4SImode
);
6632 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6634 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6635 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6636 rtx tmp6
= gen_reg_rtx (V2DImode
);
6637 emit_insn (gen_vec_interleave_highv2di (tmp6
, tmp4
, tmp5
));
6639 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6645 /* Three operations. */
6646 rtx tmp1
= force_reg(V4SImode
, gen_lowpart (V4SImode
, op1
));
6647 rtx tmp2
= gen_reg_rtx (V4SImode
);
6648 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (31)));
6650 rtx tmp3
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6651 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp2
));
6652 rtx tmp5
= gen_reg_rtx (V2DImode
);
6653 emit_insn (gen_vec_interleave_highv2di (tmp5
, tmp3
, tmp4
));
6655 rtx tmp6
= force_reg(V4SImode
, gen_lowpart (V4SImode
, tmp5
));
6656 rtx tmp7
= gen_reg_rtx (V4SImode
);
6657 emit_insn (gen_sse2_pshufd (tmp7
, tmp6
, GEN_INT (0xfd)));
6659 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp7
));
6665 /* Three operations. */
6666 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6667 rtx tmp2
= gen_reg_rtx (V4SImode
);
6668 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
- 96)));
6670 rtx tmp3
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6671 rtx tmp4
= gen_reg_rtx (V8HImode
);
6672 emit_insn (gen_sse2_pshufhw (tmp4
, tmp3
, GEN_INT (0xfe)));
6674 rtx tmp5
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp4
));
6675 rtx tmp6
= gen_reg_rtx (V4SImode
);
6676 emit_insn (gen_sse2_pshufd (tmp6
, tmp5
, GEN_INT (0xfe)));
6678 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6682 if (TARGET_AVX2
|| TARGET_SSE4_1
)
6684 /* Three operations. */
6687 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6688 rtx tmp2
= gen_reg_rtx (V4SImode
);
6689 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (31)));
6691 rtx tmp3
= gen_reg_rtx (V1TImode
);
6692 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (32)));
6696 rtx tmp4
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp3
));
6697 rtx tmp5
= gen_reg_rtx (V4SImode
);
6698 emit_insn (gen_avx2_pblenddv4si (tmp5
, tmp2
, tmp4
,
6701 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp5
));
6705 rtx tmp4
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6706 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6707 rtx tmp6
= gen_reg_rtx (V8HImode
);
6708 emit_insn (gen_sse4_1_pblendw (tmp6
, tmp4
, tmp5
,
6711 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6716 /* Three operations. */
6717 if (bits
== 8 || bits
== 16 || bits
== 24)
6719 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6720 rtx tmp2
= gen_reg_rtx (V4SImode
);
6721 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
6723 rtx tmp3
= gen_reg_rtx (V1TImode
);
6724 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (bits
)));
6728 rtx tmp4
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp3
));
6729 rtx tmp5
= gen_reg_rtx (V4SImode
);
6730 emit_insn (gen_avx2_pblenddv4si (tmp5
, tmp2
, tmp4
,
6733 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp5
));
6737 rtx tmp4
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6738 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6739 rtx tmp6
= gen_reg_rtx (V8HImode
);
6740 emit_insn (gen_sse4_1_pblendw (tmp6
, tmp4
, tmp5
,
6743 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp6
));
6751 /* Four operations. */
6752 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6753 rtx tmp2
= gen_reg_rtx (V4SImode
);
6754 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
- 96)));
6756 rtx tmp3
= gen_reg_rtx (V4SImode
);
6757 emit_insn (gen_ashrv4si3 (tmp3
, tmp1
, GEN_INT (31)));
6759 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp2
));
6760 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6761 rtx tmp6
= gen_reg_rtx (V2DImode
);
6762 emit_insn (gen_vec_interleave_highv2di (tmp6
, tmp4
, tmp5
));
6764 rtx tmp7
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp6
));
6765 rtx tmp8
= gen_reg_rtx (V4SImode
);
6766 emit_insn (gen_sse2_pshufd (tmp8
, tmp7
, GEN_INT (0xfd)));
6768 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp8
));
6772 if (TARGET_SSE4_1
&& (bits
== 48 || bits
== 80))
6774 /* Four operations. */
6775 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6776 rtx tmp2
= gen_reg_rtx (V4SImode
);
6777 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6779 rtx tmp3
= gen_reg_rtx (V4SImode
);
6780 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6782 rtx tmp4
= gen_reg_rtx (V1TImode
);
6783 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (bits
)));
6785 rtx tmp5
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp3
));
6786 rtx tmp6
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp4
));
6787 rtx tmp7
= gen_reg_rtx (V8HImode
);
6788 emit_insn (gen_sse4_1_pblendw (tmp7
, tmp5
, tmp6
,
6789 GEN_INT (bits
== 48 ? 0x1f : 0x07)));
6791 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp7
));
6795 if ((bits
& 7) == 0)
6797 /* Five operations. */
6798 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6799 rtx tmp2
= gen_reg_rtx (V4SImode
);
6800 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6802 rtx tmp3
= gen_reg_rtx (V4SImode
);
6803 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6805 rtx tmp4
= gen_reg_rtx (V1TImode
);
6806 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (bits
)));
6808 rtx tmp5
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
6809 rtx tmp6
= gen_reg_rtx (V1TImode
);
6810 emit_insn (gen_sse2_ashlv1ti3 (tmp6
, tmp5
, GEN_INT (128 - bits
)));
6812 rtx tmp7
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
6813 rtx tmp8
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp6
));
6814 rtx tmp9
= gen_reg_rtx (V2DImode
);
6815 emit_insn (gen_iorv2di3 (tmp9
, tmp7
, tmp8
));
6817 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp9
));
6821 if (TARGET_AVX2
&& bits
< 32)
6823 /* Six operations. */
6824 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6825 rtx tmp2
= gen_reg_rtx (V4SImode
);
6826 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
6828 rtx tmp3
= gen_reg_rtx (V1TImode
);
6829 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (64)));
6831 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6832 rtx tmp5
= gen_reg_rtx (V2DImode
);
6833 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6835 rtx tmp6
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6836 rtx tmp7
= gen_reg_rtx (V2DImode
);
6837 emit_insn (gen_ashlv2di3 (tmp7
, tmp6
, GEN_INT (64 - bits
)));
6839 rtx tmp8
= gen_reg_rtx (V2DImode
);
6840 emit_insn (gen_iorv2di3 (tmp8
, tmp5
, tmp7
));
6842 rtx tmp9
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp8
));
6843 rtx tmp10
= gen_reg_rtx (V4SImode
);
6844 emit_insn (gen_avx2_pblenddv4si (tmp10
, tmp2
, tmp9
, GEN_INT (7)));
6846 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp10
));
6850 if (TARGET_SSE4_1
&& bits
< 15)
6852 /* Six operations. */
6853 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6854 rtx tmp2
= gen_reg_rtx (V4SImode
);
6855 emit_insn (gen_ashrv4si3 (tmp2
, tmp1
, GEN_INT (bits
)));
6857 rtx tmp3
= gen_reg_rtx (V1TImode
);
6858 emit_insn (gen_sse2_lshrv1ti3 (tmp3
, op1
, GEN_INT (64)));
6860 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6861 rtx tmp5
= gen_reg_rtx (V2DImode
);
6862 emit_insn (gen_lshrv2di3 (tmp5
, tmp4
, GEN_INT (bits
)));
6864 rtx tmp6
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6865 rtx tmp7
= gen_reg_rtx (V2DImode
);
6866 emit_insn (gen_ashlv2di3 (tmp7
, tmp6
, GEN_INT (64 - bits
)));
6868 rtx tmp8
= gen_reg_rtx (V2DImode
);
6869 emit_insn (gen_iorv2di3 (tmp8
, tmp5
, tmp7
));
6871 rtx tmp9
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp2
));
6872 rtx tmp10
= force_reg (V8HImode
, gen_lowpart (V8HImode
, tmp8
));
6873 rtx tmp11
= gen_reg_rtx (V8HImode
);
6874 emit_insn (gen_sse4_1_pblendw (tmp11
, tmp9
, tmp10
, GEN_INT (0x3f)));
6876 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp11
));
6882 /* Eight operations. */
6883 rtx tmp1
= gen_reg_rtx (V1TImode
);
6884 emit_insn (gen_sse2_lshrv1ti3 (tmp1
, op1
, GEN_INT (64)));
6886 rtx tmp2
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6887 rtx tmp3
= gen_reg_rtx (V2DImode
);
6888 emit_insn (gen_lshrv2di3 (tmp3
, tmp2
, GEN_INT (1)));
6890 rtx tmp4
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp1
));
6891 rtx tmp5
= gen_reg_rtx (V2DImode
);
6892 emit_insn (gen_ashlv2di3 (tmp5
, tmp4
, GEN_INT (63)));
6894 rtx tmp6
= gen_reg_rtx (V2DImode
);
6895 emit_insn (gen_iorv2di3 (tmp6
, tmp3
, tmp5
));
6897 rtx tmp7
= gen_reg_rtx (V2DImode
);
6898 emit_insn (gen_lshrv2di3 (tmp7
, tmp2
, GEN_INT (63)));
6900 rtx tmp8
= force_reg (V4SImode
, gen_lowpart (V4SImode
, tmp7
));
6901 rtx tmp9
= gen_reg_rtx (V4SImode
);
6902 emit_insn (gen_sse2_pshufd (tmp9
, tmp8
, GEN_INT (0xbf)));
6904 rtx tmp10
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp9
));
6905 rtx tmp11
= gen_reg_rtx (V2DImode
);
6906 emit_insn (gen_ashlv2di3 (tmp11
, tmp10
, GEN_INT (31)));
6908 rtx tmp12
= gen_reg_rtx (V2DImode
);
6909 emit_insn (gen_iorv2di3 (tmp12
, tmp6
, tmp11
));
6911 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp12
));
6917 /* Eight operations. */
6918 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6919 rtx tmp2
= gen_reg_rtx (V4SImode
);
6920 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6922 rtx tmp3
= gen_reg_rtx (V4SImode
);
6923 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6925 rtx tmp4
= gen_reg_rtx (V1TImode
);
6926 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (64)));
6928 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
6929 rtx tmp6
= gen_reg_rtx (V2DImode
);
6930 emit_insn (gen_lshrv2di3 (tmp6
, tmp5
, GEN_INT (bits
- 64)));
6932 rtx tmp7
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
6933 rtx tmp8
= gen_reg_rtx (V1TImode
);
6934 emit_insn (gen_sse2_ashlv1ti3 (tmp8
, tmp7
, GEN_INT (64)));
6936 rtx tmp9
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp3
));
6937 rtx tmp10
= gen_reg_rtx (V2DImode
);
6938 emit_insn (gen_ashlv2di3 (tmp10
, tmp9
, GEN_INT (128 - bits
)));
6940 rtx tmp11
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp8
));
6941 rtx tmp12
= gen_reg_rtx (V2DImode
);
6942 emit_insn (gen_iorv2di3 (tmp12
, tmp10
, tmp11
));
6944 rtx tmp13
= gen_reg_rtx (V2DImode
);
6945 emit_insn (gen_iorv2di3 (tmp13
, tmp6
, tmp12
));
6947 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp13
));
6951 /* Nine operations. */
6952 rtx tmp1
= force_reg (V4SImode
, gen_lowpart (V4SImode
, op1
));
6953 rtx tmp2
= gen_reg_rtx (V4SImode
);
6954 emit_insn (gen_sse2_pshufd (tmp2
, tmp1
, GEN_INT (0xff)));
6956 rtx tmp3
= gen_reg_rtx (V4SImode
);
6957 emit_insn (gen_ashrv4si3 (tmp3
, tmp2
, GEN_INT (31)));
6959 rtx tmp4
= gen_reg_rtx (V1TImode
);
6960 emit_insn (gen_sse2_lshrv1ti3 (tmp4
, op1
, GEN_INT (64)));
6962 rtx tmp5
= force_reg (V2DImode
, gen_lowpart (V2DImode
, op1
));
6963 rtx tmp6
= gen_reg_rtx (V2DImode
);
6964 emit_insn (gen_lshrv2di3 (tmp6
, tmp5
, GEN_INT (bits
)));
6966 rtx tmp7
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp4
));
6967 rtx tmp8
= gen_reg_rtx (V2DImode
);
6968 emit_insn (gen_ashlv2di3 (tmp8
, tmp7
, GEN_INT (64 - bits
)));
6970 rtx tmp9
= gen_reg_rtx (V2DImode
);
6971 emit_insn (gen_iorv2di3 (tmp9
, tmp6
, tmp8
));
6973 rtx tmp10
= force_reg (V1TImode
, gen_lowpart (V1TImode
, tmp3
));
6974 rtx tmp11
= gen_reg_rtx (V1TImode
);
6975 emit_insn (gen_sse2_ashlv1ti3 (tmp11
, tmp10
, GEN_INT (64)));
6977 rtx tmp12
= force_reg (V2DImode
, gen_lowpart (V2DImode
, tmp11
));
6978 rtx tmp13
= gen_reg_rtx (V2DImode
);
6979 emit_insn (gen_ashlv2di3 (tmp13
, tmp12
, GEN_INT (64 - bits
)));
6981 rtx tmp14
= gen_reg_rtx (V2DImode
);
6982 emit_insn (gen_iorv2di3 (tmp14
, tmp9
, tmp13
));
6984 emit_move_insn (operands
[0], gen_lowpart (V1TImode
, tmp14
));
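/* Reference model (a sketch, not used by the compiler) of the V1TImode
   arithmetic right shift expanded above, written on two 64-bit halves.
   It assumes '>>' on a negative long long performs an arithmetic shift
   (true for GCC) and that 0 < BITS < 128; the name is hypothetical.  */
static void
v1ti_ashiftrt_reference (unsigned long long *hi, unsigned long long *lo,
                         int bits)
{
  long long shi = (long long) *hi;	/* Signed view of the high half.  */
  if (bits >= 64)
    {
      *lo = (unsigned long long) (shi >> (bits - 64));
      *hi = (unsigned long long) (shi >> 63);	/* All copies of the sign.  */
    }
  else
    {
      *lo = (*lo >> bits) | ((unsigned long long) shi << (64 - bits));
      *hi = (unsigned long long) (shi >> bits);
    }
}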
/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   DImode for constant loop counts.  */
static machine_mode
counter_mode (rtx count_exp)
{
  if (GET_MODE (count_exp) != VOIDmode)
    return GET_MODE (count_exp);
  if (!CONST_INT_P (count_exp))
    return Pmode;
  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
    return DImode;
  return SImode;
}
/* When ISSETMEM is FALSE, output a simple loop to move memory from SRCPTR
   to DESTPTR via chunks of MODE, unrolled UNROLL times; the overall size is
   COUNT, specified in bytes.  When ISSETMEM is TRUE, output the equivalent
   loop to set memory by VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.  */
7013 expand_set_or_cpymem_via_loop (rtx destmem
, rtx srcmem
,
7014 rtx destptr
, rtx srcptr
, rtx value
,
7015 rtx count
, machine_mode mode
, int unroll
,
7016 int expected_size
, bool issetmem
)
7018 rtx_code_label
*out_label
, *top_label
;
7020 machine_mode iter_mode
= counter_mode (count
);
7021 int piece_size_n
= GET_MODE_SIZE (mode
) * unroll
;
7022 rtx piece_size
= GEN_INT (piece_size_n
);
7023 rtx piece_size_mask
= GEN_INT (~((GET_MODE_SIZE (mode
) * unroll
) - 1));
7027 top_label
= gen_label_rtx ();
7028 out_label
= gen_label_rtx ();
7029 iter
= gen_reg_rtx (iter_mode
);
7031 size
= expand_simple_binop (iter_mode
, AND
, count
, piece_size_mask
,
7032 NULL
, 1, OPTAB_DIRECT
);
7033 /* Those two should combine. */
7034 if (piece_size
== const1_rtx
)
7036 emit_cmp_and_jump_insns (size
, const0_rtx
, EQ
, NULL_RTX
, iter_mode
,
7038 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
7040 emit_move_insn (iter
, const0_rtx
);
7042 emit_label (top_label
);
7044 tmp
= convert_modes (Pmode
, iter_mode
, iter
, true);
/* This assert could be relaxed - in this case we'll need to compute
   the smallest power of two containing PIECE_SIZE_N and pass it to
   offset_address.  */
7049 gcc_assert ((piece_size_n
& (piece_size_n
- 1)) == 0);
7050 destmem
= offset_address (destmem
, tmp
, piece_size_n
);
7051 destmem
= adjust_address (destmem
, mode
, 0);
7055 srcmem
= offset_address (srcmem
, copy_rtx (tmp
), piece_size_n
);
7056 srcmem
= adjust_address (srcmem
, mode
, 0);
/* When unrolling for chips that reorder memory reads and writes,
   we can save registers by using a single temporary.
   Also, using 4 temporaries is overkill in 32-bit mode.  */
7061 if (!TARGET_64BIT
&& 0)
7063 for (i
= 0; i
< unroll
; i
++)
7067 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7068 GET_MODE_SIZE (mode
));
7069 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
7070 GET_MODE_SIZE (mode
));
7072 emit_move_insn (destmem
, srcmem
);
7078 gcc_assert (unroll
<= 4);
7079 for (i
= 0; i
< unroll
; i
++)
7081 tmpreg
[i
] = gen_reg_rtx (mode
);
7083 srcmem
= adjust_address (copy_rtx (srcmem
), mode
,
7084 GET_MODE_SIZE (mode
));
7085 emit_move_insn (tmpreg
[i
], srcmem
);
7087 for (i
= 0; i
< unroll
; i
++)
7090 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7091 GET_MODE_SIZE (mode
));
7092 emit_move_insn (destmem
, tmpreg
[i
]);
7097 for (i
= 0; i
< unroll
; i
++)
7100 destmem
= adjust_address (copy_rtx (destmem
), mode
,
7101 GET_MODE_SIZE (mode
));
7102 emit_move_insn (destmem
, value
);
7105 tmp
= expand_simple_binop (iter_mode
, PLUS
, iter
, piece_size
, iter
,
7106 true, OPTAB_LIB_WIDEN
);
7108 emit_move_insn (iter
, tmp
);
7110 emit_cmp_and_jump_insns (iter
, size
, LT
, NULL_RTX
, iter_mode
,
7112 if (expected_size
!= -1)
7114 expected_size
/= GET_MODE_SIZE (mode
) * unroll
;
7115 if (expected_size
== 0)
7117 else if (expected_size
> REG_BR_PROB_BASE
)
7118 predict_jump (REG_BR_PROB_BASE
- 1);
7120 predict_jump (REG_BR_PROB_BASE
- (REG_BR_PROB_BASE
+ expected_size
/ 2)
7124 predict_jump (REG_BR_PROB_BASE
* 80 / 100);
7125 iter
= ix86_zero_extend_to_Pmode (iter
);
7126 tmp
= expand_simple_binop (Pmode
, PLUS
, destptr
, iter
, destptr
,
7127 true, OPTAB_LIB_WIDEN
);
7129 emit_move_insn (destptr
, tmp
);
7132 tmp
= expand_simple_binop (Pmode
, PLUS
, srcptr
, iter
, srcptr
,
7133 true, OPTAB_LIB_WIDEN
);
7135 emit_move_insn (srcptr
, tmp
);
7137 emit_label (out_label
);
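/* Sketch (not part of GCC) of the loop shape generated above: COUNT is
   rounded down to a multiple of the unrolled chunk size, UNROLL chunks of
   CHUNK bytes are copied per iteration, and the residue is left for the
   caller's epilogue.  CHUNK * UNROLL must be a power of two here; names
   are hypothetical and memcpy/size_t come from this file's includes.  */
static void
copy_loop_reference (char *dst, const char *src, size_t count,
                     size_t chunk, int unroll)
{
  size_t piece = chunk * unroll;
  size_t size = count & ~(piece - 1);	/* Round down to whole pieces.  */
  for (size_t done = 0; done < size; done += piece)
    for (int i = 0; i < unroll; i++)
      memcpy (dst + done + i * chunk, src + done + i * chunk, chunk);
}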
/* Divide COUNTREG by SCALE.  */
static rtx
scale_counter (rtx countreg, int scale)
{
  rtx sc;

  if (scale == 1)
    return countreg;
  if (CONST_INT_P (countreg))
    return GEN_INT (INTVAL (countreg) / scale);
  gcc_assert (REG_P (countreg));

  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
			    GEN_INT (exact_log2 (scale)),
			    NULL, 1, OPTAB_DIRECT);
  return sc;
}
7158 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7159 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7160 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7161 For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
7162 ORIG_VALUE is the original value passed to memset to fill the memory with.
7163 Other arguments have same meaning as for previous function. */
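/* Summary note (illustrative): COUNTREG holds the number of mode-sized
   chunks, so the expansion below also builds DESTEXP (and SRCEXP for memcpy)
   as PTR + (COUNTREG << log2 (chunk size)), i.e. the address just past the
   block, which the rep_stos/rep_mov patterns use to describe the memory
   touched and the final pointer value.  */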
7166 expand_set_or_cpymem_via_rep (rtx destmem
, rtx srcmem
,
7167 rtx destptr
, rtx srcptr
, rtx value
, rtx orig_value
,
7169 machine_mode mode
, bool issetmem
)
7174 HOST_WIDE_INT rounded_count
;
7176 /* If possible, it is shorter to use rep movs.
7177 TODO: Maybe it is better to move this logic to decide_alg. */
7178 if (mode
== QImode
&& CONST_INT_P (count
) && !(INTVAL (count
) & 3)
7179 && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
7180 && (!issetmem
|| orig_value
== const0_rtx
))
7183 if (destptr
!= XEXP (destmem
, 0) || GET_MODE (destmem
) != BLKmode
)
7184 destmem
= adjust_automodify_address_nv (destmem
, BLKmode
, destptr
, 0);
7186 countreg
= ix86_zero_extend_to_Pmode (scale_counter (count
,
7187 GET_MODE_SIZE (mode
)));
7190 destexp
= gen_rtx_ASHIFT (Pmode
, countreg
,
7191 GEN_INT (exact_log2 (GET_MODE_SIZE (mode
))));
7192 destexp
= gen_rtx_PLUS (Pmode
, destexp
, destptr
);
7195 destexp
= gen_rtx_PLUS (Pmode
, destptr
, countreg
);
7196 if ((!issetmem
|| orig_value
== const0_rtx
) && CONST_INT_P (count
))
7199 = ROUND_DOWN (INTVAL (count
), (HOST_WIDE_INT
) GET_MODE_SIZE (mode
));
7200 destmem
= shallow_copy_rtx (destmem
);
7201 set_mem_size (destmem
, rounded_count
);
7203 else if (MEM_SIZE_KNOWN_P (destmem
))
7204 clear_mem_size (destmem
);
7208 value
= force_reg (mode
, gen_lowpart (mode
, value
));
7209 emit_insn (gen_rep_stos (destptr
, countreg
, destmem
, value
, destexp
));
7213 if (srcptr
!= XEXP (srcmem
, 0) || GET_MODE (srcmem
) != BLKmode
)
7214 srcmem
= adjust_automodify_address_nv (srcmem
, BLKmode
, srcptr
, 0);
7217 srcexp
= gen_rtx_ASHIFT (Pmode
, countreg
,
7218 GEN_INT (exact_log2 (GET_MODE_SIZE (mode
))));
7219 srcexp
= gen_rtx_PLUS (Pmode
, srcexp
, srcptr
);
7222 srcexp
= gen_rtx_PLUS (Pmode
, srcptr
, countreg
);
7223 if (CONST_INT_P (count
))
7226 = ROUND_DOWN (INTVAL (count
), (HOST_WIDE_INT
) GET_MODE_SIZE (mode
));
7227 srcmem
= shallow_copy_rtx (srcmem
);
7228 set_mem_size (srcmem
, rounded_count
);
7232 if (MEM_SIZE_KNOWN_P (srcmem
))
7233 clear_mem_size (srcmem
);
7235 emit_insn (gen_rep_mov (destptr
, destmem
, srcptr
, srcmem
, countreg
,
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
7245 emit_memmov (rtx destmem
, rtx
*srcmem
, rtx destptr
, rtx srcptr
,
7246 HOST_WIDE_INT size_to_move
)
7248 rtx dst
= destmem
, src
= *srcmem
, tempreg
;
7249 enum insn_code code
;
7250 machine_mode move_mode
;
  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
     it until a move of such size is supported.  */
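  /* Illustrative example: for SIZE_TO_MOVE == 24 we start with piece_size 16
     (TImode); if no 16-byte integer move is available we halve to 8 (DImode),
     then 4, and so on.  The loop below stops at the first size for which
     mov_optab provides an insn.  */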
7256 piece_size
= 1 << floor_log2 (size_to_move
);
7257 while (!int_mode_for_size (piece_size
* BITS_PER_UNIT
, 0).exists (&move_mode
)
7258 || (code
= optab_handler (mov_optab
, move_mode
)) == CODE_FOR_nothing
)
7260 gcc_assert (piece_size
> 1);
7264 /* Find the corresponding vector mode with the same size as MOVE_MODE.
7265 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
7266 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
7268 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
7269 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
7270 || (code
= optab_handler (mov_optab
, move_mode
)) == CODE_FOR_nothing
)
7272 move_mode
= word_mode
;
7273 piece_size
= GET_MODE_SIZE (move_mode
);
7274 code
= optab_handler (mov_optab
, move_mode
);
7277 gcc_assert (code
!= CODE_FOR_nothing
);
7279 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
, 0);
7280 src
= adjust_automodify_address_nv (src
, move_mode
, srcptr
, 0);
7282 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
7283 gcc_assert (size_to_move
% piece_size
== 0);
7285 for (i
= 0; i
< size_to_move
; i
+= piece_size
)
7287 /* We move from memory to memory, so we'll need to do it via
7288 a temporary register. */
7289 tempreg
= gen_reg_rtx (move_mode
);
7290 emit_insn (GEN_FCN (code
) (tempreg
, src
));
7291 emit_insn (GEN_FCN (code
) (dst
, tempreg
));
7293 emit_move_insn (destptr
,
7294 plus_constant (Pmode
, copy_rtx (destptr
), piece_size
));
7295 emit_move_insn (srcptr
,
7296 plus_constant (Pmode
, copy_rtx (srcptr
), piece_size
));
7298 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
7300 src
= adjust_automodify_address_nv (src
, move_mode
, srcptr
,
7304 /* Update DST and SRC rtx. */
/* Helper function for the string operations below.  Test whether VARIABLE
   is aligned to VALUE bytes.  If true, jump to the label.  */
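/* Illustrative note: VALUE is used as a bit mask, so a call with VALUE == 4
   tests bit 2 of VARIABLE; the emitted compare jumps to the returned label
   when (VARIABLE & VALUE) == 0, i.e. when the corresponding piece does not
   need to be handled.  */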
7312 static rtx_code_label
*
7313 ix86_expand_aligntest (rtx variable
, int value
, bool epilogue
)
7315 rtx_code_label
*label
= gen_label_rtx ();
7316 rtx tmpcount
= gen_reg_rtx (GET_MODE (variable
));
7317 if (GET_MODE (variable
) == DImode
)
7318 emit_insn (gen_anddi3 (tmpcount
, variable
, GEN_INT (value
)));
7320 emit_insn (gen_andsi3 (tmpcount
, variable
, GEN_INT (value
)));
7321 emit_cmp_and_jump_insns (tmpcount
, const0_rtx
, EQ
, 0, GET_MODE (variable
),
7324 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
7326 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
7331 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
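/* Illustrative example: with MAX_SIZE == 16 and a constant COUNT of 23, the
   epilogue size is 23 % 16 == 7, which the constant path below decomposes by
   its set bits into one 4-byte, one 2-byte and one 1-byte move.  For a
   non-constant COUNT the paths below handle the residual either with a small
   byte loop or with the aligntest-based jump tree.  */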
7334 expand_cpymem_epilogue (rtx destmem
, rtx srcmem
,
7335 rtx destptr
, rtx srcptr
, rtx count
, int max_size
)
7338 if (CONST_INT_P (count
))
7340 HOST_WIDE_INT countval
= INTVAL (count
);
7341 HOST_WIDE_INT epilogue_size
= countval
% max_size
;
      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
7348 for (i
= max_size
; i
>= 1; i
>>= 1)
7350 if (epilogue_size
& i
)
7351 destmem
= emit_memmov (destmem
, &srcmem
, destptr
, srcptr
, i
);
7357 count
= expand_simple_binop (GET_MODE (count
), AND
, count
, GEN_INT (max_size
- 1),
7358 count
, 1, OPTAB_DIRECT
);
7359 expand_set_or_cpymem_via_loop (destmem
, srcmem
, destptr
, srcptr
, NULL
,
7360 count
, QImode
, 1, 4, false);
  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.  */
7368 if (TARGET_SINGLE_STRINGOP
)
7372 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
7373 src
= change_address (srcmem
, SImode
, srcptr
);
7374 dest
= change_address (destmem
, SImode
, destptr
);
7375 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
7377 LABEL_NUSES (label
) = 1;
7381 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
7382 src
= change_address (srcmem
, HImode
, srcptr
);
7383 dest
= change_address (destmem
, HImode
, destptr
);
7384 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
7386 LABEL_NUSES (label
) = 1;
7390 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
7391 src
= change_address (srcmem
, QImode
, srcptr
);
7392 dest
= change_address (destmem
, QImode
, destptr
);
7393 emit_insn (gen_strmov (destptr
, dest
, srcptr
, src
));
7395 LABEL_NUSES (label
) = 1;
7400 rtx offset
= force_reg (Pmode
, const0_rtx
);
7405 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
7406 src
= change_address (srcmem
, SImode
, srcptr
);
7407 dest
= change_address (destmem
, SImode
, destptr
);
7408 emit_move_insn (dest
, src
);
7409 tmp
= expand_simple_binop (Pmode
, PLUS
, offset
, GEN_INT (4), NULL
,
7410 true, OPTAB_LIB_WIDEN
);
7412 emit_move_insn (offset
, tmp
);
7414 LABEL_NUSES (label
) = 1;
7418 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
7419 tmp
= gen_rtx_PLUS (Pmode
, srcptr
, offset
);
7420 src
= change_address (srcmem
, HImode
, tmp
);
7421 tmp
= gen_rtx_PLUS (Pmode
, destptr
, offset
);
7422 dest
= change_address (destmem
, HImode
, tmp
);
7423 emit_move_insn (dest
, src
);
7424 tmp
= expand_simple_binop (Pmode
, PLUS
, offset
, GEN_INT (2), tmp
,
7425 true, OPTAB_LIB_WIDEN
);
7427 emit_move_insn (offset
, tmp
);
7429 LABEL_NUSES (label
) = 1;
7433 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
7434 tmp
= gen_rtx_PLUS (Pmode
, srcptr
, offset
);
7435 src
= change_address (srcmem
, QImode
, tmp
);
7436 tmp
= gen_rtx_PLUS (Pmode
, destptr
, offset
);
7437 dest
= change_address (destmem
, QImode
, tmp
);
7438 emit_move_insn (dest
, src
);
7440 LABEL_NUSES (label
) = 1;
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST.  */
7450 emit_memset (rtx destmem
, rtx destptr
, rtx promoted_val
,
7451 HOST_WIDE_INT size_to_move
)
7454 enum insn_code code
;
7455 machine_mode move_mode
;
  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
     it until a move of such size is supported.  */
7461 move_mode
= GET_MODE (promoted_val
);
7462 if (move_mode
== VOIDmode
)
7464 if (size_to_move
< GET_MODE_SIZE (move_mode
))
7466 unsigned int move_bits
= size_to_move
* BITS_PER_UNIT
;
7467 move_mode
= int_mode_for_size (move_bits
, 0).require ();
7468 promoted_val
= gen_lowpart (move_mode
, promoted_val
);
7470 piece_size
= GET_MODE_SIZE (move_mode
);
7471 code
= optab_handler (mov_optab
, move_mode
);
7472 gcc_assert (code
!= CODE_FOR_nothing
&& promoted_val
!= NULL_RTX
);
7474 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
, 0);
7476 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
7477 gcc_assert (size_to_move
% piece_size
== 0);
7479 for (i
= 0; i
< size_to_move
; i
+= piece_size
)
7481 if (piece_size
<= GET_MODE_SIZE (word_mode
))
7483 emit_insn (gen_strset (destptr
, dst
, promoted_val
));
7484 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
7489 emit_insn (GEN_FCN (code
) (dst
, promoted_val
));
7491 emit_move_insn (destptr
,
7492 plus_constant (Pmode
, copy_rtx (destptr
), piece_size
));
7494 dst
= adjust_automodify_address_nv (dst
, move_mode
, destptr
,
7498 /* Update DST rtx. */
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
7503 expand_setmem_epilogue_via_loop (rtx destmem
, rtx destptr
, rtx value
,
7504 rtx count
, int max_size
)
7506 count
= expand_simple_binop (counter_mode (count
), AND
, count
,
7507 GEN_INT (max_size
- 1), count
, 1, OPTAB_DIRECT
);
7508 expand_set_or_cpymem_via_loop (destmem
, NULL
, destptr
, NULL
,
7509 gen_lowpart (QImode
, value
), count
, QImode
,
7510 1, max_size
/ 2, true);
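  /* Note on the call above: the residual count is first masked down to
     COUNT & (MAX_SIZE - 1) and then stored byte by byte (QImode) with the
     generic loop expander; MAX_SIZE / 2 is passed as the expected size hint,
     which that expander only uses for branch-probability prediction.  */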
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
7515 expand_setmem_epilogue (rtx destmem
, rtx destptr
, rtx value
, rtx vec_value
,
7516 rtx count
, int max_size
)
7520 if (CONST_INT_P (count
))
7522 HOST_WIDE_INT countval
= INTVAL (count
);
7523 HOST_WIDE_INT epilogue_size
= countval
% max_size
;
      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
7530 for (i
= max_size
; i
>= 1; i
>>= 1)
7532 if (epilogue_size
& i
)
7534 if (vec_value
&& i
> GET_MODE_SIZE (GET_MODE (value
)))
7535 destmem
= emit_memset (destmem
, destptr
, vec_value
, i
);
7537 destmem
= emit_memset (destmem
, destptr
, value
, i
);
7544 expand_setmem_epilogue_via_loop (destmem
, destptr
, value
, count
, max_size
);
7549 rtx_code_label
*label
= ix86_expand_aligntest (count
, 16, true);
7552 dest
= change_address (destmem
, DImode
, destptr
);
7553 emit_insn (gen_strset (destptr
, dest
, value
));
7554 dest
= adjust_automodify_address_nv (dest
, DImode
, destptr
, 8);
7555 emit_insn (gen_strset (destptr
, dest
, value
));
7559 dest
= change_address (destmem
, SImode
, destptr
);
7560 emit_insn (gen_strset (destptr
, dest
, value
));
7561 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 4);
7562 emit_insn (gen_strset (destptr
, dest
, value
));
7563 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 8);
7564 emit_insn (gen_strset (destptr
, dest
, value
));
7565 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 12);
7566 emit_insn (gen_strset (destptr
, dest
, value
));
7569 LABEL_NUSES (label
) = 1;
7573 rtx_code_label
*label
= ix86_expand_aligntest (count
, 8, true);
7576 dest
= change_address (destmem
, DImode
, destptr
);
7577 emit_insn (gen_strset (destptr
, dest
, value
));
7581 dest
= change_address (destmem
, SImode
, destptr
);
7582 emit_insn (gen_strset (destptr
, dest
, value
));
7583 dest
= adjust_automodify_address_nv (dest
, SImode
, destptr
, 4);
7584 emit_insn (gen_strset (destptr
, dest
, value
));
7587 LABEL_NUSES (label
) = 1;
7591 rtx_code_label
*label
= ix86_expand_aligntest (count
, 4, true);
7592 dest
= change_address (destmem
, SImode
, destptr
);
7593 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (SImode
, value
)));
7595 LABEL_NUSES (label
) = 1;
7599 rtx_code_label
*label
= ix86_expand_aligntest (count
, 2, true);
7600 dest
= change_address (destmem
, HImode
, destptr
);
7601 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (HImode
, value
)));
7603 LABEL_NUSES (label
) = 1;
7607 rtx_code_label
*label
= ix86_expand_aligntest (count
, 1, true);
7608 dest
= change_address (destmem
, QImode
, destptr
);
7609 emit_insn (gen_strset (destptr
, dest
, gen_lowpart (QImode
, value
)));
7611 LABEL_NUSES (label
) = 1;
7615 /* Adjust COUNTER by the VALUE. */
7617 ix86_adjust_counter (rtx countreg
, HOST_WIDE_INT value
)
7619 emit_insn (gen_add2_insn (countreg
, GEN_INT (-value
)));
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   Return value is updated DESTMEM.  */
7629 expand_set_or_cpymem_prologue (rtx destmem
, rtx srcmem
,
7630 rtx destptr
, rtx srcptr
, rtx value
,
7631 rtx vec_value
, rtx count
, int align
,
7632 int desired_alignment
, bool issetmem
)
7635 for (i
= 1; i
< desired_alignment
; i
<<= 1)
7639 rtx_code_label
*label
= ix86_expand_aligntest (destptr
, i
, false);
7642 if (vec_value
&& i
> GET_MODE_SIZE (GET_MODE (value
)))
7643 destmem
= emit_memset (destmem
, destptr
, vec_value
, i
);
7645 destmem
= emit_memset (destmem
, destptr
, value
, i
);
7648 destmem
= emit_memmov (destmem
, &srcmem
, destptr
, srcptr
, i
);
7649 ix86_adjust_counter (count
, i
);
7651 LABEL_NUSES (label
) = 1;
7652 set_mem_align (destmem
, i
* 2 * BITS_PER_UNIT
);
/* Test if COUNT&SIZE is nonzero and if so, expand a cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
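/* Illustrative note: the trick used below is that for any COUNT in
   SIZE..2*SIZE-1, one SIZE-byte move from the start of the block plus one
   SIZE-byte move ending at its last byte (i.e. starting at offset
   COUNT - SIZE) together cover the whole block; the two moves may overlap,
   which is harmless both for memcpy-style copies and memset-style stores.  */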
7662 expand_small_cpymem_or_setmem (rtx destmem
, rtx srcmem
,
7663 rtx destptr
, rtx srcptr
,
7664 rtx value
, rtx vec_value
,
7665 rtx count
, int size
,
7666 rtx done_label
, bool issetmem
)
7668 rtx_code_label
*label
= ix86_expand_aligntest (count
, size
, false);
7669 machine_mode mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 1).else_blk ();
  /* If we do not have a vector value to copy, we must reduce the size.  */
7678 if (GET_MODE (value
) == VOIDmode
&& size
> 8)
7680 else if (GET_MODE_SIZE (mode
) > GET_MODE_SIZE (GET_MODE (value
)))
7681 mode
= GET_MODE (value
);
7684 mode
= GET_MODE (vec_value
), value
= vec_value
;
7688 /* Choose appropriate vector mode. */
7690 mode
= TARGET_AVX
? V32QImode
: TARGET_SSE
? V16QImode
: DImode
;
7691 else if (size
>= 16)
7692 mode
= TARGET_SSE
? V16QImode
: DImode
;
7693 srcmem
= change_address (srcmem
, mode
, srcptr
);
7695 destmem
= change_address (destmem
, mode
, destptr
);
7696 modesize
= GEN_INT (GET_MODE_SIZE (mode
));
7697 gcc_assert (GET_MODE_SIZE (mode
) <= size
);
7698 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
7701 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
7704 emit_move_insn (destmem
, srcmem
);
7705 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
7707 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
7710 destmem
= offset_address (destmem
, count
, 1);
7711 destmem
= offset_address (destmem
, GEN_INT (-2 * size
),
7712 GET_MODE_SIZE (mode
));
7715 srcmem
= offset_address (srcmem
, count
, 1);
7716 srcmem
= offset_address (srcmem
, GEN_INT (-2 * size
),
7717 GET_MODE_SIZE (mode
));
7719 for (n
= 0; n
* GET_MODE_SIZE (mode
) < size
; n
++)
7722 emit_move_insn (destmem
, gen_lowpart (mode
, value
));
7725 emit_move_insn (destmem
, srcmem
);
7726 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
7728 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
7730 emit_jump_insn (gen_jump (done_label
));
7734 LABEL_NUSES (label
) = 1;
/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
   and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying SIZE
   bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
   bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4.  Bigger sizes are handled analogously.
       if (COUNT & 4)
	 {
	   copy 4 bytes from SRCPTR to DESTPTR
	   copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
	   goto done_label
	 }
       if (!COUNT)
	 goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
	 {
	   copy 2 bytes from SRCPTR to DESTPTR
	   copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
	 }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
	 Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
7788 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem
, rtx srcmem
,
7789 rtx
*destptr
, rtx
*srcptr
,
7791 rtx value
, rtx vec_value
,
7793 rtx_code_label
**done_label
,
7797 unsigned HOST_WIDE_INT
*min_size
,
7801 rtx_code_label
*loop_label
= NULL
, *label
;
7804 int prolog_size
= 0;
  /* Choose the proper value to copy.  */
7808 if (issetmem
&& VECTOR_MODE_P (mode
))
7809 mode_value
= vec_value
;
7812 gcc_assert (GET_MODE_SIZE (mode
) <= size
);
7814 /* See if block is big or small, handle small blocks. */
7815 if (!CONST_INT_P (*count
) && *min_size
< (unsigned HOST_WIDE_INT
)size
)
7818 loop_label
= gen_label_rtx ();
7821 *done_label
= gen_label_rtx ();
7823 emit_cmp_and_jump_insns (*count
, GEN_INT (size2
), GE
, 0, GET_MODE (*count
),
7827 /* Handle sizes > 3. */
7828 for (;size2
> 2; size2
>>= 1)
7829 expand_small_cpymem_or_setmem (destmem
, srcmem
,
7833 size2
, *done_label
, issetmem
);
7834 /* Nothing to copy? Jump to DONE_LABEL if so */
7835 emit_cmp_and_jump_insns (*count
, const0_rtx
, EQ
, 0, GET_MODE (*count
),
7838 /* Do a byte copy. */
7839 destmem
= change_address (destmem
, QImode
, *destptr
);
7841 emit_move_insn (destmem
, gen_lowpart (QImode
, value
));
7844 srcmem
= change_address (srcmem
, QImode
, *srcptr
);
7845 emit_move_insn (destmem
, srcmem
);
7848 /* Handle sizes 2 and 3. */
7849 label
= ix86_expand_aligntest (*count
, 2, false);
7850 destmem
= change_address (destmem
, HImode
, *destptr
);
7851 destmem
= offset_address (destmem
, *count
, 1);
7852 destmem
= offset_address (destmem
, GEN_INT (-2), 2);
7854 emit_move_insn (destmem
, gen_lowpart (HImode
, value
));
7857 srcmem
= change_address (srcmem
, HImode
, *srcptr
);
7858 srcmem
= offset_address (srcmem
, *count
, 1);
7859 srcmem
= offset_address (srcmem
, GEN_INT (-2), 2);
7860 emit_move_insn (destmem
, srcmem
);
7864 LABEL_NUSES (label
) = 1;
7865 emit_jump_insn (gen_jump (*done_label
));
7869 gcc_assert (*min_size
>= (unsigned HOST_WIDE_INT
)size
7870 || UINTVAL (*count
) >= (unsigned HOST_WIDE_INT
)size
);
7872 /* Start memcpy for COUNT >= SIZE. */
7875 emit_label (loop_label
);
7876 LABEL_NUSES (loop_label
) = 1;
7879 /* Copy first desired_align bytes. */
7881 srcmem
= change_address (srcmem
, mode
, *srcptr
);
7882 destmem
= change_address (destmem
, mode
, *destptr
);
7883 modesize
= GEN_INT (GET_MODE_SIZE (mode
));
7884 for (n
= 0; prolog_size
< desired_align
- align
; n
++)
7887 emit_move_insn (destmem
, mode_value
);
7890 emit_move_insn (destmem
, srcmem
);
7891 srcmem
= offset_address (srcmem
, modesize
, GET_MODE_SIZE (mode
));
7893 destmem
= offset_address (destmem
, modesize
, GET_MODE_SIZE (mode
));
7894 prolog_size
+= GET_MODE_SIZE (mode
);
7898 /* Copy last SIZE bytes. */
7899 destmem
= offset_address (destmem
, *count
, 1);
7900 destmem
= offset_address (destmem
,
7901 GEN_INT (-size
- prolog_size
),
7904 emit_move_insn (destmem
, mode_value
);
7907 srcmem
= offset_address (srcmem
, *count
, 1);
7908 srcmem
= offset_address (srcmem
,
7909 GEN_INT (-size
- prolog_size
),
7911 emit_move_insn (destmem
, srcmem
);
7913 for (n
= 1; n
* GET_MODE_SIZE (mode
) < size
; n
++)
7915 destmem
= offset_address (destmem
, modesize
, 1);
7917 emit_move_insn (destmem
, mode_value
);
7920 srcmem
= offset_address (srcmem
, modesize
, 1);
7921 emit_move_insn (destmem
, srcmem
);
7925 /* Align destination. */
7926 if (desired_align
> 1 && desired_align
> align
)
7928 rtx saveddest
= *destptr
;
7930 gcc_assert (desired_align
<= size
);
7931 /* Align destptr up, place it to new register. */
7932 *destptr
= expand_simple_binop (GET_MODE (*destptr
), PLUS
, *destptr
,
7933 GEN_INT (prolog_size
),
7934 NULL_RTX
, 1, OPTAB_DIRECT
);
7935 if (REG_P (*destptr
) && REG_P (saveddest
) && REG_POINTER (saveddest
))
7936 REG_POINTER (*destptr
) = 1;
7937 *destptr
= expand_simple_binop (GET_MODE (*destptr
), AND
, *destptr
,
7938 GEN_INT (-desired_align
),
7939 *destptr
, 1, OPTAB_DIRECT
);
7940 /* See how many bytes we skipped. */
7941 saveddest
= expand_simple_binop (GET_MODE (*destptr
), MINUS
, saveddest
,
7943 saveddest
, 1, OPTAB_DIRECT
);
7944 /* Adjust srcptr and count. */
7946 *srcptr
= expand_simple_binop (GET_MODE (*srcptr
), MINUS
, *srcptr
,
7947 saveddest
, *srcptr
, 1, OPTAB_DIRECT
);
7948 *count
= expand_simple_binop (GET_MODE (*count
), PLUS
, *count
,
7949 saveddest
, *count
, 1, OPTAB_DIRECT
);
7950 /* We copied at most size + prolog_size. */
7951 if (*min_size
> (unsigned HOST_WIDE_INT
)(size
+ prolog_size
))
7953 = ROUND_DOWN (*min_size
- size
, (unsigned HOST_WIDE_INT
)size
);
7957 /* Our loops always round down the block size, but for dispatch to
7958 library we need precise value. */
7960 *count
= expand_simple_binop (GET_MODE (*count
), AND
, *count
,
7961 GEN_INT (-size
), *count
, 1, OPTAB_DIRECT
);
7965 gcc_assert (prolog_size
== 0);
7966 /* Decrease count, so we won't end up copying last word twice. */
7967 if (!CONST_INT_P (*count
))
7968 *count
= expand_simple_binop (GET_MODE (*count
), PLUS
, *count
,
7969 constm1_rtx
, *count
, 1, OPTAB_DIRECT
);
7971 *count
= GEN_INT (ROUND_DOWN (UINTVAL (*count
) - 1,
7972 (unsigned HOST_WIDE_INT
)size
));
7974 *min_size
= ROUND_DOWN (*min_size
- 1, (unsigned HOST_WIDE_INT
)size
);
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
expand_set_or_cpymem_constant_prologue (rtx dst
, rtx
*srcp
, rtx destreg
,
7985 rtx srcreg
, rtx value
, rtx vec_value
,
7986 int desired_align
, int align_bytes
,
7991 rtx orig_src
= NULL
;
7993 int copied_bytes
= 0;
7997 gcc_assert (srcp
!= NULL
);
8002 for (piece_size
= 1;
8003 piece_size
<= desired_align
&& copied_bytes
< align_bytes
;
8006 if (align_bytes
& piece_size
)
8010 if (vec_value
&& piece_size
> GET_MODE_SIZE (GET_MODE (value
)))
8011 dst
= emit_memset (dst
, destreg
, vec_value
, piece_size
);
8013 dst
= emit_memset (dst
, destreg
, value
, piece_size
);
8016 dst
= emit_memmov (dst
, &src
, destreg
, srcreg
, piece_size
);
8017 copied_bytes
+= piece_size
;
8020 if (MEM_ALIGN (dst
) < (unsigned int) desired_align
* BITS_PER_UNIT
)
8021 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
8022 if (MEM_SIZE_KNOWN_P (orig_dst
))
8023 set_mem_size (dst
, MEM_SIZE (orig_dst
) - align_bytes
);
8027 int src_align_bytes
= get_mem_align_offset (src
, desired_align
8029 if (src_align_bytes
>= 0)
8030 src_align_bytes
= desired_align
- src_align_bytes
;
8031 if (src_align_bytes
>= 0)
8033 unsigned int src_align
;
8034 for (src_align
= desired_align
; src_align
>= 2; src_align
>>= 1)
8036 if ((src_align_bytes
& (src_align
- 1))
8037 == (align_bytes
& (src_align
- 1)))
8040 if (src_align
> (unsigned int) desired_align
)
8041 src_align
= desired_align
;
8042 if (MEM_ALIGN (src
) < src_align
* BITS_PER_UNIT
)
8043 set_mem_align (src
, src_align
* BITS_PER_UNIT
);
8045 if (MEM_SIZE_KNOWN_P (orig_src
))
8046 set_mem_size (src
, MEM_SIZE (orig_src
) - align_bytes
);
8053 /* Return true if ALG can be used in current context.
8054 Assume we expand memset if MEMSET is true. */
8056 alg_usable_p (enum stringop_alg alg
, bool memset
, bool have_as
)
8058 if (alg
== no_stringop
)
8060 if (alg
== vector_loop
)
8061 return TARGET_SSE
|| TARGET_AVX
;
8062 /* Algorithms using the rep prefix want at least edi and ecx;
8063 additionally, memset wants eax and memcpy wants esi. Don't
8064 consider such algorithms if the user has appropriated those
8065 registers for their own purposes, or if we have a non-default
8066 address space, since some string insns cannot override the segment. */
8067 if (alg
== rep_prefix_1_byte
8068 || alg
== rep_prefix_4_byte
8069 || alg
== rep_prefix_8_byte
)
8073 if (fixed_regs
[CX_REG
]
8074 || fixed_regs
[DI_REG
]
8075 || (memset
? fixed_regs
[AX_REG
] : fixed_regs
[SI_REG
]))
8081 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
8082 static enum stringop_alg
8083 decide_alg (HOST_WIDE_INT count
, HOST_WIDE_INT expected_size
,
8084 unsigned HOST_WIDE_INT min_size
, unsigned HOST_WIDE_INT max_size
,
8085 bool memset
, bool zero_memset
, bool have_as
,
8086 int *dynamic_check
, bool *noalign
, bool recur
)
8088 const struct stringop_algs
*algs
;
8089 bool optimize_for_speed
;
8091 const struct processor_costs
*cost
;
8093 bool any_alg_usable_p
= false;
8096 *dynamic_check
= -1;
8098 /* Even if the string operation call is cold, we still might spend a lot
8099 of time processing large blocks. */
8100 if (optimize_function_for_size_p (cfun
)
8101 || (optimize_insn_for_size_p ()
8103 || (expected_size
!= -1 && expected_size
< 256))))
8104 optimize_for_speed
= false;
8106 optimize_for_speed
= true;
8108 cost
= optimize_for_speed
? ix86_cost
: &ix86_size_cost
;
8110 algs
= &cost
->memset
[TARGET_64BIT
!= 0];
8112 algs
= &cost
->memcpy
[TARGET_64BIT
!= 0];
8114 /* See maximal size for user defined algorithm. */
8115 for (i
= 0; i
< MAX_STRINGOP_ALGS
; i
++)
8117 enum stringop_alg candidate
= algs
->size
[i
].alg
;
8118 bool usable
= alg_usable_p (candidate
, memset
, have_as
);
8119 any_alg_usable_p
|= usable
;
8121 if (candidate
!= libcall
&& candidate
&& usable
)
8122 max
= algs
->size
[i
].max
;
  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
8128 if (((max
> 1 && (unsigned HOST_WIDE_INT
) max
>= max_size
) || max
== -1)
8129 && expected_size
== -1)
8130 expected_size
= min_size
/ 2 + max_size
/ 2;
8132 /* If user specified the algorithm, honor it if possible. */
8133 if (ix86_stringop_alg
!= no_stringop
8134 && alg_usable_p (ix86_stringop_alg
, memset
, have_as
))
8135 return ix86_stringop_alg
;
8136 /* rep; movq or rep; movl is the smallest variant. */
8137 else if (!optimize_for_speed
)
8140 if (!count
|| (count
& 3) || (memset
&& !zero_memset
))
8141 return alg_usable_p (rep_prefix_1_byte
, memset
, have_as
)
8142 ? rep_prefix_1_byte
: loop_1_byte
;
8144 return alg_usable_p (rep_prefix_4_byte
, memset
, have_as
)
8145 ? rep_prefix_4_byte
: loop
;
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     set up.  */
  else if (expected_size
!= -1 && expected_size
< 4)
8151 else if (expected_size
!= -1)
8153 enum stringop_alg alg
= libcall
;
8154 bool alg_noalign
= false;
8155 for (i
= 0; i
< MAX_STRINGOP_ALGS
; i
++)
8157 /* We get here if the algorithms that were not libcall-based
8158 were rep-prefix based and we are unable to use rep prefixes
8159 based on global register usage. Break out of the loop and
8160 use the heuristic below. */
8161 if (algs
->size
[i
].max
== 0)
8163 if (algs
->size
[i
].max
>= expected_size
|| algs
->size
[i
].max
== -1)
8165 enum stringop_alg candidate
= algs
->size
[i
].alg
;
8167 if (candidate
!= libcall
8168 && alg_usable_p (candidate
, memset
, have_as
))
8171 alg_noalign
= algs
->size
[i
].noalign
;
8173 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
8174 last non-libcall inline algorithm. */
8175 if (TARGET_INLINE_ALL_STRINGOPS
)
8177 /* When the current size is best to be copied by a libcall,
8178 but we are still forced to inline, run the heuristic below
8179 that will pick code for medium sized blocks. */
8182 *noalign
= alg_noalign
;
8185 else if (!any_alg_usable_p
)
8188 else if (alg_usable_p (candidate
, memset
, have_as
)
8189 && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8190 && candidate
== rep_prefix_1_byte
		    /* NB: If min_size != max_size, size is
		       unknown.  */
8193 && min_size
!= max_size
))
8195 *noalign
= algs
->size
[i
].noalign
;
  /* When asked to inline the call anyway, try to pick a meaningful choice.
     We look for the maximal size of block that is faster to copy by hand and
     take blocks of at most that size, guessing that the average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
8208 if ((TARGET_INLINE_ALL_STRINGOPS
|| TARGET_INLINE_STRINGOPS_DYNAMICALLY
)
8209 && (algs
->unknown_size
== libcall
8210 || !alg_usable_p (algs
->unknown_size
, memset
, have_as
)))
8212 enum stringop_alg alg
;
8213 HOST_WIDE_INT new_expected_size
= (max
> 0 ? max
: 4096) / 2;
8215 /* If there aren't any usable algorithms or if recursing already,
8216 then recursing on smaller sizes or same size isn't going to
8217 find anything. Just return the simple byte-at-a-time copy loop. */
8218 if (!any_alg_usable_p
|| recur
)
8220 /* Pick something reasonable. */
8221 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY
&& !recur
)
8222 *dynamic_check
= 128;
8225 alg
= decide_alg (count
, new_expected_size
, min_size
, max_size
, memset
,
8226 zero_memset
, have_as
, dynamic_check
, noalign
, true);
8227 gcc_assert (*dynamic_check
== -1);
8228 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY
)
8229 *dynamic_check
= max
;
8231 gcc_assert (alg
!= libcall
);
8234 return (alg_usable_p (algs
->unknown_size
, memset
, have_as
)
8235 ? algs
->unknown_size
: libcall
);
8238 /* Decide on alignment. We know that the operand is already aligned to ALIGN
8239 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8241 decide_alignment (int align
,
8242 enum stringop_alg alg
,
8244 machine_mode move_mode
)
8246 int desired_align
= 0;
8248 gcc_assert (alg
!= no_stringop
);
8252 if (move_mode
== VOIDmode
)
8255 desired_align
= GET_MODE_SIZE (move_mode
);
  /* PentiumPro has special logic triggering for 8 byte aligned blocks,
     copying the whole cache line at once.  */
8258 if (TARGET_CPU_P (PENTIUMPRO
)
8259 && (alg
== rep_prefix_4_byte
|| alg
== rep_prefix_1_byte
))
8264 if (desired_align
< align
)
8265 desired_align
= align
;
8266 if (expected_size
!= -1 && expected_size
< 4)
8267 desired_align
= align
;
8269 return desired_align
;
/* Helper function for memset.  For QImode value 0xXY produce
   0xXYXYXYXY of the width specified by MODE.  This is essentially
   a * 0x01010101, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
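/* Rough C-level sketch of the shift/IOR unwinding emitted below (illustrative
   only, assuming a 64-bit MODE):

     uint64_t v = val & 0xff;   // e.g. 0xAB
     v |= v << 8;               // 0xABAB
     v |= v << 16;              // 0xABABABAB
     v |= v << 32;              // 0xABABABABABABABAB

   For SImode the last step is omitted.  */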
8279 promote_duplicated_reg (machine_mode mode
, rtx val
)
8281 machine_mode valmode
= GET_MODE (val
);
8283 int nops
= mode
== DImode
? 3 : 2;
8285 gcc_assert (mode
== SImode
|| mode
== DImode
|| val
== const0_rtx
);
8286 if (val
== const0_rtx
)
8287 return copy_to_mode_reg (mode
, CONST0_RTX (mode
));
8288 if (CONST_INT_P (val
))
8290 HOST_WIDE_INT v
= INTVAL (val
) & 255;
8295 v
|= (v
<< 16) << 16;
8296 return copy_to_mode_reg (mode
, gen_int_mode (v
, mode
));
8299 if (valmode
== VOIDmode
)
8301 if (valmode
!= QImode
)
8302 val
= gen_lowpart (QImode
, val
);
8305 if (!TARGET_PARTIAL_REG_STALL
)
8307 if (ix86_cost
->mult_init
[mode
== DImode
? 3 : 2]
8308 + ix86_cost
->mult_bit
* (mode
== DImode
? 8 : 4)
8309 <= (ix86_cost
->shift_const
+ ix86_cost
->add
) * nops
8310 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL
== 0)))
8312 rtx reg
= convert_modes (mode
, QImode
, val
, true);
8313 tmp
= promote_duplicated_reg (mode
, const1_rtx
);
8314 return expand_simple_binop (mode
, MULT
, reg
, tmp
, NULL
, 1,
8319 rtx reg
= convert_modes (mode
, QImode
, val
, true);
8321 if (!TARGET_PARTIAL_REG_STALL
)
8322 emit_insn (gen_insv_1 (mode
, reg
, reg
));
8325 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (8),
8326 NULL
, 1, OPTAB_DIRECT
);
8327 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1,
8330 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (16),
8331 NULL
, 1, OPTAB_DIRECT
);
8332 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1, OPTAB_DIRECT
);
8335 tmp
= expand_simple_binop (mode
, ASHIFT
, reg
, GEN_INT (32),
8336 NULL
, 1, OPTAB_DIRECT
);
8337 reg
= expand_simple_binop (mode
, IOR
, reg
, tmp
, reg
, 1, OPTAB_DIRECT
);
8342 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
8343 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
8344 alignment from ALIGN to DESIRED_ALIGN. */
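/* Illustrative example: with SIZE_NEEDED == 16, DESIRED_ALIGN == 8 and
   ALIGN == 1 on a 64-bit target, the value is promoted to a DImode pattern
   (the byte replicated into 8 bytes); with SIZE_NEEDED == 4 it would only be
   promoted to SImode, and when only single-byte stores are needed no wider
   promotion is made.  */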
8346 promote_duplicated_reg_to_size (rtx val
, int size_needed
, int desired_align
,
8352 && (size_needed
> 4 || (desired_align
> align
&& desired_align
> 4)))
8353 promoted_val
= promote_duplicated_reg (DImode
, val
);
8354 else if (size_needed
> 2 || (desired_align
> align
&& desired_align
> 2))
8355 promoted_val
= promote_duplicated_reg (SImode
, val
);
8356 else if (size_needed
> 1 || (desired_align
> align
&& desired_align
> 1))
8357 promoted_val
= promote_duplicated_reg (HImode
, val
);
8361 return promoted_val
;
8364 /* Copy the address to a Pmode register. This is used for x32 to
8365 truncate DImode TLS address to a SImode register. */
8368 ix86_copy_addr_to_reg (rtx addr
)
8371 if (GET_MODE (addr
) == Pmode
|| GET_MODE (addr
) == VOIDmode
)
8373 reg
= copy_addr_to_reg (addr
);
8374 REG_POINTER (reg
) = 1;
8379 gcc_assert (GET_MODE (addr
) == DImode
&& Pmode
== SImode
);
8380 reg
= copy_to_mode_reg (DImode
, addr
);
8381 REG_POINTER (reg
) = 1;
8382 return gen_rtx_SUBREG (SImode
, reg
, 0);
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
8387 operations when profitable. The code depends upon architecture, block size
8388 and alignment, but always has one of the following overall structures:
8390 Aligned move sequence:
   1) Prologue guard: Conditional that jumps up to epilogues for small
      blocks that can be handled by the epilogue alone.  This is faster
      but also needed for correctness, since the prologue assumes the block
      is larger than the desired alignment.
8397 Optional dynamic check for size and libcall for large
8398 blocks is emitted here too, with -minline-stringops-dynamically.
8400 2) Prologue: copy first few bytes in order to get destination
8401 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8402 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8403 copied. We emit either a jump tree on power of two sized
8404 blocks, or a byte loop.
8406 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8407 with specified algorithm.
8409 4) Epilogue: code copying tail of the block that is too small to be
8410 handled by main body (or up to size guarded by prologue guard).
   Misaligned move sequence

   1) misaligned move prologue/epilogue containing:
	a) Prologue handling small memory blocks and jumping to done_label
	   (skipped if blocks are known to be large enough)
	b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
	   is needed by a single possibly misaligned move
8419 (skipped if alignment is not needed)
8420 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8422 2) Zero size guard dispatching to done_label, if needed
   3) dispatch to library call, if needed,

   4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
      with the specified algorithm.  */
8429 ix86_expand_set_or_cpymem (rtx dst
, rtx src
, rtx count_exp
, rtx val_exp
,
8430 rtx align_exp
, rtx expected_align_exp
,
8431 rtx expected_size_exp
, rtx min_size_exp
,
8432 rtx max_size_exp
, rtx probable_max_size_exp
,
8437 rtx_code_label
*label
= NULL
;
8439 rtx_code_label
*jump_around_label
= NULL
;
8440 HOST_WIDE_INT align
= 1;
8441 unsigned HOST_WIDE_INT count
= 0;
8442 HOST_WIDE_INT expected_size
= -1;
8443 int size_needed
= 0, epilogue_size_needed
;
8444 int desired_align
= 0, align_bytes
= 0;
8445 enum stringop_alg alg
;
8446 rtx promoted_val
= NULL
;
8447 rtx vec_promoted_val
= NULL
;
8448 bool force_loopy_epilogue
= false;
8450 bool need_zero_guard
= false;
8452 machine_mode move_mode
= VOIDmode
;
8453 machine_mode wider_mode
;
8454 int unroll_factor
= 1;
8455 /* TODO: Once value ranges are available, fill in proper data. */
8456 unsigned HOST_WIDE_INT min_size
= 0;
8457 unsigned HOST_WIDE_INT max_size
= -1;
8458 unsigned HOST_WIDE_INT probable_max_size
= -1;
8459 bool misaligned_prologue_used
= false;
8462 if (CONST_INT_P (align_exp
))
8463 align
= INTVAL (align_exp
);
  /* i386 can do misaligned access at a reasonably increased cost.  */
8465 if (CONST_INT_P (expected_align_exp
)
8466 && INTVAL (expected_align_exp
) > align
)
8467 align
= INTVAL (expected_align_exp
);
8468 /* ALIGN is the minimum of destination and source alignment, but we care here
8469 just about destination alignment. */
8471 && MEM_ALIGN (dst
) > (unsigned HOST_WIDE_INT
) align
* BITS_PER_UNIT
)
8472 align
= MEM_ALIGN (dst
) / BITS_PER_UNIT
;
8474 if (CONST_INT_P (count_exp
))
8476 min_size
= max_size
= probable_max_size
= count
= expected_size
8477 = INTVAL (count_exp
);
8478 /* When COUNT is 0, there is nothing to do. */
8485 min_size
= INTVAL (min_size_exp
);
8487 max_size
= INTVAL (max_size_exp
);
8488 if (probable_max_size_exp
)
8489 probable_max_size
= INTVAL (probable_max_size_exp
);
8490 if (CONST_INT_P (expected_size_exp
))
8491 expected_size
= INTVAL (expected_size_exp
);
8494 /* Make sure we don't need to care about overflow later on. */
8495 if (count
> (HOST_WIDE_INT_1U
<< 30))
8498 have_as
= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst
));
8500 have_as
|= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src
));
8502 /* Step 0: Decide on preferred algorithm, desired alignment and
8503 size of chunks to be copied by main loop. */
8504 alg
= decide_alg (count
, expected_size
, min_size
, probable_max_size
,
8506 issetmem
&& val_exp
== const0_rtx
, have_as
,
8507 &dynamic_check
, &noalign
, false);
8510 fprintf (dump_file
, "Selected stringop expansion strategy: %s\n",
8511 stringop_alg_names
[alg
]);
8515 gcc_assert (alg
!= no_stringop
);
  /* For now the vector version of memset is generated only for memory zeroing, as
     creating the promoted vector value is very cheap in this case.  */
8519 if (issetmem
&& alg
== vector_loop
&& val_exp
!= const0_rtx
)
8520 alg
= unrolled_loop
;
8523 count_exp
= copy_to_mode_reg (GET_MODE (count_exp
), count_exp
);
8524 destreg
= ix86_copy_addr_to_reg (XEXP (dst
, 0));
8526 srcreg
= ix86_copy_addr_to_reg (XEXP (src
, 0));
8529 move_mode
= word_mode
;
8537 need_zero_guard
= true;
8541 need_zero_guard
= true;
8544 need_zero_guard
= true;
8545 unroll_factor
= (TARGET_64BIT
? 4 : 2);
8548 need_zero_guard
= true;
8550 /* Find the widest supported mode. */
8551 move_mode
= word_mode
;
8552 while (GET_MODE_WIDER_MODE (move_mode
).exists (&wider_mode
)
8553 && optab_handler (mov_optab
, wider_mode
) != CODE_FOR_nothing
)
8554 move_mode
= wider_mode
;
8556 if (TARGET_AVX256_SPLIT_REGS
&& GET_MODE_BITSIZE (move_mode
) > 128)
8559 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8560 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8561 if (GET_MODE_SIZE (move_mode
) > GET_MODE_SIZE (word_mode
))
8563 int nunits
= GET_MODE_SIZE (move_mode
) / GET_MODE_SIZE (word_mode
);
8564 if (!mode_for_vector (word_mode
, nunits
).exists (&move_mode
)
8565 || optab_handler (mov_optab
, move_mode
) == CODE_FOR_nothing
)
8566 move_mode
= word_mode
;
8568 gcc_assert (optab_handler (mov_optab
, move_mode
) != CODE_FOR_nothing
);
8570 case rep_prefix_8_byte
:
8573 case rep_prefix_4_byte
:
8576 case rep_prefix_1_byte
:
8580 size_needed
= GET_MODE_SIZE (move_mode
) * unroll_factor
;
8581 epilogue_size_needed
= size_needed
;
  /* If we are going to emit any library calls conditionally, make sure any
     pending stack adjustment happens before the first conditional branch;
     otherwise it will be emitted only before the library call and won't
     happen on the other branches.  */
8587 if (dynamic_check
!= -1)
8588 do_pending_stack_adjust ();
8590 desired_align
= decide_alignment (align
, alg
, expected_size
, move_mode
);
8591 if (!TARGET_ALIGN_STRINGOPS
|| noalign
)
8592 align
= desired_align
;
8594 /* Step 1: Prologue guard. */
8596 /* Alignment code needs count to be in register. */
8597 if (CONST_INT_P (count_exp
) && desired_align
> align
)
8599 if (INTVAL (count_exp
) > desired_align
8600 && INTVAL (count_exp
) > size_needed
)
8603 = get_mem_align_offset (dst
, desired_align
* BITS_PER_UNIT
);
8604 if (align_bytes
<= 0)
8607 align_bytes
= desired_align
- align_bytes
;
8609 if (align_bytes
== 0)
8610 count_exp
= force_reg (counter_mode (count_exp
), count_exp
);
8612 gcc_assert (desired_align
>= 1 && align
>= 1);
  /* Misaligned move sequences handle both prologue and epilogue at once.
     Default code generation results in smaller code for large alignments
     and also avoids redundant work when sizes are known precisely.  */
8617 misaligned_prologue_used
8618 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8619 && MAX (desired_align
, epilogue_size_needed
) <= 32
8620 && desired_align
<= epilogue_size_needed
8621 && ((desired_align
> align
&& !align_bytes
)
8622 || (!count
&& epilogue_size_needed
> 1)));
  /* Do the cheap promotion to allow better CSE across the
     main loop and epilogue (ie one load of the big constant up front).
     For now the misaligned move sequences do not have a fast path
     without broadcasting.  */
8629 if (issetmem
&& ((CONST_INT_P (val_exp
) || misaligned_prologue_used
)))
8631 if (alg
== vector_loop
)
8633 gcc_assert (val_exp
== const0_rtx
);
8634 vec_promoted_val
= promote_duplicated_reg (move_mode
, val_exp
);
8635 promoted_val
= promote_duplicated_reg_to_size (val_exp
,
8636 GET_MODE_SIZE (word_mode
),
8637 desired_align
, align
);
8641 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
8642 desired_align
, align
);
  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments and
     also avoids redundant work when sizes are known precisely.  */
8648 if (misaligned_prologue_used
)
8650 /* Misaligned move prologue handled small blocks by itself. */
8651 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8652 (dst
, src
, &destreg
, &srcreg
,
8653 move_mode
, promoted_val
, vec_promoted_val
,
8656 desired_align
< align
8657 ? MAX (desired_align
, epilogue_size_needed
) : epilogue_size_needed
,
8658 desired_align
, align
, &min_size
, dynamic_check
, issetmem
);
8660 src
= change_address (src
, BLKmode
, srcreg
);
8661 dst
= change_address (dst
, BLKmode
, destreg
);
8662 set_mem_align (dst
, desired_align
* BITS_PER_UNIT
);
8663 epilogue_size_needed
= 0;
8665 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
)
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
8669 gcc_assert (size_needed
> 1);
8670 if (jump_around_label
== NULL_RTX
)
8671 jump_around_label
= gen_label_rtx ();
8672 emit_cmp_and_jump_insns (count_exp
,
8673 GEN_INT (size_needed
),
8674 LTU
, 0, counter_mode (count_exp
), 1, jump_around_label
);
8675 if (expected_size
== -1
8676 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
8677 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
8679 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
8682 /* Ensure that alignment prologue won't copy past end of block. */
8683 else if (size_needed
> 1 || (desired_align
> 1 && desired_align
> align
))
8685 epilogue_size_needed
= MAX (size_needed
- 1, desired_align
- align
);
8686 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8687 Make sure it is power of 2. */
8688 epilogue_size_needed
= 1 << (floor_log2 (epilogue_size_needed
) + 1);
      /* To improve performance of small blocks, we jump around the VAL
	 promoting mode.  This means that if the promoted VAL is not constant,
	 we might not use it in the epilogue and have to use the byte
	 loop variant.  */
8694 if (issetmem
&& epilogue_size_needed
> 2 && !promoted_val
)
8695 force_loopy_epilogue
= true;
8696 if ((count
&& count
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
8697 || max_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
8699 /* If main algorithm works on QImode, no epilogue is needed.
8700 For small sizes just don't align anything. */
8701 if (size_needed
== 1)
8702 desired_align
= align
;
8707 && min_size
< (unsigned HOST_WIDE_INT
) epilogue_size_needed
)
8709 label
= gen_label_rtx ();
8710 emit_cmp_and_jump_insns (count_exp
,
8711 GEN_INT (epilogue_size_needed
),
8712 LTU
, 0, counter_mode (count_exp
), 1, label
);
8713 if (expected_size
== -1 || expected_size
< epilogue_size_needed
)
8714 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
8716 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
  /* Emit code to decide at runtime whether a library call or inline code
     should be used.  */
8722 if (dynamic_check
!= -1)
8724 if (!issetmem
&& CONST_INT_P (count_exp
))
8726 if (UINTVAL (count_exp
) >= (unsigned HOST_WIDE_INT
)dynamic_check
)
8728 emit_block_copy_via_libcall (dst
, src
, count_exp
);
8729 count_exp
= const0_rtx
;
8735 rtx_code_label
*hot_label
= gen_label_rtx ();
8736 if (jump_around_label
== NULL_RTX
)
8737 jump_around_label
= gen_label_rtx ();
8738 emit_cmp_and_jump_insns (count_exp
, GEN_INT (dynamic_check
- 1),
8739 LEU
, 0, counter_mode (count_exp
),
8741 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
8743 set_storage_via_libcall (dst
, count_exp
, val_exp
);
8745 emit_block_copy_via_libcall (dst
, src
, count_exp
);
8746 emit_jump (jump_around_label
);
8747 emit_label (hot_label
);
8751 /* Step 2: Alignment prologue. */
8752 /* Do the expensive promotion once we branched off the small blocks. */
8753 if (issetmem
&& !promoted_val
)
8754 promoted_val
= promote_duplicated_reg_to_size (val_exp
, size_needed
,
8755 desired_align
, align
);
8757 if (desired_align
> align
&& !misaligned_prologue_used
)
8759 if (align_bytes
== 0)
	  /* Except for the first move in the prologue, we no longer know
	     the constant offset in aliasing info.  It doesn't seem worth
	     the pain to maintain it for the first move, so throw away
	     the info early.  */
8765 dst
= change_address (dst
, BLKmode
, destreg
);
8767 src
= change_address (src
, BLKmode
, srcreg
);
8768 dst
= expand_set_or_cpymem_prologue (dst
, src
, destreg
, srcreg
,
8769 promoted_val
, vec_promoted_val
,
8770 count_exp
, align
, desired_align
,
8772 /* At most desired_align - align bytes are copied. */
8773 if (min_size
< (unsigned)(desired_align
- align
))
8776 min_size
-= desired_align
- align
;
8780 /* If we know how many bytes need to be stored before dst is
8781 sufficiently aligned, maintain aliasing info accurately. */
8782 dst
= expand_set_or_cpymem_constant_prologue (dst
, &src
, destreg
,
8790 count_exp
= plus_constant (counter_mode (count_exp
),
8791 count_exp
, -align_bytes
);
8792 count
-= align_bytes
;
8793 min_size
-= align_bytes
;
8794 max_size
-= align_bytes
;
8797 && min_size
< (unsigned HOST_WIDE_INT
) size_needed
8798 && (count
< (unsigned HOST_WIDE_INT
) size_needed
8799 || (align_bytes
== 0
8800 && count
< ((unsigned HOST_WIDE_INT
) size_needed
8801 + desired_align
- align
))))
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
8805 gcc_assert (size_needed
> 1);
8806 if (label
== NULL_RTX
)
8807 label
= gen_label_rtx ();
8808 emit_cmp_and_jump_insns (count_exp
,
8809 GEN_INT (size_needed
),
8810 LTU
, 0, counter_mode (count_exp
), 1, label
);
8811 if (expected_size
== -1
8812 || expected_size
< (desired_align
- align
) / 2 + size_needed
)
8813 predict_jump (REG_BR_PROB_BASE
* 20 / 100);
8815 predict_jump (REG_BR_PROB_BASE
* 60 / 100);
8818 if (label
&& size_needed
== 1)
8821 LABEL_NUSES (label
) = 1;
8823 epilogue_size_needed
= 1;
8825 promoted_val
= val_exp
;
8827 else if (label
== NULL_RTX
&& !misaligned_prologue_used
)
8828 epilogue_size_needed
= size_needed
;
8830 /* Step 3: Main loop. */
8841 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
, promoted_val
,
8842 count_exp
, move_mode
, unroll_factor
,
8843 expected_size
, issetmem
);
8846 expand_set_or_cpymem_via_loop (dst
, src
, destreg
, srcreg
,
8847 vec_promoted_val
, count_exp
, move_mode
,
8848 unroll_factor
, expected_size
, issetmem
);
8850 case rep_prefix_8_byte
:
8851 case rep_prefix_4_byte
:
8852 case rep_prefix_1_byte
:
8853 expand_set_or_cpymem_via_rep (dst
, src
, destreg
, srcreg
, promoted_val
,
8854 val_exp
, count_exp
, move_mode
, issetmem
);
  /* Properly adjust the offsets of src and dest memory for aliasing.  */
8858 if (CONST_INT_P (count_exp
))
8861 src
= adjust_automodify_address_nv (src
, BLKmode
, srcreg
,
8862 (count
/ size_needed
) * size_needed
);
8863 dst
= adjust_automodify_address_nv (dst
, BLKmode
, destreg
,
8864 (count
/ size_needed
) * size_needed
);
8869 src
= change_address (src
, BLKmode
, srcreg
);
8870 dst
= change_address (dst
, BLKmode
, destreg
);
8873 /* Step 4: Epilogue to copy the remaining bytes. */
8877 /* When the main loop is done, COUNT_EXP might hold original count,
8878 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
8879 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
8880 bytes. Compensate if needed. */
8882 if (size_needed
< epilogue_size_needed
)
8884 tmp
= expand_simple_binop (counter_mode (count_exp
), AND
, count_exp
,
8885 GEN_INT (size_needed
- 1), count_exp
, 1,
8887 if (tmp
!= count_exp
)
8888 emit_move_insn (count_exp
, tmp
);
8891 LABEL_NUSES (label
) = 1;
8894 if (count_exp
!= const0_rtx
&& epilogue_size_needed
> 1)
8896 if (force_loopy_epilogue
)
8897 expand_setmem_epilogue_via_loop (dst
, destreg
, val_exp
, count_exp
,
8898 epilogue_size_needed
);
8902 expand_setmem_epilogue (dst
, destreg
, promoted_val
,
8903 vec_promoted_val
, count_exp
,
8904 epilogue_size_needed
);
8906 expand_cpymem_epilogue (dst
, src
, destreg
, srcreg
, count_exp
,
8907 epilogue_size_needed
);
8910 if (jump_around_label
)
8911 emit_label (jump_around_label
);
8915 /* Expand cmpstrn or memcmp. */
8918 ix86_expand_cmpstrn_or_cmpmem (rtx result
, rtx src1
, rtx src2
,
8919 rtx length
, rtx align
, bool is_cmpstrn
)
8921 /* Expand strncmp and memcmp only with -minline-all-stringops since
8922 "repz cmpsb" can be much slower than strncmp and memcmp functions
8923 implemented with vector instructions, see
8925 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
8927 if (!TARGET_INLINE_ALL_STRINGOPS
)
8930 /* Can't use this if the user has appropriated ecx, esi or edi. */
8931 if (fixed_regs
[CX_REG
] || fixed_regs
[SI_REG
] || fixed_regs
[DI_REG
])
8936 /* For strncmp, length is the maximum length, which can be larger
8937 than actual string lengths. We can expand the cmpstrn pattern
8938 to "repz cmpsb" only if one of the strings is a constant so
8939 that expand_builtin_strncmp() can write the length argument to
8940 be the minimum of the const string length and the actual length
8941 argument. Otherwise, "repz cmpsb" may pass the 0 byte. */
8942 tree t1
= MEM_EXPR (src1
);
8943 tree t2
= MEM_EXPR (src2
);
8944 if (!((t1
&& TREE_CODE (t1
) == MEM_REF
8945 && TREE_CODE (TREE_OPERAND (t1
, 0)) == ADDR_EXPR
8946 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1
, 0), 0))
8948 || (t2
&& TREE_CODE (t2
) == MEM_REF
8949 && TREE_CODE (TREE_OPERAND (t2
, 0)) == ADDR_EXPR
8950 && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2
, 0), 0))
8955 rtx addr1
= copy_addr_to_reg (XEXP (src1
, 0));
8956 rtx addr2
= copy_addr_to_reg (XEXP (src2
, 0));
8957 if (addr1
!= XEXP (src1
, 0))
8958 src1
= replace_equiv_address_nv (src1
, addr1
);
8959 if (addr2
!= XEXP (src2
, 0))
8960 src2
= replace_equiv_address_nv (src2
, addr2
);
8962 /* NB: Make a copy of the data length to avoid changing the original
8963 data length by cmpstrnqi patterns. */
8964 length
= ix86_zero_extend_to_Pmode (length
);
8965 rtx lengthreg
= gen_reg_rtx (Pmode
);
8966 emit_move_insn (lengthreg
, length
);
8968 /* If we are testing strict equality, we can use known alignment to
8969 good advantage. This may be possible with combine, particularly
8970 once cc0 is dead. */
8971 if (CONST_INT_P (length
))
8973 if (length
== const0_rtx
)
8975 emit_move_insn (result
, const0_rtx
);
8978 emit_insn (gen_cmpstrnqi_nz_1 (addr1
, addr2
, lengthreg
, align
,
8983 emit_insn (gen_cmp_1 (Pmode
, lengthreg
, lengthreg
));
8984 emit_insn (gen_cmpstrnqi_1 (addr1
, addr2
, lengthreg
, align
,
8988 rtx out
= gen_lowpart (QImode
, result
);
8989 emit_insn (gen_cmpintqi (out
));
8990 emit_move_insn (result
, gen_rtx_SIGN_EXTEND (SImode
, out
));
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the start address when
	not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.  */
9007 ix86_expand_strlensi_unroll_1 (rtx out
, rtx src
, rtx align_rtx
)
9011 rtx_code_label
*align_2_label
= NULL
;
9012 rtx_code_label
*align_3_label
= NULL
;
9013 rtx_code_label
*align_4_label
= gen_label_rtx ();
9014 rtx_code_label
*end_0_label
= gen_label_rtx ();
9016 rtx tmpreg
= gen_reg_rtx (SImode
);
9017 rtx scratch
= gen_reg_rtx (SImode
);
9021 if (CONST_INT_P (align_rtx
))
9022 align
= INTVAL (align_rtx
);
9024 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
9026 /* Is there a known alignment and is it less than 4? */
9029 rtx scratch1
= gen_reg_rtx (Pmode
);
9030 emit_move_insn (scratch1
, out
);
9031 /* Is there a known alignment and is it not 2? */
9034 align_3_label
= gen_label_rtx (); /* Label when aligned to 3-byte */
9035 align_2_label
= gen_label_rtx (); /* Label when aligned to 2-byte */
9037 /* Leave just the 3 lower bits. */
9038 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, GEN_INT (3),
9039 NULL_RTX
, 0, OPTAB_WIDEN
);
9041 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
9042 Pmode
, 1, align_4_label
);
9043 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, EQ
, NULL
,
9044 Pmode
, 1, align_2_label
);
9045 emit_cmp_and_jump_insns (align_rtx
, const2_rtx
, GTU
, NULL
,
9046 Pmode
, 1, align_3_label
);
      /* Since the alignment is 2, we have to check 2 or 0 bytes;
	 check if it is aligned to 4 bytes.  */
9053 align_rtx
= expand_binop (Pmode
, and_optab
, scratch1
, const2_rtx
,
9054 NULL_RTX
, 0, OPTAB_WIDEN
);
9056 emit_cmp_and_jump_insns (align_rtx
, const0_rtx
, EQ
, NULL
,
9057 Pmode
, 1, align_4_label
);
9060 mem
= change_address (src
, QImode
, out
);
9062 /* Now compare the bytes. */
  /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
9065 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
,
9066 QImode
, 1, end_0_label
);
9068 /* Increment the address. */
9069 emit_insn (gen_add2_insn (out
, const1_rtx
));
9071 /* Not needed with an alignment of 2 */
9074 emit_label (align_2_label
);
9076 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
9079 emit_insn (gen_add2_insn (out
, const1_rtx
));
9081 emit_label (align_3_label
);
9084 emit_cmp_and_jump_insns (mem
, const0_rtx
, EQ
, NULL
, QImode
, 1,
9087 emit_insn (gen_add2_insn (out
, const1_rtx
));
9090 /* Generate loop to check 4 bytes at a time. It is not a good idea to
9091 align this loop. It gives only huge programs, but does not help to
9093 emit_label (align_4_label
);
9095 mem
= change_address (src
, SImode
, out
);
9096 emit_move_insn (scratch
, mem
);
9097 emit_insn (gen_add2_insn (out
, GEN_INT (4)));
9099 /* This formula yields a nonzero result iff one of the bytes is zero.
9100 This saves three branches inside loop and many cycles. */
9102 emit_insn (gen_addsi3 (tmpreg
, scratch
, GEN_INT (-0x01010101)));
9103 emit_insn (gen_one_cmplsi2 (scratch
, scratch
));
9104 emit_insn (gen_andsi3 (tmpreg
, tmpreg
, scratch
));
9105 emit_insn (gen_andsi3 (tmpreg
, tmpreg
,
9106 gen_int_mode (0x80808080, SImode
)));
9107 emit_cmp_and_jump_insns (tmpreg
, const0_rtx
, EQ
, 0, SImode
, 1,
9112 rtx reg
= gen_reg_rtx (SImode
);
9113 rtx reg2
= gen_reg_rtx (Pmode
);
9114 emit_move_insn (reg
, tmpreg
);
9115 emit_insn (gen_lshrsi3 (reg
, reg
, GEN_INT (16)));
9117 /* If zero is not in the first two bytes, move two bytes forward. */
9118 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
9119 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
9120 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
9121 emit_insn (gen_rtx_SET (tmpreg
,
9122 gen_rtx_IF_THEN_ELSE (SImode
, tmp
,
9125 /* Emit lea manually to avoid clobbering of flags. */
9126 emit_insn (gen_rtx_SET (reg2
, plus_constant (Pmode
, out
, 2)));
9128 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
9129 tmp
= gen_rtx_EQ (VOIDmode
, tmp
, const0_rtx
);
9130 emit_insn (gen_rtx_SET (out
,
9131 gen_rtx_IF_THEN_ELSE (Pmode
, tmp
,
9137 rtx_code_label
*end_2_label
= gen_label_rtx ();
9138 /* Is zero in the first two bytes? */
9140 emit_insn (gen_testsi_ccno_1 (tmpreg
, GEN_INT (0x8080)));
9141 tmp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
9142 tmp
= gen_rtx_NE (VOIDmode
, tmp
, const0_rtx
);
9143 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, tmp
,
9144 gen_rtx_LABEL_REF (VOIDmode
, end_2_label
),
9146 tmp
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
9147 JUMP_LABEL (tmp
) = end_2_label
;
9149 /* Not in the first two. Move two bytes forward. */
9150 emit_insn (gen_lshrsi3 (tmpreg
, tmpreg
, GEN_INT (16)));
9151 emit_insn (gen_add2_insn (out
, const2_rtx
));
9153 emit_label (end_2_label
);
9157 /* Avoid branch in fixing the byte. */
9158 tmpreg
= gen_lowpart (QImode
, tmpreg
);
9159 emit_insn (gen_addqi3_cconly_overflow (tmpreg
, tmpreg
));
9160 tmp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
9161 cmp
= gen_rtx_LTU (VOIDmode
, tmp
, const0_rtx
);
9162 emit_insn (gen_sub3_carry (Pmode
, out
, out
, GEN_INT (3), tmp
, cmp
));
9164 emit_label (end_0_label
);
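/* Worked example for the zero-byte test above (illustrative, not from the
   original sources): the loop computes

     tmpreg = (word - 0x01010101) & ~word & 0x80808080

   which is nonzero exactly when some byte of WORD is zero, and whose
   lowest set bit is the high bit of the first (least significant) zero
   byte.  For word = 0x41420043 the three steps give 0x4040ff42, then
   0x0000ff00, then 0x00008000: bit 15 is set, marking the zero in byte 1,
   which the tail code above then turns into the final pointer adjustment.  */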
/* Expand strlen.  */

bool
ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
{
  if (TARGET_UNROLL_STRLEN
      && TARGET_INLINE_ALL_STRINGOPS
      && eoschar == const0_rtx
      && optimize > 1)
    {
      /* The generic case of the strlen expander is long.  Avoid expanding
	 it unless TARGET_INLINE_ALL_STRINGOPS.  */
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      /* Well, it seems that some optimizers do not combine a call like
	 foo (strlen (bar), strlen (bar));
	 when the move and the subtraction are done here.  They do compute
	 the length just once when these instructions are emitted inside
	 output_strlen_unroll().  But since &bar[strlen (bar)] is often used,
	 and this needs one fewer register for the lifetime of
	 output_strlen_unroll(), it is better this way.  */

      emit_move_insn (out, addr);

      ix86_expand_strlensi_unroll_1 (out, src, align);

      /* strlensi_unroll_1 returns the address of the zero at the end of
	 the string, like memchr(), so compute the length by subtracting
	 the start address.  */
      emit_insn (gen_sub2_insn (out, addr));
      return true;
    }
  else
    return false;
}
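/* Illustrative example (hypothetical user code): with
   -minline-all-stringops on a tuning that sets TARGET_UNROLL_STRLEN,

     size_t f (const char *s) { return __builtin_strlen (s); }

   takes the unrolled path above: OUT starts at the string address, the
   unrolled body advances it to the terminating NUL, and the final
   subtraction of ADDR turns that end pointer into the length.  When the
   conditions do not hold the expander declines and the caller typically
   falls back to a library call.  */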
/* For a given symbol (function), construct code to compute the address of
   its PLT entry in the large x86-64 PIC model.  */

static rtx
construct_plt_address (rtx symbol)
{
  rtx tmp, unspec;

  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
  gcc_assert (Pmode == DImode);

  tmp = gen_reg_rtx (Pmode);
  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);

  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
  emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
  return tmp;
}
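/* Rough shape of the RTL created above (illustrative): for a symbol FOO in
   the large PIC model the PLT entry address is rebuilt as

     (set (reg:DI tmp) (const:DI (unspec:DI [(symbol_ref:DI "foo")]
				 UNSPEC_PLTOFF)))
     (set (reg:DI tmp) (plus:DI (reg:DI tmp) (reg:DI pic_offset_table)))

   i.e. a full 64-bit offset from the GOT base register, since the large
   code model may not assume that any 32-bit displacement reaches it.  */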
/* Additional registers that are clobbered by SYSV calls.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
		 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
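/* Background note (not from the original sources): the 64-bit MS ABI
   treats RSI, RDI and XMM6-XMM15 as call-saved, while the SysV ABI lets a
   callee clobber them, so a call from MS-ABI code into a SysV-ABI function
   must assume every register listed above is dead afterwards.  */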
9234 ix86_expand_call (rtx retval
, rtx fnaddr
, rtx callarg1
,
9236 rtx pop
, bool sibcall
)
9239 rtx use
= NULL
, call
;
9240 unsigned int vec_len
= 0;
9243 if (GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
9245 fndecl
= SYMBOL_REF_DECL (XEXP (fnaddr
, 0));
9247 && (lookup_attribute ("interrupt",
9248 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
)))))
9249 error ("interrupt service routine cannot be called directly");
9254 if (pop
== const0_rtx
)
9256 gcc_assert (!TARGET_64BIT
|| !pop
);
9258 rtx addr
= XEXP (fnaddr
, 0);
9259 if (TARGET_MACHO
&& !TARGET_64BIT
)
9262 if (flag_pic
&& GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
)
9263 fnaddr
= machopic_indirect_call_target (fnaddr
);
9268 /* Static functions and indirect calls don't need the pic register. Also,
9269 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
9270 it an indirect call. */
9272 && GET_CODE (addr
) == SYMBOL_REF
9273 && ix86_call_use_plt_p (addr
))
9276 && (SYMBOL_REF_DECL (addr
) == NULL_TREE
9277 || !lookup_attribute ("noplt",
9278 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr
)))))
9281 || (ix86_cmodel
== CM_LARGE_PIC
9282 && DEFAULT_ABI
!= MS_ABI
))
9284 use_reg (&use
, gen_rtx_REG (Pmode
,
9285 REAL_PIC_OFFSET_TABLE_REGNUM
));
9286 if (ix86_use_pseudo_pic_reg ())
9287 emit_move_insn (gen_rtx_REG (Pmode
,
9288 REAL_PIC_OFFSET_TABLE_REGNUM
),
9289 pic_offset_table_rtx
);
9292 else if (!TARGET_PECOFF
&& !TARGET_MACHO
)
9295 && ix86_cmodel
== CM_LARGE_PIC
9296 && DEFAULT_ABI
!= MS_ABI
)
9298 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
9300 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
9301 fnaddr
= force_reg (Pmode
, fnaddr
);
9302 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
, fnaddr
);
9304 else if (TARGET_64BIT
)
9306 fnaddr
= gen_rtx_UNSPEC (Pmode
,
9307 gen_rtvec (1, addr
),
9309 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
9313 fnaddr
= gen_rtx_UNSPEC (Pmode
, gen_rtvec (1, addr
),
9315 fnaddr
= gen_rtx_CONST (Pmode
, fnaddr
);
9316 fnaddr
= gen_rtx_PLUS (Pmode
, pic_offset_table_rtx
,
9319 fnaddr
= gen_const_mem (Pmode
, fnaddr
);
9320 /* Pmode may not be the same as word_mode for x32, which
9321 doesn't support indirect branch via 32-bit memory slot.
9322 Since x32 GOT slot is 64 bit with zero upper 32 bits,
9323 indirect branch via x32 GOT slot is OK. */
9324 if (GET_MODE (fnaddr
) != word_mode
)
9325 fnaddr
= gen_rtx_ZERO_EXTEND (word_mode
, fnaddr
);
9326 fnaddr
= gen_rtx_MEM (QImode
, fnaddr
);
9331 /* Skip setting up RAX register for -mskip-rax-setup when there are no
9332 parameters passed in vector registers. */
9334 && (INTVAL (callarg2
) > 0
9335 || (INTVAL (callarg2
) == 0
9336 && (TARGET_SSE
|| !flag_skip_rax_setup
))))
9338 rtx al
= gen_rtx_REG (QImode
, AX_REG
);
9339 emit_move_insn (al
, callarg2
);
9343 if (ix86_cmodel
== CM_LARGE_PIC
9346 && GET_CODE (XEXP (fnaddr
, 0)) == SYMBOL_REF
9347 && !local_symbolic_operand (XEXP (fnaddr
, 0), VOIDmode
))
9348 fnaddr
= gen_rtx_MEM (QImode
, construct_plt_address (XEXP (fnaddr
, 0)));
9349 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
9350 branch via x32 GOT slot is OK. */
9351 else if (!(TARGET_X32
9353 && GET_CODE (XEXP (fnaddr
, 0)) == ZERO_EXTEND
9354 && GOT_memory_operand (XEXP (XEXP (fnaddr
, 0), 0), Pmode
))
9356 ? !sibcall_insn_operand (XEXP (fnaddr
, 0), word_mode
)
9357 : !call_insn_operand (XEXP (fnaddr
, 0), word_mode
)))
9359 fnaddr
= convert_to_mode (word_mode
, XEXP (fnaddr
, 0), 1);
9360 fnaddr
= gen_rtx_MEM (QImode
, copy_to_mode_reg (word_mode
, fnaddr
));
9363 call
= gen_rtx_CALL (VOIDmode
, fnaddr
, callarg1
);
9366 call
= gen_rtx_SET (retval
, call
);
9367 vec
[vec_len
++] = call
;
9371 pop
= gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, pop
);
9372 pop
= gen_rtx_SET (stack_pointer_rtx
, pop
);
9373 vec
[vec_len
++] = pop
;
9376 if (cfun
->machine
->no_caller_saved_registers
9378 || (!TREE_THIS_VOLATILE (fndecl
)
9379 && !lookup_attribute ("no_caller_saved_registers",
9380 TYPE_ATTRIBUTES (TREE_TYPE (fndecl
))))))
9382 static const char ix86_call_used_regs
[] = CALL_USED_REGISTERS
;
9383 bool is_64bit_ms_abi
= (TARGET_64BIT
9384 && ix86_function_abi (fndecl
) == MS_ABI
);
9385 char c_mask
= CALL_USED_REGISTERS_MASK (is_64bit_ms_abi
);
9387 /* If there are no caller-saved registers, add all registers
9388 that are clobbered by the call which returns. */
9389 for (int i
= 0; i
< FIRST_PSEUDO_REGISTER
; i
++)
9391 && (ix86_call_used_regs
[i
] == 1
9392 || (ix86_call_used_regs
[i
] & c_mask
))
9393 && !STACK_REGNO_P (i
)
9394 && !MMX_REGNO_P (i
))
9396 gen_rtx_REG (GET_MODE (regno_reg_rtx
[i
]), i
));
9398 else if (TARGET_64BIT_MS_ABI
9399 && (!callarg2
|| INTVAL (callarg2
) != -2))
9403 for (i
= 0; i
< NUM_X86_64_MS_CLOBBERED_REGS
; i
++)
9405 int regno
= x86_64_ms_sysv_extra_clobbered_registers
[i
];
9406 machine_mode mode
= SSE_REGNO_P (regno
) ? TImode
: DImode
;
9408 clobber_reg (&use
, gen_rtx_REG (mode
, regno
));
9411 /* Set here, but it may get cleared later. */
9412 if (TARGET_CALL_MS2SYSV_XLOGUES
)
9417 /* Don't break hot-patched functions. */
9418 else if (ix86_function_ms_hook_prologue (current_function_decl
))
9421 /* TODO: Cases not yet examined. */
9422 else if (flag_split_stack
)
9423 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
9427 gcc_assert (!reload_completed
);
9428 cfun
->machine
->call_ms2sysv
= true;
9433 if (TARGET_MACHO
&& TARGET_64BIT
&& !sibcall
9434 && ((GET_CODE (addr
) == SYMBOL_REF
&& !SYMBOL_REF_LOCAL_P (addr
))
9435 || !fndecl
|| TREE_PUBLIC (fndecl
)))
9437 /* We allow public functions defined in a TU to bind locally for PIC
9438 code (the default) on 64bit Mach-O.
9439 If such functions are not inlined, we cannot tell at compile-time if
9440 they will be called via the lazy symbol resolver (this can depend on
9441 options given at link-time). Therefore, we must assume that the lazy
9442 resolver could be used which clobbers R11 and R10. */
9443 clobber_reg (&use
, gen_rtx_REG (DImode
, R11_REG
));
9444 clobber_reg (&use
, gen_rtx_REG (DImode
, R10_REG
));
9448 call
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec_v (vec_len
, vec
));
9449 rtx_insn
*call_insn
= emit_call_insn (call
);
9451 CALL_INSN_FUNCTION_USAGE (call_insn
) = use
;
/* Split a simple return with popping POPC bytes from the stack into an
   indirect branch with stack adjustment.  */

void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now the return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
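/* Roughly, the split above turns a "ret $N" into (illustrative):

     popl  %ecx        # return address into %ecx
     addl  $N, %esp    # drop the N bytes of callee-popped arguments
     jmp   *%ecx       # return through the register

   with REG_CFA_* notes attached so the unwinder keeps tracking the CFA
   and the saved return address across the two stack adjustments.  */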
/* Errors in the source file can cause expand_expr to return const0_rtx
   where we expect a vector.  To avoid crashing, use one of the vector
   clear instructions.  */

static rtx
safe_vector_operand (rtx x, machine_mode mode)
{
  if (x == const0_rtx)
    x = CONST0_RTX (mode);
  return x;
}
/* Subroutine of ix86_expand_builtin to take care of binop insns.  */

static rtx
ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;
  machine_mode mode1 = insn_data[icode].operand[2].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (GET_MODE (op1) == SImode && mode1 == TImode)
    {
      rtx x = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_loadd (x, op1));
      op1 = gen_lowpart (TImode, x);
    }

  if (!insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if (!insn_data[icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;

  emit_insn (pat);

  return target;
}
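/* Illustrative walk-through (hypothetical example): a two-operand builtin
   that ix86_expand_args_builtin dispatches here (one of its
   V8HI_FTYPE_V8HI_V8HI style cases, say a packed add) is handled by
   expanding both call arguments, loading a SImode second operand into a
   TImode value via gen_sse2_loadd when the pattern expects TImode (as some
   shift-count operands do), copying any operand the insn predicates reject
   into a fresh register, and finally emitting one instance of the named
   pattern via GEN_FCN (icode) into a new TMODE result register.  */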
9546 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
9549 ix86_expand_multi_arg_builtin (enum insn_code icode
, tree exp
, rtx target
,
9550 enum ix86_builtin_func_type m_type
,
9551 enum rtx_code sub_code
)
9554 unsigned int i
, nargs
;
9555 bool comparison_p
= false;
9557 bool last_arg_constant
= false;
9561 machine_mode tmode
= insn_data
[icode
].operand
[0].mode
;
9565 case MULTI_ARG_4_DF2_DI_I
:
9566 case MULTI_ARG_4_DF2_DI_I1
:
9567 case MULTI_ARG_4_SF2_SI_I
:
9568 case MULTI_ARG_4_SF2_SI_I1
:
9570 last_arg_constant
= true;
9573 case MULTI_ARG_3_SF
:
9574 case MULTI_ARG_3_DF
:
9575 case MULTI_ARG_3_SF2
:
9576 case MULTI_ARG_3_DF2
:
9577 case MULTI_ARG_3_DI
:
9578 case MULTI_ARG_3_SI
:
9579 case MULTI_ARG_3_SI_DI
:
9580 case MULTI_ARG_3_HI
:
9581 case MULTI_ARG_3_HI_SI
:
9582 case MULTI_ARG_3_QI
:
9583 case MULTI_ARG_3_DI2
:
9584 case MULTI_ARG_3_SI2
:
9585 case MULTI_ARG_3_HI2
:
9586 case MULTI_ARG_3_QI2
:
9590 case MULTI_ARG_2_SF
:
9591 case MULTI_ARG_2_DF
:
9592 case MULTI_ARG_2_DI
:
9593 case MULTI_ARG_2_SI
:
9594 case MULTI_ARG_2_HI
:
9595 case MULTI_ARG_2_QI
:
9599 case MULTI_ARG_2_DI_IMM
:
9600 case MULTI_ARG_2_SI_IMM
:
9601 case MULTI_ARG_2_HI_IMM
:
9602 case MULTI_ARG_2_QI_IMM
:
9604 last_arg_constant
= true;
9607 case MULTI_ARG_1_SF
:
9608 case MULTI_ARG_1_DF
:
9609 case MULTI_ARG_1_SF2
:
9610 case MULTI_ARG_1_DF2
:
9611 case MULTI_ARG_1_DI
:
9612 case MULTI_ARG_1_SI
:
9613 case MULTI_ARG_1_HI
:
9614 case MULTI_ARG_1_QI
:
9615 case MULTI_ARG_1_SI_DI
:
9616 case MULTI_ARG_1_HI_DI
:
9617 case MULTI_ARG_1_HI_SI
:
9618 case MULTI_ARG_1_QI_DI
:
9619 case MULTI_ARG_1_QI_SI
:
9620 case MULTI_ARG_1_QI_HI
:
9624 case MULTI_ARG_2_DI_CMP
:
9625 case MULTI_ARG_2_SI_CMP
:
9626 case MULTI_ARG_2_HI_CMP
:
9627 case MULTI_ARG_2_QI_CMP
:
9629 comparison_p
= true;
9632 case MULTI_ARG_2_SF_TF
:
9633 case MULTI_ARG_2_DF_TF
:
9634 case MULTI_ARG_2_DI_TF
:
9635 case MULTI_ARG_2_SI_TF
:
9636 case MULTI_ARG_2_HI_TF
:
9637 case MULTI_ARG_2_QI_TF
:
9646 if (optimize
|| !target
9647 || GET_MODE (target
) != tmode
9648 || !insn_data
[icode
].operand
[0].predicate (target
, tmode
))
9649 target
= gen_reg_rtx (tmode
);
9650 else if (memory_operand (target
, tmode
))
9653 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
9655 for (i
= 0; i
< nargs
; i
++)
9657 tree arg
= CALL_EXPR_ARG (exp
, i
);
9658 rtx op
= expand_normal (arg
);
9659 int adjust
= (comparison_p
) ? 1 : 0;
9660 machine_mode mode
= insn_data
[icode
].operand
[i
+adjust
+1].mode
;
9662 if (last_arg_constant
&& i
== nargs
- 1)
9664 if (!insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
))
9666 enum insn_code new_icode
= icode
;
9669 case CODE_FOR_xop_vpermil2v2df3
:
9670 case CODE_FOR_xop_vpermil2v4sf3
:
9671 case CODE_FOR_xop_vpermil2v4df3
:
9672 case CODE_FOR_xop_vpermil2v8sf3
:
9673 error ("the last argument must be a 2-bit immediate");
9674 return gen_reg_rtx (tmode
);
9675 case CODE_FOR_xop_rotlv2di3
:
9676 new_icode
= CODE_FOR_rotlv2di3
;
9678 case CODE_FOR_xop_rotlv4si3
:
9679 new_icode
= CODE_FOR_rotlv4si3
;
9681 case CODE_FOR_xop_rotlv8hi3
:
9682 new_icode
= CODE_FOR_rotlv8hi3
;
9684 case CODE_FOR_xop_rotlv16qi3
:
9685 new_icode
= CODE_FOR_rotlv16qi3
;
9687 if (CONST_INT_P (op
))
9689 int mask
= GET_MODE_UNIT_BITSIZE (tmode
) - 1;
9690 op
= GEN_INT (INTVAL (op
) & mask
);
9692 (insn_data
[icode
].operand
[i
+ 1].predicate (op
, mode
));
9698 && insn_data
[new_icode
].operand
[0].mode
== tmode
9699 && insn_data
[new_icode
].operand
[1].mode
== tmode
9700 && insn_data
[new_icode
].operand
[2].mode
== mode
9701 && insn_data
[new_icode
].operand
[0].predicate
9702 == insn_data
[icode
].operand
[0].predicate
9703 && insn_data
[new_icode
].operand
[1].predicate
9704 == insn_data
[icode
].operand
[1].predicate
);
9717 if (VECTOR_MODE_P (mode
))
9718 op
= safe_vector_operand (op
, mode
);
9720 /* If we aren't optimizing, only allow one memory operand to be
9722 if (memory_operand (op
, mode
))
9725 gcc_assert (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
);
9728 || !insn_data
[icode
].operand
[i
+adjust
+1].predicate (op
, mode
)
9730 op
= force_reg (mode
, op
);
9739 pat
= GEN_FCN (icode
) (target
, xops
[0]);
9744 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
9745 GEN_INT ((int)sub_code
));
9746 else if (! comparison_p
)
9747 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
9750 rtx cmp_op
= gen_rtx_fmt_ee (sub_code
, GET_MODE (target
),
9753 pat
= GEN_FCN (icode
) (target
, cmp_op
, xops
[0], xops
[1]);
9758 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
9762 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2], xops
[3]);
/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   insns with vec_merge.  */

static rtx
ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
				    rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = op0;
  if (!insn_data[icode].operand[2].predicate (op1, mode0))
    op1 = copy_to_mode_reg (mode0, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_expand_builtin to take care of comparison insns.  */

static rtx
ix86_expand_sse_compare (const struct builtin_description *d,
			 tree exp, rtx target, bool swap)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
  if (swap)
    std::swap (op0, op1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
/* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
   ordered EQ or unordered NE: generate the PF jump.  */

static rtx
ix86_ssecom_setcc (const enum rtx_code comparison,
		   bool check_unordered, machine_mode mode,
		   rtx set_dst, rtx target)
{
  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, checking ZF alone isn't sufficient
     with NaN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is a subset of
     CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst, const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
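/* Concrete motivation for the UNORDERED jump above (illustrative):
   comis/ucomis report an unordered result as ZF=PF=CF=1, so with a NaN
   operand a bare "sete" would claim equality and a bare "setne" would deny
   inequality.  When CHECK_UNORDERED is set the branch skips the
   flag-to-register copy, leaving TARGET at its seeded value (const0_rtx by
   default; the caller flips it to const1_rtx for the complementary test),
   which is the correct answer for an unordered comparison.  */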
9906 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
9909 ix86_expand_sse_comi (const struct builtin_description
*d
, tree exp
,
9913 tree arg0
= CALL_EXPR_ARG (exp
, 0);
9914 tree arg1
= CALL_EXPR_ARG (exp
, 1);
9915 rtx op0
= expand_normal (arg0
);
9916 rtx op1
= expand_normal (arg1
);
9917 enum insn_code icode
= d
->icode
;
9918 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
9919 machine_mode mode0
= insn_p
->operand
[0].mode
;
9920 machine_mode mode1
= insn_p
->operand
[1].mode
;
9922 if (VECTOR_MODE_P (mode0
))
9923 op0
= safe_vector_operand (op0
, mode0
);
9924 if (VECTOR_MODE_P (mode1
))
9925 op1
= safe_vector_operand (op1
, mode1
);
9927 enum rtx_code comparison
= d
->comparison
;
9928 rtx const_val
= const0_rtx
;
9930 bool check_unordered
= false;
9931 machine_mode mode
= CCFPmode
;
9934 case LE
: /* -> GE */
9935 case LT
: /* -> GT */
9936 std::swap (op0
, op1
);
9937 comparison
= swap_condition (comparison
);
9943 check_unordered
= true;
9947 check_unordered
= true;
9949 const_val
= const1_rtx
;
9955 target
= gen_reg_rtx (SImode
);
9956 emit_move_insn (target
, const_val
);
9957 target
= gen_rtx_SUBREG (QImode
, target
, 0);
9959 if ((optimize
&& !register_operand (op0
, mode0
))
9960 || !insn_p
->operand
[0].predicate (op0
, mode0
))
9961 op0
= copy_to_mode_reg (mode0
, op0
);
9962 if ((optimize
&& !register_operand (op1
, mode1
))
9963 || !insn_p
->operand
[1].predicate (op1
, mode1
))
9964 op1
= copy_to_mode_reg (mode1
, op1
);
9966 pat
= GEN_FCN (icode
) (op0
, op1
);
9970 set_dst
= SET_DEST (pat
);
9972 return ix86_ssecom_setcc (comparison
, check_unordered
, mode
,
/* Subroutines of ix86_expand_args_builtin to take care of round insns.  */

static rtx
ix86_expand_sse_round (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op1, op0 = expand_normal (arg0);
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;

  if (optimize || target == 0
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);

  op1 = GEN_INT (d->comparison);

  pat = GEN_FCN (d->icode) (target, op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
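/* Note (illustrative): for the *_ROUND builtins dispatched here the
   builtin_description's COMPARISON field does not name an rtx comparison;
   it is reused to carry the immediate operand (e.g. the rounding control),
   which is why it is simply wrapped in GEN_INT and passed as the pattern's
   last operand above.  */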
10010 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description
*d
,
10011 tree exp
, rtx target
)
10014 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10015 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10016 rtx op0
= expand_normal (arg0
);
10017 rtx op1
= expand_normal (arg1
);
10019 machine_mode tmode
= insn_data
[d
->icode
].operand
[0].mode
;
10020 machine_mode mode0
= insn_data
[d
->icode
].operand
[1].mode
;
10021 machine_mode mode1
= insn_data
[d
->icode
].operand
[2].mode
;
10023 if (optimize
|| target
== 0
10024 || GET_MODE (target
) != tmode
10025 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode
))
10026 target
= gen_reg_rtx (tmode
);
10028 op0
= safe_vector_operand (op0
, mode0
);
10029 op1
= safe_vector_operand (op1
, mode1
);
10031 if ((optimize
&& !register_operand (op0
, mode0
))
10032 || !insn_data
[d
->icode
].operand
[0].predicate (op0
, mode0
))
10033 op0
= copy_to_mode_reg (mode0
, op0
);
10034 if ((optimize
&& !register_operand (op1
, mode1
))
10035 || !insn_data
[d
->icode
].operand
[1].predicate (op1
, mode1
))
10036 op1
= copy_to_mode_reg (mode1
, op1
);
10038 op2
= GEN_INT (d
->comparison
);
10040 pat
= GEN_FCN (d
->icode
) (target
, op0
, op1
, op2
);
/* Subroutine of ix86_expand_builtin to take care of ptest insns.  */

static rtx
ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  gen_rtx_REG (CCmode, FLAGS_REG),
					  const0_rtx)));

  return SUBREG_REG (target);
}
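/* Illustrative mapping (hypothetical example): the ptest/vtestp* patterns
   only set the flags, so a builtin such as __builtin_ia32_ptestz128 (a, b)
   is materialized roughly as

     ptest  %xmm1, %xmm0
     sete   %al

   i.e. the requested flag is read back with a setcc into the low byte of a
   zero-initialized SImode register, and that register is the builtin's int
   result.  */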
10090 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
10093 ix86_expand_sse_pcmpestr (const struct builtin_description
*d
,
10094 tree exp
, rtx target
)
10097 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10098 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10099 tree arg2
= CALL_EXPR_ARG (exp
, 2);
10100 tree arg3
= CALL_EXPR_ARG (exp
, 3);
10101 tree arg4
= CALL_EXPR_ARG (exp
, 4);
10102 rtx scratch0
, scratch1
;
10103 rtx op0
= expand_normal (arg0
);
10104 rtx op1
= expand_normal (arg1
);
10105 rtx op2
= expand_normal (arg2
);
10106 rtx op3
= expand_normal (arg3
);
10107 rtx op4
= expand_normal (arg4
);
10108 machine_mode tmode0
, tmode1
, modev2
, modei3
, modev4
, modei5
, modeimm
;
10110 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
10111 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
10112 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
10113 modei3
= insn_data
[d
->icode
].operand
[3].mode
;
10114 modev4
= insn_data
[d
->icode
].operand
[4].mode
;
10115 modei5
= insn_data
[d
->icode
].operand
[5].mode
;
10116 modeimm
= insn_data
[d
->icode
].operand
[6].mode
;
10118 if (VECTOR_MODE_P (modev2
))
10119 op0
= safe_vector_operand (op0
, modev2
);
10120 if (VECTOR_MODE_P (modev4
))
10121 op2
= safe_vector_operand (op2
, modev4
);
10123 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
10124 op0
= copy_to_mode_reg (modev2
, op0
);
10125 if (!insn_data
[d
->icode
].operand
[3].predicate (op1
, modei3
))
10126 op1
= copy_to_mode_reg (modei3
, op1
);
10127 if ((optimize
&& !register_operand (op2
, modev4
))
10128 || !insn_data
[d
->icode
].operand
[4].predicate (op2
, modev4
))
10129 op2
= copy_to_mode_reg (modev4
, op2
);
10130 if (!insn_data
[d
->icode
].operand
[5].predicate (op3
, modei5
))
10131 op3
= copy_to_mode_reg (modei5
, op3
);
10133 if (!insn_data
[d
->icode
].operand
[6].predicate (op4
, modeimm
))
10135 error ("the fifth argument must be an 8-bit immediate");
10139 if (d
->code
== IX86_BUILTIN_PCMPESTRI128
)
10141 if (optimize
|| !target
10142 || GET_MODE (target
) != tmode0
10143 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
10144 target
= gen_reg_rtx (tmode0
);
10146 scratch1
= gen_reg_rtx (tmode1
);
10148 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
, op3
, op4
);
10150 else if (d
->code
== IX86_BUILTIN_PCMPESTRM128
)
10152 if (optimize
|| !target
10153 || GET_MODE (target
) != tmode1
10154 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
10155 target
= gen_reg_rtx (tmode1
);
10157 scratch0
= gen_reg_rtx (tmode0
);
10159 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
, op3
, op4
);
10163 gcc_assert (d
->flag
);
10165 scratch0
= gen_reg_rtx (tmode0
);
10166 scratch1
= gen_reg_rtx (tmode1
);
10168 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
, op3
, op4
);
10178 target
= gen_reg_rtx (SImode
);
10179 emit_move_insn (target
, const0_rtx
);
10180 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10183 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10184 gen_rtx_fmt_ee (EQ
, QImode
,
10185 gen_rtx_REG ((machine_mode
) d
->flag
,
10188 return SUBREG_REG (target
);
10195 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
10198 ix86_expand_sse_pcmpistr (const struct builtin_description
*d
,
10199 tree exp
, rtx target
)
10202 tree arg0
= CALL_EXPR_ARG (exp
, 0);
10203 tree arg1
= CALL_EXPR_ARG (exp
, 1);
10204 tree arg2
= CALL_EXPR_ARG (exp
, 2);
10205 rtx scratch0
, scratch1
;
10206 rtx op0
= expand_normal (arg0
);
10207 rtx op1
= expand_normal (arg1
);
10208 rtx op2
= expand_normal (arg2
);
10209 machine_mode tmode0
, tmode1
, modev2
, modev3
, modeimm
;
10211 tmode0
= insn_data
[d
->icode
].operand
[0].mode
;
10212 tmode1
= insn_data
[d
->icode
].operand
[1].mode
;
10213 modev2
= insn_data
[d
->icode
].operand
[2].mode
;
10214 modev3
= insn_data
[d
->icode
].operand
[3].mode
;
10215 modeimm
= insn_data
[d
->icode
].operand
[4].mode
;
10217 if (VECTOR_MODE_P (modev2
))
10218 op0
= safe_vector_operand (op0
, modev2
);
10219 if (VECTOR_MODE_P (modev3
))
10220 op1
= safe_vector_operand (op1
, modev3
);
10222 if (!insn_data
[d
->icode
].operand
[2].predicate (op0
, modev2
))
10223 op0
= copy_to_mode_reg (modev2
, op0
);
10224 if ((optimize
&& !register_operand (op1
, modev3
))
10225 || !insn_data
[d
->icode
].operand
[3].predicate (op1
, modev3
))
10226 op1
= copy_to_mode_reg (modev3
, op1
);
10228 if (!insn_data
[d
->icode
].operand
[4].predicate (op2
, modeimm
))
10230 error ("the third argument must be an 8-bit immediate");
10234 if (d
->code
== IX86_BUILTIN_PCMPISTRI128
)
10236 if (optimize
|| !target
10237 || GET_MODE (target
) != tmode0
10238 || !insn_data
[d
->icode
].operand
[0].predicate (target
, tmode0
))
10239 target
= gen_reg_rtx (tmode0
);
10241 scratch1
= gen_reg_rtx (tmode1
);
10243 pat
= GEN_FCN (d
->icode
) (target
, scratch1
, op0
, op1
, op2
);
10245 else if (d
->code
== IX86_BUILTIN_PCMPISTRM128
)
10247 if (optimize
|| !target
10248 || GET_MODE (target
) != tmode1
10249 || !insn_data
[d
->icode
].operand
[1].predicate (target
, tmode1
))
10250 target
= gen_reg_rtx (tmode1
);
10252 scratch0
= gen_reg_rtx (tmode0
);
10254 pat
= GEN_FCN (d
->icode
) (scratch0
, target
, op0
, op1
, op2
);
10258 gcc_assert (d
->flag
);
10260 scratch0
= gen_reg_rtx (tmode0
);
10261 scratch1
= gen_reg_rtx (tmode1
);
10263 pat
= GEN_FCN (d
->icode
) (scratch0
, scratch1
, op0
, op1
, op2
);
10273 target
= gen_reg_rtx (SImode
);
10274 emit_move_insn (target
, const0_rtx
);
10275 target
= gen_rtx_SUBREG (QImode
, target
, 0);
10278 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
10279 gen_rtx_fmt_ee (EQ
, QImode
,
10280 gen_rtx_REG ((machine_mode
) d
->flag
,
10283 return SUBREG_REG (target
);
/* Fixup modeless constants to fit required mode.  */

static rtx
fixup_modeless_constant (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == VOIDmode)
    x = convert_to_mode (mode, x, 1);
  return x;
}
10299 /* Subroutine of ix86_expand_builtin to take care of insns with
10300 variable number of operands. */
10303 ix86_expand_args_builtin (const struct builtin_description
*d
,
10304 tree exp
, rtx target
)
10306 rtx pat
, real_target
;
10307 unsigned int i
, nargs
;
10308 unsigned int nargs_constant
= 0;
10309 unsigned int mask_pos
= 0;
10310 int num_memory
= 0;
10312 bool second_arg_count
= false;
10313 enum insn_code icode
= d
->icode
;
10314 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
10315 machine_mode tmode
= insn_p
->operand
[0].mode
;
10316 machine_mode rmode
= VOIDmode
;
10318 enum rtx_code comparison
= d
->comparison
;
10320 switch ((enum ix86_builtin_func_type
) d
->flag
)
10322 case V2DF_FTYPE_V2DF_ROUND
:
10323 case V4DF_FTYPE_V4DF_ROUND
:
10324 case V8DF_FTYPE_V8DF_ROUND
:
10325 case V4SF_FTYPE_V4SF_ROUND
:
10326 case V8SF_FTYPE_V8SF_ROUND
:
10327 case V16SF_FTYPE_V16SF_ROUND
:
10328 case V8HF_FTYPE_V8HF_ROUND
:
10329 case V16HF_FTYPE_V16HF_ROUND
:
10330 case V32HF_FTYPE_V32HF_ROUND
:
10331 case V4SI_FTYPE_V4SF_ROUND
:
10332 case V8SI_FTYPE_V8SF_ROUND
:
10333 case V16SI_FTYPE_V16SF_ROUND
:
10334 return ix86_expand_sse_round (d
, exp
, target
);
10335 case V4SI_FTYPE_V2DF_V2DF_ROUND
:
10336 case V8SI_FTYPE_V4DF_V4DF_ROUND
:
10337 case V16SI_FTYPE_V8DF_V8DF_ROUND
:
10338 return ix86_expand_sse_round_vec_pack_sfix (d
, exp
, target
);
10339 case INT_FTYPE_V8SF_V8SF_PTEST
:
10340 case INT_FTYPE_V4DI_V4DI_PTEST
:
10341 case INT_FTYPE_V4DF_V4DF_PTEST
:
10342 case INT_FTYPE_V4SF_V4SF_PTEST
:
10343 case INT_FTYPE_V2DI_V2DI_PTEST
:
10344 case INT_FTYPE_V2DF_V2DF_PTEST
:
10345 return ix86_expand_sse_ptest (d
, exp
, target
);
10346 case FLOAT128_FTYPE_FLOAT128
:
10347 case FLOAT_FTYPE_FLOAT
:
10348 case INT_FTYPE_INT
:
10349 case UINT_FTYPE_UINT
:
10350 case UINT16_FTYPE_UINT16
:
10351 case UINT64_FTYPE_INT
:
10352 case UINT64_FTYPE_UINT64
:
10353 case INT64_FTYPE_INT64
:
10354 case INT64_FTYPE_V4SF
:
10355 case INT64_FTYPE_V2DF
:
10356 case INT_FTYPE_V16QI
:
10357 case INT_FTYPE_V8QI
:
10358 case INT_FTYPE_V8SF
:
10359 case INT_FTYPE_V4DF
:
10360 case INT_FTYPE_V4SF
:
10361 case INT_FTYPE_V2DF
:
10362 case INT_FTYPE_V32QI
:
10363 case V16QI_FTYPE_V16QI
:
10364 case V8SI_FTYPE_V8SF
:
10365 case V8SI_FTYPE_V4SI
:
10366 case V8HI_FTYPE_V8HI
:
10367 case V8HI_FTYPE_V16QI
:
10368 case V8QI_FTYPE_V8QI
:
10369 case V8SF_FTYPE_V8SF
:
10370 case V8SF_FTYPE_V8SI
:
10371 case V8SF_FTYPE_V4SF
:
10372 case V8SF_FTYPE_V8HI
:
10373 case V4SI_FTYPE_V4SI
:
10374 case V4SI_FTYPE_V16QI
:
10375 case V4SI_FTYPE_V4SF
:
10376 case V4SI_FTYPE_V8SI
:
10377 case V4SI_FTYPE_V8HI
:
10378 case V4SI_FTYPE_V4DF
:
10379 case V4SI_FTYPE_V2DF
:
10380 case V4HI_FTYPE_V4HI
:
10381 case V4DF_FTYPE_V4DF
:
10382 case V4DF_FTYPE_V4SI
:
10383 case V4DF_FTYPE_V4SF
:
10384 case V4DF_FTYPE_V2DF
:
10385 case V4SF_FTYPE_V4SF
:
10386 case V4SF_FTYPE_V4SI
:
10387 case V4SF_FTYPE_V8SF
:
10388 case V4SF_FTYPE_V4DF
:
10389 case V4SF_FTYPE_V8HI
:
10390 case V4SF_FTYPE_V2DF
:
10391 case V2DI_FTYPE_V2DI
:
10392 case V2DI_FTYPE_V16QI
:
10393 case V2DI_FTYPE_V8HI
:
10394 case V2DI_FTYPE_V4SI
:
10395 case V2DF_FTYPE_V2DF
:
10396 case V2DF_FTYPE_V4SI
:
10397 case V2DF_FTYPE_V4DF
:
10398 case V2DF_FTYPE_V4SF
:
10399 case V2DF_FTYPE_V2SI
:
10400 case V2SI_FTYPE_V2SI
:
10401 case V2SI_FTYPE_V4SF
:
10402 case V2SI_FTYPE_V2SF
:
10403 case V2SI_FTYPE_V2DF
:
10404 case V2SF_FTYPE_V2SF
:
10405 case V2SF_FTYPE_V2SI
:
10406 case V32QI_FTYPE_V32QI
:
10407 case V32QI_FTYPE_V16QI
:
10408 case V16HI_FTYPE_V16HI
:
10409 case V16HI_FTYPE_V8HI
:
10410 case V8SI_FTYPE_V8SI
:
10411 case V16HI_FTYPE_V16QI
:
10412 case V8SI_FTYPE_V16QI
:
10413 case V4DI_FTYPE_V16QI
:
10414 case V8SI_FTYPE_V8HI
:
10415 case V4DI_FTYPE_V8HI
:
10416 case V4DI_FTYPE_V4SI
:
10417 case V4DI_FTYPE_V2DI
:
10418 case UQI_FTYPE_UQI
:
10419 case UHI_FTYPE_UHI
:
10420 case USI_FTYPE_USI
:
10421 case USI_FTYPE_UQI
:
10422 case USI_FTYPE_UHI
:
10423 case UDI_FTYPE_UDI
:
10424 case UHI_FTYPE_V16QI
:
10425 case USI_FTYPE_V32QI
:
10426 case UDI_FTYPE_V64QI
:
10427 case V16QI_FTYPE_UHI
:
10428 case V32QI_FTYPE_USI
:
10429 case V64QI_FTYPE_UDI
:
10430 case V8HI_FTYPE_UQI
:
10431 case V16HI_FTYPE_UHI
:
10432 case V32HI_FTYPE_USI
:
10433 case V4SI_FTYPE_UQI
:
10434 case V8SI_FTYPE_UQI
:
10435 case V4SI_FTYPE_UHI
:
10436 case V8SI_FTYPE_UHI
:
10437 case UQI_FTYPE_V8HI
:
10438 case UHI_FTYPE_V16HI
:
10439 case USI_FTYPE_V32HI
:
10440 case UQI_FTYPE_V4SI
:
10441 case UQI_FTYPE_V8SI
:
10442 case UHI_FTYPE_V16SI
:
10443 case UQI_FTYPE_V2DI
:
10444 case UQI_FTYPE_V4DI
:
10445 case UQI_FTYPE_V8DI
:
10446 case V16SI_FTYPE_UHI
:
10447 case V2DI_FTYPE_UQI
:
10448 case V4DI_FTYPE_UQI
:
10449 case V16SI_FTYPE_INT
:
10450 case V16SF_FTYPE_V8SF
:
10451 case V16SI_FTYPE_V8SI
:
10452 case V16SF_FTYPE_V4SF
:
10453 case V16SI_FTYPE_V4SI
:
10454 case V16SI_FTYPE_V16SF
:
10455 case V16SI_FTYPE_V16SI
:
10456 case V64QI_FTYPE_V64QI
:
10457 case V32HI_FTYPE_V32HI
:
10458 case V16SF_FTYPE_V16SF
:
10459 case V8DI_FTYPE_UQI
:
10460 case V8DI_FTYPE_V8DI
:
10461 case V8DF_FTYPE_V4DF
:
10462 case V8DF_FTYPE_V2DF
:
10463 case V8DF_FTYPE_V8DF
:
10464 case V4DI_FTYPE_V4DI
:
10465 case V16BF_FTYPE_V16SF
:
10466 case V8BF_FTYPE_V8SF
:
10467 case V8BF_FTYPE_V4SF
:
10470 case V4SF_FTYPE_V4SF_VEC_MERGE
:
10471 case V2DF_FTYPE_V2DF_VEC_MERGE
:
10472 return ix86_expand_unop_vec_merge_builtin (icode
, exp
, target
);
10473 case FLOAT128_FTYPE_FLOAT128_FLOAT128
:
10474 case V16QI_FTYPE_V16QI_V16QI
:
10475 case V16QI_FTYPE_V8HI_V8HI
:
10476 case V16HF_FTYPE_V16HF_V16HF
:
10477 case V16SF_FTYPE_V16SF_V16SF
:
10478 case V8QI_FTYPE_V8QI_V8QI
:
10479 case V8QI_FTYPE_V4HI_V4HI
:
10480 case V8HI_FTYPE_V8HI_V8HI
:
10481 case V8HI_FTYPE_V16QI_V16QI
:
10482 case V8HI_FTYPE_V4SI_V4SI
:
10483 case V8HF_FTYPE_V8HF_V8HF
:
10484 case V8SF_FTYPE_V8SF_V8SF
:
10485 case V8SF_FTYPE_V8SF_V8SI
:
10486 case V8DF_FTYPE_V8DF_V8DF
:
10487 case V4SI_FTYPE_V4SI_V4SI
:
10488 case V4SI_FTYPE_V8HI_V8HI
:
10489 case V4SI_FTYPE_V2DF_V2DF
:
10490 case V4HI_FTYPE_V4HI_V4HI
:
10491 case V4HI_FTYPE_V8QI_V8QI
:
10492 case V4HI_FTYPE_V2SI_V2SI
:
10493 case V4DF_FTYPE_V4DF_V4DF
:
10494 case V4DF_FTYPE_V4DF_V4DI
:
10495 case V4SF_FTYPE_V4SF_V4SF
:
10496 case V4SF_FTYPE_V4SF_V4SI
:
10497 case V4SF_FTYPE_V4SF_V2SI
:
10498 case V4SF_FTYPE_V4SF_V2DF
:
10499 case V4SF_FTYPE_V4SF_UINT
:
10500 case V4SF_FTYPE_V4SF_DI
:
10501 case V4SF_FTYPE_V4SF_SI
:
10502 case V2DI_FTYPE_V2DI_V2DI
:
10503 case V2DI_FTYPE_V16QI_V16QI
:
10504 case V2DI_FTYPE_V4SI_V4SI
:
10505 case V2DI_FTYPE_V2DI_V16QI
:
10506 case V2SI_FTYPE_V2SI_V2SI
:
10507 case V2SI_FTYPE_V4HI_V4HI
:
10508 case V2SI_FTYPE_V2SF_V2SF
:
10509 case V2DF_FTYPE_V2DF_V2DF
:
10510 case V2DF_FTYPE_V2DF_V4SF
:
10511 case V2DF_FTYPE_V2DF_V2DI
:
10512 case V2DF_FTYPE_V2DF_DI
:
10513 case V2DF_FTYPE_V2DF_SI
:
10514 case V2DF_FTYPE_V2DF_UINT
:
10515 case V2SF_FTYPE_V2SF_V2SF
:
10516 case V1DI_FTYPE_V1DI_V1DI
:
10517 case V1DI_FTYPE_V8QI_V8QI
:
10518 case V1DI_FTYPE_V2SI_V2SI
:
10519 case V32QI_FTYPE_V16HI_V16HI
:
10520 case V16HI_FTYPE_V8SI_V8SI
:
10521 case V64QI_FTYPE_V64QI_V64QI
:
10522 case V32QI_FTYPE_V32QI_V32QI
:
10523 case V16HI_FTYPE_V32QI_V32QI
:
10524 case V16HI_FTYPE_V16HI_V16HI
:
10525 case V8SI_FTYPE_V4DF_V4DF
:
10526 case V8SI_FTYPE_V8SI_V8SI
:
10527 case V8SI_FTYPE_V16HI_V16HI
:
10528 case V4DI_FTYPE_V4DI_V4DI
:
10529 case V4DI_FTYPE_V8SI_V8SI
:
10530 case V4DI_FTYPE_V32QI_V32QI
:
10531 case V8DI_FTYPE_V64QI_V64QI
:
10532 if (comparison
== UNKNOWN
)
10533 return ix86_expand_binop_builtin (icode
, exp
, target
);
10536 case V4SF_FTYPE_V4SF_V4SF_SWAP
:
10537 case V2DF_FTYPE_V2DF_V2DF_SWAP
:
10538 gcc_assert (comparison
!= UNKNOWN
);
10542 case V16HI_FTYPE_V16HI_V8HI_COUNT
:
10543 case V16HI_FTYPE_V16HI_SI_COUNT
:
10544 case V8SI_FTYPE_V8SI_V4SI_COUNT
:
10545 case V8SI_FTYPE_V8SI_SI_COUNT
:
10546 case V4DI_FTYPE_V4DI_V2DI_COUNT
:
10547 case V4DI_FTYPE_V4DI_INT_COUNT
:
10548 case V8HI_FTYPE_V8HI_V8HI_COUNT
:
10549 case V8HI_FTYPE_V8HI_SI_COUNT
:
10550 case V4SI_FTYPE_V4SI_V4SI_COUNT
:
10551 case V4SI_FTYPE_V4SI_SI_COUNT
:
10552 case V4HI_FTYPE_V4HI_V4HI_COUNT
:
10553 case V4HI_FTYPE_V4HI_SI_COUNT
:
10554 case V2DI_FTYPE_V2DI_V2DI_COUNT
:
10555 case V2DI_FTYPE_V2DI_SI_COUNT
:
10556 case V2SI_FTYPE_V2SI_V2SI_COUNT
:
10557 case V2SI_FTYPE_V2SI_SI_COUNT
:
10558 case V1DI_FTYPE_V1DI_V1DI_COUNT
:
10559 case V1DI_FTYPE_V1DI_SI_COUNT
:
10561 second_arg_count
= true;
10563 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT
:
10564 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT
:
10565 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT
:
10566 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT
:
10567 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT
:
10568 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT
:
10569 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT
:
10570 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT
:
10571 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT
:
10572 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT
:
10573 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT
:
10574 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT
:
10575 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT
:
10576 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT
:
10577 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT
:
10578 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT
:
10579 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT
:
10580 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT
:
10582 second_arg_count
= true;
10584 case UINT64_FTYPE_UINT64_UINT64
:
10585 case UINT_FTYPE_UINT_UINT
:
10586 case UINT_FTYPE_UINT_USHORT
:
10587 case UINT_FTYPE_UINT_UCHAR
:
10588 case UINT16_FTYPE_UINT16_INT
:
10589 case UINT8_FTYPE_UINT8_INT
:
10590 case UQI_FTYPE_UQI_UQI
:
10591 case UHI_FTYPE_UHI_UHI
:
10592 case USI_FTYPE_USI_USI
:
10593 case UDI_FTYPE_UDI_UDI
:
10594 case V16SI_FTYPE_V8DF_V8DF
:
10595 case V32BF_FTYPE_V16SF_V16SF
:
10596 case V16BF_FTYPE_V8SF_V8SF
:
10597 case V8BF_FTYPE_V4SF_V4SF
:
10598 case V16BF_FTYPE_V16SF_UHI
:
10599 case V8BF_FTYPE_V8SF_UQI
:
10600 case V8BF_FTYPE_V4SF_UQI
:
10603 case V2DI_FTYPE_V2DI_INT_CONVERT
:
10606 nargs_constant
= 1;
10608 case V4DI_FTYPE_V4DI_INT_CONVERT
:
10611 nargs_constant
= 1;
10613 case V8DI_FTYPE_V8DI_INT_CONVERT
:
10616 nargs_constant
= 1;
10618 case V8HI_FTYPE_V8HI_INT
:
10619 case V8HI_FTYPE_V8SF_INT
:
10620 case V16HI_FTYPE_V16SF_INT
:
10621 case V8HI_FTYPE_V4SF_INT
:
10622 case V8SF_FTYPE_V8SF_INT
:
10623 case V4SF_FTYPE_V16SF_INT
:
10624 case V16SF_FTYPE_V16SF_INT
:
10625 case V4SI_FTYPE_V4SI_INT
:
10626 case V4SI_FTYPE_V8SI_INT
:
10627 case V4HI_FTYPE_V4HI_INT
:
10628 case V4DF_FTYPE_V4DF_INT
:
10629 case V4DF_FTYPE_V8DF_INT
:
10630 case V4SF_FTYPE_V4SF_INT
:
10631 case V4SF_FTYPE_V8SF_INT
:
10632 case V2DI_FTYPE_V2DI_INT
:
10633 case V2DF_FTYPE_V2DF_INT
:
10634 case V2DF_FTYPE_V4DF_INT
:
10635 case V16HI_FTYPE_V16HI_INT
:
10636 case V8SI_FTYPE_V8SI_INT
:
10637 case V16SI_FTYPE_V16SI_INT
:
10638 case V4SI_FTYPE_V16SI_INT
:
10639 case V4DI_FTYPE_V4DI_INT
:
10640 case V2DI_FTYPE_V4DI_INT
:
10641 case V4DI_FTYPE_V8DI_INT
:
10642 case UQI_FTYPE_UQI_UQI_CONST
:
10643 case UHI_FTYPE_UHI_UQI
:
10644 case USI_FTYPE_USI_UQI
:
10645 case UDI_FTYPE_UDI_UQI
:
10647 nargs_constant
= 1;
10649 case V16QI_FTYPE_V16QI_V16QI_V16QI
:
10650 case V8SF_FTYPE_V8SF_V8SF_V8SF
:
10651 case V4DF_FTYPE_V4DF_V4DF_V4DF
:
10652 case V4SF_FTYPE_V4SF_V4SF_V4SF
:
10653 case V2DF_FTYPE_V2DF_V2DF_V2DF
:
10654 case V32QI_FTYPE_V32QI_V32QI_V32QI
:
10655 case UHI_FTYPE_V16SI_V16SI_UHI
:
10656 case UQI_FTYPE_V8DI_V8DI_UQI
:
10657 case V16HI_FTYPE_V16SI_V16HI_UHI
:
10658 case V16QI_FTYPE_V16SI_V16QI_UHI
:
10659 case V16QI_FTYPE_V8DI_V16QI_UQI
:
10660 case V32HF_FTYPE_V32HF_V32HF_USI
:
10661 case V16SF_FTYPE_V16SF_V16SF_UHI
:
10662 case V16SF_FTYPE_V4SF_V16SF_UHI
:
10663 case V16SI_FTYPE_SI_V16SI_UHI
:
10664 case V16SI_FTYPE_V16HI_V16SI_UHI
:
10665 case V16SI_FTYPE_V16QI_V16SI_UHI
:
10666 case V8SF_FTYPE_V4SF_V8SF_UQI
:
10667 case V4DF_FTYPE_V2DF_V4DF_UQI
:
10668 case V8SI_FTYPE_V4SI_V8SI_UQI
:
10669 case V8SI_FTYPE_SI_V8SI_UQI
:
10670 case V4SI_FTYPE_V4SI_V4SI_UQI
:
10671 case V4SI_FTYPE_SI_V4SI_UQI
:
10672 case V4DI_FTYPE_V2DI_V4DI_UQI
:
10673 case V4DI_FTYPE_DI_V4DI_UQI
:
10674 case V2DI_FTYPE_V2DI_V2DI_UQI
:
10675 case V2DI_FTYPE_DI_V2DI_UQI
:
10676 case V64QI_FTYPE_V64QI_V64QI_UDI
:
10677 case V64QI_FTYPE_V16QI_V64QI_UDI
:
10678 case V64QI_FTYPE_QI_V64QI_UDI
:
10679 case V32QI_FTYPE_V32QI_V32QI_USI
:
10680 case V32QI_FTYPE_V16QI_V32QI_USI
:
10681 case V32QI_FTYPE_QI_V32QI_USI
:
10682 case V16QI_FTYPE_V16QI_V16QI_UHI
:
10683 case V16QI_FTYPE_QI_V16QI_UHI
:
10684 case V32HI_FTYPE_V8HI_V32HI_USI
:
10685 case V32HI_FTYPE_HI_V32HI_USI
:
10686 case V16HI_FTYPE_V8HI_V16HI_UHI
:
10687 case V16HI_FTYPE_HI_V16HI_UHI
:
10688 case V8HI_FTYPE_V8HI_V8HI_UQI
:
10689 case V8HI_FTYPE_HI_V8HI_UQI
:
10690 case V16HF_FTYPE_V16HF_V16HF_UHI
:
10691 case V8SF_FTYPE_V8HI_V8SF_UQI
:
10692 case V4SF_FTYPE_V8HI_V4SF_UQI
:
10693 case V8SI_FTYPE_V8HF_V8SI_UQI
:
10694 case V8SF_FTYPE_V8HF_V8SF_UQI
:
10695 case V8SI_FTYPE_V8SF_V8SI_UQI
:
10696 case V4SI_FTYPE_V4SF_V4SI_UQI
:
10697 case V4SI_FTYPE_V8HF_V4SI_UQI
:
10698 case V4SF_FTYPE_V8HF_V4SF_UQI
:
10699 case V4DI_FTYPE_V8HF_V4DI_UQI
:
10700 case V4DI_FTYPE_V4SF_V4DI_UQI
:
10701 case V2DI_FTYPE_V8HF_V2DI_UQI
:
10702 case V2DI_FTYPE_V4SF_V2DI_UQI
:
10703 case V8HF_FTYPE_V8HF_V8HF_UQI
:
10704 case V8HF_FTYPE_V8HF_V8HF_V8HF
:
10705 case V8HF_FTYPE_V8HI_V8HF_UQI
:
10706 case V8HF_FTYPE_V8SI_V8HF_UQI
:
10707 case V8HF_FTYPE_V8SF_V8HF_UQI
:
10708 case V8HF_FTYPE_V4SI_V8HF_UQI
:
10709 case V8HF_FTYPE_V4SF_V8HF_UQI
:
10710 case V8HF_FTYPE_V4DI_V8HF_UQI
:
10711 case V8HF_FTYPE_V4DF_V8HF_UQI
:
10712 case V8HF_FTYPE_V2DI_V8HF_UQI
:
10713 case V8HF_FTYPE_V2DF_V8HF_UQI
:
10714 case V4SF_FTYPE_V4DI_V4SF_UQI
:
10715 case V4SF_FTYPE_V2DI_V4SF_UQI
:
10716 case V4DF_FTYPE_V4DI_V4DF_UQI
:
10717 case V4DF_FTYPE_V8HF_V4DF_UQI
:
10718 case V2DF_FTYPE_V8HF_V2DF_UQI
:
10719 case V2DF_FTYPE_V2DI_V2DF_UQI
:
10720 case V16QI_FTYPE_V8HI_V16QI_UQI
:
10721 case V16QI_FTYPE_V16HI_V16QI_UHI
:
10722 case V16QI_FTYPE_V4SI_V16QI_UQI
:
10723 case V16QI_FTYPE_V8SI_V16QI_UQI
:
10724 case V8HI_FTYPE_V8HF_V8HI_UQI
:
10725 case V8HI_FTYPE_V4SI_V8HI_UQI
:
10726 case V8HI_FTYPE_V8SI_V8HI_UQI
:
10727 case V16QI_FTYPE_V2DI_V16QI_UQI
:
10728 case V16QI_FTYPE_V4DI_V16QI_UQI
:
10729 case V8HI_FTYPE_V2DI_V8HI_UQI
:
10730 case V8HI_FTYPE_V4DI_V8HI_UQI
:
10731 case V4SI_FTYPE_V2DI_V4SI_UQI
:
10732 case V4SI_FTYPE_V4DI_V4SI_UQI
:
10733 case V32QI_FTYPE_V32HI_V32QI_USI
:
10734 case UHI_FTYPE_V16QI_V16QI_UHI
:
10735 case USI_FTYPE_V32QI_V32QI_USI
:
10736 case UDI_FTYPE_V64QI_V64QI_UDI
:
10737 case UQI_FTYPE_V8HI_V8HI_UQI
:
10738 case UHI_FTYPE_V16HI_V16HI_UHI
:
10739 case USI_FTYPE_V32HI_V32HI_USI
:
10740 case UQI_FTYPE_V4SI_V4SI_UQI
:
10741 case UQI_FTYPE_V8SI_V8SI_UQI
:
10742 case UQI_FTYPE_V2DI_V2DI_UQI
:
10743 case UQI_FTYPE_V4DI_V4DI_UQI
:
10744 case V4SF_FTYPE_V2DF_V4SF_UQI
:
10745 case V4SF_FTYPE_V4DF_V4SF_UQI
:
10746 case V16SI_FTYPE_V16SI_V16SI_UHI
:
10747 case V16SI_FTYPE_V4SI_V16SI_UHI
:
10748 case V2DI_FTYPE_V4SI_V2DI_UQI
:
10749 case V2DI_FTYPE_V8HI_V2DI_UQI
:
10750 case V2DI_FTYPE_V16QI_V2DI_UQI
:
10751 case V4DI_FTYPE_V4DI_V4DI_UQI
:
10752 case V4DI_FTYPE_V4SI_V4DI_UQI
:
10753 case V4DI_FTYPE_V8HI_V4DI_UQI
:
10754 case V4DI_FTYPE_V16QI_V4DI_UQI
:
10755 case V4DI_FTYPE_V4DF_V4DI_UQI
:
10756 case V2DI_FTYPE_V2DF_V2DI_UQI
:
10757 case V4SI_FTYPE_V4DF_V4SI_UQI
:
10758 case V4SI_FTYPE_V2DF_V4SI_UQI
:
10759 case V4SI_FTYPE_V8HI_V4SI_UQI
:
10760 case V4SI_FTYPE_V16QI_V4SI_UQI
:
10761 case V4DI_FTYPE_V4DI_V4DI_V4DI
:
10762 case V8DF_FTYPE_V2DF_V8DF_UQI
:
10763 case V8DF_FTYPE_V4DF_V8DF_UQI
:
10764 case V8DF_FTYPE_V8DF_V8DF_UQI
:
10765 case V8SF_FTYPE_V8SF_V8SF_UQI
:
10766 case V8SF_FTYPE_V8SI_V8SF_UQI
:
10767 case V4DF_FTYPE_V4DF_V4DF_UQI
:
10768 case V4SF_FTYPE_V4SF_V4SF_UQI
:
10769 case V2DF_FTYPE_V2DF_V2DF_UQI
:
10770 case V2DF_FTYPE_V4SF_V2DF_UQI
:
10771 case V2DF_FTYPE_V4SI_V2DF_UQI
:
10772 case V4SF_FTYPE_V4SI_V4SF_UQI
:
10773 case V4DF_FTYPE_V4SF_V4DF_UQI
:
10774 case V4DF_FTYPE_V4SI_V4DF_UQI
:
10775 case V8SI_FTYPE_V8SI_V8SI_UQI
:
10776 case V8SI_FTYPE_V8HI_V8SI_UQI
:
10777 case V8SI_FTYPE_V16QI_V8SI_UQI
:
10778 case V8DF_FTYPE_V8SI_V8DF_UQI
:
10779 case V8DI_FTYPE_DI_V8DI_UQI
:
10780 case V16SF_FTYPE_V8SF_V16SF_UHI
:
10781 case V16SI_FTYPE_V8SI_V16SI_UHI
:
10782 case V16HF_FTYPE_V16HI_V16HF_UHI
:
10783 case V16HF_FTYPE_V16HF_V16HF_V16HF
:
10784 case V16HI_FTYPE_V16HF_V16HI_UHI
:
10785 case V16HI_FTYPE_V16HI_V16HI_UHI
:
10786 case V8HI_FTYPE_V16QI_V8HI_UQI
:
10787 case V16HI_FTYPE_V16QI_V16HI_UHI
:
10788 case V32HI_FTYPE_V32HI_V32HI_USI
:
10789 case V32HI_FTYPE_V32QI_V32HI_USI
:
10790 case V8DI_FTYPE_V16QI_V8DI_UQI
:
10791 case V8DI_FTYPE_V2DI_V8DI_UQI
:
10792 case V8DI_FTYPE_V4DI_V8DI_UQI
:
10793 case V8DI_FTYPE_V8DI_V8DI_UQI
:
10794 case V8DI_FTYPE_V8HI_V8DI_UQI
:
10795 case V8DI_FTYPE_V8SI_V8DI_UQI
:
10796 case V8HI_FTYPE_V8DI_V8HI_UQI
:
10797 case V8SI_FTYPE_V8DI_V8SI_UQI
:
10798 case V4SI_FTYPE_V4SI_V4SI_V4SI
:
10799 case V16SI_FTYPE_V16SI_V16SI_V16SI
:
10800 case V8DI_FTYPE_V8DI_V8DI_V8DI
:
10801 case V32HI_FTYPE_V32HI_V32HI_V32HI
:
10802 case V2DI_FTYPE_V2DI_V2DI_V2DI
:
10803 case V16HI_FTYPE_V16HI_V16HI_V16HI
:
10804 case V8SI_FTYPE_V8SI_V8SI_V8SI
:
10805 case V8HI_FTYPE_V8HI_V8HI_V8HI
:
10806 case V32BF_FTYPE_V16SF_V16SF_USI
:
10807 case V16BF_FTYPE_V8SF_V8SF_UHI
:
10808 case V8BF_FTYPE_V4SF_V4SF_UQI
:
10809 case V16BF_FTYPE_V16SF_V16BF_UHI
:
10810 case V8BF_FTYPE_V8SF_V8BF_UQI
:
10811 case V8BF_FTYPE_V4SF_V8BF_UQI
:
10812 case V16SF_FTYPE_V16SF_V32BF_V32BF
:
10813 case V8SF_FTYPE_V8SF_V16BF_V16BF
:
10814 case V4SF_FTYPE_V4SF_V8BF_V8BF
:
10817 case V32QI_FTYPE_V32QI_V32QI_INT
:
10818 case V16HI_FTYPE_V16HI_V16HI_INT
:
10819 case V16QI_FTYPE_V16QI_V16QI_INT
:
10820 case V4DI_FTYPE_V4DI_V4DI_INT
:
10821 case V8HI_FTYPE_V8HI_V8HI_INT
:
10822 case V8SI_FTYPE_V8SI_V8SI_INT
:
10823 case V8SI_FTYPE_V8SI_V4SI_INT
:
10824 case V8SF_FTYPE_V8SF_V8SF_INT
:
10825 case V8SF_FTYPE_V8SF_V4SF_INT
:
10826 case V4SI_FTYPE_V4SI_V4SI_INT
:
10827 case V4DF_FTYPE_V4DF_V4DF_INT
:
10828 case V16SF_FTYPE_V16SF_V16SF_INT
:
10829 case V16SF_FTYPE_V16SF_V4SF_INT
:
10830 case V16SI_FTYPE_V16SI_V4SI_INT
:
10831 case V4DF_FTYPE_V4DF_V2DF_INT
:
10832 case V4SF_FTYPE_V4SF_V4SF_INT
:
10833 case V2DI_FTYPE_V2DI_V2DI_INT
:
10834 case V4DI_FTYPE_V4DI_V2DI_INT
:
10835 case V2DF_FTYPE_V2DF_V2DF_INT
:
10836 case UQI_FTYPE_V8DI_V8UDI_INT
:
10837 case UQI_FTYPE_V8DF_V8DF_INT
:
10838 case UQI_FTYPE_V2DF_V2DF_INT
:
10839 case UQI_FTYPE_V4SF_V4SF_INT
:
10840 case UHI_FTYPE_V16SI_V16SI_INT
:
10841 case UHI_FTYPE_V16SF_V16SF_INT
:
10842 case V64QI_FTYPE_V64QI_V64QI_INT
:
10843 case V32HI_FTYPE_V32HI_V32HI_INT
:
10844 case V16SI_FTYPE_V16SI_V16SI_INT
:
10845 case V8DI_FTYPE_V8DI_V8DI_INT
:
10847 nargs_constant
= 1;
10849 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT
:
10852 nargs_constant
= 1;
10854 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT
:
10857 nargs_constant
= 1;
10859 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT
:
10862 nargs_constant
= 1;
10864 case V2DI_FTYPE_V2DI_UINT_UINT
:
10866 nargs_constant
= 2;
10868 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT
:
10871 nargs_constant
= 1;
10873 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT
:
10877 nargs_constant
= 1;
10879 case QI_FTYPE_V8DF_INT_UQI
:
10880 case QI_FTYPE_V4DF_INT_UQI
:
10881 case QI_FTYPE_V2DF_INT_UQI
:
10882 case HI_FTYPE_V16SF_INT_UHI
:
10883 case QI_FTYPE_V8SF_INT_UQI
:
10884 case QI_FTYPE_V4SF_INT_UQI
:
10885 case QI_FTYPE_V8HF_INT_UQI
:
10886 case HI_FTYPE_V16HF_INT_UHI
:
10887 case SI_FTYPE_V32HF_INT_USI
:
10888 case V4SI_FTYPE_V4SI_V4SI_UHI
:
10889 case V8SI_FTYPE_V8SI_V8SI_UHI
:
10892 nargs_constant
= 1;
10894 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT
:
10898 nargs_constant
= 1;
10900 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT
:
10904 nargs_constant
= 1;
10906 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI
:
10907 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI
:
10908 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI
:
10909 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI
:
10910 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI
:
10911 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI
:
10912 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI
:
10913 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI
:
10914 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI
:
10915 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI
:
10916 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI
:
10917 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI
:
10918 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI
:
10919 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI
:
10920 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI
:
10921 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI
:
10922 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI
:
10923 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI
:
10924 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI
:
10925 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI
:
10926 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI
:
10927 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI
:
10928 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI
:
10929 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI
:
10930 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI
:
10931 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI
:
10932 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI
:
10933 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI
:
10934 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI
:
10935 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI
:
10936 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI
:
10937 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI
:
10938 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI
:
10939 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI
:
10940 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI
:
10941 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI
:
10942 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI
:
10943 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI
:
10944 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI
:
10945 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI
:
10946 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI
:
10947 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI
:
10948 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI
:
10949 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI
:
10950 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI
:
10951 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI
:
10952 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI
:
10953 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI
:
10954 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI
:
10955 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI
:
10956 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI
:
10957 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI
:
10958 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI
:
10959 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI
:
10960 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI
:
10961 case V32BF_FTYPE_V16SF_V16SF_V32BF_USI
:
10962 case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI
:
10963 case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI
:
10966 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT
:
10967 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT
:
10968 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT
:
10969 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT
:
10970 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT
:
10972 nargs_constant
= 1;
10974 case UQI_FTYPE_V4DI_V4DI_INT_UQI
:
10975 case UQI_FTYPE_V8SI_V8SI_INT_UQI
:
10976 case QI_FTYPE_V4DF_V4DF_INT_UQI
:
10977 case QI_FTYPE_V8SF_V8SF_INT_UQI
:
10978 case UHI_FTYPE_V16HF_V16HF_INT_UHI
:
10979 case UQI_FTYPE_V2DI_V2DI_INT_UQI
:
10980 case UQI_FTYPE_V4SI_V4SI_INT_UQI
:
10981 case UQI_FTYPE_V2DF_V2DF_INT_UQI
:
10982 case UQI_FTYPE_V4SF_V4SF_INT_UQI
:
10983 case UQI_FTYPE_V8HF_V8HF_INT_UQI
:
10984 case UDI_FTYPE_V64QI_V64QI_INT_UDI
:
10985 case USI_FTYPE_V32QI_V32QI_INT_USI
:
10986 case UHI_FTYPE_V16QI_V16QI_INT_UHI
:
10987 case USI_FTYPE_V32HI_V32HI_INT_USI
:
10988 case USI_FTYPE_V32HF_V32HF_INT_USI
:
10989 case UHI_FTYPE_V16HI_V16HI_INT_UHI
:
10990 case UQI_FTYPE_V8HI_V8HI_INT_UQI
:
10993 nargs_constant
= 1;
10995 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT
:
10997 nargs_constant
= 2;
10999 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED
:
11000 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG
:
11001 case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI
:
11002 case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI
:
11003 case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI
:
11006 case UQI_FTYPE_V8DI_V8DI_INT_UQI
:
11007 case UHI_FTYPE_V16SI_V16SI_INT_UHI
:
11010 nargs_constant
= 1;
11012 case V8SF_FTYPE_V8SF_INT_V8SF_UQI
:
11013 case V4SF_FTYPE_V4SF_INT_V4SF_UQI
:
11014 case V2DF_FTYPE_V4DF_INT_V2DF_UQI
:
11015 case V2DI_FTYPE_V4DI_INT_V2DI_UQI
:
11016 case V8SF_FTYPE_V16SF_INT_V8SF_UQI
:
11017 case V8SI_FTYPE_V16SI_INT_V8SI_UQI
:
11018 case V2DF_FTYPE_V8DF_INT_V2DF_UQI
:
11019 case V2DI_FTYPE_V8DI_INT_V2DI_UQI
:
11020 case V4SF_FTYPE_V8SF_INT_V4SF_UQI
:
11021 case V4SI_FTYPE_V8SI_INT_V4SI_UQI
:
11022 case V8HI_FTYPE_V8SF_INT_V8HI_UQI
:
11023 case V8HI_FTYPE_V4SF_INT_V8HI_UQI
:
11024 case V32HI_FTYPE_V32HI_INT_V32HI_USI
:
11025 case V16HI_FTYPE_V16HI_INT_V16HI_UHI
:
11026 case V8HI_FTYPE_V8HI_INT_V8HI_UQI
:
11027 case V4DI_FTYPE_V4DI_INT_V4DI_UQI
:
11028 case V2DI_FTYPE_V2DI_INT_V2DI_UQI
:
11029 case V8SI_FTYPE_V8SI_INT_V8SI_UQI
:
11030 case V4SI_FTYPE_V4SI_INT_V4SI_UQI
:
11031 case V4DF_FTYPE_V4DF_INT_V4DF_UQI
:
11032 case V2DF_FTYPE_V2DF_INT_V2DF_UQI
:
11033 case V8DF_FTYPE_V8DF_INT_V8DF_UQI
:
11034 case V16SF_FTYPE_V16SF_INT_V16SF_UHI
:
11035 case V16HI_FTYPE_V16SF_INT_V16HI_UHI
:
11036 case V16SI_FTYPE_V16SI_INT_V16SI_UHI
:
11037 case V16HF_FTYPE_V16HF_INT_V16HF_UHI
:
11038 case V8HF_FTYPE_V8HF_INT_V8HF_UQI
:
11039 case V4SI_FTYPE_V16SI_INT_V4SI_UQI
:
11040 case V4DI_FTYPE_V8DI_INT_V4DI_UQI
:
11041 case V4DF_FTYPE_V8DF_INT_V4DF_UQI
:
11042 case V4SF_FTYPE_V16SF_INT_V4SF_UQI
:
11043 case V8DI_FTYPE_V8DI_INT_V8DI_UQI
:
11046 nargs_constant
= 1;
11048 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI
:
11049 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI
:
11050 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI
:
11051 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI
:
11052 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI
:
11053 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI
:
11054 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI
:
11055 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI
:
11056 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI
:
11057 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI
:
11058 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI
:
11059 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI
:
11060 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI
:
11061 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI
:
11062 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI
:
11063 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI
:
11064 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI
:
11065 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI
:
11066 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI
:
11067 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI
:
11068 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI
:
11069 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI
:
11070 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI
:
11071 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI
:
11072 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI
:
11073 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI
:
11074 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI
:
11077 nargs_constant
= 1;
11079 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI
:
11080 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI
:
11081 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI
:
11082 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI
:
11083 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI
:
11084 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI
:
11085 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI
:
11086 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI
:
11087 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI
:
11088 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI
:
11091 nargs_constant
= 1;
11093 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI
:
11094 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI
:
11095 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI
:
11096 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT
:
11097 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT
:
11098 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT
:
11099 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT
:
11100 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT
:
11101 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT
:
11102 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT
:
11103 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT
:
11104 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT
:
11107 nargs_constant
= 2;
11111 gcc_unreachable ();
11114 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
11116 if (comparison
!= UNKNOWN
)
11118 gcc_assert (nargs
== 2);
11119 return ix86_expand_sse_compare (d
, exp
, target
, swap
);
11122 if (rmode
== VOIDmode
|| rmode
== tmode
)
11126 || GET_MODE (target
) != tmode
11127 || !insn_p
->operand
[0].predicate (target
, tmode
))
11128 target
= gen_reg_rtx (tmode
);
11129 else if (memory_operand (target
, tmode
))
11131 real_target
= target
;
11135 real_target
= gen_reg_rtx (tmode
);
11136 target
= lowpart_subreg (rmode
, real_target
, tmode
);
11139 for (i
= 0; i
< nargs
; i
++)
11141 tree arg
= CALL_EXPR_ARG (exp
, i
);
11142 rtx op
= expand_normal (arg
);
11143 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
11144 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
11146 if (second_arg_count
&& i
== 1)
	  /* SIMD shift insns take either an 8-bit immediate or
	     register as count.  But builtin functions take int as
	     count.  If count doesn't match, we put it in register.
	     The instructions are using 64-bit count, if op is just
	     32-bit, zero-extend it, as negative shift counts
	     are undefined behavior and zero-extension is more
	     efficient.  */
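	  /* For example, _mm_slli_epi32 (v, n) passes its count as a plain
	     int, while the shift patterns here consume a wider count
	     operand; the conversion and copy below bridge that mismatch.  */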
11157 if (SCALAR_INT_MODE_P (GET_MODE (op
)))
11158 op
= convert_modes (mode
, GET_MODE (op
), op
, 1);
11160 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
11161 if (!insn_p
->operand
[i
+ 1].predicate (op
, mode
))
11162 op
= copy_to_reg (op
);
11165 else if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
11166 (!mask_pos
&& (nargs
- i
) <= nargs_constant
))
	    case CODE_FOR_avx_vinsertf128v4di:
	    case CODE_FOR_avx_vextractf128v4di:
	      error ("the last argument must be a 1-bit immediate");
	      return const0_rtx;
11176 case CODE_FOR_avx512f_cmpv8di3_mask
:
11177 case CODE_FOR_avx512f_cmpv16si3_mask
:
11178 case CODE_FOR_avx512f_ucmpv8di3_mask
:
11179 case CODE_FOR_avx512f_ucmpv16si3_mask
:
11180 case CODE_FOR_avx512vl_cmpv4di3_mask
:
11181 case CODE_FOR_avx512vl_cmpv8si3_mask
:
11182 case CODE_FOR_avx512vl_ucmpv4di3_mask
:
11183 case CODE_FOR_avx512vl_ucmpv8si3_mask
:
11184 case CODE_FOR_avx512vl_cmpv2di3_mask
:
11185 case CODE_FOR_avx512vl_cmpv4si3_mask
:
11186 case CODE_FOR_avx512vl_ucmpv2di3_mask
:
11187 case CODE_FOR_avx512vl_ucmpv4si3_mask
:
11188 error ("the last argument must be a 3-bit immediate");
11191 case CODE_FOR_sse4_1_roundsd
:
11192 case CODE_FOR_sse4_1_roundss
:
11194 case CODE_FOR_sse4_1_roundpd
:
11195 case CODE_FOR_sse4_1_roundps
:
11196 case CODE_FOR_avx_roundpd256
:
11197 case CODE_FOR_avx_roundps256
:
11199 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix
:
11200 case CODE_FOR_sse4_1_roundps_sfix
:
11201 case CODE_FOR_avx_roundpd_vec_pack_sfix256
:
11202 case CODE_FOR_avx_roundps_sfix256
:
11204 case CODE_FOR_sse4_1_blendps
:
11205 case CODE_FOR_avx_blendpd256
:
11206 case CODE_FOR_avx_vpermilv4df
:
11207 case CODE_FOR_avx_vpermilv4df_mask
:
11208 case CODE_FOR_avx512f_getmantv8df_mask
:
11209 case CODE_FOR_avx512f_getmantv16sf_mask
:
11210 case CODE_FOR_avx512vl_getmantv16hf_mask
:
11211 case CODE_FOR_avx512vl_getmantv8sf_mask
:
11212 case CODE_FOR_avx512vl_getmantv4df_mask
:
11213 case CODE_FOR_avx512fp16_getmantv8hf_mask
:
11214 case CODE_FOR_avx512vl_getmantv4sf_mask
:
11215 case CODE_FOR_avx512vl_getmantv2df_mask
:
11216 case CODE_FOR_avx512dq_rangepv8df_mask_round
:
11217 case CODE_FOR_avx512dq_rangepv16sf_mask_round
:
11218 case CODE_FOR_avx512dq_rangepv4df_mask
:
11219 case CODE_FOR_avx512dq_rangepv8sf_mask
:
11220 case CODE_FOR_avx512dq_rangepv2df_mask
:
11221 case CODE_FOR_avx512dq_rangepv4sf_mask
:
11222 case CODE_FOR_avx_shufpd256_mask
:
11223 error ("the last argument must be a 4-bit immediate");
11226 case CODE_FOR_sha1rnds4
:
11227 case CODE_FOR_sse4_1_blendpd
:
11228 case CODE_FOR_avx_vpermilv2df
:
11229 case CODE_FOR_avx_vpermilv2df_mask
:
11230 case CODE_FOR_xop_vpermil2v2df3
:
11231 case CODE_FOR_xop_vpermil2v4sf3
:
11232 case CODE_FOR_xop_vpermil2v4df3
:
11233 case CODE_FOR_xop_vpermil2v8sf3
:
11234 case CODE_FOR_avx512f_vinsertf32x4_mask
:
11235 case CODE_FOR_avx512f_vinserti32x4_mask
:
11236 case CODE_FOR_avx512f_vextractf32x4_mask
:
11237 case CODE_FOR_avx512f_vextracti32x4_mask
:
11238 case CODE_FOR_sse2_shufpd
:
11239 case CODE_FOR_sse2_shufpd_mask
:
11240 case CODE_FOR_avx512dq_shuf_f64x2_mask
:
11241 case CODE_FOR_avx512dq_shuf_i64x2_mask
:
11242 case CODE_FOR_avx512vl_shuf_i32x4_mask
:
11243 case CODE_FOR_avx512vl_shuf_f32x4_mask
:
11244 error ("the last argument must be a 2-bit immediate");
11247 case CODE_FOR_avx_vextractf128v4df
:
11248 case CODE_FOR_avx_vextractf128v8sf
:
11249 case CODE_FOR_avx_vextractf128v8si
:
11250 case CODE_FOR_avx_vinsertf128v4df
:
11251 case CODE_FOR_avx_vinsertf128v8sf
:
11252 case CODE_FOR_avx_vinsertf128v8si
:
11253 case CODE_FOR_avx512f_vinsertf64x4_mask
:
11254 case CODE_FOR_avx512f_vinserti64x4_mask
:
11255 case CODE_FOR_avx512f_vextractf64x4_mask
:
11256 case CODE_FOR_avx512f_vextracti64x4_mask
:
11257 case CODE_FOR_avx512dq_vinsertf32x8_mask
:
11258 case CODE_FOR_avx512dq_vinserti32x8_mask
:
11259 case CODE_FOR_avx512vl_vinsertv4df
:
11260 case CODE_FOR_avx512vl_vinsertv4di
:
11261 case CODE_FOR_avx512vl_vinsertv8sf
:
11262 case CODE_FOR_avx512vl_vinsertv8si
:
11263 error ("the last argument must be a 1-bit immediate");
11266 case CODE_FOR_avx_vmcmpv2df3
:
11267 case CODE_FOR_avx_vmcmpv4sf3
:
11268 case CODE_FOR_avx_cmpv2df3
:
11269 case CODE_FOR_avx_cmpv4sf3
:
11270 case CODE_FOR_avx_cmpv4df3
:
11271 case CODE_FOR_avx_cmpv8sf3
:
11272 case CODE_FOR_avx512f_cmpv8df3_mask
:
11273 case CODE_FOR_avx512f_cmpv16sf3_mask
:
11274 case CODE_FOR_avx512f_vmcmpv2df3_mask
:
11275 case CODE_FOR_avx512f_vmcmpv4sf3_mask
:
11276 case CODE_FOR_avx512bw_cmpv32hf3_mask
:
11277 case CODE_FOR_avx512vl_cmpv16hf3_mask
:
11278 case CODE_FOR_avx512fp16_cmpv8hf3_mask
:
11279 error ("the last argument must be a 5-bit immediate");
11283 switch (nargs_constant
)
11286 if ((mask_pos
&& (nargs
- i
- mask_pos
) == nargs_constant
) ||
11287 (!mask_pos
&& (nargs
- i
) == nargs_constant
))
11289 error ("the next to last argument must be an 8-bit immediate");
11294 error ("the last argument must be an 8-bit immediate");
11297 gcc_unreachable ();
11304 if (VECTOR_MODE_P (mode
))
11305 op
= safe_vector_operand (op
, mode
);
	  /* If we aren't optimizing, only allow one memory operand to
	     be generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  op = fixup_modeless_constant (op, mode);
11314 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
11316 if (optimize
|| !match
|| num_memory
> 1)
11317 op
= copy_to_mode_reg (mode
, op
);
11321 op
= copy_to_reg (op
);
11322 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
11332 pat
= GEN_FCN (icode
) (real_target
, xops
[0]);
11335 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1]);
11338 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1], xops
[2]);
11341 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11345 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11346 xops
[2], xops
[3], xops
[4]);
11349 pat
= GEN_FCN (icode
) (real_target
, xops
[0], xops
[1],
11350 xops
[2], xops
[3], xops
[4], xops
[5]);
11353 gcc_unreachable ();
/* Transform pattern of following layout:
     (set A
       (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   into:
     (set (A B)).  */

static rtx
ix86_erase_embedded_rounding (rtx pat)
{
  if (GET_CODE (pat) == INSN)
    pat = PATTERN (pat);

  gcc_assert (GET_CODE (pat) == SET);
  rtx src = SET_SRC (pat);
  gcc_assert (XVECLEN (src, 0) == 2);
  rtx p0 = XVECEXP (src, 0, 0);
  gcc_assert (GET_CODE (src) == UNSPEC
	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
  rtx res = gen_rtx_SET (SET_DEST (pat), p0);
  return res;
}
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with embedded rounding.  */

static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;
  /* See avxintrin.h for values.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false,
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false
    };
  static const bool non_signalings[32] =
    {
      true,  false, false, true,  true,  false, false, true,
      true,  false, false, true,  true,  false, false, true,
      false, true,  true,  false, false, true,  true,  false,
      false, true,  true,  false, false, true,  true,  false
    };
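  /* For example, _CMP_LT_OS (value 1 in avxintrin.h) maps to
     comparisons[1] == LT with ordereds[1] == true and
     non_signalings[1] == false, i.e. an ordered, signaling less-than;
     the handling below relies on exactly that correspondence.  */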
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be a comparison constant");
      return const0_rtx;
    }

  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }
11446 if (VECTOR_MODE_P (mode0
))
11447 op0
= safe_vector_operand (op0
, mode0
);
11448 if (VECTOR_MODE_P (mode1
))
11449 op1
= safe_vector_operand (op1
, mode1
);
11451 enum rtx_code comparison
= comparisons
[INTVAL (op2
)];
11452 bool ordered
= ordereds
[INTVAL (op2
)];
11453 bool non_signaling
= non_signalings
[INTVAL (op2
)];
11454 rtx const_val
= const0_rtx
;
11456 bool check_unordered
= false;
11457 machine_mode mode
= CCFPmode
;
11458 switch (comparison
)
11463 /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
11464 if (!non_signaling
)
11470 /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
11480 /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
11487 /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
11488 if (!non_signaling
)
11495 case LE
: /* -> GE */
11496 case LT
: /* -> GT */
11497 case UNGE
: /* -> UNLE */
11498 case UNGT
: /* -> UNLT */
11499 std::swap (op0
, op1
);
11500 comparison
= swap_condition (comparison
);
11508 /* These are supported by CCFPmode. NB: Use ordered/signaling
11509 COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
11510 with NAN operands. */
11511 if (ordered
== non_signaling
)
11512 ordered
= !ordered
;
11515 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11516 _CMP_EQ_OQ/_CMP_EQ_OS. */
11517 check_unordered
= true;
11521 /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
11522 _CMP_NEQ_UQ/_CMP_NEQ_US. */
11523 gcc_assert (!ordered
);
11524 check_unordered
= true;
11526 const_val
= const1_rtx
;
11529 gcc_unreachable ();
11532 target
= gen_reg_rtx (SImode
);
11533 emit_move_insn (target
, const_val
);
11534 target
= gen_rtx_SUBREG (QImode
, target
, 0);
11536 if ((optimize
&& !register_operand (op0
, mode0
))
11537 || !insn_p
->operand
[0].predicate (op0
, mode0
))
11538 op0
= copy_to_mode_reg (mode0
, op0
);
11539 if ((optimize
&& !register_operand (op1
, mode1
))
11540 || !insn_p
->operand
[1].predicate (op1
, mode1
))
11541 op1
= copy_to_mode_reg (mode1
, op1
);
11544 1. COMI: ordered and signaling.
11545 2. UCOMI: unordered and non-signaling.
11548 icode
= (icode
== CODE_FOR_sse_comi_round
11549 ? CODE_FOR_sse_ucomi_round
11550 : CODE_FOR_sse2_ucomi_round
);
11552 pat
= GEN_FCN (icode
) (op0
, op1
, op3
);
11556 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
11557 if (INTVAL (op3
) == NO_ROUND
)
11559 pat
= ix86_erase_embedded_rounding (pat
);
11563 set_dst
= SET_DEST (pat
);
11567 gcc_assert (GET_CODE (pat
) == SET
);
11568 set_dst
= SET_DEST (pat
);
11573 return ix86_ssecom_setcc (comparison
, check_unordered
, mode
,
11578 ix86_expand_round_builtin (const struct builtin_description
*d
,
11579 tree exp
, rtx target
)
11582 unsigned int i
, nargs
;
11584 enum insn_code icode
= d
->icode
;
11585 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
11586 machine_mode tmode
= insn_p
->operand
[0].mode
;
11587 unsigned int nargs_constant
= 0;
11588 unsigned int redundant_embed_rnd
= 0;
11590 switch ((enum ix86_builtin_func_type
) d
->flag
)
11592 case UINT64_FTYPE_V2DF_INT
:
11593 case UINT64_FTYPE_V4SF_INT
:
11594 case UINT64_FTYPE_V8HF_INT
:
11595 case UINT_FTYPE_V2DF_INT
:
11596 case UINT_FTYPE_V4SF_INT
:
11597 case UINT_FTYPE_V8HF_INT
:
11598 case INT64_FTYPE_V2DF_INT
:
11599 case INT64_FTYPE_V4SF_INT
:
11600 case INT64_FTYPE_V8HF_INT
:
11601 case INT_FTYPE_V2DF_INT
:
11602 case INT_FTYPE_V4SF_INT
:
11603 case INT_FTYPE_V8HF_INT
:
11606 case V32HF_FTYPE_V32HF_V32HF_INT
:
11607 case V8HF_FTYPE_V8HF_V8HF_INT
:
11608 case V8HF_FTYPE_V8HF_INT_INT
:
11609 case V8HF_FTYPE_V8HF_UINT_INT
:
11610 case V8HF_FTYPE_V8HF_INT64_INT
:
11611 case V8HF_FTYPE_V8HF_UINT64_INT
:
11612 case V4SF_FTYPE_V4SF_UINT_INT
:
11613 case V4SF_FTYPE_V4SF_UINT64_INT
:
11614 case V2DF_FTYPE_V2DF_UINT64_INT
:
11615 case V4SF_FTYPE_V4SF_INT_INT
:
11616 case V4SF_FTYPE_V4SF_INT64_INT
:
11617 case V2DF_FTYPE_V2DF_INT64_INT
:
11618 case V4SF_FTYPE_V4SF_V4SF_INT
:
11619 case V2DF_FTYPE_V2DF_V2DF_INT
:
11620 case V4SF_FTYPE_V4SF_V2DF_INT
:
11621 case V2DF_FTYPE_V2DF_V4SF_INT
:
11624 case V8SF_FTYPE_V8DF_V8SF_QI_INT
:
11625 case V8DF_FTYPE_V8DF_V8DF_QI_INT
:
11626 case V32HI_FTYPE_V32HF_V32HI_USI_INT
:
11627 case V8SI_FTYPE_V8DF_V8SI_QI_INT
:
11628 case V8DI_FTYPE_V8HF_V8DI_UQI_INT
:
11629 case V8DI_FTYPE_V8DF_V8DI_QI_INT
:
11630 case V8SF_FTYPE_V8DI_V8SF_QI_INT
:
11631 case V8DF_FTYPE_V8DI_V8DF_QI_INT
:
11632 case V8DF_FTYPE_V8HF_V8DF_UQI_INT
:
11633 case V16SF_FTYPE_V16HF_V16SF_UHI_INT
:
11634 case V32HF_FTYPE_V32HI_V32HF_USI_INT
:
11635 case V32HF_FTYPE_V32HF_V32HF_USI_INT
:
11636 case V32HF_FTYPE_V32HF_V32HF_V32HF_INT
:
11637 case V16SF_FTYPE_V16SF_V16SF_HI_INT
:
11638 case V8DI_FTYPE_V8SF_V8DI_QI_INT
:
11639 case V16SF_FTYPE_V16SI_V16SF_HI_INT
:
11640 case V16SI_FTYPE_V16SF_V16SI_HI_INT
:
11641 case V16SI_FTYPE_V16HF_V16SI_UHI_INT
:
11642 case V16HF_FTYPE_V16SI_V16HF_UHI_INT
:
11643 case V8DF_FTYPE_V8SF_V8DF_QI_INT
:
11644 case V16SF_FTYPE_V16HI_V16SF_HI_INT
:
11645 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT
:
11646 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT
:
11647 case V8HF_FTYPE_V8DI_V8HF_UQI_INT
:
11648 case V8HF_FTYPE_V8DF_V8HF_UQI_INT
:
11649 case V16HF_FTYPE_V16SF_V16HF_UHI_INT
:
11650 case V8HF_FTYPE_V8HF_V8HF_V8HF_INT
:
11653 case V4SF_FTYPE_V4SF_V4SF_INT_INT
:
11654 case V2DF_FTYPE_V2DF_V2DF_INT_INT
:
11655 nargs_constant
= 2;
11658 case INT_FTYPE_V4SF_V4SF_INT_INT
:
11659 case INT_FTYPE_V2DF_V2DF_INT_INT
:
11660 return ix86_expand_sse_comi_round (d
, exp
, target
);
11661 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT
:
11662 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT
:
11663 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT
:
11664 case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT
:
11665 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT
:
11666 case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT
:
11667 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT
:
11668 case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT
:
11669 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT
:
11670 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT
:
11671 case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT
:
11672 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT
:
11673 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT
:
11674 case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT
:
11675 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT
:
11676 case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT
:
11677 case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT
:
11680 case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT
:
11681 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT
:
11682 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT
:
11683 case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT
:
11684 case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT
:
11685 nargs_constant
= 4;
11688 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT
:
11689 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT
:
11690 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT
:
11691 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT
:
11692 case USI_FTYPE_V32HF_V32HF_INT_USI_INT
:
11693 case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT
:
11694 nargs_constant
= 3;
11697 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT
:
11698 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT
:
11699 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT
:
11700 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT
:
11701 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT
:
11702 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT
:
11703 case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT
:
11705 nargs_constant
= 4;
11707 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT
:
11708 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT
:
11709 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT
:
11710 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT
:
11712 nargs_constant
= 3;
11715 gcc_unreachable ();
11717 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
11721 || GET_MODE (target
) != tmode
11722 || !insn_p
->operand
[0].predicate (target
, tmode
))
11723 target
= gen_reg_rtx (tmode
);
11725 for (i
= 0; i
< nargs
; i
++)
11727 tree arg
= CALL_EXPR_ARG (exp
, i
);
11728 rtx op
= expand_normal (arg
);
11729 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
11730 bool match
= insn_p
->operand
[i
+ 1].predicate (op
, mode
);
11732 if (i
== nargs
- nargs_constant
)
11738 case CODE_FOR_avx512f_getmantv8df_mask_round
:
11739 case CODE_FOR_avx512f_getmantv16sf_mask_round
:
11740 case CODE_FOR_avx512bw_getmantv32hf_mask_round
:
11741 case CODE_FOR_avx512f_vgetmantv2df_round
:
11742 case CODE_FOR_avx512f_vgetmantv2df_mask_round
:
11743 case CODE_FOR_avx512f_vgetmantv4sf_round
:
11744 case CODE_FOR_avx512f_vgetmantv4sf_mask_round
:
11745 case CODE_FOR_avx512f_vgetmantv8hf_mask_round
:
11746 error ("the immediate argument must be a 4-bit immediate");
11748 case CODE_FOR_avx512f_cmpv8df3_mask_round
:
11749 case CODE_FOR_avx512f_cmpv16sf3_mask_round
:
11750 case CODE_FOR_avx512f_vmcmpv2df3_mask_round
:
11751 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round
:
11752 case CODE_FOR_avx512f_vmcmpv8hf3_mask_round
:
11753 case CODE_FOR_avx512bw_cmpv32hf3_mask_round
:
11754 error ("the immediate argument must be a 5-bit immediate");
11757 error ("the immediate argument must be an 8-bit immediate");
11762 else if (i
== nargs
-1)
11764 if (!insn_p
->operand
[nargs
].predicate (op
, SImode
))
11766 error ("incorrect rounding operand");
	      /* If there is no rounding, use the normal version of the
		 pattern.  */
	      if (INTVAL (op) == NO_ROUND)
		{
		  /* Skip erasing embedded rounding for the expanders below,
		     which generate multiple insns.  In
		     ix86_erase_embedded_rounding the pattern will be
		     transformed to a single set, and emit_insn appends the
		     set instead of inserting it into the chain.  So the
		     insns emitted inside the define_expand would be
		     ignored.  */
		  switch (icode)
		    {
11780 case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round
:
11781 case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round
:
11782 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round
:
11783 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round
:
11784 case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round
:
11785 case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round
:
11786 redundant_embed_rnd
= 0;
11789 redundant_embed_rnd
= 1;
11796 if (VECTOR_MODE_P (mode
))
11797 op
= safe_vector_operand (op
, mode
);
11799 op
= fixup_modeless_constant (op
, mode
);
11801 if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
11803 if (optimize
|| !match
)
11804 op
= copy_to_mode_reg (mode
, op
);
11808 op
= copy_to_reg (op
);
11809 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
11819 pat
= GEN_FCN (icode
) (target
, xops
[0]);
11822 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
11825 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
11828 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
11832 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
11833 xops
[2], xops
[3], xops
[4]);
11836 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1],
11837 xops
[2], xops
[3], xops
[4], xops
[5]);
11840 gcc_unreachable ();
11846 if (redundant_embed_rnd
)
11847 pat
= ix86_erase_embedded_rounding (pat
);
/* Subroutine of ix86_expand_builtin to take care of special insns
   with variable number of operands.  */
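/* For instance, the non-temporal load/store builtins (movntdqa loads and
   movnt stores) and the AVX-512 masked load/store builtins are routed
   through here, as the case labels below show; several of them also need
   the alignment of their memory operand fixed up explicitly.  */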
static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
				  tree exp, rtx target)
{
11862 unsigned int i
, nargs
, arg_adjust
, memory
;
11863 unsigned int constant
= 100;
11864 bool aligned_mem
= false;
11866 enum insn_code icode
= d
->icode
;
11867 const struct insn_data_d
*insn_p
= &insn_data
[icode
];
11868 machine_mode tmode
= insn_p
->operand
[0].mode
;
11869 enum { load
, store
} klass
;
11871 switch ((enum ix86_builtin_func_type
) d
->flag
)
11873 case VOID_FTYPE_VOID
:
11874 emit_insn (GEN_FCN (icode
) (target
));
11876 case VOID_FTYPE_UINT64
:
11877 case VOID_FTYPE_UNSIGNED
:
11883 case INT_FTYPE_VOID
:
11884 case USHORT_FTYPE_VOID
:
11885 case UINT64_FTYPE_VOID
:
11886 case UINT_FTYPE_VOID
:
11887 case UINT8_FTYPE_VOID
:
11888 case UNSIGNED_FTYPE_VOID
:
11893 case UINT64_FTYPE_PUNSIGNED
:
11894 case V2DI_FTYPE_PV2DI
:
11895 case V4DI_FTYPE_PV4DI
:
11896 case V32QI_FTYPE_PCCHAR
:
11897 case V16QI_FTYPE_PCCHAR
:
11898 case V8SF_FTYPE_PCV4SF
:
11899 case V8SF_FTYPE_PCFLOAT
:
11900 case V4SF_FTYPE_PCFLOAT
:
11901 case V4SF_FTYPE_PCFLOAT16
:
11902 case V4SF_FTYPE_PCBFLOAT16
:
11903 case V4SF_FTYPE_PCV8BF
:
11904 case V4SF_FTYPE_PCV8HF
:
11905 case V8SF_FTYPE_PCFLOAT16
:
11906 case V8SF_FTYPE_PCBFLOAT16
:
11907 case V8SF_FTYPE_PCV16HF
:
11908 case V8SF_FTYPE_PCV16BF
:
11909 case V4DF_FTYPE_PCV2DF
:
11910 case V4DF_FTYPE_PCDOUBLE
:
11911 case V2DF_FTYPE_PCDOUBLE
:
11912 case VOID_FTYPE_PVOID
:
11913 case V8DI_FTYPE_PV8DI
:
11919 case CODE_FOR_sse4_1_movntdqa
:
11920 case CODE_FOR_avx2_movntdqa
:
11921 case CODE_FOR_avx512f_movntdqa
:
11922 aligned_mem
= true;
11928 case VOID_FTYPE_PV2SF_V4SF
:
11929 case VOID_FTYPE_PV8DI_V8DI
:
11930 case VOID_FTYPE_PV4DI_V4DI
:
11931 case VOID_FTYPE_PV2DI_V2DI
:
11932 case VOID_FTYPE_PCHAR_V32QI
:
11933 case VOID_FTYPE_PCHAR_V16QI
:
11934 case VOID_FTYPE_PFLOAT_V16SF
:
11935 case VOID_FTYPE_PFLOAT_V8SF
:
11936 case VOID_FTYPE_PFLOAT_V4SF
:
11937 case VOID_FTYPE_PDOUBLE_V8DF
:
11938 case VOID_FTYPE_PDOUBLE_V4DF
:
11939 case VOID_FTYPE_PDOUBLE_V2DF
:
11940 case VOID_FTYPE_PLONGLONG_LONGLONG
:
11941 case VOID_FTYPE_PULONGLONG_ULONGLONG
:
11942 case VOID_FTYPE_PUNSIGNED_UNSIGNED
:
11943 case VOID_FTYPE_PINT_INT
:
11946 /* Reserve memory operand for target. */
11947 memory
= ARRAY_SIZE (xops
);
11950 /* These builtins and instructions require the memory
11951 to be properly aligned. */
11952 case CODE_FOR_avx_movntv4di
:
11953 case CODE_FOR_sse2_movntv2di
:
11954 case CODE_FOR_avx_movntv8sf
:
11955 case CODE_FOR_sse_movntv4sf
:
11956 case CODE_FOR_sse4a_vmmovntv4sf
:
11957 case CODE_FOR_avx_movntv4df
:
11958 case CODE_FOR_sse2_movntv2df
:
11959 case CODE_FOR_sse4a_vmmovntv2df
:
11960 case CODE_FOR_sse2_movntidi
:
11961 case CODE_FOR_sse_movntq
:
11962 case CODE_FOR_sse2_movntisi
:
11963 case CODE_FOR_avx512f_movntv16sf
:
11964 case CODE_FOR_avx512f_movntv8df
:
11965 case CODE_FOR_avx512f_movntv8di
:
11966 aligned_mem
= true;
11972 case VOID_FTYPE_PVOID_PCVOID
:
11978 case V4SF_FTYPE_V4SF_PCV2SF
:
11979 case V2DF_FTYPE_V2DF_PCDOUBLE
:
11984 case V8SF_FTYPE_PCV8SF_V8SI
:
11985 case V4DF_FTYPE_PCV4DF_V4DI
:
11986 case V4SF_FTYPE_PCV4SF_V4SI
:
11987 case V2DF_FTYPE_PCV2DF_V2DI
:
11988 case V8SI_FTYPE_PCV8SI_V8SI
:
11989 case V4DI_FTYPE_PCV4DI_V4DI
:
11990 case V4SI_FTYPE_PCV4SI_V4SI
:
11991 case V2DI_FTYPE_PCV2DI_V2DI
:
11992 case VOID_FTYPE_INT_INT64
:
11997 case VOID_FTYPE_PV8DF_V8DF_UQI
:
11998 case VOID_FTYPE_PV4DF_V4DF_UQI
:
11999 case VOID_FTYPE_PV2DF_V2DF_UQI
:
12000 case VOID_FTYPE_PV16SF_V16SF_UHI
:
12001 case VOID_FTYPE_PV8SF_V8SF_UQI
:
12002 case VOID_FTYPE_PV4SF_V4SF_UQI
:
12003 case VOID_FTYPE_PV8DI_V8DI_UQI
:
12004 case VOID_FTYPE_PV4DI_V4DI_UQI
:
12005 case VOID_FTYPE_PV2DI_V2DI_UQI
:
12006 case VOID_FTYPE_PV16SI_V16SI_UHI
:
12007 case VOID_FTYPE_PV8SI_V8SI_UQI
:
12008 case VOID_FTYPE_PV4SI_V4SI_UQI
:
12009 case VOID_FTYPE_PV64QI_V64QI_UDI
:
12010 case VOID_FTYPE_PV32HI_V32HI_USI
:
12011 case VOID_FTYPE_PV32QI_V32QI_USI
:
12012 case VOID_FTYPE_PV16QI_V16QI_UHI
:
12013 case VOID_FTYPE_PV16HI_V16HI_UHI
:
12014 case VOID_FTYPE_PV8HI_V8HI_UQI
:
12017 /* These builtins and instructions require the memory
12018 to be properly aligned. */
12019 case CODE_FOR_avx512f_storev16sf_mask
:
12020 case CODE_FOR_avx512f_storev16si_mask
:
12021 case CODE_FOR_avx512f_storev8df_mask
:
12022 case CODE_FOR_avx512f_storev8di_mask
:
12023 case CODE_FOR_avx512vl_storev8sf_mask
:
12024 case CODE_FOR_avx512vl_storev8si_mask
:
12025 case CODE_FOR_avx512vl_storev4df_mask
:
12026 case CODE_FOR_avx512vl_storev4di_mask
:
12027 case CODE_FOR_avx512vl_storev4sf_mask
:
12028 case CODE_FOR_avx512vl_storev4si_mask
:
12029 case CODE_FOR_avx512vl_storev2df_mask
:
12030 case CODE_FOR_avx512vl_storev2di_mask
:
12031 aligned_mem
= true;
12037 case VOID_FTYPE_PV8SF_V8SI_V8SF
:
12038 case VOID_FTYPE_PV4DF_V4DI_V4DF
:
12039 case VOID_FTYPE_PV4SF_V4SI_V4SF
:
12040 case VOID_FTYPE_PV2DF_V2DI_V2DF
:
12041 case VOID_FTYPE_PV8SI_V8SI_V8SI
:
12042 case VOID_FTYPE_PV4DI_V4DI_V4DI
:
12043 case VOID_FTYPE_PV4SI_V4SI_V4SI
:
12044 case VOID_FTYPE_PV2DI_V2DI_V2DI
:
12045 case VOID_FTYPE_PV8SI_V8DI_UQI
:
12046 case VOID_FTYPE_PV8HI_V8DI_UQI
:
12047 case VOID_FTYPE_PV16HI_V16SI_UHI
:
12048 case VOID_FTYPE_PUDI_V8DI_UQI
:
12049 case VOID_FTYPE_PV16QI_V16SI_UHI
:
12050 case VOID_FTYPE_PV4SI_V4DI_UQI
:
12051 case VOID_FTYPE_PUDI_V2DI_UQI
:
12052 case VOID_FTYPE_PUDI_V4DI_UQI
:
12053 case VOID_FTYPE_PUSI_V2DI_UQI
:
12054 case VOID_FTYPE_PV8HI_V8SI_UQI
:
12055 case VOID_FTYPE_PUDI_V4SI_UQI
:
12056 case VOID_FTYPE_PUSI_V4DI_UQI
:
12057 case VOID_FTYPE_PUHI_V2DI_UQI
:
12058 case VOID_FTYPE_PUDI_V8SI_UQI
:
12059 case VOID_FTYPE_PUSI_V4SI_UQI
:
12060 case VOID_FTYPE_PCHAR_V64QI_UDI
:
12061 case VOID_FTYPE_PCHAR_V32QI_USI
:
12062 case VOID_FTYPE_PCHAR_V16QI_UHI
:
12063 case VOID_FTYPE_PSHORT_V32HI_USI
:
12064 case VOID_FTYPE_PSHORT_V16HI_UHI
:
12065 case VOID_FTYPE_PSHORT_V8HI_UQI
:
12066 case VOID_FTYPE_PINT_V16SI_UHI
:
12067 case VOID_FTYPE_PINT_V8SI_UQI
:
12068 case VOID_FTYPE_PINT_V4SI_UQI
:
12069 case VOID_FTYPE_PINT64_V8DI_UQI
:
12070 case VOID_FTYPE_PINT64_V4DI_UQI
:
12071 case VOID_FTYPE_PINT64_V2DI_UQI
:
12072 case VOID_FTYPE_PDOUBLE_V8DF_UQI
:
12073 case VOID_FTYPE_PDOUBLE_V4DF_UQI
:
12074 case VOID_FTYPE_PDOUBLE_V2DF_UQI
:
12075 case VOID_FTYPE_PFLOAT_V16SF_UHI
:
12076 case VOID_FTYPE_PFLOAT_V8SF_UQI
:
12077 case VOID_FTYPE_PFLOAT_V4SF_UQI
:
12078 case VOID_FTYPE_PCFLOAT16_V8HF_UQI
:
12079 case VOID_FTYPE_PV32QI_V32HI_USI
:
12080 case VOID_FTYPE_PV16QI_V16HI_UHI
:
12081 case VOID_FTYPE_PUDI_V8HI_UQI
:
12084 /* Reserve memory operand for target. */
12085 memory
= ARRAY_SIZE (xops
);
12087 case V4SF_FTYPE_PCV4SF_V4SF_UQI
:
12088 case V8SF_FTYPE_PCV8SF_V8SF_UQI
:
12089 case V16SF_FTYPE_PCV16SF_V16SF_UHI
:
12090 case V4SI_FTYPE_PCV4SI_V4SI_UQI
:
12091 case V8SI_FTYPE_PCV8SI_V8SI_UQI
:
12092 case V16SI_FTYPE_PCV16SI_V16SI_UHI
:
12093 case V2DF_FTYPE_PCV2DF_V2DF_UQI
:
12094 case V4DF_FTYPE_PCV4DF_V4DF_UQI
:
12095 case V8DF_FTYPE_PCV8DF_V8DF_UQI
:
12096 case V2DI_FTYPE_PCV2DI_V2DI_UQI
:
12097 case V4DI_FTYPE_PCV4DI_V4DI_UQI
:
12098 case V8DI_FTYPE_PCV8DI_V8DI_UQI
:
12099 case V64QI_FTYPE_PCV64QI_V64QI_UDI
:
12100 case V32HI_FTYPE_PCV32HI_V32HI_USI
:
12101 case V32QI_FTYPE_PCV32QI_V32QI_USI
:
12102 case V16QI_FTYPE_PCV16QI_V16QI_UHI
:
12103 case V16HI_FTYPE_PCV16HI_V16HI_UHI
:
12104 case V8HI_FTYPE_PCV8HI_V8HI_UQI
:
12107 /* These builtins and instructions require the memory
12108 to be properly aligned. */
12109 case CODE_FOR_avx512f_loadv16sf_mask
:
12110 case CODE_FOR_avx512f_loadv16si_mask
:
12111 case CODE_FOR_avx512f_loadv8df_mask
:
12112 case CODE_FOR_avx512f_loadv8di_mask
:
12113 case CODE_FOR_avx512vl_loadv8sf_mask
:
12114 case CODE_FOR_avx512vl_loadv8si_mask
:
12115 case CODE_FOR_avx512vl_loadv4df_mask
:
12116 case CODE_FOR_avx512vl_loadv4di_mask
:
12117 case CODE_FOR_avx512vl_loadv4sf_mask
:
12118 case CODE_FOR_avx512vl_loadv4si_mask
:
12119 case CODE_FOR_avx512vl_loadv2df_mask
:
12120 case CODE_FOR_avx512vl_loadv2di_mask
:
12121 case CODE_FOR_avx512bw_loadv64qi_mask
:
12122 case CODE_FOR_avx512vl_loadv32qi_mask
:
12123 case CODE_FOR_avx512vl_loadv16qi_mask
:
12124 case CODE_FOR_avx512bw_loadv32hi_mask
:
12125 case CODE_FOR_avx512vl_loadv16hi_mask
:
12126 case CODE_FOR_avx512vl_loadv8hi_mask
:
12127 aligned_mem
= true;
12133 case V64QI_FTYPE_PCCHAR_V64QI_UDI
:
12134 case V32QI_FTYPE_PCCHAR_V32QI_USI
:
12135 case V16QI_FTYPE_PCCHAR_V16QI_UHI
:
12136 case V32HI_FTYPE_PCSHORT_V32HI_USI
:
12137 case V16HI_FTYPE_PCSHORT_V16HI_UHI
:
12138 case V8HI_FTYPE_PCSHORT_V8HI_UQI
:
12139 case V16SI_FTYPE_PCINT_V16SI_UHI
:
12140 case V8SI_FTYPE_PCINT_V8SI_UQI
:
12141 case V4SI_FTYPE_PCINT_V4SI_UQI
:
12142 case V8DI_FTYPE_PCINT64_V8DI_UQI
:
12143 case V4DI_FTYPE_PCINT64_V4DI_UQI
:
12144 case V2DI_FTYPE_PCINT64_V2DI_UQI
:
12145 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI
:
12146 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI
:
12147 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI
:
12148 case V16SF_FTYPE_PCFLOAT_V16SF_UHI
:
12149 case V8SF_FTYPE_PCFLOAT_V8SF_UQI
:
12150 case V4SF_FTYPE_PCFLOAT_V4SF_UQI
:
12151 case V8HF_FTYPE_PCFLOAT16_V8HF_UQI
:
12156 case INT_FTYPE_PINT_INT_INT_INT
:
12157 case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT
:
12164 gcc_unreachable ();
12167 gcc_assert (nargs
<= ARRAY_SIZE (xops
));
12169 if (klass
== store
)
12171 arg
= CALL_EXPR_ARG (exp
, 0);
12172 op
= expand_normal (arg
);
12173 gcc_assert (target
== 0);
12176 op
= ix86_zero_extend_to_Pmode (op
);
12177 target
= gen_rtx_MEM (tmode
, op
);
      /* target at this point has just BITS_PER_UNIT MEM_ALIGN
	 on it.  Try to improve it using get_pointer_alignment,
	 and if the special builtin is one that requires strict
	 mode alignment, also from its GET_MODE_ALIGNMENT.
	 Failure to do so could lead to ix86_legitimate_combined_insn
	 rejecting all changes to such insns.  */
12184 unsigned int align
= get_pointer_alignment (arg
);
12185 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (tmode
))
12186 align
= GET_MODE_ALIGNMENT (tmode
);
12187 if (MEM_ALIGN (target
) < align
)
12188 set_mem_align (target
, align
);
12191 target
= force_reg (tmode
, op
);
12199 || !register_operand (target
, tmode
)
12200 || GET_MODE (target
) != tmode
)
12201 target
= gen_reg_rtx (tmode
);
12204 for (i
= 0; i
< nargs
; i
++)
12206 machine_mode mode
= insn_p
->operand
[i
+ 1].mode
;
12208 arg
= CALL_EXPR_ARG (exp
, i
+ arg_adjust
);
12209 op
= expand_normal (arg
);
12213 /* This must be the memory operand. */
12214 op
= ix86_zero_extend_to_Pmode (op
);
12215 op
= gen_rtx_MEM (mode
, op
);
	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
12222 unsigned int align
= get_pointer_alignment (arg
);
12223 if (aligned_mem
&& align
< GET_MODE_ALIGNMENT (mode
))
12224 align
= GET_MODE_ALIGNMENT (mode
);
12225 if (MEM_ALIGN (op
) < align
)
12226 set_mem_align (op
, align
);
12228 else if (i
== constant
)
12230 /* This must be the constant. */
12231 if (!insn_p
->operand
[nargs
].predicate(op
, SImode
))
12233 error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
12239 /* This must be register. */
12240 if (VECTOR_MODE_P (mode
))
12241 op
= safe_vector_operand (op
, mode
);
12243 op
= fixup_modeless_constant (op
, mode
);
	  /* NB: A 3-operand load implies that it is a mask load or
	     v{p}expand*, and that the mask operand should be at the end.
	     Keep the all-ones mask, which would be simplified by the
	     expander.  */
12248 if (nargs
== 3 && i
== 2 && klass
== load
12249 && constm1_operand (op
, mode
)
12250 && insn_p
->operand
[i
].predicate (op
, mode
))
12252 else if (GET_MODE (op
) == mode
|| GET_MODE (op
) == VOIDmode
)
12253 op
= copy_to_mode_reg (mode
, op
);
12256 op
= copy_to_reg (op
);
12257 op
= lowpart_subreg (mode
, op
, GET_MODE (op
));
12267 pat
= GEN_FCN (icode
) (target
);
12270 pat
= GEN_FCN (icode
) (target
, xops
[0]);
12273 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1]);
12276 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2]);
12279 pat
= GEN_FCN (icode
) (target
, xops
[0], xops
[1], xops
[2], xops
[3]);
12282 gcc_unreachable ();
12289 return klass
== store
? 0 : target
;
/* Return the integer constant in ARG.  Constrain it to be in the range
   of the subparts of VEC_TYPE; issue an error if not.  */

static unsigned HOST_WIDE_INT
get_element_number (tree vec_type, tree arg)
{
  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;

  if (!tree_fits_uhwi_p (arg)
      || (elt = tree_to_uhwi (arg), elt > max))
    {
      error ("selector must be an integer constant in the range "
	     "[0, %wi]", max);
      return 0;
    }

  return elt;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_init.  We DO have language-level syntax for this, in
   the form of (type){ init-list }.  Except that since we can't place emms
   instructions from inside the compiler, we can't allow the use of MMX
   registers unless the user explicitly asks for it.  So we do *not* define
   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   we have builtins invoked by mmintrin.h that give us license to emit
   these sorts of instructions.  */
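/* For example (roughly), _mm_set_pi32 (i1, i0) in mmintrin.h expands to
   __builtin_ia32_vec_init_v2si (i0, i1), which arrives here and is lowered
   through ix86_expand_vector_init.  */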
static rtx
ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
{
  machine_mode tmode = TYPE_MODE (type);
  machine_mode inner_mode = GET_MODE_INNER (tmode);
  int i, n_elt = GET_MODE_NUNITS (tmode);
  rtvec v = rtvec_alloc (n_elt);

  gcc_assert (VECTOR_MODE_P (tmode));
  gcc_assert (call_expr_nargs (exp) == n_elt);

  for (i = 0; i < n_elt; ++i)
    {
      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
    }

  if (!target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
  return target;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   had a language-level syntax for referencing vector elements.  */
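/* For example (roughly), _mm_extract_pi16 (v, 2) in xmmintrin.h goes through
   __builtin_ia32_vec_ext_v4hi and reaches this expander with elt == 2.  */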
static rtx
ix86_expand_vec_ext_builtin (tree exp, rtx target)
{
  machine_mode tmode, mode0;
  tree arg0, arg1;
  int elt;
  rtx op0;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);

  op0 = expand_normal (arg0);
  elt = get_element_number (TREE_TYPE (arg0), arg1);

  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  mode0 = TYPE_MODE (TREE_TYPE (arg0));
  gcc_assert (VECTOR_MODE_P (mode0));

  op0 = force_reg (mode0, op0);

  if (optimize || !target || !register_operand (target, tmode))
    target = gen_reg_rtx (tmode);

  ix86_expand_vector_extract (true, target, op0, elt);

  return target;
}
/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   a language-level syntax for referencing vector elements.  */
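/* For example (roughly), _mm_insert_pi16 (v, s, 1) in xmmintrin.h maps to
   __builtin_ia32_vec_set_v4hi (v, s, 1), so this expander copies V, stores S
   into element 1 of the copy, and returns the copy.  */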
static rtx
ix86_expand_vec_set_builtin (tree exp)
{
  machine_mode tmode, mode1;
  tree arg0, arg1, arg2;
  int elt;
  rtx op0, op1, target;

  arg0 = CALL_EXPR_ARG (exp, 0);
  arg1 = CALL_EXPR_ARG (exp, 1);
  arg2 = CALL_EXPR_ARG (exp, 2);

  tmode = TYPE_MODE (TREE_TYPE (arg0));
  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
  gcc_assert (VECTOR_MODE_P (tmode));

  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
  elt = get_element_number (TREE_TYPE (arg0), arg2);

  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);

  op0 = force_reg (tmode, op0);
  op1 = force_reg (mode1, op1);

  /* OP0 is the source of these builtin functions and shouldn't be
     modified.  Create a copy, use it and return it as target.  */
  target = gen_reg_rtx (tmode);
  emit_move_insn (target, op0);
  ix86_expand_vector_set (true, target, op1, elt);

  return target;
}
/* Return true if the necessary isa options for this builtin exist,
   else return false.
   fcode = DECL_MD_FUNCTION_CODE (fndecl);  */
bool
ix86_check_builtin_isa_match (unsigned int fcode,
			      HOST_WIDE_INT* pbisa,
			      HOST_WIDE_INT* pbisa2)
{
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;

  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXVNNI
     (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXIFMA
     (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
       OPTION_MASK_ISA2_AVXNECONVERT
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
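  /* Concretely: a builtin whose descriptor requires AVX512VNNI+AVX512VL can
     also be used when only AVXVNNI is enabled (and vice versa), so both the
     required mask and the enabled mask are widened below to cover whichever
     member of the pair is present.  */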
  if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
       == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
      && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
    isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);

  if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
       == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
      && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
    isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);

  if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
       == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
      && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
    isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);

  if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	== (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
       || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
      && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	   == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
	  || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
    {
      isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
      isa2 |= OPTION_MASK_ISA2_AVXVNNI;
    }

  if ((((bisa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
	== (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
       || (bisa2 & OPTION_MASK_ISA2_AVXIFMA) != 0)
      && (((isa & (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
	   == (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL))
	  || (isa2 & OPTION_MASK_ISA2_AVXIFMA) != 0))
    {
      isa |= OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL;
      isa2 |= OPTION_MASK_ISA2_AVXIFMA;
    }

  if ((((bisa & OPTION_MASK_ISA_AVX512VL) != 0
	&& (bisa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
       && (bisa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0)
      && (((isa & OPTION_MASK_ISA_AVX512VL) != 0
	   && (isa2 & OPTION_MASK_ISA2_AVX512BF16) != 0)
	  || (isa2 & OPTION_MASK_ISA2_AVXNECONVERT) != 0))
    {
      isa |= OPTION_MASK_ISA_AVX512VL;
      isa2 |= OPTION_MASK_ISA2_AVXNECONVERT | OPTION_MASK_ISA2_AVX512BF16;
    }

  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
      /* __builtin_ia32_maskmovq requires MMX registers.  */
      && fcode != IX86_BUILTIN_MASKMOVQ)
    {
      bisa &= ~OPTION_MASK_ISA_MMX;
      bisa |= OPTION_MASK_ISA_SSE2;
    }

  if (pbisa)
    *pbisa = bisa;
  if (pbisa2)
    *pbisa2 = bisa2;

  return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
}
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

rtx
ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
		     machine_mode mode, int ignore)
{
  size_t i;
  enum insn_code icode, icode2;
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  tree arg0, arg1, arg2, arg3, arg4;
  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
  machine_mode mode0, mode1, mode2, mode3, mode4;
  unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
  HOST_WIDE_INT bisa, bisa2;
12525 /* For CPU builtins that can be folded, fold first and expand the fold. */
12528 case IX86_BUILTIN_CPU_INIT
:
12530 /* Make it call __cpu_indicator_init in libgcc. */
12531 tree call_expr
, fndecl
, type
;
12532 type
= build_function_type_list (integer_type_node
, NULL_TREE
);
12533 fndecl
= build_fn_decl ("__cpu_indicator_init", type
);
12534 call_expr
= build_call_expr (fndecl
, 0);
12535 return expand_expr (call_expr
, target
, mode
, EXPAND_NORMAL
);
12537 case IX86_BUILTIN_CPU_IS
:
12538 case IX86_BUILTIN_CPU_SUPPORTS
:
12540 tree arg0
= CALL_EXPR_ARG (exp
, 0);
12541 tree fold_expr
= fold_builtin_cpu (fndecl
, &arg0
);
12542 gcc_assert (fold_expr
!= NULL_TREE
);
12543 return expand_expr (fold_expr
, target
, mode
, EXPAND_NORMAL
);
12547 if (!ix86_check_builtin_isa_match (fcode
, &bisa
, &bisa2
))
12549 bool add_abi_p
= bisa
& OPTION_MASK_ISA_64BIT
;
12550 if (TARGET_ABI_X32
)
12551 bisa
|= OPTION_MASK_ABI_X32
;
12553 bisa
|= OPTION_MASK_ABI_64
;
12554 char *opts
= ix86_target_string (bisa
, bisa2
, 0, 0, NULL
, NULL
,
12555 (enum fpmath_unit
) 0,
12556 (enum prefer_vector_width
) 0,
12557 PVW_NONE
, PVW_NONE
,
12560 error ("%qE needs unknown isa option", fndecl
);
12563 gcc_assert (opts
!= NULL
);
12564 error ("%qE needs isa option %s", fndecl
, opts
);
12567 return expand_call (exp
, target
, ignore
);
12572 case IX86_BUILTIN_MASKMOVQ
:
12573 case IX86_BUILTIN_MASKMOVDQU
:
12574 icode
= (fcode
== IX86_BUILTIN_MASKMOVQ
12575 ? CODE_FOR_mmx_maskmovq
12576 : CODE_FOR_sse2_maskmovdqu
);
12577 /* Note the arg order is different from the operand order. */
12578 arg1
= CALL_EXPR_ARG (exp
, 0);
12579 arg2
= CALL_EXPR_ARG (exp
, 1);
12580 arg0
= CALL_EXPR_ARG (exp
, 2);
12581 op0
= expand_normal (arg0
);
12582 op1
= expand_normal (arg1
);
12583 op2
= expand_normal (arg2
);
12584 mode0
= insn_data
[icode
].operand
[0].mode
;
12585 mode1
= insn_data
[icode
].operand
[1].mode
;
12586 mode2
= insn_data
[icode
].operand
[2].mode
;
12588 op0
= ix86_zero_extend_to_Pmode (op0
);
12589 op0
= gen_rtx_MEM (mode1
, op0
);
12591 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
12592 op0
= copy_to_mode_reg (mode0
, op0
);
12593 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
12594 op1
= copy_to_mode_reg (mode1
, op1
);
12595 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
12596 op2
= copy_to_mode_reg (mode2
, op2
);
12597 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
12603 case IX86_BUILTIN_LDMXCSR
:
12604 op0
= expand_normal (CALL_EXPR_ARG (exp
, 0));
12605 target
= assign_386_stack_local (SImode
, SLOT_TEMP
);
12606 emit_move_insn (target
, op0
);
12607 emit_insn (gen_sse_ldmxcsr (target
));
12610 case IX86_BUILTIN_STMXCSR
:
12611 target
= assign_386_stack_local (SImode
, SLOT_TEMP
);
12612 emit_insn (gen_sse_stmxcsr (target
));
12613 return copy_to_mode_reg (SImode
, target
);
12615 case IX86_BUILTIN_CLFLUSH
:
12616 arg0
= CALL_EXPR_ARG (exp
, 0);
12617 op0
= expand_normal (arg0
);
12618 icode
= CODE_FOR_sse2_clflush
;
12619 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12620 op0
= ix86_zero_extend_to_Pmode (op0
);
12622 emit_insn (gen_sse2_clflush (op0
));
12625 case IX86_BUILTIN_CLWB
:
12626 arg0
= CALL_EXPR_ARG (exp
, 0);
12627 op0
= expand_normal (arg0
);
12628 icode
= CODE_FOR_clwb
;
12629 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12630 op0
= ix86_zero_extend_to_Pmode (op0
);
12632 emit_insn (gen_clwb (op0
));
12635 case IX86_BUILTIN_CLFLUSHOPT
:
12636 arg0
= CALL_EXPR_ARG (exp
, 0);
12637 op0
= expand_normal (arg0
);
12638 icode
= CODE_FOR_clflushopt
;
12639 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12640 op0
= ix86_zero_extend_to_Pmode (op0
);
12642 emit_insn (gen_clflushopt (op0
));
12645 case IX86_BUILTIN_MONITOR
:
12646 case IX86_BUILTIN_MONITORX
:
12647 arg0
= CALL_EXPR_ARG (exp
, 0);
12648 arg1
= CALL_EXPR_ARG (exp
, 1);
12649 arg2
= CALL_EXPR_ARG (exp
, 2);
12650 op0
= expand_normal (arg0
);
12651 op1
= expand_normal (arg1
);
12652 op2
= expand_normal (arg2
);
12654 op0
= ix86_zero_extend_to_Pmode (op0
);
12656 op1
= copy_to_mode_reg (SImode
, op1
);
12658 op2
= copy_to_mode_reg (SImode
, op2
);
12660 emit_insn (fcode
== IX86_BUILTIN_MONITOR
12661 ? gen_sse3_monitor (Pmode
, op0
, op1
, op2
)
12662 : gen_monitorx (Pmode
, op0
, op1
, op2
));
12665 case IX86_BUILTIN_MWAIT
:
12666 arg0
= CALL_EXPR_ARG (exp
, 0);
12667 arg1
= CALL_EXPR_ARG (exp
, 1);
12668 op0
= expand_normal (arg0
);
12669 op1
= expand_normal (arg1
);
12671 op0
= copy_to_mode_reg (SImode
, op0
);
12673 op1
= copy_to_mode_reg (SImode
, op1
);
12674 emit_insn (gen_sse3_mwait (op0
, op1
));
12677 case IX86_BUILTIN_MWAITX
:
12678 arg0
= CALL_EXPR_ARG (exp
, 0);
12679 arg1
= CALL_EXPR_ARG (exp
, 1);
12680 arg2
= CALL_EXPR_ARG (exp
, 2);
12681 op0
= expand_normal (arg0
);
12682 op1
= expand_normal (arg1
);
12683 op2
= expand_normal (arg2
);
12685 op0
= copy_to_mode_reg (SImode
, op0
);
12687 op1
= copy_to_mode_reg (SImode
, op1
);
12689 op2
= copy_to_mode_reg (SImode
, op2
);
12690 emit_insn (gen_mwaitx (op0
, op1
, op2
));
12693 case IX86_BUILTIN_UMONITOR
:
12694 arg0
= CALL_EXPR_ARG (exp
, 0);
12695 op0
= expand_normal (arg0
);
12697 op0
= ix86_zero_extend_to_Pmode (op0
);
12698 emit_insn (gen_umonitor (Pmode
, op0
));
12701 case IX86_BUILTIN_UMWAIT
:
12702 case IX86_BUILTIN_TPAUSE
:
12703 arg0
= CALL_EXPR_ARG (exp
, 0);
12704 arg1
= CALL_EXPR_ARG (exp
, 1);
12705 op0
= expand_normal (arg0
);
12706 op1
= expand_normal (arg1
);
12709 op0
= copy_to_mode_reg (SImode
, op0
);
12711 op1
= force_reg (DImode
, op1
);
12715 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
12716 NULL
, 1, OPTAB_DIRECT
);
12719 case IX86_BUILTIN_UMWAIT
:
12720 icode
= CODE_FOR_umwait_rex64
;
12722 case IX86_BUILTIN_TPAUSE
:
12723 icode
= CODE_FOR_tpause_rex64
;
12726 gcc_unreachable ();
12729 op2
= gen_lowpart (SImode
, op2
);
12730 op1
= gen_lowpart (SImode
, op1
);
12731 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
12737 case IX86_BUILTIN_UMWAIT
:
12738 icode
= CODE_FOR_umwait
;
12740 case IX86_BUILTIN_TPAUSE
:
12741 icode
= CODE_FOR_tpause
;
12744 gcc_unreachable ();
12746 pat
= GEN_FCN (icode
) (op0
, op1
);
12755 || !register_operand (target
, QImode
))
12756 target
= gen_reg_rtx (QImode
);
12758 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12760 emit_insn (gen_rtx_SET (target
, pat
));
12764 case IX86_BUILTIN_TESTUI
:
12765 emit_insn (gen_testui ());
12768 || !register_operand (target
, QImode
))
12769 target
= gen_reg_rtx (QImode
);
12771 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
12773 emit_insn (gen_rtx_SET (target
, pat
));
12777 case IX86_BUILTIN_CLZERO
:
12778 arg0
= CALL_EXPR_ARG (exp
, 0);
12779 op0
= expand_normal (arg0
);
12781 op0
= ix86_zero_extend_to_Pmode (op0
);
12782 emit_insn (gen_clzero (Pmode
, op0
));
12785 case IX86_BUILTIN_CLDEMOTE
:
12786 arg0
= CALL_EXPR_ARG (exp
, 0);
12787 op0
= expand_normal (arg0
);
12788 icode
= CODE_FOR_cldemote
;
12789 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
12790 op0
= ix86_zero_extend_to_Pmode (op0
);
12792 emit_insn (gen_cldemote (op0
));
12795 case IX86_BUILTIN_LOADIWKEY
:
12797 arg0
= CALL_EXPR_ARG (exp
, 0);
12798 arg1
= CALL_EXPR_ARG (exp
, 1);
12799 arg2
= CALL_EXPR_ARG (exp
, 2);
12800 arg3
= CALL_EXPR_ARG (exp
, 3);
12802 op0
= expand_normal (arg0
);
12803 op1
= expand_normal (arg1
);
12804 op2
= expand_normal (arg2
);
12805 op3
= expand_normal (arg3
);
12808 op0
= copy_to_mode_reg (V2DImode
, op0
);
12810 op1
= copy_to_mode_reg (V2DImode
, op1
);
12812 op2
= copy_to_mode_reg (V2DImode
, op2
);
12814 op3
= copy_to_mode_reg (SImode
, op3
);
12816 emit_insn (gen_loadiwkey (op0
, op1
, op2
, op3
));
12821 case IX86_BUILTIN_AESDEC128KLU8
:
12822 icode
= CODE_FOR_aesdec128klu8
;
12823 goto aesdecenc_expand
;
12825 case IX86_BUILTIN_AESDEC256KLU8
:
12826 icode
= CODE_FOR_aesdec256klu8
;
12827 goto aesdecenc_expand
;
12829 case IX86_BUILTIN_AESENC128KLU8
:
12830 icode
= CODE_FOR_aesenc128klu8
;
12831 goto aesdecenc_expand
;
12833 case IX86_BUILTIN_AESENC256KLU8
:
12834 icode
= CODE_FOR_aesenc256klu8
;
12838 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i *odata
12839 arg1
= CALL_EXPR_ARG (exp
, 1); // __m128i idata
12840 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
12842 op0
= expand_normal (arg0
);
12843 op1
= expand_normal (arg1
);
12844 op2
= expand_normal (arg2
);
12846 if (!address_operand (op0
, V2DImode
))
12848 op0
= convert_memory_address (Pmode
, op0
);
12849 op0
= copy_addr_to_reg (op0
);
12851 op0
= gen_rtx_MEM (V2DImode
, op0
);
12854 op1
= copy_to_mode_reg (V2DImode
, op1
);
12856 if (!address_operand (op2
, VOIDmode
))
12858 op2
= convert_memory_address (Pmode
, op2
);
12859 op2
= copy_addr_to_reg (op2
);
12861 op2
= gen_rtx_MEM (BLKmode
, op2
);
12863 emit_insn (GEN_FCN (icode
) (op1
, op1
, op2
));
12866 target
= gen_reg_rtx (QImode
);
12868 /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
12869 error occurs. Then the output should be cleared for safety. */
12870 rtx_code_label
*ok_label
;
12873 tmp
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
12874 pat
= gen_rtx_EQ (QImode
, tmp
, const0_rtx
);
12875 ok_label
= gen_label_rtx ();
12876 emit_cmp_and_jump_insns (tmp
, const0_rtx
, NE
, 0, GET_MODE (tmp
),
12878 /* Usually the runtime error seldom occur, so predict OK path as
12879 hotspot to optimize it as fallthrough block. */
12880 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
12882 emit_insn (gen_rtx_SET (op1
, const0_rtx
));
12884 emit_label (ok_label
);
12885 emit_insn (gen_rtx_SET (target
, pat
));
12886 emit_insn (gen_rtx_SET (op0
, op1
));
12890 case IX86_BUILTIN_AESDECWIDE128KLU8
:
12891 icode
= CODE_FOR_aesdecwide128klu8
;
12892 goto wideaesdecenc_expand
;
12894 case IX86_BUILTIN_AESDECWIDE256KLU8
:
12895 icode
= CODE_FOR_aesdecwide256klu8
;
12896 goto wideaesdecenc_expand
;
12898 case IX86_BUILTIN_AESENCWIDE128KLU8
:
12899 icode
= CODE_FOR_aesencwide128klu8
;
12900 goto wideaesdecenc_expand
;
12902 case IX86_BUILTIN_AESENCWIDE256KLU8
:
12903 icode
= CODE_FOR_aesencwide256klu8
;
12905 wideaesdecenc_expand
:
12910 arg0
= CALL_EXPR_ARG (exp
, 0); // __m128i * odata
12911 arg1
= CALL_EXPR_ARG (exp
, 1); // const __m128i * idata
12912 arg2
= CALL_EXPR_ARG (exp
, 2); // const void *p
12914 op0
= expand_normal (arg0
);
12915 op1
= expand_normal (arg1
);
12916 op2
= expand_normal (arg2
);
12918 if (!address_operand (op2
, VOIDmode
))
12920 op2
= convert_memory_address (Pmode
, op2
);
12921 op2
= copy_addr_to_reg (op2
);
12923 op2
= gen_rtx_MEM (BLKmode
, op2
);
12925 for (i
= 0; i
< 8; i
++)
12927 xmm_regs
[i
] = gen_rtx_REG (V2DImode
, GET_SSE_REGNO (i
));
12929 op
= gen_rtx_MEM (V2DImode
,
12930 plus_constant (Pmode
, op1
, (i
* 16)));
12932 emit_move_insn (xmm_regs
[i
], op
);
12935 emit_insn (GEN_FCN (icode
) (op2
));
12938 target
= gen_reg_rtx (QImode
);
12940 tmp
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
12941 pat
= gen_rtx_EQ (QImode
, tmp
, const0_rtx
);
12942 ok_label
= gen_label_rtx ();
12943 emit_cmp_and_jump_insns (tmp
, const0_rtx
, NE
, 0, GET_MODE (tmp
),
12945 predict_jump (REG_BR_PROB_BASE
* 90 / 100);
12947 for (i
= 0; i
< 8; i
++)
12948 emit_insn (gen_rtx_SET (xmm_regs
[i
], const0_rtx
));
12950 emit_label (ok_label
);
12951 emit_insn (gen_rtx_SET (target
, pat
));
12953 for (i
= 0; i
< 8; i
++)
12955 op
= gen_rtx_MEM (V2DImode
,
12956 plus_constant (Pmode
, op0
, (i
* 16)));
12957 emit_move_insn (op
, xmm_regs
[i
]);
    case IX86_BUILTIN_ENCODEKEY128U32:
      {
	rtx op, xmm_regs[7];

	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
	arg2 = CALL_EXPR_ARG (exp, 2); // void *h

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);

	if (!register_operand (op0, SImode))
	  op0 = copy_to_mode_reg (SImode, op0);

	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
	emit_move_insn (op, op1);

	for (i = 0; i < 3; i++)
	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

	if (target == 0)
	  target = gen_reg_rtx (SImode);

	emit_insn (gen_encodekey128u32 (target, op0));

	for (i = 0; i < 3; i++)
	  {
	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op2, (i * 16)));
	    emit_move_insn (op, xmm_regs[i]);
	  }

	return target;
      }
    case IX86_BUILTIN_ENCODEKEY256U32:
      {
	rtx op, xmm_regs[7];

	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
	arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
	arg3 = CALL_EXPR_ARG (exp, 3); // void *h

	op0 = expand_normal (arg0);
	op1 = expand_normal (arg1);
	op2 = expand_normal (arg2);
	op3 = expand_normal (arg3);

	if (!register_operand (op0, SImode))
	  op0 = copy_to_mode_reg (SImode, op0);

	/* Force to use xmm0, xmm1 for keylow, keyhi.  */
	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
	emit_move_insn (op, op1);
	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
	emit_move_insn (op, op2);

	for (i = 0; i < 4; i++)
	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));

	if (target == 0)
	  target = gen_reg_rtx (SImode);

	emit_insn (gen_encodekey256u32 (target, op0));

	for (i = 0; i < 4; i++)
	  {
	    op = gen_rtx_MEM (V2DImode,
			      plus_constant (Pmode, op3, (i * 16)));
	    emit_move_insn (op, xmm_regs[i]);
	  }

	return target;
      }
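      /* Both ENCODEKEY expansions force the key material into the fixed
	 registers the ISA expects (xmm0 for the 128-bit key, xmm0/xmm1
	 for the two 256-bit key halves) and then copy the produced
	 handle out of xmm0..xmm2 (resp. xmm0..xmm3) in 16-byte chunks
	 through the pointer argument.  A hedged usage sketch, assuming
	 the usual <keylockerintrin.h> prototype:

	     unsigned int htype_out = _mm_encodekey128_u32 (htype, key, h);  */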
    case IX86_BUILTIN_VEC_INIT_V2SI:
    case IX86_BUILTIN_VEC_INIT_V4HI:
    case IX86_BUILTIN_VEC_INIT_V8QI:
      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);

    case IX86_BUILTIN_VEC_EXT_V2DF:
    case IX86_BUILTIN_VEC_EXT_V2DI:
    case IX86_BUILTIN_VEC_EXT_V4SF:
    case IX86_BUILTIN_VEC_EXT_V4SI:
    case IX86_BUILTIN_VEC_EXT_V8HI:
    case IX86_BUILTIN_VEC_EXT_V2SI:
    case IX86_BUILTIN_VEC_EXT_V4HI:
    case IX86_BUILTIN_VEC_EXT_V16QI:
      return ix86_expand_vec_ext_builtin (exp, target);

    case IX86_BUILTIN_VEC_SET_V2DI:
    case IX86_BUILTIN_VEC_SET_V4SF:
    case IX86_BUILTIN_VEC_SET_V4SI:
    case IX86_BUILTIN_VEC_SET_V8HI:
    case IX86_BUILTIN_VEC_SET_V4HI:
    case IX86_BUILTIN_VEC_SET_V16QI:
      return ix86_expand_vec_set_builtin (exp);

    case IX86_BUILTIN_NANQ:
    case IX86_BUILTIN_NANSQ:
      return expand_call (exp, target, ignore);
    case IX86_BUILTIN_RDPID:

      op0 = gen_reg_rtx (word_mode);

      if (TARGET_64BIT)
	{
	  insn = gen_rdpid_rex64 (op0);
	  op0 = convert_to_mode (SImode, op0, 1);
	}
      else
	insn = gen_rdpid (op0);

      emit_insn (insn);

      if (target == 0
	  || !register_operand (target, SImode))
	target = gen_reg_rtx (SImode);

      emit_move_insn (target, op0);
      return target;
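      /* RDPID delivers its result in the natural word mode of the
	 target (a 64-bit register under -m64), so the value is produced
	 in word_mode first and only then narrowed to the SImode value
	 the builtin returns.  */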
    case IX86_BUILTIN_2INTERSECTD512:
    case IX86_BUILTIN_2INTERSECTQ512:
    case IX86_BUILTIN_2INTERSECTD256:
    case IX86_BUILTIN_2INTERSECTQ256:
    case IX86_BUILTIN_2INTERSECTD128:
    case IX86_BUILTIN_2INTERSECTQ128:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}

      switch (fcode)
	{
	case IX86_BUILTIN_2INTERSECTD512:
	  mode4 = P2HImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ512:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
	  break;
	case IX86_BUILTIN_2INTERSECTD256:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ256:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
	  break;
	case IX86_BUILTIN_2INTERSECTD128:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
	  break;
	case IX86_BUILTIN_2INTERSECTQ128:
	  mode4 = P2QImode;
	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
	  break;
	default:
	  gcc_unreachable ();
	}

      mode2 = insn_data[icode].operand[1].mode;
      mode3 = insn_data[icode].operand[2].mode;
      if (!insn_data[icode].operand[1].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);
      if (!insn_data[icode].operand[2].predicate (op3, mode3))
	op3 = copy_to_mode_reg (mode3, op3);

      op4 = gen_reg_rtx (mode4);
      emit_insn (GEN_FCN (icode) (op4, op2, op3));
      mode0 = mode4 == P2HImode ? HImode : QImode;
      emit_move_insn (gen_rtx_MEM (mode0, op0),
		      gen_lowpart (mode0, op4));
      emit_move_insn (gen_rtx_MEM (mode0, op1),
		      gen_highpart (mode0, op4));

      return 0;
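      /* VP2INTERSECT produces a pair of mask registers; the expansion
	 models the pair as one P2HImode/P2QImode pseudo (op4) and then
	 writes gen_lowpart / gen_highpart of it through the two mask
	 pointers, so each output mask lands in memory as an HImode or
	 QImode value depending on the vector element count.  */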
13159 case IX86_BUILTIN_RDPMC
:
13160 case IX86_BUILTIN_RDTSC
:
13161 case IX86_BUILTIN_RDTSCP
:
13162 case IX86_BUILTIN_XGETBV
:
13164 op0
= gen_reg_rtx (DImode
);
13165 op1
= gen_reg_rtx (DImode
);
13167 if (fcode
== IX86_BUILTIN_RDPMC
)
13169 arg0
= CALL_EXPR_ARG (exp
, 0);
13170 op2
= expand_normal (arg0
);
13171 if (!register_operand (op2
, SImode
))
13172 op2
= copy_to_mode_reg (SImode
, op2
);
13174 insn
= (TARGET_64BIT
13175 ? gen_rdpmc_rex64 (op0
, op1
, op2
)
13176 : gen_rdpmc (op0
, op2
));
13179 else if (fcode
== IX86_BUILTIN_XGETBV
)
13181 arg0
= CALL_EXPR_ARG (exp
, 0);
13182 op2
= expand_normal (arg0
);
13183 if (!register_operand (op2
, SImode
))
13184 op2
= copy_to_mode_reg (SImode
, op2
);
13186 insn
= (TARGET_64BIT
13187 ? gen_xgetbv_rex64 (op0
, op1
, op2
)
13188 : gen_xgetbv (op0
, op2
));
13191 else if (fcode
== IX86_BUILTIN_RDTSC
)
13193 insn
= (TARGET_64BIT
13194 ? gen_rdtsc_rex64 (op0
, op1
)
13195 : gen_rdtsc (op0
));
13200 op2
= gen_reg_rtx (SImode
);
13202 insn
= (TARGET_64BIT
13203 ? gen_rdtscp_rex64 (op0
, op1
, op2
)
13204 : gen_rdtscp (op0
, op2
));
13207 arg0
= CALL_EXPR_ARG (exp
, 0);
13208 op4
= expand_normal (arg0
);
13209 if (!address_operand (op4
, VOIDmode
))
13211 op4
= convert_memory_address (Pmode
, op4
);
13212 op4
= copy_addr_to_reg (op4
);
13214 emit_move_insn (gen_rtx_MEM (SImode
, op4
), op2
);
13218 || !register_operand (target
, DImode
))
13219 target
= gen_reg_rtx (DImode
);
13223 op1
= expand_simple_binop (DImode
, ASHIFT
, op1
, GEN_INT (32),
13224 op1
, 1, OPTAB_DIRECT
);
13225 op0
= expand_simple_binop (DImode
, IOR
, op0
, op1
,
13226 op0
, 1, OPTAB_DIRECT
);
13229 emit_move_insn (target
, op0
);
13232 case IX86_BUILTIN_ENQCMD
:
13233 case IX86_BUILTIN_ENQCMDS
:
13234 case IX86_BUILTIN_MOVDIR64B
:
13236 arg0
= CALL_EXPR_ARG (exp
, 0);
13237 arg1
= CALL_EXPR_ARG (exp
, 1);
13238 op0
= expand_normal (arg0
);
13239 op1
= expand_normal (arg1
);
13241 op0
= ix86_zero_extend_to_Pmode (op0
);
13242 if (!address_operand (op1
, VOIDmode
))
13244 op1
= convert_memory_address (Pmode
, op1
);
13245 op1
= copy_addr_to_reg (op1
);
13247 op1
= gen_rtx_MEM (XImode
, op1
);
13249 if (fcode
== IX86_BUILTIN_MOVDIR64B
)
13251 emit_insn (gen_movdir64b (Pmode
, op0
, op1
));
13257 || !register_operand (target
, SImode
))
13258 target
= gen_reg_rtx (SImode
);
13260 emit_move_insn (target
, const0_rtx
);
13261 target
= gen_rtx_SUBREG (QImode
, target
, 0);
13263 int unspecv
= (fcode
== IX86_BUILTIN_ENQCMD
13265 : UNSPECV_ENQCMDS
);
13266 icode
= code_for_enqcmd (unspecv
, Pmode
);
13267 emit_insn (GEN_FCN (icode
) (op0
, op1
));
13270 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode
, target
),
13271 gen_rtx_fmt_ee (EQ
, QImode
,
13272 gen_rtx_REG (CCZmode
, FLAGS_REG
),
13274 return SUBREG_REG (target
);
13277 case IX86_BUILTIN_FXSAVE
:
13278 case IX86_BUILTIN_FXRSTOR
:
13279 case IX86_BUILTIN_FXSAVE64
:
13280 case IX86_BUILTIN_FXRSTOR64
:
13281 case IX86_BUILTIN_FNSTENV
:
13282 case IX86_BUILTIN_FLDENV
:
13286 case IX86_BUILTIN_FXSAVE
:
13287 icode
= CODE_FOR_fxsave
;
13289 case IX86_BUILTIN_FXRSTOR
:
13290 icode
= CODE_FOR_fxrstor
;
13292 case IX86_BUILTIN_FXSAVE64
:
13293 icode
= CODE_FOR_fxsave64
;
13295 case IX86_BUILTIN_FXRSTOR64
:
13296 icode
= CODE_FOR_fxrstor64
;
13298 case IX86_BUILTIN_FNSTENV
:
13299 icode
= CODE_FOR_fnstenv
;
13301 case IX86_BUILTIN_FLDENV
:
13302 icode
= CODE_FOR_fldenv
;
13305 gcc_unreachable ();
13308 arg0
= CALL_EXPR_ARG (exp
, 0);
13309 op0
= expand_normal (arg0
);
13311 if (!address_operand (op0
, VOIDmode
))
13313 op0
= convert_memory_address (Pmode
, op0
);
13314 op0
= copy_addr_to_reg (op0
);
13316 op0
= gen_rtx_MEM (mode0
, op0
);
13318 pat
= GEN_FCN (icode
) (op0
);
13323 case IX86_BUILTIN_XSETBV
:
13324 arg0
= CALL_EXPR_ARG (exp
, 0);
13325 arg1
= CALL_EXPR_ARG (exp
, 1);
13326 op0
= expand_normal (arg0
);
13327 op1
= expand_normal (arg1
);
13330 op0
= copy_to_mode_reg (SImode
, op0
);
13332 op1
= force_reg (DImode
, op1
);
13336 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
13337 NULL
, 1, OPTAB_DIRECT
);
13339 icode
= CODE_FOR_xsetbv_rex64
;
13341 op2
= gen_lowpart (SImode
, op2
);
13342 op1
= gen_lowpart (SImode
, op1
);
13343 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
13347 icode
= CODE_FOR_xsetbv
;
13349 pat
= GEN_FCN (icode
) (op0
, op1
);
13355 case IX86_BUILTIN_XSAVE
:
13356 case IX86_BUILTIN_XRSTOR
:
13357 case IX86_BUILTIN_XSAVE64
:
13358 case IX86_BUILTIN_XRSTOR64
:
13359 case IX86_BUILTIN_XSAVEOPT
:
13360 case IX86_BUILTIN_XSAVEOPT64
:
13361 case IX86_BUILTIN_XSAVES
:
13362 case IX86_BUILTIN_XRSTORS
:
13363 case IX86_BUILTIN_XSAVES64
:
13364 case IX86_BUILTIN_XRSTORS64
:
13365 case IX86_BUILTIN_XSAVEC
:
13366 case IX86_BUILTIN_XSAVEC64
:
13367 arg0
= CALL_EXPR_ARG (exp
, 0);
13368 arg1
= CALL_EXPR_ARG (exp
, 1);
13369 op0
= expand_normal (arg0
);
13370 op1
= expand_normal (arg1
);
13372 if (!address_operand (op0
, VOIDmode
))
13374 op0
= convert_memory_address (Pmode
, op0
);
13375 op0
= copy_addr_to_reg (op0
);
13377 op0
= gen_rtx_MEM (BLKmode
, op0
);
13379 op1
= force_reg (DImode
, op1
);
13383 op2
= expand_simple_binop (DImode
, LSHIFTRT
, op1
, GEN_INT (32),
13384 NULL
, 1, OPTAB_DIRECT
);
13387 case IX86_BUILTIN_XSAVE
:
13388 icode
= CODE_FOR_xsave_rex64
;
13390 case IX86_BUILTIN_XRSTOR
:
13391 icode
= CODE_FOR_xrstor_rex64
;
13393 case IX86_BUILTIN_XSAVE64
:
13394 icode
= CODE_FOR_xsave64
;
13396 case IX86_BUILTIN_XRSTOR64
:
13397 icode
= CODE_FOR_xrstor64
;
13399 case IX86_BUILTIN_XSAVEOPT
:
13400 icode
= CODE_FOR_xsaveopt_rex64
;
13402 case IX86_BUILTIN_XSAVEOPT64
:
13403 icode
= CODE_FOR_xsaveopt64
;
13405 case IX86_BUILTIN_XSAVES
:
13406 icode
= CODE_FOR_xsaves_rex64
;
13408 case IX86_BUILTIN_XRSTORS
:
13409 icode
= CODE_FOR_xrstors_rex64
;
13411 case IX86_BUILTIN_XSAVES64
:
13412 icode
= CODE_FOR_xsaves64
;
13414 case IX86_BUILTIN_XRSTORS64
:
13415 icode
= CODE_FOR_xrstors64
;
13417 case IX86_BUILTIN_XSAVEC
:
13418 icode
= CODE_FOR_xsavec_rex64
;
13420 case IX86_BUILTIN_XSAVEC64
:
13421 icode
= CODE_FOR_xsavec64
;
13424 gcc_unreachable ();
13427 op2
= gen_lowpart (SImode
, op2
);
13428 op1
= gen_lowpart (SImode
, op1
);
13429 pat
= GEN_FCN (icode
) (op0
, op1
, op2
);
13435 case IX86_BUILTIN_XSAVE
:
13436 icode
= CODE_FOR_xsave
;
13438 case IX86_BUILTIN_XRSTOR
:
13439 icode
= CODE_FOR_xrstor
;
13441 case IX86_BUILTIN_XSAVEOPT
:
13442 icode
= CODE_FOR_xsaveopt
;
13444 case IX86_BUILTIN_XSAVES
:
13445 icode
= CODE_FOR_xsaves
;
13447 case IX86_BUILTIN_XRSTORS
:
13448 icode
= CODE_FOR_xrstors
;
13450 case IX86_BUILTIN_XSAVEC
:
13451 icode
= CODE_FOR_xsavec
;
13454 gcc_unreachable ();
13456 pat
= GEN_FCN (icode
) (op0
, op1
);
13463 case IX86_BUILTIN_LLWPCB
:
13464 arg0
= CALL_EXPR_ARG (exp
, 0);
13465 op0
= expand_normal (arg0
);
13467 if (!register_operand (op0
, Pmode
))
13468 op0
= ix86_zero_extend_to_Pmode (op0
);
13469 emit_insn (gen_lwp_llwpcb (Pmode
, op0
));
13472 case IX86_BUILTIN_SLWPCB
:
13474 || !register_operand (target
, Pmode
))
13475 target
= gen_reg_rtx (Pmode
);
13476 emit_insn (gen_lwp_slwpcb (Pmode
, target
));
13479 case IX86_BUILTIN_LWPVAL32
:
13480 case IX86_BUILTIN_LWPVAL64
:
13481 case IX86_BUILTIN_LWPINS32
:
13482 case IX86_BUILTIN_LWPINS64
:
13483 mode
= ((fcode
== IX86_BUILTIN_LWPVAL32
13484 || fcode
== IX86_BUILTIN_LWPINS32
)
13485 ? SImode
: DImode
);
13487 if (fcode
== IX86_BUILTIN_LWPVAL32
13488 || fcode
== IX86_BUILTIN_LWPVAL64
)
13489 icode
= code_for_lwp_lwpval (mode
);
13491 icode
= code_for_lwp_lwpins (mode
);
13493 arg0
= CALL_EXPR_ARG (exp
, 0);
13494 arg1
= CALL_EXPR_ARG (exp
, 1);
13495 arg2
= CALL_EXPR_ARG (exp
, 2);
13496 op0
= expand_normal (arg0
);
13497 op1
= expand_normal (arg1
);
13498 op2
= expand_normal (arg2
);
13499 mode0
= insn_data
[icode
].operand
[0].mode
;
13501 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
13502 op0
= copy_to_mode_reg (mode0
, op0
);
13503 if (!insn_data
[icode
].operand
[1].predicate (op1
, SImode
))
13504 op1
= copy_to_mode_reg (SImode
, op1
);
13506 if (!CONST_INT_P (op2
))
13508 error ("the last argument must be a 32-bit immediate");
13512 emit_insn (GEN_FCN (icode
) (op0
, op1
, op2
));
13514 if (fcode
== IX86_BUILTIN_LWPINS32
13515 || fcode
== IX86_BUILTIN_LWPINS64
)
13518 || !nonimmediate_operand (target
, QImode
))
13519 target
= gen_reg_rtx (QImode
);
13521 pat
= gen_rtx_EQ (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
13523 emit_insn (gen_rtx_SET (target
, pat
));
13530 case IX86_BUILTIN_BEXTRI32
:
13531 case IX86_BUILTIN_BEXTRI64
:
13532 mode
= (fcode
== IX86_BUILTIN_BEXTRI32
? SImode
: DImode
);
13534 arg0
= CALL_EXPR_ARG (exp
, 0);
13535 arg1
= CALL_EXPR_ARG (exp
, 1);
13536 op0
= expand_normal (arg0
);
13537 op1
= expand_normal (arg1
);
13539 if (!CONST_INT_P (op1
))
13541 error ("last argument must be an immediate");
13546 unsigned char lsb_index
= UINTVAL (op1
);
13547 unsigned char length
= UINTVAL (op1
) >> 8;
13549 unsigned char bitsize
= GET_MODE_BITSIZE (mode
);
13551 icode
= code_for_tbm_bextri (mode
);
13553 mode1
= insn_data
[icode
].operand
[1].mode
;
13554 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode1
))
13555 op0
= copy_to_mode_reg (mode1
, op0
);
13557 mode0
= insn_data
[icode
].operand
[0].mode
;
13559 || !register_operand (target
, mode0
))
13560 target
= gen_reg_rtx (mode0
);
13562 if (length
== 0 || lsb_index
>= bitsize
)
13564 emit_move_insn (target
, const0_rtx
);
13568 if (length
+ lsb_index
> bitsize
)
13569 length
= bitsize
- lsb_index
;
13571 op1
= GEN_INT (length
);
13572 op2
= GEN_INT (lsb_index
);
13574 emit_insn (GEN_FCN (icode
) (target
, op0
, op1
, op2
));
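      /* The BEXTRI control word packs bits [7:0] = starting bit and
	 bits [15:8] = field length, which is how lsb_index and length
	 are decoded above.  Worked example: an immediate of 0x0804
	 extracts 8 bits starting at bit 4.  Requests that select nothing
	 fold to zero, and over-long fields are clamped to the operand
	 width before the insn is emitted.  */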
13578 case IX86_BUILTIN_RDRAND16_STEP
:
13582 case IX86_BUILTIN_RDRAND32_STEP
:
13586 case IX86_BUILTIN_RDRAND64_STEP
:
13590 arg0
= CALL_EXPR_ARG (exp
, 0);
13591 op1
= expand_normal (arg0
);
13592 if (!address_operand (op1
, VOIDmode
))
13594 op1
= convert_memory_address (Pmode
, op1
);
13595 op1
= copy_addr_to_reg (op1
);
13598 op0
= gen_reg_rtx (mode
);
13599 emit_insn (gen_rdrand (mode
, op0
));
13601 emit_move_insn (gen_rtx_MEM (mode
, op1
), op0
);
13603 op1
= force_reg (SImode
, const1_rtx
);
13605 /* Emit SImode conditional move. */
13606 if (mode
== HImode
)
13608 if (TARGET_ZERO_EXTEND_WITH_AND
13609 && optimize_function_for_speed_p (cfun
))
13611 op2
= force_reg (SImode
, const0_rtx
);
13613 emit_insn (gen_movstricthi
13614 (gen_lowpart (HImode
, op2
), op0
));
13618 op2
= gen_reg_rtx (SImode
);
13620 emit_insn (gen_zero_extendhisi2 (op2
, op0
));
13623 else if (mode
== SImode
)
13626 op2
= gen_rtx_SUBREG (SImode
, op0
, 0);
13629 || !register_operand (target
, SImode
))
13630 target
= gen_reg_rtx (SImode
);
13632 pat
= gen_rtx_GEU (VOIDmode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
13634 emit_insn (gen_rtx_SET (target
,
13635 gen_rtx_IF_THEN_ELSE (SImode
, pat
, op2
, op1
)));
13638 case IX86_BUILTIN_RDSEED16_STEP
:
13642 case IX86_BUILTIN_RDSEED32_STEP
:
13646 case IX86_BUILTIN_RDSEED64_STEP
:
13650 arg0
= CALL_EXPR_ARG (exp
, 0);
13651 op1
= expand_normal (arg0
);
13652 if (!address_operand (op1
, VOIDmode
))
13654 op1
= convert_memory_address (Pmode
, op1
);
13655 op1
= copy_addr_to_reg (op1
);
13658 op0
= gen_reg_rtx (mode
);
13659 emit_insn (gen_rdseed (mode
, op0
));
13661 emit_move_insn (gen_rtx_MEM (mode
, op1
), op0
);
13663 op2
= gen_reg_rtx (QImode
);
13665 pat
= gen_rtx_LTU (QImode
, gen_rtx_REG (CCCmode
, FLAGS_REG
),
13667 emit_insn (gen_rtx_SET (op2
, pat
));
13670 || !register_operand (target
, SImode
))
13671 target
= gen_reg_rtx (SImode
);
13673 emit_insn (gen_zero_extendqisi2 (target
, op2
));
13676 case IX86_BUILTIN_SBB32
:
13677 icode
= CODE_FOR_subborrowsi
;
13678 icode2
= CODE_FOR_subborrowsi_0
;
13684 case IX86_BUILTIN_SBB64
:
13685 icode
= CODE_FOR_subborrowdi
;
13686 icode2
= CODE_FOR_subborrowdi_0
;
13692 case IX86_BUILTIN_ADDCARRYX32
:
13693 icode
= CODE_FOR_addcarrysi
;
13694 icode2
= CODE_FOR_addcarrysi_0
;
13700 case IX86_BUILTIN_ADDCARRYX64
:
13701 icode
= CODE_FOR_addcarrydi
;
13702 icode2
= CODE_FOR_addcarrydi_0
;
13708 arg0
= CALL_EXPR_ARG (exp
, 0); /* unsigned char c_in. */
13709 arg1
= CALL_EXPR_ARG (exp
, 1); /* unsigned int src1. */
13710 arg2
= CALL_EXPR_ARG (exp
, 2); /* unsigned int src2. */
13711 arg3
= CALL_EXPR_ARG (exp
, 3); /* unsigned int *sum_out. */
13713 op1
= expand_normal (arg0
);
13714 if (!integer_zerop (arg0
))
13715 op1
= copy_to_mode_reg (QImode
, convert_to_mode (QImode
, op1
, 1));
13717 op2
= expand_normal (arg1
);
13718 if (!register_operand (op2
, mode0
))
13719 op2
= copy_to_mode_reg (mode0
, op2
);
13721 op3
= expand_normal (arg2
);
13722 if (!register_operand (op3
, mode0
))
13723 op3
= copy_to_mode_reg (mode0
, op3
);
13725 op4
= expand_normal (arg3
);
13726 if (!address_operand (op4
, VOIDmode
))
13728 op4
= convert_memory_address (Pmode
, op4
);
13729 op4
= copy_addr_to_reg (op4
);
13732 op0
= gen_reg_rtx (mode0
);
13733 if (integer_zerop (arg0
))
13735 /* If arg0 is 0, optimize right away into add or sub
13736 instruction that sets CCCmode flags. */
13737 op1
= gen_rtx_REG (mode2
, FLAGS_REG
);
13738 emit_insn (GEN_FCN (icode2
) (op0
, op2
, op3
));
13742 /* Generate CF from input operand. */
13743 emit_insn (gen_addqi3_cconly_overflow (op1
, constm1_rtx
));
13745 /* Generate instruction that consumes CF. */
13746 op1
= gen_rtx_REG (CCCmode
, FLAGS_REG
);
13747 pat
= gen_rtx_LTU (mode1
, op1
, const0_rtx
);
13748 pat2
= gen_rtx_LTU (mode0
, op1
, const0_rtx
);
13749 emit_insn (GEN_FCN (icode
) (op0
, op2
, op3
, op1
, pat
, pat2
));
13752 /* Return current CF value. */
13754 target
= gen_reg_rtx (QImode
);
13756 pat
= gen_rtx_LTU (QImode
, op1
, const0_rtx
);
13757 emit_insn (gen_rtx_SET (target
, pat
));
13759 /* Store the result. */
13760 emit_move_insn (gen_rtx_MEM (mode0
, op4
), op0
);
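      /* This is the add-with-carry / subtract-with-borrow path.  When
	 the incoming carry is literal zero the plain add/sub pattern
	 (icode2) that merely sets CF is used; otherwise CF is first
	 materialized from the carry-in byte via addqi3_cconly_overflow
	 and the CF-consuming pattern (icode) is emitted.  The QImode
	 result is the carry-out read back from the flags, and the
	 sum/difference is stored through the pointer argument.  A hedged
	 usage sketch, assuming the usual <adxintrin.h> prototype:

	     unsigned int sum;
	     unsigned char c_out = _addcarryx_u32 (c_in, a, b, &sum);  */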
13764 case IX86_BUILTIN_READ_FLAGS
:
13768 emit_insn (gen_push (gen_rtx_REG (word_mode
, FLAGS_REG
)));
13771 || target
== NULL_RTX
13772 || !nonimmediate_operand (target
, word_mode
)
13773 || GET_MODE (target
) != word_mode
)
13774 target
= gen_reg_rtx (word_mode
);
13776 emit_insn (gen_pop (target
));
13779 case IX86_BUILTIN_WRITE_FLAGS
:
13781 arg0
= CALL_EXPR_ARG (exp
, 0);
13782 op0
= expand_normal (arg0
);
13783 if (!general_no_elim_operand (op0
, word_mode
))
13784 op0
= copy_to_mode_reg (word_mode
, op0
);
13786 emit_insn (gen_push (op0
));
13787 emit_insn (gen_pop (gen_rtx_REG (word_mode
, FLAGS_REG
)));
13790 case IX86_BUILTIN_KTESTC8
:
13791 icode
= CODE_FOR_ktestqi
;
13795 case IX86_BUILTIN_KTESTZ8
:
13796 icode
= CODE_FOR_ktestqi
;
13800 case IX86_BUILTIN_KTESTC16
:
13801 icode
= CODE_FOR_ktesthi
;
13805 case IX86_BUILTIN_KTESTZ16
:
13806 icode
= CODE_FOR_ktesthi
;
13810 case IX86_BUILTIN_KTESTC32
:
13811 icode
= CODE_FOR_ktestsi
;
13815 case IX86_BUILTIN_KTESTZ32
:
13816 icode
= CODE_FOR_ktestsi
;
13820 case IX86_BUILTIN_KTESTC64
:
13821 icode
= CODE_FOR_ktestdi
;
13825 case IX86_BUILTIN_KTESTZ64
:
13826 icode
= CODE_FOR_ktestdi
;
13830 case IX86_BUILTIN_KORTESTC8
:
13831 icode
= CODE_FOR_kortestqi
;
13835 case IX86_BUILTIN_KORTESTZ8
:
13836 icode
= CODE_FOR_kortestqi
;
13840 case IX86_BUILTIN_KORTESTC16
:
13841 icode
= CODE_FOR_kortesthi
;
13845 case IX86_BUILTIN_KORTESTZ16
:
13846 icode
= CODE_FOR_kortesthi
;
13850 case IX86_BUILTIN_KORTESTC32
:
13851 icode
= CODE_FOR_kortestsi
;
13855 case IX86_BUILTIN_KORTESTZ32
:
13856 icode
= CODE_FOR_kortestsi
;
13860 case IX86_BUILTIN_KORTESTC64
:
13861 icode
= CODE_FOR_kortestdi
;
13865 case IX86_BUILTIN_KORTESTZ64
:
13866 icode
= CODE_FOR_kortestdi
;
13870 arg0
= CALL_EXPR_ARG (exp
, 0); /* Mask reg src1. */
13871 arg1
= CALL_EXPR_ARG (exp
, 1); /* Mask reg src2. */
13872 op0
= expand_normal (arg0
);
13873 op1
= expand_normal (arg1
);
13875 mode0
= insn_data
[icode
].operand
[0].mode
;
13876 mode1
= insn_data
[icode
].operand
[1].mode
;
13878 if (GET_MODE (op0
) != VOIDmode
)
13879 op0
= force_reg (GET_MODE (op0
), op0
);
13881 op0
= gen_lowpart (mode0
, op0
);
13883 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
13884 op0
= copy_to_mode_reg (mode0
, op0
);
13886 if (GET_MODE (op1
) != VOIDmode
)
13887 op1
= force_reg (GET_MODE (op1
), op1
);
13889 op1
= gen_lowpart (mode1
, op1
);
13891 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
13892 op1
= copy_to_mode_reg (mode1
, op1
);
13894 target
= gen_reg_rtx (QImode
);
13896 /* Emit kortest. */
13897 emit_insn (GEN_FCN (icode
) (op0
, op1
));
13898 /* And use setcc to return result from flags. */
13899 ix86_expand_setcc (target
, EQ
,
13900 gen_rtx_REG (mode3
, FLAGS_REG
), const0_rtx
);
13903 case IX86_BUILTIN_GATHERSIV2DF
:
13904 icode
= CODE_FOR_avx2_gathersiv2df
;
13906 case IX86_BUILTIN_GATHERSIV4DF
:
13907 icode
= CODE_FOR_avx2_gathersiv4df
;
13909 case IX86_BUILTIN_GATHERDIV2DF
:
13910 icode
= CODE_FOR_avx2_gatherdiv2df
;
13912 case IX86_BUILTIN_GATHERDIV4DF
:
13913 icode
= CODE_FOR_avx2_gatherdiv4df
;
13915 case IX86_BUILTIN_GATHERSIV4SF
:
13916 icode
= CODE_FOR_avx2_gathersiv4sf
;
13918 case IX86_BUILTIN_GATHERSIV8SF
:
13919 icode
= CODE_FOR_avx2_gathersiv8sf
;
13921 case IX86_BUILTIN_GATHERDIV4SF
:
13922 icode
= CODE_FOR_avx2_gatherdiv4sf
;
13924 case IX86_BUILTIN_GATHERDIV8SF
:
13925 icode
= CODE_FOR_avx2_gatherdiv8sf
;
13927 case IX86_BUILTIN_GATHERSIV2DI
:
13928 icode
= CODE_FOR_avx2_gathersiv2di
;
13930 case IX86_BUILTIN_GATHERSIV4DI
:
13931 icode
= CODE_FOR_avx2_gathersiv4di
;
13933 case IX86_BUILTIN_GATHERDIV2DI
:
13934 icode
= CODE_FOR_avx2_gatherdiv2di
;
13936 case IX86_BUILTIN_GATHERDIV4DI
:
13937 icode
= CODE_FOR_avx2_gatherdiv4di
;
13939 case IX86_BUILTIN_GATHERSIV4SI
:
13940 icode
= CODE_FOR_avx2_gathersiv4si
;
13942 case IX86_BUILTIN_GATHERSIV8SI
:
13943 icode
= CODE_FOR_avx2_gathersiv8si
;
13945 case IX86_BUILTIN_GATHERDIV4SI
:
13946 icode
= CODE_FOR_avx2_gatherdiv4si
;
13948 case IX86_BUILTIN_GATHERDIV8SI
:
13949 icode
= CODE_FOR_avx2_gatherdiv8si
;
13951 case IX86_BUILTIN_GATHERALTSIV4DF
:
13952 icode
= CODE_FOR_avx2_gathersiv4df
;
13954 case IX86_BUILTIN_GATHERALTDIV8SF
:
13955 icode
= CODE_FOR_avx2_gatherdiv8sf
;
13957 case IX86_BUILTIN_GATHERALTSIV4DI
:
13958 icode
= CODE_FOR_avx2_gathersiv4di
;
13960 case IX86_BUILTIN_GATHERALTDIV8SI
:
13961 icode
= CODE_FOR_avx2_gatherdiv8si
;
13963 case IX86_BUILTIN_GATHER3SIV16SF
:
13964 icode
= CODE_FOR_avx512f_gathersiv16sf
;
13966 case IX86_BUILTIN_GATHER3SIV8DF
:
13967 icode
= CODE_FOR_avx512f_gathersiv8df
;
13969 case IX86_BUILTIN_GATHER3DIV16SF
:
13970 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
13972 case IX86_BUILTIN_GATHER3DIV8DF
:
13973 icode
= CODE_FOR_avx512f_gatherdiv8df
;
13975 case IX86_BUILTIN_GATHER3SIV16SI
:
13976 icode
= CODE_FOR_avx512f_gathersiv16si
;
13978 case IX86_BUILTIN_GATHER3SIV8DI
:
13979 icode
= CODE_FOR_avx512f_gathersiv8di
;
13981 case IX86_BUILTIN_GATHER3DIV16SI
:
13982 icode
= CODE_FOR_avx512f_gatherdiv16si
;
13984 case IX86_BUILTIN_GATHER3DIV8DI
:
13985 icode
= CODE_FOR_avx512f_gatherdiv8di
;
13987 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
13988 icode
= CODE_FOR_avx512f_gathersiv8df
;
13990 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
13991 icode
= CODE_FOR_avx512f_gatherdiv16sf
;
13993 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
13994 icode
= CODE_FOR_avx512f_gathersiv8di
;
13996 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
13997 icode
= CODE_FOR_avx512f_gatherdiv16si
;
13999 case IX86_BUILTIN_GATHER3SIV2DF
:
14000 icode
= CODE_FOR_avx512vl_gathersiv2df
;
14002 case IX86_BUILTIN_GATHER3SIV4DF
:
14003 icode
= CODE_FOR_avx512vl_gathersiv4df
;
14005 case IX86_BUILTIN_GATHER3DIV2DF
:
14006 icode
= CODE_FOR_avx512vl_gatherdiv2df
;
14008 case IX86_BUILTIN_GATHER3DIV4DF
:
14009 icode
= CODE_FOR_avx512vl_gatherdiv4df
;
14011 case IX86_BUILTIN_GATHER3SIV4SF
:
14012 icode
= CODE_FOR_avx512vl_gathersiv4sf
;
14014 case IX86_BUILTIN_GATHER3SIV8SF
:
14015 icode
= CODE_FOR_avx512vl_gathersiv8sf
;
14017 case IX86_BUILTIN_GATHER3DIV4SF
:
14018 icode
= CODE_FOR_avx512vl_gatherdiv4sf
;
14020 case IX86_BUILTIN_GATHER3DIV8SF
:
14021 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
14023 case IX86_BUILTIN_GATHER3SIV2DI
:
14024 icode
= CODE_FOR_avx512vl_gathersiv2di
;
14026 case IX86_BUILTIN_GATHER3SIV4DI
:
14027 icode
= CODE_FOR_avx512vl_gathersiv4di
;
14029 case IX86_BUILTIN_GATHER3DIV2DI
:
14030 icode
= CODE_FOR_avx512vl_gatherdiv2di
;
14032 case IX86_BUILTIN_GATHER3DIV4DI
:
14033 icode
= CODE_FOR_avx512vl_gatherdiv4di
;
14035 case IX86_BUILTIN_GATHER3SIV4SI
:
14036 icode
= CODE_FOR_avx512vl_gathersiv4si
;
14038 case IX86_BUILTIN_GATHER3SIV8SI
:
14039 icode
= CODE_FOR_avx512vl_gathersiv8si
;
14041 case IX86_BUILTIN_GATHER3DIV4SI
:
14042 icode
= CODE_FOR_avx512vl_gatherdiv4si
;
14044 case IX86_BUILTIN_GATHER3DIV8SI
:
14045 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
14047 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
14048 icode
= CODE_FOR_avx512vl_gathersiv4df
;
14050 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
14051 icode
= CODE_FOR_avx512vl_gatherdiv8sf
;
14053 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
14054 icode
= CODE_FOR_avx512vl_gathersiv4di
;
14056 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
14057 icode
= CODE_FOR_avx512vl_gatherdiv8si
;
14059 case IX86_BUILTIN_SCATTERSIV16SF
:
14060 icode
= CODE_FOR_avx512f_scattersiv16sf
;
14062 case IX86_BUILTIN_SCATTERSIV8DF
:
14063 icode
= CODE_FOR_avx512f_scattersiv8df
;
14065 case IX86_BUILTIN_SCATTERDIV16SF
:
14066 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
14068 case IX86_BUILTIN_SCATTERDIV8DF
:
14069 icode
= CODE_FOR_avx512f_scatterdiv8df
;
14071 case IX86_BUILTIN_SCATTERSIV16SI
:
14072 icode
= CODE_FOR_avx512f_scattersiv16si
;
14074 case IX86_BUILTIN_SCATTERSIV8DI
:
14075 icode
= CODE_FOR_avx512f_scattersiv8di
;
14077 case IX86_BUILTIN_SCATTERDIV16SI
:
14078 icode
= CODE_FOR_avx512f_scatterdiv16si
;
14080 case IX86_BUILTIN_SCATTERDIV8DI
:
14081 icode
= CODE_FOR_avx512f_scatterdiv8di
;
14083 case IX86_BUILTIN_SCATTERSIV8SF
:
14084 icode
= CODE_FOR_avx512vl_scattersiv8sf
;
14086 case IX86_BUILTIN_SCATTERSIV4SF
:
14087 icode
= CODE_FOR_avx512vl_scattersiv4sf
;
14089 case IX86_BUILTIN_SCATTERSIV4DF
:
14090 icode
= CODE_FOR_avx512vl_scattersiv4df
;
14092 case IX86_BUILTIN_SCATTERSIV2DF
:
14093 icode
= CODE_FOR_avx512vl_scattersiv2df
;
14095 case IX86_BUILTIN_SCATTERDIV8SF
:
14096 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
14098 case IX86_BUILTIN_SCATTERDIV4SF
:
14099 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
14101 case IX86_BUILTIN_SCATTERDIV4DF
:
14102 icode
= CODE_FOR_avx512vl_scatterdiv4df
;
14104 case IX86_BUILTIN_SCATTERDIV2DF
:
14105 icode
= CODE_FOR_avx512vl_scatterdiv2df
;
14107 case IX86_BUILTIN_SCATTERSIV8SI
:
14108 icode
= CODE_FOR_avx512vl_scattersiv8si
;
14110 case IX86_BUILTIN_SCATTERSIV4SI
:
14111 icode
= CODE_FOR_avx512vl_scattersiv4si
;
14113 case IX86_BUILTIN_SCATTERSIV4DI
:
14114 icode
= CODE_FOR_avx512vl_scattersiv4di
;
14116 case IX86_BUILTIN_SCATTERSIV2DI
:
14117 icode
= CODE_FOR_avx512vl_scattersiv2di
;
14119 case IX86_BUILTIN_SCATTERDIV8SI
:
14120 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
14122 case IX86_BUILTIN_SCATTERDIV4SI
:
14123 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
14125 case IX86_BUILTIN_SCATTERDIV4DI
:
14126 icode
= CODE_FOR_avx512vl_scatterdiv4di
;
14128 case IX86_BUILTIN_SCATTERDIV2DI
:
14129 icode
= CODE_FOR_avx512vl_scatterdiv2di
;
14131 case IX86_BUILTIN_GATHERPFDPD
:
14132 icode
= CODE_FOR_avx512pf_gatherpfv8sidf
;
14133 goto vec_prefetch_gen
;
14134 case IX86_BUILTIN_SCATTERALTSIV8DF
:
14135 icode
= CODE_FOR_avx512f_scattersiv8df
;
14137 case IX86_BUILTIN_SCATTERALTDIV16SF
:
14138 icode
= CODE_FOR_avx512f_scatterdiv16sf
;
14140 case IX86_BUILTIN_SCATTERALTSIV8DI
:
14141 icode
= CODE_FOR_avx512f_scattersiv8di
;
14143 case IX86_BUILTIN_SCATTERALTDIV16SI
:
14144 icode
= CODE_FOR_avx512f_scatterdiv16si
;
14146 case IX86_BUILTIN_SCATTERALTSIV4DF
:
14147 icode
= CODE_FOR_avx512vl_scattersiv4df
;
14149 case IX86_BUILTIN_SCATTERALTDIV8SF
:
14150 icode
= CODE_FOR_avx512vl_scatterdiv8sf
;
14152 case IX86_BUILTIN_SCATTERALTSIV4DI
:
14153 icode
= CODE_FOR_avx512vl_scattersiv4di
;
14155 case IX86_BUILTIN_SCATTERALTDIV8SI
:
14156 icode
= CODE_FOR_avx512vl_scatterdiv8si
;
14158 case IX86_BUILTIN_SCATTERALTSIV2DF
:
14159 icode
= CODE_FOR_avx512vl_scattersiv2df
;
14161 case IX86_BUILTIN_SCATTERALTDIV4SF
:
14162 icode
= CODE_FOR_avx512vl_scatterdiv4sf
;
14164 case IX86_BUILTIN_SCATTERALTSIV2DI
:
14165 icode
= CODE_FOR_avx512vl_scattersiv2di
;
14167 case IX86_BUILTIN_SCATTERALTDIV4SI
:
14168 icode
= CODE_FOR_avx512vl_scatterdiv4si
;
14170 case IX86_BUILTIN_GATHERPFDPS
:
14171 icode
= CODE_FOR_avx512pf_gatherpfv16sisf
;
14172 goto vec_prefetch_gen
;
14173 case IX86_BUILTIN_GATHERPFQPD
:
14174 icode
= CODE_FOR_avx512pf_gatherpfv8didf
;
14175 goto vec_prefetch_gen
;
14176 case IX86_BUILTIN_GATHERPFQPS
:
14177 icode
= CODE_FOR_avx512pf_gatherpfv8disf
;
14178 goto vec_prefetch_gen
;
14179 case IX86_BUILTIN_SCATTERPFDPD
:
14180 icode
= CODE_FOR_avx512pf_scatterpfv8sidf
;
14181 goto vec_prefetch_gen
;
14182 case IX86_BUILTIN_SCATTERPFDPS
:
14183 icode
= CODE_FOR_avx512pf_scatterpfv16sisf
;
14184 goto vec_prefetch_gen
;
14185 case IX86_BUILTIN_SCATTERPFQPD
:
14186 icode
= CODE_FOR_avx512pf_scatterpfv8didf
;
14187 goto vec_prefetch_gen
;
14188 case IX86_BUILTIN_SCATTERPFQPS
:
14189 icode
= CODE_FOR_avx512pf_scatterpfv8disf
;
14190 goto vec_prefetch_gen
;
14194 rtx (*gen
) (rtx
, rtx
);
14196 arg0
= CALL_EXPR_ARG (exp
, 0);
14197 arg1
= CALL_EXPR_ARG (exp
, 1);
14198 arg2
= CALL_EXPR_ARG (exp
, 2);
14199 arg3
= CALL_EXPR_ARG (exp
, 3);
14200 arg4
= CALL_EXPR_ARG (exp
, 4);
14201 op0
= expand_normal (arg0
);
14202 op1
= expand_normal (arg1
);
14203 op2
= expand_normal (arg2
);
14204 op3
= expand_normal (arg3
);
14205 op4
= expand_normal (arg4
);
14206 /* Note the arg order is different from the operand order. */
14207 mode0
= insn_data
[icode
].operand
[1].mode
;
14208 mode2
= insn_data
[icode
].operand
[3].mode
;
14209 mode3
= insn_data
[icode
].operand
[4].mode
;
14210 mode4
= insn_data
[icode
].operand
[5].mode
;
14212 if (target
== NULL_RTX
14213 || GET_MODE (target
) != insn_data
[icode
].operand
[0].mode
14214 || !insn_data
[icode
].operand
[0].predicate (target
,
14215 GET_MODE (target
)))
14216 subtarget
= gen_reg_rtx (insn_data
[icode
].operand
[0].mode
);
14218 subtarget
= target
;
14222 case IX86_BUILTIN_GATHER3ALTSIV8DF
:
14223 case IX86_BUILTIN_GATHER3ALTSIV8DI
:
14224 half
= gen_reg_rtx (V8SImode
);
14225 if (!nonimmediate_operand (op2
, V16SImode
))
14226 op2
= copy_to_mode_reg (V16SImode
, op2
);
14227 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
14230 case IX86_BUILTIN_GATHER3ALTSIV4DF
:
14231 case IX86_BUILTIN_GATHER3ALTSIV4DI
:
14232 case IX86_BUILTIN_GATHERALTSIV4DF
:
14233 case IX86_BUILTIN_GATHERALTSIV4DI
:
14234 half
= gen_reg_rtx (V4SImode
);
14235 if (!nonimmediate_operand (op2
, V8SImode
))
14236 op2
= copy_to_mode_reg (V8SImode
, op2
);
14237 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
14240 case IX86_BUILTIN_GATHER3ALTDIV16SF
:
14241 case IX86_BUILTIN_GATHER3ALTDIV16SI
:
14242 half
= gen_reg_rtx (mode0
);
14243 if (mode0
== V8SFmode
)
14244 gen
= gen_vec_extract_lo_v16sf
;
14246 gen
= gen_vec_extract_lo_v16si
;
14247 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
14248 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
14249 emit_insn (gen (half
, op0
));
14251 op3
= lowpart_subreg (QImode
, op3
, HImode
);
14253 case IX86_BUILTIN_GATHER3ALTDIV8SF
:
14254 case IX86_BUILTIN_GATHER3ALTDIV8SI
:
14255 case IX86_BUILTIN_GATHERALTDIV8SF
:
14256 case IX86_BUILTIN_GATHERALTDIV8SI
:
14257 half
= gen_reg_rtx (mode0
);
14258 if (mode0
== V4SFmode
)
14259 gen
= gen_vec_extract_lo_v8sf
;
14261 gen
= gen_vec_extract_lo_v8si
;
14262 if (!nonimmediate_operand (op0
, GET_MODE (op0
)))
14263 op0
= copy_to_mode_reg (GET_MODE (op0
), op0
);
14264 emit_insn (gen (half
, op0
));
14266 if (VECTOR_MODE_P (GET_MODE (op3
)))
14268 half
= gen_reg_rtx (mode0
);
14269 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14270 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14271 emit_insn (gen (half
, op3
));
14279 /* Force memory operand only with base register here. But we
14280 don't want to do it on memory operand for other builtin
14282 op1
= ix86_zero_extend_to_Pmode (op1
);
14284 if (!insn_data
[icode
].operand
[1].predicate (op0
, mode0
))
14285 op0
= copy_to_mode_reg (mode0
, op0
);
14286 if (!insn_data
[icode
].operand
[2].predicate (op1
, Pmode
))
14287 op1
= copy_to_mode_reg (Pmode
, op1
);
14288 if (!insn_data
[icode
].operand
[3].predicate (op2
, mode2
))
14289 op2
= copy_to_mode_reg (mode2
, op2
);
14291 op3
= fixup_modeless_constant (op3
, mode3
);
14293 if (GET_MODE (op3
) == mode3
|| GET_MODE (op3
) == VOIDmode
)
14295 if (!insn_data
[icode
].operand
[4].predicate (op3
, mode3
))
14296 op3
= copy_to_mode_reg (mode3
, op3
);
14300 op3
= copy_to_reg (op3
);
14301 op3
= lowpart_subreg (mode3
, op3
, GET_MODE (op3
));
14303 if (!insn_data
[icode
].operand
[5].predicate (op4
, mode4
))
14305 error ("the last argument must be scale 1, 2, 4, 8");
14309 /* Optimize. If mask is known to have all high bits set,
14310 replace op0 with pc_rtx to signal that the instruction
14311 overwrites the whole destination and doesn't use its
14312 previous contents. */
14315 if (TREE_CODE (arg3
) == INTEGER_CST
)
14317 if (integer_all_onesp (arg3
))
14320 else if (TREE_CODE (arg3
) == VECTOR_CST
)
14322 unsigned int negative
= 0;
14323 for (i
= 0; i
< VECTOR_CST_NELTS (arg3
); ++i
)
14325 tree cst
= VECTOR_CST_ELT (arg3
, i
);
14326 if (TREE_CODE (cst
) == INTEGER_CST
14327 && tree_int_cst_sign_bit (cst
))
14329 else if (TREE_CODE (cst
) == REAL_CST
14330 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst
)))
14333 if (negative
== TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3
)))
14336 else if (TREE_CODE (arg3
) == SSA_NAME
14337 && TREE_CODE (TREE_TYPE (arg3
)) == VECTOR_TYPE
)
14339 /* Recognize also when mask is like:
14340 __v2df src = _mm_setzero_pd ();
14341 __v2df mask = _mm_cmpeq_pd (src, src);
14343 __v8sf src = _mm256_setzero_ps ();
14344 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
14345 as that is a cheaper way to load all ones into
14346 a register than having to load a constant from
14348 gimple
*def_stmt
= SSA_NAME_DEF_STMT (arg3
);
14349 if (is_gimple_call (def_stmt
))
14351 tree fndecl
= gimple_call_fndecl (def_stmt
);
14353 && fndecl_built_in_p (fndecl
, BUILT_IN_MD
))
14354 switch (DECL_MD_FUNCTION_CODE (fndecl
))
14356 case IX86_BUILTIN_CMPPD
:
14357 case IX86_BUILTIN_CMPPS
:
14358 case IX86_BUILTIN_CMPPD256
:
14359 case IX86_BUILTIN_CMPPS256
:
14360 if (!integer_zerop (gimple_call_arg (def_stmt
, 2)))
14363 case IX86_BUILTIN_CMPEQPD
:
14364 case IX86_BUILTIN_CMPEQPS
:
14365 if (initializer_zerop (gimple_call_arg (def_stmt
, 0))
14366 && initializer_zerop (gimple_call_arg (def_stmt
,
14377 pat
= GEN_FCN (icode
) (subtarget
, op0
, op1
, op2
, op3
, op4
);
14384 case IX86_BUILTIN_GATHER3DIV16SF
:
14385 if (target
== NULL_RTX
)
14386 target
= gen_reg_rtx (V8SFmode
);
14387 emit_insn (gen_vec_extract_lo_v16sf (target
, subtarget
));
14389 case IX86_BUILTIN_GATHER3DIV16SI
:
14390 if (target
== NULL_RTX
)
14391 target
= gen_reg_rtx (V8SImode
);
14392 emit_insn (gen_vec_extract_lo_v16si (target
, subtarget
));
14394 case IX86_BUILTIN_GATHER3DIV8SF
:
14395 case IX86_BUILTIN_GATHERDIV8SF
:
14396 if (target
== NULL_RTX
)
14397 target
= gen_reg_rtx (V4SFmode
);
14398 emit_insn (gen_vec_extract_lo_v8sf (target
, subtarget
));
14400 case IX86_BUILTIN_GATHER3DIV8SI
:
14401 case IX86_BUILTIN_GATHERDIV8SI
:
14402 if (target
== NULL_RTX
)
14403 target
= gen_reg_rtx (V4SImode
);
14404 emit_insn (gen_vec_extract_lo_v8si (target
, subtarget
));
14407 target
= subtarget
;
14413 arg0
= CALL_EXPR_ARG (exp
, 0);
14414 arg1
= CALL_EXPR_ARG (exp
, 1);
14415 arg2
= CALL_EXPR_ARG (exp
, 2);
14416 arg3
= CALL_EXPR_ARG (exp
, 3);
14417 arg4
= CALL_EXPR_ARG (exp
, 4);
14418 op0
= expand_normal (arg0
);
14419 op1
= expand_normal (arg1
);
14420 op2
= expand_normal (arg2
);
14421 op3
= expand_normal (arg3
);
14422 op4
= expand_normal (arg4
);
14423 mode1
= insn_data
[icode
].operand
[1].mode
;
14424 mode2
= insn_data
[icode
].operand
[2].mode
;
14425 mode3
= insn_data
[icode
].operand
[3].mode
;
14426 mode4
= insn_data
[icode
].operand
[4].mode
;
      /* Scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements than source operand
	 op3, one needs to use only its low half.  And vice versa.  */
14434 case IX86_BUILTIN_SCATTERALTSIV8DF
:
14435 case IX86_BUILTIN_SCATTERALTSIV8DI
:
14436 half
= gen_reg_rtx (V8SImode
);
14437 if (!nonimmediate_operand (op2
, V16SImode
))
14438 op2
= copy_to_mode_reg (V16SImode
, op2
);
14439 emit_insn (gen_vec_extract_lo_v16si (half
, op2
));
14442 case IX86_BUILTIN_SCATTERALTDIV16SF
:
14443 case IX86_BUILTIN_SCATTERALTDIV16SI
:
14444 half
= gen_reg_rtx (mode3
);
14445 if (mode3
== V8SFmode
)
14446 gen
= gen_vec_extract_lo_v16sf
;
14448 gen
= gen_vec_extract_lo_v16si
;
14449 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14450 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14451 emit_insn (gen (half
, op3
));
14454 case IX86_BUILTIN_SCATTERALTSIV4DF
:
14455 case IX86_BUILTIN_SCATTERALTSIV4DI
:
14456 half
= gen_reg_rtx (V4SImode
);
14457 if (!nonimmediate_operand (op2
, V8SImode
))
14458 op2
= copy_to_mode_reg (V8SImode
, op2
);
14459 emit_insn (gen_vec_extract_lo_v8si (half
, op2
));
14462 case IX86_BUILTIN_SCATTERALTDIV8SF
:
14463 case IX86_BUILTIN_SCATTERALTDIV8SI
:
14464 half
= gen_reg_rtx (mode3
);
14465 if (mode3
== V4SFmode
)
14466 gen
= gen_vec_extract_lo_v8sf
;
14468 gen
= gen_vec_extract_lo_v8si
;
14469 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14470 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14471 emit_insn (gen (half
, op3
));
14474 case IX86_BUILTIN_SCATTERALTSIV2DF
:
14475 case IX86_BUILTIN_SCATTERALTSIV2DI
:
14476 if (!nonimmediate_operand (op2
, V4SImode
))
14477 op2
= copy_to_mode_reg (V4SImode
, op2
);
14479 case IX86_BUILTIN_SCATTERALTDIV4SF
:
14480 case IX86_BUILTIN_SCATTERALTDIV4SI
:
14481 if (!nonimmediate_operand (op3
, GET_MODE (op3
)))
14482 op3
= copy_to_mode_reg (GET_MODE (op3
), op3
);
14488 /* Force memory operand only with base register here. But we
14489 don't want to do it on memory operand for other builtin
14491 op0
= force_reg (Pmode
, convert_to_mode (Pmode
, op0
, 1));
14493 if (!insn_data
[icode
].operand
[0].predicate (op0
, Pmode
))
14494 op0
= copy_to_mode_reg (Pmode
, op0
);
14496 op1
= fixup_modeless_constant (op1
, mode1
);
14498 if (GET_MODE (op1
) == mode1
|| GET_MODE (op1
) == VOIDmode
)
14500 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
14501 op1
= copy_to_mode_reg (mode1
, op1
);
14505 op1
= copy_to_reg (op1
);
14506 op1
= lowpart_subreg (mode1
, op1
, GET_MODE (op1
));
14509 if (!insn_data
[icode
].operand
[2].predicate (op2
, mode2
))
14510 op2
= copy_to_mode_reg (mode2
, op2
);
14512 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
14513 op3
= copy_to_mode_reg (mode3
, op3
);
14515 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
14517 error ("the last argument must be scale 1, 2, 4, 8");
14521 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
14529 arg0
= CALL_EXPR_ARG (exp
, 0);
14530 arg1
= CALL_EXPR_ARG (exp
, 1);
14531 arg2
= CALL_EXPR_ARG (exp
, 2);
14532 arg3
= CALL_EXPR_ARG (exp
, 3);
14533 arg4
= CALL_EXPR_ARG (exp
, 4);
14534 op0
= expand_normal (arg0
);
14535 op1
= expand_normal (arg1
);
14536 op2
= expand_normal (arg2
);
14537 op3
= expand_normal (arg3
);
14538 op4
= expand_normal (arg4
);
14539 mode0
= insn_data
[icode
].operand
[0].mode
;
14540 mode1
= insn_data
[icode
].operand
[1].mode
;
14541 mode3
= insn_data
[icode
].operand
[3].mode
;
14542 mode4
= insn_data
[icode
].operand
[4].mode
;
14544 op0
= fixup_modeless_constant (op0
, mode0
);
14546 if (GET_MODE (op0
) == mode0
|| GET_MODE (op0
) == VOIDmode
)
14548 if (!insn_data
[icode
].operand
[0].predicate (op0
, mode0
))
14549 op0
= copy_to_mode_reg (mode0
, op0
);
14553 op0
= copy_to_reg (op0
);
14554 op0
= lowpart_subreg (mode0
, op0
, GET_MODE (op0
));
14557 if (!insn_data
[icode
].operand
[1].predicate (op1
, mode1
))
14558 op1
= copy_to_mode_reg (mode1
, op1
);
14560 /* Force memory operand only with base register here. But we
14561 don't want to do it on memory operand for other builtin
14563 op2
= force_reg (Pmode
, convert_to_mode (Pmode
, op2
, 1));
14565 if (!insn_data
[icode
].operand
[2].predicate (op2
, Pmode
))
14566 op2
= copy_to_mode_reg (Pmode
, op2
);
14568 if (!insn_data
[icode
].operand
[3].predicate (op3
, mode3
))
14570 error ("the forth argument must be scale 1, 2, 4, 8");
14574 if (!insn_data
[icode
].operand
[4].predicate (op4
, mode4
))
14576 error ("incorrect hint operand");
14580 pat
= GEN_FCN (icode
) (op0
, op1
, op2
, op3
, op4
);
    case IX86_BUILTIN_XABORT:
      icode = CODE_FOR_xabort;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      mode0 = insn_data[icode].operand[0].mode;
      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	{
	  error ("the argument to %<xabort%> intrinsic must "
		 "be an 8-bit immediate");
	  return const0_rtx;
	}
      emit_insn (gen_xabort (op0));
      return 0;
    case IX86_BUILTIN_RDSSPD:
    case IX86_BUILTIN_RDSSPQ:
      mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);

      if (target == 0
	  || !register_operand (target, mode))
	target = gen_reg_rtx (mode);

      op0 = force_reg (mode, const0_rtx);

      emit_insn (gen_rdssp (mode, target, op0));
      return target;

    case IX86_BUILTIN_INCSSPD:
    case IX86_BUILTIN_INCSSPQ:
      mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = force_reg (mode, op0);

      emit_insn (gen_incssp (mode, op0));
      return 0;

    case IX86_BUILTIN_HRESET:
      icode = CODE_FOR_hreset;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      op0 = force_reg (SImode, op0);
      emit_insn (gen_hreset (op0));
      return 0;
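      /* RDSSP behaves as a NOP when shadow stacks are not enabled, which
	 is why op0 is seeded with const0_rtx above: the builtin then
	 returns 0 instead of an uninitialized value.  INCSSP only
	 adjusts the shadow-stack pointer and produces no result.  */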
    case IX86_BUILTIN_RSTORSSP:
    case IX86_BUILTIN_CLRSSBSY:
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = (fcode == IX86_BUILTIN_RSTORSSP
	       ? CODE_FOR_rstorssp
	       : CODE_FOR_clrssbsy);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
      return 0;

    case IX86_BUILTIN_WRSSD:
    case IX86_BUILTIN_WRSSQ:
    case IX86_BUILTIN_WRUSSD:
    case IX86_BUILTIN_WRUSSQ:
      mode = ((fcode == IX86_BUILTIN_WRSSD
	       || fcode == IX86_BUILTIN_WRUSSD)
	      ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op1 = expand_normal (arg1);

      op0 = force_reg (mode, op0);

      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}
      op1 = gen_rtx_MEM (mode, op1);

      icode = ((fcode == IX86_BUILTIN_WRSSD
		|| fcode == IX86_BUILTIN_WRSSQ)
	       ? code_for_wrss (mode)
	       : code_for_wruss (mode));
      emit_insn (GEN_FCN (icode) (op0, op1));
      return 0;

    default:
      break;
    }
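  /* Everything not handled by an explicit case above is table-driven
     from here on: FCODE is range-checked against the
     IX86_BUILTIN__BDESC_*_FIRST/_LAST markers and forwarded, together
     with the matching bdesc_* descriptor entry, to the generic
     expanders (special args, args, comi, round, pcmpestr/pcmpistr,
     multi-arg, CET).  */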
14685 if (fcode
>= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
14686 && fcode
<= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST
)
14688 i
= fcode
- IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
;
14689 return ix86_expand_special_args_builtin (bdesc_special_args
+ i
, exp
,
14693 if (fcode
>= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
14694 && fcode
<= IX86_BUILTIN__BDESC_PURE_ARGS_LAST
)
14696 i
= fcode
- IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
;
14697 return ix86_expand_special_args_builtin (bdesc_pure_args
+ i
, exp
,
14701 if (fcode
>= IX86_BUILTIN__BDESC_ARGS_FIRST
14702 && fcode
<= IX86_BUILTIN__BDESC_ARGS_LAST
)
14704 i
= fcode
- IX86_BUILTIN__BDESC_ARGS_FIRST
;
14705 rtx (*fcn
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
14706 rtx (*fcn_mask
) (rtx
, rtx
, rtx
, rtx
, rtx
);
14707 rtx (*fcn_maskz
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
14709 machine_mode mode
, wide_mode
, nar_mode
;
14711 nar_mode
= V4SFmode
;
14713 wide_mode
= V64SFmode
;
14714 fcn_mask
= gen_avx5124fmaddps_4fmaddps_mask
;
14715 fcn_maskz
= gen_avx5124fmaddps_4fmaddps_maskz
;
14719 case IX86_BUILTIN_4FMAPS
:
14720 fcn
= gen_avx5124fmaddps_4fmaddps
;
14724 case IX86_BUILTIN_4DPWSSD
:
14725 nar_mode
= V4SImode
;
14727 wide_mode
= V64SImode
;
14728 fcn
= gen_avx5124vnniw_vp4dpwssd
;
14732 case IX86_BUILTIN_4DPWSSDS
:
14733 nar_mode
= V4SImode
;
14735 wide_mode
= V64SImode
;
14736 fcn
= gen_avx5124vnniw_vp4dpwssds
;
14740 case IX86_BUILTIN_4FNMAPS
:
14741 fcn
= gen_avx5124fmaddps_4fnmaddps
;
14745 case IX86_BUILTIN_4FNMAPS_MASK
:
14746 fcn_mask
= gen_avx5124fmaddps_4fnmaddps_mask
;
14747 fcn_maskz
= gen_avx5124fmaddps_4fnmaddps_maskz
;
14750 case IX86_BUILTIN_4DPWSSD_MASK
:
14751 nar_mode
= V4SImode
;
14753 wide_mode
= V64SImode
;
14754 fcn_mask
= gen_avx5124vnniw_vp4dpwssd_mask
;
14755 fcn_maskz
= gen_avx5124vnniw_vp4dpwssd_maskz
;
14758 case IX86_BUILTIN_4DPWSSDS_MASK
:
14759 nar_mode
= V4SImode
;
14761 wide_mode
= V64SImode
;
14762 fcn_mask
= gen_avx5124vnniw_vp4dpwssds_mask
;
14763 fcn_maskz
= gen_avx5124vnniw_vp4dpwssds_maskz
;
14766 case IX86_BUILTIN_4FMAPS_MASK
:
14776 wide_reg
= gen_reg_rtx (wide_mode
);
14777 for (i
= 0; i
< 4; i
++)
14779 args
[i
] = CALL_EXPR_ARG (exp
, i
);
14780 ops
[i
] = expand_normal (args
[i
]);
14782 emit_move_insn (gen_rtx_SUBREG (mode
, wide_reg
, i
* 64),
14786 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
14787 accum
= force_reg (mode
, accum
);
14789 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
14790 addr
= force_reg (Pmode
, addr
);
14792 mem
= gen_rtx_MEM (nar_mode
, addr
);
14794 target
= gen_reg_rtx (mode
);
14796 emit_move_insn (target
, accum
);
14799 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
14803 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
14805 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
14807 if (CONST_INT_P (mask
))
14808 mask
= fixup_modeless_constant (mask
, HImode
);
14810 mask
= force_reg (HImode
, mask
);
14812 if (GET_MODE (mask
) != HImode
)
14813 mask
= gen_rtx_SUBREG (HImode
, mask
, 0);
14815 /* If merge is 0 then we're about to emit z-masked variant. */
14816 if (const0_operand (merge
, mode
))
14817 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
14818 /* If merge is the same as accum then emit merge-masked variant. */
14819 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
14821 merge
= force_reg (mode
, merge
);
14822 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
14824 /* Merge with something unknown might happen if we z-mask w/ -O0. */
14827 target
= gen_reg_rtx (mode
);
14828 emit_move_insn (target
, merge
);
14829 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
14835 case IX86_BUILTIN_4FNMASS
:
14836 fcn
= gen_avx5124fmaddps_4fnmaddss
;
14840 case IX86_BUILTIN_4FMASS
:
14841 fcn
= gen_avx5124fmaddps_4fmaddss
;
14845 case IX86_BUILTIN_4FNMASS_MASK
:
14846 fcn_mask
= gen_avx5124fmaddps_4fnmaddss_mask
;
14847 fcn_maskz
= gen_avx5124fmaddps_4fnmaddss_maskz
;
14850 case IX86_BUILTIN_4FMASS_MASK
:
14859 fcn_mask
= gen_avx5124fmaddps_4fmaddss_mask
;
14860 fcn_maskz
= gen_avx5124fmaddps_4fmaddss_maskz
;
14864 wide_reg
= gen_reg_rtx (V64SFmode
);
14865 for (i
= 0; i
< 4; i
++)
14868 args
[i
] = CALL_EXPR_ARG (exp
, i
);
14869 ops
[i
] = expand_normal (args
[i
]);
14871 tmp
= gen_reg_rtx (SFmode
);
14872 emit_move_insn (tmp
, gen_rtx_SUBREG (SFmode
, ops
[i
], 0));
14874 emit_move_insn (gen_rtx_SUBREG (V16SFmode
, wide_reg
, i
* 64),
14875 gen_rtx_SUBREG (V16SFmode
, tmp
, 0));
14878 accum
= expand_normal (CALL_EXPR_ARG (exp
, 4));
14879 accum
= force_reg (V4SFmode
, accum
);
14881 addr
= expand_normal (CALL_EXPR_ARG (exp
, 5));
14882 addr
= force_reg (Pmode
, addr
);
14884 mem
= gen_rtx_MEM (V4SFmode
, addr
);
14886 target
= gen_reg_rtx (V4SFmode
);
14888 emit_move_insn (target
, accum
);
14891 emit_insn (fcn (target
, accum
, wide_reg
, mem
));
14895 merge
= expand_normal (CALL_EXPR_ARG (exp
, 6));
14897 mask
= expand_normal (CALL_EXPR_ARG (exp
, 7));
14899 if (CONST_INT_P (mask
))
14900 mask
= fixup_modeless_constant (mask
, QImode
);
14902 mask
= force_reg (QImode
, mask
);
14904 if (GET_MODE (mask
) != QImode
)
14905 mask
= gen_rtx_SUBREG (QImode
, mask
, 0);
14907 /* If merge is 0 then we're about to emit z-masked variant. */
14908 if (const0_operand (merge
, mode
))
14909 emit_insn (fcn_maskz (target
, accum
, wide_reg
, mem
, merge
, mask
));
14910 /* If merge is the same as accum then emit merge-masked
14912 else if (CALL_EXPR_ARG (exp
, 6) == CALL_EXPR_ARG (exp
, 4))
14914 merge
= force_reg (mode
, merge
);
14915 emit_insn (fcn_mask (target
, wide_reg
, mem
, merge
, mask
));
14917 /* Merge with something unknown might happen if we z-mask
14921 target
= gen_reg_rtx (mode
);
14922 emit_move_insn (target
, merge
);
14923 emit_insn (fcn_mask (target
, wide_reg
, mem
, target
, mask
));
14928 case IX86_BUILTIN_RDPID
:
14929 return ix86_expand_special_args_builtin (bdesc_args
+ i
, exp
,
14931 case IX86_BUILTIN_FABSQ
:
14932 case IX86_BUILTIN_COPYSIGNQ
:
14934 /* Emit a normal call if SSE isn't available. */
14935 return expand_call (exp
, target
, ignore
);
14938 return ix86_expand_args_builtin (bdesc_args
+ i
, exp
, target
);
14942 if (fcode
>= IX86_BUILTIN__BDESC_COMI_FIRST
14943 && fcode
<= IX86_BUILTIN__BDESC_COMI_LAST
)
14945 i
= fcode
- IX86_BUILTIN__BDESC_COMI_FIRST
;
14946 return ix86_expand_sse_comi (bdesc_comi
+ i
, exp
, target
);
14949 if (fcode
>= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
14950 && fcode
<= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST
)
14952 i
= fcode
- IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
;
14953 return ix86_expand_round_builtin (bdesc_round_args
+ i
, exp
, target
);
14956 if (fcode
>= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
14957 && fcode
<= IX86_BUILTIN__BDESC_PCMPESTR_LAST
)
14959 i
= fcode
- IX86_BUILTIN__BDESC_PCMPESTR_FIRST
;
14960 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr
+ i
, exp
, target
);
14963 if (fcode
>= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
14964 && fcode
<= IX86_BUILTIN__BDESC_PCMPISTR_LAST
)
14966 i
= fcode
- IX86_BUILTIN__BDESC_PCMPISTR_FIRST
;
14967 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr
+ i
, exp
, target
);
14970 if (fcode
>= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
14971 && fcode
<= IX86_BUILTIN__BDESC_MULTI_ARG_LAST
)
14973 i
= fcode
- IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
;
14974 const struct builtin_description
*d
= bdesc_multi_arg
+ i
;
14975 return ix86_expand_multi_arg_builtin (d
->icode
, exp
, target
,
14976 (enum ix86_builtin_func_type
)
14977 d
->flag
, d
->comparison
);
  if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
      && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
      return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
					       target);
    }

  gcc_unreachable ();
}

/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */

      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  return true;
}

/* Get a vector mode of the same size as the original but with elements
   twice as wide.  This is only guaranteed to apply to integral vectors.  */

static machine_mode
get_mode_wider_vector (machine_mode o)
{
  /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
  machine_mode n = GET_MODE_NEXT_MODE (o).require ();
  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
  return n;
}
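/* A minimal illustration of the contract above, assuming the usual
   ordering of the integral vector modes:
     get_mode_wider_vector (V16QImode) == V8HImode
     get_mode_wider_vector (V8HImode)  == V4SImode
   i.e. same total size, half as many elements, each twice as wide.  */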
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);

/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.  */

static bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
15073 return ix86_vector_duplicate_value (mode
, target
, val
);
15078 if (TARGET_SSE
|| TARGET_3DNOW_A
)
15082 val
= gen_lowpart (SImode
, val
);
15083 x
= gen_rtx_TRUNCATE (HImode
, val
);
15084 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
15085 emit_insn (gen_rtx_SET (target
, x
));
15095 val
= gen_lowpart (SImode
, val
);
15096 x
= gen_rtx_TRUNCATE (HImode
, val
);
15097 x
= gen_rtx_VEC_DUPLICATE (mode
, x
);
15098 emit_insn (gen_rtx_SET (target
, x
));
15113 return ix86_vector_duplicate_value (mode
, target
, val
);
15117 struct expand_vec_perm_d dperm
;
15121 memset (&dperm
, 0, sizeof (dperm
));
15122 dperm
.target
= target
;
15123 dperm
.vmode
= mode
;
15124 dperm
.nelt
= GET_MODE_NUNITS (mode
);
15125 dperm
.op0
= dperm
.op1
= gen_reg_rtx (mode
);
15126 dperm
.one_operand_p
= true;
15128 if (mode
== V8HFmode
|| mode
== V8BFmode
)
15130 tmp1
= force_reg (GET_MODE_INNER (mode
), val
);
15131 tmp2
= gen_reg_rtx (mode
);
15132 emit_insn (maybe_gen_vec_set_0 (mode
, tmp2
,
15133 CONST0_RTX (mode
), tmp1
));
15134 tmp1
= gen_lowpart (mode
, tmp2
);
15138 /* Extend to SImode using a paradoxical SUBREG. */
15139 tmp1
= gen_reg_rtx (SImode
);
15140 emit_move_insn (tmp1
, gen_lowpart (SImode
, val
));
15142 /* Insert the SImode value as
15143 low element of a V4SImode vector. */
15144 tmp2
= gen_reg_rtx (V4SImode
);
15145 emit_insn (gen_vec_setv4si_0 (tmp2
, CONST0_RTX (V4SImode
), tmp1
));
15146 tmp1
= gen_lowpart (mode
, tmp2
);
15149 emit_move_insn (dperm
.op0
, tmp1
);
15150 ok
= (expand_vec_perm_1 (&dperm
)
15151 || expand_vec_perm_broadcast_1 (&dperm
));
15159 return ix86_vector_duplicate_value (mode
, target
, val
);
15166 /* Replicate the value once into the next wider mode and recurse. */
15168 machine_mode smode
, wsmode
, wvmode
;
15171 smode
= GET_MODE_INNER (mode
);
15172 wvmode
= get_mode_wider_vector (mode
);
15173 wsmode
= GET_MODE_INNER (wvmode
);
15175 val
= convert_modes (wsmode
, smode
, val
, true);
15177 if (smode
== QImode
&& !TARGET_PARTIAL_REG_STALL
)
15178 emit_insn (gen_insv_1 (wsmode
, val
, val
));
15181 x
= expand_simple_binop (wsmode
, ASHIFT
, val
,
15182 GEN_INT (GET_MODE_BITSIZE (smode
)),
15183 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
15184 val
= expand_simple_binop (wsmode
, IOR
, val
, x
, x
, 1,
15188 x
= gen_reg_rtx (wvmode
);
15189 ok
= ix86_expand_vector_init_duplicate (mmx_ok
, wvmode
, x
, val
);
15191 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), x
));
15200 return ix86_vector_duplicate_value (mode
, target
, val
);
15203 machine_mode hvmode
;
15216 hvmode
= V16QImode
;
15219 gcc_unreachable ();
15221 rtx x
= gen_reg_rtx (hvmode
);
15223 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
15226 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
15227 emit_insn (gen_rtx_SET (target
, x
));
15235 if (TARGET_AVX512BW
)
15236 return ix86_vector_duplicate_value (mode
, target
, val
);
15239 machine_mode hvmode
;
15243 hvmode
= V16HImode
;
15246 hvmode
= V16HFmode
;
15249 hvmode
= V16BFmode
;
15252 hvmode
= V32QImode
;
15255 gcc_unreachable ();
15257 rtx x
= gen_reg_rtx (hvmode
);
15259 ok
= ix86_expand_vector_init_duplicate (false, hvmode
, x
, val
);
15262 x
= gen_rtx_VEC_CONCAT (mode
, x
, x
);
15263 emit_insn (gen_rtx_SET (target
, x
));
15272 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15273 whose ONE_VAR element is VAR, and other elements are zero. Return true
15277 ix86_expand_vector_init_one_nonzero (bool mmx_ok
, machine_mode mode
,
15278 rtx target
, rtx var
, int one_var
)
15280 machine_mode vsimode
;
15283 bool use_vector_set
= false;
15284 rtx (*gen_vec_set_0
) (rtx
, rtx
, rtx
) = NULL
;
15289 /* For SSE4.1, we normally use vector set. But if the second
15290 element is zero and inter-unit moves are OK, we use movq
15292 use_vector_set
= (TARGET_64BIT
&& TARGET_SSE4_1
15293 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15299 use_vector_set
= TARGET_SSE4_1
;
15302 use_vector_set
= TARGET_SSE2
;
15303 gen_vec_set_0
= TARGET_AVX512FP16
&& one_var
== 0
15304 ? gen_vec_setv8hi_0
: NULL
;
15307 use_vector_set
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
15310 use_vector_set
= TARGET_SSE
|| TARGET_3DNOW_A
;
15313 use_vector_set
= TARGET_SSE4_1
;
15316 use_vector_set
= TARGET_AVX
;
15319 use_vector_set
= TARGET_AVX
;
15320 gen_vec_set_0
= TARGET_AVX512FP16
&& one_var
== 0
15321 ? gen_vec_setv16hi_0
: NULL
;
15324 use_vector_set
= TARGET_AVX
;
15325 gen_vec_set_0
= gen_vec_setv8si_0
;
15328 use_vector_set
= TARGET_AVX
;
15329 gen_vec_set_0
= gen_vec_setv8sf_0
;
15332 use_vector_set
= TARGET_AVX
;
15333 gen_vec_set_0
= gen_vec_setv4df_0
;
15336 /* Use ix86_expand_vector_set in 64bit mode only. */
15337 use_vector_set
= TARGET_AVX
&& TARGET_64BIT
;
15338 gen_vec_set_0
= gen_vec_setv4di_0
;
15341 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
15342 gen_vec_set_0
= gen_vec_setv16si_0
;
15345 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
15346 gen_vec_set_0
= gen_vec_setv16sf_0
;
15349 use_vector_set
= TARGET_AVX512F
&& one_var
== 0;
15350 gen_vec_set_0
= gen_vec_setv8df_0
;
15353 /* Use ix86_expand_vector_set in 64bit mode only. */
15354 use_vector_set
= TARGET_AVX512F
&& TARGET_64BIT
&& one_var
== 0;
15355 gen_vec_set_0
= gen_vec_setv8di_0
;
15358 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15359 gen_vec_set_0
= gen_vec_setv8hf_0
;
15362 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15363 gen_vec_set_0
= gen_vec_setv16hf_0
;
15366 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15367 gen_vec_set_0
= gen_vec_setv32hf_0
;
15370 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15371 gen_vec_set_0
= gen_vec_setv8bf_0
;
15374 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15375 gen_vec_set_0
= gen_vec_setv16bf_0
;
15378 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15379 gen_vec_set_0
= gen_vec_setv32bf_0
;
15382 use_vector_set
= TARGET_AVX512FP16
&& one_var
== 0;
15383 gen_vec_set_0
= gen_vec_setv32hi_0
;
15388 if (use_vector_set
)
15390 if (gen_vec_set_0
&& one_var
== 0)
15392 var
= force_reg (GET_MODE_INNER (mode
), var
);
15393 emit_insn (gen_vec_set_0 (target
, CONST0_RTX (mode
), var
));
15396 emit_insn (gen_rtx_SET (target
, CONST0_RTX (mode
)));
15397 var
= force_reg (GET_MODE_INNER (mode
), var
);
15398 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
15414 var
= force_reg (GET_MODE_INNER (mode
), var
);
15415 x
= gen_rtx_VEC_CONCAT (mode
, var
, CONST0_RTX (GET_MODE_INNER (mode
)));
15416 emit_insn (gen_rtx_SET (target
, x
));
15421 if (!REG_P (target
) || REGNO (target
) < FIRST_PSEUDO_REGISTER
)
15422 new_target
= gen_reg_rtx (mode
);
15424 new_target
= target
;
15425 var
= force_reg (GET_MODE_INNER (mode
), var
);
15426 x
= gen_rtx_VEC_DUPLICATE (mode
, var
);
15427 x
= gen_rtx_VEC_MERGE (mode
, x
, CONST0_RTX (mode
), const1_rtx
);
15428 emit_insn (gen_rtx_SET (new_target
, x
));
15431 /* We need to shuffle the value to the correct position, so
15432 create a new pseudo to store the intermediate result. */
15434 /* With SSE2, we can use the integer shuffle insns. */
15435 if (mode
!= V4SFmode
&& TARGET_SSE2
)
15437 emit_insn (gen_sse2_pshufd_1 (new_target
, new_target
,
15439 GEN_INT (one_var
== 1 ? 0 : 1),
15440 GEN_INT (one_var
== 2 ? 0 : 1),
15441 GEN_INT (one_var
== 3 ? 0 : 1)));
15442 if (target
!= new_target
)
15443 emit_move_insn (target
, new_target
);
15447 /* Otherwise convert the intermediate result to V4SFmode and
15448 use the SSE1 shuffle instructions. */
15449 if (mode
!= V4SFmode
)
15451 tmp
= gen_reg_rtx (V4SFmode
);
15452 emit_move_insn (tmp
, gen_lowpart (V4SFmode
, new_target
));
15457 emit_insn (gen_sse_shufps_v4sf (tmp
, tmp
, tmp
,
15459 GEN_INT (one_var
== 1 ? 0 : 1),
15460 GEN_INT (one_var
== 2 ? 0+4 : 1+4),
15461 GEN_INT (one_var
== 3 ? 0+4 : 1+4)));
15463 if (mode
!= V4SFmode
)
15464 emit_move_insn (target
, gen_lowpart (V4SImode
, tmp
));
15465 else if (tmp
!= target
)
15466 emit_move_insn (target
, tmp
);
15468 else if (target
!= new_target
)
15469 emit_move_insn (target
, new_target
);
15474 vsimode
= V4SImode
;
15480 vsimode
= V2SImode
;
15486 /* Zero extend the variable element to SImode and recurse. */
15487 var
= convert_modes (SImode
, GET_MODE_INNER (mode
), var
, true);
15489 x
= gen_reg_rtx (vsimode
);
15490 if (!ix86_expand_vector_init_one_nonzero (mmx_ok
, vsimode
, x
,
15492 gcc_unreachable ();
15494 emit_move_insn (target
, gen_lowpart (mode
, x
));
15502 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15503 consisting of the values in VALS. It is known that all elements
15504 except ONE_VAR are constants. Return true if successful. */
15507 ix86_expand_vector_init_one_var (bool mmx_ok
, machine_mode mode
,
15508 rtx target
, rtx vals
, int one_var
)
15510 rtx var
= XVECEXP (vals
, 0, one_var
);
15511 machine_mode wmode
;
15514 const_vec
= copy_rtx (vals
);
15515 XVECEXP (const_vec
, 0, one_var
) = CONST0_RTX (GET_MODE_INNER (mode
));
15516 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (const_vec
, 0));
15524 /* For the two element vectors, it's just as easy to use
15525 the general case. */
15529 /* Use ix86_expand_vector_set in 64bit mode only. */
15554 if (TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
)
15563 /* There's no way to set one QImode entry easily. Combine
15564 the variable value with its adjacent constant value, and
15565 promote to an HImode set. */
15566 x
= XVECEXP (vals
, 0, one_var
^ 1);
15569 var
= convert_modes (HImode
, QImode
, var
, true);
15570 var
= expand_simple_binop (HImode
, ASHIFT
, var
, GEN_INT (8),
15571 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
15572 x
= GEN_INT (INTVAL (x
) & 0xff);
15576 var
= convert_modes (HImode
, QImode
, var
, true);
15577 x
= gen_int_mode (UINTVAL (x
) << 8, HImode
);
15579 if (x
!= const0_rtx
)
15580 var
= expand_simple_binop (HImode
, IOR
, var
, x
, var
,
15581 1, OPTAB_LIB_WIDEN
);
15583 x
= gen_reg_rtx (wmode
);
15584 emit_move_insn (x
, gen_lowpart (wmode
, const_vec
));
15585 ix86_expand_vector_set (mmx_ok
, x
, var
, one_var
>> 1);
15587 emit_move_insn (target
, gen_lowpart (mode
, x
));
15594 emit_move_insn (target
, const_vec
);
15595 ix86_expand_vector_set (mmx_ok
, target
, var
, one_var
);
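/* Illustrative sketch (not part of GCC): the QImode case above cannot set a
   single byte directly, so the variable byte is merged with its adjacent
   constant byte into one 16-bit value and inserted with an HImode vector set
   at position ONE_VAR >> 1.  Little-endian element layout is assumed here.  */
#include <stdint.h>

static inline uint16_t
pair_byte_with_constant (uint8_t var, uint8_t neighbor_cst, int one_var)
{
  /* Odd positions land in the high byte of the 16-bit pair,
     even positions in the low byte.  */
  if (one_var & 1)
    return (uint16_t) ((var << 8) | neighbor_cst);
  else
    return (uint16_t) (var | (neighbor_cst << 8));
}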
15599 /* A subroutine of ix86_expand_vector_init_general. Use vector
15600 concatenate to handle the most general case: all values variable,
15601 and none identical. */
15604 ix86_expand_vector_init_concat (machine_mode mode
,
15605 rtx target
, rtx
*ops
, int n
)
15607 machine_mode half_mode
= VOIDmode
;
15618 half_mode
= V16HFmode
;
15621 half_mode
= V16BFmode
;
15624 half_mode
= V8SImode
;
15627 half_mode
= V8SFmode
;
15630 half_mode
= V4DImode
;
15633 half_mode
= V4DFmode
;
15636 half_mode
= V8HFmode
;
15639 half_mode
= V8BFmode
;
15642 half_mode
= V4SImode
;
15645 half_mode
= V4SFmode
;
15648 half_mode
= V2DImode
;
15651 half_mode
= V2DFmode
;
15654 half_mode
= V2SImode
;
15657 half_mode
= V2SFmode
;
15660 half_mode
= DImode
;
15663 half_mode
= SImode
;
15666 half_mode
= DFmode
;
15669 half_mode
= SFmode
;
15672 gcc_unreachable ();
15675 if (!register_operand (ops
[1], half_mode
))
15676 ops
[1] = force_reg (half_mode
, ops
[1]);
15677 if (!register_operand (ops
[0], half_mode
))
15678 ops
[0] = force_reg (half_mode
, ops
[0]);
15679 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, ops
[0],
15687 half_mode
= V2DImode
;
15690 half_mode
= V2DFmode
;
15693 half_mode
= V2SImode
;
15696 half_mode
= V2SFmode
;
15699 gcc_unreachable ();
15707 half_mode
= V4DImode
;
15710 half_mode
= V4DFmode
;
15713 half_mode
= V4SImode
;
15716 half_mode
= V4SFmode
;
15719 gcc_unreachable ();
15727 half_mode
= V8SImode
;
15730 half_mode
= V8SFmode
;
15733 gcc_unreachable ();
15738 /* FIXME: We process inputs backward to help RA. PR 36222. */
15740 for (j
= 1; j
!= -1; j
--)
15742 half
[j
] = gen_reg_rtx (half_mode
);
15746 v
= gen_rtvec (2, ops
[i
-1], ops
[i
]);
15750 v
= gen_rtvec (4, ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
15754 v
= gen_rtvec (8, ops
[i
-7], ops
[i
-6], ops
[i
-5], ops
[i
-4],
15755 ops
[i
-3], ops
[i
-2], ops
[i
-1], ops
[i
]);
15759 gcc_unreachable ();
15761 ix86_expand_vector_init (false, half
[j
],
15762 gen_rtx_PARALLEL (half_mode
, v
));
15765 ix86_expand_vector_init_concat (mode
, target
, half
, 2);
15769 gcc_unreachable ();
15773 /* A subroutine of ix86_expand_vector_init_general. Use vector
15774 interleave to handle the most general case: all values variable,
15775 and none identical. */
15778 ix86_expand_vector_init_interleave (machine_mode mode
,
15779 rtx target
, rtx
*ops
, int n
)
15781 machine_mode first_imode
, second_imode
, third_imode
, inner_mode
;
15784 rtx (*gen_load_even
) (rtx
, rtx
, rtx
);
15785 rtx (*gen_interleave_first_low
) (rtx
, rtx
, rtx
);
15786 rtx (*gen_interleave_second_low
) (rtx
, rtx
, rtx
);
15791 gen_load_even
= gen_vec_interleave_lowv8hf
;
15792 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
15793 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
15794 inner_mode
= HFmode
;
15795 first_imode
= V4SImode
;
15796 second_imode
= V2DImode
;
15797 third_imode
= VOIDmode
;
15800 gen_load_even
= gen_vec_interleave_lowv8bf
;
15801 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
15802 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
15803 inner_mode
= BFmode
;
15804 first_imode
= V4SImode
;
15805 second_imode
= V2DImode
;
15806 third_imode
= VOIDmode
;
15809 gen_load_even
= gen_vec_setv8hi
;
15810 gen_interleave_first_low
= gen_vec_interleave_lowv4si
;
15811 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
15812 inner_mode
= HImode
;
15813 first_imode
= V4SImode
;
15814 second_imode
= V2DImode
;
15815 third_imode
= VOIDmode
;
15818 gen_load_even
= gen_vec_setv16qi
;
15819 gen_interleave_first_low
= gen_vec_interleave_lowv8hi
;
15820 gen_interleave_second_low
= gen_vec_interleave_lowv4si
;
15821 inner_mode
= QImode
;
15822 first_imode
= V8HImode
;
15823 second_imode
= V4SImode
;
15824 third_imode
= V2DImode
;
15827 gcc_unreachable ();
15830 for (i
= 0; i
< n
; i
++)
15833 if (inner_mode
== HFmode
|| inner_mode
== BFmode
)
/* Use vpunpcklwd to pack two HFmode or BFmode elements.  */
15837 machine_mode vec_mode
=
15838 (inner_mode
== HFmode
) ? V8HFmode
: V8BFmode
;
15839 op0
= gen_reg_rtx (vec_mode
);
15840 even
= lowpart_subreg (vec_mode
,
15841 force_reg (inner_mode
, op
), inner_mode
);
15842 odd
= lowpart_subreg (vec_mode
,
15843 force_reg (inner_mode
, ops
[i
+ i
+ 1]),
15845 emit_insn (gen_load_even (op0
, even
, odd
));
/* Extend the odd element to SImode using a paradoxical SUBREG.  */
15850 op0
= gen_reg_rtx (SImode
);
15851 emit_move_insn (op0
, gen_lowpart (SImode
, op
));
15853 /* Insert the SImode value as low element of V4SImode vector. */
15854 op1
= gen_reg_rtx (V4SImode
);
15855 op0
= gen_rtx_VEC_MERGE (V4SImode
,
15856 gen_rtx_VEC_DUPLICATE (V4SImode
,
15858 CONST0_RTX (V4SImode
),
15860 emit_insn (gen_rtx_SET (op1
, op0
));
/* Cast the V4SImode vector back to a vector in the original mode.  */
15863 op0
= gen_reg_rtx (mode
);
15864 emit_move_insn (op0
, gen_lowpart (mode
, op1
));
15866 /* Load even elements into the second position. */
15867 emit_insn (gen_load_even (op0
,
15868 force_reg (inner_mode
,
15873 /* Cast vector to FIRST_IMODE vector. */
15874 ops
[i
] = gen_reg_rtx (first_imode
);
15875 emit_move_insn (ops
[i
], gen_lowpart (first_imode
, op0
));
15878 /* Interleave low FIRST_IMODE vectors. */
15879 for (i
= j
= 0; i
< n
; i
+= 2, j
++)
15881 op0
= gen_reg_rtx (first_imode
);
15882 emit_insn (gen_interleave_first_low (op0
, ops
[i
], ops
[i
+ 1]));
15884 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
15885 ops
[j
] = gen_reg_rtx (second_imode
);
15886 emit_move_insn (ops
[j
], gen_lowpart (second_imode
, op0
));
15889 /* Interleave low SECOND_IMODE vectors. */
15890 switch (second_imode
)
15893 for (i
= j
= 0; i
< n
/ 2; i
+= 2, j
++)
15895 op0
= gen_reg_rtx (second_imode
);
15896 emit_insn (gen_interleave_second_low (op0
, ops
[i
],
15899 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
15901 ops
[j
] = gen_reg_rtx (third_imode
);
15902 emit_move_insn (ops
[j
], gen_lowpart (third_imode
, op0
));
15904 second_imode
= V2DImode
;
15905 gen_interleave_second_low
= gen_vec_interleave_lowv2di
;
15909 op0
= gen_reg_rtx (second_imode
);
15910 emit_insn (gen_interleave_second_low (op0
, ops
[0],
15913 /* Cast the SECOND_IMODE vector back to a vector on original
15915 emit_insn (gen_rtx_SET (target
, gen_lowpart (mode
, op0
)));
15919 gcc_unreachable ();
15923 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
15924 all values variable, and none identical. */
15927 ix86_expand_vector_init_general (bool mmx_ok
, machine_mode mode
,
15928 rtx target
, rtx vals
)
15930 rtx ops
[64], op0
, op1
, op2
, op3
, op4
, op5
;
15931 machine_mode half_mode
= VOIDmode
;
15932 machine_mode quarter_mode
= VOIDmode
;
15939 if (!mmx_ok
&& !TARGET_SSE
)
15955 n
= GET_MODE_NUNITS (mode
);
15956 for (i
= 0; i
< n
; i
++)
15957 ops
[i
] = XVECEXP (vals
, 0, i
);
15958 ix86_expand_vector_init_concat (mode
, target
, ops
, n
);
15962 for (i
= 0; i
< 2; i
++)
15963 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
15964 op0
= gen_reg_rtx (V4DImode
);
15965 ix86_expand_vector_init_concat (V4DImode
, op0
, ops
, 2);
15966 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
15970 for (i
= 0; i
< 4; i
++)
15971 ops
[i
] = gen_lowpart (V2DImode
, XVECEXP (vals
, 0, i
));
15972 ops
[4] = gen_reg_rtx (V4DImode
);
15973 ix86_expand_vector_init_concat (V4DImode
, ops
[4], ops
, 2);
15974 ops
[5] = gen_reg_rtx (V4DImode
);
15975 ix86_expand_vector_init_concat (V4DImode
, ops
[5], ops
+ 2, 2);
15976 op0
= gen_reg_rtx (V8DImode
);
15977 ix86_expand_vector_init_concat (V8DImode
, op0
, ops
+ 4, 2);
15978 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), op0
));
15982 half_mode
= V16QImode
;
15986 half_mode
= V8HImode
;
15990 half_mode
= V8HFmode
;
15994 half_mode
= V8BFmode
;
15998 n
= GET_MODE_NUNITS (mode
);
15999 for (i
= 0; i
< n
; i
++)
16000 ops
[i
] = XVECEXP (vals
, 0, i
);
16001 op0
= gen_reg_rtx (half_mode
);
16002 op1
= gen_reg_rtx (half_mode
);
16003 ix86_expand_vector_init_interleave (half_mode
, op0
, ops
,
16005 ix86_expand_vector_init_interleave (half_mode
, op1
,
16006 &ops
[n
>> 1], n
>> 2);
16007 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op0
, op1
)));
16011 quarter_mode
= V16QImode
;
16012 half_mode
= V32QImode
;
16016 quarter_mode
= V8HImode
;
16017 half_mode
= V16HImode
;
16021 quarter_mode
= V8HFmode
;
16022 half_mode
= V16HFmode
;
16026 quarter_mode
= V8BFmode
;
16027 half_mode
= V16BFmode
;
16031 n
= GET_MODE_NUNITS (mode
);
16032 for (i
= 0; i
< n
; i
++)
16033 ops
[i
] = XVECEXP (vals
, 0, i
);
16034 op0
= gen_reg_rtx (quarter_mode
);
16035 op1
= gen_reg_rtx (quarter_mode
);
16036 op2
= gen_reg_rtx (quarter_mode
);
16037 op3
= gen_reg_rtx (quarter_mode
);
16038 op4
= gen_reg_rtx (half_mode
);
16039 op5
= gen_reg_rtx (half_mode
);
16040 ix86_expand_vector_init_interleave (quarter_mode
, op0
, ops
,
16042 ix86_expand_vector_init_interleave (quarter_mode
, op1
,
16043 &ops
[n
>> 2], n
>> 3);
16044 ix86_expand_vector_init_interleave (quarter_mode
, op2
,
16045 &ops
[n
>> 1], n
>> 3);
16046 ix86_expand_vector_init_interleave (quarter_mode
, op3
,
16047 &ops
[(n
>> 1) | (n
>> 2)], n
>> 3);
16048 emit_insn (gen_rtx_SET (op4
, gen_rtx_VEC_CONCAT (half_mode
, op0
, op1
)));
16049 emit_insn (gen_rtx_SET (op5
, gen_rtx_VEC_CONCAT (half_mode
, op2
, op3
)));
16050 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, op4
, op5
)));
16054 if (!TARGET_SSE4_1
)
16062 /* Don't use ix86_expand_vector_init_interleave if we can't
16063 move from GPR to SSE register directly. */
16064 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
16071 n
= GET_MODE_NUNITS (mode
);
16072 for (i
= 0; i
< n
; i
++)
16073 ops
[i
] = XVECEXP (vals
, 0, i
);
16074 ix86_expand_vector_init_interleave (mode
, target
, ops
, n
>> 1);
16085 gcc_unreachable ();
16089 int i
, j
, n_elts
, n_words
, n_elt_per_word
;
16090 machine_mode tmp_mode
, inner_mode
;
16091 rtx words
[4], shift
;
16093 tmp_mode
= (GET_MODE_SIZE (mode
) < UNITS_PER_WORD
) ? SImode
: word_mode
;
16095 inner_mode
= GET_MODE_INNER (mode
);
16096 n_elts
= GET_MODE_NUNITS (mode
);
16097 n_words
= GET_MODE_SIZE (mode
) / GET_MODE_SIZE (tmp_mode
);
16098 n_elt_per_word
= n_elts
/ n_words
;
16099 shift
= GEN_INT (GET_MODE_BITSIZE (inner_mode
));
16101 for (i
= 0; i
< n_words
; ++i
)
16103 rtx word
= NULL_RTX
;
16105 for (j
= 0; j
< n_elt_per_word
; ++j
)
16107 rtx elt
= XVECEXP (vals
, 0, (i
+1)*n_elt_per_word
- j
- 1);
16108 elt
= convert_modes (tmp_mode
, inner_mode
, elt
, true);
16114 word
= expand_simple_binop (tmp_mode
, ASHIFT
, word
, shift
,
16115 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
16116 word
= expand_simple_binop (tmp_mode
, IOR
, word
, elt
,
16117 NULL_RTX
, 1, OPTAB_LIB_WIDEN
);
16125 emit_move_insn (target
, gen_lowpart (mode
, words
[0]));
16126 else if (n_words
== 2)
16128 rtx tmp
= gen_reg_rtx (mode
);
16129 emit_clobber (tmp
);
16130 emit_move_insn (gen_lowpart (tmp_mode
, tmp
), words
[0]);
16131 emit_move_insn (gen_highpart (tmp_mode
, tmp
), words
[1]);
16132 emit_move_insn (target
, tmp
);
16134 else if (n_words
== 4)
16136 rtx tmp
= gen_reg_rtx (V4SImode
);
16137 gcc_assert (tmp_mode
== SImode
);
16138 vals
= gen_rtx_PARALLEL (V4SImode
, gen_rtvec_v (4, words
));
16139 ix86_expand_vector_init_general (false, V4SImode
, tmp
, vals
);
16140 emit_move_insn (target
, gen_lowpart (mode
, tmp
));
16143 gcc_unreachable ();
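/* A sketch (not part of GCC) of the word-building fallback above: each
   machine word of the vector is assembled from its elements with repeated
   shift-and-or, walking the elements from the highest-numbered one down so
   that element 0 ends up in the least significant bits.  An 8-bit element
   width is assumed here purely for illustration.  */
#include <stdint.h>

static inline uint32_t
pack_word_from_bytes (const uint8_t *elts, int n_elt_per_word)
{
  uint32_t word = 0;
  for (int j = 0; j < n_elt_per_word; j++)
    {
      /* Take elements in reverse order, as the loop above does.  */
      uint8_t elt = elts[n_elt_per_word - j - 1];
      word = (word << 8) | elt;
    }
  return word;
}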
16147 /* Initialize vector TARGET via VALS. Suppress the use of MMX
16148 instructions unless MMX_OK is true. */
16151 ix86_expand_vector_init (bool mmx_ok
, rtx target
, rtx vals
)
16153 machine_mode mode
= GET_MODE (target
);
16154 machine_mode inner_mode
= GET_MODE_INNER (mode
);
16155 int n_elts
= GET_MODE_NUNITS (mode
);
16156 int n_var
= 0, one_var
= -1;
16157 bool all_same
= true, all_const_zero
= true;
16161 /* Handle first initialization from vector elts. */
16162 if (n_elts
!= XVECLEN (vals
, 0))
16164 rtx subtarget
= target
;
16165 x
= XVECEXP (vals
, 0, 0);
16166 gcc_assert (GET_MODE_INNER (GET_MODE (x
)) == inner_mode
);
16167 if (GET_MODE_NUNITS (GET_MODE (x
)) * 2 == n_elts
)
16169 rtx ops
[2] = { XVECEXP (vals
, 0, 0), XVECEXP (vals
, 0, 1) };
16170 if (inner_mode
== QImode
16171 || inner_mode
== HImode
16172 || inner_mode
== TImode
16173 || inner_mode
== HFmode
16174 || inner_mode
== BFmode
)
16176 unsigned int n_bits
= n_elts
* GET_MODE_SIZE (inner_mode
);
16177 scalar_mode elt_mode
= inner_mode
== TImode
? DImode
: SImode
;
16178 n_bits
/= GET_MODE_SIZE (elt_mode
);
16179 mode
= mode_for_vector (elt_mode
, n_bits
).require ();
16180 inner_mode
= mode_for_vector (elt_mode
, n_bits
/ 2).require ();
16181 ops
[0] = gen_lowpart (inner_mode
, ops
[0]);
16182 ops
[1] = gen_lowpart (inner_mode
, ops
[1]);
16183 subtarget
= gen_reg_rtx (mode
);
16185 ix86_expand_vector_init_concat (mode
, subtarget
, ops
, 2);
16186 if (subtarget
!= target
)
16187 emit_move_insn (target
, gen_lowpart (GET_MODE (target
), subtarget
));
16190 gcc_unreachable ();
16193 for (i
= 0; i
< n_elts
; ++i
)
16195 x
= XVECEXP (vals
, 0, i
);
16196 if (!(CONST_SCALAR_INT_P (x
)
16197 || CONST_DOUBLE_P (x
)
16198 || CONST_FIXED_P (x
)))
16199 n_var
++, one_var
= i
;
16200 else if (x
!= CONST0_RTX (inner_mode
))
16201 all_const_zero
= false;
16202 if (i
> 0 && !rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
16206 /* Constants are best loaded from the constant pool. */
16209 emit_move_insn (target
, gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0)));
16213 /* If all values are identical, broadcast the value. */
16215 && ix86_expand_vector_init_duplicate (mmx_ok
, mode
, target
,
16216 XVECEXP (vals
, 0, 0)))
16219 /* Values where only one field is non-constant are best loaded from
16220 the pool and overwritten via move later. */
16224 && ix86_expand_vector_init_one_nonzero (mmx_ok
, mode
, target
,
16225 XVECEXP (vals
, 0, one_var
),
16229 if (ix86_expand_vector_init_one_var (mmx_ok
, mode
, target
, vals
, one_var
))
16233 ix86_expand_vector_init_general (mmx_ok
, mode
, target
, vals
);
/* This is implemented as

     V setg (V v, int idx, T val)
     {
       V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
       V valv = (V){val, val, val, val, val, val, val, val};
       V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
       v = (v & ~mask) | (valv & mask);
       return v;
     }  */
16246 ix86_expand_vector_set_var (rtx target
, rtx val
, rtx idx
)
16249 machine_mode mode
= GET_MODE (target
);
16250 machine_mode cmp_mode
= mode
;
16251 int n_elts
= GET_MODE_NUNITS (mode
);
16252 rtx valv
,idxv
,constv
,idx_tmp
;
/* 512-bit vector byte/word broadcast and comparison are only available
   under TARGET_AVX512BW; without TARGET_AVX512BW, break the 512-bit
   vector into two 256-bit vectors.  */
16258 if ((mode
== V32HImode
|| mode
== V32HFmode
|| mode
== V32BFmode
16259 || mode
== V64QImode
)
16260 && !TARGET_AVX512BW
)
16262 gcc_assert (TARGET_AVX512F
);
16263 rtx vhi
, vlo
, idx_hi
;
16264 machine_mode half_mode
;
16265 rtx (*extract_hi
)(rtx
, rtx
);
16266 rtx (*extract_lo
)(rtx
, rtx
);
16268 if (mode
== V32HImode
)
16270 half_mode
= V16HImode
;
16271 extract_hi
= gen_vec_extract_hi_v32hi
;
16272 extract_lo
= gen_vec_extract_lo_v32hi
;
16274 else if (mode
== V32HFmode
)
16276 half_mode
= V16HFmode
;
16277 extract_hi
= gen_vec_extract_hi_v32hf
;
16278 extract_lo
= gen_vec_extract_lo_v32hf
;
16280 else if (mode
== V32BFmode
)
16282 half_mode
= V16BFmode
;
16283 extract_hi
= gen_vec_extract_hi_v32bf
;
16284 extract_lo
= gen_vec_extract_lo_v32bf
;
16288 half_mode
= V32QImode
;
16289 extract_hi
= gen_vec_extract_hi_v64qi
;
16290 extract_lo
= gen_vec_extract_lo_v64qi
;
16293 vhi
= gen_reg_rtx (half_mode
);
16294 vlo
= gen_reg_rtx (half_mode
);
16295 idx_hi
= gen_reg_rtx (GET_MODE (idx
));
16296 emit_insn (extract_hi (vhi
, target
));
16297 emit_insn (extract_lo (vlo
, target
));
16300 vec
[2] = GEN_INT (n_elts
/2);
16301 ix86_expand_binary_operator (MINUS
, GET_MODE (idx
), vec
);
16302 ix86_expand_vector_set_var (vhi
, val
, idx_hi
);
16303 ix86_expand_vector_set_var (vlo
, val
, idx
);
16304 emit_insn (gen_rtx_SET (target
, gen_rtx_VEC_CONCAT (mode
, vlo
, vhi
)));
16308 if (FLOAT_MODE_P (GET_MODE_INNER (mode
)))
16313 cmp_mode
= V2DImode
;
16316 cmp_mode
= V4DImode
;
16319 cmp_mode
= V8DImode
;
16322 cmp_mode
= V2SImode
;
16325 cmp_mode
= V4SImode
;
16328 cmp_mode
= V8SImode
;
16331 cmp_mode
= V16SImode
;
16334 cmp_mode
= V8HImode
;
16337 cmp_mode
= V16HImode
;
16340 cmp_mode
= V32HImode
;
16343 cmp_mode
= V8HImode
;
16346 cmp_mode
= V16HImode
;
16349 cmp_mode
= V32HImode
;
16352 gcc_unreachable ();
16356 for (int i
= 0; i
!= n_elts
; i
++)
16357 vec
[i
] = GEN_INT (i
);
16358 constv
= gen_rtx_CONST_VECTOR (cmp_mode
, gen_rtvec_v (n_elts
, vec
));
16359 valv
= gen_reg_rtx (mode
);
16360 idxv
= gen_reg_rtx (cmp_mode
);
16361 idx_tmp
= convert_to_mode (GET_MODE_INNER (cmp_mode
), idx
, 1);
16363 ok
= ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE
,
16366 ok
= ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE
,
16367 cmp_mode
, idxv
, idx_tmp
);
16372 vec
[3] = gen_rtx_EQ (mode
, idxv
, constv
);
16375 ok
= ix86_expand_int_vcond (vec
);
16380 ix86_expand_vector_set (bool mmx_ok
, rtx target
, rtx val
, int elt
)
16382 machine_mode mode
= GET_MODE (target
);
16383 machine_mode inner_mode
= GET_MODE_INNER (mode
);
16384 machine_mode half_mode
;
16385 bool use_vec_merge
= false;
16386 bool blendm_const
= false;
16388 static rtx (*gen_extract
[8][2]) (rtx
, rtx
)
16390 { gen_vec_extract_lo_v32qi
, gen_vec_extract_hi_v32qi
},
16391 { gen_vec_extract_lo_v16hi
, gen_vec_extract_hi_v16hi
},
16392 { gen_vec_extract_lo_v8si
, gen_vec_extract_hi_v8si
},
16393 { gen_vec_extract_lo_v4di
, gen_vec_extract_hi_v4di
},
16394 { gen_vec_extract_lo_v8sf
, gen_vec_extract_hi_v8sf
},
16395 { gen_vec_extract_lo_v4df
, gen_vec_extract_hi_v4df
},
16396 { gen_vec_extract_lo_v16hf
, gen_vec_extract_hi_v16hf
},
16397 { gen_vec_extract_lo_v16bf
, gen_vec_extract_hi_v16bf
}
16399 static rtx (*gen_insert
[8][2]) (rtx
, rtx
, rtx
)
16401 { gen_vec_set_lo_v32qi
, gen_vec_set_hi_v32qi
},
16402 { gen_vec_set_lo_v16hi
, gen_vec_set_hi_v16hi
},
16403 { gen_vec_set_lo_v8si
, gen_vec_set_hi_v8si
},
16404 { gen_vec_set_lo_v4di
, gen_vec_set_hi_v4di
},
16405 { gen_vec_set_lo_v8sf
, gen_vec_set_hi_v8sf
},
16406 { gen_vec_set_lo_v4df
, gen_vec_set_hi_v4df
},
16407 { gen_vec_set_lo_v16hf
, gen_vec_set_hi_v16hf
},
16408 { gen_vec_set_lo_v16bf
, gen_vec_set_hi_v16bf
},
16411 machine_mode mmode
= VOIDmode
;
16412 rtx (*gen_blendm
) (rtx
, rtx
, rtx
, rtx
);
16417 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
16425 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
16426 ix86_expand_vector_extract (true, tmp
, target
, 1 - elt
);
16428 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
16430 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
16431 emit_insn (gen_rtx_SET (target
, tmp
));
16437 use_vec_merge
= TARGET_SSE4_1
&& TARGET_64BIT
;
16441 tmp
= gen_reg_rtx (GET_MODE_INNER (mode
));
16442 ix86_expand_vector_extract (false, tmp
, target
, 1 - elt
);
16444 tmp
= gen_rtx_VEC_CONCAT (mode
, val
, tmp
);
16446 tmp
= gen_rtx_VEC_CONCAT (mode
, tmp
, val
);
16447 emit_insn (gen_rtx_SET (target
, tmp
));
/* NB: For ELT == 0, use standard scalar operation patterns which
   preserve the rest of the vector for combiner:

     (vec_merge:V2DF
       (vec_duplicate:V2DF (reg:DF))
       (reg:V2DF)
       (const_int 1))  */
16465 /* For the two element vectors, we implement a VEC_CONCAT with
16466 the extraction of the other element. */
16468 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (1 - elt
)));
16469 tmp
= gen_rtx_VEC_SELECT (inner_mode
, target
, tmp
);
16472 op0
= val
, op1
= tmp
;
16474 op0
= tmp
, op1
= val
;
16476 tmp
= gen_rtx_VEC_CONCAT (mode
, op0
, op1
);
16477 emit_insn (gen_rtx_SET (target
, tmp
));
16482 use_vec_merge
= TARGET_SSE4_1
;
16489 use_vec_merge
= true;
16493 /* tmp = target = A B C D */
16494 tmp
= copy_to_reg (target
);
16495 /* target = A A B B */
16496 emit_insn (gen_vec_interleave_lowv4sf (target
, target
, target
));
16497 /* target = X A B B */
16498 ix86_expand_vector_set (false, target
, val
, 0);
16499 /* target = A X C D */
16500 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
16501 const1_rtx
, const0_rtx
,
16502 GEN_INT (2+4), GEN_INT (3+4)));
16506 /* tmp = target = A B C D */
16507 tmp
= copy_to_reg (target
);
16508 /* tmp = X B C D */
16509 ix86_expand_vector_set (false, tmp
, val
, 0);
16510 /* target = A B X D */
16511 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
16512 const0_rtx
, const1_rtx
,
16513 GEN_INT (0+4), GEN_INT (3+4)));
16517 /* tmp = target = A B C D */
16518 tmp
= copy_to_reg (target
);
16519 /* tmp = X B C D */
16520 ix86_expand_vector_set (false, tmp
, val
, 0);
16521 /* target = A B X D */
16522 emit_insn (gen_sse_shufps_v4sf (target
, target
, tmp
,
16523 const0_rtx
, const1_rtx
,
16524 GEN_INT (2+4), GEN_INT (0+4)));
16528 gcc_unreachable ();
16533 use_vec_merge
= TARGET_SSE4_1
;
16537 /* Element 0 handled by vec_merge below. */
16540 use_vec_merge
= true;
16546 /* With SSE2, use integer shuffles to swap element 0 and ELT,
16547 store into element 0, then shuffle them back. */
16551 order
[0] = GEN_INT (elt
);
16552 order
[1] = const1_rtx
;
16553 order
[2] = const2_rtx
;
16554 order
[3] = GEN_INT (3);
16555 order
[elt
] = const0_rtx
;
16557 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
16558 order
[1], order
[2], order
[3]));
16560 ix86_expand_vector_set (false, target
, val
, 0);
16562 emit_insn (gen_sse2_pshufd_1 (target
, target
, order
[0],
16563 order
[1], order
[2], order
[3]));
16567 /* For SSE1, we have to reuse the V4SF code. */
16568 rtx t
= gen_reg_rtx (V4SFmode
);
16569 emit_move_insn (t
, gen_lowpart (V4SFmode
, target
));
16570 ix86_expand_vector_set (false, t
, gen_lowpart (SFmode
, val
), elt
);
16571 emit_move_insn (target
, gen_lowpart (mode
, t
));
16579 use_vec_merge
= TARGET_SSE2
;
16582 use_vec_merge
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
16587 use_vec_merge
= TARGET_SSE4_1
;
16591 use_vec_merge
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
16595 half_mode
= V16QImode
;
16602 /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
16603 if (TARGET_AVX2
&& elt
!= 0)
16606 gen_blendm
= ((mode
== E_V16HFmode
) ? gen_avx2_pblendph_1
16607 : gen_avx2_pblendbf_1
);
16608 blendm_const
= true;
16613 half_mode
= ((mode
== E_V16HFmode
) ? V8HFmode
: V8BFmode
);
16614 j
= ((mode
== E_V16HFmode
) ? 6 : 7);
16620 half_mode
= V8HImode
;
16626 half_mode
= V4SImode
;
16632 half_mode
= V2DImode
;
16638 half_mode
= V4SFmode
;
16644 half_mode
= V2DFmode
;
16650 /* Compute offset. */
16654 gcc_assert (i
<= 1);
16656 /* Extract the half. */
16657 tmp
= gen_reg_rtx (half_mode
);
16658 emit_insn (gen_extract
[j
][i
] (tmp
, target
));
16660 /* Put val in tmp at elt. */
16661 ix86_expand_vector_set (false, tmp
, val
, elt
);
16664 emit_insn (gen_insert
[j
][i
] (target
, target
, tmp
));
16668 if (TARGET_AVX512F
)
16671 gen_blendm
= gen_avx512f_blendmv8df
;
16676 if (TARGET_AVX512F
)
16679 gen_blendm
= gen_avx512f_blendmv8di
;
16684 if (TARGET_AVX512F
)
16687 gen_blendm
= gen_avx512f_blendmv16sf
;
16692 if (TARGET_AVX512F
)
16695 gen_blendm
= gen_avx512f_blendmv16si
;
16700 if (TARGET_AVX512BW
)
16703 gen_blendm
= gen_avx512bw_blendmv32hf
;
16707 if (TARGET_AVX512BW
)
16710 gen_blendm
= gen_avx512bw_blendmv32bf
;
16714 if (TARGET_AVX512BW
)
16717 gen_blendm
= gen_avx512bw_blendmv32hi
;
16719 else if (TARGET_AVX512F
)
16721 half_mode
= E_V8HImode
;
16728 if (TARGET_AVX512BW
)
16731 gen_blendm
= gen_avx512bw_blendmv64qi
;
16733 else if (TARGET_AVX512F
)
16735 half_mode
= E_V16QImode
;
16742 /* Compute offset. */
16746 gcc_assert (i
<= 3);
16749 /* Extract the quarter. */
16750 tmp
= gen_reg_rtx (V4SImode
);
16751 rtx tmp2
= gen_lowpart (V16SImode
, target
);
16752 rtx mask
= gen_reg_rtx (QImode
);
16754 emit_move_insn (mask
, constm1_rtx
);
16755 emit_insn (gen_avx512f_vextracti32x4_mask (tmp
, tmp2
, GEN_INT (i
),
16758 tmp2
= gen_reg_rtx (half_mode
);
16759 emit_move_insn (tmp2
, gen_lowpart (half_mode
, tmp
));
16762 /* Put val in tmp at elt. */
16763 ix86_expand_vector_set (false, tmp
, val
, elt
);
16766 tmp2
= gen_reg_rtx (V16SImode
);
16767 rtx tmp3
= gen_lowpart (V16SImode
, target
);
16768 mask
= gen_reg_rtx (HImode
);
16769 emit_move_insn (mask
, constm1_rtx
);
16770 tmp
= gen_lowpart (V4SImode
, tmp
);
16771 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2
, tmp3
, tmp
, GEN_INT (i
),
16773 emit_move_insn (target
, gen_lowpart (mode
, tmp2
));
16781 if (mmode
!= VOIDmode
)
16783 tmp
= gen_reg_rtx (mode
);
16784 emit_insn (gen_rtx_SET (tmp
, gen_rtx_VEC_DUPLICATE (mode
, val
)));
16785 rtx merge_mask
= gen_int_mode (HOST_WIDE_INT_1U
<< elt
, mmode
);
16786 /* The avx512*_blendm<mode> expanders have different operand order
16787 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
16788 elements where the mask is set and second input operand otherwise,
16789 in {sse,avx}*_*blend* the first input operand is used for elements
16790 where the mask is clear and second input operand otherwise. */
16792 merge_mask
= force_reg (mmode
, merge_mask
);
16793 emit_insn (gen_blendm (target
, target
, tmp
, merge_mask
));
16795 else if (use_vec_merge
)
16798 tmp
= gen_rtx_VEC_DUPLICATE (mode
, val
);
16799 tmp
= gen_rtx_VEC_MERGE (mode
, tmp
, target
,
16800 GEN_INT (HOST_WIDE_INT_1U
<< elt
));
16801 emit_insn (gen_rtx_SET (target
, tmp
));
16805 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
16807 emit_move_insn (mem
, target
);
16809 tmp
= adjust_address (mem
, inner_mode
, elt
* GET_MODE_SIZE (inner_mode
));
16810 emit_move_insn (tmp
, val
);
16812 emit_move_insn (target
, mem
);
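/* A minimal sketch (not part of GCC) of the generic fallback used above when
   no suitable insn exists: spill the vector to a stack temporary, overwrite
   the selected element in memory, and reload the whole vector.  */
#include <string.h>

static inline void
set_element_via_memory (void *vec, size_t vec_size,
			const void *val, size_t elt_size, size_t elt)
{
  unsigned char buf[64];			/* 64 bytes covers up to 512-bit vectors */
  memcpy (buf, vec, vec_size);			/* emit_move_insn (mem, target) */
  memcpy (buf + elt * elt_size, val, elt_size);	/* store the one element */
  memcpy (vec, buf, vec_size);			/* emit_move_insn (target, mem) */
}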
16817 ix86_expand_vector_extract (bool mmx_ok
, rtx target
, rtx vec
, int elt
)
16819 machine_mode mode
= GET_MODE (vec
);
16820 machine_mode inner_mode
= GET_MODE_INNER (mode
);
16821 bool use_vec_extr
= false;
16827 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
16841 use_vec_extr
= true;
16845 use_vec_extr
= TARGET_SSE4_1
;
16857 tmp
= gen_reg_rtx (mode
);
16858 emit_insn (gen_sse_shufps_v4sf (tmp
, vec
, vec
,
16859 GEN_INT (elt
), GEN_INT (elt
),
16860 GEN_INT (elt
+4), GEN_INT (elt
+4)));
16864 tmp
= gen_reg_rtx (mode
);
16865 emit_insn (gen_vec_interleave_highv4sf (tmp
, vec
, vec
));
16869 gcc_unreachable ();
16872 use_vec_extr
= true;
16877 use_vec_extr
= TARGET_SSE4_1
;
16891 tmp
= gen_reg_rtx (mode
);
16892 emit_insn (gen_sse2_pshufd_1 (tmp
, vec
,
16893 GEN_INT (elt
), GEN_INT (elt
),
16894 GEN_INT (elt
), GEN_INT (elt
)));
16898 tmp
= gen_reg_rtx (mode
);
16899 emit_insn (gen_vec_interleave_highv4si (tmp
, vec
, vec
));
16903 gcc_unreachable ();
16906 use_vec_extr
= true;
16911 /* For SSE1, we have to reuse the V4SF code. */
16912 ix86_expand_vector_extract (false, gen_lowpart (SFmode
, target
),
16913 gen_lowpart (V4SFmode
, vec
), elt
);
16922 use_vec_extr
= TARGET_SSE2
;
16925 use_vec_extr
= mmx_ok
&& (TARGET_SSE
|| TARGET_3DNOW_A
);
16929 use_vec_extr
= TARGET_SSE4_1
;
16933 && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC
))
16935 tmp
= gen_reg_rtx (SImode
);
16936 ix86_expand_vector_extract (false, tmp
, gen_lowpart (V4SImode
, vec
),
16938 emit_insn (gen_rtx_SET (target
, gen_lowpart (QImode
, tmp
)));
16943 use_vec_extr
= TARGET_SSE4_1
;
16949 tmp
= gen_reg_rtx (V4SFmode
);
16951 emit_insn (gen_vec_extract_lo_v8sf (tmp
, vec
));
16953 emit_insn (gen_vec_extract_hi_v8sf (tmp
, vec
));
16954 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
16962 tmp
= gen_reg_rtx (V2DFmode
);
16964 emit_insn (gen_vec_extract_lo_v4df (tmp
, vec
));
16966 emit_insn (gen_vec_extract_hi_v4df (tmp
, vec
));
16967 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
16975 tmp
= gen_reg_rtx (V16QImode
);
16977 emit_insn (gen_vec_extract_lo_v32qi (tmp
, vec
));
16979 emit_insn (gen_vec_extract_hi_v32qi (tmp
, vec
));
16980 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
16988 tmp
= gen_reg_rtx (V8HImode
);
16990 emit_insn (gen_vec_extract_lo_v16hi (tmp
, vec
));
16992 emit_insn (gen_vec_extract_hi_v16hi (tmp
, vec
));
16993 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17001 tmp
= gen_reg_rtx (V4SImode
);
17003 emit_insn (gen_vec_extract_lo_v8si (tmp
, vec
));
17005 emit_insn (gen_vec_extract_hi_v8si (tmp
, vec
));
17006 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17014 tmp
= gen_reg_rtx (V2DImode
);
17016 emit_insn (gen_vec_extract_lo_v4di (tmp
, vec
));
17018 emit_insn (gen_vec_extract_hi_v4di (tmp
, vec
));
17019 ix86_expand_vector_extract (false, target
, tmp
, elt
& 1);
17025 if (TARGET_AVX512BW
)
17027 tmp
= gen_reg_rtx (V16HImode
);
17029 emit_insn (gen_vec_extract_lo_v32hi (tmp
, vec
));
17031 emit_insn (gen_vec_extract_hi_v32hi (tmp
, vec
));
17032 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17038 if (TARGET_AVX512BW
)
17040 tmp
= gen_reg_rtx (V32QImode
);
17042 emit_insn (gen_vec_extract_lo_v64qi (tmp
, vec
));
17044 emit_insn (gen_vec_extract_hi_v64qi (tmp
, vec
));
17045 ix86_expand_vector_extract (false, target
, tmp
, elt
& 31);
17051 tmp
= gen_reg_rtx (V8SFmode
);
17053 emit_insn (gen_vec_extract_lo_v16sf (tmp
, vec
));
17055 emit_insn (gen_vec_extract_hi_v16sf (tmp
, vec
));
17056 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17060 tmp
= gen_reg_rtx (V4DFmode
);
17062 emit_insn (gen_vec_extract_lo_v8df (tmp
, vec
));
17064 emit_insn (gen_vec_extract_hi_v8df (tmp
, vec
));
17065 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17069 tmp
= gen_reg_rtx (V8SImode
);
17071 emit_insn (gen_vec_extract_lo_v16si (tmp
, vec
));
17073 emit_insn (gen_vec_extract_hi_v16si (tmp
, vec
));
17074 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17078 tmp
= gen_reg_rtx (V4DImode
);
17080 emit_insn (gen_vec_extract_lo_v8di (tmp
, vec
));
17082 emit_insn (gen_vec_extract_hi_v8di (tmp
, vec
));
17083 ix86_expand_vector_extract (false, target
, tmp
, elt
& 3);
17088 if (TARGET_AVX512BW
)
17090 tmp
= (mode
== E_V32HFmode
17091 ? gen_reg_rtx (V16HFmode
)
17092 : gen_reg_rtx (V16BFmode
));
17094 emit_insn (maybe_gen_vec_extract_lo (mode
, tmp
, vec
));
17096 emit_insn (maybe_gen_vec_extract_hi (mode
, tmp
, vec
));
17097 ix86_expand_vector_extract (false, target
, tmp
, elt
& 15);
17106 tmp
= (mode
== E_V16HFmode
17107 ? gen_reg_rtx (V8HFmode
)
17108 : gen_reg_rtx (V8BFmode
));
17110 emit_insn (maybe_gen_vec_extract_lo (mode
, tmp
, vec
));
17112 emit_insn (maybe_gen_vec_extract_hi (mode
, tmp
, vec
));
17113 ix86_expand_vector_extract (false, target
, tmp
, elt
& 7);
17119 use_vec_extr
= TARGET_MMX_WITH_SSE
&& TARGET_SSE4_1
;
17120 /* ??? Could extract the appropriate HImode element and shift. */
17129 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, GEN_INT (elt
)));
17130 tmp
= gen_rtx_VEC_SELECT (inner_mode
, vec
, tmp
);
17132 /* Let the rtl optimizers know about the zero extension performed. */
17133 if (inner_mode
== QImode
|| inner_mode
== HImode
)
17135 rtx reg
= gen_reg_rtx (SImode
);
17136 tmp
= gen_rtx_ZERO_EXTEND (SImode
, tmp
);
17137 emit_move_insn (reg
, tmp
);
17138 tmp
= gen_lowpart (inner_mode
, reg
);
17139 SUBREG_PROMOTED_VAR_P (tmp
) = 1;
17140 SUBREG_PROMOTED_SET (tmp
, 1);
17143 emit_move_insn (target
, tmp
);
17147 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
17149 emit_move_insn (mem
, vec
);
17151 tmp
= adjust_address (mem
, inner_mode
, elt
*GET_MODE_SIZE (inner_mode
));
17152 emit_move_insn (target
, tmp
);
17156 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
17157 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
17158 The upper bits of DEST are undefined, though they shouldn't cause
17159 exceptions (some bits from src or all zeros are ok). */
17162 emit_reduc_half (rtx dest
, rtx src
, int i
)
17165 switch (GET_MODE (src
))
17169 tem
= gen_sse_movhlps (dest
, src
, src
);
17171 tem
= gen_sse_shufps_v4sf (dest
, src
, src
, const1_rtx
, const1_rtx
,
17172 GEN_INT (1 + 4), GEN_INT (1 + 4));
17175 tem
= gen_vec_interleave_highv2df (dest
, src
, src
);
17178 d
= gen_reg_rtx (V1SImode
);
17179 tem
= gen_mmx_lshrv1si3 (d
, gen_lowpart (V1SImode
, src
),
17183 d
= gen_reg_rtx (V1DImode
);
17184 tem
= gen_mmx_lshrv1di3 (d
, gen_lowpart (V1DImode
, src
),
17192 d
= gen_reg_rtx (V1TImode
);
17193 tem
= gen_sse2_lshrv1ti3 (d
, gen_lowpart (V1TImode
, src
),
17198 tem
= gen_avx_vperm2f128v8sf3 (dest
, src
, src
, const1_rtx
);
17200 tem
= gen_avx_shufps256 (dest
, src
, src
,
17201 GEN_INT (i
== 128 ? 2 + (3 << 2) : 1));
17205 tem
= gen_avx_vperm2f128v4df3 (dest
, src
, src
, const1_rtx
);
17207 tem
= gen_avx_shufpd256 (dest
, src
, src
, const1_rtx
);
17216 if (GET_MODE (dest
) != V4DImode
)
17217 d
= gen_reg_rtx (V4DImode
);
17218 tem
= gen_avx2_permv2ti (d
, gen_lowpart (V4DImode
, src
),
17219 gen_lowpart (V4DImode
, src
),
17224 d
= gen_reg_rtx (V2TImode
);
17225 tem
= gen_avx2_lshrv2ti3 (d
, gen_lowpart (V2TImode
, src
),
17234 d
= gen_reg_rtx (V4TImode
);
17235 tem
= gen_avx512bw_lshrv4ti3 (d
, gen_lowpart (V4TImode
, src
),
17245 tem
= gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode
, dest
),
17246 gen_lowpart (V16SImode
, src
),
17247 gen_lowpart (V16SImode
, src
),
17248 GEN_INT (0x4 + (i
== 512 ? 4 : 0)),
17249 GEN_INT (0x5 + (i
== 512 ? 4 : 0)),
17250 GEN_INT (0x6 + (i
== 512 ? 4 : 0)),
17251 GEN_INT (0x7 + (i
== 512 ? 4 : 0)),
17252 GEN_INT (0xC), GEN_INT (0xD),
17253 GEN_INT (0xE), GEN_INT (0xF),
17254 GEN_INT (0x10), GEN_INT (0x11),
17255 GEN_INT (0x12), GEN_INT (0x13),
17256 GEN_INT (0x14), GEN_INT (0x15),
17257 GEN_INT (0x16), GEN_INT (0x17));
17259 tem
= gen_avx512f_pshufd_1 (gen_lowpart (V16SImode
, dest
),
17260 gen_lowpart (V16SImode
, src
),
17261 GEN_INT (i
== 128 ? 0x2 : 0x1),
17265 GEN_INT (i
== 128 ? 0x6 : 0x5),
17269 GEN_INT (i
== 128 ? 0xA : 0x9),
17273 GEN_INT (i
== 128 ? 0xE : 0xD),
17279 gcc_unreachable ();
17283 emit_move_insn (dest
, gen_lowpart (GET_MODE (dest
), d
));
17286 /* Expand a vector reduction. FN is the binary pattern to reduce;
17287 DEST is the destination; IN is the input vector. */
17290 ix86_expand_reduc (rtx (*fn
) (rtx
, rtx
, rtx
), rtx dest
, rtx in
)
17292 rtx half
, dst
, vec
= in
;
17293 machine_mode mode
= GET_MODE (in
);
17296 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17298 && mode
== V8HImode
17299 && fn
== gen_uminv8hi3
)
17301 emit_insn (gen_sse4_1_phminposuw (dest
, in
));
17305 for (i
= GET_MODE_BITSIZE (mode
);
17306 i
> GET_MODE_UNIT_BITSIZE (mode
);
17309 half
= gen_reg_rtx (mode
);
17310 emit_reduc_half (half
, vec
, i
);
17311 if (i
== GET_MODE_UNIT_BITSIZE (mode
) * 2)
17314 dst
= gen_reg_rtx (mode
);
17315 emit_insn (fn (dst
, half
, vec
));
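/* A sketch (not part of GCC) of the log2-step reduction driven by the loop
   above: each iteration folds the upper half of the remaining elements onto
   the lower half with the reduction operation, until a single result is left
   in element 0.  Addition over four floats is used purely as an example.  */
static inline float
reduce_add_v4sf (const float v[4])
{
  float a[4] = { v[0], v[1], v[2], v[3] };
  /* First halving: fold elements 2..3 onto 0..1.  */
  a[0] += a[2];
  a[1] += a[3];
  /* Second halving: fold element 1 onto 0.  */
  a[0] += a[1];
  return a[0];
}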
/* Output code to perform a conditional jump to LABEL, if the C2 flag in
   the FP status register is set.  */
17324 ix86_emit_fp_unordered_jump (rtx label
)
17326 rtx reg
= gen_reg_rtx (HImode
);
17330 emit_insn (gen_x86_fnstsw_1 (reg
));
17332 if (TARGET_SAHF
&& (TARGET_USE_SAHF
|| optimize_insn_for_size_p ()))
17334 emit_insn (gen_x86_sahf_1 (reg
));
17336 temp
= gen_rtx_REG (CCmode
, FLAGS_REG
);
17337 temp
= gen_rtx_UNORDERED (VOIDmode
, temp
, const0_rtx
);
17341 emit_insn (gen_testqi_ext_1_ccno (reg
, GEN_INT (0x04)));
17343 temp
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
17344 temp
= gen_rtx_NE (VOIDmode
, temp
, const0_rtx
);
17347 temp
= gen_rtx_IF_THEN_ELSE (VOIDmode
, temp
,
17348 gen_rtx_LABEL_REF (VOIDmode
, label
),
17350 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, temp
));
17351 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
17352 JUMP_LABEL (insn
) = label
;
/* Output code to perform a sinh XFmode calculation.  */
17358 ix86_emit_i387_sinh (rtx op0
, rtx op1
)
17360 rtx e1
= gen_reg_rtx (XFmode
);
17361 rtx e2
= gen_reg_rtx (XFmode
);
17362 rtx scratch
= gen_reg_rtx (HImode
);
17363 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
17364 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
17366 rtx_code_label
*jump_label
= gen_label_rtx ();
17369 /* scratch = fxam (op1) */
17370 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
17372 /* e1 = expm1 (|op1|) */
17373 emit_insn (gen_absxf2 (e2
, op1
));
17374 emit_insn (gen_expm1xf2 (e1
, e2
));
17376 /* e2 = e1 / (e1 + 1.0) + e1 */
17377 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
17378 emit_insn (gen_addxf3 (e2
, e1
, cst1
));
17379 emit_insn (gen_divxf3 (e2
, e1
, e2
));
17380 emit_insn (gen_addxf3 (e2
, e2
, e1
));
17382 /* flags = signbit (op1) */
17383 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
17385 /* if (flags) then e2 = -e2 */
17386 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
17387 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
17388 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
17390 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
17391 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
17392 JUMP_LABEL (insn
) = jump_label
;
17394 emit_insn (gen_negxf2 (e2
, e2
));
17396 emit_label (jump_label
);
17397 LABEL_NUSES (jump_label
) = 1;
17399 /* op0 = 0.5 * e2 */
17400 half
= force_reg (XFmode
, half
);
17401 emit_insn (gen_mulxf3 (op0
, e2
, half
));
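/* Illustrative C equivalent (not part of GCC) of the sequence above,
   assuming <math.h>: sinh is computed through expm1 so it stays accurate
   for small |x|.  */
#include <math.h>

static inline double
sinh_via_expm1 (double x)
{
  double e = expm1 (fabs (x));			/* e1 = expm1 (|x|) */
  double r = 0.5 * (e / (e + 1.0) + e);		/* 0.5 * (e1 / (e1 + 1) + e1) */
  return signbit (x) ? -r : r;			/* restore the sign of x */
}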
/* Output code to perform a cosh XFmode calculation.  */

void
ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
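/* Illustrative C equivalent (not part of GCC) of the expansion above,
   assuming <math.h>: cosh (x) = 0.5 * (exp (x) + 1 / exp (x)).  */
#include <math.h>

static inline double
cosh_via_exp (double x)
{
  double e = exp (x);
  return 0.5 * (e + 1.0 / e);
}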
/* Output code to perform a tanh XFmode calculation.  */
17430 ix86_emit_i387_tanh (rtx op0
, rtx op1
)
17432 rtx e1
= gen_reg_rtx (XFmode
);
17433 rtx e2
= gen_reg_rtx (XFmode
);
17434 rtx scratch
= gen_reg_rtx (HImode
);
17435 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
17437 rtx_code_label
*jump_label
= gen_label_rtx ();
17440 /* scratch = fxam (op1) */
17441 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
17443 /* e1 = expm1 (-|2 * op1|) */
17444 emit_insn (gen_addxf3 (e2
, op1
, op1
));
17445 emit_insn (gen_absxf2 (e2
, e2
));
17446 emit_insn (gen_negxf2 (e2
, e2
));
17447 emit_insn (gen_expm1xf2 (e1
, e2
));
17449 /* e2 = e1 / (e1 + 2.0) */
17450 cst2
= force_reg (XFmode
, CONST2_RTX (XFmode
));
17451 emit_insn (gen_addxf3 (e2
, e1
, cst2
));
17452 emit_insn (gen_divxf3 (e2
, e1
, e2
));
17454 /* flags = signbit (op1) */
17455 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
17457 /* if (!flags) then e2 = -e2 */
17458 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
17459 gen_rtx_NE (VOIDmode
, flags
, const0_rtx
),
17460 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
17462 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
17463 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
17464 JUMP_LABEL (insn
) = jump_label
;
17466 emit_insn (gen_negxf2 (e2
, e2
));
17468 emit_label (jump_label
);
17469 LABEL_NUSES (jump_label
) = 1;
17471 emit_move_insn (op0
, e2
);
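/* Illustrative C equivalent (not part of GCC) of the expansion above,
   assuming <math.h>: with e = expm1 (-|2x|), tanh (x) = -sign (x) * e / (e + 2).  */
#include <math.h>

static inline double
tanh_via_expm1 (double x)
{
  double e = expm1 (-fabs (2.0 * x));		/* e1 = expm1 (-|2 * x|) */
  double r = e / (e + 2.0);			/* e2 = e1 / (e1 + 2.0) */
  return signbit (x) ? r : -r;			/* negate unless x is negative */
}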
17474 /* Output code to perform an asinh XFmode calculation. */
17477 ix86_emit_i387_asinh (rtx op0
, rtx op1
)
17479 rtx e1
= gen_reg_rtx (XFmode
);
17480 rtx e2
= gen_reg_rtx (XFmode
);
17481 rtx scratch
= gen_reg_rtx (HImode
);
17482 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
17484 rtx_code_label
*jump_label
= gen_label_rtx ();
17487 /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
17488 emit_insn (gen_mulxf3 (e1
, op1
, op1
));
17489 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
17490 emit_insn (gen_addxf3 (e2
, e1
, cst1
));
17491 emit_insn (gen_sqrtxf2 (e2
, e2
));
17492 emit_insn (gen_addxf3 (e2
, e2
, cst1
));
17495 emit_insn (gen_divxf3 (e1
, e1
, e2
));
17497 /* scratch = fxam (op1) */
17498 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
17500 /* e1 = e1 + |op1| */
17501 emit_insn (gen_absxf2 (e2
, op1
));
17502 emit_insn (gen_addxf3 (e1
, e1
, e2
));
17504 /* e2 = log1p (e1) */
17505 ix86_emit_i387_log1p (e2
, e1
);
17507 /* flags = signbit (op1) */
17508 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
17510 /* if (flags) then e2 = -e2 */
17511 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
17512 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
17513 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
17515 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
17516 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
17517 JUMP_LABEL (insn
) = jump_label
;
17519 emit_insn (gen_negxf2 (e2
, e2
));
17521 emit_label (jump_label
);
17522 LABEL_NUSES (jump_label
) = 1;
17524 emit_move_insn (op0
, e2
);
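/* Illustrative C equivalent (not part of GCC) of the expansion above,
   assuming <math.h>: asinh (|x|) = log1p (|x| + x*x / (sqrt (x*x + 1) + 1)),
   with the sign of x restored at the end.  */
#include <math.h>

static inline double
asinh_via_log1p (double x)
{
  double x2 = x * x;
  double t = x2 / (sqrt (x2 + 1.0) + 1.0) + fabs (x);
  double r = log1p (t);
  return signbit (x) ? -r : r;
}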
/* Output code to perform an acosh XFmode calculation.  */

void
ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
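/* Illustrative C equivalent (not part of GCC) of the expansion above,
   assuming <math.h>: acosh (x) = log (x + sqrt (x - 1) * sqrt (x + 1))
   for x >= 1.  */
#include <math.h>

static inline double
acosh_via_log (double x)
{
  return log (x + sqrt (x - 1.0) * sqrt (x + 1.0));
}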
17554 /* Output code to perform an atanh XFmode calculation. */
17557 ix86_emit_i387_atanh (rtx op0
, rtx op1
)
17559 rtx e1
= gen_reg_rtx (XFmode
);
17560 rtx e2
= gen_reg_rtx (XFmode
);
17561 rtx scratch
= gen_reg_rtx (HImode
);
17562 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
17563 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
17565 rtx_code_label
*jump_label
= gen_label_rtx ();
17568 /* scratch = fxam (op1) */
17569 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
17572 emit_insn (gen_absxf2 (e2
, op1
));
17574 /* e1 = -(e2 + e2) / (e2 + 1.0) */
17575 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
17576 emit_insn (gen_addxf3 (e1
, e2
, cst1
));
17577 emit_insn (gen_addxf3 (e2
, e2
, e2
));
17578 emit_insn (gen_negxf2 (e2
, e2
));
17579 emit_insn (gen_divxf3 (e1
, e2
, e1
));
17581 /* e2 = log1p (e1) */
17582 ix86_emit_i387_log1p (e2
, e1
);
17584 /* flags = signbit (op1) */
17585 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
17587 /* if (!flags) then e2 = -e2 */
17588 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
17589 gen_rtx_NE (VOIDmode
, flags
, const0_rtx
),
17590 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
17592 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
17593 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
17594 JUMP_LABEL (insn
) = jump_label
;
17596 emit_insn (gen_negxf2 (e2
, e2
));
17598 emit_label (jump_label
);
17599 LABEL_NUSES (jump_label
) = 1;
17601 /* op0 = 0.5 * e2 */
17602 half
= force_reg (XFmode
, half
);
17603 emit_insn (gen_mulxf3 (op0
, e2
, half
));
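/* Illustrative C equivalent (not part of GCC) of the expansion above,
   assuming <math.h>: with t = -2|x| / (|x| + 1),
   atanh (x) = -sign (x) * 0.5 * log1p (t).  */
#include <math.h>

static inline double
atanh_via_log1p (double x)
{
  double ax = fabs (x);
  double t = -(ax + ax) / (ax + 1.0);		/* e1 = -(e2 + e2) / (e2 + 1.0) */
  double r = 0.5 * log1p (t);
  return signbit (x) ? r : -r;			/* negate unless x is negative */
}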
17606 /* Output code to perform a log1p XFmode calculation. */
17609 ix86_emit_i387_log1p (rtx op0
, rtx op1
)
17611 rtx_code_label
*label1
= gen_label_rtx ();
17612 rtx_code_label
*label2
= gen_label_rtx ();
17614 rtx tmp
= gen_reg_rtx (XFmode
);
17615 rtx res
= gen_reg_rtx (XFmode
);
17616 rtx cst
, cstln2
, cst1
;
/* The emit_jump call emits the pending stack adjustment; make sure it is
   emitted before the conditional jump, otherwise the stack adjustment
   would only happen conditionally.  */
17622 do_pending_stack_adjust ();
17624 cst
= const_double_from_real_value
17625 (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode
), XFmode
);
17626 cstln2
= force_reg (XFmode
, standard_80387_constant_rtx (4)); /* fldln2 */
17628 emit_insn (gen_absxf2 (tmp
, op1
));
17630 cst
= force_reg (XFmode
, cst
);
17631 ix86_expand_branch (GE
, tmp
, cst
, label1
);
17632 predict_jump (REG_BR_PROB_BASE
* 10 / 100);
17633 insn
= get_last_insn ();
17634 JUMP_LABEL (insn
) = label1
;
17636 emit_insn (gen_fyl2xp1xf3_i387 (res
, op1
, cstln2
));
17637 emit_jump (label2
);
17639 emit_label (label1
);
17640 LABEL_NUSES (label1
) = 1;
17642 cst1
= force_reg (XFmode
, CONST1_RTX (XFmode
));
17643 emit_insn (gen_rtx_SET (tmp
, gen_rtx_PLUS (XFmode
, op1
, cst1
)));
17644 emit_insn (gen_fyl2xxf3_i387 (res
, tmp
, cstln2
));
17646 emit_label (label2
);
17647 LABEL_NUSES (label2
) = 1;
17649 emit_move_insn (op0
, res
);
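/* Illustrative C equivalent (not part of GCC) of the branch above: the x87
   fyl2xp1 instruction is only accurate for small arguments, so for
   |x| >= 1 - sqrt(2)/2 the code falls back to computing log (1 + x)
   directly.  Assuming <math.h>, with log2/log1p standing in for fyl2x and
   fyl2xp1:  */
#include <math.h>

static inline double
log1p_like_i387 (double x)
{
  const double ln2 = 0.69314718055994530942;	/* fldln2 */
  const double cutoff = 0.29289321881345247561810596348408353;
  if (fabs (x) >= cutoff)
    return log2 (x + 1.0) * ln2;		/* fyl2x (x + 1, ln2) path */
  else
    return log1p (x);				/* stands in for the fyl2xp1 path,
						   which is accurate for small |x| */
}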
17652 /* Emit code for round calculation. */
17654 ix86_emit_i387_round (rtx op0
, rtx op1
)
17656 machine_mode inmode
= GET_MODE (op1
);
17657 machine_mode outmode
= GET_MODE (op0
);
17658 rtx e1
= gen_reg_rtx (XFmode
);
17659 rtx e2
= gen_reg_rtx (XFmode
);
17660 rtx scratch
= gen_reg_rtx (HImode
);
17661 rtx flags
= gen_rtx_REG (CCNOmode
, FLAGS_REG
);
17662 rtx half
= const_double_from_real_value (dconsthalf
, XFmode
);
17663 rtx res
= gen_reg_rtx (outmode
);
17664 rtx_code_label
*jump_label
= gen_label_rtx ();
17665 rtx (*floor_insn
) (rtx
, rtx
);
17666 rtx (*neg_insn
) (rtx
, rtx
);
17674 tmp
= gen_reg_rtx (XFmode
);
17676 emit_insn (gen_rtx_SET (tmp
, gen_rtx_FLOAT_EXTEND (XFmode
, op1
)));
17682 gcc_unreachable ();
17688 floor_insn
= gen_frndintxf2_floor
;
17689 neg_insn
= gen_negsf2
;
17692 floor_insn
= gen_frndintxf2_floor
;
17693 neg_insn
= gen_negdf2
;
17696 floor_insn
= gen_frndintxf2_floor
;
17697 neg_insn
= gen_negxf2
;
17700 floor_insn
= gen_lfloorxfhi2
;
17701 neg_insn
= gen_neghi2
;
17704 floor_insn
= gen_lfloorxfsi2
;
17705 neg_insn
= gen_negsi2
;
17708 floor_insn
= gen_lfloorxfdi2
;
17709 neg_insn
= gen_negdi2
;
17712 gcc_unreachable ();
17715 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
17717 /* scratch = fxam(op1) */
17718 emit_insn (gen_fxamxf2_i387 (scratch
, op1
));
17720 /* e1 = fabs(op1) */
17721 emit_insn (gen_absxf2 (e1
, op1
));
17723 /* e2 = e1 + 0.5 */
17724 half
= force_reg (XFmode
, half
);
17725 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (XFmode
, e1
, half
)));
17727 /* res = floor(e2) */
17733 tmp
= gen_reg_rtx (XFmode
);
17735 emit_insn (floor_insn (tmp
, e2
));
17736 emit_insn (gen_rtx_SET (res
,
17737 gen_rtx_UNSPEC (outmode
, gen_rtvec (1, tmp
),
17738 UNSPEC_TRUNC_NOOP
)));
17742 emit_insn (floor_insn (res
, e2
));
17745 /* flags = signbit(a) */
17746 emit_insn (gen_testqi_ext_1_ccno (scratch
, GEN_INT (0x02)));
17748 /* if (flags) then res = -res */
17749 tmp
= gen_rtx_IF_THEN_ELSE (VOIDmode
,
17750 gen_rtx_EQ (VOIDmode
, flags
, const0_rtx
),
17751 gen_rtx_LABEL_REF (VOIDmode
, jump_label
),
17753 insn
= emit_jump_insn (gen_rtx_SET (pc_rtx
, tmp
));
17754 predict_jump (REG_BR_PROB_BASE
* 50 / 100);
17755 JUMP_LABEL (insn
) = jump_label
;
17757 emit_insn (neg_insn (res
, res
));
17759 emit_label (jump_label
);
17760 LABEL_NUSES (jump_label
) = 1;
17762 emit_move_insn (op0
, res
);
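/* Illustrative C equivalent (not part of GCC) of the expansion above,
   assuming <math.h>: round (a) = sgn (a) * floor (|a| + 0.5).  */
#include <math.h>

static inline double
round_via_floor (double a)
{
  double r = floor (fabs (a) + 0.5);
  return signbit (a) ? -r : r;
}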
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
17769 ix86_emit_swdivsf (rtx res
, rtx a
, rtx b
, machine_mode mode
)
17771 rtx x0
, x1
, e0
, e1
;
17773 x0
= gen_reg_rtx (mode
);
17774 e0
= gen_reg_rtx (mode
);
17775 e1
= gen_reg_rtx (mode
);
17776 x1
= gen_reg_rtx (mode
);
17778 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
17780 b
= force_reg (mode
, b
);
17782 /* x0 = rcp(b) estimate */
17783 if (mode
== V16SFmode
|| mode
== V8DFmode
)
17785 if (TARGET_AVX512ER
)
17787 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
17790 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, a
, x0
)));
17794 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
17798 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, b
),
17802 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, b
)));
17805 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, e0
)));
17808 emit_insn (gen_rtx_SET (e1
, gen_rtx_PLUS (mode
, x0
, x0
)));
17811 emit_insn (gen_rtx_SET (x1
, gen_rtx_MINUS (mode
, e1
, e0
)));
17814 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, a
, x1
)));
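/* Illustrative sketch (not part of GCC) of the Newton-Raphson refinement
   emitted above: starting from a hardware reciprocal estimate x0 ~ 1/b,
   one iteration computes x1 = 2*x0 - b*x0*x0 and the quotient a * x1.
   The estimate argument is an assumption standing in for rcpps.  */
static inline float
nr_divide (float a, float b, float rcp_estimate /* ~ 1.0f / b */)
{
  float x0 = rcp_estimate;
  float e0 = x0 * b;			/* e0 = x0 * b */
  e0 = x0 * e0;				/* e0 = b * x0 * x0 */
  float e1 = x0 + x0;			/* e1 = 2 * x0 */
  float x1 = e1 - e0;			/* x1 = 2*x0 - b*x0*x0 */
  return a * x1;			/* a / b ~ a * x1 */
}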
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
17821 ix86_emit_swsqrtsf (rtx res
, rtx a
, machine_mode mode
, bool recip
)
17823 rtx x0
, e0
, e1
, e2
, e3
, mthree
, mhalf
;
17827 x0
= gen_reg_rtx (mode
);
17828 e0
= gen_reg_rtx (mode
);
17829 e1
= gen_reg_rtx (mode
);
17830 e2
= gen_reg_rtx (mode
);
17831 e3
= gen_reg_rtx (mode
);
17833 if (TARGET_AVX512ER
&& mode
== V16SFmode
)
17836 /* res = rsqrt28(a) estimate */
17837 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
17841 /* x0 = rsqrt28(a) estimate */
17842 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
17844 /* res = rcp28(x0) estimate */
17845 emit_insn (gen_rtx_SET (res
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, x0
),
17851 real_from_integer (&r
, VOIDmode
, -3, SIGNED
);
17852 mthree
= const_double_from_real_value (r
, SFmode
);
17854 real_arithmetic (&r
, NEGATE_EXPR
, &dconsthalf
, NULL
);
17855 mhalf
= const_double_from_real_value (r
, SFmode
);
17856 unspec
= UNSPEC_RSQRT
;
17858 if (VECTOR_MODE_P (mode
))
17860 mthree
= ix86_build_const_vector (mode
, true, mthree
);
17861 mhalf
= ix86_build_const_vector (mode
, true, mhalf
);
17862 /* There is no 512-bit rsqrt. There is however rsqrt14. */
17863 if (GET_MODE_SIZE (mode
) == 64)
17864 unspec
= UNSPEC_RSQRT14
;
17867 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
17868 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
17870 a
= force_reg (mode
, a
);
17872 /* x0 = rsqrt(a) estimate */
17873 emit_insn (gen_rtx_SET (x0
, gen_rtx_UNSPEC (mode
, gen_rtvec (1, a
),
/* If a == 0.0, filter out the infinite rsqrt estimate to prevent a NaN
   result for sqrt (0.0).  */
17879 rtx zero
= force_reg (mode
, CONST0_RTX(mode
));
17882 /* Handle masked compare. */
17883 if (VECTOR_MODE_P (mode
) && GET_MODE_SIZE (mode
) == 64)
17885 mask
= gen_reg_rtx (HImode
);
17886 /* Imm value 0x4 corresponds to not-equal comparison. */
17887 emit_insn (gen_avx512f_cmpv16sf3 (mask
, zero
, a
, GEN_INT (0x4)));
17888 emit_insn (gen_avx512f_blendmv16sf (x0
, zero
, x0
, mask
));
17892 mask
= gen_reg_rtx (mode
);
17893 emit_insn (gen_rtx_SET (mask
, gen_rtx_NE (mode
, zero
, a
)));
17894 emit_insn (gen_rtx_SET (x0
, gen_rtx_AND (mode
, x0
, mask
)));
17898 mthree
= force_reg (mode
, mthree
);
17901 emit_insn (gen_rtx_SET (e0
, gen_rtx_MULT (mode
, x0
, a
)));
17903 unsigned vector_size
= GET_MODE_SIZE (mode
);
17905 || (TARGET_AVX512F
&& vector_size
== 64)
17906 || (TARGET_AVX512VL
&& (vector_size
== 32 || vector_size
== 16)))
17907 emit_insn (gen_rtx_SET (e2
,
17908 gen_rtx_FMA (mode
, e0
, x0
, mthree
)));
17912 emit_insn (gen_rtx_SET (e1
, gen_rtx_MULT (mode
, e0
, x0
)));
17915 emit_insn (gen_rtx_SET (e2
, gen_rtx_PLUS (mode
, e1
, mthree
)));
17918 mhalf
= force_reg (mode
, mhalf
);
17920 /* e3 = -.5 * x0 */
17921 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, x0
, mhalf
)));
17923 /* e3 = -.5 * e0 */
17924 emit_insn (gen_rtx_SET (e3
, gen_rtx_MULT (mode
, e0
, mhalf
)));
17925 /* ret = e2 * e3 */
17926 emit_insn (gen_rtx_SET (res
, gen_rtx_MULT (mode
, e2
, e3
)));
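
/* A rough scalar model of the refinement above, for illustration only
   (approx_rsqrt stands for the rsqrtss/rsqrt14 hardware estimate; this
   helper is not part of the compiler):

     float swsqrt_sketch (float a, bool recip)
     {
       float x0 = approx_rsqrt (a);
       float e0 = x0 * a;
       float e2 = e0 * x0 - 3.0f;              /* a*x0*x0 - 3 */
       float e3 = (recip ? x0 : e0) * -0.5f;   /* -0.5*x0 or -0.5*a*x0 */
       return e2 * e3;   /* rsqrt(a) or sqrt(a) after one NR step */
     }

   The zero filtering emitted above matters only for the sqrt case, so
   that the infinite rsqrt estimate of 0.0 does not produce 0 * inf = NaN
   and sqrt(0.0) comes out as 0.0.  */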
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;

  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }

  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
                                  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                              gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   using comparison code CODE.  Operands are swapped for the comparison if
   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */

static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
                              bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
                   gen_rtx_fmt_ee (code, mode, op0, op1)));
  return mask;
}
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign-bit.  */

static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
        vmode = V4SFmode;
      else if (mode == DFmode)
        vmode = V2DFmode;
      else
        vmode = mode;

      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
        {
          /* We need to generate a scalar mode mask in this case.  */
          rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
          tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
          mask = gen_reg_rtx (mode);
          emit_insn (gen_rtx_SET (mask, tmp));
        }
    }
  else
    mask = gen_rtx_NOT (mode, mask);

  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
       tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
       op0 = (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0) */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
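
/* Informal note on the constant used above (an explanation, not taken
   from the original sources): adding an exact 0.5 before truncating
   would also push values just below a halfway point over it once the
   addition is rounded.  Using nextafter (0.5, 0.0), i.e. 0.5 - 2**(-p-1),
   avoids that.  For example, with x = 0.49999997f (the float just below
   0.5f), x + 0.5f already rounds up to 1.0f and would truncate to 1,
   while x + nextafter (0.5f, 0.0f) stays below 1.0f and truncates to 0,
   which is the correct lround result.  True halfway cases still end up
   at or above the next integer and so still round away from zero.  */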
/* Expand SSE2 sequence for computing lfloor or lceil
   from OPERAND1 storing into OPERAND0.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
        xi = (long)op1;
        xi -= (double)xi > op1 ? 1 : 0;
        return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg */
  label = ix86_expand_sse_compare_and_jump (UNLE,
                                            freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
                             ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
/* Generate and return a rtx of mode MODE for 2**n where n is the number
   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */

static rtx
ix86_gen_TWO52 (machine_mode mode)
{
  const struct real_format *fmt;
  REAL_VALUE_TYPE TWO52r;
  rtx TWO52;

  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&TWO52r, fmt->p - 1, mode);
  TWO52 = const_double_from_real_value (TWO52r, mode);
  TWO52 = force_reg (mode, TWO52);

  return TWO52;
}
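
/* Informal sketch of why 2**52 works here (illustration only, not
   compiler code): for a nonnegative double x < 2**52, the sum x + 2**52
   has no bits below the units place, so the rounding performed by the
   addition is exactly a round-to-nearest-integer of x, and subtracting
   2**52 again recovers that rounded value.

     double rint_pos_sketch (double x)   // assumes 0 <= x < 2**52
     {
       const double two52 = 4503599627370496.0;   // 2**52
       return (x + two52) - two52;   // rounded in the current FP mode
     }

   The expanders below branch around this trick when |x| >= 2**52, since
   such values are already integers.  */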
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
        xa = fabs (operand1);
        if (!isless (xa, 2**52))
          return operand1;
        two52 = 2**52;
        if (flag_rounding_math)
          {
            two52 = copysign (two52, operand1);
            xa = operand1;
          }
        xa = xa + two52 - two52;
        return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
18191 /* Expand SSE2 sequence for computing floor or ceil
18192 from OPERAND1 storing into OPERAND0. */
18194 ix86_expand_floorceil (rtx operand0
, rtx operand1
, bool do_floor
)
18196 /* C code for the stuff we expand below.
18197 double xa = fabs (x), x2;
18198 if (!isless (xa, TWO52))
18200 x2 = (double)(long)x;
18209 if (HONOR_SIGNED_ZEROS (mode))
18210 return copysign (x2, x);
18213 machine_mode mode
= GET_MODE (operand0
);
18214 rtx xa
, xi
, TWO52
, tmp
, one
, res
, mask
;
18215 rtx_code_label
*label
;
18217 TWO52
= ix86_gen_TWO52 (mode
);
18219 /* Temporary for holding the result, initialized to the input
18220 operand to ease control flow. */
18221 res
= copy_to_reg (operand1
);
18223 /* xa = abs (operand1) */
18224 xa
= ix86_expand_sse_fabs (res
, &mask
);
18226 /* if (!isless (xa, TWO52)) goto label; */
18227 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18229 /* xa = (double)(long)x */
18230 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
18231 expand_fix (xi
, res
, 0);
18232 expand_float (xa
, xi
, 0);
18235 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
18237 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18238 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
18239 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
18240 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
18241 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
18242 if (HONOR_SIGNED_ZEROS (mode
))
18244 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18245 if (do_floor
&& flag_rounding_math
)
18246 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
18248 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
18250 emit_move_insn (res
, tmp
);
18252 emit_label (label
);
18253 LABEL_NUSES (label
) = 1;
18255 emit_move_insn (operand0
, res
);
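
/* Note on the compensation used above (an explanatory sketch, not part
   of the original sources): the SSE compare produces an all-ones bit
   pattern where the condition holds, so ANDing that mask with the
   constant 1.0 yields exactly 1.0 or +0.0, which is then subtracted
   (floor) or added (ceil) without a branch.  In scalar terms:

     double floor_sketch (double x)   // assumes |x| < 2**52
     {
       double x2  = (double) (long long) x;   // truncate toward zero
       double adj = (x2 > x) ? 1.0 : 0.0;     // mask & 1.0 in the SSE code
       return x2 - adj;
     }
*/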
18258 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
18259 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18260 that is only available on 64bit targets. */
18262 ix86_expand_floorceildf_32 (rtx operand0
, rtx operand1
, bool do_floor
)
18264 /* C code for the stuff we expand below.
18265 double xa = fabs (x), x2;
18266 if (!isless (xa, TWO52))
18268 xa = xa + TWO52 - TWO52;
18269 x2 = copysign (xa, x);
18278 if (HONOR_SIGNED_ZEROS (mode))
18279 x2 = copysign (x2, x);
18282 machine_mode mode
= GET_MODE (operand0
);
18283 rtx xa
, TWO52
, tmp
, one
, res
, mask
;
18284 rtx_code_label
*label
;
18286 TWO52
= ix86_gen_TWO52 (mode
);
18288 /* Temporary for holding the result, initialized to the input
18289 operand to ease control flow. */
18290 res
= copy_to_reg (operand1
);
18292 /* xa = abs (operand1) */
18293 xa
= ix86_expand_sse_fabs (res
, &mask
);
18295 /* if (!isless (xa, TWO52)) goto label; */
18296 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18298 /* xa = xa + TWO52 - TWO52; */
18299 xa
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
18300 xa
= expand_simple_binop (mode
, MINUS
, xa
, TWO52
, xa
, 0, OPTAB_DIRECT
);
18302 /* xa = copysign (xa, operand1) */
18303 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
18306 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
18308 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
18309 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa
, res
, !do_floor
);
18310 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
18311 tmp
= expand_simple_binop (mode
, do_floor
? MINUS
: PLUS
,
18312 xa
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
18313 if (HONOR_SIGNED_ZEROS (mode
))
18315 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18316 if (do_floor
&& flag_rounding_math
)
18317 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
18319 ix86_sse_copysign_to_positive (tmp
, tmp
, res
, mask
);
18321 emit_move_insn (res
, tmp
);
18323 emit_label (label
);
18324 LABEL_NUSES (label
) = 1;
18326 emit_move_insn (operand0
, res
);
18329 /* Expand SSE sequence for computing trunc
18330 from OPERAND1 storing into OPERAND0. */
18332 ix86_expand_trunc (rtx operand0
, rtx operand1
)
18334 /* C code for SSE variant we expand below.
18335 double xa = fabs (x), x2;
18336 if (!isless (xa, TWO52))
18338 x2 = (double)(long)x;
18339 if (HONOR_SIGNED_ZEROS (mode))
18340 return copysign (x2, x);
18343 machine_mode mode
= GET_MODE (operand0
);
18344 rtx xa
, xi
, TWO52
, res
, mask
;
18345 rtx_code_label
*label
;
18347 TWO52
= ix86_gen_TWO52 (mode
);
18349 /* Temporary for holding the result, initialized to the input
18350 operand to ease control flow. */
18351 res
= copy_to_reg (operand1
);
18353 /* xa = abs (operand1) */
18354 xa
= ix86_expand_sse_fabs (res
, &mask
);
18356 /* if (!isless (xa, TWO52)) goto label; */
18357 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18359 /* xa = (double)(long)x */
18360 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
18361 expand_fix (xi
, res
, 0);
18362 expand_float (xa
, xi
, 0);
18364 if (HONOR_SIGNED_ZEROS (mode
))
18365 ix86_sse_copysign_to_positive (xa
, xa
, res
, mask
);
18367 emit_move_insn (res
, xa
);
18369 emit_label (label
);
18370 LABEL_NUSES (label
) = 1;
18372 emit_move_insn (operand0
, res
);
18375 /* Expand SSE sequence for computing trunc from OPERAND1 storing
18376 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18377 that is only available on 64bit targets. */
18379 ix86_expand_truncdf_32 (rtx operand0
, rtx operand1
)
18381 machine_mode mode
= GET_MODE (operand0
);
18382 rtx xa
, xa2
, TWO52
, tmp
, one
, res
, mask
;
18383 rtx_code_label
*label
;
18385 /* C code for SSE variant we expand below.
18386 double xa = fabs (x), x2;
18387 if (!isless (xa, TWO52))
18389 xa2 = xa + TWO52 - TWO52;
18393 x2 = copysign (xa2, x);
18397 TWO52
= ix86_gen_TWO52 (mode
);
18399 /* Temporary for holding the result, initialized to the input
18400 operand to ease control flow. */
  res = copy_to_reg (operand1);
18403 /* xa = abs (operand1) */
18404 xa
= ix86_expand_sse_fabs (res
, &mask
);
18406 /* if (!isless (xa, TWO52)) goto label; */
18407 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18409 /* xa2 = xa + TWO52 - TWO52; */
18410 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
18411 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
18414 one
= force_reg (mode
, const_double_from_real_value (dconst1
, mode
));
18416 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18417 tmp
= ix86_expand_sse_compare_mask (UNGT
, xa2
, xa
, false);
18418 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, one
, tmp
)));
18419 tmp
= expand_simple_binop (mode
, MINUS
,
18420 xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
18421 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18422 if (HONOR_SIGNED_ZEROS (mode
) && flag_rounding_math
)
18423 tmp
= ix86_expand_sse_fabs (tmp
, NULL
);
18425 /* res = copysign (xa2, operand1) */
18426 ix86_sse_copysign_to_positive (res
, tmp
, res
, mask
);
18428 emit_label (label
);
18429 LABEL_NUSES (label
) = 1;
18431 emit_move_insn (operand0
, res
);
18434 /* Expand SSE sequence for computing round
18435 from OPERAND1 storing into OPERAND0. */
18437 ix86_expand_round (rtx operand0
, rtx operand1
)
18439 /* C code for the stuff we're doing below:
18440 double xa = fabs (x);
18441 if (!isless (xa, TWO52))
18443 xa = (double)(long)(xa + nextafter (0.5, 0.0));
18444 return copysign (xa, x);
18446 machine_mode mode
= GET_MODE (operand0
);
18447 rtx res
, TWO52
, xa
, xi
, half
, mask
;
18448 rtx_code_label
*label
;
18449 const struct real_format
*fmt
;
18450 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
18452 /* Temporary for holding the result, initialized to the input
18453 operand to ease control flow. */
18454 res
= copy_to_reg (operand1
);
18456 TWO52
= ix86_gen_TWO52 (mode
);
18457 xa
= ix86_expand_sse_fabs (res
, &mask
);
18458 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18460 /* load nextafter (0.5, 0.0) */
18461 fmt
= REAL_MODE_FORMAT (mode
);
18462 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
18463 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
18465 /* xa = xa + 0.5 */
18466 half
= force_reg (mode
, const_double_from_real_value (pred_half
, mode
));
18467 xa
= expand_simple_binop (mode
, PLUS
, xa
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
18469 /* xa = (double)(int64_t)xa */
18470 xi
= gen_reg_rtx (int_mode_for_mode (mode
).require ());
18471 expand_fix (xi
, xa
, 0);
18472 expand_float (xa
, xi
, 0);
18474 /* res = copysign (xa, operand1) */
18475 ix86_sse_copysign_to_positive (res
, xa
, res
, mask
);
18477 emit_label (label
);
18478 LABEL_NUSES (label
) = 1;
18480 emit_move_insn (operand0
, res
);
18483 /* Expand SSE sequence for computing round from OPERAND1 storing
18484 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18485 that is only available on 64bit targets. */
18487 ix86_expand_rounddf_32 (rtx operand0
, rtx operand1
)
18489 /* C code for the stuff we expand below.
18490 double xa = fabs (x), xa2, x2;
18491 if (!isless (xa, TWO52))
18493 Using the absolute value and copying back sign makes
18494 -0.0 -> -0.0 correct.
18495 xa2 = xa + TWO52 - TWO52;
18500 else if (dxa > 0.5)
18502 x2 = copysign (xa2, x);
18505 machine_mode mode
= GET_MODE (operand0
);
18506 rtx xa
, xa2
, dxa
, TWO52
, tmp
, half
, mhalf
, one
, res
, mask
;
18507 rtx_code_label
*label
;
18509 TWO52
= ix86_gen_TWO52 (mode
);
18511 /* Temporary for holding the result, initialized to the input
18512 operand to ease control flow. */
18513 res
= copy_to_reg (operand1
);
18515 /* xa = abs (operand1) */
18516 xa
= ix86_expand_sse_fabs (res
, &mask
);
18518 /* if (!isless (xa, TWO52)) goto label; */
18519 label
= ix86_expand_sse_compare_and_jump (UNLE
, TWO52
, xa
, false);
18521 /* xa2 = xa + TWO52 - TWO52; */
18522 xa2
= expand_simple_binop (mode
, PLUS
, xa
, TWO52
, NULL_RTX
, 0, OPTAB_DIRECT
);
18523 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, TWO52
, xa2
, 0, OPTAB_DIRECT
);
18525 /* dxa = xa2 - xa; */
18526 dxa
= expand_simple_binop (mode
, MINUS
, xa2
, xa
, NULL_RTX
, 0, OPTAB_DIRECT
);
18528 /* generate 0.5, 1.0 and -0.5 */
18529 half
= force_reg (mode
, const_double_from_real_value (dconsthalf
, mode
));
18530 one
= expand_simple_binop (mode
, PLUS
, half
, half
, NULL_RTX
, 0, OPTAB_DIRECT
);
18531 mhalf
= expand_simple_binop (mode
, MINUS
, half
, one
, NULL_RTX
,
18535 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
18536 tmp
= ix86_expand_sse_compare_mask (UNGT
, dxa
, half
, false);
18537 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
18538 xa2
= expand_simple_binop (mode
, MINUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
18539 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
18540 tmp
= ix86_expand_sse_compare_mask (UNGE
, mhalf
, dxa
, false);
18541 emit_insn (gen_rtx_SET (tmp
, gen_rtx_AND (mode
, tmp
, one
)));
18542 xa2
= expand_simple_binop (mode
, PLUS
, xa2
, tmp
, NULL_RTX
, 0, OPTAB_DIRECT
);
18544 /* res = copysign (xa2, operand1) */
18545 ix86_sse_copysign_to_positive (res
, xa2
, res
, mask
);
18547 emit_label (label
);
18548 LABEL_NUSES (label
) = 1;
18550 emit_move_insn (operand0
, res
);
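
/* A scalar sketch of the compensation above, for illustration only
   (round must send halfway cases away from zero, and the result should
   not depend on the rounding mode the TWO52 addition happens to use):

     double round_sketch (double x)   // assumes |x| < 2**52
     {
       double xa  = fabs (x);
       double xa2 = (xa + 4503599627370496.0) - 4503599627370496.0;
       double dxa = xa2 - xa;
       if (dxa > 0.5)          // rounded up although fraction was < 0.5
         xa2 -= 1.0;
       if (dxa <= -0.5)        // rounded down although fraction was >= 0.5
         xa2 += 1.0;
       return copysign (xa2, x);   // fabs/copysign as in <math.h>
     }
*/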
18553 /* Expand SSE sequence for computing round
18554 from OP1 storing into OP0 using sse4 round insn. */
18556 ix86_expand_round_sse4 (rtx op0
, rtx op1
)
18558 machine_mode mode
= GET_MODE (op0
);
18559 rtx e1
, e2
, res
, half
;
18560 const struct real_format
*fmt
;
18561 REAL_VALUE_TYPE pred_half
, half_minus_pred_half
;
18562 rtx (*gen_copysign
) (rtx
, rtx
, rtx
);
18563 rtx (*gen_round
) (rtx
, rtx
, rtx
);
18568 gen_copysign
= gen_copysignsf3
;
18569 gen_round
= gen_sse4_1_roundsf2
;
18572 gen_copysign
= gen_copysigndf3
;
18573 gen_round
= gen_sse4_1_rounddf2
;
18576 gcc_unreachable ();
18579 /* round (a) = trunc (a + copysign (0.5, a)) */
18581 /* load nextafter (0.5, 0.0) */
18582 fmt
= REAL_MODE_FORMAT (mode
);
18583 real_2expN (&half_minus_pred_half
, -(fmt
->p
) - 1, mode
);
18584 real_arithmetic (&pred_half
, MINUS_EXPR
, &dconsthalf
, &half_minus_pred_half
);
18585 half
= const_double_from_real_value (pred_half
, mode
);
18587 /* e1 = copysign (0.5, op1) */
18588 e1
= gen_reg_rtx (mode
);
18589 emit_insn (gen_copysign (e1
, half
, op1
));
18591 /* e2 = op1 + e1 */
18592 e2
= expand_simple_binop (mode
, PLUS
, op1
, e1
, NULL_RTX
, 0, OPTAB_DIRECT
);
18594 /* res = trunc (e2) */
18595 res
= gen_reg_rtx (mode
);
18596 emit_insn (gen_round (res
, e2
, GEN_INT (ROUND_TRUNC
)));
18598 emit_move_insn (op0
, res
);
18601 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
18602 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
18603 insn every time. */
18605 static GTY(()) rtx_insn
*vselect_insn
;
18607 /* Initialize vselect_insn. */
18610 init_vselect_insn (void)
18615 x
= gen_rtx_PARALLEL (VOIDmode
, rtvec_alloc (MAX_VECT_LEN
));
18616 for (i
= 0; i
< MAX_VECT_LEN
; ++i
)
18617 XVECEXP (x
, 0, i
) = const0_rtx
;
18618 x
= gen_rtx_VEC_SELECT (V2DFmode
, gen_rtx_VEC_CONCAT (V4DFmode
, const0_rtx
,
18620 x
= gen_rtx_SET (const0_rtx
, x
);
18622 vselect_insn
= emit_insn (x
);
18626 /* Construct (set target (vec_select op0 (parallel perm))) and
18627 return true if that's a valid instruction in the active ISA. */
18630 expand_vselect (rtx target
, rtx op0
, const unsigned char *perm
,
18631 unsigned nelt
, bool testing_p
)
18634 rtx x
, save_vconcat
;
18637 if (vselect_insn
== NULL_RTX
)
18638 init_vselect_insn ();
18640 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 1);
18641 PUT_NUM_ELEM (XVEC (x
, 0), nelt
);
18642 for (i
= 0; i
< nelt
; ++i
)
18643 XVECEXP (x
, 0, i
) = GEN_INT (perm
[i
]);
18644 save_vconcat
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
18645 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = op0
;
18646 PUT_MODE (SET_SRC (PATTERN (vselect_insn
)), GET_MODE (target
));
18647 SET_DEST (PATTERN (vselect_insn
)) = target
;
18648 icode
= recog_memoized (vselect_insn
);
18650 if (icode
>= 0 && !testing_p
)
18651 emit_insn (copy_rtx (PATTERN (vselect_insn
)));
18653 SET_DEST (PATTERN (vselect_insn
)) = const0_rtx
;
18654 XEXP (SET_SRC (PATTERN (vselect_insn
)), 0) = save_vconcat
;
18655 INSN_CODE (vselect_insn
) = -1;
18660 /* Similar, but generate a vec_concat from op0 and op1 as well. */
18663 expand_vselect_vconcat (rtx target
, rtx op0
, rtx op1
,
18664 const unsigned char *perm
, unsigned nelt
,
18667 machine_mode v2mode
;
18671 if (vselect_insn
== NULL_RTX
)
18672 init_vselect_insn ();
18674 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0
)).exists (&v2mode
))
18676 x
= XEXP (SET_SRC (PATTERN (vselect_insn
)), 0);
18677 PUT_MODE (x
, v2mode
);
18680 ok
= expand_vselect (target
, x
, perm
, nelt
, testing_p
);
18681 XEXP (x
, 0) = const0_rtx
;
18682 XEXP (x
, 1) = const0_rtx
;
18686 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18687 using movss or movsd. */
18689 expand_vec_perm_movs (struct expand_vec_perm_d
*d
)
18691 machine_mode vmode
= d
->vmode
;
18692 unsigned i
, nelt
= d
->nelt
;
18695 if (d
->one_operand_p
)
18698 if (!(TARGET_SSE
&& vmode
== V4SFmode
)
18699 && !(TARGET_MMX_WITH_SSE
&& vmode
== V2SFmode
)
18700 && !(TARGET_SSE2
&& vmode
== V2DFmode
))
18703 /* Only the first element is changed. */
18704 if (d
->perm
[0] != nelt
&& d
->perm
[0] != 0)
18706 for (i
= 1; i
< nelt
; ++i
)
18707 if (d
->perm
[i
] != i
+ nelt
- d
->perm
[0])
18713 if (d
->perm
[0] == nelt
)
18714 x
= gen_rtx_VEC_MERGE (vmode
, d
->op1
, d
->op0
, GEN_INT (1));
18716 x
= gen_rtx_VEC_MERGE (vmode
, d
->op0
, d
->op1
, GEN_INT (1));
18718 emit_insn (gen_rtx_SET (d
->target
, x
));
18723 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18724 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
18727 expand_vec_perm_blend (struct expand_vec_perm_d
*d
)
18729 machine_mode mmode
, vmode
= d
->vmode
;
18730 unsigned i
, nelt
= d
->nelt
;
18731 unsigned HOST_WIDE_INT mask
;
18732 rtx target
, op0
, op1
, maskop
, x
;
18733 rtx rperm
[32], vperm
;
18735 if (d
->one_operand_p
)
18737 if (TARGET_AVX512F
&& GET_MODE_SIZE (vmode
) == 64
18738 && (TARGET_AVX512BW
18739 || GET_MODE_UNIT_SIZE (vmode
) >= 4))
18741 else if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
18743 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
18745 else if (TARGET_SSE4_1
&& (GET_MODE_SIZE (vmode
) == 16
18746 || GET_MODE_SIZE (vmode
) == 8
18747 || GET_MODE_SIZE (vmode
) == 4))
18752 /* This is a blend, not a permute. Elements must stay in their
18753 respective lanes. */
18754 for (i
= 0; i
< nelt
; ++i
)
18756 unsigned e
= d
->perm
[i
];
18757 if (!(e
== i
|| e
== i
+ nelt
))
18764 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
18765 decision should be extracted elsewhere, so that we only try that
18766 sequence once all budget==3 options have been tried. */
18767 target
= d
->target
;
18787 for (i
= 0; i
< nelt
; ++i
)
18788 mask
|= ((unsigned HOST_WIDE_INT
) (d
->perm
[i
] >= nelt
)) << i
;
18792 for (i
= 0; i
< 2; ++i
)
18793 mask
|= (d
->perm
[i
] >= 2 ? 15 : 0) << (i
* 4);
18798 for (i
= 0; i
< 2; ++i
)
18799 mask
|= (d
->perm
[i
] >= 2 ? 3 : 0) << (i
* 2);
18804 for (i
= 0; i
< 4; ++i
)
18805 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
18810 /* See if bytes move in pairs so we can use pblendw with
18811 an immediate argument, rather than pblendvb with a vector
18813 for (i
= 0; i
< 16; i
+= 2)
18814 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
18817 for (i
= 0; i
< nelt
; ++i
)
18818 rperm
[i
] = (d
->perm
[i
] < nelt
? const0_rtx
: constm1_rtx
);
18821 vperm
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
18822 vperm
= force_reg (vmode
, vperm
);
18824 if (GET_MODE_SIZE (vmode
) == 4)
18825 emit_insn (gen_mmx_pblendvb_v4qi (target
, op0
, op1
, vperm
));
18826 else if (GET_MODE_SIZE (vmode
) == 8)
18827 emit_insn (gen_mmx_pblendvb_v8qi (target
, op0
, op1
, vperm
));
18828 else if (GET_MODE_SIZE (vmode
) == 16)
18829 emit_insn (gen_sse4_1_pblendvb (target
, op0
, op1
, vperm
));
18831 emit_insn (gen_avx2_pblendvb (target
, op0
, op1
, vperm
));
18832 if (target
!= d
->target
)
18833 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
18837 for (i
= 0; i
< 8; ++i
)
18838 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
18843 target
= gen_reg_rtx (vmode
);
18844 op0
= gen_lowpart (vmode
, op0
);
18845 op1
= gen_lowpart (vmode
, op1
);
18849 for (i
= 0; i
< 8; i
+= 2)
18850 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
18853 for (i
= 0; i
< 4; ++i
)
18854 mask
|= (d
->perm
[i
* 2] >= 8) << i
;
18859 for (i
= 0; i
< 4; i
+= 2)
18860 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
18863 for (i
= 0; i
< 2; ++i
)
18864 mask
|= (d
->perm
[i
* 2] >= 4) << i
;
18869 /* See if bytes move in pairs. If not, vpblendvb must be used. */
18870 for (i
= 0; i
< 32; i
+= 2)
18871 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
18873 /* See if bytes move in quadruplets. If yes, vpblendd
18874 with immediate can be used. */
18875 for (i
= 0; i
< 32; i
+= 4)
18876 if (d
->perm
[i
] + 2 != d
->perm
[i
+ 2])
18880 /* See if bytes move the same in both lanes. If yes,
18881 vpblendw with immediate can be used. */
18882 for (i
= 0; i
< 16; i
+= 2)
18883 if (d
->perm
[i
] + 16 != d
->perm
[i
+ 16])
18886 /* Use vpblendw. */
18887 for (i
= 0; i
< 16; ++i
)
18888 mask
|= (d
->perm
[i
* 2] >= 32) << i
;
18893 /* Use vpblendd. */
18894 for (i
= 0; i
< 8; ++i
)
18895 mask
|= (d
->perm
[i
* 4] >= 32) << i
;
18900 /* See if words move in pairs. If yes, vpblendd can be used. */
18901 for (i
= 0; i
< 16; i
+= 2)
18902 if (d
->perm
[i
] + 1 != d
->perm
[i
+ 1])
18906 /* See if words move the same in both lanes. If not,
18907 vpblendvb must be used. */
18908 for (i
= 0; i
< 8; i
++)
18909 if (d
->perm
[i
] + 8 != d
->perm
[i
+ 8])
18911 /* Use vpblendvb. */
18912 for (i
= 0; i
< 32; ++i
)
18913 rperm
[i
] = (d
->perm
[i
/ 2] < 16 ? const0_rtx
: constm1_rtx
);
18917 target
= gen_reg_rtx (vmode
);
18918 op0
= gen_lowpart (vmode
, op0
);
18919 op1
= gen_lowpart (vmode
, op1
);
18920 goto finish_pblendvb
;
18923 /* Use vpblendw. */
18924 for (i
= 0; i
< 16; ++i
)
18925 mask
|= (d
->perm
[i
] >= 16) << i
;
18929 /* Use vpblendd. */
18930 for (i
= 0; i
< 8; ++i
)
18931 mask
|= (d
->perm
[i
* 2] >= 16) << i
;
18936 /* Use vpblendd. */
18937 for (i
= 0; i
< 4; ++i
)
18938 mask
|= (d
->perm
[i
] >= 4 ? 3 : 0) << (i
* 2);
18943 gcc_unreachable ();
18966 if (mmode
!= VOIDmode
)
18967 maskop
= force_reg (mmode
, gen_int_mode (mask
, mmode
));
18969 maskop
= GEN_INT (mask
);
18971 /* This matches five different patterns with the different modes. */
18972 x
= gen_rtx_VEC_MERGE (vmode
, op1
, op0
, maskop
);
18973 x
= gen_rtx_SET (target
, x
);
18975 if (target
!= d
->target
)
18976 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
18981 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18982 in terms of the variable form of vpermilps.
18984 Note that we will have already failed the immediate input vpermilps,
18985 which requires that the high and low part shuffle be identical; the
18986 variable form doesn't require that. */
18989 expand_vec_perm_vpermil (struct expand_vec_perm_d
*d
)
18991 rtx rperm
[8], vperm
;
18994 if (!TARGET_AVX
|| d
->vmode
!= V8SFmode
|| !d
->one_operand_p
)
18997 /* We can only permute within the 128-bit lane. */
18998 for (i
= 0; i
< 8; ++i
)
19000 unsigned e
= d
->perm
[i
];
19001 if (i
< 4 ? e
>= 4 : e
< 4)
19008 for (i
= 0; i
< 8; ++i
)
19010 unsigned e
= d
->perm
[i
];
19012 /* Within each 128-bit lane, the elements of op0 are numbered
19013 from 0 and the elements of op1 are numbered from 4. */
19019 rperm
[i
] = GEN_INT (e
);
19022 vperm
= gen_rtx_CONST_VECTOR (V8SImode
, gen_rtvec_v (8, rperm
));
19023 vperm
= force_reg (V8SImode
, vperm
);
19024 emit_insn (gen_avx_vpermilvarv8sf3 (d
->target
, d
->op0
, vperm
));
19029 /* For V*[QHS]Imode permutations, check if the same permutation
19030 can't be performed in a 2x, 4x or 8x wider inner mode. */
19033 canonicalize_vector_int_perm (const struct expand_vec_perm_d
*d
,
19034 struct expand_vec_perm_d
*nd
)
19037 machine_mode mode
= VOIDmode
;
19041 case E_V8QImode
: mode
= V4HImode
; break;
19042 case E_V16QImode
: mode
= V8HImode
; break;
19043 case E_V32QImode
: mode
= V16HImode
; break;
19044 case E_V64QImode
: mode
= V32HImode
; break;
19045 case E_V4HImode
: mode
= V2SImode
; break;
19046 case E_V8HImode
: mode
= V4SImode
; break;
19047 case E_V16HImode
: mode
= V8SImode
; break;
19048 case E_V32HImode
: mode
= V16SImode
; break;
19049 case E_V4SImode
: mode
= V2DImode
; break;
19050 case E_V8SImode
: mode
= V4DImode
; break;
19051 case E_V16SImode
: mode
= V8DImode
; break;
19052 default: return false;
19054 for (i
= 0; i
< d
->nelt
; i
+= 2)
19055 if ((d
->perm
[i
] & 1) || d
->perm
[i
+ 1] != d
->perm
[i
] + 1)
19058 nd
->nelt
= d
->nelt
/ 2;
19059 for (i
= 0; i
< nd
->nelt
; i
++)
19060 nd
->perm
[i
] = d
->perm
[2 * i
] / 2;
19061 if (GET_MODE_INNER (mode
) != DImode
)
19062 canonicalize_vector_int_perm (nd
, nd
);
19065 nd
->one_operand_p
= d
->one_operand_p
;
19066 nd
->testing_p
= d
->testing_p
;
19067 if (d
->op0
== d
->op1
)
19068 nd
->op0
= nd
->op1
= gen_lowpart (nd
->vmode
, d
->op0
);
19071 nd
->op0
= gen_lowpart (nd
->vmode
, d
->op0
);
19072 nd
->op1
= gen_lowpart (nd
->vmode
, d
->op1
);
19075 nd
->target
= gen_raw_REG (nd
->vmode
, LAST_VIRTUAL_REGISTER
+ 1);
19077 nd
->target
= gen_reg_rtx (nd
->vmode
);
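
/* Illustration of the canonicalization above (example data only, not
   compiler code): the V16QI permutation

     { 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23 }

   moves bytes only in adjacent even/odd pairs, so it is equivalent to
   the V8HI permutation { 0, 2, 1, 3, 8, 10, 9, 11 } on the same bits.
   The function keeps halving the element count this way (QI -> HI ->
   SI -> DI) as long as every pair stays together, so the caller can
   retry the cheaper wider-element shuffles first.  */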
19082 /* Return true if permutation D can be performed as VMODE permutation
19086 valid_perm_using_mode_p (machine_mode vmode
, struct expand_vec_perm_d
*d
)
19088 unsigned int i
, j
, chunk
;
19090 if (GET_MODE_CLASS (vmode
) != MODE_VECTOR_INT
19091 || GET_MODE_CLASS (d
->vmode
) != MODE_VECTOR_INT
19092 || GET_MODE_SIZE (vmode
) != GET_MODE_SIZE (d
->vmode
))
19095 if (GET_MODE_NUNITS (vmode
) >= d
->nelt
)
19098 chunk
= d
->nelt
/ GET_MODE_NUNITS (vmode
);
19099 for (i
= 0; i
< d
->nelt
; i
+= chunk
)
19100 if (d
->perm
[i
] & (chunk
- 1))
19103 for (j
= 1; j
< chunk
; ++j
)
19104 if (d
->perm
[i
] + j
!= d
->perm
[i
+ j
])
19110 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19111 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
19114 expand_vec_perm_pshufb (struct expand_vec_perm_d
*d
)
19116 unsigned i
, nelt
, eltsz
, mask
;
19117 unsigned char perm
[64];
19118 machine_mode vmode
;
19119 struct expand_vec_perm_d nd
;
19120 rtx rperm
[64], vperm
, target
, op0
, op1
;
19124 if (!d
->one_operand_p
)
19125 switch (GET_MODE_SIZE (d
->vmode
))
19149 if (valid_perm_using_mode_p (V2TImode
, d
))
19154 /* Use vperm2i128 insn. The pattern uses
19155 V4DImode instead of V2TImode. */
19156 target
= d
->target
;
19157 if (d
->vmode
!= V4DImode
)
19158 target
= gen_reg_rtx (V4DImode
);
19159 op0
= gen_lowpart (V4DImode
, d
->op0
);
19160 op1
= gen_lowpart (V4DImode
, d
->op1
);
19162 = GEN_INT ((d
->perm
[0] / (nelt
/ 2))
19163 | ((d
->perm
[nelt
/ 2] / (nelt
/ 2)) * 16));
19164 emit_insn (gen_avx2_permv2ti (target
, op0
, op1
, rperm
[0]));
19165 if (target
!= d
->target
)
19166 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19175 switch (GET_MODE_SIZE (d
->vmode
))
19199 /* V4DImode should be already handled through
19200 expand_vselect by vpermq instruction. */
19201 gcc_assert (d
->vmode
!= V4DImode
);
19204 if (d
->vmode
== V8SImode
19205 || d
->vmode
== V16HImode
19206 || d
->vmode
== V32QImode
)
19208 /* First see if vpermq can be used for
19209 V8SImode/V16HImode/V32QImode. */
19210 if (valid_perm_using_mode_p (V4DImode
, d
))
19212 for (i
= 0; i
< 4; i
++)
19213 perm
[i
] = (d
->perm
[i
* nelt
/ 4] * 4 / nelt
) & 3;
19216 target
= gen_reg_rtx (V4DImode
);
19217 if (expand_vselect (target
, gen_lowpart (V4DImode
, d
->op0
),
19220 emit_move_insn (d
->target
,
19221 gen_lowpart (d
->vmode
, target
));
19227 /* Next see if vpermd can be used. */
19228 if (valid_perm_using_mode_p (V8SImode
, d
))
19231 /* Or if vpermps can be used. */
19232 else if (d
->vmode
== V8SFmode
)
19235 if (vmode
== V32QImode
)
	      /* vpshufb only shuffles within each 128-bit lane; it cannot
		 move bytes between the lanes.  */
19239 for (i
= 0; i
< nelt
; ++i
)
19240 if ((d
->perm
[i
] ^ i
) & (nelt
/ 2))
19246 if (!TARGET_AVX512BW
)
19249 /* If vpermq didn't work, vpshufb won't work either. */
19250 if (d
->vmode
== V8DFmode
|| d
->vmode
== V8DImode
)
19254 if (d
->vmode
== V16SImode
19255 || d
->vmode
== V32HImode
19256 || d
->vmode
== V64QImode
)
19258 /* First see if vpermq can be used for
19259 V16SImode/V32HImode/V64QImode. */
19260 if (valid_perm_using_mode_p (V8DImode
, d
))
19262 for (i
= 0; i
< 8; i
++)
19263 perm
[i
] = (d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7;
19266 target
= gen_reg_rtx (V8DImode
);
19267 if (expand_vselect (target
, gen_lowpart (V8DImode
, d
->op0
),
19270 emit_move_insn (d
->target
,
19271 gen_lowpart (d
->vmode
, target
));
19277 /* Next see if vpermd can be used. */
19278 if (valid_perm_using_mode_p (V16SImode
, d
))
19281 /* Or if vpermps can be used. */
19282 else if (d
->vmode
== V16SFmode
)
19285 if (vmode
== V64QImode
)
	      /* vpshufb only shuffles within each 128-bit lane; it cannot
		 move bytes between the lanes.  */
19289 for (i
= 0; i
< nelt
; ++i
)
19290 if ((d
->perm
[i
] ^ i
) & (3 * nelt
/ 4))
19302 /* Try to avoid variable permutation instruction. */
19303 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
19305 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
19309 if (vmode
== V8SImode
)
19310 for (i
= 0; i
< 8; ++i
)
19311 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 8] * 8 / nelt
) & 7);
19312 else if (vmode
== V16SImode
)
19313 for (i
= 0; i
< 16; ++i
)
19314 rperm
[i
] = GEN_INT ((d
->perm
[i
* nelt
/ 16] * 16 / nelt
) & 15);
19317 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
19318 if (!d
->one_operand_p
)
19319 mask
= 2 * nelt
- 1;
19320 else if (vmode
== V64QImode
)
19321 mask
= nelt
/ 4 - 1;
19322 else if (vmode
== V32QImode
)
19323 mask
= nelt
/ 2 - 1;
19327 for (i
= 0; i
< nelt
; ++i
)
19329 unsigned j
, e
= d
->perm
[i
] & mask
;
19330 for (j
= 0; j
< eltsz
; ++j
)
19331 rperm
[i
* eltsz
+ j
] = GEN_INT (e
* eltsz
+ j
);
19335 machine_mode vpmode
= vmode
;
19337 nelt
= GET_MODE_SIZE (vmode
);
19339 /* Emulate narrow modes with V16QI instructions. */
19342 rtx m128
= GEN_INT (-128);
19344 /* Remap elements from the second operand, as we have to
19345 account for inactive top elements from the first operand. */
19346 if (!d
->one_operand_p
)
19348 for (i
= 0; i
< nelt
; ++i
)
19350 unsigned ival
= UINTVAL (rperm
[i
]);
19352 rperm
[i
] = GEN_INT (ival
+ 16 - nelt
);
19356 /* Fill inactive elements in the top positions with zeros. */
19357 for (i
= nelt
; i
< 16; ++i
)
19360 vpmode
= V16QImode
;
19363 vperm
= gen_rtx_CONST_VECTOR (vpmode
,
19364 gen_rtvec_v (GET_MODE_NUNITS (vpmode
), rperm
));
19365 vperm
= force_reg (vpmode
, vperm
);
19367 if (vmode
== d
->vmode
)
19368 target
= d
->target
;
19370 target
= gen_reg_rtx (vmode
);
19372 op0
= gen_lowpart (vmode
, d
->op0
);
19374 if (d
->one_operand_p
)
19376 rtx (*gen
) (rtx
, rtx
, rtx
);
19378 if (vmode
== V4QImode
)
19379 gen
= gen_mmx_pshufbv4qi3
;
19380 else if (vmode
== V8QImode
)
19381 gen
= gen_mmx_pshufbv8qi3
;
19382 else if (vmode
== V16QImode
)
19383 gen
= gen_ssse3_pshufbv16qi3
;
19384 else if (vmode
== V32QImode
)
19385 gen
= gen_avx2_pshufbv32qi3
;
19386 else if (vmode
== V64QImode
)
19387 gen
= gen_avx512bw_pshufbv64qi3
;
19388 else if (vmode
== V8SFmode
)
19389 gen
= gen_avx2_permvarv8sf
;
19390 else if (vmode
== V8SImode
)
19391 gen
= gen_avx2_permvarv8si
;
19392 else if (vmode
== V16SFmode
)
19393 gen
= gen_avx512f_permvarv16sf
;
19394 else if (vmode
== V16SImode
)
19395 gen
= gen_avx512f_permvarv16si
;
19397 gcc_unreachable ();
19399 emit_insn (gen (target
, op0
, vperm
));
19403 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
19405 op1
= gen_lowpart (vmode
, d
->op1
);
19407 if (vmode
== V4QImode
)
19408 gen
= gen_mmx_ppermv32
;
19409 else if (vmode
== V8QImode
)
19410 gen
= gen_mmx_ppermv64
;
19411 else if (vmode
== V16QImode
)
19412 gen
= gen_xop_pperm
;
19414 gcc_unreachable ();
19416 emit_insn (gen (target
, op0
, op1
, vperm
));
19419 if (target
!= d
->target
)
19420 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, target
));
19425 /* Try to expand one-operand permutation with constant mask. */
19428 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d
*d
)
19430 machine_mode mode
= GET_MODE (d
->op0
);
19431 machine_mode maskmode
= mode
;
19432 unsigned inner_size
= GET_MODE_SIZE (GET_MODE_INNER (mode
));
19433 rtx (*gen
) (rtx
, rtx
, rtx
) = NULL
;
19434 rtx target
, op0
, mask
;
19437 if (!rtx_equal_p (d
->op0
, d
->op1
))
19440 if (!TARGET_AVX512F
)
19443 /* Accept VNxHImode and VNxQImode now. */
19444 if (!TARGET_AVX512VL
&& GET_MODE_SIZE (mode
) < 64)
19448 if (!TARGET_AVX512BW
&& inner_size
== 2)
19452 if (!TARGET_AVX512VBMI
&& inner_size
== 1)
19458 gen
= gen_avx512f_permvarv16si
;
19461 gen
= gen_avx512f_permvarv16sf
;
19462 maskmode
= V16SImode
;
19465 gen
= gen_avx512f_permvarv8di
;
19468 gen
= gen_avx512f_permvarv8df
;
19469 maskmode
= V8DImode
;
19472 gen
= gen_avx512bw_permvarv32hi
;
19475 gen
= gen_avx512vl_permvarv16hi
;
19478 gen
= gen_avx512vl_permvarv8hi
;
19481 gen
= gen_avx512bw_permvarv64qi
;
19484 gen
= gen_avx512vl_permvarv32qi
;
19487 gen
= gen_avx512vl_permvarv16qi
;
19497 target
= d
->target
;
19499 for (int i
= 0; i
< d
->nelt
; ++i
)
19500 vec
[i
] = GEN_INT (d
->perm
[i
]);
19501 mask
= gen_rtx_CONST_VECTOR (maskmode
, gen_rtvec_v (d
->nelt
, vec
));
19502 emit_insn (gen (target
, op0
, force_reg (maskmode
, mask
)));
19506 static bool expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool);
19508 /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
19509 in a single instruction. */
19512 expand_vec_perm_1 (struct expand_vec_perm_d
*d
)
19514 unsigned i
, nelt
= d
->nelt
;
19515 struct expand_vec_perm_d nd
;
19517 /* Check plain VEC_SELECT first, because AVX has instructions that could
19518 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
19519 input where SEL+CONCAT may not. */
19520 if (d
->one_operand_p
)
19522 int mask
= nelt
- 1;
19523 bool identity_perm
= true;
19524 bool broadcast_perm
= true;
19526 for (i
= 0; i
< nelt
; i
++)
19528 nd
.perm
[i
] = d
->perm
[i
] & mask
;
19529 if (nd
.perm
[i
] != i
)
19530 identity_perm
= false;
19532 broadcast_perm
= false;
19538 emit_move_insn (d
->target
, d
->op0
);
19541 else if (broadcast_perm
&& TARGET_AVX2
)
19543 /* Use vpbroadcast{b,w,d}. */
19544 rtx (*gen
) (rtx
, rtx
) = NULL
;
19548 if (TARGET_AVX512BW
)
19549 gen
= gen_avx512bw_vec_dupv64qi_1
;
19552 gen
= gen_avx2_pbroadcastv32qi_1
;
19555 if (TARGET_AVX512BW
)
19556 gen
= gen_avx512bw_vec_dupv32hi_1
;
19559 gen
= gen_avx2_pbroadcastv16hi_1
;
19562 if (TARGET_AVX512F
)
19563 gen
= gen_avx512f_vec_dupv16si_1
;
19566 gen
= gen_avx2_pbroadcastv8si_1
;
19569 gen
= gen_avx2_pbroadcastv16qi
;
19572 gen
= gen_avx2_pbroadcastv8hi
;
19575 if (TARGET_AVX512F
)
19576 gen
= gen_avx512f_vec_dupv16sf_1
;
19579 gen
= gen_avx2_vec_dupv8sf_1
;
19582 if (TARGET_AVX512F
)
19583 gen
= gen_avx512f_vec_dupv8df_1
;
19586 if (TARGET_AVX512F
)
19587 gen
= gen_avx512f_vec_dupv8di_1
;
19589 /* For other modes prefer other shuffles this function creates. */
19595 emit_insn (gen (d
->target
, d
->op0
));
19600 if (expand_vselect (d
->target
, d
->op0
, nd
.perm
, nelt
, d
->testing_p
))
19603 /* There are plenty of patterns in sse.md that are written for
19604 SEL+CONCAT and are not replicated for a single op. Perhaps
19605 that should be changed, to avoid the nastiness here. */
19607 /* Recognize interleave style patterns, which means incrementing
19608 every other permutation operand. */
19609 for (i
= 0; i
< nelt
; i
+= 2)
19611 nd
.perm
[i
] = d
->perm
[i
] & mask
;
19612 nd
.perm
[i
+ 1] = (d
->perm
[i
+ 1] & mask
) + nelt
;
19614 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
19618 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
19621 for (i
= 0; i
< nelt
; i
+= 4)
19623 nd
.perm
[i
+ 0] = d
->perm
[i
+ 0] & mask
;
19624 nd
.perm
[i
+ 1] = d
->perm
[i
+ 1] & mask
;
19625 nd
.perm
[i
+ 2] = (d
->perm
[i
+ 2] & mask
) + nelt
;
19626 nd
.perm
[i
+ 3] = (d
->perm
[i
+ 3] & mask
) + nelt
;
19629 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op0
, nd
.perm
, nelt
,
19635 /* Try movss/movsd instructions. */
19636 if (expand_vec_perm_movs (d
))
19639 /* Finally, try the fully general two operand permute. */
19640 if (expand_vselect_vconcat (d
->target
, d
->op0
, d
->op1
, d
->perm
, nelt
,
19644 /* Recognize interleave style patterns with reversed operands. */
19645 if (!d
->one_operand_p
)
19647 for (i
= 0; i
< nelt
; ++i
)
19649 unsigned e
= d
->perm
[i
];
19657 if (expand_vselect_vconcat (d
->target
, d
->op1
, d
->op0
, nd
.perm
, nelt
,
19662 /* Try the SSE4.1 blend variable merge instructions. */
19663 if (expand_vec_perm_blend (d
))
19666 /* Try one of the AVX vpermil variable permutations. */
19667 if (expand_vec_perm_vpermil (d
))
19670 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
19671 vpshufb, vpermd, vpermps or vpermq variable permutation. */
19672 if (expand_vec_perm_pshufb (d
))
19675 /* Try the AVX2 vpalignr instruction. */
19676 if (expand_vec_perm_palignr (d
, true))
19679 /* Try the AVX512F vperm{w,b,s,d} instructions */
19680 if (ix86_expand_vec_one_operand_perm_avx512 (d
))
19683 /* Try the AVX512F vpermt2/vpermi2 instructions. */
19684 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX
, NULL_RTX
, NULL_RTX
, NULL_RTX
, d
))
19687 /* See if we can get the same permutation in different vector integer
19689 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
19692 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
/* Canonicalize the vec_perm index so that the first index
   always comes from the first vector.  */
19701 ix86_vec_perm_index_canon (struct expand_vec_perm_d
*d
)
19703 unsigned nelt
= d
->nelt
;
19704 if (d
->perm
[0] < nelt
)
19707 for (unsigned i
= 0; i
!= nelt
; i
++)
19708 d
->perm
[i
] = (d
->perm
[i
] + nelt
) % (2 * nelt
);
19710 std::swap (d
->op0
, d
->op1
);
19714 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19715 in terms of a pair of shufps+ shufps/pshufd instructions. */
19717 expand_vec_perm_shufps_shufps (struct expand_vec_perm_d
*d
)
19719 unsigned char perm1
[4];
19720 machine_mode vmode
= d
->vmode
;
19722 unsigned i
, j
, k
, count
= 0;
19724 if (d
->one_operand_p
19725 || (vmode
!= V4SImode
&& vmode
!= V4SFmode
))
19731 ix86_vec_perm_index_canon (d
);
19732 for (i
= 0; i
< 4; ++i
)
19733 count
+= d
->perm
[i
] > 3 ? 1 : 0;
19735 gcc_assert (count
& 3);
19737 rtx tmp
= gen_reg_rtx (vmode
);
19738 /* 2 from op0 and 2 from op1. */
19741 unsigned char perm2
[4];
19742 for (i
= 0, j
= 0, k
= 2; i
< 4; ++i
)
19743 if (d
->perm
[i
] & 4)
19745 perm1
[k
++] = d
->perm
[i
];
19750 perm1
[j
++] = d
->perm
[i
];
19755 ok
= expand_vselect_vconcat (tmp
, d
->op0
, d
->op1
,
19756 perm1
, d
->nelt
, false);
19758 if (vmode
== V4SImode
&& TARGET_SSE2
)
19760 ok
= expand_vselect (d
->target
, tmp
,
19761 perm2
, d
->nelt
, false);
19767 ok
= expand_vselect_vconcat (d
->target
, tmp
, tmp
,
19768 perm2
, d
->nelt
, false);
19772 /* 3 from one op and 1 from another. */
19775 unsigned pair_idx
= 8, lone_idx
= 8, shift
;
19777 /* Find the lone index. */
19778 for (i
= 0; i
< 4; ++i
)
19779 if ((d
->perm
[i
] > 3 && count
== 1)
19780 || (d
->perm
[i
] < 4 && count
== 3))
      /* When lone_idx is not 0, it must come from the second op (count == 1).  */
19784 gcc_assert (count
== (lone_idx
? 1 : 3));
19786 /* Find the pair index that sits in the same half as the lone index. */
19787 shift
= lone_idx
& 2;
19788 pair_idx
= 1 - lone_idx
+ 2 * shift
;
      /* First permute the lone index and the pair index into the same vector
	 as [ lone, lone, pair, pair ].  */
19792 perm1
[1] = perm1
[0]
19793 = (count
== 3) ? d
->perm
[lone_idx
] : d
->perm
[lone_idx
] - 4;
19794 perm1
[3] = perm1
[2]
19795 = (count
== 3) ? d
->perm
[pair_idx
] : d
->perm
[pair_idx
] + 4;
      /* Always put the vector that contains the lone index first.  */
19799 std::swap (d
->op0
, d
->op1
);
19802 ok
= expand_vselect_vconcat (tmp
, d
->op0
, d
->op1
,
19803 perm1
, d
->nelt
, false);
19806 /* Refine lone and pair index to original order. */
19807 perm1
[shift
] = lone_idx
<< 1;
19808 perm1
[shift
+ 1] = pair_idx
<< 1;
19810 /* Select the remaining 2 elements in another vector. */
19811 for (i
= 2 - shift
; i
< 4 - shift
; ++i
)
19812 perm1
[i
] = lone_idx
== 1 ? d
->perm
[i
] + 4 : d
->perm
[i
];
19814 /* Adjust to original selector. */
19816 std::swap (tmp
, d
->op1
);
19819 ok
= expand_vselect_vconcat (d
->target
, tmp
, d
->op1
,
19820 perm1
, d
->nelt
, false);
19828 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19829 in terms of a pair of pshuflw + pshufhw instructions. */
19832 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d
*d
)
19834 unsigned char perm2
[MAX_VECT_LEN
];
19838 if (d
->vmode
!= V8HImode
|| !d
->one_operand_p
)
19841 /* The two permutations only operate in 64-bit lanes. */
19842 for (i
= 0; i
< 4; ++i
)
19843 if (d
->perm
[i
] >= 4)
19845 for (i
= 4; i
< 8; ++i
)
19846 if (d
->perm
[i
] < 4)
19852 /* Emit the pshuflw. */
19853 memcpy (perm2
, d
->perm
, 4);
19854 for (i
= 4; i
< 8; ++i
)
19856 ok
= expand_vselect (d
->target
, d
->op0
, perm2
, 8, d
->testing_p
);
19859 /* Emit the pshufhw. */
19860 memcpy (perm2
+ 4, d
->perm
+ 4, 4);
19861 for (i
= 0; i
< 4; ++i
)
19863 ok
= expand_vselect (d
->target
, d
->target
, perm2
, 8, d
->testing_p
);
19869 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
19870 the permutation using the SSSE3 palignr instruction. This succeeds
19871 when all of the elements in PERM fit within one vector and we merely
19872 need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed only if
   the vpalignr instruction itself can perform the requested permutation.  */
19877 expand_vec_perm_palignr (struct expand_vec_perm_d
*d
, bool single_insn_only_p
)
19879 unsigned i
, nelt
= d
->nelt
;
19880 unsigned min
, max
, minswap
, maxswap
;
19881 bool in_order
, ok
, swap
= false;
19883 struct expand_vec_perm_d dcopy
;
19885 /* Even with AVX, palignr only operates on 128-bit vectors,
19886 in AVX2 palignr operates on both 128-bit lanes. */
19887 if ((!TARGET_SSSE3
|| GET_MODE_SIZE (d
->vmode
) != 16)
19888 && (!TARGET_AVX2
|| GET_MODE_SIZE (d
->vmode
) != 32))
19893 minswap
= 2 * nelt
;
19895 for (i
= 0; i
< nelt
; ++i
)
19897 unsigned e
= d
->perm
[i
];
19898 unsigned eswap
= d
->perm
[i
] ^ nelt
;
19899 if (GET_MODE_SIZE (d
->vmode
) == 32)
19901 e
= (e
& ((nelt
/ 2) - 1)) | ((e
& nelt
) >> 1);
19902 eswap
= e
^ (nelt
/ 2);
19908 if (eswap
< minswap
)
19910 if (eswap
> maxswap
)
19914 || max
- min
>= (GET_MODE_SIZE (d
->vmode
) == 32 ? nelt
/ 2 : nelt
))
19916 if (d
->one_operand_p
19918 || maxswap
- minswap
>= (GET_MODE_SIZE (d
->vmode
) == 32
19919 ? nelt
/ 2 : nelt
))
19926 /* Given that we have SSSE3, we know we'll be able to implement the
19927 single operand permutation after the palignr with pshufb for
19928 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
19930 if (d
->testing_p
&& GET_MODE_SIZE (d
->vmode
) == 16 && !single_insn_only_p
)
19936 dcopy
.op0
= d
->op1
;
19937 dcopy
.op1
= d
->op0
;
19938 for (i
= 0; i
< nelt
; ++i
)
19939 dcopy
.perm
[i
] ^= nelt
;
19943 for (i
= 0; i
< nelt
; ++i
)
19945 unsigned e
= dcopy
.perm
[i
];
19946 if (GET_MODE_SIZE (d
->vmode
) == 32
19948 && (e
& (nelt
/ 2 - 1)) < min
)
19949 e
= e
- min
- (nelt
/ 2);
19956 dcopy
.one_operand_p
= true;
19958 if (single_insn_only_p
&& !in_order
)
19961 /* For AVX2, test whether we can permute the result in one instruction. */
19966 dcopy
.op1
= dcopy
.op0
;
19967 return expand_vec_perm_1 (&dcopy
);
19970 shift
= GEN_INT (min
* GET_MODE_UNIT_BITSIZE (d
->vmode
));
19971 if (GET_MODE_SIZE (d
->vmode
) == 16)
19973 target
= gen_reg_rtx (V1TImode
);
19974 emit_insn (gen_ssse3_palignrv1ti (target
,
19975 gen_lowpart (V1TImode
, dcopy
.op1
),
19976 gen_lowpart (V1TImode
, dcopy
.op0
),
19981 target
= gen_reg_rtx (V2TImode
);
19982 emit_insn (gen_avx2_palignrv2ti (target
,
19983 gen_lowpart (V2TImode
, dcopy
.op1
),
19984 gen_lowpart (V2TImode
, dcopy
.op0
),
19988 dcopy
.op0
= dcopy
.op1
= gen_lowpart (d
->vmode
, target
);
19990 /* Test for the degenerate case where the alignment by itself
19991 produces the desired permutation. */
19994 emit_move_insn (d
->target
, dcopy
.op0
);
19998 ok
= expand_vec_perm_1 (&dcopy
);
19999 gcc_assert (ok
|| GET_MODE_SIZE (d
->vmode
) == 32);
20004 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20005 the permutation using the SSE4_1 pblendv instruction. Potentially
20006 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
20009 expand_vec_perm_pblendv (struct expand_vec_perm_d
*d
)
20011 unsigned i
, which
, nelt
= d
->nelt
;
20012 struct expand_vec_perm_d dcopy
, dcopy1
;
20013 machine_mode vmode
= d
->vmode
;
20016 /* Use the same checks as in expand_vec_perm_blend. */
20017 if (d
->one_operand_p
)
20019 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
20021 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
20023 else if (TARGET_SSE4_1
&& (GET_MODE_SIZE (vmode
) == 4
20024 || GET_MODE_SIZE (vmode
) == 8
20025 || GET_MODE_SIZE (vmode
) == 16))
  /* Figure out which permutation elements do not stay in their
     respective lanes.  */
20032 for (i
= 0, which
= 0; i
< nelt
; ++i
)
20034 unsigned e
= d
->perm
[i
];
20036 which
|= (e
< nelt
? 1 : 2);
20038 /* We can pblend the part where elements stay not in their
20039 respective lanes only when these elements are all in one
20040 half of a permutation.
20041 {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
20042 lanes, but both 8 and 9 >= 8
20043 {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
20044 respective lanes and 8 >= 8, but 2 not. */
20045 if (which
!= 1 && which
!= 2)
20047 if (d
->testing_p
&& GET_MODE_SIZE (vmode
) == 16)
20050 /* First we apply one operand permutation to the part where
20051 elements stay not in their respective lanes. */
20054 dcopy
.op0
= dcopy
.op1
= d
->op1
;
20056 dcopy
.op0
= dcopy
.op1
= d
->op0
;
20058 dcopy
.target
= gen_reg_rtx (vmode
);
20059 dcopy
.one_operand_p
= true;
20061 for (i
= 0; i
< nelt
; ++i
)
20062 dcopy
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
20064 ok
= expand_vec_perm_1 (&dcopy
);
20065 if (GET_MODE_SIZE (vmode
) != 16 && !ok
)
20072 /* Next we put permuted elements into their positions. */
20075 dcopy1
.op1
= dcopy
.target
;
20077 dcopy1
.op0
= dcopy
.target
;
20079 for (i
= 0; i
< nelt
; ++i
)
20080 dcopy1
.perm
[i
] = ((d
->perm
[i
] >= nelt
) ? (nelt
+ i
) : i
);
20082 ok
= expand_vec_perm_blend (&dcopy1
);
20088 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
);
20090 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20091 a two vector permutation into a single vector permutation by using
20092 an interleave operation to merge the vectors. */
20095 expand_vec_perm_interleave2 (struct expand_vec_perm_d
*d
)
20097 struct expand_vec_perm_d dremap
, dfinal
;
20098 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
20099 unsigned HOST_WIDE_INT contents
;
20100 unsigned char remap
[2 * MAX_VECT_LEN
];
20102 bool ok
, same_halves
= false;
20104 if (GET_MODE_SIZE (d
->vmode
) == 4
20105 || GET_MODE_SIZE (d
->vmode
) == 8
20106 || GET_MODE_SIZE (d
->vmode
) == 16)
20108 if (d
->one_operand_p
)
20111 else if (GET_MODE_SIZE (d
->vmode
) == 32)
20115 /* For 32-byte modes allow even d->one_operand_p.
20116 The lack of cross-lane shuffling in some instructions
20117 might prevent a single insn shuffle. */
20119 dfinal
.testing_p
= true;
20120 /* If expand_vec_perm_interleave3 can expand this into
20121 a 3 insn sequence, give up and let it be expanded as
20122 3 insn sequence. While that is one insn longer,
20123 it doesn't need a memory operand and in the common
20124 case that both interleave low and high permutations
20125 with the same operands are adjacent needs 4 insns
20126 for both after CSE. */
20127 if (expand_vec_perm_interleave3 (&dfinal
))
20133 /* Examine from whence the elements come. */
20135 for (i
= 0; i
< nelt
; ++i
)
20136 contents
|= HOST_WIDE_INT_1U
<< d
->perm
[i
];
20138 memset (remap
, 0xff, sizeof (remap
));
20141 if (GET_MODE_SIZE (d
->vmode
) == 4
20142 || GET_MODE_SIZE (d
->vmode
) == 8)
20144 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
20146 /* Split the two input vectors into 4 halves. */
20147 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
20152 /* If the elements from the low halves use interleave low,
20153 and similarly for interleave high. */
20154 if ((contents
& (h1
| h3
)) == contents
)
20157 for (i
= 0; i
< nelt2
; ++i
)
20160 remap
[i
+ nelt
] = i
* 2 + 1;
20161 dremap
.perm
[i
* 2] = i
;
20162 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
20165 else if ((contents
& (h2
| h4
)) == contents
)
20168 for (i
= 0; i
< nelt2
; ++i
)
20170 remap
[i
+ nelt2
] = i
* 2;
20171 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
20172 dremap
.perm
[i
* 2] = i
+ nelt2
;
20173 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
20179 else if (GET_MODE_SIZE (d
->vmode
) == 16)
20181 unsigned HOST_WIDE_INT h1
, h2
, h3
, h4
;
20183 /* Split the two input vectors into 4 halves. */
20184 h1
= (HOST_WIDE_INT_1U
<< nelt2
) - 1;
20189 /* If the elements from the low halves use interleave low, and similarly
20190 for interleave high. If the elements are from mis-matched halves, we
20191 can use shufps for V4SF/V4SI or do a DImode shuffle. */
20192 if ((contents
& (h1
| h3
)) == contents
)
20195 for (i
= 0; i
< nelt2
; ++i
)
20198 remap
[i
+ nelt
] = i
* 2 + 1;
20199 dremap
.perm
[i
* 2] = i
;
20200 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
20202 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
20203 dremap
.vmode
= V4SFmode
;
20205 else if ((contents
& (h2
| h4
)) == contents
)
20208 for (i
= 0; i
< nelt2
; ++i
)
20210 remap
[i
+ nelt2
] = i
* 2;
20211 remap
[i
+ nelt
+ nelt2
] = i
* 2 + 1;
20212 dremap
.perm
[i
* 2] = i
+ nelt2
;
20213 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt2
;
20215 if (!TARGET_SSE2
&& d
->vmode
== V4SImode
)
20216 dremap
.vmode
= V4SFmode
;
20218 else if ((contents
& (h1
| h4
)) == contents
)
20221 for (i
= 0; i
< nelt2
; ++i
)
20224 remap
[i
+ nelt
+ nelt2
] = i
+ nelt2
;
20225 dremap
.perm
[i
] = i
;
20226 dremap
.perm
[i
+ nelt2
] = i
+ nelt
+ nelt2
;
20231 dremap
.vmode
= V2DImode
;
20233 dremap
.perm
[0] = 0;
20234 dremap
.perm
[1] = 3;
20237 else if ((contents
& (h2
| h3
)) == contents
)
20240 for (i
= 0; i
< nelt2
; ++i
)
20242 remap
[i
+ nelt2
] = i
;
20243 remap
[i
+ nelt
] = i
+ nelt2
;
20244 dremap
.perm
[i
] = i
+ nelt2
;
20245 dremap
.perm
[i
+ nelt2
] = i
+ nelt
;
20250 dremap
.vmode
= V2DImode
;
20252 dremap
.perm
[0] = 1;
20253 dremap
.perm
[1] = 2;
20261 unsigned int nelt4
= nelt
/ 4, nzcnt
= 0;
20262 unsigned HOST_WIDE_INT q
[8];
20263 unsigned int nonzero_halves
[4];
20265 /* Split the two input vectors into 8 quarters. */
20266 q
[0] = (HOST_WIDE_INT_1U
<< nelt4
) - 1;
20267 for (i
= 1; i
< 8; ++i
)
20268 q
[i
] = q
[0] << (nelt4
* i
);
20269 for (i
= 0; i
< 4; ++i
)
20270 if (((q
[2 * i
] | q
[2 * i
+ 1]) & contents
) != 0)
20272 nonzero_halves
[nzcnt
] = i
;
20278 gcc_assert (d
->one_operand_p
);
20279 nonzero_halves
[1] = nonzero_halves
[0];
20280 same_halves
= true;
20282 else if (d
->one_operand_p
)
20284 gcc_assert (nonzero_halves
[0] == 0);
20285 gcc_assert (nonzero_halves
[1] == 1);
20290 if (d
->perm
[0] / nelt2
== nonzero_halves
[1])
20292 /* Attempt to increase the likelihood that dfinal
20293 shuffle will be intra-lane. */
20294 std::swap (nonzero_halves
[0], nonzero_halves
[1]);
20297 /* vperm2f128 or vperm2i128. */
20298 for (i
= 0; i
< nelt2
; ++i
)
20300 remap
[i
+ nonzero_halves
[1] * nelt2
] = i
+ nelt2
;
20301 remap
[i
+ nonzero_halves
[0] * nelt2
] = i
;
20302 dremap
.perm
[i
+ nelt2
] = i
+ nonzero_halves
[1] * nelt2
;
20303 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * nelt2
;
20306 if (d
->vmode
!= V8SFmode
20307 && d
->vmode
!= V4DFmode
20308 && d
->vmode
!= V8SImode
)
20310 dremap
.vmode
= V8SImode
;
20312 for (i
= 0; i
< 4; ++i
)
20314 dremap
.perm
[i
] = i
+ nonzero_halves
[0] * 4;
20315 dremap
.perm
[i
+ 4] = i
+ nonzero_halves
[1] * 4;
20319 else if (d
->one_operand_p
)
20321 else if (TARGET_AVX2
20322 && (contents
& (q
[0] | q
[2] | q
[4] | q
[6])) == contents
)
20325 for (i
= 0; i
< nelt4
; ++i
)
20328 remap
[i
+ nelt
] = i
* 2 + 1;
20329 remap
[i
+ nelt2
] = i
* 2 + nelt2
;
20330 remap
[i
+ nelt
+ nelt2
] = i
* 2 + nelt2
+ 1;
20331 dremap
.perm
[i
* 2] = i
;
20332 dremap
.perm
[i
* 2 + 1] = i
+ nelt
;
20333 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
;
20334 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
;
20337 else if (TARGET_AVX2
20338 && (contents
& (q
[1] | q
[3] | q
[5] | q
[7])) == contents
)
20341 for (i
= 0; i
< nelt4
; ++i
)
20343 remap
[i
+ nelt4
] = i
* 2;
20344 remap
[i
+ nelt
+ nelt4
] = i
* 2 + 1;
20345 remap
[i
+ nelt2
+ nelt4
] = i
* 2 + nelt2
;
20346 remap
[i
+ nelt
+ nelt2
+ nelt4
] = i
* 2 + nelt2
+ 1;
20347 dremap
.perm
[i
* 2] = i
+ nelt4
;
20348 dremap
.perm
[i
* 2 + 1] = i
+ nelt
+ nelt4
;
20349 dremap
.perm
[i
* 2 + nelt2
] = i
+ nelt2
+ nelt4
;
20350 dremap
.perm
[i
* 2 + nelt2
+ 1] = i
+ nelt
+ nelt2
+ nelt4
;
  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
20360 for (i
= 0; i
< nelt
; ++i
)
20362 unsigned e
= remap
[d
->perm
[i
]];
20363 gcc_assert (e
< nelt
);
      /* If same_halves is true, both halves of the remapped vector are the
         same.  Avoid cross-lane accesses if possible.  */
20366 if (same_halves
&& i
>= nelt2
)
20368 gcc_assert (e
< nelt2
);
20369 dfinal
.perm
[i
] = e
+ nelt2
;
20372 dfinal
.perm
[i
] = e
;
20376 dremap
.target
= gen_reg_rtx (dremap
.vmode
);
20377 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
20379 dfinal
.op1
= dfinal
.op0
;
20380 dfinal
.one_operand_p
= true;
  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
20385 ok
= expand_vec_perm_1 (&dfinal
);
20386 seq
= get_insns ();
20395 if (dremap
.vmode
!= dfinal
.vmode
)
20397 dremap
.op0
= gen_lowpart (dremap
.vmode
, dremap
.op0
);
20398 dremap
.op1
= gen_lowpart (dremap
.vmode
, dremap
.op1
);
20401 ok
= expand_vec_perm_1 (&dremap
);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  */
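/* Sketch of the approach on a hypothetical input: for a one-operand
   V32QImode permutation whose low 16 result bytes read only from the
   third and fourth 64-bit chunks of the source and whose high 16 bytes
   read only from the first and second chunks, a vpermq with lane order
   { 2, 3, 0, 1 } first places every required chunk in the 128-bit lane
   where it is needed, and what remains is an intra-lane byte shuffle
   that a single vpshufb can perform.  */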
20413 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d
*d
)
20415 struct expand_vec_perm_d dremap
, dfinal
;
20416 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, nelt4
= nelt
/ 4;
20417 unsigned contents
[2];
20421 && (d
->vmode
== V32QImode
|| d
->vmode
== V16HImode
)
20422 && d
->one_operand_p
))
20427 for (i
= 0; i
< nelt2
; ++i
)
20429 contents
[0] |= 1u << (d
->perm
[i
] / nelt4
);
20430 contents
[1] |= 1u << (d
->perm
[i
+ nelt2
] / nelt4
);
20433 for (i
= 0; i
< 2; ++i
)
20435 unsigned int cnt
= 0;
20436 for (j
= 0; j
< 4; ++j
)
20437 if ((contents
[i
] & (1u << j
)) != 0 && ++cnt
> 2)
20445 dremap
.vmode
= V4DImode
;
20447 dremap
.target
= gen_reg_rtx (V4DImode
);
20448 dremap
.op0
= gen_lowpart (V4DImode
, d
->op0
);
20449 dremap
.op1
= dremap
.op0
;
20450 dremap
.one_operand_p
= true;
20451 for (i
= 0; i
< 2; ++i
)
20453 unsigned int cnt
= 0;
20454 for (j
= 0; j
< 4; ++j
)
20455 if ((contents
[i
] & (1u << j
)) != 0)
20456 dremap
.perm
[2 * i
+ cnt
++] = j
;
20457 for (; cnt
< 2; ++cnt
)
20458 dremap
.perm
[2 * i
+ cnt
] = 0;
20462 dfinal
.op0
= gen_lowpart (dfinal
.vmode
, dremap
.target
);
20463 dfinal
.op1
= dfinal
.op0
;
20464 dfinal
.one_operand_p
= true;
20465 for (i
= 0, j
= 0; i
< nelt
; ++i
)
20469 dfinal
.perm
[i
] = (d
->perm
[i
] & (nelt4
- 1)) | (j
? nelt2
: 0);
20470 if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
])
20472 else if ((d
->perm
[i
] / nelt4
) == dremap
.perm
[j
+ 1])
20473 dfinal
.perm
[i
] |= nelt4
;
20475 gcc_unreachable ();
20478 ok
= expand_vec_perm_1 (&dremap
);
20481 ok
= expand_vec_perm_1 (&dfinal
);
20487 static bool canonicalize_perm (struct expand_vec_perm_d
*d
);
20489 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
20490 a vector permutation using two instructions, vperm2f128 resp.
20491 vperm2i128 followed by any single in-lane permutation. */
20494 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d
*d
)
20496 struct expand_vec_perm_d dfirst
, dsecond
;
20497 unsigned i
, j
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, perm
;
20501 || GET_MODE_SIZE (d
->vmode
) != 32
20502 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
))
20506 dsecond
.one_operand_p
= false;
20507 dsecond
.testing_p
= true;
  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
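  /* Illustration (hypothetical value): perm == 0x1e has bits [1:0] == 2
     and bits [3:2] == 3, so the immediate becomes
     ((0x1e << 2) | 0x1e) & 0x33 == 0x32, i.e. a vperm2[fi]128 whose low
     128-bit lane is lane 0 of the second operand and whose high lane is
     lane 1 of the second operand; since perm >= 16, the follow-up
     in-lane shuffle then uses d->op1 as its first operand.  */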
20514 for (perm
= 0; perm
< 32; perm
++)
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
        {
          /* The second shuffle for e.g. V4DFmode has
             0123 and ABCD operands.
             Ignore AB23, as 23 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (1 << 2)) continue;
          /* And 01CD, as 01 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 0) continue;
          /* And 4567, as then the vperm2[fi]128 doesn't change
             anything on the original 4567 second operand.  */
          if ((perm & 0xf) == ((3 << 2) | 2)) continue;
        }
      else
        {
          /* The second shuffle for e.g. V4DFmode has
             4567 and ABCD operands.
             Ignore AB67, as 67 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (3 << 2)) continue;
          /* And 45CD, as 45 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 2) continue;
          /* And 0123, as then the vperm2[fi]128 doesn't change
             anything on the original 0123 first operand.  */
          if ((perm & 0xf) == (1 << 2)) continue;
        }
20546 for (i
= 0; i
< nelt
; i
++)
20548 j
= d
->perm
[i
] / nelt2
;
20549 if (j
== ((perm
>> (2 * (i
>= nelt2
))) & 3))
20550 dsecond
.perm
[i
] = nelt
+ (i
& nelt2
) + (d
->perm
[i
] & (nelt2
- 1));
20551 else if (j
== (unsigned) (i
>= nelt2
) + 2 * (perm
>= 16))
20552 dsecond
.perm
[i
] = d
->perm
[i
] & (nelt
- 1);
20560 ok
= expand_vec_perm_1 (&dsecond
);
20571 /* Found a usable second shuffle. dfirst will be
20572 vperm2f128 on d->op0 and d->op1. */
20573 dsecond
.testing_p
= false;
20575 dfirst
.target
= gen_reg_rtx (d
->vmode
);
20576 for (i
= 0; i
< nelt
; i
++)
20577 dfirst
.perm
[i
] = (i
& (nelt2
- 1))
20578 + ((perm
>> (2 * (i
>= nelt2
))) & 3) * nelt2
;
20580 canonicalize_perm (&dfirst
);
20581 ok
= expand_vec_perm_1 (&dfirst
);
20584 /* And dsecond is some single insn shuffle, taking
20585 d->op0 and result of vperm2f128 (if perm < 16) or
20586 d->op1 and result of vperm2f128 (otherwise). */
20588 dsecond
.op0
= dsecond
.op1
;
20589 dsecond
.op1
= dfirst
.target
;
20591 ok
= expand_vec_perm_1 (&dsecond
);
20597 /* For one operand, the only useful vperm2f128 permutation is 0x01
20599 if (d
->one_operand_p
)
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation using 2 intra-lane interleave insns
   and cross-lane shuffle for 32-byte vectors.  */
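/* For illustration (hypothetical input): with V8SImode operands and
   d->perm == { 0, 8, 1, 9, 2, 10, 3, 11 }, i.e. a full interleave of the
   low halves of the two inputs, a single vpunpckldq is not enough because
   that instruction only interleaves within each 128-bit lane; the
   gen_vec_interleave_lowv8si expansion used below instead produces the
   two intra-lane interleaves plus the cross-lane shuffle mentioned
   above.  */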
20611 expand_vec_perm_interleave3 (struct expand_vec_perm_d
*d
)
20614 rtx (*gen
) (rtx
, rtx
, rtx
);
20616 if (d
->one_operand_p
)
20618 if (TARGET_AVX2
&& GET_MODE_SIZE (d
->vmode
) == 32)
20620 else if (TARGET_AVX
&& (d
->vmode
== V8SFmode
|| d
->vmode
== V4DFmode
))
20626 if (d
->perm
[0] != 0 && d
->perm
[0] != nelt
/ 2)
20628 for (i
= 0; i
< nelt
; i
+= 2)
20629 if (d
->perm
[i
] != d
->perm
[0] + i
/ 2
20630 || d
->perm
[i
+ 1] != d
->perm
[0] + i
/ 2 + nelt
)
20640 gen
= gen_vec_interleave_highv32qi
;
20642 gen
= gen_vec_interleave_lowv32qi
;
20646 gen
= gen_vec_interleave_highv16hi
;
20648 gen
= gen_vec_interleave_lowv16hi
;
20652 gen
= gen_vec_interleave_highv8si
;
20654 gen
= gen_vec_interleave_lowv8si
;
20658 gen
= gen_vec_interleave_highv4di
;
20660 gen
= gen_vec_interleave_lowv4di
;
20664 gen
= gen_vec_interleave_highv8sf
;
20666 gen
= gen_vec_interleave_lowv8sf
;
20670 gen
= gen_vec_interleave_highv4df
;
20672 gen
= gen_vec_interleave_lowv4df
;
20675 gcc_unreachable ();
20678 emit_insn (gen (d
->target
, d
->op0
, d
->op1
));
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */
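/* A hypothetical example of the idea (values invented for exposition):
   for a one-operand V4DFmode permutation { 2, 0, 1, 3 }, the intra-lane
   step can compute { 1, 0, 2, 3 } with vpermilpd, a vperm2f128 then swaps
   the two 128-bit lanes of that result, and a vblendpd with mask 0b0101
   picks elements 0 and 2 from the swapped copy and elements 1 and 3 from
   the non-swapped one, yielding the requested order.  */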
20688 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
20690 struct expand_vec_perm_d dfirst
, dsecond
;
20691 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2;
20694 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
20698 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
20699 || !d
->one_operand_p
)
20703 for (i
= 0; i
< nelt
; i
++)
20704 dfirst
.perm
[i
] = 0xff;
20705 for (i
= 0, msk
= 0; i
< nelt
; i
++)
20707 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
20708 if (dfirst
.perm
[j
] != 0xff && dfirst
.perm
[j
] != d
->perm
[i
])
20710 dfirst
.perm
[j
] = d
->perm
[i
];
20714 for (i
= 0; i
< nelt
; i
++)
20715 if (dfirst
.perm
[i
] == 0xff)
20716 dfirst
.perm
[i
] = i
;
20719 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
20722 ok
= expand_vec_perm_1 (&dfirst
);
20723 seq
= get_insns ();
20735 dsecond
.op0
= dfirst
.target
;
20736 dsecond
.op1
= dfirst
.target
;
20737 dsecond
.one_operand_p
= true;
20738 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
20739 for (i
= 0; i
< nelt
; i
++)
20740 dsecond
.perm
[i
] = i
^ nelt2
;
20742 ok
= expand_vec_perm_1 (&dsecond
);
20745 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
20746 emit_insn (blend (d
->target
, dfirst
.target
, dsecond
.target
, GEN_INT (msk
)));
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two single vector permutations and
   {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
   of dfirst or dsecond is identity permutation.  */
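/* Sketch of the idea on a hypothetical input: for V4SImode with
   d->perm == { 3, 5, 1, 6 } the even result slots come from op0 and the
   odd slots from op1, so op0's elements { 3, 1 } are first gathered into
   the low half of one temporary, op1's elements { 1, 2 } into the low
   half of another, and a final punpckldq-style interleave of the two
   temporaries ({ 0, 4, 1, 5 }) produces the requested order.  */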
20756 expand_vec_perm_2perm_interleave (struct expand_vec_perm_d
*d
, bool two_insn
)
20758 unsigned i
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, lane
= nelt
;
20759 struct expand_vec_perm_d dfirst
, dsecond
, dfinal
;
20760 bool ident1
= true, ident2
= true;
20762 if (d
->one_operand_p
)
20765 if (GET_MODE_SIZE (d
->vmode
) == 16)
20769 if (d
->vmode
!= V4SFmode
&& d
->vmode
!= V2DFmode
&& !TARGET_SSE2
)
20772 else if (GET_MODE_SIZE (d
->vmode
) == 32)
20776 if (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
&& !TARGET_AVX2
)
20783 for (i
= 1; i
< nelt
; i
++)
20784 if ((d
->perm
[i
] >= nelt
) != ((d
->perm
[0] >= nelt
) ^ (i
& 1)))
20790 dfirst
.op1
= dfirst
.op0
;
20791 dfirst
.one_operand_p
= true;
20792 dsecond
.op0
= dsecond
.op1
;
20793 dsecond
.one_operand_p
= true;
20795 for (i
= 0; i
< nelt
; i
++)
20796 if (d
->perm
[i
] >= nelt
)
20798 dsecond
.perm
[i
/ 2 + (i
>= lane
? lane
/ 2 : 0)] = d
->perm
[i
] - nelt
;
20799 if (d
->perm
[i
] - nelt
!= i
/ 2 + (i
>= lane
? lane
/ 2 : 0))
20801 dsecond
.perm
[i
/ 2 + (i
>= lane
? lane
: lane
/ 2)]
20802 = d
->perm
[i
] - nelt
;
20806 dfirst
.perm
[i
/ 2 + (i
>= lane
? lane
/ 2 : 0)] = d
->perm
[i
];
20807 if (d
->perm
[i
] != i
/ 2 + (i
>= lane
? lane
/ 2 : 0))
20809 dfirst
.perm
[i
/ 2 + (i
>= lane
? lane
: lane
/ 2)] = d
->perm
[i
];
20812 if (two_insn
&& !ident1
&& !ident2
)
20818 dfinal
.op0
= dfirst
.target
= gen_reg_rtx (d
->vmode
);
20820 dfinal
.op1
= dsecond
.target
= gen_reg_rtx (d
->vmode
);
20821 if (d
->perm
[0] >= nelt
)
20822 std::swap (dfinal
.op0
, dfinal
.op1
);
20826 rtx_insn
*seq1
= NULL
, *seq2
= NULL
;
20831 ok
= expand_vec_perm_1 (&dfirst
);
20832 seq1
= get_insns ();
20842 ok
= expand_vec_perm_1 (&dsecond
);
20843 seq2
= get_insns ();
20853 for (i
= 0; i
< nelt
; i
++)
20855 dfinal
.perm
[i
] = i
/ 2;
20857 dfinal
.perm
[i
] += lane
/ 2;
20859 dfinal
.perm
[i
] += nelt
;
20863 ok
= expand_vselect_vconcat (dfinal
.target
, dfinal
.op0
, dfinal
.op1
,
20864 dfinal
.perm
, dfinal
.nelt
, false);
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using two single vector permutations and the SSE4_1 pblendv
   instruction.  If two_insn, succeed only if one of dfirst or dsecond is
   identity permutation.  */
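/* An invented example of the decomposition: for V4SImode with
   d->perm == { 5, 0, 6, 3 }, the op0-only permutation supplies positions
   1 and 3 (elements 0 and 3) and the op1-only permutation supplies
   positions 0 and 2 (elements 1 and 2); after the 0xff wildcards are
   filled in, both become single pshufd insns and a constant blend keeping
   positions 0 and 2 from the op1 result finishes the job.  */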
20875 expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d
*d
, bool two_insn
)
20877 unsigned i
, nelt
= d
->nelt
;
20878 struct expand_vec_perm_d dfirst
, dsecond
, dfinal
;
20879 machine_mode vmode
= d
->vmode
;
20880 bool ident1
= true, ident2
= true;
20882 /* Use the same checks as in expand_vec_perm_blend. */
20883 if (d
->one_operand_p
)
20885 if (TARGET_AVX2
&& GET_MODE_SIZE (vmode
) == 32)
20887 else if (TARGET_AVX
&& (vmode
== V4DFmode
|| vmode
== V8SFmode
))
20889 else if (TARGET_SSE4_1
&& (GET_MODE_SIZE (vmode
) == 16
20890 || GET_MODE_SIZE (vmode
) == 8
20891 || GET_MODE_SIZE (vmode
) == 4))
20899 dfirst
.op1
= dfirst
.op0
;
20900 dfirst
.one_operand_p
= true;
20901 dsecond
.op0
= dsecond
.op1
;
20902 dsecond
.one_operand_p
= true;
20904 for (i
= 0; i
< nelt
; ++i
)
20905 if (d
->perm
[i
] >= nelt
)
20907 dfirst
.perm
[i
] = 0xff;
20908 dsecond
.perm
[i
] = d
->perm
[i
] - nelt
;
20909 if (d
->perm
[i
] != i
+ nelt
)
20914 dsecond
.perm
[i
] = 0xff;
20915 dfirst
.perm
[i
] = d
->perm
[i
];
20916 if (d
->perm
[i
] != i
)
20920 if (two_insn
&& !ident1
&& !ident2
)
20923 /* For now. Ideally treat 0xff as a wildcard. */
20924 for (i
= 0; i
< nelt
; ++i
)
20925 if (dfirst
.perm
[i
] == 0xff)
20927 if (GET_MODE_SIZE (vmode
) == 32
20928 && dfirst
.perm
[i
^ (nelt
/ 2)] != 0xff)
20929 dfirst
.perm
[i
] = dfirst
.perm
[i
^ (nelt
/ 2)] ^ (nelt
/ 2);
20931 dfirst
.perm
[i
] = i
;
20935 if (GET_MODE_SIZE (vmode
) == 32
20936 && dsecond
.perm
[i
^ (nelt
/ 2)] != 0xff)
20937 dsecond
.perm
[i
] = dsecond
.perm
[i
^ (nelt
/ 2)] ^ (nelt
/ 2);
20939 dsecond
.perm
[i
] = i
;
20945 dfinal
.op0
= dfirst
.target
= gen_reg_rtx (d
->vmode
);
20947 dfinal
.op1
= dsecond
.target
= gen_reg_rtx (d
->vmode
);
20951 rtx_insn
*seq1
= NULL
, *seq2
= NULL
;
20956 ok
= expand_vec_perm_1 (&dfirst
);
20957 seq1
= get_insns ();
20967 ok
= expand_vec_perm_1 (&dsecond
);
20968 seq2
= get_insns ();
20978 for (i
= 0; i
< nelt
; ++i
)
20979 dfinal
.perm
[i
] = (d
->perm
[i
] >= nelt
? i
+ nelt
: i
);
20983 ok
= expand_vec_perm_blend (&dfinal
);
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */
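/* For illustration (hypothetical input): with d->perm == { 2, 5, 0, 7 },
   the first vperm2f128 builds { op0[2], op0[3], op0[0], op0[1] }, the
   second degenerates to op1 itself, and the final vshufpd picks the even
   or odd element of each 128-bit lane from those two intermediates,
   giving { 2, 5, 0, 7 }.  */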
20993 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d
*d
)
20995 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
20998 if (!TARGET_AVX
|| (d
->vmode
!= V4DFmode
))
21008 dfirst
.perm
[0] = (d
->perm
[0] & ~1);
21009 dfirst
.perm
[1] = (d
->perm
[0] & ~1) + 1;
21010 dfirst
.perm
[2] = (d
->perm
[2] & ~1);
21011 dfirst
.perm
[3] = (d
->perm
[2] & ~1) + 1;
21012 dsecond
.perm
[0] = (d
->perm
[1] & ~1);
21013 dsecond
.perm
[1] = (d
->perm
[1] & ~1) + 1;
21014 dsecond
.perm
[2] = (d
->perm
[3] & ~1);
21015 dsecond
.perm
[3] = (d
->perm
[3] & ~1) + 1;
21016 dthird
.perm
[0] = (d
->perm
[0] % 2);
21017 dthird
.perm
[1] = (d
->perm
[1] % 2) + 4;
21018 dthird
.perm
[2] = (d
->perm
[2] % 2) + 2;
21019 dthird
.perm
[3] = (d
->perm
[3] % 2) + 6;
21021 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
21022 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
21023 dthird
.op0
= dfirst
.target
;
21024 dthird
.op1
= dsecond
.target
;
21025 dthird
.one_operand_p
= false;
21027 canonicalize_perm (&dfirst
);
21028 canonicalize_perm (&dsecond
);
21030 ok
= expand_vec_perm_1 (&dfirst
)
21031 && expand_vec_perm_1 (&dsecond
)
21032 && expand_vec_perm_1 (&dthird
);
21039 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d
*);
21041 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
21042 a two vector permutation using two intra-lane vector
21043 permutations, vperm2f128 swapping the lanes and vblend* insn blending
21044 the non-swapped and swapped vectors together. */
21047 expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d
*d
)
21049 struct expand_vec_perm_d dfirst
, dsecond
, dthird
;
21050 unsigned i
, j
, msk
, nelt
= d
->nelt
, nelt2
= nelt
/ 2, which1
= 0, which2
= 0;
21051 rtx_insn
*seq1
, *seq2
;
21053 rtx (*blend
) (rtx
, rtx
, rtx
, rtx
) = NULL
;
21057 || (d
->vmode
!= V8SFmode
&& d
->vmode
!= V4DFmode
)
21058 || d
->one_operand_p
)
21063 for (i
= 0; i
< nelt
; i
++)
21065 dfirst
.perm
[i
] = 0xff;
21066 dsecond
.perm
[i
] = 0xff;
21068 for (i
= 0, msk
= 0; i
< nelt
; i
++)
21070 j
= (d
->perm
[i
] & nelt2
) ? i
| nelt2
: i
& ~nelt2
;
21073 dfirst
.perm
[j
] = d
->perm
[i
];
21074 which1
|= (d
->perm
[i
] < nelt
? 1 : 2);
21078 dsecond
.perm
[j
] = d
->perm
[i
];
21079 which2
|= (d
->perm
[i
] < nelt
? 1 : 2);
21083 if (msk
== 0 || msk
== (1U << nelt
) - 1)
21088 dfirst
.target
= gen_reg_rtx (dfirst
.vmode
);
21089 dsecond
.target
= gen_reg_rtx (dsecond
.vmode
);
21092 for (i
= 0; i
< nelt
; i
++)
21094 if (dfirst
.perm
[i
] == 0xff)
21095 dfirst
.perm
[i
] = (which1
== 2 ? i
+ nelt
: i
);
21096 if (dsecond
.perm
[i
] == 0xff)
21097 dsecond
.perm
[i
] = (which2
== 2 ? i
+ nelt
: i
);
21099 canonicalize_perm (&dfirst
);
21101 ok
= ix86_expand_vec_perm_const_1 (&dfirst
);
21102 seq1
= get_insns ();
21108 canonicalize_perm (&dsecond
);
21110 ok
= ix86_expand_vec_perm_const_1 (&dsecond
);
21111 seq2
= get_insns ();
21124 dthird
.op0
= dsecond
.target
;
21125 dthird
.op1
= dsecond
.target
;
21126 dthird
.one_operand_p
= true;
21127 dthird
.target
= gen_reg_rtx (dthird
.vmode
);
21128 for (i
= 0; i
< nelt
; i
++)
21129 dthird
.perm
[i
] = i
^ nelt2
;
21131 ok
= expand_vec_perm_1 (&dthird
);
21134 blend
= d
->vmode
== V8SFmode
? gen_avx_blendps256
: gen_avx_blendpd256
;
21135 emit_insn (blend (d
->target
, dfirst
.target
, dthird
.target
, GEN_INT (msk
)));
21139 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
21140 permutation with two pshufb insns and an ior. We should have already
21141 failed all two instruction sequences. */
21144 expand_vec_perm_pshufb2 (struct expand_vec_perm_d
*d
)
21146 rtx rperm
[2][16], vperm
, l
, h
, op
, m128
;
21147 unsigned int i
, nelt
, eltsz
;
21149 rtx (*gen
) (rtx
, rtx
, rtx
);
21151 if (!TARGET_SSSE3
|| (GET_MODE_SIZE (d
->vmode
) != 16
21152 && GET_MODE_SIZE (d
->vmode
) != 8
21153 && GET_MODE_SIZE (d
->vmode
) != 4))
21155 gcc_assert (!d
->one_operand_p
);
21160 switch (GET_MODE_SIZE (d
->vmode
))
21164 gen
= gen_mmx_pshufbv4qi3
;
21168 gen
= gen_mmx_pshufbv8qi3
;
21172 gen
= gen_ssse3_pshufbv16qi3
;
21175 gcc_unreachable ();
21179 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
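  /* For instance (a hypothetical case): if for a V16QImode permutation
     d->perm[5] == 19, i.e. result byte 5 is byte 3 of d->op1, then the
     mask applied to d->op0 gets -128 in slot 5 (pshufb writes zero there)
     while the mask applied to d->op1 gets 3 in slot 5; the final ior of
     the two pshufb results merges the contributions of both inputs.  */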
21185 m128
= GEN_INT (-128);
21186 for (i
= 0; i
< nelt
; ++i
)
21188 unsigned j
, k
, e
= d
->perm
[i
];
21189 unsigned which
= (e
>= nelt
);
21193 for (j
= 0; j
< eltsz
; ++j
)
21195 rperm
[which
][i
*eltsz
+ j
] = GEN_INT (e
*eltsz
+ j
);
21196 rperm
[1-which
][i
*eltsz
+ j
] = m128
;
21199 for (k
= i
*eltsz
+ j
; k
< 16; ++k
)
21200 rperm
[0][k
] = rperm
[1][k
] = m128
;
21203 vperm
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, rperm
[0]));
21204 vperm
= force_reg (V16QImode
, vperm
);
21206 l
= gen_reg_rtx (mode
);
21207 op
= gen_lowpart (mode
, d
->op0
);
21208 emit_insn (gen (l
, op
, vperm
));
21210 vperm
= gen_rtx_CONST_VECTOR (V16QImode
, gen_rtvec_v (16, rperm
[1]));
21211 vperm
= force_reg (V16QImode
, vperm
);
21213 h
= gen_reg_rtx (mode
);
21214 op
= gen_lowpart (mode
, d
->op1
);
21215 emit_insn (gen (h
, op
, vperm
));
21218 if (d
->vmode
!= mode
)
21219 op
= gen_reg_rtx (mode
);
21220 ix86_emit_vec_binop (IOR
, mode
, op
, l
, h
);
21221 if (op
!= d
->target
)
21222 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
/* Implement arbitrary permutation of one V32QImode or V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */
21232 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d
*d
)
21234 rtx rperm
[2][32], vperm
, l
, h
, hp
, op
, m128
;
21235 unsigned int i
, nelt
, eltsz
;
21238 || !d
->one_operand_p
21239 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
21246 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
21256 for (i
= 0; i
< nelt
; ++i
)
21258 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
21259 unsigned which
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
21261 for (j
= 0; j
< eltsz
; ++j
)
21263 rperm
[!!which
][(i
* eltsz
+ j
) ^ which
] = GEN_INT (e
* eltsz
+ j
);
21264 rperm
[!which
][(i
* eltsz
+ j
) ^ (which
^ 16)] = m128
;
21268 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
21269 vperm
= force_reg (V32QImode
, vperm
);
21271 h
= gen_reg_rtx (V32QImode
);
21272 op
= gen_lowpart (V32QImode
, d
->op0
);
21273 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
  /* Swap the 128-bit lanes of h into hp.  */
21276 hp
= gen_reg_rtx (V4DImode
);
21277 op
= gen_lowpart (V4DImode
, h
);
21278 emit_insn (gen_avx2_permv4di_1 (hp
, op
, const2_rtx
, GEN_INT (3), const0_rtx
,
21281 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
21282 vperm
= force_reg (V32QImode
, vperm
);
21284 l
= gen_reg_rtx (V32QImode
);
21285 op
= gen_lowpart (V32QImode
, d
->op0
);
21286 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
21289 if (d
->vmode
!= V32QImode
)
21290 op
= gen_reg_rtx (V32QImode
);
21291 emit_insn (gen_iorv32qi3 (op
, l
, gen_lowpart (V32QImode
, hp
)));
21292 if (op
!= d
->target
)
21293 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode or V16HImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */
21304 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d
*d
)
21306 rtx rperm
[2][32], vperm
, l
, h
, ior
, op
, m128
;
21307 unsigned int i
, nelt
, eltsz
;
21310 || d
->one_operand_p
21311 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
21314 for (i
= 0; i
< d
->nelt
; ++i
)
21315 if ((d
->perm
[i
] ^ (i
* 2)) & (3 * d
->nelt
/ 2))
21322 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
21324 /* Generate two permutation masks. In the first permutation mask
21325 the first quarter will contain indexes for the first half
21326 of the op0, the second quarter will contain bit 7 set, third quarter
21327 will contain indexes for the second half of the op0 and the
21328 last quarter bit 7 set. In the second permutation mask
21329 the first quarter will contain bit 7 set, the second quarter
21330 indexes for the first half of the op1, the third quarter bit 7 set
21331 and last quarter indexes for the second half of the op1.
21332 I.e. the first mask e.g. for V32QImode extract even will be:
21333 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
21334 (all values masked with 0xf except for -128) and second mask
21335 for extract even will be
21336 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
21337 m128
= GEN_INT (-128);
21338 for (i
= 0; i
< nelt
; ++i
)
21340 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
21341 unsigned which
= d
->perm
[i
] >= nelt
;
21342 unsigned xorv
= (i
>= nelt
/ 4 && i
< 3 * nelt
/ 4) ? 24 : 0;
21344 for (j
= 0; j
< eltsz
; ++j
)
21346 rperm
[which
][(i
* eltsz
+ j
) ^ xorv
] = GEN_INT (e
* eltsz
+ j
);
21347 rperm
[1 - which
][(i
* eltsz
+ j
) ^ xorv
] = m128
;
21351 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[0]));
21352 vperm
= force_reg (V32QImode
, vperm
);
21354 l
= gen_reg_rtx (V32QImode
);
21355 op
= gen_lowpart (V32QImode
, d
->op0
);
21356 emit_insn (gen_avx2_pshufbv32qi3 (l
, op
, vperm
));
21358 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[1]));
21359 vperm
= force_reg (V32QImode
, vperm
);
21361 h
= gen_reg_rtx (V32QImode
);
21362 op
= gen_lowpart (V32QImode
, d
->op1
);
21363 emit_insn (gen_avx2_pshufbv32qi3 (h
, op
, vperm
));
21365 ior
= gen_reg_rtx (V32QImode
);
21366 emit_insn (gen_iorv32qi3 (ior
, l
, h
));
21368 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
21369 op
= gen_reg_rtx (V4DImode
);
21370 ior
= gen_lowpart (V4DImode
, ior
);
21371 emit_insn (gen_avx2_permv4di_1 (op
, ior
, const0_rtx
, const2_rtx
,
21372 const1_rtx
, GEN_INT (3)));
21373 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
/* Implement permutation with pslldq + psrldq + por when pshufb is not
   available.  */

static bool
expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
21383 unsigned i
, nelt
= d
->nelt
;
21384 unsigned start1
, end1
= -1;
21385 machine_mode vmode
= d
->vmode
, imode
;
21387 bool clear_op0
, clear_op1
;
21388 unsigned inner_size
;
21389 rtx op0
, op1
, dop1
;
21390 rtx (*gen_vec_shr
) (rtx
, rtx
, rtx
);
21391 rtx (*gen_vec_shl
) (rtx
, rtx
, rtx
);
21393 /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
21394 if (!TARGET_SSE2
|| (vmode
!= E_V16QImode
&& vmode
!= E_V8HImode
))
21397 start1
= d
->perm
[0];
21398 for (i
= 1; i
< nelt
; i
++)
21400 if (d
->perm
[i
] != d
->perm
[i
-1] + 1
21401 || d
->perm
[i
] == nelt
)
21405 start2
= d
->perm
[i
];
21406 end1
= d
->perm
[i
-1];
21413 clear_op0
= end1
!= nelt
- 1;
21414 clear_op1
= start2
% nelt
!= 0;
21415 /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
21416 if (!pandn
&& (clear_op0
|| clear_op1
))
21422 gen_vec_shr
= vmode
== E_V16QImode
? gen_vec_shr_v16qi
: gen_vec_shr_v8hi
;
21423 gen_vec_shl
= vmode
== E_V16QImode
? gen_vec_shl_v16qi
: gen_vec_shl_v8hi
;
21424 imode
= GET_MODE_INNER (vmode
);
21425 inner_size
= GET_MODE_BITSIZE (imode
);
21426 op0
= gen_reg_rtx (vmode
);
21427 op1
= gen_reg_rtx (vmode
);
21430 emit_insn (gen_vec_shr (op0
, d
->op0
, GEN_INT (start1
* inner_size
)));
21432 emit_move_insn (op0
, d
->op0
);
21435 if (d
->one_operand_p
)
21438 int shl_offset
= end1
- start1
+ 1 - start2
% nelt
;
21440 emit_insn (gen_vec_shl (op1
, dop1
, GEN_INT (shl_offset
* inner_size
)));
21442 emit_move_insn (op1
, dop1
);
21444 /* Clear lower/upper bits for op0/op1. */
21445 if (clear_op0
|| clear_op1
)
21450 for (i
= 0; i
!= nelt
; i
++)
21452 if (i
< (end1
- start1
+ 1))
21453 vec
[i
] = gen_int_mode ((HOST_WIDE_INT_1U
<< inner_size
) - 1, imode
);
21455 vec
[i
] = CONST0_RTX (imode
);
21457 const_vec
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, vec
));
21458 const_vec
= validize_mem (force_const_mem (vmode
, const_vec
));
21459 clear
= force_reg (vmode
, const_vec
);
21462 emit_move_insn (op0
, gen_rtx_AND (vmode
, op0
, clear
));
21464 emit_move_insn (op1
, gen_rtx_AND (vmode
,
21465 gen_rtx_NOT (vmode
, clear
),
21469 emit_move_insn (d
->target
, gen_rtx_IOR (vmode
, op0
, op1
));
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
   operands with two "and" and "pack" or two "shift" and "pack" insns.
   We should have already failed all two instruction sequences.  */
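/* For example (an illustrative case, not tied to a particular caller):
   extracting the even bytes of two V16QImode inputs can be done by
   masking both with a per-word 0x00ff constant ("and") and then packing
   the two results with packuswb; extracting the odd bytes instead shifts
   each word right by 8 ("shift") before the same pack.  The code below
   picks the matching mask constant, shift count and pack insn per
   mode.  */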
21479 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d
*d
)
21481 rtx op
, dop0
, dop1
, t
;
21482 unsigned i
, odd
, c
, s
, nelt
= d
->nelt
;
21483 bool end_perm
= false;
21484 machine_mode half_mode
;
21485 rtx (*gen_and
) (rtx
, rtx
, rtx
);
21486 rtx (*gen_pack
) (rtx
, rtx
, rtx
);
21487 rtx (*gen_shift
) (rtx
, rtx
, rtx
);
21489 if (d
->one_operand_p
)
21495 /* Required for "pack". */
21496 if (!TARGET_SSE4_1
)
21500 half_mode
= V2SImode
;
21501 gen_and
= gen_andv2si3
;
21502 gen_pack
= gen_mmx_packusdw
;
21503 gen_shift
= gen_lshrv2si3
;
21506 /* Required for "pack". */
21507 if (!TARGET_SSE4_1
)
21511 half_mode
= V4SImode
;
21512 gen_and
= gen_andv4si3
;
21513 gen_pack
= gen_sse4_1_packusdw
;
21514 gen_shift
= gen_lshrv4si3
;
21517 /* No check as all instructions are SSE2. */
21520 half_mode
= V4HImode
;
21521 gen_and
= gen_andv4hi3
;
21522 gen_pack
= gen_mmx_packuswb
;
21523 gen_shift
= gen_lshrv4hi3
;
21526 /* No check as all instructions are SSE2. */
21529 half_mode
= V8HImode
;
21530 gen_and
= gen_andv8hi3
;
21531 gen_pack
= gen_sse2_packuswb
;
21532 gen_shift
= gen_lshrv8hi3
;
21539 half_mode
= V8SImode
;
21540 gen_and
= gen_andv8si3
;
21541 gen_pack
= gen_avx2_packusdw
;
21542 gen_shift
= gen_lshrv8si3
;
21550 half_mode
= V16HImode
;
21551 gen_and
= gen_andv16hi3
;
21552 gen_pack
= gen_avx2_packuswb
;
21553 gen_shift
= gen_lshrv16hi3
;
21557 /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
21558 are more profitable than general shuffles. */
21562 /* Check that permutation is even or odd. */
21567 for (i
= 1; i
< nelt
; ++i
)
21568 if (d
->perm
[i
] != 2 * i
+ odd
)
21574 dop0
= gen_reg_rtx (half_mode
);
21575 dop1
= gen_reg_rtx (half_mode
);
21578 t
= gen_const_vec_duplicate (half_mode
, GEN_INT (c
));
21579 t
= force_reg (half_mode
, t
);
21580 emit_insn (gen_and (dop0
, t
, gen_lowpart (half_mode
, d
->op0
)));
21581 emit_insn (gen_and (dop1
, t
, gen_lowpart (half_mode
, d
->op1
)));
21585 emit_insn (gen_shift (dop0
,
21586 gen_lowpart (half_mode
, d
->op0
),
21588 emit_insn (gen_shift (dop1
,
21589 gen_lowpart (half_mode
, d
->op1
),
21592 /* In AVX2 for 256 bit case we need to permute pack result. */
21593 if (TARGET_AVX2
&& end_perm
)
21595 op
= gen_reg_rtx (d
->vmode
);
21596 t
= gen_reg_rtx (V4DImode
);
21597 emit_insn (gen_pack (op
, dop0
, dop1
));
21598 emit_insn (gen_avx2_permv4di_1 (t
,
21599 gen_lowpart (V4DImode
, op
),
21604 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, t
));
21607 emit_insn (gen_pack (d
->target
, dop0
, dop1
));
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V64QI operands
   with two "shifts", two "truncs" and one "concat" insns for "odd"
   and two "truncs" and one "concat" insn for "even".
   Have already failed all two instruction sequences.  */
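/* Concretely (an illustrative sketch): for the "odd" case each operand is
   viewed as V32HImode and shifted right by 8 so every word holds its odd
   byte, then vpmovwb truncates each word back to a byte, and the two
   V32QImode halves are concatenated into the V64QImode result; the "even"
   case skips the shifts and truncates directly.  */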
21619 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d
*d
)
21621 rtx t1
, t2
, t3
, t4
;
21622 unsigned i
, odd
, nelt
= d
->nelt
;
21624 if (!TARGET_AVX512BW
21625 || d
->one_operand_p
21626 || d
->vmode
!= V64QImode
)
21629 /* Check that permutation is even or odd. */
21634 for (i
= 1; i
< nelt
; ++i
)
21635 if (d
->perm
[i
] != 2 * i
+ odd
)
21644 t1
= gen_reg_rtx (V32HImode
);
21645 t2
= gen_reg_rtx (V32HImode
);
21646 emit_insn (gen_lshrv32hi3 (t1
,
21647 gen_lowpart (V32HImode
, d
->op0
),
21649 emit_insn (gen_lshrv32hi3 (t2
,
21650 gen_lowpart (V32HImode
, d
->op1
),
21655 t1
= gen_lowpart (V32HImode
, d
->op0
);
21656 t2
= gen_lowpart (V32HImode
, d
->op1
);
21659 t3
= gen_reg_rtx (V32QImode
);
21660 t4
= gen_reg_rtx (V32QImode
);
21661 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3
, t1
));
21662 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4
, t2
));
21663 emit_insn (gen_avx_vec_concatv64qi (d
->target
, t3
, t4
));
21668 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
21669 and extract-odd permutations. */
21672 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d
*d
, unsigned odd
)
21674 rtx t1
, t2
, t3
, t4
, t5
;
21681 t1
= gen_reg_rtx (V4DFmode
);
21682 t2
= gen_reg_rtx (V4DFmode
);
21684 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21685 emit_insn (gen_avx_vperm2f128v4df3 (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
21686 emit_insn (gen_avx_vperm2f128v4df3 (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
21688 /* Now an unpck[lh]pd will produce the result required. */
21690 t3
= gen_avx_unpckhpd256 (d
->target
, t1
, t2
);
21692 t3
= gen_avx_unpcklpd256 (d
->target
, t1
, t2
);
21698 int mask
= odd
? 0xdd : 0x88;
21702 t1
= gen_reg_rtx (V8SFmode
);
21703 t2
= gen_reg_rtx (V8SFmode
);
21704 t3
= gen_reg_rtx (V8SFmode
);
21706 /* Shuffle within the 128-bit lanes to produce:
21707 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
21708 emit_insn (gen_avx_shufps256 (t1
, d
->op0
, d
->op1
,
21711 /* Shuffle the lanes around to produce:
21712 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
21713 emit_insn (gen_avx_vperm2f128v8sf3 (t2
, t1
, t1
,
21716 /* Shuffle within the 128-bit lanes to produce:
21717 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
21718 emit_insn (gen_avx_shufps256 (t3
, t1
, t2
, GEN_INT (0x44)));
21720 /* Shuffle within the 128-bit lanes to produce:
21721 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
21722 emit_insn (gen_avx_shufps256 (t2
, t1
, t2
, GEN_INT (0xee)));
21724 /* Shuffle the lanes around to produce:
21725 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
21726 emit_insn (gen_avx_vperm2f128v8sf3 (d
->target
, t3
, t2
,
21737 /* These are always directly implementable by expand_vec_perm_1. */
21738 gcc_unreachable ();
21741 gcc_assert (TARGET_MMX_WITH_SSE
);
21742 /* We have no suitable instructions. */
21748 if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
21749 return expand_vec_perm_pshufb2 (d
);
21754 /* We need 2*log2(N)-1 operations to achieve odd/even
21755 with interleave. */
21756 t1
= gen_reg_rtx (V4QImode
);
21757 emit_insn (gen_mmx_punpckhbw_low (t1
, d
->op0
, d
->op1
));
21758 emit_insn (gen_mmx_punpcklbw_low (d
->target
, d
->op0
, d
->op1
));
21760 t2
= gen_mmx_punpckhbw_low (d
->target
, d
->target
, t1
);
21762 t2
= gen_mmx_punpcklbw_low (d
->target
, d
->target
, t1
);
21769 return expand_vec_perm_even_odd_pack (d
);
21770 else if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
21771 return expand_vec_perm_pshufb2 (d
);
21776 /* We need 2*log2(N)-1 operations to achieve odd/even
21777 with interleave. */
21778 t1
= gen_reg_rtx (V4HImode
);
21779 emit_insn (gen_mmx_punpckhwd (t1
, d
->op0
, d
->op1
));
21780 emit_insn (gen_mmx_punpcklwd (d
->target
, d
->op0
, d
->op1
));
21782 t2
= gen_mmx_punpckhwd (d
->target
, d
->target
, t1
);
21784 t2
= gen_mmx_punpcklwd (d
->target
, d
->target
, t1
);
21791 return expand_vec_perm_even_odd_pack (d
);
21792 else if (TARGET_SSSE3
&& !TARGET_SLOW_PSHUFB
)
21793 return expand_vec_perm_pshufb2 (d
);
21798 /* We need 2*log2(N)-1 operations to achieve odd/even
21799 with interleave. */
21800 t1
= gen_reg_rtx (V8HImode
);
21801 t2
= gen_reg_rtx (V8HImode
);
21802 emit_insn (gen_vec_interleave_highv8hi (t1
, d
->op0
, d
->op1
));
21803 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->op0
, d
->op1
));
21804 emit_insn (gen_vec_interleave_highv8hi (t2
, d
->target
, t1
));
21805 emit_insn (gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t1
));
21807 t3
= gen_vec_interleave_highv8hi (d
->target
, d
->target
, t2
);
21809 t3
= gen_vec_interleave_lowv8hi (d
->target
, d
->target
, t2
);
21816 return expand_vec_perm_even_odd_pack (d
);
21820 return expand_vec_perm_even_odd_pack (d
);
21823 return expand_vec_perm_even_odd_trunc (d
);
21828 struct expand_vec_perm_d d_copy
= *d
;
21829 d_copy
.vmode
= V4DFmode
;
21831 d_copy
.target
= gen_raw_REG (V4DFmode
, LAST_VIRTUAL_REGISTER
+ 1);
21833 d_copy
.target
= gen_reg_rtx (V4DFmode
);
21834 d_copy
.op0
= gen_lowpart (V4DFmode
, d
->op0
);
21835 d_copy
.op1
= gen_lowpart (V4DFmode
, d
->op1
);
21836 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
21839 emit_move_insn (d
->target
,
21840 gen_lowpart (V4DImode
, d_copy
.target
));
21849 t1
= gen_reg_rtx (V4DImode
);
21850 t2
= gen_reg_rtx (V4DImode
);
21852 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
21853 emit_insn (gen_avx2_permv2ti (t1
, d
->op0
, d
->op1
, GEN_INT (0x20)));
21854 emit_insn (gen_avx2_permv2ti (t2
, d
->op0
, d
->op1
, GEN_INT (0x31)));
21856 /* Now an vpunpck[lh]qdq will produce the result required. */
21858 t3
= gen_avx2_interleave_highv4di (d
->target
, t1
, t2
);
21860 t3
= gen_avx2_interleave_lowv4di (d
->target
, t1
, t2
);
21867 struct expand_vec_perm_d d_copy
= *d
;
21868 d_copy
.vmode
= V8SFmode
;
21870 d_copy
.target
= gen_raw_REG (V8SFmode
, LAST_VIRTUAL_REGISTER
+ 1);
21872 d_copy
.target
= gen_reg_rtx (V8SFmode
);
21873 d_copy
.op0
= gen_lowpart (V8SFmode
, d
->op0
);
21874 d_copy
.op1
= gen_lowpart (V8SFmode
, d
->op1
);
21875 if (expand_vec_perm_even_odd_1 (&d_copy
, odd
))
21878 emit_move_insn (d
->target
,
21879 gen_lowpart (V8SImode
, d_copy
.target
));
21888 t1
= gen_reg_rtx (V8SImode
);
21889 t2
= gen_reg_rtx (V8SImode
);
21890 t3
= gen_reg_rtx (V4DImode
);
21891 t4
= gen_reg_rtx (V4DImode
);
21892 t5
= gen_reg_rtx (V4DImode
);
21894 /* Shuffle the lanes around into
21895 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
21896 emit_insn (gen_avx2_permv2ti (t3
, gen_lowpart (V4DImode
, d
->op0
),
21897 gen_lowpart (V4DImode
, d
->op1
),
21899 emit_insn (gen_avx2_permv2ti (t4
, gen_lowpart (V4DImode
, d
->op0
),
21900 gen_lowpart (V4DImode
, d
->op1
),
21903 /* Swap the 2nd and 3rd position in each lane into
21904 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
21905 emit_insn (gen_avx2_pshufdv3 (t1
, gen_lowpart (V8SImode
, t3
),
21906 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21907 emit_insn (gen_avx2_pshufdv3 (t2
, gen_lowpart (V8SImode
, t4
),
21908 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
21910 /* Now an vpunpck[lh]qdq will produce
21911 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
21913 t3
= gen_avx2_interleave_highv4di (t5
, gen_lowpart (V4DImode
, t1
),
21914 gen_lowpart (V4DImode
, t2
));
21916 t3
= gen_avx2_interleave_lowv4di (t5
, gen_lowpart (V4DImode
, t1
),
21917 gen_lowpart (V4DImode
, t2
));
21919 emit_move_insn (d
->target
, gen_lowpart (V8SImode
, t5
));
21923 gcc_unreachable ();
21929 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
21930 extract-even and extract-odd permutations. */
21933 expand_vec_perm_even_odd (struct expand_vec_perm_d
*d
)
21935 unsigned i
, odd
, nelt
= d
->nelt
;
21938 if (odd
!= 0 && odd
!= 1)
21941 for (i
= 1; i
< nelt
; ++i
)
21942 if (d
->perm
[i
] != 2 * i
+ odd
)
21945 if (d
->vmode
== E_V32HImode
21947 && !TARGET_AVX512BW
)
21950 return expand_vec_perm_even_odd_1 (d
, odd
);
21953 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
21954 permutations. We assume that expand_vec_perm_1 has already failed. */
21957 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d
*d
)
21959 unsigned elt
= d
->perm
[0], nelt2
= d
->nelt
/ 2;
21960 machine_mode vmode
= d
->vmode
;
21961 rtx (*gen
) (rtx
, rtx
, rtx
);
21962 unsigned char perm2
[4];
21963 rtx op0
= d
->op0
, dest
;
21970 /* These are special-cased in sse.md so that we can optionally
21971 use the vbroadcast instruction. They expand to two insns
21972 if the input happens to be in a register. */
21973 gcc_unreachable ();
21983 /* These are always implementable using standard shuffle patterns. */
21984 gcc_unreachable ();
21987 /* This can be implemented via interleave and pshuflw. */
21993 gen
= gen_mmx_punpckhbw_low
;
21997 gen
= gen_mmx_punpcklbw_low
;
21999 dest
= gen_reg_rtx (vmode
);
22000 emit_insn (gen (dest
, op0
, op0
));
22001 vmode
= get_mode_wider_vector (vmode
);
22002 op0
= gen_lowpart (vmode
, dest
);
22004 memset (perm2
, elt
, 2);
22005 dest
= gen_reg_rtx (vmode
);
22006 ok
= expand_vselect (dest
, op0
, perm2
, 2, d
->testing_p
);
22009 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
22013 /* This can be implemented via interleave. We save one insn by
22014 stopping once we have promoted to V2SImode and then use pshufd. */
22021 gen
= vmode
== V8QImode
? gen_mmx_punpckhbw
22022 : gen_mmx_punpckhwd
;
22026 gen
= vmode
== V8QImode
? gen_mmx_punpcklbw
22027 : gen_mmx_punpcklwd
;
22030 dest
= gen_reg_rtx (vmode
);
22031 emit_insn (gen (dest
, op0
, op0
));
22032 vmode
= get_mode_wider_vector (vmode
);
22033 op0
= gen_lowpart (vmode
, dest
);
22035 while (vmode
!= V2SImode
);
22037 memset (perm2
, elt
, 2);
22038 dest
= gen_reg_rtx (vmode
);
22039 ok
= expand_vselect (dest
, op0
, perm2
, 2, d
->testing_p
);
22042 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
22047 /* These can be implemented via interleave. We save one insn by
22048 stopping once we have promoted to V4SImode and then use pshufd. */
22055 gen
= vmode
== V16QImode
? gen_vec_interleave_highv16qi
22056 : gen_vec_interleave_highv8hi
;
22060 gen
= vmode
== V16QImode
? gen_vec_interleave_lowv16qi
22061 : gen_vec_interleave_lowv8hi
;
22064 dest
= gen_reg_rtx (vmode
);
22065 emit_insn (gen (dest
, op0
, op0
));
22066 vmode
= get_mode_wider_vector (vmode
);
22067 op0
= gen_lowpart (vmode
, dest
);
22069 while (vmode
!= V4SImode
);
22071 memset (perm2
, elt
, 4);
22072 dest
= gen_reg_rtx (vmode
);
22073 ok
= expand_vselect (dest
, op0
, perm2
, 4, d
->testing_p
);
22076 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
22081 /* This can be implemented via interleave and pshufd. */
22085 rtx (*maybe_gen
) (machine_mode
, int, rtx
, rtx
, rtx
);
22088 maybe_gen
= maybe_gen_vec_interleave_high
;
22092 maybe_gen
= maybe_gen_vec_interleave_low
;
22095 dest
= gen_reg_rtx (vmode
);
22096 emit_insn (maybe_gen (vmode
, 1, dest
, op0
, op0
));
22099 op0
= gen_lowpart (vmode
, dest
);
22101 memset (perm2
, elt
, 4);
22102 dest
= gen_reg_rtx (vmode
);
22103 ok
= expand_vselect (dest
, op0
, perm2
, 4, d
->testing_p
);
22106 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, dest
));
22113 /* For AVX2 broadcasts of the first element vpbroadcast* or
22114 vpermq should be used by expand_vec_perm_1. */
22115 gcc_assert (!TARGET_AVX2
|| d
->perm
[0]);
22119 gcc_assert (!TARGET_AVX512BW
|| d
->perm
[0]);
22123 gcc_assert (!TARGET_AVX512BW
);
22127 gcc_unreachable ();
22131 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
22132 broadcast permutations. */
22135 expand_vec_perm_broadcast (struct expand_vec_perm_d
*d
)
22137 unsigned i
, elt
, nelt
= d
->nelt
;
22139 if (!d
->one_operand_p
)
22143 for (i
= 1; i
< nelt
; ++i
)
22144 if (d
->perm
[i
] != elt
)
22147 return expand_vec_perm_broadcast_1 (d
);
22150 /* Implement arbitrary permutations of two V64QImode operands
22151 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
22153 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d
*d
)
22155 if (!TARGET_AVX512BW
|| !(d
->vmode
== V64QImode
))
22161 struct expand_vec_perm_d ds
[2];
22162 rtx rperm
[128], vperm
, target0
, target1
;
22163 unsigned int i
, nelt
;
22164 machine_mode vmode
;
22169 for (i
= 0; i
< 2; i
++)
22172 ds
[i
].vmode
= V32HImode
;
22174 ds
[i
].target
= gen_reg_rtx (V32HImode
);
22175 ds
[i
].op0
= gen_lowpart (V32HImode
, d
->op0
);
22176 ds
[i
].op1
= gen_lowpart (V32HImode
, d
->op1
);
  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     positions (ds[1]).  */

  for (i = 0; i < nelt; i++)
22187 ds
[i
& 1].perm
[i
/ 2] = d
->perm
[i
] / 2;
22190 rperm
[i
] = constm1_rtx
;
22191 rperm
[i
+ 64] = GEN_INT ((i
& 14) + (d
->perm
[i
] & 1));
22195 rperm
[i
] = GEN_INT ((i
& 14) + (d
->perm
[i
] & 1));
22196 rperm
[i
+ 64] = constm1_rtx
;
22200 bool ok
= expand_vec_perm_1 (&ds
[0]);
22202 ds
[0].target
= gen_lowpart (V64QImode
, ds
[0].target
);
22204 ok
= expand_vec_perm_1 (&ds
[1]);
22206 ds
[1].target
= gen_lowpart (V64QImode
, ds
[1].target
);
22208 vperm
= gen_rtx_CONST_VECTOR (V64QImode
, gen_rtvec_v (64, rperm
));
22209 vperm
= force_reg (vmode
, vperm
);
22210 target0
= gen_reg_rtx (V64QImode
);
22211 emit_insn (gen_avx512bw_pshufbv64qi3 (target0
, ds
[0].target
, vperm
));
22213 vperm
= gen_rtx_CONST_VECTOR (V64QImode
, gen_rtvec_v (64, rperm
+ 64));
22214 vperm
= force_reg (vmode
, vperm
);
22215 target1
= gen_reg_rtx (V64QImode
);
22216 emit_insn (gen_avx512bw_pshufbv64qi3 (target1
, ds
[1].target
, vperm
));
22218 emit_insn (gen_iorv64qi3 (d
->target
, target0
, target1
));
/* Implement arbitrary permutation of two V32QImode or V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */
22227 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d
*d
)
22229 rtx rperm
[4][32], vperm
, l
[2], h
[2], op
, m128
;
22230 unsigned int i
, nelt
, eltsz
;
22234 || d
->one_operand_p
22235 || (d
->vmode
!= V32QImode
&& d
->vmode
!= V16HImode
))
22242 eltsz
= GET_MODE_UNIT_SIZE (d
->vmode
);
22244 /* Generate 4 permutation masks. If the required element is within
22245 the same lane, it is shuffled in. If the required element from the
22246 other lane, force a zero by setting bit 7 in the permutation mask.
22247 In the other mask the mask has non-negative elements if element
22248 is requested from the other lane, but also moved to the other lane,
22249 so that the result of vpshufb can have the two V2TImode halves
22251 m128
= GEN_INT (-128);
22252 for (i
= 0; i
< 32; ++i
)
22254 rperm
[0][i
] = m128
;
22255 rperm
[1][i
] = m128
;
22256 rperm
[2][i
] = m128
;
22257 rperm
[3][i
] = m128
;
22263 for (i
= 0; i
< nelt
; ++i
)
22265 unsigned j
, e
= d
->perm
[i
] & (nelt
/ 2 - 1);
22266 unsigned xlane
= ((d
->perm
[i
] ^ i
) & (nelt
/ 2)) * eltsz
;
22267 unsigned int which
= ((d
->perm
[i
] & nelt
) ? 2 : 0) + (xlane
? 1 : 0);
22269 for (j
= 0; j
< eltsz
; ++j
)
22270 rperm
[which
][(i
* eltsz
+ j
) ^ xlane
] = GEN_INT (e
* eltsz
+ j
);
22271 used
[which
] = true;
22274 for (i
= 0; i
< 2; ++i
)
22276 if (!used
[2 * i
+ 1])
22281 vperm
= gen_rtx_CONST_VECTOR (V32QImode
,
22282 gen_rtvec_v (32, rperm
[2 * i
+ 1]));
22283 vperm
= force_reg (V32QImode
, vperm
);
22284 h
[i
] = gen_reg_rtx (V32QImode
);
22285 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
22286 emit_insn (gen_avx2_pshufbv32qi3 (h
[i
], op
, vperm
));
  /* Swap the 128-bit lanes of h[X].  */
22290 for (i
= 0; i
< 2; ++i
)
22292 if (h
[i
] == NULL_RTX
)
22294 op
= gen_reg_rtx (V4DImode
);
22295 emit_insn (gen_avx2_permv4di_1 (op
, gen_lowpart (V4DImode
, h
[i
]),
22296 const2_rtx
, GEN_INT (3), const0_rtx
,
22298 h
[i
] = gen_lowpart (V32QImode
, op
);
22301 for (i
= 0; i
< 2; ++i
)
22308 vperm
= gen_rtx_CONST_VECTOR (V32QImode
, gen_rtvec_v (32, rperm
[2 * i
]));
22309 vperm
= force_reg (V32QImode
, vperm
);
22310 l
[i
] = gen_reg_rtx (V32QImode
);
22311 op
= gen_lowpart (V32QImode
, i
? d
->op1
: d
->op0
);
22312 emit_insn (gen_avx2_pshufbv32qi3 (l
[i
], op
, vperm
));
22315 for (i
= 0; i
< 2; ++i
)
22319 op
= gen_reg_rtx (V32QImode
);
22320 emit_insn (gen_iorv32qi3 (op
, l
[i
], h
[i
]));
22327 gcc_assert (l
[0] && l
[1]);
22329 if (d
->vmode
!= V32QImode
)
22330 op
= gen_reg_rtx (V32QImode
);
22331 emit_insn (gen_iorv32qi3 (op
, l
[0], l
[1]));
22332 if (op
!= d
->target
)
22333 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, op
));
22337 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
22338 taken care of, perform the expansion in D and return true on success. */
22341 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
22343 /* Try a single instruction expansion. */
22344 if (expand_vec_perm_1 (d
))
22347 /* Try sequences of two instructions. */
22349 if (expand_vec_perm_pshuflw_pshufhw (d
))
22352 if (expand_vec_perm_palignr (d
, false))
22355 if (expand_vec_perm_interleave2 (d
))
22358 if (expand_vec_perm_broadcast (d
))
22361 if (expand_vec_perm_vpermq_perm_1 (d
))
22364 if (expand_vec_perm_vperm2f128 (d
))
22367 if (expand_vec_perm_pblendv (d
))
22370 if (expand_vec_perm_2perm_interleave (d
, true))
22373 if (expand_vec_perm_2perm_pblendv (d
, true))
22376 if (expand_vec_perm_shufps_shufps (d
))
22379 /* Try sequences of three instructions. */
22381 if (expand_vec_perm_even_odd_pack (d
))
22384 if (expand_vec_perm_2vperm2f128_vshuf (d
))
22387 if (expand_vec_perm_pshufb2 (d
))
22390 if (expand_vec_perm_pslldq_psrldq_por (d
, false))
22393 if (expand_vec_perm_interleave3 (d
))
22396 if (expand_vec_perm_vperm2f128_vblend (d
))
22399 if (expand_vec_perm_2perm_interleave (d
, false))
22402 if (expand_vec_perm_2perm_pblendv (d
, false))
22405 /* Try sequences of four instructions. */
22407 if (expand_vec_perm_even_odd_trunc (d
))
22409 if (expand_vec_perm_vpshufb2_vpermq (d
))
22412 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d
))
22415 if (expand_vec_perm_vpermt2_vpshub2 (d
))
22418 /* ??? Look for narrow permutations whose element orderings would
22419 allow the promotion to a wider mode. */
22421 /* ??? Look for sequences of interleave or a wider permute that place
22422 the data into the correct lanes for a half-vector shuffle like
22423 pshuf[lh]w or vpermilps. */
22425 /* ??? Look for sequences of interleave that produce the desired results.
22426 The combinatorics of punpck[lh] get pretty ugly... */
22428 if (expand_vec_perm_even_odd (d
))
22431 /* Generate four or five instructions. */
22432 if (expand_vec_perm_pslldq_psrldq_por (d
, true))
22435 /* Even longer sequences. */
22436 if (expand_vec_perm_vpshufb4_vpermq2 (d
))
22439 /* See if we can get the same permutation in different vector integer
22441 struct expand_vec_perm_d nd
;
22442 if (canonicalize_vector_int_perm (d
, &nd
) && expand_vec_perm_1 (&nd
))
22445 emit_move_insn (d
->target
, gen_lowpart (d
->vmode
, nd
.target
));
22449 /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
22450 if (expand_vec_perm2_vperm2f128_vblend (d
))
22456 /* If a permutation only uses one operand, make it clear. Returns true
22457 if the permutation references both operands. */
22460 canonicalize_perm (struct expand_vec_perm_d
*d
)
22462 int i
, which
, nelt
= d
->nelt
;
22464 for (i
= which
= 0; i
< nelt
; ++i
)
22465 which
|= (d
->perm
[i
] < nelt
? 1 : 2);
22467 d
->one_operand_p
= true;
22474 if (!rtx_equal_p (d
->op0
, d
->op1
))
22476 d
->one_operand_p
= false;
      /* The elements of PERM do not suggest that only the first operand
         is used, but both operands are identical.  Allow easier matching
         of the permutation by folding the permutation into the single
         input vector.  */
22486 for (i
= 0; i
< nelt
; ++i
)
22487 d
->perm
[i
] &= nelt
- 1;
22496 return (which
== 3);
22499 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22502 ix86_vectorize_vec_perm_const (machine_mode vmode
, machine_mode op_mode
,
22503 rtx target
, rtx op0
, rtx op1
,
22504 const vec_perm_indices
&sel
)
22506 if (vmode
!= op_mode
)
22509 struct expand_vec_perm_d d
;
22510 unsigned char perm
[MAX_VECT_LEN
];
22511 unsigned int i
, nelt
, which
;
  /* For an HF mode vector, convert it to HI using a subreg.  */
22515 if (GET_MODE_INNER (vmode
) == HFmode
)
22517 machine_mode orig_mode
= vmode
;
22518 vmode
= mode_for_vector (HImode
,
22519 GET_MODE_NUNITS (vmode
)).require ();
22521 target
= lowpart_subreg (vmode
, target
, orig_mode
);
22523 op0
= lowpart_subreg (vmode
, op0
, orig_mode
);
22525 op1
= lowpart_subreg (vmode
, op1
, orig_mode
);
22533 gcc_assert (VECTOR_MODE_P (d
.vmode
));
22534 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
22535 d
.testing_p
= !target
;
22537 gcc_assert (sel
.length () == nelt
);
22538 gcc_checking_assert (sizeof (d
.perm
) == sizeof (perm
));
22540 /* Given sufficient ISA support we can just return true here
22541 for selected vector modes. */
22548 if (!TARGET_AVX512F
)
22550 /* All implementable with a single vperm[it]2 insn. */
22555 if (!TARGET_AVX512F
)
22557 if (d
.testing_p
&& TARGET_AVX512BW
)
22558 /* All implementable with a single vperm[it]2 insn. */
22562 if (!TARGET_AVX512F
)
22564 if (d
.testing_p
&& TARGET_AVX512BW
)
22565 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
22574 if (d
.testing_p
&& TARGET_AVX512VL
)
22575 /* All implementable with a single vperm[it]2 insn. */
22581 if (d
.testing_p
&& TARGET_AVX2
)
22582 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22588 if (d
.testing_p
&& TARGET_AVX2
)
22589 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
22596 /* Fall through. */
22601 /* All implementable with a single vpperm insn. */
22602 if (d
.testing_p
&& TARGET_XOP
)
22604 /* All implementable with 2 pshufb + 1 ior. */
22605 if (d
.testing_p
&& TARGET_SSSE3
)
22612 if (!TARGET_MMX_WITH_SSE
)
22618 /* All implementable with *punpckwd. */
22630 /* All implementable with shufpd or unpck[lh]pd. */
22638 for (i
= which
= 0; i
< nelt
; ++i
)
22640 unsigned char e
= sel
[i
];
22641 gcc_assert (e
< 2 * nelt
);
22644 which
|= (e
< nelt
? 1 : 2);
  /* For all elements from the second vector, fold the elements to the
     first one.  */
22651 for (i
= 0; i
< nelt
; ++i
)
22654 /* Check whether the mask can be applied to the vector type. */
22655 d
.one_operand_p
= (which
!= 3);
22657 /* Implementable with shufps, pshufd or pshuflw. */
22658 if (d
.one_operand_p
22659 && (d
.vmode
== V4SFmode
|| d
.vmode
== V2SFmode
22660 || d
.vmode
== V4SImode
|| d
.vmode
== V2SImode
22661 || d
.vmode
== V4HImode
|| d
.vmode
== V2HImode
))
22664 /* Otherwise we have to go through the motions and see if we can
22665 figure out how to generate the requested permutation. */
22666 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
22667 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
22668 if (!d
.one_operand_p
)
22669 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
22672 bool ret
= ix86_expand_vec_perm_const_1 (&d
);
22678 two_args
= canonicalize_perm (&d
);
22680 /* If one of the operands is a zero vector, try to match pmovzx. */
22681 if (two_args
&& (d
.op0
== CONST0_RTX (vmode
) || d
.op1
== CONST0_RTX (vmode
)))
22683 struct expand_vec_perm_d dzero
= d
;
22684 if (d
.op0
== CONST0_RTX (vmode
))
22686 d
.op1
= dzero
.op1
= force_reg (vmode
, d
.op1
);
22687 std::swap (dzero
.op0
, dzero
.op1
);
22688 for (i
= 0; i
< nelt
; ++i
)
22689 dzero
.perm
[i
] ^= nelt
;
22692 d
.op0
= dzero
.op0
= force_reg (vmode
, d
.op0
);
22694 if (expand_vselect_vconcat (dzero
.target
, dzero
.op0
, dzero
.op1
,
22695 dzero
.perm
, nelt
, dzero
.testing_p
))
22699 /* Force operands into registers. */
22700 rtx nop0
= force_reg (vmode
, d
.op0
);
22701 if (d
.op0
== d
.op1
)
22704 d
.op1
= force_reg (vmode
, d
.op1
);
22706 if (ix86_expand_vec_perm_const_1 (&d
))
  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
22713 if (two_args
&& d
.one_operand_p
)
22715 d
.one_operand_p
= false;
22716 memcpy (d
.perm
, perm
, sizeof (perm
));
22717 return ix86_expand_vec_perm_const_1 (&d
);
void
ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  for (i = 0; i < nelt; ++i)
    d.perm[i] = i * 2 + odd;

  /* We'll either be able to implement the permutation directly...  */
  if (expand_vec_perm_1 (&d))
    return;

  /* ... or we use the special-case patterns.  */
  expand_vec_perm_even_odd_1 (&d, odd);
}
void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
  struct expand_vec_perm_d d;
  unsigned i, nelt, base;
  bool ok;

  d.target = targ;
  d.op0 = op0;
  d.op1 = op1;
  d.vmode = GET_MODE (targ);
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.one_operand_p = false;
  d.testing_p = false;

  base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      d.perm[i * 2] = i + base;
      d.perm[i * 2 + 1] = i + base + nelt;
    }

  /* Note that for AVX this isn't one instruction.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
/* This function is similar to ix86_expand_vecop_qihi, but optimized
   under AVX512BW by using vpmovwb.  For example, optimize vector MUL
   generation like

	vpmovzxbw ymm2, xmm0
	vpmovzxbw ymm3, xmm1
	vpmullw   ymm4, ymm2, ymm3
	vpmovwb   xmm0, ymm4

   as it takes fewer instructions than ix86_expand_vecop_qihi.
   Return true if success.  */

bool
ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode himode, qimode = GET_MODE (dest);
  rtx hop1, hop2, hdest;
  rtx (*gen_extend) (rtx, rtx);
  rtx (*gen_truncate) (rtx, rtx);
  bool uns_p = (code == ASHIFTRT) ? false : true;

  /* There's no V64HImode multiplication instruction.  */
  if (qimode == E_V64QImode)
    return false;

  /* vpmovwb only available under AVX512BW.  */
  if (!TARGET_AVX512BW)
    return false;
  if ((qimode == V8QImode || qimode == V16QImode)
      && !TARGET_AVX512VL)
    return false;
  /* Do not generate zmm instructions when 128/256-bit vector width
     is preferred.  */
  if (qimode == V32QImode
      && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
    return false;

  switch (qimode)
    {
    case E_V8QImode:
      himode = V8HImode;
      gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
      gen_truncate = gen_truncv8hiv8qi2;
      break;
    case E_V16QImode:
      himode = V16HImode;
      gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
      gen_truncate = gen_truncv16hiv16qi2;
      break;
    case E_V32QImode:
      himode = V32HImode;
      gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
      gen_truncate = gen_truncv32hiv32qi2;
      break;
    default:
      gcc_unreachable ();
    }

  hop1 = gen_reg_rtx (himode);
  hop2 = gen_reg_rtx (himode);
  hdest = gen_reg_rtx (himode);
  emit_insn (gen_extend (hop1, op1));
  emit_insn (gen_extend (hop2, op2));
  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
						      hop1, hop2)));
  emit_insn (gen_truncate (dest, hdest));
  return true;
}
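
/* Illustrative sketch (not compiled): the intrinsics-level shape of the
   sequence ix86_expand_vecop_qihi2 emits for a V16QI multiply when
   AVX512BW and AVX512VL are available.  The helper name is hypothetical;
   assumes <immintrin.h>.  */
#if 0
#include <immintrin.h>

static __m128i
mul_v16qi_via_words (__m128i a, __m128i b)
{
  __m256i wa = _mm256_cvtepu8_epi16 (a);	/* vpmovzxbw */
  __m256i wb = _mm256_cvtepu8_epi16 (b);	/* vpmovzxbw */
  __m256i wp = _mm256_mullo_epi16 (wa, wb);	/* vpmullw */
  return _mm256_cvtepi16_epi8 (wp);		/* vpmovwb */
}
#endif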
/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true if success.  */

bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
				     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when the shift amount is greater than or equal to 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
  /* Record sign bit.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero upper/lower bits shift from left/right element.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_shift = ((code == ASHIFT)
		   ? gen_ashlv8hi3
		   : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_shift = ((code == ASHIFT)
		   ? gen_ashlv16hi3
		   : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_shift = ((code == ASHIFT)
		   ? gen_ashlv32hi3
		   : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform the operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
		  ix86_build_const_vector (qimode, true,
					   gen_int_mode (and_constant,
							 QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform the extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
		      ix86_build_const_vector (qimode, true,
					       gen_int_mode (xor_constant,
							     QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }

  return true;
}
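
/* Illustrative sketch (not compiled): the word-shift-plus-fixup trick used
   above, written with SSE2 intrinsics for a fixed shift amount of 3.  The
   and-mask ((1 << (8 - n)) - 1) clears the bits that leak in from the
   neighbouring byte, and for arithmetic right shifts the xor/sub pair
   ((r ^ k) - k with k = 1 << (7 - n)) re-establishes the sign.  The helper
   name is hypothetical; assumes <emmintrin.h>.  */
#if 0
#include <emmintrin.h>

static __m128i
ashiftrt_v16qi_by_3 (__m128i x)
{
  __m128i t = _mm_srli_epi16 (x, 3);			/* vpsrlw $3 */
  __m128i m = _mm_set1_epi8 ((char) ((1 << (8 - 3)) - 1));
  __m128i r = _mm_and_si128 (t, m);			/* vpand */
  __m128i k = _mm_set1_epi8 ((char) (1 << (8 - 3 - 1)));
  return _mm_sub_epi8 (_mm_xor_si128 (r, k), k);	/* vpxor + vpsubb */
}
#endif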
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  if (TARGET_AVX512BW
      && VECTOR_MODE_P (GET_MODE (op2))
      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl  */
      if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
	{
	  rtx tmp = force_reg (qimode, op2);
	  op2_l = gen_reg_rtx (himode);
	  op2_h = gen_reg_rtx (himode);
	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
	}
      else
	op2_l = op2_h = op2;

      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform vashr/vlshr/vashl.  */
  if (code != MULT
      && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
    {
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
			      simplify_gen_binary (code, himode,
						   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
			      simplify_gen_binary (code, himode,
						   op1_h, op2_h)));
    }
  /* Perform mult/ashr/lshr/ashl.  */
  else
    {
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
				   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
				   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remain the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
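
/* Illustrative sketch (not compiled): an intrinsics-level equivalent of the
   SSE2 path above for a V16QI multiply.  Each byte is interleaved with
   itself so that it sits in the low byte of a word, the multiply is done in
   V8HI, and the even (low) bytes of the products are gathered back.  The
   expander gathers them with a byte permutation; a mask-and-pack pair is
   used here instead for brevity.  The helper name is hypothetical; assumes
   <emmintrin.h>.  */
#if 0
#include <emmintrin.h>

static __m128i
mul_v16qi_sse2 (__m128i a, __m128i b)
{
  __m128i al = _mm_unpacklo_epi8 (a, a), ah = _mm_unpackhi_epi8 (a, a);
  __m128i bl = _mm_unpacklo_epi8 (b, b), bh = _mm_unpackhi_epi8 (b, b);
  __m128i pl = _mm_mullo_epi16 (al, bl);  /* low byte of each product is ok */
  __m128i ph = _mm_mullo_epi16 (ah, bh);
  __m128i m = _mm_set1_epi16 (0x00ff);
  return _mm_packus_epi16 (_mm_and_si128 (pl, m), _mm_and_si128 (ph, m));
}
#endif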
/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   if op is CONST_VECTOR with all odd elements equal to their
   preceding element.  */

static bool
const_vector_equal_evenodd_p (rtx op)
{
  machine_mode mode = GET_MODE (op);
  int i, nunits = GET_MODE_NUNITS (mode);
  if (GET_CODE (op) != CONST_VECTOR
      || nunits != CONST_VECTOR_NUNITS (op))
    return false;
  for (i = 0; i < nunits; i += 2)
    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
      return false;
  return true;
}
void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }

  emit_insn (x);
}
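
/* Illustrative check (not compiled) of the identity behind the PMULDQ-less
   path above: a signed 32x32->64 product equals the unsigned product minus
   the two "highpart" corrections that the pcmpgtd masks select.  The helper
   name is hypothetical.  */
#if 0
static long long
smul32_widen_from_umul (int a, int b)
{
  unsigned long long prod
    = (unsigned long long) (unsigned int) a * (unsigned int) b;
  unsigned long long hi = 0;
  if (a < 0)
    hi += (unsigned int) b;	/* contribution selected by s1 */
  if (b < 0)
    hi += (unsigned int) a;	/* contribution selected by s2 */
  return (long long) (prod - (hi << 32));
}
#endif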
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  high_p = false;
	}

      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V64QImode:
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
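
/* Illustrative sketch (not compiled): what the V4SImode low-half case above
   amounts to with SSE4.1, i.e. interleave the low elements with themselves
   and then use the even-element widening multiply (pmuldq).  The helper
   name is hypothetical; assumes <smmintrin.h>.  */
#if 0
#include <smmintrin.h>

static __m128i
smul_widen_lo_v4si (__m128i a, __m128i b)
{
  __m128i ta = _mm_unpacklo_epi32 (a, a);	/* { a0, a0, a1, a1 } */
  __m128i tb = _mm_unpacklo_epi32 (b, b);	/* { b0, b0, b1, b1 } */
  return _mm_mul_epi32 (ta, tb);		/* { a0*b0, a1*b1 } */
}
#endif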
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
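
/* Illustrative sketch (not compiled): the pmuludq/pshufd/punpckldq shape of
   the V4SI multiply emitted above, written with SSE2 intrinsics.  The
   helper name is hypothetical; assumes <emmintrin.h>.  */
#if 0
#include <emmintrin.h>

static __m128i
mul_v4si_sse2 (__m128i a, __m128i b)
{
  __m128i even = _mm_mul_epu32 (a, b);			/* elements 0 and 2 */
  __m128i odd = _mm_mul_epu32 (_mm_srli_epi64 (a, 32),
			       _mm_srli_epi64 (b, 32));	/* elements 1 and 3 */
  __m128i e32 = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
  __m128i o32 = _mm_shuffle_epi32 (odd, _MM_SHUFFLE (0, 0, 2, 0));
  return _mm_unpacklo_epi32 (e32, o32);
}
#endif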
void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all.  */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();

      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1),
		       gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
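
/* Illustrative sketch (not compiled): the three-pmuludq decomposition used
   by the generic path above, for V2DImode:
   lo64(a*b) = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32).
   The helper name is hypothetical; assumes <emmintrin.h>.  */
#if 0
#include <emmintrin.h>

static __m128i
mul_v2di_sse2 (__m128i a, __m128i b)
{
  __m128i ah = _mm_srli_epi64 (a, 32);
  __m128i bh = _mm_srli_epi64 (b, 32);
  __m128i lolo = _mm_mul_epu32 (a, b);		/* lo(a) * lo(b) */
  __m128i hilo = _mm_mul_epu32 (ah, b);		/* hi(a) * lo(b) */
  __m128i lohi = _mm_mul_epu32 (a, bh);		/* lo(a) * hi(b) */
  __m128i cross = _mm_add_epi64 (hilo, lohi);
  return _mm_add_epi64 (lolo, _mm_slli_epi64 (cross, 32));
}
#endif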
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */

bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a <rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }

  return false;
}
/* Calculate integer abs() using only SSE2 instructions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead
	 of 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
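
/* Illustrative check (not compiled) of the scalar identities behind the
   expansions above; helper names are hypothetical.  */
#if 0
static int
abs32_via_shift (int x)
{
  int m = x >> 31;		/* 0 or -1 */
  return (x ^ m) - m;
}

static short
abs16_via_max (short x)
{
  short n = -x;			/* pmaxsw picks the non-negative one */
  return x > n ? x : n;
}

static unsigned char
abs8_via_umin (signed char x)
{
  unsigned char a = (unsigned char) x, b = (unsigned char) -x;
  return a < b ? a : b;		/* pminub picks the absolute value */
}
#endif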
/* Expand an extract from a vector register through pextr insn.
   Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size - 1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
/* All CPUs prefer to avoid cross-lane operations so perform reductions
   upper against lower halves up to SSE reg size.  */

static machine_mode
ix86_split_reduction (machine_mode mode)
{
  /* Reduce lowpart against highpart until we reach SSE reg width to
     avoid cross-lane operations.  */
  switch (mode)
    {
    ...
    }
}
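
/* Illustrative sketch (not compiled): the effect of splitting a reduction
   at SSE register width, e.g. a V8SF add reduction folds the upper 128-bit
   half onto the lower one first and only then reduces within the 128-bit
   register.  The helper name is hypothetical; assumes AVX via
   <immintrin.h>.  */
#if 0
#include <immintrin.h>

static float
reduce_add_v8sf (__m256 v)
{
  __m128 lo = _mm256_castps256_ps128 (v);
  __m128 hi = _mm256_extractf128_ps (v, 1);
  __m128 s = _mm_add_ps (lo, hi);		/* fold 256 -> 128 bits */
  s = _mm_add_ps (s, _mm_movehl_ps (s, s));	/* fold upper 64 bits */
  s = _mm_add_ss (s, _mm_shuffle_ps (s, s, 1));	/* fold element 1 */
  return _mm_cvtss_f32 (s);
}
#endif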
/* Generate call to __divmoddi4.  */

void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}
void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
				  enum rtx_code code, bool after,
				  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
				     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
				   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
			    gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
					  SImode),
			    doubleword, loop_label);
}
/* Relax cmpxchg instruction, param loop_label indicates whether
   the instruction should be relaxed with a pause loop.  If not,
   it will be relaxed to an atomic load + compare, and the
   cmpxchg instruction is skipped if mem != exp_input.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
			  rtx mem, rtx exp_input, rtx new_input,
			  rtx mem_model, bool doubleword,
			  rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
	{
	  gendw = gen_atomic_compare_and_swapdi_doubleword;
	  hmode = SImode;
	}
      else
	gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value.  */
  if (doubleword)
    {
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
			     GET_MODE (exp_input), 1, cmp_label,
			     profile_probability::guessed_never ());

  /* Directly emit cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
		      gen_lowpart (hmode, new_input),
		      gen_highpart (hmode, new_input),
		      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
    {
      emit_jump_insn (gen_jump (done_label));
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_label (done_label);
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
    }
  else
    {
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
      emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
			       GET_MODE (target_bool), 1, loop_label,
			       profile_probability::guessed_never ());
      emit_jump_insn (gen_jump (done_label));

      /* If mem is not expected, pause and loop back.  */
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_insn (gen_pause ());
      emit_jump_insn (gen_jump (loop_label));
      emit_label (done_label);
    }

  *ptarget_bool = target_bool;
}
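
/* Illustrative sketch (not compiled): the relaxation described above at the
   source level.  A plain atomic load screens out values that cannot match,
   so the expensive locked cmpxchg only runs when it has a chance to
   succeed; with a retry label the failure path pauses and loops.  The
   helper name is hypothetical.  */
#if 0
static bool
relaxed_compare_exchange_u32 (unsigned int *mem, unsigned int *expected,
			      unsigned int desired)
{
  for (;;)
    {
      unsigned int cur = __atomic_load_n (mem, __ATOMIC_SEQ_CST);
      if (cur != *expected)
	{
	  *expected = cur;	/* skip cmpxchg when mem != exp_input */
	  return false;
	}
      if (__atomic_compare_exchange_n (mem, expected, desired, false,
				       __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
	return true;
      __builtin_ia32_pause ();	/* pause, then loop back and retry */
    }
}
#endif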
#include "gt-i386-expand.h"