-/* Copyright (C) 1988-2019 Free Software Foundation, Inc.
+/* Copyright (C) 1988-2021 Free Software Foundation, Inc.
This file is part of GCC.
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
-#include "params.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
{
machine_mode half_mode;
unsigned int byte;
+ rtx mem_op = NULL_RTX;
+ int mem_num = 0;
switch (mode)
{
case E_DImode:
half_mode = SImode;
break;
+ case E_P2HImode:
+ half_mode = HImode;
+ break;
+ case E_P2QImode:
+ half_mode = QImode;
+ break;
default:
gcc_unreachable ();
}
but we still have to handle it. */
if (MEM_P (op))
{
- lo_half[num] = adjust_address (op, half_mode, 0);
- hi_half[num] = adjust_address (op, half_mode, byte);
+ if (mem_op && rtx_equal_p (op, mem_op))
+ {
+ lo_half[num] = lo_half[mem_num];
+ hi_half[num] = hi_half[mem_num];
+ }
+ else
+ {
+ mem_op = op;
+ mem_num = num;
+ lo_half[num] = adjust_address (op, half_mode, 0);
+ hi_half[num] = adjust_address (op, half_mode, byte);
+ }
}
else
{
op0 = operands[0];
op1 = operands[1];
+ /* Avoid complex sets of likely spilled hard registers before reload. */
+ if (!ix86_hardreg_mov_ok (op0, op1))
+ {
+ tmp = gen_reg_rtx (mode);
+ operands[0] = tmp;
+ ix86_expand_move (mode, operands);
+ operands[0] = op0;
+ operands[1] = tmp;
+ op1 = tmp;
+ }
+
switch (GET_CODE (op1))
{
case CONST:
rtx src2 = operands[2];
/* Both source operands cannot be in memory. */
- if (MEM_P (src1) && MEM_P (src2))
+ if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
+ && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
return false;
/* Canonicalize operand order for commutative operators. */
rtx_insn *insn;
rtx scratch, tmp0, tmp1, tmp2;
rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
- rtx (*gen_zero_extend) (rtx, rtx);
- rtx (*gen_test_ccno_1) (rtx, rtx);
switch (mode)
{
else
gen_divmod4_1
= unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
- gen_zero_extend = gen_zero_extendqisi2;
}
else
- {
- gen_divmod4_1
- = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
- gen_zero_extend = gen_zero_extendqidi2;
- }
- gen_test_ccno_1 = gen_testsi_ccno_1;
+ gen_divmod4_1
+ = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
break;
+
case E_DImode:
gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
- gen_test_ccno_1 = gen_testdi_ccno_1;
- gen_zero_extend = gen_zero_extendqidi2;
break;
+
default:
gcc_unreachable ();
}
emit_move_insn (scratch, operands[2]);
scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
scratch, 1, OPTAB_DIRECT);
- emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
+ emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
JUMP_LABEL (insn) = qimode_label;
/* Generate original signed/unsigned divmod.  */
- div = gen_divmod4_1 (operands[0], operands[1],
- operands[2], operands[3]);
- emit_insn (div);
+ emit_insn (gen_divmod4_1 (operands[0], operands[1],
+ operands[2], operands[3]));
/* Branch to the end. */
emit_jump_insn (gen_jump (end_label));
}
/* Extract remainder from AH. */
- tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
- tmp0, GEN_INT (8), GEN_INT (8));
- if (REG_P (operands[1]))
- insn = emit_move_insn (operands[1], tmp1);
- else
- {
- /* Need a new scratch register since the old one has result
- of 8bit divide. */
- scratch = gen_reg_rtx (GET_MODE (operands[1]));
- emit_move_insn (scratch, tmp1);
- insn = emit_move_insn (operands[1], scratch);
- }
+ scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
+ tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
+ GEN_INT (8), GEN_INT (8));
+ insn = emit_move_insn (operands[1], tmp1);
set_unique_reg_note (insn, REG_EQUAL, mod);
/* Zero extend quotient from AL. */
tmp1 = gen_lowpart (QImode, tmp0);
- insn = emit_insn (gen_zero_extend (operands[0], tmp1));
+ insn = emit_insn (gen_extend_insn
+ (operands[0], tmp1,
+ GET_MODE (operands[0]), QImode, 1));
set_unique_reg_note (insn, REG_EQUAL, div);
emit_label (end_label);
if (regno0 != regno2)
emit_insn (gen_rtx_SET (target, parts.index));
- /* Use shift for scaling. */
- ix86_emit_binop (ASHIFT, mode, target,
- GEN_INT (exact_log2 (parts.scale)));
+ /* Use shift for scaling, but emit it as MULT instead
+ to avoid it being immediately peephole2 optimized back
+ into lea. */
+ ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
if (parts.base)
ix86_emit_binop (PLUS, mode, target, parts.base);
gcc_unreachable ();
}
+static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
+
/* Convert an unsigned SImode value into a DFmode. Only currently used
for SSE, but applicable anywhere. */
x = const_double_from_real_value (TWO31r, DFmode);
x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
+
+ /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
+ if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
+ x = ix86_expand_sse_fabs (x, NULL);
+
if (x != target)
emit_move_insn (target, x);
}
OPTAB_DIRECT);
else
{
- rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
+ rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
two31 = ix86_build_const_vector (intmode, 1, two31);
*xorp = expand_simple_binop (intmode, AND,
gen_lowpart (intmode, tmp[0]),
machine_mode vmode = mode;
rtvec par;
- if (vector_mode)
- use_sse = true;
- else if (mode == TFmode)
+ if (vector_mode || mode == TFmode)
use_sse = true;
else if (TARGET_SSE_MATH)
{
Create the appropriate mask now. */
mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
use = gen_rtx_USE (VOIDmode, mask);
- if (vector_mode)
+ if (vector_mode || mode == TFmode)
par = gen_rtvec (2, set, use);
else
{
switch (code)
{
- case GT:
- case GE:
case LT:
case LE:
+ case GT:
+ case GE:
+ case LTGT:
return false;
case EQ:
case NE:
- case LTGT:
case UNORDERED:
case ORDERED:
case UNLT:
{
gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
- /* We may be reversing unordered compare to normal compare, that
- is not valid in general (we may convert non-trapping condition
- to trapping one), however on i386 we currently emit all
- comparisons unordered. */
- new_code = reverse_condition_maybe_unordered (code);
+ /* We may be reversing a non-trapping
+ comparison to a trapping comparison. */
+ if (HONOR_NANS (cmp_mode) && flag_trapping_math
+ && code != EQ && code != NE
+ && code != ORDERED && code != UNORDERED)
+ new_code = UNKNOWN;
+ else
+ new_code = reverse_condition_maybe_unordered (code);
}
else
new_code = ix86_reverse_condition (code, cmp_mode);
}
if (cf != 0)
{
- tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
+ tmp = plus_constant (mode, tmp, cf);
nops++;
}
if (!rtx_equal_p (tmp, out))
{
gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
- /* We may be reversing unordered compare to normal compare,
- that is not valid in general (we may convert non-trapping
- condition to trapping one), however on i386 we currently
- emit all comparisons unordered. */
- new_code = reverse_condition_maybe_unordered (code);
+ /* We may be reversing a non-trapping
+ comparison to a trapping comparison. */
+ if (HONOR_NANS (cmp_mode) && flag_trapping_math
+ && code != EQ && code != NE
+ && code != ORDERED && code != UNORDERED)
+ new_code = UNKNOWN;
+ else
+ new_code = reverse_condition_maybe_unordered (code);
+
}
else
{
{
var = operands[2];
if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
- operands[2] = constm1_rtx, op = and_optab;
+ {
+ /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
+ "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
+ if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
+ operands[1] = simplify_gen_relational (LT, VOIDmode,
+ GET_MODE (op0),
+ op0, const0_rtx);
+
+ operands[2] = constm1_rtx;
+ op = and_optab;
+ }
else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
operands[2] = const0_rtx, op = ior_optab;
else
return true;
}
+/* Return true if MODE is valid for vector compare to mask register.
+   Same result for conditional vector move with mask register.  */
+static bool
+ix86_valid_mask_cmp_mode (machine_mode mode)
+{
+  /* XOP has its own vector conditional movement.  */
+  if (TARGET_XOP && !TARGET_AVX512F)
+    return false;
+
+  /* AVX512F is needed for mask operation.  */
+  if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
+    return false;
+
+  /* AVX512BW is needed for vector QI/HImode,
+     AVX512VL is needed for 128/256-bit vector.  */
+  machine_mode inner_mode = GET_MODE_INNER (mode);
+  int vector_size = GET_MODE_SIZE (mode);
+  if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
+    return false;
+
+  /* 512-bit vectors always qualify; smaller ones need AVX512VL.  */
+  return vector_size == 64 || TARGET_AVX512VL;
+}
+
+/* Return true if an integer mask (kN register) comparison should be
+   used for a vector compare of CMP_MODE operands whose selected values
+   OP_TRUE/OP_FALSE have mode MODE.  OP_TRUE and OP_FALSE may both be
+   NULL when only the comparison result itself is wanted.  */
+static bool
+ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
+		     rtx op_true, rtx op_false)
+{
+  /* 512-bit vector comparisons only exist in mask-register form.  */
+  if (GET_MODE_SIZE (mode) == 64)
+    return true;
+
+  /* When op_true is NULL, op_false must be NULL, or vice versa.  */
+  gcc_assert (!op_true == !op_false);
+
+  /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
+     vector dest is required.  */
+  if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
+    return false;
+
+  /* Exclude those that could be optimized in ix86_expand_sse_movcc:
+     blends against all-zeros or (for integer modes) all-ones have
+     cheaper non-mask expansions.  */
+  if (op_false == CONST0_RTX (mode)
+      || op_true == CONST0_RTX (mode)
+      || (INTEGRAL_MODE_P (mode)
+	  && (op_true == CONSTM1_RTX (mode)
+	      || op_false == CONSTM1_RTX (mode))))
+    return false;
+
+  return true;
+}
+
/* Expand an SSE comparison. Return the register with the result. */
static rtx
bool maskcmp = false;
rtx x;
- if (GET_MODE_SIZE (cmp_ops_mode) == 64)
+ if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
{
unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
- cmp_mode = int_mode_for_size (nbits, 0).require ();
maskcmp = true;
+ cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
}
else
cmp_mode = cmp_ops_mode;
|| (op_false && reg_overlap_mentioned_p (dest, op_false)))
dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
- /* Compare patterns for int modes are unspec in AVX512F only. */
- if (maskcmp && (code == GT || code == EQ))
+ if (maskcmp)
{
- rtx (*gen)(rtx, rtx, rtx);
-
- switch (cmp_ops_mode)
- {
- case E_V64QImode:
- gcc_assert (TARGET_AVX512BW);
- gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
- break;
- case E_V32HImode:
- gcc_assert (TARGET_AVX512BW);
- gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
- break;
- case E_V16SImode:
- gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
- break;
- case E_V8DImode:
- gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
- break;
- default:
- gen = NULL;
- }
-
- if (gen)
- {
- emit_insn (gen (dest, cmp_op0, cmp_op1));
- return dest;
- }
+ bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
+ gcc_assert (ok);
+ return dest;
}
+
x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
- if (cmp_mode != mode && !maskcmp)
+ if (cmp_mode != mode)
{
x = force_reg (cmp_ops_mode, x);
convert_move (dest, x, false);
machine_mode mode = GET_MODE (dest);
machine_mode cmpmode = GET_MODE (cmp);
- /* In AVX512F the result of comparison is an integer mask. */
- bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
+ /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
+ if (rtx_equal_p (op_true, op_false))
+ {
+ emit_move_insn (dest, op_true);
+ return;
+ }
rtx t2, t3, x;
cmp = gen_rtx_SUBREG (mode, cmp, 0);
}
- if (maskcmp)
+ /* In AVX512F the result of comparison is an integer mask. */
+ if (mode != cmpmode
+ && GET_MODE_CLASS (cmpmode) == MODE_INT)
{
- rtx (*gen) (rtx, rtx) = NULL;
- if ((op_true == CONST0_RTX (mode)
- && vector_all_ones_operand (op_false, mode))
- || (op_false == CONST0_RTX (mode)
- && vector_all_ones_operand (op_true, mode)))
- switch (mode)
- {
- case E_V64QImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_cvtmask2bv64qi;
- break;
- case E_V32QImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2bv32qi;
- break;
- case E_V16QImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2bv16qi;
- break;
- case E_V32HImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_cvtmask2wv32hi;
- break;
- case E_V16HImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2wv16hi;
- break;
- case E_V8HImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2wv8hi;
- break;
- case E_V16SImode:
- if (TARGET_AVX512DQ)
- gen = gen_avx512f_cvtmask2dv16si;
- break;
- case E_V8SImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2dv8si;
- break;
- case E_V4SImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2dv4si;
- break;
- case E_V8DImode:
- if (TARGET_AVX512DQ)
- gen = gen_avx512f_cvtmask2qv8di;
- break;
- case E_V4DImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2qv4di;
- break;
- case E_V2DImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2qv2di;
- break;
- default:
- break;
- }
- if (gen && SCALAR_INT_MODE_P (cmpmode))
- {
- cmp = force_reg (cmpmode, cmp);
- if (op_true == CONST0_RTX (mode))
- {
- rtx (*gen_not) (rtx, rtx);
- switch (cmpmode)
- {
- case E_QImode: gen_not = gen_knotqi; break;
- case E_HImode: gen_not = gen_knothi; break;
- case E_SImode: gen_not = gen_knotsi; break;
- case E_DImode: gen_not = gen_knotdi; break;
- default: gcc_unreachable ();
- }
- rtx n = gen_reg_rtx (cmpmode);
- emit_insn (gen_not (n, cmp));
- cmp = n;
- }
- emit_insn (gen (dest, cmp));
- return;
+ gcc_assert (ix86_valid_mask_cmp_mode (mode));
+ /* Using vector move with mask register. */
+ cmp = force_reg (cmpmode, cmp);
+ /* Optimize for mask zero. */
+ op_true = (op_true != CONST0_RTX (mode)
+ ? force_reg (mode, op_true) : op_true);
+ op_false = (op_false != CONST0_RTX (mode)
+ ? force_reg (mode, op_false) : op_false);
+ if (op_true == CONST0_RTX (mode))
+ {
+ rtx n = gen_reg_rtx (cmpmode);
+ if (cmpmode == E_DImode && !TARGET_64BIT)
+ emit_insn (gen_knotdi (n, cmp));
+ else
+ emit_insn (gen_rtx_SET (n, gen_rtx_fmt_e (NOT, cmpmode, cmp)));
+ cmp = n;
+ /* Reverse op_true op_false. */
+ std::swap (op_true, op_false);
}
+
+ rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
+ emit_insn (gen_rtx_SET (dest, vec_merge));
+ return;
}
else if (vector_all_ones_operand (op_true, mode)
&& op_false == CONST0_RTX (mode))
/* Expand AVX-512 vector comparison. */
bool
-ix86_expand_mask_vec_cmp (rtx operands[])
+ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
{
- machine_mode mask_mode = GET_MODE (operands[0]);
- machine_mode cmp_mode = GET_MODE (operands[2]);
- enum rtx_code code = GET_CODE (operands[1]);
+ machine_mode mask_mode = GET_MODE (dest);
+ machine_mode cmp_mode = GET_MODE (cmp_op0);
rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
int unspec_code;
rtx unspec;
unspec_code = UNSPEC_PCMP;
}
- unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
- operands[3], imm),
+ unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
unspec_code);
- emit_insn (gen_rtx_SET (operands[0], unspec));
+ emit_insn (gen_rtx_SET (dest, unspec));
return true;
}
}
else
cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
- operands[1], operands[2]);
+ NULL, NULL);
if (operands[0] != cmp)
emit_move_insn (operands[0], cmp);
&& (mode == V16QImode || mode == V8HImode
|| mode == V4SImode || mode == V2DImode))
;
+  /* AVX512F supports all of the comparisons
+     on all 128/256/512-bit vector int types.  */
+ else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
+ ;
else
{
/* Canonicalize the comparison to EQ, GT, GTU. */
static void
-expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
+expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
rtx destptr, rtx srcptr, rtx value,
rtx count, machine_mode mode, int unroll,
int expected_size, bool issetmem)
Other arguments have same meaning as for previous function. */
static void
-expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
+expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
rtx destptr, rtx srcptr, rtx value, rtx orig_value,
rtx count,
machine_mode mode, bool issetmem)
/* If possible, it is shorter to use rep movs.
TODO: Maybe it is better to move this logic to decide_alg. */
if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
+ && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
&& (!issetmem || orig_value == const0_rtx))
mode = SImode;
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
HOST_WIDE_INT size_to_move)
{
- rtx dst = destmem, src = *srcmem, adjust, tempreg;
+ rtx dst = destmem, src = *srcmem, tempreg;
enum insn_code code;
machine_mode move_mode;
int piece_size, i;
/* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
gcc_assert (size_to_move % piece_size == 0);
- adjust = GEN_INT (piece_size);
+
for (i = 0; i < size_to_move; i += piece_size)
{
/* We move from memory to memory, so we'll need to do it via
emit_insn (GEN_FCN (code) (dst, tempreg));
emit_move_insn (destptr,
- gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+ plus_constant (Pmode, copy_rtx (destptr), piece_size));
emit_move_insn (srcptr,
- gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
+ plus_constant (Pmode, copy_rtx (srcptr), piece_size));
dst = adjust_automodify_address_nv (dst, move_mode, destptr,
piece_size);
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
static void
-expand_movmem_epilogue (rtx destmem, rtx srcmem,
+expand_cpymem_epilogue (rtx destmem, rtx srcmem,
rtx destptr, rtx srcptr, rtx count, int max_size)
{
rtx src, dest;
{
count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
count, 1, OPTAB_DIRECT);
- expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
+ expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
count, QImode, 1, 4, false);
return;
}
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
HOST_WIDE_INT size_to_move)
{
- rtx dst = destmem, adjust;
+ rtx dst = destmem;
enum insn_code code;
machine_mode move_mode;
int piece_size, i;
/* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
gcc_assert (size_to_move % piece_size == 0);
- adjust = GEN_INT (piece_size);
+
for (i = 0; i < size_to_move; i += piece_size)
{
if (piece_size <= GET_MODE_SIZE (word_mode))
emit_insn (GEN_FCN (code) (dst, promoted_val));
emit_move_insn (destptr,
- gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+ plus_constant (Pmode, copy_rtx (destptr), piece_size));
dst = adjust_automodify_address_nv (dst, move_mode, destptr,
piece_size);
{
count = expand_simple_binop (counter_mode (count), AND, count,
GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
- expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
+ expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
gen_lowpart (QImode, value), count, QImode,
1, max_size / 2, true);
}
Return value is updated DESTMEM. */
static rtx
-expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
+expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
rtx destptr, rtx srcptr, rtx value,
rtx vec_value, rtx count, int align,
int desired_alignment, bool issetmem)
or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
and jump to DONE_LABEL. */
static void
-expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
+expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
rtx destptr, rtx srcptr,
rtx value, rtx vec_value,
rtx count, int size,
done_label:
*/
static void
-expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
+expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
rtx *destptr, rtx *srcptr,
machine_mode mode,
rtx value, rtx vec_value,
/* Handle sizes > 3. */
for (;size2 > 2; size2 >>= 1)
- expand_small_movmem_or_setmem (destmem, srcmem,
+ expand_small_cpymem_or_setmem (destmem, srcmem,
*destptr, *srcptr,
value, vec_value,
*count,
is returned, but also of SRC, which is passed as a pointer for that
reason. */
static rtx
-expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
+expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
rtx srcreg, rtx value, rtx vec_value,
int desired_align, int align_bytes,
bool issetmem)
else if (!any_alg_usable_p)
break;
}
- else if (alg_usable_p (candidate, memset, have_as))
+ else if (alg_usable_p (candidate, memset, have_as)
+ && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
+ && candidate == rep_prefix_1_byte
+ /* NB: If min_size != max_size, size is
+ unknown. */
+ && min_size != max_size))
{
*noalign = algs->size[i].noalign;
return candidate;
desired_align = GET_MODE_SIZE (move_mode);
/* PentiumPro has special logic triggering for 8 byte aligned blocks.
copying whole cacheline at once. */
- if (TARGET_PENTIUMPRO
+ if (TARGET_CPU_P (PENTIUMPRO)
&& (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
desired_align = 8;
rtx reg = convert_modes (mode, QImode, val, true);
if (!TARGET_PARTIAL_REG_STALL)
- if (mode == SImode)
- emit_insn (gen_insvsi_1 (reg, reg));
- else
- emit_insn (gen_insvdi_1 (reg, reg));
+ emit_insn (gen_insv_1 (mode, reg, reg));
else
{
tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
with specified algorithm. */
bool
-ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
+ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
rtx align_exp, rtx expected_align_exp,
rtx expected_size_exp, rtx min_size_exp,
rtx max_size_exp, rtx probable_max_size_exp,
&& optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
move_mode = wider_mode;
- if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
+ if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
move_mode = TImode;
/* Find the corresponding vector mode with the same size as MOVE_MODE.
if (misaligned_prologue_used)
{
/* Misaligned move prologue handled small blocks by itself. */
- expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
+ expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
(dst, src, &destreg, &srcreg,
move_mode, promoted_val, vec_promoted_val,
&count_exp,
dst = change_address (dst, BLKmode, destreg);
if (!issetmem)
src = change_address (src, BLKmode, srcreg);
- dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
+ dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
promoted_val, vec_promoted_val,
count_exp, align, desired_align,
issetmem);
{
/* If we know how many bytes need to be stored before dst is
sufficiently aligned, maintain aliasing info accurately. */
- dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
+ dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
srcreg,
promoted_val,
vec_promoted_val,
case loop_1_byte:
case loop:
case unrolled_loop:
- expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
+ expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
count_exp, move_mode, unroll_factor,
expected_size, issetmem);
break;
case vector_loop:
- expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
+ expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
vec_promoted_val, count_exp, move_mode,
unroll_factor, expected_size, issetmem);
break;
case rep_prefix_8_byte:
case rep_prefix_4_byte:
case rep_prefix_1_byte:
- expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
+ expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
val_exp, count_exp, move_mode, issetmem);
break;
}
vec_promoted_val, count_exp,
epilogue_size_needed);
else
- expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
+ expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
epilogue_size_needed);
}
}
return true;
}
+/* Expand cmpstrn or memcmp to a "repz cmpsb" sequence.  RESULT receives
+   the signed comparison outcome (SImode), SRC1/SRC2 are the memory
+   operands, LENGTH the byte count and ALIGN the known alignment.
+   IS_CMPSTRN distinguishes strncmp from memcmp.  Return true if the
+   expansion was emitted, false to fall back to a library call.  */
+
+bool
+ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
+			       rtx length, rtx align, bool is_cmpstrn)
+{
+  /* Expand strncmp and memcmp only with -minline-all-stringops since
+     "repz cmpsb" can be much slower than strncmp and memcmp functions
+     implemented with vector instructions, see
+
+     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
+  */
+  if (!TARGET_INLINE_ALL_STRINGOPS)
+    return false;
+
+  /* Can't use this if the user has appropriated ecx, esi or edi.  */
+  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
+    return false;
+
+  if (is_cmpstrn)
+    {
+      /* For strncmp, length is the maximum length, which can be larger
+	 than actual string lengths.  We can expand the cmpstrn pattern
+	 to "repz cmpsb" only if one of the strings is a constant so
+	 that expand_builtin_strncmp() can write the length argument to
+	 be the minimum of the const string length and the actual length
+	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
+      tree t1 = MEM_EXPR (src1);
+      tree t2 = MEM_EXPR (src2);
+      if (!((t1 && TREE_CODE (t1) == MEM_REF
+	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
+	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
+		 == STRING_CST))
+	    || (t2 && TREE_CODE (t2) == MEM_REF
+		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
+		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
+		    == STRING_CST))))
+	return false;
+    }
+
+  /* Materialize both addresses in registers, as required by the
+     cmpstrnqi patterns.  */
+  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
+  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
+  if (addr1 != XEXP (src1, 0))
+    src1 = replace_equiv_address_nv (src1, addr1);
+  if (addr2 != XEXP (src2, 0))
+    src2 = replace_equiv_address_nv (src2, addr2);
+
+  /* NB: Make a copy of the data length to avoid changing the original
+     data length by cmpstrnqi patterns.  */
+  length = ix86_zero_extend_to_Pmode (length);
+  rtx lengthreg = gen_reg_rtx (Pmode);
+  emit_move_insn (lengthreg, length);
+
+  /* If we are testing strict equality, we can use known alignment to
+     good advantage.  This may be possible with combine, particularly
+     once cc0 is dead.  */
+  if (CONST_INT_P (length))
+    {
+      /* Comparing zero bytes: the result is trivially equal.  */
+      if (length == const0_rtx)
+	{
+	  emit_move_insn (result, const0_rtx);
+	  return true;
+	}
+      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
+				     src1, src2));
+    }
+  else
+    {
+      /* Variable length: test for zero first, then compare.  */
+      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
+      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
+				  src1, src2));
+    }
+
+  /* Materialize the flags as -1/0/1 in the low byte, then sign-extend
+     into the full SImode result.  */
+  rtx out = gen_lowpart (QImode, result);
+  emit_insn (gen_cmpintqi (out));
+  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
+
+  return true;
+}
/* Expand the appropriate insns for doing strlen if not just doing
repnz; scasb
reg,
tmpreg)));
/* Emit lea manually to avoid clobbering of flags. */
- emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
+ emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
}
else if (!TARGET_PECOFF && !TARGET_MACHO)
{
- if (TARGET_64BIT)
+ if (TARGET_64BIT
+ && ix86_cmodel == CM_LARGE_PIC
+ && DEFAULT_ABI != MS_ABI)
+ {
+ fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
+ UNSPEC_GOT);
+ fnaddr = gen_rtx_CONST (Pmode, fnaddr);
+ fnaddr = force_reg (Pmode, fnaddr);
+ fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
+ }
+ else if (TARGET_64BIT)
{
fnaddr = gen_rtx_UNSPEC (Pmode,
gen_rtvec (1, addr),
enum rtx_code sub_code)
{
rtx pat;
- int i;
- int nargs;
+ unsigned int i, nargs;
bool comparison_p = false;
bool tf_p = false;
bool last_arg_constant = false;
int num_memory = 0;
- struct {
- rtx op;
- machine_mode mode;
- } args[4];
+ rtx xops[4];
machine_mode tmode = insn_data[icode].operand[0].mode;
else if (memory_operand (target, tmode))
num_memory++;
- gcc_assert (nargs <= 4);
+ gcc_assert (nargs <= ARRAY_SIZE (xops));
for (i = 0; i < nargs; i++)
{
op = force_reg (mode, op);
}
- args[i].op = op;
- args[i].mode = mode;
+ xops[i] = op;
}
switch (nargs)
{
case 1:
- pat = GEN_FCN (icode) (target, args[0].op);
+ pat = GEN_FCN (icode) (target, xops[0]);
break;
case 2:
if (tf_p)
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+ pat = GEN_FCN (icode) (target, xops[0], xops[1],
GEN_INT ((int)sub_code));
else if (! comparison_p)
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+ pat = GEN_FCN (icode) (target, xops[0], xops[1]);
else
{
rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
- args[0].op,
- args[1].op);
+ xops[0], xops[1]);
- pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
+ pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
}
break;
case 3:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
+ pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
break;
case 4:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
+ pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
break;
default:
if (VECTOR_MODE_P (mode1))
op1 = safe_vector_operand (op1, mode1);
- /* Swap operands if we have a comparison that isn't available in
- hardware. */
- if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
- std::swap (op0, op1);
-
target = gen_reg_rtx (SImode);
emit_move_insn (target, const0_rtx);
target = gen_rtx_SUBREG (QImode, target, 0);
unsigned int nargs_constant = 0;
unsigned int mask_pos = 0;
int num_memory = 0;
- struct
- {
- rtx op;
- machine_mode mode;
- } args[6];
+ rtx xops[6];
bool second_arg_count = false;
enum insn_code icode = d->icode;
const struct insn_data_d *insn_p = &insn_data[icode];
case USI_FTYPE_V32HI_V32HI_INT_USI:
case UHI_FTYPE_V16HI_V16HI_INT_UHI:
case UQI_FTYPE_V8HI_V8HI_INT_UQI:
- case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
- case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
- case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
- case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
- case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
- case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
- case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
- case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
- case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
nargs = 4;
mask_pos = 1;
nargs_constant = 1;
gcc_unreachable ();
}
- gcc_assert (nargs <= ARRAY_SIZE (args));
+ gcc_assert (nargs <= ARRAY_SIZE (xops));
if (comparison != UNKNOWN)
{
}
}
- args[i].op = op;
- args[i].mode = mode;
+ xops[i] = op;
}
switch (nargs)
{
case 1:
- pat = GEN_FCN (icode) (real_target, args[0].op);
+ pat = GEN_FCN (icode) (real_target, xops[0]);
break;
case 2:
- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
+ pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
break;
case 3:
- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
- args[2].op);
+ pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
break;
case 4:
- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
- args[2].op, args[3].op);
+ pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
+ xops[2], xops[3]);
break;
case 5:
- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
- args[2].op, args[3].op, args[4].op);
+ pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
+ xops[2], xops[3], xops[4]);
break;
case 6:
- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
- args[2].op, args[3].op, args[4].op,
- args[5].op);
+ pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
+ xops[2], xops[3], xops[4], xops[5]);
break;
default:
gcc_unreachable ();
{
rtx pat;
unsigned int i, nargs;
- struct
- {
- rtx op;
- machine_mode mode;
- } args[6];
+ rtx xops[6];
enum insn_code icode = d->icode;
const struct insn_data_d *insn_p = &insn_data[icode];
machine_mode tmode = insn_p->operand[0].mode;
case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
+ case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
+ case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
nargs = 5;
break;
case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
+ case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
+ case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
nargs_constant = 4;
nargs = 5;
break;
default:
gcc_unreachable ();
}
- gcc_assert (nargs <= ARRAY_SIZE (args));
+ gcc_assert (nargs <= ARRAY_SIZE (xops));
if (optimize
|| target == 0
}
}
- args[i].op = op;
- args[i].mode = mode;
+ xops[i] = op;
}
switch (nargs)
{
case 1:
- pat = GEN_FCN (icode) (target, args[0].op);
+ pat = GEN_FCN (icode) (target, xops[0]);
break;
case 2:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+ pat = GEN_FCN (icode) (target, xops[0], xops[1]);
break;
case 3:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
- args[2].op);
+ pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
break;
case 4:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
- args[2].op, args[3].op);
+ pat = GEN_FCN (icode) (target, xops[0], xops[1],
+ xops[2], xops[3]);
break;
case 5:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
- args[2].op, args[3].op, args[4].op);
+ pat = GEN_FCN (icode) (target, xops[0], xops[1],
+ xops[2], xops[3], xops[4]);
break;
case 6:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
- args[2].op, args[3].op, args[4].op,
- args[5].op);
+ pat = GEN_FCN (icode) (target, xops[0], xops[1],
+ xops[2], xops[3], xops[4], xops[5]);
break;
default:
gcc_unreachable ();
rtx pat, op;
unsigned int i, nargs, arg_adjust, memory;
bool aligned_mem = false;
- struct
- {
- rtx op;
- machine_mode mode;
- } args[3];
+ rtx xops[3];
enum insn_code icode = d->icode;
- bool last_arg_constant = false;
const struct insn_data_d *insn_p = &insn_data[icode];
machine_mode tmode = insn_p->operand[0].mode;
enum { load, store } klass;
case USHORT_FTYPE_VOID:
case UINT64_FTYPE_VOID:
case UINT_FTYPE_VOID:
+ case UINT8_FTYPE_VOID:
case UNSIGNED_FTYPE_VOID:
nargs = 0;
klass = load;
nargs = 1;
klass = store;
/* Reserve memory operand for target. */
- memory = ARRAY_SIZE (args);
+ memory = ARRAY_SIZE (xops);
switch (icode)
{
/* These builtins and instructions require the memory
case VOID_FTYPE_PV8SI_V8DI_UQI:
case VOID_FTYPE_PV8HI_V8DI_UQI:
case VOID_FTYPE_PV16HI_V16SI_UHI:
- case VOID_FTYPE_PV16QI_V8DI_UQI:
+ case VOID_FTYPE_PUDI_V8DI_UQI:
case VOID_FTYPE_PV16QI_V16SI_UHI:
case VOID_FTYPE_PV4SI_V4DI_UQI:
- case VOID_FTYPE_PV4SI_V2DI_UQI:
- case VOID_FTYPE_PV8HI_V4DI_UQI:
- case VOID_FTYPE_PV8HI_V2DI_UQI:
+ case VOID_FTYPE_PUDI_V2DI_UQI:
+ case VOID_FTYPE_PUDI_V4DI_UQI:
+ case VOID_FTYPE_PUSI_V2DI_UQI:
case VOID_FTYPE_PV8HI_V8SI_UQI:
- case VOID_FTYPE_PV8HI_V4SI_UQI:
- case VOID_FTYPE_PV16QI_V4DI_UQI:
- case VOID_FTYPE_PV16QI_V2DI_UQI:
- case VOID_FTYPE_PV16QI_V8SI_UQI:
- case VOID_FTYPE_PV16QI_V4SI_UQI:
+ case VOID_FTYPE_PUDI_V4SI_UQI:
+ case VOID_FTYPE_PUSI_V4DI_UQI:
+ case VOID_FTYPE_PUHI_V2DI_UQI:
+ case VOID_FTYPE_PUDI_V8SI_UQI:
+ case VOID_FTYPE_PUSI_V4SI_UQI:
case VOID_FTYPE_PCHAR_V64QI_UDI:
case VOID_FTYPE_PCHAR_V32QI_USI:
case VOID_FTYPE_PCHAR_V16QI_UHI:
case VOID_FTYPE_PFLOAT_V4SF_UQI:
case VOID_FTYPE_PV32QI_V32HI_USI:
case VOID_FTYPE_PV16QI_V16HI_UHI:
- case VOID_FTYPE_PV8QI_V8HI_UQI:
+ case VOID_FTYPE_PUDI_V8HI_UQI:
nargs = 2;
klass = store;
/* Reserve memory operand for target. */
- memory = ARRAY_SIZE (args);
+ memory = ARRAY_SIZE (xops);
break;
case V4SF_FTYPE_PCV4SF_V4SF_UQI:
case V8SF_FTYPE_PCV8SF_V8SF_UQI:
klass = load;
memory = 0;
break;
- case VOID_FTYPE_UINT_UINT_UINT:
- case VOID_FTYPE_UINT64_UINT_UINT:
- case UCHAR_FTYPE_UINT_UINT_UINT:
- case UCHAR_FTYPE_UINT64_UINT_UINT:
- nargs = 3;
- klass = load;
- memory = ARRAY_SIZE (args);
- last_arg_constant = true;
- break;
default:
gcc_unreachable ();
}
- gcc_assert (nargs <= ARRAY_SIZE (args));
+ gcc_assert (nargs <= ARRAY_SIZE (xops));
if (klass == store)
{
for (i = 0; i < nargs; i++)
{
machine_mode mode = insn_p->operand[i + 1].mode;
- bool match;
arg = CALL_EXPR_ARG (exp, i + arg_adjust);
op = expand_normal (arg);
- match = insn_p->operand[i + 1].predicate (op, mode);
- if (last_arg_constant && (i + 1) == nargs)
+ if (i == memory)
{
- if (!match)
- {
- if (icode == CODE_FOR_lwp_lwpvalsi3
- || icode == CODE_FOR_lwp_lwpinssi3
- || icode == CODE_FOR_lwp_lwpvaldi3
- || icode == CODE_FOR_lwp_lwpinsdi3)
- error ("the last argument must be a 32-bit immediate");
- else
- error ("the last argument must be an 8-bit immediate");
- return const0_rtx;
- }
+ /* This must be the memory operand. */
+ op = ix86_zero_extend_to_Pmode (op);
+ op = gen_rtx_MEM (mode, op);
+ /* op at this point has just BITS_PER_UNIT MEM_ALIGN
+ on it. Try to improve it using get_pointer_alignment,
+ and if the special builtin is one that requires strict
+ mode alignment, also from its GET_MODE_ALIGNMENT.
+ Failure to do so could lead to ix86_legitimate_combined_insn
+ rejecting all changes to such insns. */
+ unsigned int align = get_pointer_alignment (arg);
+ if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
+ align = GET_MODE_ALIGNMENT (mode);
+ if (MEM_ALIGN (op) < align)
+ set_mem_align (op, align);
}
else
{
- if (i == memory)
- {
- /* This must be the memory operand. */
- op = ix86_zero_extend_to_Pmode (op);
- op = gen_rtx_MEM (mode, op);
- /* op at this point has just BITS_PER_UNIT MEM_ALIGN
- on it. Try to improve it using get_pointer_alignment,
- and if the special builtin is one that requires strict
- mode alignment, also from it's GET_MODE_ALIGNMENT.
- Failure to do so could lead to ix86_legitimate_combined_insn
- rejecting all changes to such insns. */
- unsigned int align = get_pointer_alignment (arg);
- if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
- align = GET_MODE_ALIGNMENT (mode);
- if (MEM_ALIGN (op) < align)
- set_mem_align (op, align);
- }
- else
- {
- /* This must be register. */
- if (VECTOR_MODE_P (mode))
- op = safe_vector_operand (op, mode);
+ /* This must be register. */
+ if (VECTOR_MODE_P (mode))
+ op = safe_vector_operand (op, mode);
- op = fixup_modeless_constant (op, mode);
+ op = fixup_modeless_constant (op, mode);
- if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
- op = copy_to_mode_reg (mode, op);
- else
- {
- op = copy_to_reg (op);
- op = lowpart_subreg (mode, op, GET_MODE (op));
- }
+ /* NB: a 3-operand load implies it's a mask load,
+ and that mask operand should be at the end.
+ Keep an all-ones mask, which would be simplified by the expander. */
+ if (nargs == 3 && i == 2 && klass == load
+ && constm1_operand (op, mode))
+ ;
+ else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
+ op = copy_to_mode_reg (mode, op);
+ else
+ {
+ op = copy_to_reg (op);
+ op = lowpart_subreg (mode, op, GET_MODE (op));
}
}
- args[i].op = op;
- args[i].mode = mode;
+ xops[i] = op;
}
switch (nargs)
pat = GEN_FCN (icode) (target);
break;
case 1:
- pat = GEN_FCN (icode) (target, args[0].op);
+ pat = GEN_FCN (icode) (target, xops[0]);
break;
case 2:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+ pat = GEN_FCN (icode) (target, xops[0], xops[1]);
break;
case 3:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
+ pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
break;
default:
gcc_unreachable ();
if (! pat)
return 0;
+
emit_insn (pat);
return klass == store ? 0 : target;
}
tree arg0, arg1, arg2, arg3, arg4;
rtx op0, op1, op2, op3, op4, pat, pat2, insn;
machine_mode mode0, mode1, mode2, mode3, mode4;
- unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
+ unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
/* For CPU builtins that can be folded, fold first and expand the fold. */
switch (fcode)
OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
- where for each this pair it is sufficient if either of the ISAs is
- enabled, plus if it is ored with other options also those others. */
+ (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
+ OPTION_MASK_ISA2_AVXVNNI
+ where for each such pair it is sufficient if either of the ISAs is
+ enabled, plus if it is ored with other options also those others.
+ OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
== (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
&& (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
+
if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
== (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
&& (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
+
if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
== (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
&& (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
- /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
- MMX is disabled. NB: Since MMX intrinsics are marked with
- SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
- enabled. */
- if (TARGET_MMX || TARGET_MMX_WITH_SSE)
- {
- if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
- == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
- && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX)) != 0)
- isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX);
- if (((bisa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
- == (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
- && (isa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX)) != 0)
- isa |= (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX);
- if (((bisa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
- == (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
- && (isa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX)) != 0)
- isa |= (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX);
+
+ if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
+ == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
+ || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
+ && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
+ == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
+ || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
+ {
+ isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
+ isa2 |= OPTION_MASK_ISA2_AVXVNNI;
+ }
+
+ if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
+ /* __builtin_ia32_maskmovq requires MMX registers. */
+ && fcode != IX86_BUILTIN_MASKMOVQ)
+ {
+ bisa &= ~OPTION_MASK_ISA_MMX;
+ bisa |= OPTION_MASK_ISA_SSE2;
}
+
if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
{
bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
else
bisa |= OPTION_MASK_ABI_64;
char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
- (enum fpmath_unit) 0, false, add_abi_p);
+ (enum fpmath_unit) 0,
+ (enum prefer_vector_width) 0,
+ false, add_abi_p);
if (!opts)
error ("%qE needs unknown isa option", fndecl);
else
return target;
+ case IX86_BUILTIN_TESTUI:
+ emit_insn (gen_testui ());
+
+ if (target == 0
+ || !register_operand (target, QImode))
+ target = gen_reg_rtx (QImode);
+
+ pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (target, pat));
+
+ return target;
+
case IX86_BUILTIN_CLZERO:
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
emit_insn (gen_cldemote (op0));
return 0;
- case IX86_BUILTIN_VEC_INIT_V2SI:
- case IX86_BUILTIN_VEC_INIT_V4HI:
- case IX86_BUILTIN_VEC_INIT_V8QI:
- return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
+ case IX86_BUILTIN_LOADIWKEY:
+ {
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ arg3 = CALL_EXPR_ARG (exp, 3);
- case IX86_BUILTIN_VEC_EXT_V2DF:
- case IX86_BUILTIN_VEC_EXT_V2DI:
- case IX86_BUILTIN_VEC_EXT_V4SF:
- case IX86_BUILTIN_VEC_EXT_V4SI:
- case IX86_BUILTIN_VEC_EXT_V8HI:
- case IX86_BUILTIN_VEC_EXT_V2SI:
- case IX86_BUILTIN_VEC_EXT_V4HI:
- case IX86_BUILTIN_VEC_EXT_V16QI:
- return ix86_expand_vec_ext_builtin (exp, target);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ op3 = expand_normal (arg3);
- case IX86_BUILTIN_VEC_SET_V2DI:
- case IX86_BUILTIN_VEC_SET_V4SF:
- case IX86_BUILTIN_VEC_SET_V4SI:
- case IX86_BUILTIN_VEC_SET_V8HI:
- case IX86_BUILTIN_VEC_SET_V4HI:
- case IX86_BUILTIN_VEC_SET_V16QI:
- return ix86_expand_vec_set_builtin (exp);
+ if (!REG_P (op0))
+ op0 = copy_to_mode_reg (V2DImode, op0);
+ if (!REG_P (op1))
+ op1 = copy_to_mode_reg (V2DImode, op1);
+ if (!REG_P (op2))
+ op2 = copy_to_mode_reg (V2DImode, op2);
+ if (!REG_P (op3))
+ op3 = copy_to_mode_reg (SImode, op3);
- case IX86_BUILTIN_NANQ:
- case IX86_BUILTIN_NANSQ:
- return expand_call (exp, target, ignore);
+ emit_insn (gen_loadiwkey (op0, op1, op2, op3));
- case IX86_BUILTIN_RDPID:
+ return 0;
+ }
- op0 = gen_reg_rtx (word_mode);
+ case IX86_BUILTIN_AESDEC128KLU8:
+ icode = CODE_FOR_aesdec128klu8;
+ goto aesdecenc_expand;
- if (TARGET_64BIT)
- {
- insn = gen_rdpid_rex64 (op0);
- op0 = convert_to_mode (SImode, op0, 1);
- }
- else
- insn = gen_rdpid (op0);
+ case IX86_BUILTIN_AESDEC256KLU8:
+ icode = CODE_FOR_aesdec256klu8;
+ goto aesdecenc_expand;
- emit_insn (insn);
+ case IX86_BUILTIN_AESENC128KLU8:
+ icode = CODE_FOR_aesenc128klu8;
+ goto aesdecenc_expand;
- if (target == 0
- || !register_operand (target, SImode))
- target = gen_reg_rtx (SImode);
+ case IX86_BUILTIN_AESENC256KLU8:
+ icode = CODE_FOR_aesenc256klu8;
- emit_move_insn (target, op0);
- return target;
+ aesdecenc_expand:
- case IX86_BUILTIN_RDPMC:
- case IX86_BUILTIN_RDTSC:
- case IX86_BUILTIN_RDTSCP:
+ arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
+ arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
+ arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
+
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+
+ if (!address_operand (op0, V2DImode))
+ {
+ op0 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op0);
+ }
+ op0 = gen_rtx_MEM (V2DImode, op0);
+
+ if (!REG_P (op1))
+ op1 = copy_to_mode_reg (V2DImode, op1);
+
+ if (!address_operand (op2, VOIDmode))
+ {
+ op2 = convert_memory_address (Pmode, op2);
+ op2 = copy_addr_to_reg (op2);
+ }
+ op2 = gen_rtx_MEM (BLKmode, op2);
+
+ emit_insn (GEN_FCN (icode) (op1, op1, op2));
+
+ if (target == 0)
+ target = gen_reg_rtx (QImode);
+
+ pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (target, pat));
+
+ emit_insn (gen_rtx_SET (op0, op1));
+
+ return target;
+
+ case IX86_BUILTIN_AESDECWIDE128KLU8:
+ icode = CODE_FOR_aesdecwide128klu8;
+ goto wideaesdecenc_expand;
+
+ case IX86_BUILTIN_AESDECWIDE256KLU8:
+ icode = CODE_FOR_aesdecwide256klu8;
+ goto wideaesdecenc_expand;
+
+ case IX86_BUILTIN_AESENCWIDE128KLU8:
+ icode = CODE_FOR_aesencwide128klu8;
+ goto wideaesdecenc_expand;
+
+ case IX86_BUILTIN_AESENCWIDE256KLU8:
+ icode = CODE_FOR_aesencwide256klu8;
+
+ wideaesdecenc_expand:
+
+ rtx xmm_regs[8];
+ rtx op;
+
+ arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
+ arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
+ arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
+
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+
+ if (!address_operand (op2, VOIDmode))
+ {
+ op2 = convert_memory_address (Pmode, op2);
+ op2 = copy_addr_to_reg (op2);
+ }
+ op2 = gen_rtx_MEM (BLKmode, op2);
+
+ for (i = 0; i < 8; i++)
+ {
+ xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
+
+ op = gen_rtx_MEM (V2DImode,
+ plus_constant (Pmode, op1, (i * 16)));
+
+ emit_move_insn (xmm_regs[i], op);
+ }
+
+ emit_insn (GEN_FCN (icode) (op2));
+
+ if (target == 0)
+ target = gen_reg_rtx (QImode);
+
+ pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (target, pat));
+
+ for (i = 0; i < 8; i++)
+ {
+ op = gen_rtx_MEM (V2DImode,
+ plus_constant (Pmode, op0, (i * 16)));
+ emit_move_insn (op, xmm_regs[i]);
+ }
+
+ return target;
+
+ case IX86_BUILTIN_ENCODEKEY128U32:
+ {
+ rtx op, xmm_regs[7];
+
+ arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
+ arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
+ arg2 = CALL_EXPR_ARG (exp, 2); // void *h
+
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+
+ if (!REG_P (op0))
+ op0 = copy_to_mode_reg (SImode, op0);
+
+ op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
+ emit_move_insn (op, op1);
+
+ for (i = 0; i < 3; i++)
+ xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
+
+ if (target == 0)
+ target = gen_reg_rtx (SImode);
+
+ emit_insn (gen_encodekey128u32 (target, op0));
+
+ for (i = 0; i < 3; i++)
+ {
+ op = gen_rtx_MEM (V2DImode,
+ plus_constant (Pmode, op2, (i * 16)));
+ emit_move_insn (op, xmm_regs[i]);
+ }
+
+ return target;
+ }
+ case IX86_BUILTIN_ENCODEKEY256U32:
+ {
+ rtx op, xmm_regs[7];
+
+ arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
+ arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
+ arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
+ arg3 = CALL_EXPR_ARG (exp, 3); // void *h
+
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ op3 = expand_normal (arg3);
+
+ if (!REG_P (op0))
+ op0 = copy_to_mode_reg (SImode, op0);
+
+ /* Force the use of xmm0, xmm1 for keylow, keyhi. */
+ op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
+ emit_move_insn (op, op1);
+ op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
+ emit_move_insn (op, op2);
+
+ for (i = 0; i < 4; i++)
+ xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
+
+ if (target == 0)
+ target = gen_reg_rtx (SImode);
+
+ emit_insn (gen_encodekey256u32 (target, op0));
+
+ for (i = 0; i < 4; i++)
+ {
+ op = gen_rtx_MEM (V2DImode,
+ plus_constant (Pmode, op3, (i * 16)));
+ emit_move_insn (op, xmm_regs[i]);
+ }
+
+ return target;
+ }
+
+ case IX86_BUILTIN_VEC_INIT_V2SI:
+ case IX86_BUILTIN_VEC_INIT_V4HI:
+ case IX86_BUILTIN_VEC_INIT_V8QI:
+ return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
+
+ case IX86_BUILTIN_VEC_EXT_V2DF:
+ case IX86_BUILTIN_VEC_EXT_V2DI:
+ case IX86_BUILTIN_VEC_EXT_V4SF:
+ case IX86_BUILTIN_VEC_EXT_V4SI:
+ case IX86_BUILTIN_VEC_EXT_V8HI:
+ case IX86_BUILTIN_VEC_EXT_V2SI:
+ case IX86_BUILTIN_VEC_EXT_V4HI:
+ case IX86_BUILTIN_VEC_EXT_V16QI:
+ return ix86_expand_vec_ext_builtin (exp, target);
+
+ case IX86_BUILTIN_VEC_SET_V2DI:
+ case IX86_BUILTIN_VEC_SET_V4SF:
+ case IX86_BUILTIN_VEC_SET_V4SI:
+ case IX86_BUILTIN_VEC_SET_V8HI:
+ case IX86_BUILTIN_VEC_SET_V4HI:
+ case IX86_BUILTIN_VEC_SET_V16QI:
+ return ix86_expand_vec_set_builtin (exp);
+
+ case IX86_BUILTIN_NANQ:
+ case IX86_BUILTIN_NANSQ:
+ return expand_call (exp, target, ignore);
+
+ case IX86_BUILTIN_RDPID:
+
+ op0 = gen_reg_rtx (word_mode);
+
+ if (TARGET_64BIT)
+ {
+ insn = gen_rdpid_rex64 (op0);
+ op0 = convert_to_mode (SImode, op0, 1);
+ }
+ else
+ insn = gen_rdpid (op0);
+
+ emit_insn (insn);
+
+ if (target == 0
+ || !register_operand (target, SImode))
+ target = gen_reg_rtx (SImode);
+
+ emit_move_insn (target, op0);
+ return target;
+
+ case IX86_BUILTIN_2INTERSECTD512:
+ case IX86_BUILTIN_2INTERSECTQ512:
+ case IX86_BUILTIN_2INTERSECTD256:
+ case IX86_BUILTIN_2INTERSECTQ256:
+ case IX86_BUILTIN_2INTERSECTD128:
+ case IX86_BUILTIN_2INTERSECTQ128:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ arg3 = CALL_EXPR_ARG (exp, 3);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ op3 = expand_normal (arg3);
+
+ if (!address_operand (op0, VOIDmode))
+ {
+ op0 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op0);
+ }
+ if (!address_operand (op1, VOIDmode))
+ {
+ op1 = convert_memory_address (Pmode, op1);
+ op1 = copy_addr_to_reg (op1);
+ }
+
+ switch (fcode)
+ {
+ case IX86_BUILTIN_2INTERSECTD512:
+ mode4 = P2HImode;
+ icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
+ break;
+ case IX86_BUILTIN_2INTERSECTQ512:
+ mode4 = P2QImode;
+ icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
+ break;
+ case IX86_BUILTIN_2INTERSECTD256:
+ mode4 = P2QImode;
+ icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
+ break;
+ case IX86_BUILTIN_2INTERSECTQ256:
+ mode4 = P2QImode;
+ icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
+ break;
+ case IX86_BUILTIN_2INTERSECTD128:
+ mode4 = P2QImode;
+ icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
+ break;
+ case IX86_BUILTIN_2INTERSECTQ128:
+ mode4 = P2QImode;
+ icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ mode2 = insn_data[icode].operand[1].mode;
+ mode3 = insn_data[icode].operand[2].mode;
+ if (!insn_data[icode].operand[1].predicate (op2, mode2))
+ op2 = copy_to_mode_reg (mode2, op2);
+ if (!insn_data[icode].operand[2].predicate (op3, mode3))
+ op3 = copy_to_mode_reg (mode3, op3);
+
+ op4 = gen_reg_rtx (mode4);
+ emit_insn (GEN_FCN (icode) (op4, op2, op3));
+ mode0 = mode4 == P2HImode ? HImode : QImode;
+ emit_move_insn (gen_rtx_MEM (mode0, op0),
+ gen_lowpart (mode0, op4));
+ emit_move_insn (gen_rtx_MEM (mode0, op1),
+ gen_highpart (mode0, op4));
+
+ return 0;
+
+ case IX86_BUILTIN_RDPMC:
+ case IX86_BUILTIN_RDTSC:
+ case IX86_BUILTIN_RDTSCP:
case IX86_BUILTIN_XGETBV:
op0 = gen_reg_rtx (DImode);
}
else
{
- rtx pat;
+ if (target == 0
+ || !register_operand (target, SImode))
+ target = gen_reg_rtx (SImode);
- target = gen_reg_rtx (SImode);
emit_move_insn (target, const0_rtx);
target = gen_rtx_SUBREG (QImode, target, 0);
- if (fcode == IX86_BUILTIN_ENQCMD)
- pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
- else
- pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
-
- emit_insn (pat);
-
- emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
- gen_rtx_fmt_ee (EQ, QImode,
- SET_DEST (pat),
- const0_rtx)));
+ int unspecv = (fcode == IX86_BUILTIN_ENQCMD
+ ? UNSPECV_ENQCMD
+ : UNSPECV_ENQCMDS);
+ icode = code_for_enqcmd (unspecv, Pmode);
+ emit_insn (GEN_FCN (icode) (op0, op1));
+ emit_insn
+ (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+ gen_rtx_fmt_ee (EQ, QImode,
+ gen_rtx_REG (CCZmode, FLAGS_REG),
+ const0_rtx)));
return SUBREG_REG (target);
}
case IX86_BUILTIN_LLWPCB:
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
- icode = CODE_FOR_lwp_llwpcb;
- if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+
+ if (!register_operand (op0, Pmode))
op0 = ix86_zero_extend_to_Pmode (op0);
- emit_insn (gen_lwp_llwpcb (op0));
+ emit_insn (gen_lwp_llwpcb (Pmode, op0));
return 0;
case IX86_BUILTIN_SLWPCB:
- icode = CODE_FOR_lwp_slwpcb;
if (!target
- || !insn_data[icode].operand[0].predicate (target, Pmode))
+ || !register_operand (target, Pmode))
target = gen_reg_rtx (Pmode);
- emit_insn (gen_lwp_slwpcb (target));
+ emit_insn (gen_lwp_slwpcb (Pmode, target));
return target;
+ case IX86_BUILTIN_LWPVAL32:
+ case IX86_BUILTIN_LWPVAL64:
+ case IX86_BUILTIN_LWPINS32:
+ case IX86_BUILTIN_LWPINS64:
+ mode = ((fcode == IX86_BUILTIN_LWPVAL32
+ || fcode == IX86_BUILTIN_LWPINS32)
+ ? SImode : DImode);
+
+ if (fcode == IX86_BUILTIN_LWPVAL32
+ || fcode == IX86_BUILTIN_LWPVAL64)
+ icode = code_for_lwp_lwpval (mode);
+ else
+ icode = code_for_lwp_lwpins (mode);
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ mode0 = insn_data[icode].operand[0].mode;
+
+ if (!insn_data[icode].operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if (!insn_data[icode].operand[1].predicate (op1, SImode))
+ op1 = copy_to_mode_reg (SImode, op1);
+
+ if (!CONST_INT_P (op2))
+ {
+ error ("the last argument must be a 32-bit immediate");
+ return const0_rtx;
+ }
+
+ emit_insn (GEN_FCN (icode) (op0, op1, op2));
+
+ if (fcode == IX86_BUILTIN_LWPINS32
+ || fcode == IX86_BUILTIN_LWPINS64)
+ {
+ if (target == 0
+ || !nonimmediate_operand (target, QImode))
+ target = gen_reg_rtx (QImode);
+
+ pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (target, pat));
+
+ return target;
+ }
+ else
+ return 0;
+
case IX86_BUILTIN_BEXTRI32:
case IX86_BUILTIN_BEXTRI64:
+ mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
+
arg0 = CALL_EXPR_ARG (exp, 0);
arg1 = CALL_EXPR_ARG (exp, 1);
op0 = expand_normal (arg0);
op1 = expand_normal (arg1);
- icode = (fcode == IX86_BUILTIN_BEXTRI32
- ? CODE_FOR_tbm_bextri_si
- : CODE_FOR_tbm_bextri_di);
+
if (!CONST_INT_P (op1))
- {
- error ("last argument must be an immediate");
- return const0_rtx;
- }
+ {
+ error ("last argument must be an immediate");
+ return const0_rtx;
+ }
else
- {
- unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
- unsigned char lsb_index = INTVAL (op1) & 0xFF;
- op1 = GEN_INT (length);
- op2 = GEN_INT (lsb_index);
+ {
+ unsigned char lsb_index = UINTVAL (op1);
+ unsigned char length = UINTVAL (op1) >> 8;
+
+ unsigned char bitsize = GET_MODE_BITSIZE (mode);
+
+ icode = code_for_tbm_bextri (mode);
mode1 = insn_data[icode].operand[1].mode;
if (!insn_data[icode].operand[1].predicate (op0, mode1))
|| !register_operand (target, mode0))
target = gen_reg_rtx (mode0);
- pat = GEN_FCN (icode) (target, op0, op1, op2);
- if (pat)
- emit_insn (pat);
- return target;
- }
+ if (length == 0 || lsb_index >= bitsize)
+ {
+ emit_move_insn (target, const0_rtx);
+ return target;
+ }
+
+ if (length + lsb_index > bitsize)
+ length = bitsize - lsb_index;
+
+ op1 = GEN_INT (length);
+ op2 = GEN_INT (lsb_index);
+
+ emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
+ return target;
+ }
case IX86_BUILTIN_RDRAND16_STEP:
- icode = CODE_FOR_rdrandhi_1;
- mode0 = HImode;
+ mode = HImode;
goto rdrand_step;
case IX86_BUILTIN_RDRAND32_STEP:
- icode = CODE_FOR_rdrandsi_1;
- mode0 = SImode;
+ mode = SImode;
goto rdrand_step;
case IX86_BUILTIN_RDRAND64_STEP:
- icode = CODE_FOR_rdranddi_1;
- mode0 = DImode;
+ mode = DImode;
rdrand_step:
arg0 = CALL_EXPR_ARG (exp, 0);
op1 = copy_addr_to_reg (op1);
}
- op0 = gen_reg_rtx (mode0);
- emit_insn (GEN_FCN (icode) (op0));
+ op0 = gen_reg_rtx (mode);
+ emit_insn (gen_rdrand (mode, op0));
- emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
+ emit_move_insn (gen_rtx_MEM (mode, op1), op0);
- op1 = gen_reg_rtx (SImode);
- emit_move_insn (op1, CONST1_RTX (SImode));
+ op1 = force_reg (SImode, const1_rtx);
/* Emit SImode conditional move. */
- if (mode0 == HImode)
+ if (mode == HImode)
{
if (TARGET_ZERO_EXTEND_WITH_AND
&& optimize_function_for_speed_p (cfun))
emit_insn (gen_zero_extendhisi2 (op2, op0));
}
}
- else if (mode0 == SImode)
+ else if (mode == SImode)
op2 = op0;
else
op2 = gen_rtx_SUBREG (SImode, op0, 0);
return target;
case IX86_BUILTIN_RDSEED16_STEP:
- icode = CODE_FOR_rdseedhi_1;
- mode0 = HImode;
+ mode = HImode;
goto rdseed_step;
case IX86_BUILTIN_RDSEED32_STEP:
- icode = CODE_FOR_rdseedsi_1;
- mode0 = SImode;
+ mode = SImode;
goto rdseed_step;
case IX86_BUILTIN_RDSEED64_STEP:
- icode = CODE_FOR_rdseeddi_1;
- mode0 = DImode;
+ mode = DImode;
rdseed_step:
arg0 = CALL_EXPR_ARG (exp, 0);
op1 = copy_addr_to_reg (op1);
}
- op0 = gen_reg_rtx (mode0);
- emit_insn (GEN_FCN (icode) (op0));
+ op0 = gen_reg_rtx (mode);
+ emit_insn (gen_rdseed (mode, op0));
- emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
+ emit_move_insn (gen_rtx_MEM (mode, op1), op0);
op2 = gen_reg_rtx (QImode);
tree fndecl = gimple_call_fndecl (def_stmt);
if (fndecl
&& fndecl_built_in_p (fndecl, BUILT_IN_MD))
- switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
+ switch (DECL_MD_FUNCTION_CODE (fndecl))
{
case IX86_BUILTIN_CMPPD:
case IX86_BUILTIN_CMPPS:
emit_insn (gen_xabort (op0));
return 0;
+ case IX86_BUILTIN_RDSSPD:
+ case IX86_BUILTIN_RDSSPQ:
+ mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
+
+ if (target == 0
+ || !register_operand (target, mode))
+ target = gen_reg_rtx (mode);
+
+ op0 = force_reg (mode, const0_rtx);
+
+ emit_insn (gen_rdssp (mode, target, op0));
+ return target;
+
+ case IX86_BUILTIN_INCSSPD:
+ case IX86_BUILTIN_INCSSPQ:
+ mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+
+ op0 = force_reg (mode, op0);
+
+ emit_insn (gen_incssp (mode, op0));
+ return 0;
+
+ case IX86_BUILTIN_HRESET:
+ icode = CODE_FOR_hreset;
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ op0 = force_reg (SImode, op0);
+ emit_insn (gen_hreset (op0));
+ return 0;
+
case IX86_BUILTIN_RSTORSSP:
case IX86_BUILTIN_CLRSSBSY:
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
icode = (fcode == IX86_BUILTIN_RSTORSSP
- ? CODE_FOR_rstorssp
- : CODE_FOR_clrssbsy);
+ ? CODE_FOR_rstorssp
+ : CODE_FOR_clrssbsy);
+
if (!address_operand (op0, VOIDmode))
{
- op1 = convert_memory_address (Pmode, op0);
- op0 = copy_addr_to_reg (op1);
+ op0 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op0);
}
- emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
+ emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
return 0;
case IX86_BUILTIN_WRSSD:
case IX86_BUILTIN_WRSSQ:
case IX86_BUILTIN_WRUSSD:
case IX86_BUILTIN_WRUSSQ:
+ mode = ((fcode == IX86_BUILTIN_WRSSD
+ || fcode == IX86_BUILTIN_WRUSSD)
+ ? SImode : DImode);
+
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
arg1 = CALL_EXPR_ARG (exp, 1);
op1 = expand_normal (arg1);
- switch (fcode)
- {
- case IX86_BUILTIN_WRSSD:
- icode = CODE_FOR_wrsssi;
- mode = SImode;
- break;
- case IX86_BUILTIN_WRSSQ:
- icode = CODE_FOR_wrssdi;
- mode = DImode;
- break;
- case IX86_BUILTIN_WRUSSD:
- icode = CODE_FOR_wrusssi;
- mode = SImode;
- break;
- case IX86_BUILTIN_WRUSSQ:
- icode = CODE_FOR_wrussdi;
- mode = DImode;
- break;
- }
+
op0 = force_reg (mode, op0);
+
if (!address_operand (op1, VOIDmode))
{
- op2 = convert_memory_address (Pmode, op1);
- op1 = copy_addr_to_reg (op2);
+ op1 = convert_memory_address (Pmode, op1);
+ op1 = copy_addr_to_reg (op1);
}
- emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
+ op1 = gen_rtx_MEM (mode, op1);
+
+ icode = ((fcode == IX86_BUILTIN_WRSSD
+ || fcode == IX86_BUILTIN_WRSSQ)
+ ? code_for_wrss (mode)
+ : code_for_wruss (mode));
+ emit_insn (GEN_FCN (icode) (op0, op1));
+
return 0;
+ case IX86_BUILTIN_VZEROUPPER:
+ cfun->machine->has_explicit_vzeroupper = true;
+ break;
+
default:
break;
}
target);
}
- if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
- && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
- return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
- target);
- }
-
gcc_unreachable ();
}
case E_V8HImode:
use_vector_set = TARGET_SSE2;
break;
+ case E_V8QImode:
+ use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
+ break;
case E_V4HImode:
use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
break;
wmode = V8HImode;
goto widen;
case E_V8QImode:
+ if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
+ break;
wmode = V4HImode;
goto widen;
widen:
ix86_expand_vector_init_concat (machine_mode mode,
rtx target, rtx *ops, int n)
{
- machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
- rtx first[16], second[8], third[4];
+ machine_mode half_mode = VOIDmode;
+ rtx half[2];
rtvec v;
int i, j;
switch (mode)
{
case E_V16SImode:
- cmode = V8SImode;
+ half_mode = V8SImode;
break;
case E_V16SFmode:
- cmode = V8SFmode;
+ half_mode = V8SFmode;
break;
case E_V8DImode:
- cmode = V4DImode;
+ half_mode = V4DImode;
break;
case E_V8DFmode:
- cmode = V4DFmode;
+ half_mode = V4DFmode;
break;
case E_V8SImode:
- cmode = V4SImode;
+ half_mode = V4SImode;
break;
case E_V8SFmode:
- cmode = V4SFmode;
+ half_mode = V4SFmode;
break;
case E_V4DImode:
- cmode = V2DImode;
+ half_mode = V2DImode;
break;
case E_V4DFmode:
- cmode = V2DFmode;
+ half_mode = V2DFmode;
break;
case E_V4SImode:
- cmode = V2SImode;
+ half_mode = V2SImode;
break;
case E_V4SFmode:
- cmode = V2SFmode;
+ half_mode = V2SFmode;
break;
case E_V2DImode:
- cmode = DImode;
+ half_mode = DImode;
break;
case E_V2SImode:
- cmode = SImode;
+ half_mode = SImode;
break;
case E_V2DFmode:
- cmode = DFmode;
+ half_mode = DFmode;
break;
case E_V2SFmode:
- cmode = SFmode;
+ half_mode = SFmode;
break;
default:
gcc_unreachable ();
}
- if (!register_operand (ops[1], cmode))
- ops[1] = force_reg (cmode, ops[1]);
- if (!register_operand (ops[0], cmode))
- ops[0] = force_reg (cmode, ops[0]);
+ if (!register_operand (ops[1], half_mode))
+ ops[1] = force_reg (half_mode, ops[1]);
+ if (!register_operand (ops[0], half_mode))
+ ops[0] = force_reg (half_mode, ops[0]);
emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
ops[1])));
break;
switch (mode)
{
case E_V4DImode:
- cmode = V2DImode;
+ half_mode = V2DImode;
break;
case E_V4DFmode:
- cmode = V2DFmode;
+ half_mode = V2DFmode;
break;
case E_V4SImode:
- cmode = V2SImode;
+ half_mode = V2SImode;
break;
case E_V4SFmode:
- cmode = V2SFmode;
+ half_mode = V2SFmode;
break;
default:
gcc_unreachable ();
switch (mode)
{
case E_V8DImode:
- cmode = V2DImode;
- hmode = V4DImode;
+ half_mode = V4DImode;
break;
case E_V8DFmode:
- cmode = V2DFmode;
- hmode = V4DFmode;
+ half_mode = V4DFmode;
break;
case E_V8SImode:
- cmode = V2SImode;
- hmode = V4SImode;
+ half_mode = V4SImode;
break;
case E_V8SFmode:
- cmode = V2SFmode;
- hmode = V4SFmode;
+ half_mode = V4SFmode;
break;
default:
gcc_unreachable ();
switch (mode)
{
case E_V16SImode:
- cmode = V2SImode;
- hmode = V4SImode;
- gmode = V8SImode;
+ half_mode = V8SImode;
break;
case E_V16SFmode:
- cmode = V2SFmode;
- hmode = V4SFmode;
- gmode = V8SFmode;
+ half_mode = V8SFmode;
break;
default:
gcc_unreachable ();
half:
/* FIXME: We process inputs backward to help RA. PR 36222. */
i = n - 1;
- j = (n >> 1) - 1;
- for (; i > 0; i -= 2, j--)
- {
- first[j] = gen_reg_rtx (cmode);
- v = gen_rtvec (2, ops[i - 1], ops[i]);
- ix86_expand_vector_init (false, first[j],
- gen_rtx_PARALLEL (cmode, v));
- }
-
- n >>= 1;
- if (n > 4)
+ for (j = 1; j != -1; j--)
{
- gcc_assert (hmode != VOIDmode);
- gcc_assert (gmode != VOIDmode);
- for (i = j = 0; i < n; i += 2, j++)
+ half[j] = gen_reg_rtx (half_mode);
+ switch (n >> 1)
{
- second[j] = gen_reg_rtx (hmode);
- ix86_expand_vector_init_concat (hmode, second [j],
- &first [i], 2);
- }
- n >>= 1;
- for (i = j = 0; i < n; i += 2, j++)
- {
- third[j] = gen_reg_rtx (gmode);
- ix86_expand_vector_init_concat (gmode, third[j],
- &second[i], 2);
- }
- n >>= 1;
- ix86_expand_vector_init_concat (mode, target, third, n);
- }
- else if (n > 2)
- {
- gcc_assert (hmode != VOIDmode);
- for (i = j = 0; i < n; i += 2, j++)
- {
- second[j] = gen_reg_rtx (hmode);
- ix86_expand_vector_init_concat (hmode, second [j],
- &first [i], 2);
+ case 2:
+ v = gen_rtvec (2, ops[i-1], ops[i]);
+ i -= 2;
+ break;
+ case 4:
+ v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
+ i -= 4;
+ break;
+ case 8:
+ v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
+ ops[i-3], ops[i-2], ops[i-1], ops[i]);
+ i -= 8;
+ break;
+ default:
+ gcc_unreachable ();
}
- n >>= 1;
- ix86_expand_vector_init_concat (mode, target, second, n);
+ ix86_expand_vector_init (false, half[j],
+ gen_rtx_PARALLEL (half_mode, v));
}
- else
- ix86_expand_vector_init_concat (mode, target, first, n);
+
+ ix86_expand_vector_init_concat (mode, target, half, 2);
break;
default:
ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
+/* Implemented as
+ V setg (V v, int idx, T val)
+ {
+ V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
+ V valv = (V){val, val, val, val, val, val, val, val};
+ V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
+ v = (v & ~mask) | (valv & mask);
+ return v;
+ }. */
+void
+ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
+{
+ rtx vec[64];
+ machine_mode mode = GET_MODE (target);
+ machine_mode cmp_mode = mode;
+ int n_elts = GET_MODE_NUNITS (mode);
+ rtx valv,idxv,constv,idx_tmp;
+ bool ok = false;
+
+ /* 512-bits vector byte/word broadcast and comparison only available
+ under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
+ when without TARGET_AVX512BW. */
+ if ((mode == V32HImode || mode == V64QImode) && !TARGET_AVX512BW)
+ {
+ gcc_assert (TARGET_AVX512F);
+ rtx vhi, vlo, idx_hi;
+ machine_mode half_mode;
+ rtx (*extract_hi)(rtx, rtx);
+ rtx (*extract_lo)(rtx, rtx);
+
+ if (mode == V32HImode)
+ {
+ half_mode = V16HImode;
+ extract_hi = gen_vec_extract_hi_v32hi;
+ extract_lo = gen_vec_extract_lo_v32hi;
+ }
+ else
+ {
+ half_mode = V32QImode;
+ extract_hi = gen_vec_extract_hi_v64qi;
+ extract_lo = gen_vec_extract_lo_v64qi;
+ }
+
+ vhi = gen_reg_rtx (half_mode);
+ vlo = gen_reg_rtx (half_mode);
+ idx_hi = gen_reg_rtx (GET_MODE (idx));
+ emit_insn (extract_hi (vhi, target));
+ emit_insn (extract_lo (vlo, target));
+ /* idx_hi = idx - n_elts / 2, i.e. the element index relative to the
+ start of the high half; recurse into both halves with the same VAL
+ (only one half's compare mask can actually match). */
+ vec[0] = idx_hi;
+ vec[1] = idx;
+ vec[2] = GEN_INT (n_elts/2);
+ ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
+ ix86_expand_vector_set_var (vhi, val, idx_hi);
+ ix86_expand_vector_set_var (vlo, val, idx);
+ emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
+ return;
+ }
+
+ /* For FP element modes do the lane-index comparison in the integer
+ vector mode of the same width and element count. */
+ if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
+ {
+ switch (mode)
+ {
+ case E_V2DFmode:
+ cmp_mode = V2DImode;
+ break;
+ case E_V4DFmode:
+ cmp_mode = V4DImode;
+ break;
+ case E_V8DFmode:
+ cmp_mode = V8DImode;
+ break;
+ case E_V4SFmode:
+ cmp_mode = V4SImode;
+ break;
+ case E_V8SFmode:
+ cmp_mode = V8SImode;
+ break;
+ case E_V16SFmode:
+ cmp_mode = V16SImode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ }
+
+ /* constv = {0, 1, ..., n_elts - 1} in CMP_MODE. */
+ for (int i = 0; i != n_elts; i++)
+ vec[i] = GEN_INT (i);
+ constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
+ valv = gen_reg_rtx (mode);
+ idxv = gen_reg_rtx (cmp_mode);
+ /* Widen/narrow IDX to the compare element mode (unsigned). */
+ idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
+
+ /* Broadcast VAL and the (converted) IDX across all lanes. */
+ ok = ix86_expand_vector_init_duplicate (false, mode, valv, val);
+ gcc_assert (ok);
+ ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp);
+ gcc_assert (ok);
+ /* Blend via ix86_expand_int_vcond:
+ target = (idxv == constv) ? valv : target. */
+ vec[0] = target;
+ vec[1] = valv;
+ vec[2] = target;
+ vec[3] = gen_rtx_EQ (mode, idxv, constv);
+ vec[4] = idxv;
+ vec[5] = constv;
+ ok = ix86_expand_int_vcond (vec);
+ gcc_assert (ok);
+}
+
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
switch (mode)
{
- case E_V2SFmode:
case E_V2SImode:
+ use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
+ if (use_vec_merge)
+ break;
+ /* FALLTHRU */
+
+ case E_V2SFmode:
if (mmx_ok)
{
tmp = gen_reg_rtx (GET_MODE_INNER (mode));
break;
case E_V8QImode:
+ use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
break;
case E_V32QImode:
switch (mode)
{
case E_V2SImode:
+ use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
+ if (use_vec_extr)
+ break;
+ /* FALLTHRU */
+
case E_V2SFmode:
if (!mmx_ok)
break;
case E_V16QImode:
use_vec_extr = TARGET_SSE4_1;
+ if (!use_vec_extr
+ && TARGET_SSE2
+ && elt == 0
+ && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
+ {
+ tmp = gen_reg_rtx (SImode);
+ ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
+ 0);
+ emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
+ return;
+ }
break;
case E_V8SFmode:
return;
case E_V8QImode:
+ use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
/* ??? Could extract the appropriate HImode element and shift. */
+ break;
+
default:
break;
}
break;
case E_V64QImode:
case E_V32HImode:
+ if (i < 64)
+ {
+ d = gen_reg_rtx (V4TImode);
+ tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
+ GEN_INT (i / 2));
+ break;
+ }
+ /* FALLTHRU */
case E_V16SImode:
case E_V16SFmode:
case E_V8DImode:
case E_V8DFmode:
if (i > 128)
tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
- gen_lowpart (V16SImode, src),
- gen_lowpart (V16SImode, src),
- GEN_INT (0x4 + (i == 512 ? 4 : 0)),
- GEN_INT (0x5 + (i == 512 ? 4 : 0)),
- GEN_INT (0x6 + (i == 512 ? 4 : 0)),
- GEN_INT (0x7 + (i == 512 ? 4 : 0)),
- GEN_INT (0xC), GEN_INT (0xD),
- GEN_INT (0xE), GEN_INT (0xF),
- GEN_INT (0x10), GEN_INT (0x11),
- GEN_INT (0x12), GEN_INT (0x13),
- GEN_INT (0x14), GEN_INT (0x15),
- GEN_INT (0x16), GEN_INT (0x17));
+ gen_lowpart (V16SImode, src),
+ gen_lowpart (V16SImode, src),
+ GEN_INT (0x4 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x5 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x6 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x7 + (i == 512 ? 4 : 0)),
+ GEN_INT (0xC), GEN_INT (0xD),
+ GEN_INT (0xE), GEN_INT (0xF),
+ GEN_INT (0x10), GEN_INT (0x11),
+ GEN_INT (0x12), GEN_INT (0x13),
+ GEN_INT (0x14), GEN_INT (0x15),
+ GEN_INT (0x16), GEN_INT (0x17));
else
tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
- gen_lowpart (V16SImode, src),
- GEN_INT (i == 128 ? 0x2 : 0x1),
- GEN_INT (0x3),
- GEN_INT (0x3),
- GEN_INT (0x3),
- GEN_INT (i == 128 ? 0x6 : 0x5),
- GEN_INT (0x7),
- GEN_INT (0x7),
- GEN_INT (0x7),
- GEN_INT (i == 128 ? 0xA : 0x9),
- GEN_INT (0xB),
- GEN_INT (0xB),
- GEN_INT (0xB),
- GEN_INT (i == 128 ? 0xE : 0xD),
- GEN_INT (0xF),
- GEN_INT (0xF),
- GEN_INT (0xF));
+ gen_lowpart (V16SImode, src),
+ GEN_INT (i == 128 ? 0x2 : 0x1),
+ GEN_INT (0x3),
+ GEN_INT (0x3),
+ GEN_INT (0x3),
+ GEN_INT (i == 128 ? 0x6 : 0x5),
+ GEN_INT (0x7),
+ GEN_INT (0x7),
+ GEN_INT (0x7),
+ GEN_INT (i == 128 ? 0xA : 0x9),
+ GEN_INT (0xB),
+ GEN_INT (0xB),
+ GEN_INT (0xB),
+ GEN_INT (i == 128 ? 0xE : 0xD),
+ GEN_INT (0xF),
+ GEN_INT (0xF),
+ GEN_INT (0xF));
break;
default:
gcc_unreachable ();
}
}
+ mthree = force_reg (mode, mthree);
+
/* e0 = x0 * a */
emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
- /* e1 = e0 * x0 */
- emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
- /* e2 = e1 - 3. */
- mthree = force_reg (mode, mthree);
- emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
+ unsigned vector_size = GET_MODE_SIZE (mode);
+ if (TARGET_FMA
+ || (TARGET_AVX512F && vector_size == 64)
+ || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
+ emit_insn (gen_rtx_SET (e2,
+ gen_rtx_FMA (mode, e0, x0, mthree)));
+ else
+ {
+ /* e1 = e0 * x0 */
+ emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
+
+ /* e2 = e1 - 3. */
+ emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
+ }
mhalf = force_reg (mode, mhalf);
if (recip)
ix86_expand_lround (rtx op0, rtx op1)
{
/* C code for the stuff we're doing below:
- tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
- return (long)tmp;
+ tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
+ return (long)tmp;
*/
machine_mode mode = GET_MODE (op1);
const struct real_format *fmt;
{
/* C code for the stuff we're doing below (for do_floor):
xi = (long)op1;
- xi -= (double)xi > op1 ? 1 : 0;
- return xi;
+ xi -= (double)xi > op1 ? 1 : 0;
+ return xi;
*/
machine_mode fmode = GET_MODE (op1);
machine_mode imode = GET_MODE (op0);
static rtx
ix86_gen_TWO52 (machine_mode mode)
{
+ const struct real_format *fmt;
REAL_VALUE_TYPE TWO52r;
rtx TWO52;
- real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
+ fmt = REAL_MODE_FORMAT (mode);
+ real_2expN (&TWO52r, fmt->p - 1, mode);
TWO52 = const_double_from_real_value (TWO52r, mode);
TWO52 = force_reg (mode, TWO52);
{
/* C code for the stuff we're doing below:
xa = fabs (operand1);
- if (!isless (xa, 2**52))
+ if (!isless (xa, 2**52))
return operand1;
- two52 = 2**52;
- if (flag_rounding_math)
+ two52 = 2**52;
+ if (flag_rounding_math)
{
two52 = copysign (two52, operand1);
xa = operand1;
}
- xa = xa + two52 - two52;
- return copysign (xa, operand1);
+ xa = xa + two52 - two52;
+ return copysign (xa, operand1);
*/
machine_mode mode = GET_MODE (operand0);
- rtx res, xa, TWO52, two52, mask;
+ rtx res, xa, TWO52, mask;
rtx_code_label *label;
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = copy_to_reg (operand1);
/* xa = abs (operand1) */
xa = ix86_expand_sse_fabs (res, &mask);
/* if (!isless (xa, TWO52)) goto label; */
- TWO52 = ix86_gen_TWO52 (mode);
label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
- two52 = TWO52;
if (flag_rounding_math)
{
- two52 = gen_reg_rtx (mode);
- ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
+ ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
xa = res;
}
- xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
- xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
-
- ix86_sse_copysign_to_positive (res, xa, res, mask);
-
- emit_label (label);
- LABEL_NUSES (label) = 1;
-
- emit_move_insn (operand0, res);
-}
-
-/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
- into OPERAND0. */
-void
-ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
-{
- /* C code for the stuff we expand below.
- double xa = fabs (x), x2;
- if (!isless (xa, TWO52))
- return x;
- xa = xa + TWO52 - TWO52;
- x2 = copysign (xa, x);
- Compensate. Floor:
- if (x2 > x)
- x2 -= 1;
- Compensate. Ceil:
- if (x2 < x)
- x2 += 1;
- if (HONOR_SIGNED_ZEROS (mode))
- x2 = copysign (x2, x);
- return x2;
- */
- machine_mode mode = GET_MODE (operand0);
- rtx xa, TWO52, tmp, one, res, mask;
- rtx_code_label *label;
-
- TWO52 = ix86_gen_TWO52 (mode);
-
- /* Temporary for holding the result, initialized to the input
- operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
-
- /* xa = abs (operand1) */
- xa = ix86_expand_sse_fabs (res, &mask);
-
- /* if (!isless (xa, TWO52)) goto label; */
- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
-
- /* xa = xa + TWO52 - TWO52; */
xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
- /* xa = copysign (xa, operand1) */
- ix86_sse_copysign_to_positive (xa, xa, res, mask);
+ /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
+ if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
+ xa = ix86_expand_sse_fabs (xa, NULL);
- /* generate 1.0 */
- one = force_reg (mode, const_double_from_real_value (dconst1, mode));
-
- /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
- tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
- xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
- if (!do_floor && HONOR_SIGNED_ZEROS (mode))
- ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
- emit_move_insn (res, tmp);
+ ix86_sse_copysign_to_positive (res, xa, res, mask);
emit_label (label);
LABEL_NUSES (label) = 1;
emit_move_insn (operand0, res);
}
-/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
- into OPERAND0. */
+/* Expand SSE2 sequence for computing floor or ceil
+ from OPERAND1 storing into OPERAND0. */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
/* C code for the stuff we expand below.
double xa = fabs (x), x2;
- if (!isless (xa, TWO52))
- return x;
+ if (!isless (xa, TWO52))
+ return x;
x2 = (double)(long)x;
+
Compensate. Floor:
if (x2 > x)
x2 -= 1;
Compensate. Ceil:
if (x2 < x)
x2 += 1;
+
if (HONOR_SIGNED_ZEROS (mode))
return copysign (x2, x);
return x2;
/* Temporary for holding the result, initialized to the input
operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+ res = copy_to_reg (operand1);
/* xa = abs (operand1) */
xa = ix86_expand_sse_fabs (res, &mask);
label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
/* xa = (double)(long)x */
- xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+ xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
expand_fix (xi, res, 0);
expand_float (xa, xi, 0);
emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
- emit_move_insn (res, tmp);
-
if (HONOR_SIGNED_ZEROS (mode))
- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+ {
+ /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
+ if (do_floor && flag_rounding_math)
+ tmp = ix86_expand_sse_fabs (tmp, NULL);
+
+ ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
+ }
+ emit_move_insn (res, tmp);
emit_label (label);
LABEL_NUSES (label) = 1;
emit_move_insn (operand0, res);
}
-/* Expand SSE sequence for computing round from OPERAND1 storing
- into OPERAND0. Sequence that works without relying on DImode truncation
- via cvttsd2siq that is only available on 64bit targets. */
+/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
+ into OPERAND0 without relying on DImode truncation via cvttsd2siq
+ that is only available on 64bit targets. */
void
-ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
+ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
/* C code for the stuff we expand below.
- double xa = fabs (x), xa2, x2;
- if (!isless (xa, TWO52))
- return x;
- Using the absolute value and copying back sign makes
- -0.0 -> -0.0 correct.
- xa2 = xa + TWO52 - TWO52;
- Compensate.
- dxa = xa2 - xa;
- if (dxa <= -0.5)
- xa2 += 1;
- else if (dxa > 0.5)
- xa2 -= 1;
- x2 = copysign (xa2, x);
- return x2;
+ double xa = fabs (x), x2;
+ if (!isless (xa, TWO52))
+ return x;
+ xa = xa + TWO52 - TWO52;
+ x2 = copysign (xa, x);
+
+ Compensate. Floor:
+ if (x2 > x)
+ x2 -= 1;
+ Compensate. Ceil:
+ if (x2 < x)
+ x2 += 1;
+
+ if (HONOR_SIGNED_ZEROS (mode))
+ x2 = copysign (x2, x);
+ return x2;
*/
machine_mode mode = GET_MODE (operand0);
- rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
+ rtx xa, TWO52, tmp, one, res, mask;
rtx_code_label *label;
TWO52 = ix86_gen_TWO52 (mode);
/* Temporary for holding the result, initialized to the input
operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+ res = copy_to_reg (operand1);
/* xa = abs (operand1) */
xa = ix86_expand_sse_fabs (res, &mask);
/* if (!isless (xa, TWO52)) goto label; */
label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
- /* xa2 = xa + TWO52 - TWO52; */
- xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
- xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
-
- /* dxa = xa2 - xa; */
- dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
-
- /* generate 0.5, 1.0 and -0.5 */
- half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
- one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
- mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
- 0, OPTAB_DIRECT);
+ /* xa = xa + TWO52 - TWO52; */
+ xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+ xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
- /* Compensate. */
- tmp = gen_reg_rtx (mode);
- /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
- xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
- /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
+ /* xa = copysign (xa, operand1) */
+ ix86_sse_copysign_to_positive (xa, xa, res, mask);
+
+ /* generate 1.0 */
+ one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+
+ /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
- xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
+ xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ if (HONOR_SIGNED_ZEROS (mode))
+ {
+ /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
+ if (do_floor && flag_rounding_math)
+ tmp = ix86_expand_sse_fabs (tmp, NULL);
- /* res = copysign (xa2, operand1) */
- ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
+ ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
+ }
+ emit_move_insn (res, tmp);
emit_label (label);
LABEL_NUSES (label) = 1;
emit_move_insn (operand0, res);
}
-/* Expand SSE sequence for computing trunc from OPERAND1 storing
- into OPERAND0. */
+/* Expand SSE sequence for computing trunc
+ from OPERAND1 storing into OPERAND0. */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
/* C code for SSE variant we expand below.
- double xa = fabs (x), x2;
- if (!isless (xa, TWO52))
- return x;
- x2 = (double)(long)x;
+ double xa = fabs (x), x2;
+ if (!isless (xa, TWO52))
+ return x;
+ x2 = (double)(long)x;
if (HONOR_SIGNED_ZEROS (mode))
return copysign (x2, x);
return x2;
/* Temporary for holding the result, initialized to the input
operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+ res = copy_to_reg (operand1);
/* xa = abs (operand1) */
xa = ix86_expand_sse_fabs (res, &mask);
/* if (!isless (xa, TWO52)) goto label; */
label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
- /* x = (double)(long)x */
- xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+ /* xa = (double)(long)x */
+ xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
expand_fix (xi, res, 0);
- expand_float (res, xi, 0);
+ expand_float (xa, xi, 0);
if (HONOR_SIGNED_ZEROS (mode))
- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+ ix86_sse_copysign_to_positive (xa, xa, res, mask);
+
+ emit_move_insn (res, xa);
emit_label (label);
LABEL_NUSES (label) = 1;
}
/* Expand SSE sequence for computing trunc from OPERAND1 storing
- into OPERAND0. */
+ into OPERAND0 without relying on DImode truncation via cvttsd2siq
+ that is only available on 64bit targets. */
void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
machine_mode mode = GET_MODE (operand0);
- rtx xa, mask, TWO52, one, res, smask, tmp;
+ rtx xa, xa2, TWO52, tmp, one, res, mask;
rtx_code_label *label;
/* C code for SSE variant we expand below.
- double xa = fabs (x), x2;
- if (!isless (xa, TWO52))
- return x;
- xa2 = xa + TWO52 - TWO52;
+ double xa = fabs (x), x2;
+ if (!isless (xa, TWO52))
+ return x;
+ xa2 = xa + TWO52 - TWO52;
Compensate:
- if (xa2 > xa)
- xa2 -= 1.0;
- x2 = copysign (xa2, x);
- return x2;
+ if (xa2 > xa)
+ xa2 -= 1.0;
+ x2 = copysign (xa2, x);
+ return x2;
*/
TWO52 = ix86_gen_TWO52 (mode);
/* Temporary for holding the result, initialized to the input
operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+ res = copy_to_reg (operand1);
/* xa = abs (operand1) */
- xa = ix86_expand_sse_fabs (res, &smask);
+ xa = ix86_expand_sse_fabs (res, &mask);
/* if (!isless (xa, TWO52)) goto label; */
label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
- /* res = xa + TWO52 - TWO52; */
- tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
- tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
- emit_move_insn (res, tmp);
+ /* xa2 = xa + TWO52 - TWO52; */
+ xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+ xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
/* generate 1.0 */
one = force_reg (mode, const_double_from_real_value (dconst1, mode));
- /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
- mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
- emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
+ /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
tmp = expand_simple_binop (mode, MINUS,
- res, mask, NULL_RTX, 0, OPTAB_DIRECT);
- emit_move_insn (res, tmp);
+ xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
+ if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
+ tmp = ix86_expand_sse_fabs (tmp, NULL);
- /* res = copysign (res, operand1) */
- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
+ /* res = copysign (xa2, operand1) */
+ ix86_sse_copysign_to_positive (res, tmp, res, mask);
emit_label (label);
LABEL_NUSES (label) = 1;
emit_move_insn (operand0, res);
}
-/* Expand SSE sequence for computing round from OPERAND1 storing
- into OPERAND0. */
+/* Expand SSE sequence for computing round
+ from OPERAND1 storing into OPERAND0. */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
/* C code for the stuff we're doing below:
- double xa = fabs (x);
- if (!isless (xa, TWO52))
- return x;
- xa = (double)(long)(xa + nextafter (0.5, 0.0));
- return copysign (xa, x);
+ double xa = fabs (x);
+ if (!isless (xa, TWO52))
+ return x;
+ xa = (double)(long)(xa + nextafter (0.5, 0.0));
+ return copysign (xa, x);
*/
machine_mode mode = GET_MODE (operand0);
rtx res, TWO52, xa, xi, half, mask;
/* Temporary for holding the result, initialized to the input
operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+ res = copy_to_reg (operand1);
TWO52 = ix86_gen_TWO52 (mode);
xa = ix86_expand_sse_fabs (res, &mask);
xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
/* xa = (double)(int64_t)xa */
- xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+ xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
expand_fix (xi, xa, 0);
expand_float (xa, xi, 0);
/* res = copysign (xa, operand1) */
- ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
+ ix86_sse_copysign_to_positive (res, xa, res, mask);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing round from OPERAND1 storing
+ into OPERAND0 without relying on DImode truncation via cvttsd2siq
+ that is only available on 64bit targets. */
+void
+ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
+{
+ /* C code for the stuff we expand below.
+ double xa = fabs (x), xa2, x2;
+ if (!isless (xa, TWO52))
+ return x;
+ Using the absolute value and copying back sign makes
+ -0.0 -> -0.0 correct.
+ xa2 = xa + TWO52 - TWO52;
+ Compensate.
+ dxa = xa2 - xa;
+ if (dxa <= -0.5)
+ xa2 += 1;
+ else if (dxa > 0.5)
+ xa2 -= 1;
+ x2 = copysign (xa2, x);
+ return x2;
+ */
+ machine_mode mode = GET_MODE (operand0);
+ rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
+ rtx_code_label *label;
+
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = copy_to_reg (operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &mask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* xa2 = xa + TWO52 - TWO52; */
+ xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+ xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
+
+ /* dxa = xa2 - xa; */
+ dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* generate 0.5, 1.0 and -0.5 */
+ half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
+ one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
+ mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
+ 0, OPTAB_DIRECT);
+
+ /* Compensate. */
+ /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
+ xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
+ xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* res = copysign (xa2, operand1) */
+ ix86_sse_copysign_to_positive (res, xa2, res, mask);
emit_label (label);
LABEL_NUSES (label) = 1;
return ok;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
using movss or movsd. */
static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
return false;
if (!(TARGET_SSE && vmode == V4SFmode)
+ && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
&& !(TARGET_SSE2 && vmode == V2DFmode))
return false;
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
machine_mode mmode, vmode = d->vmode;
- unsigned i, mask, nelt = d->nelt;
+ unsigned i, nelt = d->nelt;
+ unsigned HOST_WIDE_INT mask;
rtx target, op0, op1, maskop, x;
rtx rperm[32], vperm;
case E_V16SImode:
case E_V8DImode:
for (i = 0; i < nelt; ++i)
- mask |= (d->perm[i] >= nelt) << i;
+ mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
break;
case E_V2DImode:
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of the variable form of vpermilps.
Note that we will have already failed the immediate input vpermilps,
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
static bool
/* vpshufb only works intra lanes, it is not
possible to shuffle bytes in between the lanes. */
for (i = 0; i < nelt; ++i)
- if ((d->perm[i] ^ i) & (nelt / 4))
+ if ((d->perm[i] ^ i) & (3 * nelt / 4))
return false;
}
}
static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
in a single instruction. */
static bool
return false;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of a pair of pshuflw + pshufhw instructions. */
static bool
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
the permutation using the SSSE3 palignr instruction. This succeeds
when all of the elements in PERM fit within one vector and we merely
need to shift them down so that a single vector permutation has a
static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a two vector permutation into a single vector permutation by using
an interleave operation to merge the vectors. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a single vector cross-lane permutation into vpermq followed
by any of the single insn permutations. */
static bool canonicalize_perm (struct expand_vec_perm_d *d);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
a vector permutation using two instructions, vperm2f128 resp.
vperm2i128 followed by any single in-lane permutation. */
return false;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a two vector permutation using 2 intra-lane interleave insns
and cross-lane shuffle for 32-byte vectors. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
a single vector permutation using a single intra-lane vector
permutation, vperm2f128 swapping the lanes and vblend* insn blending
the non-swapped and swapped vectors together. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together. */
return true;
}
+static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
+
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
+   a two vector permutation using two intra-lane vector
+   permutations, vperm2f128 swapping the lanes and vblend* insn blending
+   the non-swapped and swapped vectors together.  */
+
+static bool
+expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dfirst, dsecond, dthird;
+ unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
+ rtx_insn *seq1, *seq2;
+ bool ok;
+ rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
+
+ /* vblendps/vblendpd on 256-bit float vectors is the AVX-only case;
+    with AVX2 other strategies are expected to win, so bail out.  */
+ if (!TARGET_AVX
+ || TARGET_AVX2
+ || (d->vmode != V8SFmode && d->vmode != V4DFmode)
+ || d->one_operand_p)
+ return false;
+
+ /* Mark every slot of both work permutations as unset (0xff).  */
+ dfirst = *d;
+ dsecond = *d;
+ for (i = 0; i < nelt; i++)
+ {
+ dfirst.perm[i] = 0xff;
+ dsecond.perm[i] = 0xff;
+ }
+ /* Route each element either to DFIRST (its source already sits in the
+    destination's 128-bit lane) or to DSECOND (it needs the lanes
+    swapped afterwards).  MSK records which result slots are taken from
+    the lane-swapped vector, i.e. the eventual vblend* immediate.  */
+ for (i = 0, msk = 0; i < nelt; i++)
+ {
+ j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
+ if (j == i)
+ {
+ dfirst.perm[j] = d->perm[i];
+ which1 |= (d->perm[i] < nelt ? 1 : 2);
+ }
+ else
+ {
+ dsecond.perm[j] = d->perm[i];
+ which2 |= (d->perm[i] < nelt ? 1 : 2);
+ msk |= (1U << i);
+ }
+ }
+ /* If every element (or no element) needs the lane swap, a cheaper
+    sequence should have been found already; give up.  */
+ if (msk == 0 || msk == (1U << nelt) - 1)
+ return false;
+
+ if (!d->testing_p)
+ {
+ dfirst.target = gen_reg_rtx (dfirst.vmode);
+ dsecond.target = gen_reg_rtx (dsecond.vmode);
+ }
+
+ /* Fill the unset slots with harmless identity indexes, drawn from
+    whichever operand each sub-permutation actually uses.  */
+ for (i = 0; i < nelt; i++)
+ {
+ if (dfirst.perm[i] == 0xff)
+ dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
+ if (dsecond.perm[i] == 0xff)
+ dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
+ }
+ /* Expand both sub-permutations into detached sequences so nothing
+    is emitted into the instruction stream if either one fails.  */
+ canonicalize_perm (&dfirst);
+ start_sequence ();
+ ok = ix86_expand_vec_perm_const_1 (&dfirst);
+ seq1 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ canonicalize_perm (&dsecond);
+ start_sequence ();
+ ok = ix86_expand_vec_perm_const_1 (&dsecond);
+ seq2 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ emit_insn (seq1);
+ emit_insn (seq2);
+
+ /* Swap the two 128-bit lanes of DSECOND's result (vperm2f128).  */
+ dthird = *d;
+ dthird.op0 = dsecond.target;
+ dthird.op1 = dsecond.target;
+ dthird.one_operand_p = true;
+ dthird.target = gen_reg_rtx (dthird.vmode);
+ for (i = 0; i < nelt; i++)
+ dthird.perm[i] = i ^ nelt2;
+
+ ok = expand_vec_perm_1 (&dthird);
+ gcc_assert (ok);
+
+ /* Blend the in-lane and lane-swapped results together; MSK selects
+    the elements coming from the swapped vector.  */
+ blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
+ emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
+ return true;
+}
+
/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
permutation with two pshufb insns and an ior. We should have already
failed all two instruction sequences. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
and extract-odd permutations. */
static bool
case E_V2DFmode:
case E_V4SFmode:
case E_V2DImode:
+ case E_V2SImode:
case E_V4SImode:
/* These are always directly implementable by expand_vec_perm_1. */
gcc_unreachable ();
+ case E_V2SFmode:
+ gcc_assert (TARGET_MMX_WITH_SSE);
+ /* We have no suitable instructions. */
+ if (d->testing_p)
+ return false;
+ break;
+
+ case E_V4HImode:
+ if (d->testing_p)
+ break;
+ /* We need 2*log2(N)-1 operations to achieve odd/even
+ with interleave. */
+ t1 = gen_reg_rtx (V4HImode);
+ emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
+ emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
+ if (odd)
+ t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
+ else
+ t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
+ emit_insn (t2);
+ break;
+
case E_V8HImode:
if (TARGET_SSE4_1)
return expand_vec_perm_even_odd_pack (d);
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
extract-even and extract-odd permutations. */
static bool
return expand_vec_perm_even_odd_1 (d, odd);
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
permutations. We assume that expand_vec_perm_1 has already failed. */
static bool
gcc_unreachable ();
case E_V2DFmode:
- case E_V2DImode:
+ case E_V2SFmode:
case E_V4SFmode:
+ case E_V2DImode:
+ case E_V2SImode:
case E_V4SImode:
/* These are always implementable using standard shuffle patterns. */
gcc_unreachable ();
}
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
broadcast permutations. */
static bool
return true;
}
+ /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
+ if (expand_vec_perm2_vperm2f128_vblend (d))
+ return true;
+
return false;
}
int i, which, nelt = d->nelt;
for (i = which = 0; i < nelt; ++i)
- which |= (d->perm[i] < nelt ? 1 : 2);
+ which |= (d->perm[i] < nelt ? 1 : 2);
d->one_operand_p = true;
switch (which)
if (d.testing_p && TARGET_SSSE3)
return true;
break;
+ case E_V2SFmode:
+ case E_V2SImode:
+ case E_V4HImode:
+ if (!TARGET_MMX_WITH_SSE)
+ return false;
+ break;
case E_V2DImode:
case E_V2DFmode:
if (!TARGET_SSE)
d.one_operand_p = (which != 3);
/* Implementable with shufps or pshufd. */
- if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
+ if (d.one_operand_p
+ && (d.vmode == V4SFmode || d.vmode == V2SFmode
+ || d.vmode == V4SImode || d.vmode == V2SImode))
return true;
/* Otherwise we have to go through the motions and see if we can
two_args = canonicalize_perm (&d);
+ /* If one of the operands is a zero vector, try to match pmovzx. */
+ if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
+ {
+ struct expand_vec_perm_d dzero = d;
+ if (d.op0 == CONST0_RTX (vmode))
+ {
+ d.op1 = dzero.op1 = force_reg (vmode, d.op1);
+ std::swap (dzero.op0, dzero.op1);
+ for (i = 0; i < nelt; ++i)
+ dzero.perm[i] ^= nelt;
+ }
+ else
+ d.op0 = dzero.op0 = force_reg (vmode, d.op0);
+
+ if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
+ dzero.perm, nelt, dzero.testing_p))
+ return true;
+ }
+
+ /* Force operands into registers. */
+ rtx nop0 = force_reg (vmode, d.op0);
+ if (d.op0 == d.op1)
+ d.op1 = nop0;
+ d.op0 = nop0;
+ d.op1 = force_reg (vmode, d.op1);
+
if (ix86_expand_vec_perm_const_1 (&d))
return true;
gcc_assert (ok);
}
+/* Optimize vector multiply for V8QI/V16QI/V32QI by widening to the
+   corresponding HImode vector under TARGET_AVX512BW.  E.g. for
+   v16qi a * b this emits
+
+   vpmovzxbw ymm2, xmm0
+   vpmovzxbw ymm3, xmm1
+   vpmullw ymm4, ymm2, ymm3
+   vpmovwb xmm0, ymm4
+
+   which takes fewer instructions than ix86_expand_vecop_qihi.
+   Return true if success.  */
+
+bool
+ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
+{
+ machine_mode qimode = GET_MODE (dest);
+ machine_mode himode;
+ rtx (*extend_fn) (rtx, rtx);
+ rtx (*trunc_fn) (rtx, rtx);
+
+ /* There's no V64HImode multiplication instruction.  */
+ if (qimode == E_V64QImode)
+ return false;
+
+ /* vpmovwb only available under AVX512BW.  */
+ if (!TARGET_AVX512BW)
+ return false;
+
+ /* The 128-bit and 256-bit forms additionally require AVX512VL.  */
+ if ((qimode == V8QImode || qimode == V16QImode) && !TARGET_AVX512VL)
+ return false;
+
+ /* Don't generate zmm instructions when a 128/256-bit vector width
+    is preferred.  */
+ if (qimode == V32QImode && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
+ return false;
+
+ /* Pick the widened mode and the zero-extend/truncate expanders.  */
+ switch (qimode)
+ {
+ case E_V8QImode:
+ himode = V8HImode;
+ extend_fn = gen_zero_extendv8qiv8hi2;
+ trunc_fn = gen_truncv8hiv8qi2;
+ break;
+ case E_V16QImode:
+ himode = V16HImode;
+ extend_fn = gen_zero_extendv16qiv16hi2;
+ trunc_fn = gen_truncv16hiv16qi2;
+ break;
+ case E_V32QImode:
+ himode = V32HImode;
+ extend_fn = gen_zero_extendv32qiv32hi2;
+ trunc_fn = gen_truncv32hiv32qi2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Widen both operands, multiply in HImode, then narrow the result.  */
+ rtx wop1 = gen_reg_rtx (himode);
+ rtx wop2 = gen_reg_rtx (himode);
+ rtx wdest = gen_reg_rtx (himode);
+ emit_insn (extend_fn (wop1, op1));
+ emit_insn (extend_fn (wop2, op2));
+ emit_insn (gen_rtx_SET (wdest, simplify_gen_binary (MULT, himode,
+ wop1, wop2)));
+ emit_insn (trunc_fn (dest, wdest));
+ return true;
+}
+
+/* Expand a vector operation shift by constant for a V*QImode in terms of the
+   same operation on V*HImode, masking off the bits that bleed across byte
+   boundaries (and, for ASHIFTRT, restoring sign extension).
+   Return true if success.  */
+bool
+ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode qimode, himode;
+ HOST_WIDE_INT and_constant, xor_constant;
+ HOST_WIDE_INT shift_amount;
+ rtx vec_const_and, vec_const_xor;
+ rtx tmp, op1_subreg;
+ rtx (*gen_shift) (rtx, rtx, rtx);
+ rtx (*gen_and) (rtx, rtx, rtx);
+ rtx (*gen_xor) (rtx, rtx, rtx);
+ rtx (*gen_sub) (rtx, rtx, rtx);
+
+ /* Only optimize shift by constant.  */
+ if (!CONST_INT_P (op2))
+ return false;
+
+ qimode = GET_MODE (dest);
+ shift_amount = INTVAL (op2);
+ /* Do nothing when the shift amount is greater than or equal to 8;
+    such a shift of a QImode element is degenerate.  */
+ if (shift_amount > 7)
+ return false;
+
+ gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+ /* Record the position of the sign bit of the shifted result; for
+    ASHIFTRT it is xor'ed in and subtracted back out below to sign
+    extend each byte.  */
+ xor_constant = 1 << (8 - shift_amount - 1);
+
+ /* Zero upper/lower bits shift from left/right element.  */
+ and_constant
+ = (code == ASHIFT ? 256 - (1 << shift_amount)
+ : (1 << (8 - shift_amount)) - 1);
+
+ /* Note: machine-mode case labels need the E_ prefix, matching the
+    other mode switches in this file.  */
+ switch (qimode)
+ {
+ case E_V16QImode:
+ himode = V8HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv8hi3
+ : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
+ gen_and = gen_andv16qi3;
+ gen_xor = gen_xorv16qi3;
+ gen_sub = gen_subv16qi3;
+ break;
+ case E_V32QImode:
+ himode = V16HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv16hi3
+ : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
+ gen_and = gen_andv32qi3;
+ gen_xor = gen_xorv32qi3;
+ gen_sub = gen_subv32qi3;
+ break;
+ case E_V64QImode:
+ himode = V32HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv32hi3
+ : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
+ gen_and = gen_andv64qi3;
+ gen_xor = gen_xorv64qi3;
+ gen_sub = gen_subv64qi3;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ tmp = gen_reg_rtx (himode);
+ vec_const_and = gen_reg_rtx (qimode);
+ op1_subreg = lowpart_subreg (himode, op1, qimode);
+
+ /* For ASHIFT and LSHIFTRT, perform operation like
+    vpsllw/vpsrlw $shift_amount, %op1, %dest.
+    vpand %vec_const_and, %dest.  */
+ emit_insn (gen_shift (tmp, op1_subreg, op2));
+ emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
+ emit_move_insn (vec_const_and,
+ ix86_build_const_vector (qimode, true,
+ gen_int_mode (and_constant, QImode)));
+ emit_insn (gen_and (dest, dest, vec_const_and));
+
+ /* For ASHIFTRT, perform extra operation like
+    vpxor %vec_const_xor, %dest, %dest
+    vpsubb %vec_const_xor, %dest, %dest  */
+ if (code == ASHIFTRT)
+ {
+ vec_const_xor = gen_reg_rtx (qimode);
+ emit_move_insn (vec_const_xor,
+ ix86_build_const_vector (qimode, true,
+ gen_int_mode (xor_constant, QImode)));
+ emit_insn (gen_xor (dest, dest, vec_const_xor));
+ emit_insn (gen_sub (dest, dest, vec_const_xor));
+ }
+ return true;
+}
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
emit_insn (gen_vec_widen_umult_even_v4si (t5,
gen_lowpart (V4SImode, op1),
gen_lowpart (V4SImode, op2)));
- op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
-
+ force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
}
else
{
should be encoded with notrack prefix. */
bool
-ix86_notrack_prefixed_insn_p (rtx insn)
+ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
if (!insn || !((flag_cf_protection & CF_BRANCH)))
return false;
case E_V4SImode:
case E_V2DImode:
case E_V1TImode:
- case E_TImode:
{
machine_mode srcmode, dstmode;
rtx d, pat;
case E_V4SImode:
case E_V2DImode:
case E_V1TImode:
- case E_TImode:
{
machine_mode srcmode, dstmode;
rtx (*pinsr)(rtx, rtx, rtx, rtx);