/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* Mult. */
+ COSTS_N_INSNS (4), /* Mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* Mult. */
+ COSTS_N_INSNS (4), /* Mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
)
(define_insn "aarch64_simd_dup<mode>"
- [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+ [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
(vec_duplicate:VDQF_F16
- (match_operand:<VEL> 1 "register_operand" "w")))]
+ (match_operand:<VEL> 1 "register_operand" "w,r")))]
"TARGET_SIMD"
- "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
- [(set_attr "type" "neon_dup<q>")]
+ "@
+ dup\\t%0.<Vtype>, %1.<Vetype>[0]
+ dup\\t%0.<Vtype>, %<vw>1"
+ [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
)
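
As an aside (not part of the patch): the new "w,r" alternative lets a scalar that lives in a general-purpose register be broadcast with a single DUP, instead of an FMOV into the FP register file followed by a lane dup. A minimal sketch, assuming the register allocator selects the new alternative:

  #include <arm_neon.h>
  #include <string.h>

  /* Hypothetical example: 'bits' arrives in w0.  With the second
     alternative the broadcast can be emitted as "dup v0.4s, w0".  */
  float32x4_t
  dup_from_gp (unsigned bits)
  {
    float f;
    memcpy (&f, &bits, sizeof f);  /* Value stays in a GP register.  */
    return vdupq_n_f32 (f);
  }
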
(define_insn "aarch64_dup_lane<mode>"
rtx op0, op1, op2;
const struct cpu_cost_table *extra_cost
= aarch64_tune_params.insn_extra_cost;
- int code = GET_CODE (x);
+ rtx_code code = GET_CODE (x);
scalar_int_mode int_mode;
/* By default, assume that everything has equivalent cost to the
we must cost the explicit register move. */
if (mode == DImode
- && GET_MODE (op0) == SImode
- && outer == SET)
+ && GET_MODE (op0) == SImode)
{
int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
mode, MULT, 1, speed);
return true;
}
+ break;
+ case CONST_VECTOR:
+ {
+ /* Load using MOVI/MVNI. */
+ if (aarch64_simd_valid_immediate (x, NULL))
+ *cost = extra_cost->vect.movi;
+ else /* Load using constant pool. */
+ *cost = extra_cost->ldst.load;
+ break;
+ }
+ case VEC_CONCAT:
+ /* Depending on the operation, either DUP or INS.
+ For now, keep default costing. */
+ break;
+ case VEC_DUPLICATE:
+ /* Load using a DUP. */
+ *cost = extra_cost->vect.dup;
+ return false;
+ case VEC_SELECT:
+ {
+ rtx op0 = XEXP (x, 0);
+ *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
- /* Fall through. */
+ /* A select of the low part is a free subreg; the high part costs a DUP, any other lane an explicit extract. */
+ rtx op1 = XEXP (x, 1);
+ if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+ ;
+ else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+ *cost = extra_cost->vect.dup;
+ else
+ *cost = extra_cost->vect.extract;
+ return true;
+ }
default:
break;
}
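
For reference, a sketch of constants that exercise the two new CONST_VECTOR paths (illustration only; the exact sequence chosen depends on target and tuning):

  #include <arm_neon.h>

  uint32x4_t
  movi_const (void)
  {
    /* 0xff is a valid SIMD immediate, so the constant is costed as
       vect.movi (a single MOVI).  */
    return vdupq_n_u32 (0xff);
  }

  uint32x4_t
  pool_const (void)
  {
    /* 0x12345678 is not encodable as a SIMD immediate, so the
       constant is costed as a load (ldst.load), typically
       adrp + ldr from the literal pool.  */
    return vdupq_n_u32 (0x12345678);
  }
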
{
const int alu;
const int mult;
+ const int movi;
+ const int dup;
+ const int extract;
};
struct cpu_cost_table
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (0), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
/* Vector */
{
COSTS_N_INSNS (2), /* alu. */
- COSTS_N_INSNS (8) /* mult. */
+ COSTS_N_INSNS (8), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
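
For scale, GCC expresses rtx costs in quarter-instruction units:

  /* From gcc/rtl.h.  */
  #define COSTS_N_INSNS(N) ((N) * 4)

so the entries above rate a DUP or a lane extract at twice the cost of a MOVI or a plain vector ALU operation.
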
--- /dev/null
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test1:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** str q[0-9]+, \[x[0-9]+\]
+** fmov x[0-9]+, d[0-9]+
+** orr x[0-9]+, x[0-9]+, x[0-9]+
+** ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+ uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL };
+ uint64_t res = a | arr[0];
+ uint64x2_t val = vld1q_u64 (arr);
+ *rt = vaddq_u64 (val, b);
+ return res;
+}
+
+/*
+**test2:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** str q[0-9]+, \[x[0-9]+\]
+** fmov x[0-9]+, d[0-9]+
+** orr x[0-9]+, x[0-9]+, x[0-9]+
+** ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+ uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+ uint64_t arr = vgetq_lane_u64 (val, 0);
+ uint64_t res = a | arr;
+ *rt = vaddq_u64 (val, b);
+ return res;
+}
+
+/*
+**test3:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** str q[0-9]+, \[x1\]
+** fmov w[0-9]+, s[0-9]+
+** orr w[0-9]+, w[0-9]+, w[0-9]+
+** ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+ uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+ uint32_t res = a | arr[0];
+ uint32x4_t val = vld1q_u32 (arr);
+ *rt = vaddq_u32 (val, b);
+ return res;
+}
+
+/*
+**test4:
+** ushr v[0-9]+.16b, v[0-9]+.16b, 7
+** mov x[0-9]+, 16512
+** movk x[0-9]+, 0x1020, lsl 16
+** movk x[0-9]+, 0x408, lsl 32
+** movk x[0-9]+, 0x102, lsl 48
+** fmov d[0-9]+, x[0-9]+
+** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
+** dup v[0-9]+.2d, v[0-9]+.d\[0\]
+** pmull2 v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
+** trn2 v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** umov w[0-9]+, v[0-9]+.h\[3\]
+** ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+ uint8x16_t bool_input = vshrq_n_u8 (input, 7);
+ poly64x2_t mask = vdupq_n_p64 (0x0102040810204080UL);
+ poly64_t prodL = vmull_p64 ((poly64_t) vgetq_lane_p64 ((poly64x2_t) bool_input, 0),
+                             vgetq_lane_p64 (mask, 0));
+ poly64_t prodH = vmull_high_p64 ((poly64x2_t) bool_input, mask);
+ uint8x8_t res = vtrn2_u8 ((uint8x8_t) prodL, (uint8x8_t) prodH);
+ return vget_lane_u16 ((uint16x4_t) res, 3);
+}
+
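
The lane-extract costing shows up directly in these tests: test1 and test2 read lane 0 of a 128-bit vector, which the new VEC_SELECT handling treats as a cheap subreg (a single fmov to the general file), so CSE keeps the vector value live instead of rematerializing the constant on the scalar side. A minimal sketch of the two lane cases (illustration only, not part of the test):

  #include <arm_neon.h>

  /* Lane 0 is the low-part subreg, typically "fmov x0, d0"; lane 1
     needs a real element extract such as "fmov x0, v0.d[1]".  */
  uint64_t low_lane (uint64x2_t v) { return vgetq_lane_u64 (v, 0); }
  uint64_t high_lane (uint64x2_t v) { return vgetq_lane_u64 (v, 1); }
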