return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}
-typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
-
-/* Copy one block of size MODE from SRC to DST at offset OFFSET. */
-static void
-aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
- int offset, machine_mode mode)
-{
- /* Emit explict load/store pair instructions for 32-byte copies. */
- if (known_eq (GET_MODE_SIZE (mode), 32))
- {
- mode = V4SImode;
- rtx src1 = adjust_address (src, mode, offset);
- rtx dst1 = adjust_address (dst, mode, offset);
- rtx reg1 = gen_reg_rtx (mode);
- rtx reg2 = gen_reg_rtx (mode);
- rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
- rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
- ops.safe_push ({ load, store });
- return;
- }
-
- rtx reg = gen_reg_rtx (mode);
- rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
- rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
- ops.safe_push ({ load, store });
-}
-
/* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
rather than memcpy. Return true iff we succeeded. */
rtx src = operands[1];
unsigned align = UINTVAL (operands[3]);
rtx base;
- machine_mode cur_mode = BLKmode, next_mode;
+ machine_mode mode = BLKmode, next_mode;
/* Variable-sized or strict-align copies may use the MOPS expansion. */
if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
return aarch64_expand_cpymem_mops (operands, is_memmove);
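(As a standalone sketch of the dispatch above, not part of the patch: the enum, function and parameter names below are invented for illustration; the real limits come from the tuning tables and the MOPS size threshold.)

enum copy_strategy { COPY_INLINE, COPY_MOPS, COPY_LIBCALL };

/* Hypothetical model: decide how a copy of SIZE bytes gets expanded.  */
static copy_strategy
classify_copy (bool const_size, unsigned long size, unsigned align,
	       bool strict_align, bool have_mops,
	       unsigned long max_copy_size, unsigned long mops_threshold)
{
  /* Variable-sized or under-aligned strict-alignment copies cannot be
     expanded inline: use MOPS if available, else a library call.  */
  if (!const_size || (strict_align && align < 16))
    return have_mops ? COPY_MOPS : COPY_LIBCALL;

  /* Large constant sizes also leave the inline path.  */
  if (size > max_copy_size || (have_mops && size > mops_threshold))
    return have_mops ? COPY_MOPS : COPY_LIBCALL;

  return COPY_INLINE;
}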
- unsigned copy_max = 32;
-
- /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
- support or slow LDP/STP fall back to 16-byte chunks.
-
+ /* Default to 32-byte LDP/STP on large copies; small copies and targets
+    without SIMD support fall back to 16-byte chunks.
??? Although it would be possible to use LDP/STP Qn in streaming mode
(so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
whether that would improve performance. */
- if (size <= 24 || !use_ldpq)
- copy_max = 16;
+ bool use_qregs = size > 24 && TARGET_SIMD;
base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
dst = adjust_automodify_address (dst, VOIDmode, base, 0);
base = copy_to_mode_reg (Pmode, XEXP (src, 0));
src = adjust_automodify_address (src, VOIDmode, base, 0);
- copy_ops ops;
+ auto_vec<std::pair<rtx, rtx>, 16> ops;
int offset = 0;
while (size > 0)
or writing. */
opt_scalar_int_mode mode_iter;
FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
- if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
- cur_mode = mode_iter.require ();
+ if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
+ mode = mode_iter.require ();
- gcc_assert (cur_mode != BLKmode);
+ gcc_assert (mode != BLKmode);
+
+ mode_bytes = GET_MODE_SIZE (mode).to_constant ();
- mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+ /* Prefer Q-register accesses. */
+ if (mode_bytes == 16 && use_qregs)
+ mode = V4SImode;
- /* Prefer Q-register accesses for the last bytes. */
- if (mode_bytes == 16 && copy_max == 32)
- cur_mode = V4SImode;
- aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
+ rtx reg = gen_reg_rtx (mode);
+ rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+ rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+ ops.safe_push ({ load, store });
size -= mode_bytes;
offset += mode_bytes;
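(The chunk sequence this loop produces for a constant size can be modelled by the following standalone sketch; print_chunks is an invented name, and the overlapping trailing copy handled just below is deliberately left out.)

#include <stdio.h>

/* Hypothetical model: widest power-of-two chunk first, capped at 16
   bytes, with 16-byte chunks done as Q-register moves when allowed.  */
static void
print_chunks (unsigned size, bool use_qregs)
{
  while (size > 0)
    {
      unsigned bytes = 1;
      while (bytes * 2 <= size && bytes * 2 <= 16)
	bytes *= 2;
      printf ("%u-byte %s copy\n", bytes,
	      bytes == 16 && use_qregs ? "Q-register" : "integer");
      size -= bytes;
    }
}

For example, print_chunks (32, true) prints two 16-byte Q-register copies, which later form the 32-byte LDP/STP Qn pair the comment above refers to.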
/* Emit trailing copies using overlapping unaligned accesses
(when !STRICT_ALIGNMENT) - this is smaller and faster. */
- if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
+ if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
{
next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
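(To make the overlap concrete: a 13-byte copy becomes two 8-byte accesses, the second pulled back so it covers bytes 5..12. A standalone illustration with an invented name, valid only because unaligned accesses are permitted here.)

#include <string.h>

/* Hypothetical illustration of the overlapping tail: both loads are
   performed before either store, as the expander also guarantees.  */
static void
copy13 (char *dst, const char *src)
{
  unsigned long long a, b;
  memcpy (&a, src, 8);		/* bytes 0..7  */
  memcpy (&b, src + 5, 8);	/* bytes 5..12, overlapping bytes 5..7 */
  memcpy (dst, &a, 8);
  memcpy (dst + 5, &b, 8);
}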
/* Memcpy interleaves loads with stores, memmove emits all loads first. */
int nops = ops.length();
- int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
+ int inc = is_memmove || nops <= 8 ? nops : 6;
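(The loop that follows issues the queued (load, store) pairs in groups of INC. As a standalone model, not a quotation of the patch, with R, emit and emit_grouped invented: each group emits all of its loads before any of its stores, so in the memmove case, where inc == nops gives a single group, no byte of an overlapping destination is written before it has been read; large memcpy uses groups of 6 to keep register pressure bounded.)

#include <vector>
#include <utility>

struct R {};			/* stands in for an rtx insn */
static void emit (R) {}		/* stands in for emit_insn   */

static void
emit_grouped (const std::vector<std::pair<R, R>> &ops, int inc)
{
  int nops = (int) ops.size ();
  for (int i = 0; i < nops; i += inc)
    {
      int limit = nops < i + inc ? nops : i + inc;
      for (int j = i; j < limit; j++)
	emit (ops[j].first);	/* the group's loads  */
      for (int j = i; j < limit; j++)
	emit (ops[j].second);	/* the group's stores */
    }
}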
for (int i = 0; i < nops; i += inc)
{
/* "Cast" the *dst to the correct mode. */
*dst = adjust_address (*dst, mode, 0);
/* Emit the memset. */
- emit_insn (aarch64_gen_store_pair (*dst, src, src));
+ emit_move_insn (*dst, src);
+ emit_move_insn (aarch64_move_pointer (*dst, 16), src);
/* Move the pointers forward. */
*dst = aarch64_move_pointer (*dst, 32);
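(In plain C, one iteration of this 32-byte setmem step behaves like the sketch below; set_one_block_32 and v16u8 are invented names, and v16u8 stands in for the replicated 16-byte SIMD value held in src above.)

#include <string.h>

typedef struct { unsigned char b[16]; } v16u8;

/* Hypothetical model: store the replicated value twice, then advance
   the destination by 32 bytes.  */
static unsigned char *
set_one_block_32 (unsigned char *dst, v16u8 val)
{
  memcpy (dst, &val, 16);	/* first 16-byte store  */
  memcpy (dst + 16, &val, 16);	/* second 16-byte store */
  return dst + 32;
}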