AArch64: memcpy/memset expansions should not emit LDP/STP [PR113618]

author Wilco Dijkstra <wilco.dijkstra@arm.com>
Wed, 21 Feb 2024 23:33:58 +0000 (23:33 +0000)
committer Wilco Dijkstra <wilco.dijkstra@arm.com>
Thu, 7 Mar 2024 21:25:23 +0000 (21:25 +0000)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 16318bf925883ecedf9345e53fc0824a553b2747..0a28e033088a00818c6ed9fa8c15ecdee5a86c35 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26465,33 +26465,6 @@ aarch64_progress_pointer (rtx pointer)
    return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
  }
  
-typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
-
-/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
-static void
-aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
-                       int offset, machine_mode mode)
-{
-  /* Emit explict load/store pair instructions for 32-byte copies.  */
-  if (known_eq (GET_MODE_SIZE (mode), 32))
-    {
-      mode = V4SImode;
-      rtx src1 = adjust_address (src, mode, offset);
-      rtx dst1 = adjust_address (dst, mode, offset);
-      rtx reg1 = gen_reg_rtx (mode);
-      rtx reg2 = gen_reg_rtx (mode);
-      rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
-      rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
-      ops.safe_push ({ load, store });
-      return;
-    }
-
-  rtx reg = gen_reg_rtx (mode);
-  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
-  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
-  ops.safe_push ({ load, store });
-}
-
  /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
     from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
     rather than memcpy.  Return true iff we succeeded.  */
@@ -26527,7 +26500,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
    rtx src = operands[1];
    unsigned align = UINTVAL (operands[3]);
    rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
  
    /* Variable-sized or strict-align copies may use the MOPS expansion.  */
    if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
@@ -26550,16 +26523,12 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
    if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
      return aarch64_expand_cpymem_mops (operands, is_memmove);
  
-  unsigned copy_max = 32;
-
-  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
-     support or slow LDP/STP fall back to 16-byte chunks.
-
+  /* Default to 32-byte LDP/STP on large copies, however small copies or
+     no SIMD support fall back to 16-byte chunks.
       ??? Although it would be possible to use LDP/STP Qn in streaming mode
       (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
       whether that would improve performance.  */
-  if (size <= 24 || !use_ldpq)
-    copy_max = 16;
+  bool use_qregs = size > 24 && TARGET_SIMD;
  
    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -26567,7 +26536,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
    src = adjust_automodify_address (src, VOIDmode, base, 0);
  
-  copy_ops ops;
+  auto_vec<std::pair<rtx, rtx>, 16> ops;
    int offset = 0;
  
    while (size > 0)
@@ -26576,23 +26545,27 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
          or writing.  */
        opt_scalar_int_mode mode_iter;
        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
-         cur_mode = mode_iter.require ();
+       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
+         mode = mode_iter.require ();
  
-      gcc_assert (cur_mode != BLKmode);
+      gcc_assert (mode != BLKmode);
+
+      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
  
-      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+      /* Prefer Q-register accesses.  */
+      if (mode_bytes == 16 && use_qregs)
+       mode = V4SImode;
  
-      /* Prefer Q-register accesses for the last bytes.  */
-      if (mode_bytes == 16 && copy_max == 32)
-       cur_mode = V4SImode;
-      aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
+      rtx reg = gen_reg_rtx (mode);
+      rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+      rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+      ops.safe_push ({ load, store });
        size -= mode_bytes;
        offset += mode_bytes;
  
        /* Emit trailing copies using overlapping unaligned accesses
          (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
+      if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
         {
           next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
           int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
@@ -26604,7 +26577,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
  
    /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
    int nops = ops.length();
-  int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
+  int inc = is_memmove || nops <= 8 ? nops : 6;
  
    for (int i = 0; i < nops; i += inc)
      {
@@ -26633,7 +26606,8 @@ aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
        /* "Cast" the *dst to the correct mode.  */
        *dst = adjust_address (*dst, mode, 0);
        /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (*dst, src, src));
+      emit_move_insn (*dst, src);
+      emit_move_insn (aarch64_move_pointer (*dst, 16), src);
  
        /* Move the pointers forward.  */
        *dst = aarch64_move_pointer (*dst, 32);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113618.c b/gcc/testsuite/gcc.target/aarch64/pr113618.c
new file mode 100644
index 0000000..f582360
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113618.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+
+/*
+** move32:
+**     ...
+**     ldp     q([0-9]+), q([0-9]+), \[x0\]
+**     stp     q\1, q\2, \[x1\]
+**     ...
+*/
+
+void move32 (char *a, char *b)
+{
+    char temp[32];
+    __builtin_memcpy (temp, a, 32);
+    __builtin_memcpy (b, temp, 32);
+}
+
+/*
+** move64:
+**     ...
+**     ldp     q([0-9]+), q([0-9]+), \[x0\]
+**     ldp     q([0-9]+), q([0-9]+), \[x0, 32\]
+**     stp     q\1, q\2, \[x1\]
+**     stp     q\3, q\4, \[x1, 32\]
+**     ...
+*/
+
+void move64 (char *a, char *b)
+{
+    char temp[64];
+    __builtin_memcpy (temp, a, 64);
+    __builtin_memcpy (b, temp, 64);
+}
author	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Wed, 21 Feb 2024 23:33:58 +0000 (23:33 +0000)
committer	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Thu, 7 Mar 2024 21:25:23 +0000 (21:25 +0000)
gcc/config/aarch64/aarch64.cc		patch | blob | blame | history
gcc/testsuite/gcc.target/aarch64/pr113618.c	[new file with mode: 0644]	patch | blob