This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[AArch64] Implement movmem for the benefit of inline memcpy
- From: James Greenhalgh <james dot greenhalgh at arm dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: marcus dot shawcroft at arm dot com
- Date: Fri, 6 Jun 2014 09:50:09 +0100
- Subject: [AArch64] Implement movmem for the benefit of inline memcpy
- Authentication-results: sourceware.org; auth=none
Hi,
The move_by_pieces infrastructure performs a copy by repeatedly trying
the largest safe copy it can make. So for a 15-byte copy we might see:
offset amount bytes copied
0 8 0-7
8 4 8-11
12 2 12-13
14 1 14
However, we can implement a 15-byte copy as so:
offset amount bytes copied
0 8 0-7
7 8 7-14
Which can prove more efficient for both space and speed.
In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
implement the movmem pattern name to expand small block copy cases. Note, this
optimization does not apply for -mstrict-align targets, which must continue
copying byte-by-byte.
Setting MOVE_RATIO low in this way causes two tests to begin failing;
both are documented in their test-cases as expected to fail for
low MOVE_RATIO targets, which do not allow certain tree-level optimizations.
Bootstrapped on aarch64-unknown-linux-gnu with no issues.
OK for trunk?
Thanks,
James
---
gcc/
2014-06-06 James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
* config/aarch64/aarch64.c (aarch64_move_pointer): New.
(aarch64_progress_pointer): Likewise.
(aarch64_copy_one_block_and_progress_pointers): Likewise.
(aarch64_expand_movmem): Likewise.
* config/aarch64/aarch64.h (MOVE_RATIO): Set low.
* config/aarch64/aarch64.md (movmem<mode>): New.
gcc/testsuite/
2014-06-06 James Greenhalgh <james.greenhalgh@arm.com>
* gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
* gcc.dg/tree-ssa/sra-12.c: Likewise.
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 68d488d..c4f75b3 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -180,6 +180,7 @@ bool aarch64_cannot_change_mode_class (enum machine_mode,
enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context);
bool aarch64_constant_address_p (rtx);
+bool aarch64_expand_movmem (rtx *);
bool aarch64_float_const_zero_rtx_p (rtx);
bool aarch64_function_arg_regno_p (unsigned);
bool aarch64_gen_movmemqi (rtx *);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b26e5f5..0ae21cd 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9434,6 +9434,164 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
return false;
}
+/* Return a new RTX holding the result of moving POINTER forward by
+ AMOUNT bytes. */
+
+static rtx
+aarch64_move_pointer (rtx pointer, int amount)
+{
+ rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
+
+ return adjust_automodify_address (pointer, GET_MODE (pointer),
+ next, amount);
+}
+
+/* Return a new RTX holding the result of moving POINTER forward by the
+ size of the mode it points to. */
+
+static rtx
+aarch64_progress_pointer (rtx pointer)
+{
+ HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
+
+ return aarch64_move_pointer (pointer, amount);
+}
+
+/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
+ MODE bytes. */
+
+static void
+aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
+ enum machine_mode mode)
+{
+ rtx reg = gen_reg_rtx (mode);
+
+ /* "Cast" the pointers to the correct mode. */
+ *src = adjust_address (*src, mode, 0);
+ *dst = adjust_address (*dst, mode, 0);
+ /* Emit the memcpy. */
+ emit_move_insn (reg, *src);
+ emit_move_insn (*dst, reg);
+ /* Move the pointers forward. */
+ *src = aarch64_progress_pointer (*src);
+ *dst = aarch64_progress_pointer (*dst);
+}
+
+/* Expand movmem, as if from a __builtin_memcpy. Return true if
+ we succeed, otherwise return false. */
+
+bool
+aarch64_expand_movmem (rtx *operands)
+{
+ unsigned int n;
+ rtx dst = operands[0];
+ rtx src = operands[1];
+ rtx base;
+ bool speed_p = !optimize_function_for_size_p (cfun);
+
+ /* When optimizing for size, give a better estimate of the length of a
+ memcpy call, but use the default otherwise. */
+ unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
+
+ /* We can't do anything smart if the amount to copy is not constant. */
+ if (!CONST_INT_P (operands[2]))
+ return false;
+
+ n = UINTVAL (operands[2]);
+
+ /* Try to keep the number of instructions low. For cases below 16 bytes we
+ need to make at most two moves. For cases above 16 bytes it will be one
+ move for each 16 byte chunk, then at most two additional moves. */
+ if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
+ return false;
+
+ base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+ dst = adjust_automodify_address (dst, VOIDmode, base, 0);
+
+ base = copy_to_mode_reg (Pmode, XEXP (src, 0));
+ src = adjust_automodify_address (src, VOIDmode, base, 0);
+
+ /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
+ 1-byte chunk. */
+ if (n < 4)
+ {
+ if (n >= 2)
+ {
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
+ n -= 2;
+ }
+
+ if (n == 1)
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
+
+ return true;
+ }
+
+ /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
+ 4-byte chunk, partially overlapping with the previously copied chunk. */
+ if (n < 8)
+ {
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+ n -= 4;
+ if (n > 0)
+ {
+ int move = n - 4;
+
+ src = aarch64_move_pointer (src, move);
+ dst = aarch64_move_pointer (dst, move);
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+ }
+ return true;
+ }
+
+ /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
+ them, then (if applicable) an 8-byte chunk. */
+ while (n >= 8)
+ {
+ if (n / 16)
+ {
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
+ n -= 16;
+ }
+ else
+ {
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
+ n -= 8;
+ }
+ }
+
+ /* Finish the final bytes of the copy. We can always do this in one
+ instruction. We either copy the exact amount we need, or partially
+ overlap with the previous chunk we copied and copy 8-bytes. */
+ if (n == 0)
+ return true;
+ else if (n == 1)
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
+ else if (n == 2)
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
+ else if (n == 4)
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+ else
+ {
+ if (n == 3)
+ {
+ src = aarch64_move_pointer (src, -1);
+ dst = aarch64_move_pointer (dst, -1);
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+ }
+ else
+ {
+ int move = n - 8;
+
+ src = aarch64_move_pointer (src, move);
+ dst = aarch64_move_pointer (dst, move);
+ aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
+ }
+ }
+
+ return true;
+}
+
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index ced5a5e..c5d144e 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -659,12 +659,14 @@ do { \
/* The base cost overhead of a memcpy call, for MOVE_RATIO and friends. */
#define AARCH64_CALL_RATIO 8
-/* When optimizing for size, give a better estimate of the length of a memcpy
- call, but use the default otherwise. But move_by_pieces_ninsns() counts
- memory-to-memory moves, and we'll have to generate a load & store for each,
- so halve the value to take that into account. */
+/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
+ move_by_pieces will continually copy the largest safe chunks. So a
+ 7-byte copy is a 4-byte + 2-byte + byte copy. This proves inefficient
+ for both size and speed of copy, so we will instead use the "movmem"
+ standard name to implement the copy. This logic does not apply when
+ targeting -mstrict-align, so keep a sensible default in that case. */
#define MOVE_RATIO(speed) \
- (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
+ (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
/* For CLEAR_RATIO, when optimizing for size, give a better estimate
of the length of a memset call, but use the default otherwise. */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6e605c19f0acbe88d53f460cb513d24dde6d658f..661d784b93e60fd2f636f5b5f03c10c6d53493dd 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -883,6 +883,24 @@ (define_split
}
)
+;; 0 is dst
+;; 1 is src
+;; 2 is size of move in bytes
+;; 3 is alignment
+
+(define_expand "movmemdi"
+ [(match_operand:BLK 0 "memory_operand")
+ (match_operand:BLK 1 "memory_operand")
+ (match_operand:DI 2 "immediate_operand")
+ (match_operand:DI 3 "immediate_operand")]
+ "!STRICT_ALIGNMENT"
+{
+ if (aarch64_expand_movmem (operands))
+ DONE;
+ FAIL;
+}
+)
+
;; Operands 1 and 3 are tied together by the final condition; so we allow
;; fairly lax checking on the second memory operation.
(define_insn "load_pair<mode>"
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
index a970c85..07f575d 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
@@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
/* Whether the structs are totally scalarized or not depends on the
MOVE_RATIO macro definition in the back end. The scalarization will
not take place when using small values for MOVE_RATIO. */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
index 59e5e6a..45aa963 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
@@ -21,5 +21,5 @@ int foo (struct S *p)
*p = l;
}
-/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
/* { dg-final { cleanup-tree-dump "release_ssa" } } */