[gcc r13-8874] AArch64: Fix strict-align cpymem/setmem [PR103100]
Wilco Dijkstra
wilco@gcc.gnu.org
Thu Jun 27 17:23:40 GMT 2024
https://gcc.gnu.org/g:5aa9ed0f353f835005c3df8932c7bc6e26f53904
commit r13-8874-g5aa9ed0f353f835005c3df8932c7bc6e26f53904
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed Oct 25 16:28:04 2023 +0100
AArch64: Fix strict-align cpymem/setmem [PR103100]
The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the condition for when to use MOPS.
gcc/ChangeLog:
PR target/103100
* config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
(setmemdi): Likewise.
* config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
strict-align.  Clean up condition for using MOPS.
(aarch64_expand_setmem): Likewise.
(cherry picked from commit 318f5232cfb3e0c9694889565e1f5424d0354463)
Diff:
---
gcc/config/aarch64/aarch64.cc | 52 ++++++++++++++++++-------------------------
gcc/config/aarch64/aarch64.md | 4 ++--
2 files changed, 24 insertions(+), 32 deletions(-)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index b8a4ab1b980..2f01580a797 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24897,27 +24897,23 @@ aarch64_expand_cpymem (rtx *operands)
int mode_bits;
rtx dst = operands[0];
rtx src = operands[1];
+ unsigned align = UINTVAL (operands[3]);
rtx base;
machine_mode cur_mode = BLKmode;
+ bool size_p = optimize_function_for_size_p (cfun);
- /* Variable-sized memcpy can go through the MOPS expansion if available. */
- if (!CONST_INT_P (operands[2]))
+ /* Variable-sized or strict-align copies may use the MOPS expansion. */
+ if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
return aarch64_expand_cpymem_mops (operands);
- unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
- /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
- unsigned HOST_WIDE_INT max_copy_size
- = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+ unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
- bool size_p = optimize_function_for_size_p (cfun);
+ /* Try to inline up to 256 bytes. */
+ unsigned max_copy_size = 256;
+ unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
- /* Large constant-sized cpymem should go through MOPS when possible.
- It should be a win even for size optimization in the general case.
- For speed optimization the choice between MOPS and the SIMD sequence
- depends on the size of the copy, rather than number of instructions,
- alignment etc. */
- if (size > max_copy_size)
+ /* Large copies use MOPS when available or a library call. */
+ if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
return aarch64_expand_cpymem_mops (operands);
int copy_bits = 256;
@@ -25081,12 +25077,13 @@ aarch64_expand_setmem (rtx *operands)
unsigned HOST_WIDE_INT len;
rtx dst = operands[0];
rtx val = operands[2], src;
+ unsigned align = UINTVAL (operands[3]);
rtx base;
machine_mode cur_mode = BLKmode, next_mode;
- /* If we don't have SIMD registers or the size is variable use the MOPS
- inlined sequence if possible. */
- if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+ /* Variable-sized or strict-align memset may use the MOPS expansion. */
+ if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+ || (STRICT_ALIGNMENT && align < 16))
return aarch64_expand_setmem_mops (operands);
bool size_p = optimize_function_for_size_p (cfun);
@@ -25094,10 +25091,13 @@ aarch64_expand_setmem (rtx *operands)
/* Default the maximum to 256-bytes when considering only libcall vs
SIMD broadcast sequence. */
unsigned max_set_size = 256;
+ unsigned mops_threshold = aarch64_mops_memset_size_threshold;
- len = INTVAL (operands[1]);
- if (len > max_set_size && !TARGET_MOPS)
- return false;
+ len = UINTVAL (operands[1]);
+
+ /* Large memset uses MOPS when available or a library call. */
+ if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+ return aarch64_expand_setmem_mops (operands);
int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
/* The MOPS sequence takes:
@@ -25110,12 +25110,6 @@ aarch64_expand_setmem (rtx *operands)
the arguments + 1 for the call. */
unsigned libcall_cost = 4;
- /* Upper bound check. For large constant-sized setmem use the MOPS sequence
- when available. */
- if (TARGET_MOPS
- && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
- return aarch64_expand_setmem_mops (operands);
-
/* Attempt a sequence with a vector broadcast followed by stores.
Count the number of operations involved to see if it's worth it
against the alternatives. A simple counter simd_ops on the
@@ -25157,10 +25151,8 @@ aarch64_expand_setmem (rtx *operands)
simd_ops++;
n -= mode_bits;
- /* Do certain trailing copies as overlapping if it's going to be
- cheaper. i.e. less instructions to do so. For instance doing a 15
- byte copy it's more efficient to do two overlapping 8 byte copies than
- 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
+ /* Emit trailing writes using overlapping unaligned accesses
+ (when !STRICT_ALIGNMENT) - this is smaller and faster. */
if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
{
next_mode = smallest_mode_for_size (n, MODE_INT);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 922cc987595..b585604cbe0 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1659,7 +1659,7 @@
(match_operand:BLK 1 "memory_operand")
(match_operand:DI 2 "general_operand")
(match_operand:DI 3 "immediate_operand")]
- "!STRICT_ALIGNMENT || TARGET_MOPS"
+ ""
{
if (aarch64_expand_cpymem (operands))
DONE;
@@ -1756,7 +1756,7 @@
(match_operand:QI 2 "nonmemory_operand")) ;; Value
(use (match_operand:DI 1 "general_operand")) ;; Length
(match_operand 3 "immediate_operand")] ;; Align
- "TARGET_SIMD || TARGET_MOPS"
+ ""
{
if (aarch64_expand_setmem (operands))
DONE;
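
For context, here is a minimal, hypothetical reproducer sketch (not part of the commit) showing the kind of code this change affects. With -mstrict-align and pointers whose alignment is below 16 bytes, the patched expanders now route constant-size copies and memsets to the MOPS expansion when TARGET_MOPS is enabled, or to a library call otherwise, instead of emitting an inline sequence:

/* Hypothetical reproducer sketch, not taken from the commit.
   Build for aarch64 with: -O2 -mstrict-align
   The char* arguments carry only byte alignment, so under strict
   alignment the inline cpymem/setmem expansions are blocked and
   these calls use the MOPS expansion (if available) or a libcall.  */
#include <string.h>

void
copy64 (char *dst, const char *src)
{
  memcpy (dst, src, 64);   /* constant size, alignment unknown (1 byte) */
}

void
clear64 (char *dst)
{
  memset (dst, 0, 64);     /* constant size, alignment unknown (1 byte) */
}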