This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2]
- From: Julian Brown <julian at codesourcery dot com>
- To: Ramana Radhakrishnan <ramana dot radhakrishnan at linaro dot org>
- Cc: gcc-patches at gcc dot gnu dot org, paul at codesourcery dot com, Richard Earnshaw <rearnsha at arm dot com>
- Date: Thu, 13 Oct 2011 18:53:05 +0100
- Subject: Re: [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2]
- References: <20110506141332.63b88bcb@rex.config> <CACUk7=VxTyCKLmKFFX9qG+RQd75ZTZU1H4ngkQV_ApKFah0ueA@mail.gmail.com>
On Wed, 28 Sep 2011 14:33:17 +0100
Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org> wrote:
> On 6 May 2011 14:13, Julian Brown <julian@codesourcery.com> wrote:
> > Hi,
> >
> > This is the second of two patches to add unaligned-access support to
> > the ARM backend. It builds on the first patch to provide support for
> > unaligned accesses when expanding block moves (i.e. for builtin
> > memcpy operations). It makes some effort to use load/store multiple
> > instructions where appropriate (when accessing sufficiently-aligned
> > source or destination addresses), and also makes some effort to
> > generate fast code (for -O1/2/3) or small code (for -Os), though
> > some of the heuristics may need tweaking still.
>
> Sorry it's taken me a while to get around to this one. Do you know
> what difference this makes to performance on some standard benchmarks
> on let's say an A9 and an M4 as I see that this gets triggered only
> when we have less than 64 bytes to copy?
No, sorry, I don't have any benchmark results available at present. I
think we'd have to have terrifically bad luck for it to be a
performance degradation, though...
> Please add a few testcases from the examples that you've shown here to
> be sure that ldm's are being generated in the right cases.
I've added test cases which cover copies with combinations of
aligned/unaligned sources/destinations, gated on a new
require-effective-target so the tests only run when suitable support is
available.
I re-tested the patch for good measure, in case of bitrot (and the new
tests pass with the patch applied, of course). OK to apply now?
Thanks,
Julian
ChangeLog
gcc/
* config/arm/arm.c (arm_block_move_unaligned_straight)
(arm_adjust_block_mem, arm_block_move_unaligned_loop)
(arm_movmemqi_unaligned): New.
(arm_gen_movmemqi): Support unaligned block copies.
gcc/testsuite/
* lib/target-supports.exp (check_effective_target_arm_unaligned):
New.
* gcc.target/arm/unaligned-memcpy-1.c: New.
* gcc.target/arm/unaligned-memcpy-2.c: New.
* gcc.target/arm/unaligned-memcpy-3.c: New.
* gcc.target/arm/unaligned-memcpy-4.c: New.
Index: gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c
===================================================================
--- gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c (revision 0)
+++ gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c (revision 0)
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char src[16];
+
+void aligned_src (char *dest)
+{
+ memcpy (dest, src, 15);
+}
+
+/* Expect a multi-word load for the main part of the copy, but subword
+ loads/stores for the remainder. */
+
+/* { dg-final { scan-assembler-times "ldmia" 1 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
Index: gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c
===================================================================
--- gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c (revision 0)
+++ gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c (revision 0)
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char src[16];
+char dest[16];
+
+void aligned_both (void)
+{
+ memcpy (dest, src, 15);
+}
+
+/* We know both src and dest to be aligned: expect multiword loads/stores. */
+
+/* { dg-final { scan-assembler-times "ldmia" 1 } } */
+/* { dg-final { scan-assembler-times "stmia" 1 } } */
Index: gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c (revision 0)
+++ gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c (revision 0)
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+void unknown_alignment (char *dest, char *src)
+{
+ memcpy (dest, src, 15);
+}
+
+/* We should see three unaligned word loads and store pairs, one unaligned
+ ldrh/strh pair, and an ldrb/strb pair. Sanity check that. */
+
+/* { dg-final { scan-assembler-times "@ unaligned" 8 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
Index: gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c
===================================================================
--- gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c (revision 0)
+++ gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c (revision 0)
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char dest[16];
+
+void aligned_dest (char *src)
+{
+ memcpy (dest, src, 15);
+}
+
+/* Expect a multi-word store for the main part of the copy, but subword
+ loads/stores for the remainder. */
+
+/* { dg-final { scan-assembler-times "stmia" 1 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
Index: gcc/testsuite/lib/target-supports.exp
===================================================================
--- gcc/testsuite/lib/target-supports.exp (revision 179860)
+++ gcc/testsuite/lib/target-supports.exp (working copy)
@@ -1973,6 +1973,18 @@ proc check_effective_target_arm_dsp { }
}]
}
+# Return 1 if this is an ARM target that supports unaligned word/halfword
+# load/store instructions.
+
+proc check_effective_target_arm_unaligned { } {
+ return [check_no_compiler_messages arm_unaligned assembly {
+ #ifndef __ARM_FEATURE_UNALIGNED
+ #error no unaligned support
+ #endif
+ int i;
+ }]
+}
+
# Add the options needed for NEON. We need either -mfloat-abi=softfp
# or -mfloat-abi=hard, but if one is already specified by the
# multilib, use it. Similarly, if a -mfpu option already enables
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c (revision 179860)
+++ gcc/config/arm/arm.c (working copy)
@@ -10766,6 +10766,335 @@ gen_const_stm_seq (rtx *operands, int no
return true;
}
+/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit
+ unaligned copies on processors which support unaligned semantics for those
+ instructions. INTERLEAVE_FACTOR can be used to attempt to hide load latency
+ (using more registers) by doing e.g. load/load/store/store for a factor of 2.
+ An interleave factor of 1 (the minimum) will perform no interleaving.
+ Load/store multiple are used for aligned addresses where possible. */
+
+static void
+arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase,
+ HOST_WIDE_INT length,
+ unsigned int interleave_factor)
+{
+ rtx *regs = XALLOCAVEC (rtx, interleave_factor);
+ int *regnos = XALLOCAVEC (int, interleave_factor);
+ HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD;
+ HOST_WIDE_INT i, j;
+ HOST_WIDE_INT remaining = length, words;
+ rtx halfword_tmp = NULL, byte_tmp = NULL;
+ rtx dst, src;
+ bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD;
+ bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD;
+ HOST_WIDE_INT srcoffset, dstoffset;
+ HOST_WIDE_INT src_autoinc, dst_autoinc;
+ rtx mem, addr;
+
+ gcc_assert (1 <= interleave_factor && interleave_factor <= 4);
+
+ /* Use hard registers if we have aligned source or destination so we can use
+ load/store multiple with contiguous registers. */
+ if (dst_aligned || src_aligned)
+ for (i = 0; i < interleave_factor; i++)
+ regs[i] = gen_rtx_REG (SImode, i);
+ else
+ for (i = 0; i < interleave_factor; i++)
+ regs[i] = gen_reg_rtx (SImode);
+
+ dst = copy_addr_to_reg (XEXP (dstbase, 0));
+ src = copy_addr_to_reg (XEXP (srcbase, 0));
+
+ srcoffset = dstoffset = 0;
+
+ /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST.
+ For copying the last bytes we want to subtract this offset again. */
+ src_autoinc = dst_autoinc = 0;
+
+ for (i = 0; i < interleave_factor; i++)
+ regnos[i] = i;
+
+ /* Copy BLOCK_SIZE_BYTES chunks. */
+
+ for (i = 0; i + block_size_bytes <= length; i += block_size_bytes)
+ {
+ /* Load words. */
+ if (src_aligned && interleave_factor > 1)
+ {
+ emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src,
+ TRUE, srcbase, &srcoffset));
+ src_autoinc += UNITS_PER_WORD * interleave_factor;
+ }
+ else
+ {
+ for (j = 0; j < interleave_factor; j++)
+ {
+ addr = plus_constant (src, srcoffset + j * UNITS_PER_WORD
+ - src_autoinc);
+ mem = adjust_automodify_address (srcbase, SImode, addr,
+ srcoffset + j * UNITS_PER_WORD);
+ emit_insn (gen_unaligned_loadsi (regs[j], mem));
+ }
+ srcoffset += block_size_bytes;
+ }
+
+ /* Store words. */
+ if (dst_aligned && interleave_factor > 1)
+ {
+ emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst,
+ TRUE, dstbase, &dstoffset));
+ dst_autoinc += UNITS_PER_WORD * interleave_factor;
+ }
+ else
+ {
+ for (j = 0; j < interleave_factor; j++)
+ {
+ addr = plus_constant (dst, dstoffset + j * UNITS_PER_WORD
+ - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, SImode, addr,
+ dstoffset + j * UNITS_PER_WORD);
+ emit_insn (gen_unaligned_storesi (mem, regs[j]));
+ }
+ dstoffset += block_size_bytes;
+ }
+
+ remaining -= block_size_bytes;
+ }
+
+ /* Copy any whole words left (note these aren't interleaved with any
+ subsequent halfword/byte load/stores in the interests of simplicity). */
+
+ words = remaining / UNITS_PER_WORD;
+
+ gcc_assert (words < interleave_factor);
+
+ if (src_aligned && words > 1)
+ {
+ emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase,
+ &srcoffset));
+ src_autoinc += UNITS_PER_WORD * words;
+ }
+ else
+ {
+ for (j = 0; j < words; j++)
+ {
+ addr = plus_constant (src,
+ srcoffset + j * UNITS_PER_WORD - src_autoinc);
+ mem = adjust_automodify_address (srcbase, SImode, addr,
+ srcoffset + j * UNITS_PER_WORD);
+ emit_insn (gen_unaligned_loadsi (regs[j], mem));
+ }
+ srcoffset += words * UNITS_PER_WORD;
+ }
+
+ if (dst_aligned && words > 1)
+ {
+ emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase,
+ &dstoffset));
+ dst_autoinc += words * UNITS_PER_WORD;
+ }
+ else
+ {
+ for (j = 0; j < words; j++)
+ {
+ addr = plus_constant (dst,
+ dstoffset + j * UNITS_PER_WORD - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, SImode, addr,
+ dstoffset + j * UNITS_PER_WORD);
+ emit_insn (gen_unaligned_storesi (mem, regs[j]));
+ }
+ dstoffset += words * UNITS_PER_WORD;
+ }
+
+ remaining -= words * UNITS_PER_WORD;
+
+ gcc_assert (remaining < 4);
+
+ /* Copy a halfword if necessary. */
+
+ if (remaining >= 2)
+ {
+ halfword_tmp = gen_reg_rtx (SImode);
+
+ addr = plus_constant (src, srcoffset - src_autoinc);
+ mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset);
+ emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem));
+
+ /* Either write out immediately, or delay until we've loaded the last
+ byte, depending on interleave factor. */
+ if (interleave_factor == 1)
+ {
+ addr = plus_constant (dst, dstoffset - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+ emit_insn (gen_unaligned_storehi (mem,
+ gen_lowpart (HImode, halfword_tmp)));
+ halfword_tmp = NULL;
+ dstoffset += 2;
+ }
+
+ remaining -= 2;
+ srcoffset += 2;
+ }
+
+ gcc_assert (remaining < 2);
+
+ /* Copy last byte. */
+
+ if ((remaining & 1) != 0)
+ {
+ byte_tmp = gen_reg_rtx (SImode);
+
+ addr = plus_constant (src, srcoffset - src_autoinc);
+ mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset);
+ emit_move_insn (gen_lowpart (QImode, byte_tmp), mem);
+
+ if (interleave_factor == 1)
+ {
+ addr = plus_constant (dst, dstoffset - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+ emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+ byte_tmp = NULL;
+ dstoffset++;
+ }
+
+ remaining--;
+ srcoffset++;
+ }
+
+ /* Store last halfword if we haven't done so already. */
+
+ if (halfword_tmp)
+ {
+ addr = plus_constant (dst, dstoffset - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+ emit_insn (gen_unaligned_storehi (mem,
+ gen_lowpart (HImode, halfword_tmp)));
+ dstoffset += 2;
+ }
+
+ /* Likewise for last byte. */
+
+ if (byte_tmp)
+ {
+ addr = plus_constant (dst, dstoffset - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+ emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+ dstoffset++;
+ }
+
+ gcc_assert (remaining == 0 && srcoffset == dstoffset);
+}
+
+/* From mips_adjust_block_mem:
+
+ Helper function for doing a loop-based block operation on memory
+ reference MEM. Each iteration of the loop will operate on LENGTH
+ bytes of MEM.
+
+ Create a new base register for use within the loop and point it to
+ the start of MEM. Create a new memory reference that uses this
+ register. Store them in *LOOP_REG and *LOOP_MEM respectively. */
+
+static void
+arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
+ rtx *loop_mem)
+{
+ *loop_reg = copy_addr_to_reg (XEXP (mem, 0));
+
+ /* Although the new mem does not refer to a known location,
+ it does keep up to LENGTH bytes of alignment. */
+ *loop_mem = change_address (mem, BLKmode, *loop_reg);
+ set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
+}
+
+/* From mips_block_move_loop:
+
+ Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
+ bytes at a time. LENGTH must be at least BYTES_PER_ITER. Assume that
+ the memory regions do not overlap. */
+
+static void
+arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length,
+ unsigned int interleave_factor,
+ HOST_WIDE_INT bytes_per_iter)
+{
+ rtx label, src_reg, dest_reg, final_src, test;
+ HOST_WIDE_INT leftover;
+
+ leftover = length % bytes_per_iter;
+ length -= leftover;
+
+ /* Create registers and memory references for use within the loop. */
+ arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
+ arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);
+
+ /* Calculate the value that SRC_REG should have after the last iteration of
+ the loop. */
+ final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
+ 0, 0, OPTAB_WIDEN);
+
+ /* Emit the start of the loop. */
+ label = gen_label_rtx ();
+ emit_label (label);
+
+ /* Emit the loop body. */
+ arm_block_move_unaligned_straight (dest, src, bytes_per_iter,
+ interleave_factor);
+
+ /* Move on to the next block. */
+ emit_move_insn (src_reg, plus_constant (src_reg, bytes_per_iter));
+ emit_move_insn (dest_reg, plus_constant (dest_reg, bytes_per_iter));
+
+ /* Emit the loop condition. */
+ test = gen_rtx_NE (VOIDmode, src_reg, final_src);
+ emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));
+
+ /* Mop up any left-over bytes. */
+ if (leftover)
+ arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor);
+}
+
+/* Emit a block move when either the source or destination is unaligned (not
+ aligned to a four-byte boundary). This may need further tuning depending on
+ core type, optimize_size setting, etc. */
+
+static int
+arm_movmemqi_unaligned (rtx *operands)
+{
+ HOST_WIDE_INT length = INTVAL (operands[2]);
+
+ if (optimize_size)
+ {
+ bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD;
+ bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD;
+ /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
+ size of code if optimizing for size. We'll use ldm/stm if src_aligned
+ or dst_aligned though: allow more interleaving in those cases since the
+ resulting code can be smaller. */
+ unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1;
+ HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4;
+
+ if (length > 12)
+ arm_block_move_unaligned_loop (operands[0], operands[1], length,
+ interleave_factor, bytes_per_iter);
+ else
+ arm_block_move_unaligned_straight (operands[0], operands[1], length,
+ interleave_factor);
+ }
+ else
+ {
+ /* Note that the loop created by arm_block_move_unaligned_loop may be
+ subject to loop unrolling, which makes tuning this condition a little
+ redundant. */
+ if (length > 32)
+ arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16);
+ else
+ arm_block_move_unaligned_straight (operands[0], operands[1], length, 4);
+ }
+
+ return 1;
+}
+
int
arm_gen_movmemqi (rtx *operands)
{
@@ -10778,8 +11107,13 @@ arm_gen_movmemqi (rtx *operands)
if (GET_CODE (operands[2]) != CONST_INT
|| GET_CODE (operands[3]) != CONST_INT
- || INTVAL (operands[2]) > 64
- || INTVAL (operands[3]) & 3)
+ || INTVAL (operands[2]) > 64)
+ return 0;
+
+ if (unaligned_access && (INTVAL (operands[3]) & 3) != 0)
+ return arm_movmemqi_unaligned (operands);
+
+ if (INTVAL (operands[3]) & 3)
return 0;
dstbase = operands[0];