This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH, ARM] Unaligned accesses for builtin memcpy [2/2]
- From: Julian Brown <julian at codesourcery dot com>
- To: gcc-patches at gcc dot gnu dot org, paul at codesourcery dot com, Richard Earnshaw <rearnsha at arm dot com>, Ramana Radhakrishnan <ramana dot radhakrishnan at linaro dot org>
- Date: Fri, 6 May 2011 14:13:32 +0100
- Subject: [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2]
Hi,
This is the second of two patches to add unaligned-access support to
the ARM backend. It builds on the first patch to provide support for
unaligned accesses when expanding block moves (i.e. for builtin memcpy
operations). It makes some effort to use load/store multiple
instructions where appropriate (when accessing sufficiently-aligned
source or destination addresses), and also makes some effort to
generate fast code (for -O1/2/3) or small code (for -Os), though some
of the heuristics may need tweaking still.
Examples:
#include <string.h>
void foo (char *dest, char *src)
{
memcpy (dest, src, AMOUNT);
}
char known[64];
void dst_aligned (char *src)
{
memcpy (known, src, AMOUNT);
}
void src_aligned (char *dst)
{
memcpy (dst, known, AMOUNT);
}
For "-mcpu=cortex-m4 -mthumb -O2 -DAMOUNT=15" we get:
foo:
ldr r2, [r1, #4] @ unaligned
ldr r3, [r1, #8] @ unaligned
push {r4}
ldr r4, [r1, #0] @ unaligned
str r2, [r0, #4] @ unaligned
str r4, [r0, #0] @ unaligned
str r3, [r0, #8] @ unaligned
ldrh r2, [r1, #12] @ unaligned
ldrb r3, [r1, #14] @ zero_extendqisi2
strh r2, [r0, #12] @ unaligned
strb r3, [r0, #14]
pop {r4}
bx lr
dst_aligned:
push {r4}
mov r4, r0
movw r3, #:lower16:known
ldr r1, [r4, #4] @ unaligned
ldr r2, [r4, #8] @ unaligned
ldr r0, [r0, #0] @ unaligned
movt r3, #:upper16:known
stmia r3!, {r0, r1, r2}
ldrh r1, [r4, #12] @ unaligned
ldrb r2, [r4, #14] @ zero_extendqisi2
strh r1, [r3, #0] @ unaligned
strb r2, [r3, #2]
pop {r4}
bx lr
src_aligned:
push {r4}
movw r3, #:lower16:known
movt r3, #:upper16:known
mov r4, r0
ldmia r3!, {r0, r1, r2}
str r0, [r4, #0] @ unaligned
str r1, [r4, #4] @ unaligned
str r2, [r4, #8] @ unaligned
ldrh r2, [r3, #0] @ unaligned
ldrb r3, [r3, #2] @ zero_extendqisi2
strh r2, [r4, #12] @ unaligned
strb r3, [r4, #14]
pop {r4}
bx lr
Whereas for "-mcpu=cortex-m4 -mthumb -Os -DAMOUNT=15", e.g.:
foo:
add r3, r1, #12
.L2:
ldr r2, [r1], #4 @ unaligned
cmp r1, r3
str r2, [r0], #4 @ unaligned
bne .L2
ldrh r3, [r1, #0] @ unaligned
strh r3, [r0, #0] @ unaligned
ldrb r3, [r1, #2] @ zero_extendqisi2
strb r3, [r0, #2]
bx lr
Tested (alongside the first patch) with a cross compiler targeting ARM Linux. OK to apply?
Thanks,
Julian
ChangeLog
gcc/
* config/arm/arm.c (arm_block_move_unaligned_straight)
(arm_adjust_block_mem, arm_block_move_unaligned_loop)
(arm_movmemqi_unaligned): New.
(arm_gen_movmemqi): Support unaligned block copies.
commit 16973f69fce37a2b347ea7daffd6f593aba843d5
Author: Julian Brown <julian@henry7.codesourcery.com>
Date: Wed May 4 11:26:01 2011 -0700
Optimize block moves when unaligned accesses are permitted.
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index a18aea6..b6df0d3 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -10362,6 +10362,335 @@ gen_const_stm_seq (rtx *operands, int nops)
return true;
}
+/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit
+ unaligned copies on processors which support unaligned semantics for those
+ instructions. INTERLEAVE_FACTOR can be used to attempt to hide load latency
+ (using more registers) by doing e.g. load/load/store/store for a factor of 2.
+ An interleave factor of 1 (the minimum) will perform no interleaving.
+ Load/store multiple are used for aligned addresses where possible. */
+
+static void
+arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase,
+ HOST_WIDE_INT length,
+ unsigned int interleave_factor)
+{
+ rtx *regs = XALLOCAVEC (rtx, interleave_factor);
+ int *regnos = XALLOCAVEC (int, interleave_factor);
+ HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD;
+ HOST_WIDE_INT i, j;
+ HOST_WIDE_INT remaining = length, words;
+ rtx halfword_tmp = NULL, byte_tmp = NULL;
+ rtx dst, src;
+ bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD;
+ bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD;
+ HOST_WIDE_INT srcoffset, dstoffset;
+ HOST_WIDE_INT src_autoinc, dst_autoinc;
+ rtx mem, addr;
+
+ gcc_assert (1 <= interleave_factor && interleave_factor <= 4);
+
+ /* Use hard registers if we have aligned source or destination so we can use
+ load/store multiple with contiguous registers. */
+ if (dst_aligned || src_aligned)
+ for (i = 0; i < interleave_factor; i++)
+ regs[i] = gen_rtx_REG (SImode, i);
+ else
+ for (i = 0; i < interleave_factor; i++)
+ regs[i] = gen_reg_rtx (SImode);
+
+ /* Take working copies of the base addresses in fresh registers.  */
+ dst = copy_addr_to_reg (XEXP (dstbase, 0));
+ src = copy_addr_to_reg (XEXP (srcbase, 0));
+
+ srcoffset = dstoffset = 0;
+
+ /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST.
+ For copying the last bytes we want to subtract this offset again. */
+ src_autoinc = dst_autoinc = 0;
+
+ for (i = 0; i < interleave_factor; i++)
+ regnos[i] = i;
+
+ /* Copy BLOCK_SIZE_BYTES chunks. */
+
+ for (i = 0; i + block_size_bytes <= length; i += block_size_bytes)
+ {
+ /* Load words. */
+ if (src_aligned && interleave_factor > 1)
+ {
+ emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src,
+ TRUE, srcbase, &srcoffset));
+ src_autoinc += UNITS_PER_WORD * interleave_factor;
+ }
+ else
+ {
+ /* Source not known to be word-aligned (or only a single word is
+ wanted): emit one unaligned ldr per word.  */
+ for (j = 0; j < interleave_factor; j++)
+ {
+ addr = plus_constant (src, srcoffset + j * UNITS_PER_WORD
+ - src_autoinc);
+ mem = adjust_automodify_address (srcbase, SImode, addr,
+ srcoffset + j * UNITS_PER_WORD);
+ emit_insn (gen_unaligned_loadsi (regs[j], mem));
+ }
+ srcoffset += block_size_bytes;
+ }
+
+ /* Store words. */
+ if (dst_aligned && interleave_factor > 1)
+ {
+ emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst,
+ TRUE, dstbase, &dstoffset));
+ dst_autoinc += UNITS_PER_WORD * interleave_factor;
+ }
+ else
+ {
+ /* Destination not known to be word-aligned (or only a single word):
+ emit one unaligned str per word.  */
+ for (j = 0; j < interleave_factor; j++)
+ {
+ addr = plus_constant (dst, dstoffset + j * UNITS_PER_WORD
+ - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, SImode, addr,
+ dstoffset + j * UNITS_PER_WORD);
+ emit_insn (gen_unaligned_storesi (mem, regs[j]));
+ }
+ dstoffset += block_size_bytes;
+ }
+
+ remaining -= block_size_bytes;
+ }
+
+ /* Copy any whole words left (note these aren't interleaved with any
+ subsequent halfword/byte load/stores in the interests of simplicity). */
+
+ words = remaining / UNITS_PER_WORD;
+
+ gcc_assert (words < interleave_factor);
+
+ if (src_aligned && words > 1)
+ {
+ emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase,
+ &srcoffset));
+ src_autoinc += UNITS_PER_WORD * words;
+ }
+ else
+ {
+ for (j = 0; j < words; j++)
+ {
+ addr = plus_constant (src,
+ srcoffset + j * UNITS_PER_WORD - src_autoinc);
+ mem = adjust_automodify_address (srcbase, SImode, addr,
+ srcoffset + j * UNITS_PER_WORD);
+ emit_insn (gen_unaligned_loadsi (regs[j], mem));
+ }
+ srcoffset += words * UNITS_PER_WORD;
+ }
+
+ if (dst_aligned && words > 1)
+ {
+ emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase,
+ &dstoffset));
+ dst_autoinc += words * UNITS_PER_WORD;
+ }
+ else
+ {
+ for (j = 0; j < words; j++)
+ {
+ addr = plus_constant (dst,
+ dstoffset + j * UNITS_PER_WORD - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, SImode, addr,
+ dstoffset + j * UNITS_PER_WORD);
+ emit_insn (gen_unaligned_storesi (mem, regs[j]));
+ }
+ dstoffset += words * UNITS_PER_WORD;
+ }
+
+ remaining -= words * UNITS_PER_WORD;
+
+ gcc_assert (remaining < 4);
+
+ /* Copy a halfword if necessary. */
+
+ if (remaining >= 2)
+ {
+ halfword_tmp = gen_reg_rtx (SImode);
+
+ addr = plus_constant (src, srcoffset - src_autoinc);
+ mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset);
+ emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem));
+
+ /* Either write out immediately, or delay until we've loaded the last
+ byte, depending on interleave factor. */
+ if (interleave_factor == 1)
+ {
+ addr = plus_constant (dst, dstoffset - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+ emit_insn (gen_unaligned_storehi (mem,
+ gen_lowpart (HImode, halfword_tmp)));
+ halfword_tmp = NULL;
+ dstoffset += 2;
+ }
+
+ remaining -= 2;
+ srcoffset += 2;
+ }
+
+ gcc_assert (remaining < 2);
+
+ /* Copy last byte. */
+
+ if ((remaining & 1) != 0)
+ {
+ byte_tmp = gen_reg_rtx (SImode);
+
+ addr = plus_constant (src, srcoffset - src_autoinc);
+ mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset);
+ /* A single-byte access is always aligned, so a plain move suffices.  */
+ emit_move_insn (gen_lowpart (QImode, byte_tmp), mem);
+
+ if (interleave_factor == 1)
+ {
+ addr = plus_constant (dst, dstoffset - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+ emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+ byte_tmp = NULL;
+ dstoffset++;
+ }
+
+ remaining--;
+ srcoffset++;
+ }
+
+ /* Store last halfword if we haven't done so already. */
+
+ if (halfword_tmp)
+ {
+ addr = plus_constant (dst, dstoffset - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+ emit_insn (gen_unaligned_storehi (mem,
+ gen_lowpart (HImode, halfword_tmp)));
+ dstoffset += 2;
+ }
+
+ /* Likewise for last byte. */
+
+ if (byte_tmp)
+ {
+ addr = plus_constant (dst, dstoffset - dst_autoinc);
+ mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+ emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+ dstoffset++;
+ }
+
+ gcc_assert (remaining == 0 && srcoffset == dstoffset);
+}
+
+/* From mips_adjust_block_mem:
+
+ Helper function for doing a loop-based block operation on memory
+ reference MEM. Each iteration of the loop will operate on LENGTH
+ bytes of MEM.
+
+ Create a new base register for use within the loop and point it to
+ the start of MEM. Create a new memory reference that uses this
+ register. Store them in *LOOP_REG and *LOOP_MEM respectively. */
+
+static void
+arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
+ rtx *loop_mem)
+{
+ /* Fresh register for the base address, so the loop body can advance it.  */
+ *loop_reg = copy_addr_to_reg (XEXP (mem, 0));
+
+ /* Although the new mem does not refer to a known location,
+ it does keep up to LENGTH bytes of alignment. */
+ *loop_mem = change_address (mem, BLKmode, *loop_reg);
+ set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
+}
+
+/* From mips_block_move_loop:
+
+ Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
+ bytes at a time. LENGTH must be at least BYTES_PER_ITER. Assume that
+ the memory regions do not overlap. */
+
+static void
+arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length,
+ unsigned int interleave_factor,
+ HOST_WIDE_INT bytes_per_iter)
+{
+ rtx label, src_reg, dest_reg, final_src, test;
+ HOST_WIDE_INT leftover;
+
+ /* Round LENGTH down to a multiple of BYTES_PER_ITER; the remainder is
+ copied straight-line after the loop.  */
+ leftover = length % bytes_per_iter;
+ length -= leftover;
+
+ /* Create registers and memory references for use within the loop. */
+ arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
+ arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);
+
+ /* Calculate the value that SRC_REG should have after the last iteration of
+ the loop. */
+ final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
+ 0, 0, OPTAB_WIDEN);
+
+ /* Emit the start of the loop.  This is a do-while loop; the precondition
+ LENGTH >= BYTES_PER_ITER guarantees at least one iteration.  */
+ label = gen_label_rtx ();
+ emit_label (label);
+
+ /* Emit the loop body. */
+ arm_block_move_unaligned_straight (dest, src, bytes_per_iter,
+ interleave_factor);
+
+ /* Move on to the next block. */
+ emit_move_insn (src_reg, plus_constant (src_reg, bytes_per_iter));
+ emit_move_insn (dest_reg, plus_constant (dest_reg, bytes_per_iter));
+
+ /* Emit the loop condition. */
+ test = gen_rtx_NE (VOIDmode, src_reg, final_src);
+ emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));
+
+ /* Mop up any left-over bytes. */
+ if (leftover)
+ arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor);
+}
+
+/* Emit a block move when either the source or destination is unaligned (not
+ aligned to a four-byte boundary). This may need further tuning depending on
+ core type, optimize_size setting, etc. */
+
+static int
+arm_movmemqi_unaligned (rtx *operands)
+{
+ /* The caller (arm_gen_movmemqi) only reaches here after verifying that
+ operands[2] is a CONST_INT, so INTVAL is safe.  */
+ HOST_WIDE_INT length = INTVAL (operands[2]);
+
+ if (optimize_size)
+ {
+ bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD;
+ bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD;
+ /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
+ size of code if optimizing for size. We'll use ldm/stm if src_aligned
+ or dst_aligned though: allow more interleaving in those cases since the
+ resulting code can be smaller. */
+ unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1;
+ HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4;
+
+ /* Heuristic cut-over between a loop and straight-line copies.  */
+ if (length > 12)
+ arm_block_move_unaligned_loop (operands[0], operands[1], length,
+ interleave_factor, bytes_per_iter);
+ else
+ arm_block_move_unaligned_straight (operands[0], operands[1], length,
+ interleave_factor);
+ }
+ else
+ {
+ /* Note that the loop created by arm_block_move_unaligned_loop may be
+ subject to loop unrolling, which makes tuning this condition a little
+ redundant. */
+ if (length > 32)
+ arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16);
+ else
+ arm_block_move_unaligned_straight (operands[0], operands[1], length, 4);
+ }
+
+ /* Nonzero return tells the caller the block move has been expanded.  */
+ return 1;
+}
+
int
arm_gen_movmemqi (rtx *operands)
{
@@ -10374,8 +10703,13 @@ arm_gen_movmemqi (rtx *operands)
if (GET_CODE (operands[2]) != CONST_INT
|| GET_CODE (operands[3]) != CONST_INT
- || INTVAL (operands[2]) > 64
- || INTVAL (operands[3]) & 3)
+ || INTVAL (operands[2]) > 64)
+ return 0;
+
+ if (unaligned_access && (INTVAL (operands[3]) & 3) != 0)
+ return arm_movmemqi_unaligned (operands);
+
+ if (INTVAL (operands[3]) & 3)
return 0;
dstbase = operands[0];