Use vector insns for memcpy, memset on rs6000
Geoffrey Keating
gkeating@apple.com
Wed Sep 1 21:11:00 GMT 2004
This patch improves codegen for block moves and clears when Altivec is
enabled and the block is 16-byte (128-bit) aligned and at least 16 bytes
long, by using Altivec vector instructions for the copy or clear.
Bootstrapped and tested on powerpc-darwin. A c-torture run with
-maltivec is still in progress; I won't commit until it passes.
--
- Geoffrey Keating <geoffk@apple.com>
===File ~/patches/gcc-vectormemcpy.patch====================
2004-08-31 Geoffrey Keating <geoffk@apple.com>
* config/rs6000/rs6000.c (expand_block_clear): Use vector
instructions if available.
(expand_block_move): Likewise.
Index: testsuite/ChangeLog
2004-08-31 Geoffrey Keating <geoffk@apple.com>
* gcc.dg/ppc-vector-memcpy.c: New.
* gcc.dg/ppc-vector-memset.c: New.
Index: config/rs6000/rs6000.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/rs6000/rs6000.c,v
retrieving revision 1.703
diff -u -p -u -p -r1.703 rs6000.c
--- config/rs6000/rs6000.c 27 Aug 2004 18:29:52 -0000 1.703
+++ config/rs6000/rs6000.c 1 Sep 2004 01:11:19 -0000
@@ -8319,11 +8319,12 @@ expand_block_clear (rtx operands[])
rtx orig_dest = operands[0];
rtx bytes_rtx = operands[1];
rtx align_rtx = operands[2];
- int constp = (GET_CODE (bytes_rtx) == CONST_INT);
- int align;
- int bytes;
+ bool constp = (GET_CODE (bytes_rtx) == CONST_INT);
+ HOST_WIDE_INT align;
+ HOST_WIDE_INT bytes;
int offset;
int clear_bytes;
+ int clear_step;
/* If this is not a fixed size move, just call memcpy */
if (! constp)
@@ -8339,49 +8340,59 @@ expand_block_clear (rtx operands[])
if (bytes <= 0)
return 1;
- if (bytes > (TARGET_POWERPC64 && align >= 32 ? 64 : 32))
- return 0;
+ /* Use the builtin memset after a point, to avoid huge code bloat.
+ When optimize_size, avoid any significant code bloat; calling
+ memset is about 4 instructions, so allow for one instruction to
+ load zero and three to do clearing. */
+ if (TARGET_ALTIVEC && align >= 128)
+ clear_step = 16;
+ else if (TARGET_POWERPC64 && align >= 64)
+ clear_step = 8;
+ else
+ clear_step = 4;
- if (optimize_size && bytes > 16)
+ if (optimize_size && bytes > 3 * clear_step)
+ return 0;
+ if (! optimize_size && bytes > 8 * clear_step)
return 0;
for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
{
- rtx (*mov) (rtx, rtx);
enum machine_mode mode = BLKmode;
rtx dest;
- if (bytes >= 8 && TARGET_POWERPC64
- /* 64-bit loads and stores require word-aligned
- displacements. */
- && (align >= 64 || (!STRICT_ALIGNMENT && align >= 32)))
+ if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
+ {
+ clear_bytes = 16;
+ mode = V4SImode;
+ }
+ else if (bytes >= 8 && TARGET_POWERPC64
+ /* 64-bit loads and stores require word-aligned
+ displacements. */
+ && (align >= 64 || (!STRICT_ALIGNMENT && align >= 32)))
{
clear_bytes = 8;
mode = DImode;
- mov = gen_movdi;
}
- else if (bytes >= 4 && !STRICT_ALIGNMENT)
+ else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
{ /* move 4 bytes */
clear_bytes = 4;
mode = SImode;
- mov = gen_movsi;
}
- else if (bytes == 2 && !STRICT_ALIGNMENT)
+ else if (bytes == 2 && (align >= 16 || !STRICT_ALIGNMENT))
{ /* move 2 bytes */
clear_bytes = 2;
mode = HImode;
- mov = gen_movhi;
}
else /* move 1 byte at a time */
{
clear_bytes = 1;
mode = QImode;
- mov = gen_movqi;
}
dest = adjust_address (orig_dest, mode, offset);
- emit_insn ((*mov) (dest, const0_rtx));
+ emit_move_insn (dest, CONST0_RTX (mode));
}
return 1;
@@ -8441,7 +8452,15 @@ expand_block_move (rtx operands[])
enum machine_mode mode = BLKmode;
rtx src, dest;
- if (TARGET_STRING
+ /* Altivec first, since it will be faster than a string move
+ when it applies, and usually not significantly larger. */
+ if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+ {
+ move_bytes = 16;
+ mode = V4SImode;
+ gen_func.mov = gen_movv4si;
+ }
+ else if (TARGET_STRING
&& bytes > 24 /* move up to 32 bytes at a time */
&& ! fixed_regs[5]
&& ! fixed_regs[6]
Index: testsuite/gcc.dg/ppc-vector-memcpy.c
===================================================================
RCS file: testsuite/gcc.dg/ppc-vector-memcpy.c
diff -N testsuite/gcc.dg/ppc-vector-memcpy.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ testsuite/gcc.dg/ppc-vector-memcpy.c 1 Sep 2004 01:11:25 -0000
@@ -0,0 +1,9 @@
+/* { dg-do compile { target powerpc*-*-* } } */
+/* { dg-options "-O -maltivec" } */
+/* { dg-final { scan-assembler "lvx" } } */
+
+void foo(void)
+{
+ int x[8] __attribute__((aligned(128))) = { 1 };
+ bar (x);
+}
Index: testsuite/gcc.dg/ppc-vector-memset.c
===================================================================
RCS file: testsuite/gcc.dg/ppc-vector-memset.c
diff -N testsuite/gcc.dg/ppc-vector-memset.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ testsuite/gcc.dg/ppc-vector-memset.c 1 Sep 2004 01:11:25 -0000
@@ -0,0 +1,12 @@
+/* { dg-do compile { target powerpc*-*-* } } */
+/* { dg-options "-O -maltivec" } */
+/* { dg-final { scan-assembler "stvx" } } */
+
+#include <string.h>
+
+void foo(void)
+{
+ int x[8] __attribute__((aligned(128)));
+ memset (x, 0, sizeof (x));
+ bar (x);
+}
============================================================
More information about the Gcc-patches
mailing list