
Use vector insns for memcpy, memset on rs6000


This patch improves codegen for block moves and clears: when AltiVec is
enabled and the block is 16-byte aligned and 16 bytes or larger, the move
or clear is done with AltiVec instructions.
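
For illustration, a copy like the following (a hypothetical example in the
style of the new testcases; bar() is an assumed external consumer, not
part of the patch) should now compile to lvx/stvx pairs with -O -maltivec
instead of a sequence of scalar or string moves:

extern void bar (int *);

void
copy_example (void)
{
  static int src[8] __attribute__ ((aligned (16)))
    = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int dst[8] __attribute__ ((aligned (16)));

  /* 32 bytes, 16-byte aligned: with the patch this is expected to
     expand to two lvx/stvx pairs (move_bytes == 16, mode V4SImode).  */
  __builtin_memcpy (dst, src, sizeof dst);
  bar (dst);
}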

Bootstrapped & tested on powerpc-darwin, plus a c-torture run with
-maltivec (still running; I won't commit until it succeeds).

-- 
- Geoffrey Keating <geoffk@apple.com>

===File ~/patches/gcc-vectormemcpy.patch====================
2004-08-31  Geoffrey Keating  <geoffk@apple.com>

	* config/rs6000/rs6000.c (expand_block_clear): Use vector
	instructions if available.
	(expand_block_move): Likewise.

Index: testsuite/ChangeLog
2004-08-31  Geoffrey Keating  <geoffk@apple.com>

	* gcc.dg/ppc-vector-memcpy.c: New.
	* gcc.dg/ppc-vector-memset.c: New.

Index: config/rs6000/rs6000.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/rs6000/rs6000.c,v
retrieving revision 1.703
diff -u -p -u -p -r1.703 rs6000.c
--- config/rs6000/rs6000.c	27 Aug 2004 18:29:52 -0000	1.703
+++ config/rs6000/rs6000.c	1 Sep 2004 01:11:19 -0000
@@ -8319,11 +8319,12 @@ expand_block_clear (rtx operands[])
   rtx orig_dest = operands[0];
   rtx bytes_rtx	= operands[1];
   rtx align_rtx = operands[2];
-  int constp	= (GET_CODE (bytes_rtx) == CONST_INT);
-  int align;
-  int bytes;
+  bool constp	= (GET_CODE (bytes_rtx) == CONST_INT);
+  HOST_WIDE_INT align;
+  HOST_WIDE_INT bytes;
   int offset;
   int clear_bytes;
+  int clear_step;
 
   /* If this is not a fixed size move, just call memcpy */
   if (! constp)
@@ -8339,49 +8340,59 @@ expand_block_clear (rtx operands[])
   if (bytes <= 0)
     return 1;
 
-  if (bytes > (TARGET_POWERPC64 && align >= 32 ? 64 : 32))
-    return 0;
+  /* Use the builtin memset after a point, to avoid huge code bloat.
+     When optimize_size, avoid any significant code bloat; calling
+     memset is about 4 instructions, so allow for one instruction to
+     load zero and three to do clearing.  */
+  if (TARGET_ALTIVEC && align >= 128)
+    clear_step = 16;
+  else if (TARGET_POWERPC64 && align >= 64)
+    clear_step = 8;
+  else
+    clear_step = 4;
 
-  if (optimize_size && bytes > 16)
+  if (optimize_size && bytes > 3 * clear_step)
+    return 0;
+  if (! optimize_size && bytes > 8 * clear_step)
     return 0;
 
   for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
     {
-      rtx (*mov) (rtx, rtx);
       enum machine_mode mode = BLKmode;
       rtx dest;
 
-      if (bytes >= 8 && TARGET_POWERPC64
-	       /* 64-bit loads and stores require word-aligned
-		  displacements.  */
-	       && (align >= 64 || (!STRICT_ALIGNMENT && align >= 32)))
+      if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
+	{
+	  clear_bytes = 16;
+	  mode = V4SImode;
+	}
+      else if (bytes >= 8 && TARGET_POWERPC64
+	  /* 64-bit loads and stores require word-aligned
+	     displacements.  */
+	  && (align >= 64 || (!STRICT_ALIGNMENT && align >= 32)))
 	{
 	  clear_bytes = 8;
 	  mode = DImode;
-	  mov = gen_movdi;
 	}
-      else if (bytes >= 4 && !STRICT_ALIGNMENT)
+      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
 	{			/* move 4 bytes */
 	  clear_bytes = 4;
 	  mode = SImode;
-	  mov = gen_movsi;
 	}
-      else if (bytes == 2 && !STRICT_ALIGNMENT)
+      else if (bytes == 2 && (align >= 16 || !STRICT_ALIGNMENT))
 	{			/* move 2 bytes */
 	  clear_bytes = 2;
 	  mode = HImode;
-	  mov = gen_movhi;
 	}
       else /* move 1 byte at a time */
 	{
 	  clear_bytes = 1;
 	  mode = QImode;
-	  mov = gen_movqi;
 	}
 
       dest = adjust_address (orig_dest, mode, offset);
 
-      emit_insn ((*mov) (dest, const0_rtx));
+      emit_move_insn (dest, CONST0_RTX (mode));
     }
 
   return 1;
@@ -8441,7 +8452,15 @@ expand_block_move (rtx operands[])
       enum machine_mode mode = BLKmode;
       rtx src, dest;
 
-      if (TARGET_STRING
+      /* Altivec first, since it will be faster than a string move
+	 when it applies, and usually not significantly larger.  */
+      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+	{
+	  move_bytes = 16;
+	  mode = V4SImode;
+	  gen_func.mov = gen_movv4si;
+	}
+      else if (TARGET_STRING
 	  && bytes > 24		/* move up to 32 bytes at a time */
 	  && ! fixed_regs[5]
 	  && ! fixed_regs[6]
Index: testsuite/gcc.dg/ppc-vector-memcpy.c
===================================================================
RCS file: testsuite/gcc.dg/ppc-vector-memcpy.c
diff -N testsuite/gcc.dg/ppc-vector-memcpy.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ testsuite/gcc.dg/ppc-vector-memcpy.c	1 Sep 2004 01:11:25 -0000
@@ -0,0 +1,9 @@
+/* { dg-do compile { target powerpc*-*-* } } */
+/* { dg-options "-O -maltivec" } */
+/* { dg-final { scan-assembler "lvx" } } */
+
+void foo(void)
+{
+  int x[8] __attribute__((aligned(128))) = { 1 };
+  bar (x);
+}
Index: testsuite/gcc.dg/ppc-vector-memset.c
===================================================================
RCS file: testsuite/gcc.dg/ppc-vector-memset.c
diff -N testsuite/gcc.dg/ppc-vector-memset.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ testsuite/gcc.dg/ppc-vector-memset.c	1 Sep 2004 01:11:25 -0000
@@ -0,0 +1,12 @@
+/* { dg-do compile { target powerpc*-*-* } } */
+/* { dg-options "-O -maltivec" } */
+/* { dg-final { scan-assembler "stvx" } } */
+
+#include <string.h>
+
+void foo(void)
+{
+  int x[8] __attribute__((aligned(128)));
+  memset (x, 0, sizeof (x));
+  bar (x);
+}
============================================================
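
As a cross-check of the new size cutoffs in expand_block_clear: with
-maltivec and 16-byte alignment, clear_step is 16, so the 3x (-Os) and
8x limits above work out to 48 and 128 bytes.  A sketch (the sizes and
the bar() helper are illustrative assumptions, not from the patch):

extern void bar (char *);

void
clear_example (void)
{
  char a[48] __attribute__ ((aligned (16)));
  char b[128] __attribute__ ((aligned (16)));
  char c[144] __attribute__ ((aligned (16)));

  __builtin_memset (a, 0, sizeof a);  /* 48 <= 3*16: cleared inline even at -Os.  */
  __builtin_memset (b, 0, sizeof b);  /* 128 <= 8*16: cleared inline when not optimizing for size.  */
  __builtin_memset (c, 0, sizeof c);  /* 144 > 8*16: expand_block_clear gives up; memset is called.  */
  bar (a);
  bar (b);
  bar (c);
}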

