This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH 2/4] S/390: Unroll mvc/xc loop for memset with small constant lengths.


When expanding a memset we emit a loop of MVC/XC instructions, each
dealing with a 256 byte block.  This loop used to get unrolled by older
GCCs when the length operand was a constant.  GCC probably lost this
ability when more of the loop unrolling was moved to the tree level.

With this patch the unrolling is done manually when emitting the RTL
insns.

gcc/testsuite/ChangeLog:

2017-01-03  Andreas Krebbel  <krebbel@linux.vnet.ibm.com>

	* gcc.target/s390/memset-1.c: New test.

gcc/ChangeLog:

2017-01-03  Andreas Krebbel  <krebbel@linux.vnet.ibm.com>

	* config/s390/s390.c (s390_expand_setmem): Unroll the loop for
	small constant length operands.
---
 gcc/config/s390/s390.c                   |  56 ++++++++-----
 gcc/testsuite/gcc.target/s390/memset-1.c | 134 +++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/memset-1.c

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 257bce7..1266f45 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -5348,34 +5348,46 @@ s390_expand_setmem (rtx dst, rtx len, rtx val)
 {
   const int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
 
-  if (GET_CODE (len) == CONST_INT && INTVAL (len) == 0)
+  if (GET_CODE (len) == CONST_INT && INTVAL (len) <= 0)
     return;
 
   gcc_assert (GET_CODE (val) == CONST_INT || GET_MODE (val) == QImode);
 
-  if (GET_CODE (len) == CONST_INT && INTVAL (len) > 0 && INTVAL (len) <= 257)
+  /* Expand setmem/clrmem for a constant length operand without a
+     loop if it will be shorter that way.
+     With a constant length and without pfd argument a
+     clrmem loop is 32 bytes -> 5.3 * xc
+     setmem loop is 36 bytes -> 3.6 * (mvi/stc + mvc) */
+  if (GET_CODE (len) == CONST_INT
+      && ((INTVAL (len) <= 256 * 5 && val == const0_rtx)
+	  || INTVAL (len) <= 257 * 3)
+      && (!TARGET_MVCLE || INTVAL (len) <= 256))
     {
-      if (val == const0_rtx && INTVAL (len) <= 256)
-        emit_insn (gen_clrmem_short (dst, GEN_INT (INTVAL (len) - 1)));
-      else
-	{
-	  /* Initialize memory by storing the first byte.  */
-	  emit_move_insn (adjust_address (dst, QImode, 0), val);
+      HOST_WIDE_INT o, l;
 
-	  if (INTVAL (len) > 1)
-	    {
-	      /* Initiate 1 byte overlap move.
-	         The first byte of DST is propagated through DSTP1.
-		 Prepare a movmem for:  DST+1 = DST (length = LEN - 1).
-		 DST is set to size 1 so the rest of the memory location
-		 does not count as source operand.  */
-	      rtx dstp1 = adjust_address (dst, VOIDmode, 1);
-	      set_mem_size (dst, 1);
-
-	      emit_insn (gen_movmem_short (dstp1, dst,
-					   GEN_INT (INTVAL (len) - 2)));
-	    }
-	}
+      if (val == const0_rtx)
+	/* clrmem: emit 256 byte blockwise XCs.  */
+	for (l = INTVAL (len), o = 0; l > 0; l -= 256, o += 256)
+	  {
+	    rtx newdst = adjust_address (dst, BLKmode, o);
+	    emit_insn (gen_clrmem_short (newdst,
+					 GEN_INT (l > 256 ? 255 : l - 1)));
+	  }
+      else
+	/* setmem: emit 1(mvi) + 256(mvc) byte blockwise memsets by
+	   setting first byte to val and using a 256 byte mvc with one
+	   byte overlap to propagate the byte.  */
+	for (l = INTVAL (len), o = 0; l > 0; l -= 257, o += 257)
+	  {
+	    rtx newdst = adjust_address (dst, BLKmode, o);
+	    emit_move_insn (adjust_address (dst, QImode, o), val);
+	    if (l > 1)
+	      {
+		rtx newdstp1 = adjust_address (dst, BLKmode, o + 1);
+		emit_insn (gen_movmem_short (newdstp1, newdst,
+					     GEN_INT (l > 257 ? 255 : l - 2)));
+	      }
+	  }
     }
 
   else if (TARGET_MVCLE)
diff --git a/gcc/testsuite/gcc.target/s390/memset-1.c b/gcc/testsuite/gcc.target/s390/memset-1.c
new file mode 100644
index 0000000..7b43b97c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/memset-1.c
@@ -0,0 +1,134 @@
+/* Make sure that short memset's with constant length are emitted
+   without loop statements.  */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch" } */
+
+/* 1 mvc */
+void
+*memset1(void *s, int c)
+{
+  return __builtin_memset (s, c, 42);
+}
+
+/* 3 mvc */
+void
+*memset2(void *s, int c)
+{
+  return __builtin_memset (s, c, 700);
+}
+
+/* nop */
+void
+*memset3(void *s, int c)
+{
+  return __builtin_memset (s, c, 0);
+}
+
+/* mvc */
+void
+*memset4(void *s, int c)
+{
+  return __builtin_memset (s, c, 256);
+}
+
+/* 2 mvc */
+void
+*memset5(void *s, int c)
+{
+  return __builtin_memset (s, c, 512);
+}
+
+/* still 2 mvc through the additional first byte  */
+void
+*memset6(void *s, int c)
+{
+  return __builtin_memset (s, c, 514);
+}
+
+/* 3 mvc */
+void
+*memset7(void *s, int c)
+{
+  return __builtin_memset (s, c, 515);
+}
+
+/* still 3 mvc through the additional first byte  */
+void
+*memset8(void *s, int c)
+{
+  return __builtin_memset (s, c, 771);
+}
+
+/* Use mvc loop: 2 mvc */
+void
+*memset9(void *s, int c)
+{
+  return __builtin_memset (s, c, 772);
+}
+
+/* 3 mvc with displacement overflow after the first */
+void
+*memset10(void *s, int c)
+{
+  return __builtin_memset ((char*)s + 4000, c, 700);
+}
+
+/* 1 xc */
+void
+*clrmem1(void *s)
+{
+  return __builtin_memset (s, 0, 42);
+}
+
+/* 3 xc */
+void
+*clrmem2(void *s)
+{
+  return __builtin_memset (s, 0, 700);
+}
+
+/* nop */
+void
+*clrmem3(void *s)
+{
+  return __builtin_memset (s, 0, 0);
+}
+
+/* 1 xc */
+void
+*clrmem4(void *s)
+{
+  return __builtin_memset (s, 0, 256);
+}
+
+/* 2 xc */
+void
+*clrmem5(void *s)
+{
+  return __builtin_memset (s, 0, 512);
+}
+
+/* 3 xc */
+void
+*clrmem6(void *s)
+{
+  return __builtin_memset (s, 0, 768);
+}
+
+/* start using xc loop */
+void
+*clrmem7(void *s)
+{
+  return __builtin_memset (s, 0, 1281);
+}
+
+/* 3 xc with displacement overflow after the first */
+void
+*clrmem8(void *s)
+{
+  return __builtin_memset (s + 4000, 0, 700);
+}
+
+/* { dg-final { scan-assembler-times "mvc" 19 } } */
+/* { dg-final { scan-assembler-times "xc" 15 } } */
-- 
2.9.1


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]