[PATCH] Optimize ix86_expand_clrmem


Hi!

For example, take:
void foo (char *p, long *q)
{
  __builtin_memset (p, 0, 7);
  __builtin_memset (q, 0, 9);
}
GCC -m32 -O2 -march=pentium4 generates code like:
	cld
        movl    $1, %ecx
        rep
        stosl
        stosw
        stosb
...
        movb    $2, %cl
        rep
        stosl
        stosb
and with -m64:
        movl    $0, (%rdi)
        movw    $0, 4(%rdi)
        movb    $0, 6(%rdi)
...
        cld
        movl    $1, %ecx
        rep
        stosq
        movb    $0, (%rdi)
This is inefficient both in speed and in code size.
For code size, cld; movl $N, %ecx; rep; stos{b,l} is 8 bytes, while
cld followed by N stos{b,l} instructions is 1 + N bytes.
Similarly, cld; movl $N, %ecx; rep; stosq is 9 bytes, while cld followed
by N stosq instructions is 1 + 2N bytes (stosq takes 2 bytes because of
its REX prefix).
So, for -Os it is beneficial to use a sequence of stosl (resp. stosq)
instructions if N is at most 7 (resp. 4).
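To make the break-even points concrete, here is a tiny standalone
program (purely an illustration of the arithmetic above, not part of
the patch) that compares the two encodings:

/* Byte counts are the instruction encodings quoted above; this is
   just a sketch, not GCC code.  */
#include <stdio.h>

int
main (void)
{
  /* cld (1) + movl $N, %ecx (5) + rep (1) + stos{b,l} (1) = 8 bytes.  */
  int rep_stosl = 1 + 5 + 1 + 1;
  /* stosq carries a REX prefix, so 2 bytes per instruction:
     cld (1) + movl $N, %ecx (5) + rep (1) + stosq (2) = 9 bytes.  */
  int rep_stosq = 1 + 5 + 1 + 2;
  int n;

  for (n = 1; n <= 8; n++)
    printf ("N=%d: stosl unrolled %d vs. rep %d, stosq unrolled %d vs. rep %d\n",
	    n, 1 + n, rep_stosl, 1 + 2 * n, rep_stosq);
  /* The unrolled form is no larger up to N = 7 for stosl and N = 4
     for stosq, which are the -Os thresholds used in the patch.  */
  return 0;
}
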
I have done some benchmarking on PIII, P4, Opteron and Nocona.  On PIII
and Opteron, even a sequence of 7 stosl instructions (resp. 4 stosq) is
still faster than loading up %ecx and using rep; stos{l,q}, while on P4
and Nocona N equal to 4 already seems to be break-even and rep; stos*
is faster for bigger values.
On all of these arches, avoiding stos* entirely for such small bzero
calls seems to be a win, but that is unrelated to this patch
(MOVE_BY_PIECES_P and CLEAR_BY_PIECES_P don't work as they should;
I'll talk about this in another mail).

Ok to commit?  Is anyone running nightly SPEC on i386 and x86-64 against
mainline?

2004-08-05  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386.c (ix86_expand_clrmem): Move gen_cld down to
	the places where it is actually needed.  Don't use repz; stosb
	for -Os with sufficiently small constant sizes.
	For sufficiently small repz; stos{l,q} repeat counts use a sequence
	of stos{l,q} instructions instead.

--- gcc/config/i386/i386.c.jj	2004-08-05 12:03:44.000000000 +0200
+++ gcc/config/i386/i386.c	2004-08-05 13:59:19.136402101 +0200
@@ -11508,13 +11508,20 @@ ix86_expand_clrmem (rtx dst, rtx count_e
   if (destreg != XEXP (dst, 0))
     dst = replace_equiv_address_nv (dst, destreg);
 
-  emit_insn (gen_cld ());
 
   /* When optimizing for size emit simple rep ; movsb instruction for
-     counts not divisible by 4.  */
-
-  if ((!optimize || optimize_size) && (count == 0 || (count & 0x03)))
+     counts not divisible by 4.  The movl $N, %ecx; rep; stosb
+     sequence is 7 bytes long, so if optimizing for size and count is
+     small enough that some stosl, stosw and stosb instructions without
+     rep are shorter, fall back into the next if.  */
+
+  if ((!optimize || optimize_size)
+      && (count == 0
+	  || ((count & 0x03)
+	      && (!optimize_size || (count & 0x03) + (count >> 2) > 7))))
     {
+      emit_insn (gen_cld ());
+
       countreg = ix86_zero_extend_to_Pmode (count_exp);
       zeroreg = copy_to_mode_reg (QImode, const0_rtx);
       destexp = gen_rtx_PLUS (Pmode, destreg, countreg);
@@ -11528,17 +11535,54 @@ ix86_expand_clrmem (rtx dst, rtx count_e
       int size = TARGET_64BIT && !optimize_size ? 8 : 4;
       unsigned HOST_WIDE_INT offset = 0;
 
+      emit_insn (gen_cld ());
+
       zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
       if (count & ~(size - 1))
 	{
-	  countreg = copy_to_mode_reg (counter_mode,
-				       GEN_INT ((count >> (size == 4 ? 2 : 3))
-						& (TARGET_64BIT ? -1 : 0x3fffffff)));
-	  countreg = ix86_zero_extend_to_Pmode (countreg);
-	  destexp = gen_rtx_ASHIFT (Pmode, countreg, GEN_INT (size == 4 ? 2 : 3));
-	  destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
-	  emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, destexp));
-	  offset = count & ~(size - 1);
+	  unsigned HOST_WIDE_INT repcount;
+	  unsigned int max_nonrep;
+
+	  repcount = count >> (size == 4 ? 2 : 3);
+	  if (!TARGET_64BIT)
+	    repcount &= 0x3fffffff;
+
+	  /* movl $N, %ecx; rep; stosl is 7 bytes, while N x stosl is N bytes.
+	     movl $N, %ecx; rep; stosq is 8 bytes, while N x stosq is 2xN
+	     bytes.  In both cases the latter seems to be faster for small
+	     values of N.  */
+	  max_nonrep = size == 4 ? 7 : 4;
+	  if (!optimize_size)
+	    switch (ix86_tune)
+	      {
+	      case PROCESSOR_PENTIUM4:
+	      case PROCESSOR_NOCONA:
+	        max_nonrep = 3;
+	        break;
+	      default:
+	        break;
+	      }
+
+	  if (repcount <= max_nonrep)
+	    while (repcount-- > 0)
+	      {
+		rtx mem = adjust_automodify_address_nv (dst,
+							GET_MODE (zeroreg),
+							destreg, offset);
+		emit_insn (gen_strset (destreg, mem, zeroreg));
+		offset += size;
+	      }
+	  else
+	    {
+	      countreg = copy_to_mode_reg (counter_mode, GEN_INT (repcount));
+	      countreg = ix86_zero_extend_to_Pmode (countreg);
+	      destexp = gen_rtx_ASHIFT (Pmode, countreg,
+					GEN_INT (size == 4 ? 2 : 3));
+	      destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
+	      emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg,
+				       destexp));
+	      offset = count & ~(size - 1);
+	    }
 	}
       if (size == 8 && (count & 0x04))
 	{

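For illustration, with this patch the first memset in the testcase
above, compiled with -Os, should expand to something like (a sketch
modulo the %edi/%eax setup, not pasted compiler output):

	cld
	stosl
	stosw
	stosb

instead of the cld; movl $7, %ecx; rep; stosb sequence emitted before
the patch.
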
	Jakub

