[PATCH] Optimize strcpy(p, "abc") and memcpy(p, "abc", 4)

Jakub Jelinek jakub@redhat.com
Fri Nov 24 07:53:00 GMT 2000


Hi!

This patch adds one optimization glibc was doing in bits/string2.h:
for strcpy and memcpy from string constant, if emit_block_move would move it
by pieces, we can instead just store the pre-computed constants into the
destination block. So e.g. on i386 instead of:

.LC0:
        .string "abc"
foo:
        pushl   %ebp
        movl    .LC0, %eax
        movl    %esp, %ebp
        movl    %eax, p
        popl    %ebp
        ret

we can do:
foo:
        pushl   %ebp
        movl    %esp, %ebp
        movl    $6513249, p
        popl    %ebp
        ret

(and if the string will not be used elsewhere, we will not output it at
all).
Bootstrapped on i386-redhat-linux, no regressions, briefly tested on sparc
to see if the endianity issues are solved correctly.
I think the c_readstr routine might be used in other cases as well, like
e.g. strcmp with one string constant, etc.

What I've postponed for later in this patch is some way how to limit the
largest used mode for this on arches where building huge constants in
registers is expensive (e.g. on sparc64 writing in SImode is 100% ok, but
writing in DImode is questionable (constants would usually take 6
instructions to compute plus one DImode store while if the same was done in
SImode, we could use twice 2 instructions to compute the 2 SImode constants
and 2 stores). I'm not sure how to deduce this information from machine
descriptions though.

Ok to commit?

2000-11-24  Jakub Jelinek  <jakub@redhat.com>

	* expr.h (store_by_pieces): Add prototype.
	* expr.c (struct store_by_pieces): Renamed from clear_by_pieces.
	(store_by_pieces): New.
	(clear_by_pieces): New.
	(store_by_pieces_1): Renamed from clear_by_pieces, handle storing
	arbitrary compiler generated constants into memory block.
	(store_by_pieces_2): Renamed from clear_by_pieces_1, likewise.
	* builtins.c (c_readstr): New.
	(builtin_memcpy_read_str): New.
	(expand_builtin_memcpy): If src is string constant and
	emit_block_move would move it by pieces, compute integer constants
	from the string and store it into memory block instead.

	* gcc.c-torture/execute/string-opt-6.c: New test.

--- gcc/expr.h.jj	Mon Nov  6 09:38:24 2000
+++ gcc/expr.h	Fri Nov 24 12:19:02 2000
@@ -1010,6 +1010,16 @@ extern void use_group_regs PARAMS ((rtx 
    alignment.  */
 extern rtx clear_storage PARAMS ((rtx, rtx, unsigned int));
 
+/* If storing by pieces is desirable, generate several move instructions to
+   store LEN bytes generated by CONSTFUN to block TO.  (A MEM rtx with
+   BLKmode).  CONSTFUNDATA is a pointer which will be passed as argument
+   in every CONSTFUN call.  ALIGN is maximum alignment we can assume.
+   Return non-zero if it had emitted something.  */
+extern int store_by_pieces PARAMS ((rtx, unsigned HOST_WIDE_INT,
+				    rtx (*) (PTR, HOST_WIDE_INT,
+					     enum machine_mode),
+				    PTR, unsigned int));
+
 /* Emit insns to set X from Y.  */
 extern rtx emit_move_insn PARAMS ((rtx, rtx));
 
--- gcc/expr.c.jj	Fri Nov 24 10:47:59 2000
+++ gcc/expr.c	Fri Nov 24 14:10:43 2000
@@ -127,10 +127,10 @@ struct move_by_pieces
   int reverse;
 };
 
-/* This structure is used by clear_by_pieces to describe the clear to
+/* This structure is used by store_by_pieces to describe the clear to
    be performed.  */
 
-struct clear_by_pieces
+struct store_by_pieces
 {
   rtx to;
   rtx to_addr;
@@ -138,6 +138,8 @@ struct clear_by_pieces
   int explicit_inc_to;
   unsigned HOST_WIDE_INT len;
   HOST_WIDE_INT offset;
+  rtx (*constfun) PARAMS ((PTR, HOST_WIDE_INT, enum machine_mode));
+  PTR constfundata;
   int reverse;
 };
 
@@ -153,9 +155,11 @@ static void move_by_pieces_1	PARAMS ((rt
 					 struct move_by_pieces *));
 static void clear_by_pieces	PARAMS ((rtx, unsigned HOST_WIDE_INT,
 					 unsigned int));
-static void clear_by_pieces_1	PARAMS ((rtx (*) (rtx, ...),
+static void store_by_pieces_1	PARAMS ((struct store_by_pieces *,
+					 unsigned int));
+static void store_by_pieces_2	PARAMS ((rtx (*) (rtx, ...),
 					 enum machine_mode,
-					 struct clear_by_pieces *));
+					 struct store_by_pieces *));
 static rtx get_subtarget	PARAMS ((rtx));
 static int is_zeros_p		PARAMS ((tree));
 static int mostly_zeros_p	PARAMS ((tree));
@@ -2249,6 +2253,33 @@ use_group_regs (call_fusage, regs)
     }
 }
 
+/* If storing by pieces is desirable, generate several move instructions to
+   store LEN bytes generated by CONSTFUN to block TO.  (A MEM rtx with
+   BLKmode).  CONSTFUNDATA is a pointer which will be passed as argument in
+   every CONSTFUN call.  ALIGN is maximum alignment we can assume.
+   Return non-zero if it had emitted something.  */
+
+int
+store_by_pieces (to, len, constfun, constfundata, align)
+     rtx to;
+     unsigned HOST_WIDE_INT len;
+     rtx (*constfun) PARAMS ((PTR, HOST_WIDE_INT, enum machine_mode));
+     PTR constfundata;
+     unsigned int align;
+{
+  struct store_by_pieces data;
+
+  if (! MOVE_BY_PIECES_P (len, align))
+    return 0;
+  to = protect_from_queue (to, 1);
+  data.constfun = constfun;
+  data.constfundata = constfundata;
+  data.len = len;
+  data.to = to;
+  store_by_pieces_1 (&data, align);
+  return 1;
+}
+
 /* Generate several move instructions to clear LEN bytes of block TO.  (A MEM
    rtx with BLKmode).  The caller must pass TO through protect_from_queue
    before calling. ALIGN is maximum alignment we can assume.  */
@@ -2259,31 +2290,47 @@ clear_by_pieces (to, len, align)
      unsigned HOST_WIDE_INT len;
      unsigned int align;
 {
-  struct clear_by_pieces data;
-  rtx to_addr = XEXP (to, 0);
+  struct store_by_pieces data;
+
+  data.constfun = NULL_PTR;
+  data.constfundata = NULL_PTR;
+  data.len = len;
+  data.to = to;
+  store_by_pieces_1 (&data, align);
+}
+
+/* Subroutine of clear_by_pieces and store_by_pieces.
+   Generate several move instructions to store LEN bytes of block TO.  (A MEM
+   rtx with BLKmode).  The caller must pass TO through protect_from_queue
+   before calling. ALIGN is maximum alignment we can assume.  */
+
+static void
+store_by_pieces_1 (data, align)
+     struct store_by_pieces *data;
+     unsigned int align;
+{
+  rtx to_addr = XEXP (data->to, 0);
   unsigned HOST_WIDE_INT max_size = MOVE_MAX_PIECES + 1;
   enum machine_mode mode = VOIDmode, tmode;
   enum insn_code icode;
 
-  data.offset = 0;
-  data.to_addr = to_addr;
-  data.to = to;
-  data.autinc_to
+  data->offset = 0;
+  data->to_addr = to_addr;
+  data->autinc_to
     = (GET_CODE (to_addr) == PRE_INC || GET_CODE (to_addr) == PRE_DEC
        || GET_CODE (to_addr) == POST_INC || GET_CODE (to_addr) == POST_DEC);
 
-  data.explicit_inc_to = 0;
-  data.reverse
+  data->explicit_inc_to = 0;
+  data->reverse
     = (GET_CODE (to_addr) == PRE_DEC || GET_CODE (to_addr) == POST_DEC);
-  if (data.reverse)
-    data.offset = len;
-  data.len = len;
+  if (data->reverse)
+    data->offset = data->len;
 
-  /* If copying requires more than two move insns,
+  /* If storing requires more than two move insns,
      copy addresses to registers (to make displacements shorter)
      and use post-increment if available.  */
-  if (!data.autinc_to
-      && move_by_pieces_ninsns (len, align) > 2)
+  if (!data->autinc_to
+      && move_by_pieces_ninsns (data->len, align) > 2)
     {
       /* Determine the main mode we'll be using.  */
       for (tmode = GET_CLASS_NARROWEST_MODE (MODE_INT);
@@ -2291,30 +2338,30 @@ clear_by_pieces (to, len, align)
 	if (GET_MODE_SIZE (tmode) < max_size)
 	  mode = tmode;
 
-      if (USE_STORE_PRE_DECREMENT (mode) && data.reverse && ! data.autinc_to)
+      if (USE_STORE_PRE_DECREMENT (mode) && data->reverse && ! data->autinc_to)
 	{
-	  data.to_addr = copy_addr_to_reg (plus_constant (to_addr, len));
-	  data.autinc_to = 1;
-	  data.explicit_inc_to = -1;
+	  data->to_addr = copy_addr_to_reg (plus_constant (to_addr, data->len));
+	  data->autinc_to = 1;
+	  data->explicit_inc_to = -1;
 	}
 
-      if (USE_STORE_POST_INCREMENT (mode) && ! data.reverse
-	  && ! data.autinc_to)
+      if (USE_STORE_POST_INCREMENT (mode) && ! data->reverse
+	  && ! data->autinc_to)
 	{
-	  data.to_addr = copy_addr_to_reg (to_addr);
-	  data.autinc_to = 1;
-	  data.explicit_inc_to = 1;
+	  data->to_addr = copy_addr_to_reg (to_addr);
+	  data->autinc_to = 1;
+	  data->explicit_inc_to = 1;
 	}
 
-      if ( !data.autinc_to && CONSTANT_P (to_addr))
-	data.to_addr = copy_addr_to_reg (to_addr);
+      if ( !data->autinc_to && CONSTANT_P (to_addr))
+	data->to_addr = copy_addr_to_reg (to_addr);
     }
 
   if (! SLOW_UNALIGNED_ACCESS (word_mode, align)
       || align > MOVE_MAX * BITS_PER_UNIT || align >= BIGGEST_ALIGNMENT)
     align = MOVE_MAX * BITS_PER_UNIT;
 
-  /* First move what we can in the largest integer mode, then go to
+  /* First store what we can in the largest integer mode, then go to
      successively smaller modes.  */
 
   while (max_size > 1)
@@ -2329,25 +2376,25 @@ clear_by_pieces (to, len, align)
 
       icode = mov_optab->handlers[(int) mode].insn_code;
       if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
-	clear_by_pieces_1 (GEN_FCN (icode), mode, &data);
+	store_by_pieces_2 (GEN_FCN (icode), mode, data);
 
       max_size = GET_MODE_SIZE (mode);
     }
 
   /* The code above should have handled everything.  */
-  if (data.len != 0)
+  if (data->len != 0)
     abort ();
 }
 
-/* Subroutine of clear_by_pieces.  Clear as many bytes as appropriate
+/* Subroutine of store_by_pieces_1.  Store as many bytes as appropriate
    with move instructions for mode MODE.  GENFUN is the gen_... function
    to make a move insn for that mode.  DATA has all the other info.  */
 
 static void
-clear_by_pieces_1 (genfun, mode, data)
+store_by_pieces_2 (genfun, mode, data)
      rtx (*genfun) PARAMS ((rtx, ...));
      enum machine_mode mode;
-     struct clear_by_pieces *data;
+     struct store_by_pieces *data;
 {
   unsigned int size = GET_MODE_SIZE (mode);
   rtx to1;
@@ -2367,9 +2414,13 @@ clear_by_pieces_1 (genfun, mode, data)
 			      plus_constant (data->to_addr, data->offset));
 
       if (HAVE_PRE_DECREMENT && data->explicit_inc_to < 0)
-	emit_insn (gen_add2_insn (data->to_addr, GEN_INT (-size)));
+	emit_insn (gen_add2_insn (data->to_addr,
+				  GEN_INT (-(HOST_WIDE_INT) size)));
 
-      emit_insn ((*genfun) (to1, const0_rtx));
+      emit_insn ((*genfun) (to1, data->constfun ?
+				 (*data->constfun) (data->constfundata,
+						    data->offset, mode)
+				 : const0_rtx));
 
       if (HAVE_POST_INCREMENT && data->explicit_inc_to > 0)
 	emit_insn (gen_add2_insn (data->to_addr, GEN_INT (size)));
--- gcc/builtins.c.jj	Mon Nov 20 16:27:20 2000
+++ gcc/builtins.c	Fri Nov 24 14:47:14 2000
@@ -81,6 +81,7 @@ tree (*lang_type_promotes_to) PARAMS ((t
 static int get_pointer_alignment	PARAMS ((tree, unsigned));
 static tree c_strlen			PARAMS ((tree));
 static const char *c_getstr		PARAMS ((tree));
+static rtx c_readstr			PARAMS ((const char *, enum machine_mode));
 static rtx get_memory_rtx		PARAMS ((tree));
 static int apply_args_size		PARAMS ((void));
 static int apply_result_size		PARAMS ((void));
@@ -104,6 +105,8 @@ static rtx expand_builtin_memcmp	PARAMS 
 #endif
 static rtx expand_builtin_strcmp	PARAMS ((tree, rtx,
 						 enum machine_mode));
+static rtx builtin_memcpy_read_str	PARAMS ((PTR, HOST_WIDE_INT,
+						 enum machine_mode));
 static rtx expand_builtin_memcpy	PARAMS ((tree));
 static rtx expand_builtin_strcpy	PARAMS ((tree));
 static rtx expand_builtin_memset	PARAMS ((tree));
@@ -304,6 +307,42 @@ c_getstr (src)
   return ptr + offset;
 }
 
+/* Return a CONST_INT or CONST_DOUBLE corresponding to target
+   reading GET_MODE_BITSIZE (MODE) bits from string constant
+   STR.  */
+static rtx
+c_readstr (str, mode)
+     const char *str;
+     enum machine_mode mode;
+{
+  HOST_WIDE_INT c[2];
+  HOST_WIDE_INT ch;
+  unsigned int i, j;
+
+  if (GET_MODE_CLASS (mode) != MODE_INT)
+    abort ();
+  c[0] = 0;
+  c[1] = 0;
+  ch = 1;
+  for (i = 0; i < GET_MODE_SIZE (mode); i++)
+    {
+      if (!ch)
+	abort ();  /* Attempt to read past end of string.  */
+      j = i;
+      if (WORDS_BIG_ENDIAN)
+	j = GET_MODE_SIZE (mode) - i - 1;
+      if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN
+	  && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+	j = j + UNITS_PER_WORD - 2 * (j % UNITS_PER_WORD) - 1;
+      j *= BITS_PER_UNIT;
+      if (j > 2 * HOST_BITS_PER_WIDE_INT)
+	abort ();
+      ch = (unsigned char) str[i];
+      c[j / HOST_BITS_PER_WIDE_INT] |= ch << (j % HOST_BITS_PER_WIDE_INT);
+    }
+  return immed_double_const (c[0], c[1], mode);
+}
+
 /* Given TEM, a pointer to a stack frame, follow the dynamic chain COUNT
    times to get the address of either a higher stack frame, or a return
    address located within it (depending on FNDECL_CODE).  */
@@ -1681,6 +1720,23 @@ expand_builtin_strpbrk (arglist, target,
     }
 }
 
+/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
+   bytes from constant string DATA + OFFSET and return it as target
+   constant.  */
+static rtx
+builtin_memcpy_read_str (data, offset, mode)
+     PTR data;
+     HOST_WIDE_INT offset;
+     enum machine_mode mode;
+{
+  const char *str = (const char *) data;
+
+  if (offset + GET_MODE_SIZE (mode) > strlen (str) + 1)
+    abort ();  /* Attempt to read past the end of constant string.  */
+
+  return c_readstr (str + offset, mode);
+}
+
 /* Expand a call to the memcpy builtin, with arguments in ARGLIST.  */
 static rtx
 expand_builtin_memcpy (arglist)
@@ -1702,6 +1758,7 @@ expand_builtin_memcpy (arglist)
       tree dest = TREE_VALUE (arglist);
       tree src = TREE_VALUE (TREE_CHAIN (arglist));
       tree len = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
+      const char *src_str;
 
       int src_align = get_pointer_alignment (src, BIGGEST_ALIGNMENT);
       int dest_align = get_pointer_alignment (dest, BIGGEST_ALIGNMENT);
@@ -1713,8 +1770,22 @@ expand_builtin_memcpy (arglist)
 	return 0;
 
       dest_mem = get_memory_rtx (dest);
-      src_mem = get_memory_rtx (src);
       len_rtx = expand_expr (len, NULL_RTX, VOIDmode, 0);
+      src_str = c_getstr (src);
+
+      /* If SRC is a string constant and block move would be done
+	 by pieces, we can avoid loading the string from memory
+	 and only stored the computed constants.  */
+      if (src_str
+	  && !current_function_check_memory_usage
+	  && GET_CODE (len_rtx) == CONST_INT
+	  && (unsigned HOST_WIDE_INT) INTVAL (len_rtx) <= strlen (src_str) + 1
+	  && store_by_pieces (dest_mem, INTVAL (len_rtx),
+			      builtin_memcpy_read_str,
+			      (PTR) src_str, dest_align))
+	return force_operand (XEXP (dest_mem, 0), NULL_RTX);
+
+      src_mem = get_memory_rtx (src);
 
       /* Just copy the rights of SRC to the rights of DEST.  */
       if (current_function_check_memory_usage)
--- gcc/testsuite/gcc.c-torture/execute/string-opt-6.c.jj	Fri Nov 24 15:10:53 2000
+++ gcc/testsuite/gcc.c-torture/execute/string-opt-6.c	Fri Nov 24 15:10:33 2000
@@ -0,0 +1,46 @@
+/* Copyright (C) 2000  Free Software Foundation.
+
+   Ensure builtin memcpy and strcpy perform correctly.
+
+   Written by Jakub Jelinek, 11/24/2000.  */
+
+extern void abort (void);
+extern char *strcpy (char *, const char *);
+typedef __SIZE_TYPE__ size_t;
+extern void *memcpy (void *, const void *, size_t);
+extern int memcmp (const void *, const void *, size_t);
+
+char p[32] = "";
+
+int main()
+{
+  if (strcpy (p, "abcde") != p || memcmp (p, "abcde", 6))
+    abort ();
+  if (strcpy (p + 16, "vwxyz" + 1) != p + 16 || memcmp (p + 16, "wxyz", 5))
+    abort ();
+  if (strcpy (p + 1, "") != p + 1 || memcmp (p, "a\0cde", 6))
+    abort ();  
+  if (strcpy (p + 3, "fghij") != p + 3 || memcmp (p, "a\0cfghij", 9))
+    abort ();
+  if (memcpy (p, "ABCDE", 6) != p || memcmp (p, "ABCDE", 6))
+    abort ();
+  if (memcpy (p + 16, "VWX" + 1, 2) != p + 16 || memcmp (p + 16, "WXyz", 5))
+    abort ();
+  if (memcpy (p + 1, "", 1) != p + 1 || memcmp (p, "A\0CDE", 6))
+    abort ();  
+  if (memcpy (p + 3, "FGHI", 4) != p + 3 || memcmp (p, "A\0CFGHIj", 9))
+    abort ();
+
+  return 0;
+}
+
+#ifdef __OPTIMIZE__
+/* When optimizing, all the above cases should be transformed into
+   something else.  So any remaining calls to the original function
+   should abort.  */
+static char *
+strcpy (char *d, const char *s)
+{
+  abort ();
+}
+#endif

	Jakub


More information about the Gcc-patches mailing list