[PATCH] Optimize memcpy + memset (PR fortran/45636)

Richard Guenther rguenther@suse.de
Tue Oct 12 09:38:00 GMT 2010


On Mon, 11 Oct 2010, Jakub Jelinek wrote:

> Hi!
> 
> This patch optimizes
> memcpy (x, "abcd", 4);
> memset (x + 4, ' ', 4);
> sequences into a single
> memcpy (x, "abcd    ", 8);
> if it is known (or expected) that the memcpy will be implemented using
> store_by_pieces (otherwise we'd risk enlarging .rodata too much, e.g.
> if there are only few longer string literals in memcpy originally and
> adding too many different memset variants after them would mean the string
> literals couldn't be shared anymore).
> The patch also handles
> memcpy (x, "a", 1);
> memset (x + 1, 'b', 11);
> where the memcpy has been optimized into a store.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> 2010-10-11  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR fortran/45636
> 	* tree-ssa-forwprop.c: Include expr.h.
> 	(constant_pointer_difference, simplify_builtin_call): New functions.
> 	(tree_ssa_forward_propagate_single_use_vars): Call
> 	simplify_builtin_call on builtin calls.
> 
> 	* gcc.c-torture/execute/pr45636.c: New test.
> 	* gfortran.dg/pr45636.f90: New test.
> 
> --- gcc/tree-ssa-forwprop.c.jj	2010-10-07 19:44:48.000000000 +0200
> +++ gcc/tree-ssa-forwprop.c	2010-10-11 16:43:03.000000000 +0200
> @@ -33,6 +33,7 @@ along with GCC; see the file COPYING3.  
>  #include "langhooks.h"
>  #include "flags.h"
>  #include "gimple.h"
> +#include "expr.h"
>  
>  /* This pass propagates the RHS of assignment statements into use
>     sites of the LHS of the assignment.  It's basically a specialized
> @@ -1317,6 +1318,296 @@ simplify_gimple_switch (gimple stmt)
>      }
>  }
>  
> +/* For pointers p2 and p1 return p2 - p1 if the
> +   difference is known and constant, otherwise return NULL.  */
> +
> +static tree
> +constant_pointer_difference (tree p1, tree p2)
> +{
> +  int i, j;
> +  tree exps[2][5];
> +  tree offs[2][5];
> +  int cnt[2];
> +
> +  for (i = 0; i < 2; i++)
> +    {
> +      tree p = i ? p1 : p2;
> +      tree off = size_zero_node;
> +      gimple stmt;
> +      enum tree_code code;
> +
> +      j = 0;
> +      do
> +	{
> +	  if (!POINTER_TYPE_P (TREE_TYPE (p)))
> +	    break;
> +	  if (TREE_CODE (p) == ADDR_EXPR)
> +	    {
> +	      tree q = TREE_OPERAND (p, 0);
> +	      if (handled_component_p (q)
> +		  || TREE_CODE (q) == MEM_REF
> +		  || TREE_CODE (q) == TARGET_MEM_REF)
> +		{
> +		  HOST_WIDE_INT offset, size, maxsize;
> +		  tree base
> +		    = get_ref_base_and_extent (q, &offset,
> +					       &size, &maxsize);

Please use get_addr_base_and_unit_offset here, that simplifies
the following checks

> +		  if (size == maxsize
> +		      && offset >= 0
> +		      && (offset % BITS_PER_UNIT) == 0)
> +		    {
> +		      q = base;
> +		      off = size_binop (PLUS_EXPR, off,
> +					size_int (offset / BITS_PER_UNIT));
> +		    }
> +		}
> +	      if (TREE_CODE (q) == MEM_REF
> +		  && TREE_CODE (TREE_OPERAND (q, 0)) == SSA_NAME)
> +		{
> +		  p = TREE_OPERAND (q, 0);
> +		  off = size_binop (PLUS_EXPR, off,
> +				    fold_convert (sizetype,
> +						  TREE_OPERAND (q, 1)));

Use double_int_to_tree (sizetype, mem_ref_offset (q)) which properly
sign-extends the pointer offset.

> +		}
> +	      else
> +		{
> +		  exps[i][j] = q;
> +		  offs[i][j++] = off;
> +		  break;
> +		}
> +	    }
> +	  if (TREE_CODE (p) != SSA_NAME)
> +	    break;
> +	  exps[i][j] = p;
> +	  offs[i][j++] = off;
> +	  if (j == 5)
> +	    break;
> +	  stmt = SSA_NAME_DEF_STMT (p);
> +	  if (!is_gimple_assign (stmt) || gimple_assign_lhs (stmt) != p)
> +	    break;
> +	  code = gimple_assign_rhs_code (stmt);
> +	  if (code == POINTER_PLUS_EXPR)
> +	    {
> +	      if (TREE_CODE (gimple_assign_rhs2 (stmt)) != INTEGER_CST)
> +		break;
> +	      off = size_binop (PLUS_EXPR, off, gimple_assign_rhs2 (stmt));
> +	      p = gimple_assign_rhs1 (stmt);
> +	    }
> +	  else if (code == ADDR_EXPR || code == NOP_EXPR)
> +	    p = gimple_assign_rhs1 (stmt);
> +	  else
> +	    break;
> +	}
> +      while (1);
> +      cnt[i] = j;
> +    }

I wonder why the forward-propagation of POINTER_PLUS_EXPR and ADDR_EXPR
doesn't make the above redundant?

> +  for (i = 0; i < cnt[0]; i++)
> +    for (j = 0; j < cnt[1]; j++)
> +      if (exps[0][i] == exps[1][j])
> +	return size_binop (MINUS_EXPR, offs[0][i], offs[1][j]);
> +
> +  return NULL_TREE;
> +}
> +
> +/* *GSI_P is a GIMPLE_CALL to a builtin function.
> +   Optimize
> +   memcpy (p, "abcd", 4);
> +   memset (p + 4, ' ', 3);
> +   into
> +   memcpy (p, "abcd   ", 7);
> +   call if the latter can be stored by pieces during expansion.  */
> +
> +static bool
> +simplify_builtin_call (gimple_stmt_iterator *gsi_p, tree callee2)
> +{
> +  gimple stmt1, stmt2 = gsi_stmt (*gsi_p);
> +  tree vuse = gimple_vuse (stmt2);
> +  if (vuse == NULL)
> +    return false;
> +  stmt1 = SSA_NAME_DEF_STMT (vuse);
> +
> +  switch (DECL_FUNCTION_CODE (callee2))
> +    {
> +    case BUILT_IN_MEMSET:
> +      if (gimple_call_num_args (stmt2) != 3
> +	  || gimple_call_lhs (stmt2)
> +	  || CHAR_BIT != 8
> +	  || BITS_PER_UNIT != 8)
> +	break;
> +      else
> +	{
> +	  tree callee1;
> +	  tree ptr1, src1, str1, off1, len1, lhs1;
> +	  tree ptr2 = gimple_call_arg (stmt2, 0);
> +	  tree val2 = gimple_call_arg (stmt2, 1);
> +	  tree len2 = gimple_call_arg (stmt2, 2);
> +	  tree diff, vdef, new_str_cst;
> +	  unsigned int ptr1_align;
> +	  unsigned HOST_WIDE_INT src_len;
> +	  char *src_buf;
> +	  imm_use_iterator imm_iter;
> +	  use_operand_p use_p;
> +	  bool do_fail;
> +
> +	  if (!host_integerp (val2, 0)
> +	      || !host_integerp (len2, 1))
> +	    break;
> +	  if (is_gimple_call (stmt1))
> +	    {
> +	      callee1 = gimple_call_fndecl (stmt1);
> +	      if (callee1 == NULL_TREE
> +		  || DECL_BUILT_IN_CLASS (callee1) != BUILT_IN_NORMAL
> +		  || gimple_call_num_args (stmt1) != 3)
> +		break;
> +	      if (DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMCPY
> +		  && DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMPCPY)
> +		break;
> +	      ptr1 = gimple_call_arg (stmt1, 0);
> +	      src1 = gimple_call_arg (stmt1, 1);
> +	      len1 = gimple_call_arg (stmt1, 2);
> +	      lhs1 = gimple_call_lhs (stmt1);
> +	      if (!host_integerp (len1, 1))
> +		break;
> +	      str1 = string_constant (src1, &off1);
> +	      if (str1 == NULL_TREE)
> +		break;
> +	      if (!host_integerp (off1, 1)
> +		  || compare_tree_int (off1, TREE_STRING_LENGTH (str1) - 1) > 0
> +		  || compare_tree_int (len1, TREE_STRING_LENGTH (str1)
> +					     - tree_low_cst (off1, 1)) > 0
> +		  || TREE_CODE (TREE_TYPE (str1)) != ARRAY_TYPE
> +		  || TYPE_MODE (TREE_TYPE (TREE_TYPE (str1)))
> +		     != TYPE_MODE (char_type_node))
> +		break;
> +	    }
> +	  else if (gimple_assign_single_p (stmt1))
> +	    {
> +    	      ptr1 = gimple_assign_lhs (stmt1);
> +	      src1 = gimple_assign_rhs1 (stmt1);
> +	      if (TREE_CODE (ptr1) != MEM_REF
> +		  || TYPE_MODE (TREE_TYPE (ptr1)) != TYPE_MODE (char_type_node)
> +		  || !host_integerp (src1, 0))
> +		break;
> +	      ptr1 = build_fold_addr_expr (ptr1);
> +	      callee1 = NULL_TREE;
> +	      len1 = size_one_node;
> +	      lhs1 = NULL_TREE;
> +	      off1 = size_zero_node;
> +	      str1 = NULL_TREE;
> +	    }
> +	  else
> +	    break;
> +
> +	  diff = constant_pointer_difference (ptr1, ptr2);
> +	  if (diff == NULL && lhs1 != NULL)
> +	    {
> +	      diff = constant_pointer_difference (lhs1, ptr2);
> +	      if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY
> +		  && diff != NULL)
> +		diff = size_binop (PLUS_EXPR, diff,
> +				   fold_convert (sizetype, len1));
> +	    }
> +	  if (diff == NULL
> +	      || !host_integerp (diff, 1)
> +	      || tree_int_cst_lt (len1, diff))
> +	    break;
> +
> +	  src_len = tree_low_cst (diff, 1);
> +	  src_len += tree_low_cst (len2, 1);
> +	  if (src_len < (unsigned HOST_WIDE_INT) tree_low_cst (len1, 1))
> +	    src_len = tree_low_cst (len1, 1);
> +	  if (src_len > 1024)
> +	    break;
> +
> +	  do_fail = false;
> +	  if (lhs1 != NULL
> +	      && DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY)
> +	    {
> +	      if (TREE_CODE (lhs1) != SSA_NAME)
> +		break;
> +	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs1)
> +		{
> +		  if (USE_STMT (use_p) != stmt2)
> +		    {
> +		      do_fail = true;
> +		      break;
> +		    }
> +		}

That looks like

  if (TREE_CODE (lhs1) != SSA_NAME
      || !single_imm_use (lhs1, &use_p, &use_stmt)
      || use_stmt != stmt2)
    break;

and your loop gets confused by DEBUG_STMTs at least (and possibly
generates different code -g vs. -g0)

> +	      if (do_fail)
> +		break;
> +	    }
> +	  vdef = gimple_vdef (stmt1);
> +	  if (vdef != NULL)
> +	    {
> +	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, vdef)
> +		{
> +		  if (USE_STMT (use_p) != stmt2)
> +		    {
> +		      do_fail = true;
> +		      break;
> +		    }
> +		}

Likewise.  Why do we care about other uses of the VDEF?

> +	      if (do_fail)
> +		break;
> +	    }
> +
> +	  ptr1_align = get_pointer_alignment (ptr1, BIGGEST_ALIGNMENT);
> +	  src_buf = XALLOCAVEC (char, src_len + 1);
> +	  if (callee1)
> +	    memcpy (src_buf,
> +		    TREE_STRING_POINTER (str1) + tree_low_cst (off1, 1),
> +		    tree_low_cst (len1, 1));
> +	  else
> +	    src_buf[0] = tree_low_cst (src1, 0);
> +	  memset (src_buf + tree_low_cst (diff, 1),
> +		  tree_low_cst (val2, 1), tree_low_cst (len2, 1));
> +	  src_buf[src_len] = '\0';
> +	  if (strlen (src_buf) != src_len)
> +	    break;
> +	  rtl_profile_for_bb (gimple_bb (stmt2));
> +	  if (!can_store_by_pieces (src_len,
> +				    builtin_strncpy_read_str,
> +				    src_buf, ptr1_align, false))
> +	    break;
> +
> +	  new_str_cst = build_string_literal (src_len, src_buf);
> +	  if (callee1)
> +	    {
> +	      if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY)
> +		gimple_call_set_lhs (stmt1, NULL_TREE);
> +	      gimple_call_set_arg (stmt1, 1, new_str_cst);
> +	      gimple_call_set_arg (stmt1, 2,
> +				   build_int_cst (TREE_TYPE (len1), src_len));
> +	      update_stmt (stmt1);
> +	      unlink_stmt_vdef (stmt2);
> +	      gsi_remove (gsi_p, true);
> +	      release_defs (stmt2);
> +	      return true;
> +	    }
> +	  else
> +	    {
> +	      gimple_stmt_iterator gsi = gsi_for_stmt (stmt1);
> +
> +	      gimple_call_set_fndecl (stmt2, built_in_decls [BUILT_IN_MEMCPY]);
> +	      gimple_call_set_arg (stmt2, 0, ptr1);
> +	      gimple_call_set_arg (stmt2, 1, new_str_cst);
> +	      gimple_call_set_arg (stmt2, 2,
> +				   build_int_cst (TREE_TYPE (len2), src_len));
> +	      unlink_stmt_vdef (stmt1);
> +	      gsi_remove (&gsi, true);
> +	      release_defs (stmt1);
> +	      update_stmt (stmt2);
> +	      return false;
> +	    }
> +	}
> +      break;
> +    default:
> +      break;
> +    }
> +  return false;
> +}

The code above could use some comments, like those you gave about
can_store_by_pieces in the mail.

Otherwise the patch looks good, but I'm really curious as of why
constant_pointer_difference is as complex as it is.

Thanks,
Richard.

>  /* Run bitwise and assignments throug the folder.  If the first argument is an
>     ssa name that is itself a result of a typecast of an ADDR_EXPR to an
>     integer, feed the ADDR_EXPR to the folder rather than the ssa name.
> @@ -1784,6 +2075,14 @@ tree_ssa_forward_propagate_single_use_va
>  					      WARN_STRICT_OVERFLOW_CONDITIONAL);
>  	      gsi_next (&gsi);
>  	    }
> +	  else if (is_gimple_call (stmt))
> +	    {
> +	      tree callee = gimple_call_fndecl (stmt);
> +	      if (callee == NULL_TREE
> +		  || DECL_BUILT_IN_CLASS (callee) != BUILT_IN_NORMAL
> +		  || !simplify_builtin_call (&gsi, callee))
> +		gsi_next (&gsi);
> +	    }
>  	  else
>  	    gsi_next (&gsi);
>  	}
> --- gcc/Makefile.in.jj	2010-10-11 07:51:09.000000000 +0200
> +++ gcc/Makefile.in	2010-10-11 09:15:25.000000000 +0200
> @@ -2404,7 +2404,7 @@ tree-ssa-dse.o : tree-ssa-dse.c $(CONFIG
>  tree-ssa-forwprop.o : tree-ssa-forwprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>     $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
> -   langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h
> +   langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h $(EXPR_H)
>  tree-ssa-phiprop.o : tree-ssa-phiprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>     $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
> --- gcc/testsuite/gcc.c-torture/execute/pr45636.c.jj	2010-10-11 09:15:25.000000000 +0200
> +++ gcc/testsuite/gcc.c-torture/execute/pr45636.c	2010-10-11 09:15:25.000000000 +0200
> @@ -0,0 +1,75 @@
> +/* PR fortran/45636 */
> +
> +typedef __SIZE_TYPE__ size_t;
> +void *memcpy (void *__restrict__, const void *__restrict__, size_t);
> +void *mempcpy (void *__restrict__, const void *__restrict__, size_t);
> +void *memset (void *, int, size_t);
> +int memcmp (const void *, const void *, size_t);
> +extern void abort (void);
> +
> +struct A { int i; char c[32]; } a[2];
> +
> +__attribute__((noinline, noclone)) int
> +f1 (char *p, int q, int z)
> +{
> +  memcpy (p, "abcd", 4);
> +  if (q)
> +    z = z + 123;
> +  else
> +    z *= 114;
> +  memset (p + 4, ' ', 2);
> +  return z;
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (void)
> +{
> +  char *p = mempcpy (&a[0].c[13], "123456", 4);
> +  memset (p, '7', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (struct A *p)
> +{
> +  p++;
> +  char *q = &p->c[10];
> +  memcpy (q + 4, "__1234567" + 2, 7);
> +  memset (&p->c[21], '9', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (void)
> +{
> +  memcpy (&a[0].c[10], "0123456789", 10);
> +  memset (&a[0].c[13], ' ', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +check (const char *p, const char *str, size_t size)
> +{
> +  const char *q;
> +  for (q = (const char *) &a; q < p; q++)
> +    if (*q)
> +      abort ();
> +  if (memcmp (p, str, size) != 0)
> +    abort ();
> +  for (q = p + size; q < (const char *) (&a[0] + 2); q++)
> +    if (*q)
> +      abort ();
> +  memset (&a, '\0', sizeof a);
> +}
> +
> +int
> +main (void)
> +{
> +  if (f1 (&a[0].c[7], 1, 2) != 125)
> +    abort ();
> +  check (&a[0].c[7], "abcd  ", 6);
> +  f2 ();
> +  check (&a[0].c[13], "1234777", 7);
> +  f3 (&a[0]);
> +  check (&a[1].c[14], "1234567999", 10);
> +  f4 ();
> +  check (&a[0].c[10], "012   6789", 10);
> +  return 0;
> +}
> --- gcc/testsuite/gfortran.dg/pr45636.f90.jj	2010-10-11 13:19:41.000000000 +0200
> +++ gcc/testsuite/gfortran.dg/pr45636.f90	2010-10-11 16:49:19.000000000 +0200
> @@ -0,0 +1,14 @@
> +! PR fortran/45636
> +! { dg-do compile }
> +! { dg-options "-O2 -fdump-tree-forwprop2" }
> +! PR 45636 - make sure no memset is needed for a short right-hand side.
> +program main
> +  character(len=2), parameter :: x='a '
> +  character(len=1), parameter :: y='b'
> +  character(len=4) :: a, b
> +  a = x
> +  b = y
> +  call sub(a, b)
> +end program main
> +! { dg-final { scan-tree-dump-times "memset" 0 "forwprop2" } }
> +! { dg-final { cleanup-tree-dump "forwprop2" } }
> 
> 	Jakub
> 
> 

-- 
Richard Guenther <rguenther@suse.de>
Novell / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 - GF: Markus Rex



More information about the Gcc-patches mailing list