This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Optimize memcpy + memset (PR fortran/45636)


On Tue, 12 Oct 2010, Jakub Jelinek wrote:

> On Tue, Oct 12, 2010 at 11:18:35AM +0200, Richard Guenther wrote:
> > Please use get_addr_base_and_unit_offset here, that simplifies
> > the following checks
> 
> Done.
> 
> > > +		  off = size_binop (PLUS_EXPR, off,
> > > +				    fold_convert (sizetype,
> > > +						  TREE_OPERAND (q, 1)));
> > 
> > Use double_int_to_tree (sizetype, mem_ref_offset (q)) which properly
> > sign-extends the pointer offset.
> 
> Likewise.
> 
> > > +	  if (code == POINTER_PLUS_EXPR)
> > > +	    {
> > > +	      if (TREE_CODE (gimple_assign_rhs2 (stmt)) != INTEGER_CST)
> > > +		break;
> > > +	      off = size_binop (PLUS_EXPR, off, gimple_assign_rhs2 (stmt));
> > > +	      p = gimple_assign_rhs1 (stmt);
> > > +	    }
> > > +	  else if (code == ADDR_EXPR || code == NOP_EXPR)
> > > +	    p = gimple_assign_rhs1 (stmt);
> > > +	  else
> > > +	    break;
> > > +	}
> > > +      while (1);
> > > +      cnt[i] = j;
> > > +    }
> > 
> > I wonder why the forward-propagation of POINTER_PLUS_EXPR and ADDR_EXPR
> > doesn't make the above redundant?
> 
> On the pr45636.c testcase, in f1, constant_pointer_difference is called
> with p1 p.1_4and p2 D.2771_9: 
> 
> p.1_4 = (void * restrict) p_3(D);
> memcpy (p.1_4, &"abcd"[0], 4);

Ah, restrict ... indeed.

> if (q_5(D) != 0)
>   goto <bb 3>;
> else
>   goto <bb 4>;
> ...
> # z_1 = PHI <z_7(3), z_8(4)>
> D.2771_9 = p_3(D) + 4;		# POINTER_PLUS_EXPR here
> memset (D.2771_9, 32, 2);

Hm, ok.  that's not invariant and thus not propagated into the
memset arg.

> On f2, it is called with p1 &a[0].c[13] and p2 p_3 (and later with p1 == p2 p_3):
> p_3 = mempcpy (&a[0].c[13], &"123456"[0], 4);
> memset (p_3, 55, 3);
> 
> On f3 p1 is D.2762_6 and p2 is D.2763_7:
> D.2762_6 = &MEM[(void *)p_1(D) + 54B];
> memcpy (D.2762_6, &"__1234567"[2], 7);
> D.2763_7 = &MEM[(struct A *)p_1(D) + 36B].c[21];
> memset (D.2763_7, 57, 3);

Likewise.

> and on f4 p1 is &a[0].c[10] and p2 is &a[0].c[13]:
> memcpy (&a[0].c[10], &"0123456789"[0], 10);
> memset (&a[0].c[13], 32, 3);
> 
> So, what exactly do you think is redundant?

I thought about

  D.12_1 = p_3(D) + 4;
  D.23_2 = D.12_1 + 4;
  memcpy (D.23_2, ...);

which it looks like you handle via iterating a few times.  It indeed
looks like a single stmt lookup is always necessary (just because
most of the offsetting isn't a proper gimple-val).  None of the
above case would require the iteration though.

Basically I'd think that the inner loop over j could go away?

> > That looks like
> > 
> >   if (TREE_CODE (lhs1) != SSA_NAME
> >       || !single_imm_use (lhs1, &use_p, &use_stmt)
> >       || use_stmt != stmt2)
> >     break;
> > 
> > and your loop gets confused by DEBUG_STMTs at least (and possibly
> > generates different code -g vs. -g0)
> 
> Fixed.
> 
> > Likewise.  Why do we care about other uses of the VDEF?
> 
> Likewise.  We do care about other uses of VDEF.  We could use the oracle
> to find out if the other uses matter or not I guess, but it would
> make the code larger.
> Consider
> memcpy (&p[0], &"abcd"[0], 4);
> v = p[4];
> memset (&p[4], ' ', 2);
> or
> MEM[&p] = 'a';
> w = MEM[&p];
> memset (&p[1], ' ', 3);
> 
> If we change memcpy above to memcpy (&p[0], &"abcd  "[0], 6); and remove
> memset, then v might have incorrect value (' ' instead of whatever it had
> before), if the MEM[&p] = 'a' is removed and memset is changed
> into memset (&p[0], "a   ", 4);, then w will have wrong value.

Ah indeed.

> > The code above could use some comments, like those you gave about
> > can_store_by_pieces in the mail.
> 
> Some comments added.

Much better.  Does it work without the iteration?

Thanks,
Richard.

> 2010-10-12  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR fortran/45636
> 	* tree-ssa-forwprop.c: Include expr.h.
> 	(constant_pointer_difference, simplify_builtin_call): New functions.
> 	(tree_ssa_forward_propagate_single_use_vars): Call
> 	simplify_builtin_call on builtin calls.
> 
> 	* gcc.c-torture/execute/pr45636.c: New test.
> 	* gfortran.dg/pr45636.f90: New test.
> 
> --- gcc/tree-ssa-forwprop.c.jj	2010-10-11 20:37:16.511307231 +0200
> +++ gcc/tree-ssa-forwprop.c	2010-10-12 13:36:51.499542149 +0200
> @@ -33,6 +33,7 @@ along with GCC; see the file COPYING3.  
>  #include "langhooks.h"
>  #include "flags.h"
>  #include "gimple.h"
> +#include "expr.h"
>  
>  /* This pass propagates the RHS of assignment statements into use
>     sites of the LHS of the assignment.  It's basically a specialized
> @@ -1317,6 +1318,292 @@ simplify_gimple_switch (gimple stmt)
>      }
>  }
>  
> +/* For pointers p2 and p1 return p2 - p1 if the
> +   difference is known and constant, otherwise return NULL.  */
> +
> +static tree
> +constant_pointer_difference (tree p1, tree p2)
> +{
> +  int i, j;
> +  tree exps[2][5];
> +  tree offs[2][5];
> +  int cnt[2];
> +
> +  for (i = 0; i < 2; i++)
> +    {
> +      tree p = i ? p1 : p2;
> +      tree off = size_zero_node;
> +      gimple stmt;
> +      enum tree_code code;
> +
> +      j = 0;
> +      do
> +	{
> +	  if (!POINTER_TYPE_P (TREE_TYPE (p)))
> +	    break;
> +	  if (TREE_CODE (p) == ADDR_EXPR)
> +	    {
> +	      tree q = TREE_OPERAND (p, 0);
> +	      HOST_WIDE_INT offset;
> +	      tree base = get_addr_base_and_unit_offset (q, &offset);
> +	      if (base)
> +		{
> +		  q = base;
> +		  if (offset)
> +		    off = size_binop (PLUS_EXPR, off, size_int (offset));
> +		}
> +	      if (TREE_CODE (q) == MEM_REF
> +		  && TREE_CODE (TREE_OPERAND (q, 0)) == SSA_NAME)
> +		{
> +		  p = TREE_OPERAND (q, 0);
> +		  off = size_binop (PLUS_EXPR, off,
> +				    double_int_to_tree (sizetype,
> +							mem_ref_offset (q)));
> +		}
> +	      else
> +		{
> +		  exps[i][j] = q;
> +		  offs[i][j++] = off;
> +		  break;
> +		}
> +	    }
> +	  if (TREE_CODE (p) != SSA_NAME)
> +	    break;
> +	  exps[i][j] = p;
> +	  offs[i][j++] = off;
> +	  if (j == 5)
> +	    break;
> +	  stmt = SSA_NAME_DEF_STMT (p);
> +	  if (!is_gimple_assign (stmt) || gimple_assign_lhs (stmt) != p)
> +	    break;
> +	  code = gimple_assign_rhs_code (stmt);
> +	  if (code == POINTER_PLUS_EXPR)
> +	    {
> +	      if (TREE_CODE (gimple_assign_rhs2 (stmt)) != INTEGER_CST)
> +		break;
> +	      off = size_binop (PLUS_EXPR, off, gimple_assign_rhs2 (stmt));
> +	      p = gimple_assign_rhs1 (stmt);
> +	    }
> +	  else if (code == ADDR_EXPR || code == NOP_EXPR)
> +	    p = gimple_assign_rhs1 (stmt);
> +	  else
> +	    break;
> +	}
> +      while (1);
> +      cnt[i] = j;
> +    }
> +
> +  for (i = 0; i < cnt[0]; i++)
> +    for (j = 0; j < cnt[1]; j++)
> +      if (exps[0][i] == exps[1][j])
> +	return size_binop (MINUS_EXPR, offs[0][i], offs[1][j]);
> +
> +  return NULL_TREE;
> +}
> +
> +/* *GSI_P is a GIMPLE_CALL to a builtin function.
> +   Optimize
> +   memcpy (p, "abcd", 4);
> +   memset (p + 4, ' ', 3);
> +   into
> +   memcpy (p, "abcd   ", 7);
> +   call if the latter can be stored by pieces during expansion.  */
> +
> +static bool
> +simplify_builtin_call (gimple_stmt_iterator *gsi_p, tree callee2)
> +{
> +  gimple stmt1, stmt2 = gsi_stmt (*gsi_p);
> +  tree vuse = gimple_vuse (stmt2);
> +  if (vuse == NULL)
> +    return false;
> +  stmt1 = SSA_NAME_DEF_STMT (vuse);
> +
> +  switch (DECL_FUNCTION_CODE (callee2))
> +    {
> +    case BUILT_IN_MEMSET:
> +      if (gimple_call_num_args (stmt2) != 3
> +	  || gimple_call_lhs (stmt2)
> +	  || CHAR_BIT != 8
> +	  || BITS_PER_UNIT != 8)
> +	break;
> +      else
> +	{
> +	  tree callee1;
> +	  tree ptr1, src1, str1, off1, len1, lhs1;
> +	  tree ptr2 = gimple_call_arg (stmt2, 0);
> +	  tree val2 = gimple_call_arg (stmt2, 1);
> +	  tree len2 = gimple_call_arg (stmt2, 2);
> +	  tree diff, vdef, new_str_cst;
> +	  gimple use_stmt;
> +	  unsigned int ptr1_align;
> +	  unsigned HOST_WIDE_INT src_len;
> +	  char *src_buf;
> +	  use_operand_p use_p;
> +
> +	  if (!host_integerp (val2, 0)
> +	      || !host_integerp (len2, 1))
> +	    break;
> +	  if (is_gimple_call (stmt1))
> +	    {
> +	      /* If first stmt is a call, it needs to be memcpy
> +		 or mempcpy, with string literal as second argument and
> +		 constant length.  */
> +	      callee1 = gimple_call_fndecl (stmt1);
> +	      if (callee1 == NULL_TREE
> +		  || DECL_BUILT_IN_CLASS (callee1) != BUILT_IN_NORMAL
> +		  || gimple_call_num_args (stmt1) != 3)
> +		break;
> +	      if (DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMCPY
> +		  && DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMPCPY)
> +		break;
> +	      ptr1 = gimple_call_arg (stmt1, 0);
> +	      src1 = gimple_call_arg (stmt1, 1);
> +	      len1 = gimple_call_arg (stmt1, 2);
> +	      lhs1 = gimple_call_lhs (stmt1);
> +	      if (!host_integerp (len1, 1))
> +		break;
> +	      str1 = string_constant (src1, &off1);
> +	      if (str1 == NULL_TREE)
> +		break;
> +	      if (!host_integerp (off1, 1)
> +		  || compare_tree_int (off1, TREE_STRING_LENGTH (str1) - 1) > 0
> +		  || compare_tree_int (len1, TREE_STRING_LENGTH (str1)
> +					     - tree_low_cst (off1, 1)) > 0
> +		  || TREE_CODE (TREE_TYPE (str1)) != ARRAY_TYPE
> +		  || TYPE_MODE (TREE_TYPE (TREE_TYPE (str1)))
> +		     != TYPE_MODE (char_type_node))
> +		break;
> +	    }
> +	  else if (gimple_assign_single_p (stmt1))
> +	    {
> +	      /* Otherwise look for length 1 memcpy optimized into
> +		 assignment.  */
> +    	      ptr1 = gimple_assign_lhs (stmt1);
> +	      src1 = gimple_assign_rhs1 (stmt1);
> +	      if (TREE_CODE (ptr1) != MEM_REF
> +		  || TYPE_MODE (TREE_TYPE (ptr1)) != TYPE_MODE (char_type_node)
> +		  || !host_integerp (src1, 0))
> +		break;
> +	      ptr1 = build_fold_addr_expr (ptr1);
> +	      callee1 = NULL_TREE;
> +	      len1 = size_one_node;
> +	      lhs1 = NULL_TREE;
> +	      off1 = size_zero_node;
> +	      str1 = NULL_TREE;
> +	    }
> +	  else
> +	    break;
> +
> +	  diff = constant_pointer_difference (ptr1, ptr2);
> +	  if (diff == NULL && lhs1 != NULL)
> +	    {
> +	      diff = constant_pointer_difference (lhs1, ptr2);
> +	      if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY
> +		  && diff != NULL)
> +		diff = size_binop (PLUS_EXPR, diff,
> +				   fold_convert (sizetype, len1));
> +	    }
> +	  /* If the difference between the second and first destination pointer
> +	     is not constant, or is bigger than memcpy length, bail out.  */
> +	  if (diff == NULL
> +	      || !host_integerp (diff, 1)
> +	      || tree_int_cst_lt (len1, diff))
> +	    break;
> +
> +	  /* Use maximum of difference plus memset length and memcpy length
> +	     as the new memcpy length, if it is too big, bail out.  */
> +	  src_len = tree_low_cst (diff, 1);
> +	  src_len += tree_low_cst (len2, 1);
> +	  if (src_len < (unsigned HOST_WIDE_INT) tree_low_cst (len1, 1))
> +	    src_len = tree_low_cst (len1, 1);
> +	  if (src_len > 1024)
> +	    break;
> +
> +	  /* If mempcpy value is used elsewhere, bail out, as mempcpy
> +	     with bigger length will return different result.  */
> +	  if (lhs1 != NULL_TREE
> +	      && DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY
> +	      && (TREE_CODE (lhs1) != SSA_NAME
> +		  || !single_imm_use (lhs1, &use_p, &use_stmt)
> +		  || use_stmt != stmt2))
> +	    break;
> +
> +	  /* If anything reads memory in between memcpy and memset
> +	     call, the modified memcpy call might change it.  */
> +	  vdef = gimple_vdef (stmt1);
> +	  if (vdef != NULL
> +	      && (!single_imm_use (vdef, &use_p, &use_stmt)
> +		  || use_stmt != stmt2))
> +	    break;
> +
> +	  ptr1_align = get_pointer_alignment (ptr1, BIGGEST_ALIGNMENT);
> +	  /* Construct the new source string literal.  */
> +	  src_buf = XALLOCAVEC (char, src_len + 1);
> +	  if (callee1)
> +	    memcpy (src_buf,
> +		    TREE_STRING_POINTER (str1) + tree_low_cst (off1, 1),
> +		    tree_low_cst (len1, 1));
> +	  else
> +	    src_buf[0] = tree_low_cst (src1, 0);
> +	  memset (src_buf + tree_low_cst (diff, 1),
> +		  tree_low_cst (val2, 1), tree_low_cst (len2, 1));
> +	  src_buf[src_len] = '\0';
> +	  /* Neither builtin_strncpy_read_str nor builtin_memcpy_read_str
> +	     handle embedded '\0's.  */
> +	  if (strlen (src_buf) != src_len)
> +	    break;
> +	  rtl_profile_for_bb (gimple_bb (stmt2));
> +	  /* If the new memcpy wouldn't be emitted by storing the literal
> +	     by pieces, this optimization might enlarge .rodata too much,
> +	     as commonly used string literals couldn't be shared any
> +	     longer.  */
> +	  if (!can_store_by_pieces (src_len,
> +				    builtin_strncpy_read_str,
> +				    src_buf, ptr1_align, false))
> +	    break;
> +
> +	  new_str_cst = build_string_literal (src_len, src_buf);
> +	  if (callee1)
> +	    {
> +	      /* If STMT1 is a mem{,p}cpy call, adjust it and remove
> +		 memset call.  */
> +	      if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY)
> +		gimple_call_set_lhs (stmt1, NULL_TREE);
> +	      gimple_call_set_arg (stmt1, 1, new_str_cst);
> +	      gimple_call_set_arg (stmt1, 2,
> +				   build_int_cst (TREE_TYPE (len1), src_len));
> +	      update_stmt (stmt1);
> +	      unlink_stmt_vdef (stmt2);
> +	      gsi_remove (gsi_p, true);
> +	      release_defs (stmt2);
> +	      return true;
> +	    }
> +	  else
> +	    {
> +	      /* Otherwise, if STMT1 is length 1 memcpy optimized into
> +		 assignment, remove STMT1 and change memset call into
> +		 memcpy call.  */
> +	      gimple_stmt_iterator gsi = gsi_for_stmt (stmt1);
> +
> +	      gimple_call_set_fndecl (stmt2, built_in_decls [BUILT_IN_MEMCPY]);
> +	      gimple_call_set_arg (stmt2, 0, ptr1);
> +	      gimple_call_set_arg (stmt2, 1, new_str_cst);
> +	      gimple_call_set_arg (stmt2, 2,
> +				   build_int_cst (TREE_TYPE (len2), src_len));
> +	      unlink_stmt_vdef (stmt1);
> +	      gsi_remove (&gsi, true);
> +	      release_defs (stmt1);
> +	      update_stmt (stmt2);
> +	      return false;
> +	    }
> +	}
> +      break;
> +    default:
> +      break;
> +    }
> +  return false;
> +}
> +
>  /* Run bitwise and assignments throug the folder.  If the first argument is an
>     ssa name that is itself a result of a typecast of an ADDR_EXPR to an
>     integer, feed the ADDR_EXPR to the folder rather than the ssa name.
> @@ -1784,6 +2071,14 @@ tree_ssa_forward_propagate_single_use_va
>  					      WARN_STRICT_OVERFLOW_CONDITIONAL);
>  	      gsi_next (&gsi);
>  	    }
> +	  else if (is_gimple_call (stmt))
> +	    {
> +	      tree callee = gimple_call_fndecl (stmt);
> +	      if (callee == NULL_TREE
> +		  || DECL_BUILT_IN_CLASS (callee) != BUILT_IN_NORMAL
> +		  || !simplify_builtin_call (&gsi, callee))
> +		gsi_next (&gsi);
> +	    }
>  	  else
>  	    gsi_next (&gsi);
>  	}
> --- gcc/Makefile.in.jj	2010-10-12 08:23:50.948247432 +0200
> +++ gcc/Makefile.in	2010-10-12 13:08:25.096497334 +0200
> @@ -2407,7 +2407,7 @@ tree-ssa-dse.o : tree-ssa-dse.c $(CONFIG
>  tree-ssa-forwprop.o : tree-ssa-forwprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>     $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
> -   langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h
> +   langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h $(EXPR_H)
>  tree-ssa-phiprop.o : tree-ssa-phiprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>     $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
> --- gcc/testsuite/gcc.c-torture/execute/pr45636.c.jj	2010-10-12 13:08:25.098497519 +0200
> +++ gcc/testsuite/gcc.c-torture/execute/pr45636.c	2010-10-12 13:08:25.098497519 +0200
> @@ -0,0 +1,75 @@
> +/* PR fortran/45636 */
> +
> +typedef __SIZE_TYPE__ size_t;
> +void *memcpy (void *__restrict__, const void *__restrict__, size_t);
> +void *mempcpy (void *__restrict__, const void *__restrict__, size_t);
> +void *memset (void *, int, size_t);
> +int memcmp (const void *, const void *, size_t);
> +extern void abort (void);
> +
> +struct A { int i; char c[32]; } a[2];
> +
> +__attribute__((noinline, noclone)) int
> +f1 (char *p, int q, int z)
> +{
> +  memcpy (p, "abcd", 4);
> +  if (q)
> +    z = z + 123;
> +  else
> +    z *= 114;
> +  memset (p + 4, ' ', 2);
> +  return z;
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (void)
> +{
> +  char *p = mempcpy (&a[0].c[13], "123456", 4);
> +  memset (p, '7', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (struct A *p)
> +{
> +  p++;
> +  char *q = &p->c[10];
> +  memcpy (q + 4, "__1234567" + 2, 7);
> +  memset (&p->c[21], '9', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (void)
> +{
> +  memcpy (&a[0].c[10], "0123456789", 10);
> +  memset (&a[0].c[13], ' ', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +check (const char *p, const char *str, size_t size)
> +{
> +  const char *q;
> +  for (q = (const char *) &a; q < p; q++)
> +    if (*q)
> +      abort ();
> +  if (memcmp (p, str, size) != 0)
> +    abort ();
> +  for (q = p + size; q < (const char *) (&a[0] + 2); q++)
> +    if (*q)
> +      abort ();
> +  memset (&a, '\0', sizeof a);
> +}
> +
> +int
> +main (void)
> +{
> +  if (f1 (&a[0].c[7], 1, 2) != 125)
> +    abort ();
> +  check (&a[0].c[7], "abcd  ", 6);
> +  f2 ();
> +  check (&a[0].c[13], "1234777", 7);
> +  f3 (&a[0]);
> +  check (&a[1].c[14], "1234567999", 10);
> +  f4 ();
> +  check (&a[0].c[10], "012   6789", 10);
> +  return 0;
> +}
> --- gcc/testsuite/gfortran.dg/pr45636.f90.jj	2010-10-12 13:08:25.099615239 +0200
> +++ gcc/testsuite/gfortran.dg/pr45636.f90	2010-10-12 13:08:25.099615239 +0200
> @@ -0,0 +1,14 @@
> +! PR fortran/45636
> +! { dg-do compile }
> +! { dg-options "-O2 -fdump-tree-forwprop2" }
> +! PR 45636 - make sure no memset is needed for a short right-hand side.
> +program main
> +  character(len=2), parameter :: x='a '
> +  character(len=1), parameter :: y='b'
> +  character(len=4) :: a, b
> +  a = x
> +  b = y
> +  call sub(a, b)
> +end program main
> +! { dg-final { scan-tree-dump-times "memset" 0 "forwprop2" } }
> +! { dg-final { cleanup-tree-dump "forwprop2" } }
> 
> 
> 	Jakub
> 
> 

-- 
Richard Guenther <rguenther@suse.de>
Novell / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 - GF: Markus Rex


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]