This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Optimize memcpy + memset (PR fortran/45636)


On Tue, Oct 12, 2010 at 11:18:35AM +0200, Richard Guenther wrote:
> Please use get_addr_base_and_unit_offset here, that simplifies
> the following checks

Done.

> > +		  off = size_binop (PLUS_EXPR, off,
> > +				    fold_convert (sizetype,
> > +						  TREE_OPERAND (q, 1)));
> 
> Use double_int_to_tree (sizetype, mem_ref_offset (q)) which properly
> sign-extends the pointer offset.

Likewise.

> > +	  if (code == POINTER_PLUS_EXPR)
> > +	    {
> > +	      if (TREE_CODE (gimple_assign_rhs2 (stmt)) != INTEGER_CST)
> > +		break;
> > +	      off = size_binop (PLUS_EXPR, off, gimple_assign_rhs2 (stmt));
> > +	      p = gimple_assign_rhs1 (stmt);
> > +	    }
> > +	  else if (code == ADDR_EXPR || code == NOP_EXPR)
> > +	    p = gimple_assign_rhs1 (stmt);
> > +	  else
> > +	    break;
> > +	}
> > +      while (1);
> > +      cnt[i] = j;
> > +    }
> 
> I wonder why the forward-propagation of POINTER_PLUS_EXPR and ADDR_EXPR
> doesn't make the above redundant?

On the pr45636.c testcase, in f1, constant_pointer_difference is called
with p1 p.1_4and p2 D.2771_9: 

p.1_4 = (void * restrict) p_3(D);
memcpy (p.1_4, &"abcd"[0], 4);
if (q_5(D) != 0)
  goto <bb 3>;
else
  goto <bb 4>;
...
# z_1 = PHI <z_7(3), z_8(4)>
D.2771_9 = p_3(D) + 4;		# POINTER_PLUS_EXPR here
memset (D.2771_9, 32, 2);

On f2, it is called with p1 &a[0].c[13] and p2 p_3 (and later with p1 == p2 p_3):
p_3 = mempcpy (&a[0].c[13], &"123456"[0], 4);
memset (p_3, 55, 3);

On f3 p1 is D.2762_6 and p2 is D.2763_7:
D.2762_6 = &MEM[(void *)p_1(D) + 54B];
memcpy (D.2762_6, &"__1234567"[2], 7);
D.2763_7 = &MEM[(struct A *)p_1(D) + 36B].c[21];
memset (D.2763_7, 57, 3);

and on f4 p1 is &a[0].c[10] and p2 is &a[0].c[13]:
memcpy (&a[0].c[10], &"0123456789"[0], 10);
memset (&a[0].c[13], 32, 3);

So, what exactly do you think is redundant?

> That looks like
> 
>   if (TREE_CODE (lhs1) != SSA_NAME
>       || !single_imm_use (lhs1, &use_p, &use_stmt)
>       || use_stmt != stmt2)
>     break;
> 
> and your loop gets confused by DEBUG_STMTs at least (and possibly
> generates different code -g vs. -g0)

Fixed.

> Likewise.  Why do we care about other uses of the VDEF?

Likewise.  We do care about other uses of VDEF.  We could use the oracle
to find out if the other uses matter or not I guess, but it would
make the code larger.
Consider
memcpy (&p[0], &"abcd"[0], 4);
v = p[4];
memset (&p[4], ' ', 2);
or
MEM[&p] = 'a';
w = MEM[&p];
memset (&p[1], ' ', 3);

If we change memcpy above to memcpy (&p[0], &"abcd  "[0], 6); and remove
memset, then v might have incorrect value (' ' instead of whatever it had
before), if the MEM[&p] = 'a' is removed and memset is changed
into memset (&p[0], "a   ", 4);, then w will have wrong value.

> The code above could use some comments, like those you gave about
> can_store_by_pieces in the mail.

Some comments added.

2010-10-12  Jakub Jelinek  <jakub@redhat.com>

	PR fortran/45636
	* tree-ssa-forwprop.c: Include expr.h.
	(constant_pointer_difference, simplify_builtin_call): New functions.
	(tree_ssa_forward_propagate_single_use_vars): Call
	simplify_builtin_call on builtin calls.

	* gcc.c-torture/execute/pr45636.c: New test.
	* gfortran.dg/pr45636.f90: New test.

--- gcc/tree-ssa-forwprop.c.jj	2010-10-11 20:37:16.511307231 +0200
+++ gcc/tree-ssa-forwprop.c	2010-10-12 13:36:51.499542149 +0200
@@ -33,6 +33,7 @@ along with GCC; see the file COPYING3.  
 #include "langhooks.h"
 #include "flags.h"
 #include "gimple.h"
+#include "expr.h"
 
 /* This pass propagates the RHS of assignment statements into use
    sites of the LHS of the assignment.  It's basically a specialized
@@ -1317,6 +1318,292 @@ simplify_gimple_switch (gimple stmt)
     }
 }
 
+/* For pointers p2 and p1 return p2 - p1 if the
+   difference is known and constant, otherwise return NULL.  */
+
+static tree
+constant_pointer_difference (tree p1, tree p2)
+{
+  int i, j;
+  tree exps[2][5];
+  tree offs[2][5];
+  int cnt[2];
+
+  for (i = 0; i < 2; i++)
+    {
+      tree p = i ? p1 : p2;
+      tree off = size_zero_node;
+      gimple stmt;
+      enum tree_code code;
+
+      j = 0;
+      do
+	{
+	  if (!POINTER_TYPE_P (TREE_TYPE (p)))
+	    break;
+	  if (TREE_CODE (p) == ADDR_EXPR)
+	    {
+	      tree q = TREE_OPERAND (p, 0);
+	      HOST_WIDE_INT offset;
+	      tree base = get_addr_base_and_unit_offset (q, &offset);
+	      if (base)
+		{
+		  q = base;
+		  if (offset)
+		    off = size_binop (PLUS_EXPR, off, size_int (offset));
+		}
+	      if (TREE_CODE (q) == MEM_REF
+		  && TREE_CODE (TREE_OPERAND (q, 0)) == SSA_NAME)
+		{
+		  p = TREE_OPERAND (q, 0);
+		  off = size_binop (PLUS_EXPR, off,
+				    double_int_to_tree (sizetype,
+							mem_ref_offset (q)));
+		}
+	      else
+		{
+		  exps[i][j] = q;
+		  offs[i][j++] = off;
+		  break;
+		}
+	    }
+	  if (TREE_CODE (p) != SSA_NAME)
+	    break;
+	  exps[i][j] = p;
+	  offs[i][j++] = off;
+	  if (j == 5)
+	    break;
+	  stmt = SSA_NAME_DEF_STMT (p);
+	  if (!is_gimple_assign (stmt) || gimple_assign_lhs (stmt) != p)
+	    break;
+	  code = gimple_assign_rhs_code (stmt);
+	  if (code == POINTER_PLUS_EXPR)
+	    {
+	      if (TREE_CODE (gimple_assign_rhs2 (stmt)) != INTEGER_CST)
+		break;
+	      off = size_binop (PLUS_EXPR, off, gimple_assign_rhs2 (stmt));
+	      p = gimple_assign_rhs1 (stmt);
+	    }
+	  else if (code == ADDR_EXPR || code == NOP_EXPR)
+	    p = gimple_assign_rhs1 (stmt);
+	  else
+	    break;
+	}
+      while (1);
+      cnt[i] = j;
+    }
+
+  for (i = 0; i < cnt[0]; i++)
+    for (j = 0; j < cnt[1]; j++)
+      if (exps[0][i] == exps[1][j])
+	return size_binop (MINUS_EXPR, offs[0][i], offs[1][j]);
+
+  return NULL_TREE;
+}
+
+/* *GSI_P is a GIMPLE_CALL to a builtin function.
+   Optimize
+   memcpy (p, "abcd", 4);
+   memset (p + 4, ' ', 3);
+   into
+   memcpy (p, "abcd   ", 7);
+   call if the latter can be stored by pieces during expansion.  */
+
+static bool
+simplify_builtin_call (gimple_stmt_iterator *gsi_p, tree callee2)
+{
+  gimple stmt1, stmt2 = gsi_stmt (*gsi_p);
+  tree vuse = gimple_vuse (stmt2);
+  if (vuse == NULL)
+    return false;
+  stmt1 = SSA_NAME_DEF_STMT (vuse);
+
+  switch (DECL_FUNCTION_CODE (callee2))
+    {
+    case BUILT_IN_MEMSET:
+      if (gimple_call_num_args (stmt2) != 3
+	  || gimple_call_lhs (stmt2)
+	  || CHAR_BIT != 8
+	  || BITS_PER_UNIT != 8)
+	break;
+      else
+	{
+	  tree callee1;
+	  tree ptr1, src1, str1, off1, len1, lhs1;
+	  tree ptr2 = gimple_call_arg (stmt2, 0);
+	  tree val2 = gimple_call_arg (stmt2, 1);
+	  tree len2 = gimple_call_arg (stmt2, 2);
+	  tree diff, vdef, new_str_cst;
+	  gimple use_stmt;
+	  unsigned int ptr1_align;
+	  unsigned HOST_WIDE_INT src_len;
+	  char *src_buf;
+	  use_operand_p use_p;
+
+	  if (!host_integerp (val2, 0)
+	      || !host_integerp (len2, 1))
+	    break;
+	  if (is_gimple_call (stmt1))
+	    {
+	      /* If first stmt is a call, it needs to be memcpy
+		 or mempcpy, with string literal as second argument and
+		 constant length.  */
+	      callee1 = gimple_call_fndecl (stmt1);
+	      if (callee1 == NULL_TREE
+		  || DECL_BUILT_IN_CLASS (callee1) != BUILT_IN_NORMAL
+		  || gimple_call_num_args (stmt1) != 3)
+		break;
+	      if (DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMCPY
+		  && DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMPCPY)
+		break;
+	      ptr1 = gimple_call_arg (stmt1, 0);
+	      src1 = gimple_call_arg (stmt1, 1);
+	      len1 = gimple_call_arg (stmt1, 2);
+	      lhs1 = gimple_call_lhs (stmt1);
+	      if (!host_integerp (len1, 1))
+		break;
+	      str1 = string_constant (src1, &off1);
+	      if (str1 == NULL_TREE)
+		break;
+	      if (!host_integerp (off1, 1)
+		  || compare_tree_int (off1, TREE_STRING_LENGTH (str1) - 1) > 0
+		  || compare_tree_int (len1, TREE_STRING_LENGTH (str1)
+					     - tree_low_cst (off1, 1)) > 0
+		  || TREE_CODE (TREE_TYPE (str1)) != ARRAY_TYPE
+		  || TYPE_MODE (TREE_TYPE (TREE_TYPE (str1)))
+		     != TYPE_MODE (char_type_node))
+		break;
+	    }
+	  else if (gimple_assign_single_p (stmt1))
+	    {
+	      /* Otherwise look for length 1 memcpy optimized into
+		 assignment.  */
+    	      ptr1 = gimple_assign_lhs (stmt1);
+	      src1 = gimple_assign_rhs1 (stmt1);
+	      if (TREE_CODE (ptr1) != MEM_REF
+		  || TYPE_MODE (TREE_TYPE (ptr1)) != TYPE_MODE (char_type_node)
+		  || !host_integerp (src1, 0))
+		break;
+	      ptr1 = build_fold_addr_expr (ptr1);
+	      callee1 = NULL_TREE;
+	      len1 = size_one_node;
+	      lhs1 = NULL_TREE;
+	      off1 = size_zero_node;
+	      str1 = NULL_TREE;
+	    }
+	  else
+	    break;
+
+	  diff = constant_pointer_difference (ptr1, ptr2);
+	  if (diff == NULL && lhs1 != NULL)
+	    {
+	      diff = constant_pointer_difference (lhs1, ptr2);
+	      if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY
+		  && diff != NULL)
+		diff = size_binop (PLUS_EXPR, diff,
+				   fold_convert (sizetype, len1));
+	    }
+	  /* If the difference between the second and first destination pointer
+	     is not constant, or is bigger than memcpy length, bail out.  */
+	  if (diff == NULL
+	      || !host_integerp (diff, 1)
+	      || tree_int_cst_lt (len1, diff))
+	    break;
+
+	  /* Use maximum of difference plus memset length and memcpy length
+	     as the new memcpy length, if it is too big, bail out.  */
+	  src_len = tree_low_cst (diff, 1);
+	  src_len += tree_low_cst (len2, 1);
+	  if (src_len < (unsigned HOST_WIDE_INT) tree_low_cst (len1, 1))
+	    src_len = tree_low_cst (len1, 1);
+	  if (src_len > 1024)
+	    break;
+
+	  /* If mempcpy value is used elsewhere, bail out, as mempcpy
+	     with bigger length will return different result.  */
+	  if (lhs1 != NULL_TREE
+	      && DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY
+	      && (TREE_CODE (lhs1) != SSA_NAME
+		  || !single_imm_use (lhs1, &use_p, &use_stmt)
+		  || use_stmt != stmt2))
+	    break;
+
+	  /* If anything reads memory in between memcpy and memset
+	     call, the modified memcpy call might change it.  */
+	  vdef = gimple_vdef (stmt1);
+	  if (vdef != NULL
+	      && (!single_imm_use (vdef, &use_p, &use_stmt)
+		  || use_stmt != stmt2))
+	    break;
+
+	  ptr1_align = get_pointer_alignment (ptr1, BIGGEST_ALIGNMENT);
+	  /* Construct the new source string literal.  */
+	  src_buf = XALLOCAVEC (char, src_len + 1);
+	  if (callee1)
+	    memcpy (src_buf,
+		    TREE_STRING_POINTER (str1) + tree_low_cst (off1, 1),
+		    tree_low_cst (len1, 1));
+	  else
+	    src_buf[0] = tree_low_cst (src1, 0);
+	  memset (src_buf + tree_low_cst (diff, 1),
+		  tree_low_cst (val2, 1), tree_low_cst (len2, 1));
+	  src_buf[src_len] = '\0';
+	  /* Neither builtin_strncpy_read_str nor builtin_memcpy_read_str
+	     handle embedded '\0's.  */
+	  if (strlen (src_buf) != src_len)
+	    break;
+	  rtl_profile_for_bb (gimple_bb (stmt2));
+	  /* If the new memcpy wouldn't be emitted by storing the literal
+	     by pieces, this optimization might enlarge .rodata too much,
+	     as commonly used string literals couldn't be shared any
+	     longer.  */
+	  if (!can_store_by_pieces (src_len,
+				    builtin_strncpy_read_str,
+				    src_buf, ptr1_align, false))
+	    break;
+
+	  new_str_cst = build_string_literal (src_len, src_buf);
+	  if (callee1)
+	    {
+	      /* If STMT1 is a mem{,p}cpy call, adjust it and remove
+		 memset call.  */
+	      if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY)
+		gimple_call_set_lhs (stmt1, NULL_TREE);
+	      gimple_call_set_arg (stmt1, 1, new_str_cst);
+	      gimple_call_set_arg (stmt1, 2,
+				   build_int_cst (TREE_TYPE (len1), src_len));
+	      update_stmt (stmt1);
+	      unlink_stmt_vdef (stmt2);
+	      gsi_remove (gsi_p, true);
+	      release_defs (stmt2);
+	      return true;
+	    }
+	  else
+	    {
+	      /* Otherwise, if STMT1 is length 1 memcpy optimized into
+		 assignment, remove STMT1 and change memset call into
+		 memcpy call.  */
+	      gimple_stmt_iterator gsi = gsi_for_stmt (stmt1);
+
+	      gimple_call_set_fndecl (stmt2, built_in_decls [BUILT_IN_MEMCPY]);
+	      gimple_call_set_arg (stmt2, 0, ptr1);
+	      gimple_call_set_arg (stmt2, 1, new_str_cst);
+	      gimple_call_set_arg (stmt2, 2,
+				   build_int_cst (TREE_TYPE (len2), src_len));
+	      unlink_stmt_vdef (stmt1);
+	      gsi_remove (&gsi, true);
+	      release_defs (stmt1);
+	      update_stmt (stmt2);
+	      return false;
+	    }
+	}
+      break;
+    default:
+      break;
+    }
+  return false;
+}
+
 /* Run bitwise and assignments throug the folder.  If the first argument is an
    ssa name that is itself a result of a typecast of an ADDR_EXPR to an
    integer, feed the ADDR_EXPR to the folder rather than the ssa name.
@@ -1784,6 +2071,14 @@ tree_ssa_forward_propagate_single_use_va
 					      WARN_STRICT_OVERFLOW_CONDITIONAL);
 	      gsi_next (&gsi);
 	    }
+	  else if (is_gimple_call (stmt))
+	    {
+	      tree callee = gimple_call_fndecl (stmt);
+	      if (callee == NULL_TREE
+		  || DECL_BUILT_IN_CLASS (callee) != BUILT_IN_NORMAL
+		  || !simplify_builtin_call (&gsi, callee))
+		gsi_next (&gsi);
+	    }
 	  else
 	    gsi_next (&gsi);
 	}
--- gcc/Makefile.in.jj	2010-10-12 08:23:50.948247432 +0200
+++ gcc/Makefile.in	2010-10-12 13:08:25.096497334 +0200
@@ -2407,7 +2407,7 @@ tree-ssa-dse.o : tree-ssa-dse.c $(CONFIG
 tree-ssa-forwprop.o : tree-ssa-forwprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
-   langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h
+   langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h $(EXPR_H)
 tree-ssa-phiprop.o : tree-ssa-phiprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
--- gcc/testsuite/gcc.c-torture/execute/pr45636.c.jj	2010-10-12 13:08:25.098497519 +0200
+++ gcc/testsuite/gcc.c-torture/execute/pr45636.c	2010-10-12 13:08:25.098497519 +0200
@@ -0,0 +1,75 @@
+/* PR fortran/45636 */
+
+typedef __SIZE_TYPE__ size_t;
+void *memcpy (void *__restrict__, const void *__restrict__, size_t);
+void *mempcpy (void *__restrict__, const void *__restrict__, size_t);
+void *memset (void *, int, size_t);
+int memcmp (const void *, const void *, size_t);
+extern void abort (void);
+
+struct A { int i; char c[32]; } a[2];
+
+__attribute__((noinline, noclone)) int
+f1 (char *p, int q, int z)
+{
+  memcpy (p, "abcd", 4);
+  if (q)
+    z = z + 123;
+  else
+    z *= 114;
+  memset (p + 4, ' ', 2);
+  return z;
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  char *p = mempcpy (&a[0].c[13], "123456", 4);
+  memset (p, '7', 3);
+}
+
+__attribute__((noinline, noclone)) void
+f3 (struct A *p)
+{
+  p++;
+  char *q = &p->c[10];
+  memcpy (q + 4, "__1234567" + 2, 7);
+  memset (&p->c[21], '9', 3);
+}
+
+__attribute__((noinline, noclone)) void
+f4 (void)
+{
+  memcpy (&a[0].c[10], "0123456789", 10);
+  memset (&a[0].c[13], ' ', 3);
+}
+
+__attribute__((noinline, noclone)) void
+check (const char *p, const char *str, size_t size)
+{
+  const char *q;
+  for (q = (const char *) &a; q < p; q++)
+    if (*q)
+      abort ();
+  if (memcmp (p, str, size) != 0)
+    abort ();
+  for (q = p + size; q < (const char *) (&a[0] + 2); q++)
+    if (*q)
+      abort ();
+  memset (&a, '\0', sizeof a);
+}
+
+int
+main (void)
+{
+  if (f1 (&a[0].c[7], 1, 2) != 125)
+    abort ();
+  check (&a[0].c[7], "abcd  ", 6);
+  f2 ();
+  check (&a[0].c[13], "1234777", 7);
+  f3 (&a[0]);
+  check (&a[1].c[14], "1234567999", 10);
+  f4 ();
+  check (&a[0].c[10], "012   6789", 10);
+  return 0;
+}
--- gcc/testsuite/gfortran.dg/pr45636.f90.jj	2010-10-12 13:08:25.099615239 +0200
+++ gcc/testsuite/gfortran.dg/pr45636.f90	2010-10-12 13:08:25.099615239 +0200
@@ -0,0 +1,14 @@
+! PR fortran/45636
+! { dg-do compile }
+! { dg-options "-O2 -fdump-tree-forwprop2" }
+! PR 45636 - make sure no memset is needed for a short right-hand side.
+program main
+  character(len=2), parameter :: x='a '
+  character(len=1), parameter :: y='b'
+  character(len=4) :: a, b
+  a = x
+  b = y
+  call sub(a, b)
+end program main
+! { dg-final { scan-tree-dump-times "memset" 0 "forwprop2" } }
+! { dg-final { cleanup-tree-dump "forwprop2" } }


	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]