This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH] Optimize memcpy + memset (PR fortran/45636)
On Tue, 12 Oct 2010, Jakub Jelinek wrote:
> On Tue, Oct 12, 2010 at 11:18:35AM +0200, Richard Guenther wrote:
> > Please use get_addr_base_and_unit_offset here, that simplifies
> > the following checks
>
> Done.
>
> > > + off = size_binop (PLUS_EXPR, off,
> > > + fold_convert (sizetype,
> > > + TREE_OPERAND (q, 1)));
> >
> > Use double_int_to_tree (sizetype, mem_ref_offset (q)) which properly
> > sign-extends the pointer offset.
>
> Likewise.
>
> > > + if (code == POINTER_PLUS_EXPR)
> > > + {
> > > + if (TREE_CODE (gimple_assign_rhs2 (stmt)) != INTEGER_CST)
> > > + break;
> > > + off = size_binop (PLUS_EXPR, off, gimple_assign_rhs2 (stmt));
> > > + p = gimple_assign_rhs1 (stmt);
> > > + }
> > > + else if (code == ADDR_EXPR || code == NOP_EXPR)
> > > + p = gimple_assign_rhs1 (stmt);
> > > + else
> > > + break;
> > > + }
> > > + while (1);
> > > + cnt[i] = j;
> > > + }
> >
> > I wonder why the forward-propagation of POINTER_PLUS_EXPR and ADDR_EXPR
> > doesn't make the above redundant?
>
> On the pr45636.c testcase, in f1, constant_pointer_difference is called
> with p1 p.1_4and p2 D.2771_9:
>
> p.1_4 = (void * restrict) p_3(D);
> memcpy (p.1_4, &"abcd"[0], 4);
Ah, restrict ... indeed.
> if (q_5(D) != 0)
> goto <bb 3>;
> else
> goto <bb 4>;
> ...
> # z_1 = PHI <z_7(3), z_8(4)>
> D.2771_9 = p_3(D) + 4; # POINTER_PLUS_EXPR here
> memset (D.2771_9, 32, 2);
Hm, ok. that's not invariant and thus not propagated into the
memset arg.
> On f2, it is called with p1 &a[0].c[13] and p2 p_3 (and later with p1 == p2 p_3):
> p_3 = mempcpy (&a[0].c[13], &"123456"[0], 4);
> memset (p_3, 55, 3);
>
> On f3 p1 is D.2762_6 and p2 is D.2763_7:
> D.2762_6 = &MEM[(void *)p_1(D) + 54B];
> memcpy (D.2762_6, &"__1234567"[2], 7);
> D.2763_7 = &MEM[(struct A *)p_1(D) + 36B].c[21];
> memset (D.2763_7, 57, 3);
Likewise.
> and on f4 p1 is &a[0].c[10] and p2 is &a[0].c[13]:
> memcpy (&a[0].c[10], &"0123456789"[0], 10);
> memset (&a[0].c[13], 32, 3);
>
> So, what exactly do you think is redundant?
I thought about
D.12_1 = p_3(D) + 4;
D.23_2 = D.12_1 + 4;
memcpy (D.23_2, ...);
which it looks like you handle via iterating a few times. It indeed
looks like a single stmt lookup is always necessary (just because
most of the offsetting isn't a proper gimple-val). None of the
above case would require the iteration though.
Basically I'd think that the inner loop over j could go away?
> > That looks like
> >
> > if (TREE_CODE (lhs1) != SSA_NAME
> > || !single_imm_use (lhs1, &use_p, &use_stmt)
> > || use_stmt != stmt2)
> > break;
> >
> > and your loop gets confused by DEBUG_STMTs at least (and possibly
> > generates different code -g vs. -g0)
>
> Fixed.
>
> > Likewise. Why do we care about other uses of the VDEF?
>
> Likewise. We do care about other uses of VDEF. We could use the oracle
> to find out if the other uses matter or not I guess, but it would
> make the code larger.
> Consider
> memcpy (&p[0], &"abcd"[0], 4);
> v = p[4];
> memset (&p[4], ' ', 2);
> or
> MEM[&p] = 'a';
> w = MEM[&p];
> memset (&p[1], ' ', 3);
>
> If we change memcpy above to memcpy (&p[0], &"abcd "[0], 6); and remove
> memset, then v might have incorrect value (' ' instead of whatever it had
> before), if the MEM[&p] = 'a' is removed and memset is changed
> into memset (&p[0], "a ", 4);, then w will have wrong value.
Ah indeed.
> > The code above could use some comments, like those you gave about
> > can_store_by_pieces in the mail.
>
> Some comments added.
Much better. Does it work without the iteration?
Thanks,
Richard.
> 2010-10-12 Jakub Jelinek <jakub@redhat.com>
>
> PR fortran/45636
> * tree-ssa-forwprop.c: Include expr.h.
> (constant_pointer_difference, simplify_builtin_call): New functions.
> (tree_ssa_forward_propagate_single_use_vars): Call
> simplify_builtin_call on builtin calls.
>
> * gcc.c-torture/execute/pr45636.c: New test.
> * gfortran.dg/pr45636.f90: New test.
>
> --- gcc/tree-ssa-forwprop.c.jj 2010-10-11 20:37:16.511307231 +0200
> +++ gcc/tree-ssa-forwprop.c 2010-10-12 13:36:51.499542149 +0200
> @@ -33,6 +33,7 @@ along with GCC; see the file COPYING3.
> #include "langhooks.h"
> #include "flags.h"
> #include "gimple.h"
> +#include "expr.h"
>
> /* This pass propagates the RHS of assignment statements into use
> sites of the LHS of the assignment. It's basically a specialized
> @@ -1317,6 +1318,292 @@ simplify_gimple_switch (gimple stmt)
> }
> }
>
> +/* For pointers p2 and p1 return p2 - p1 if the
> + difference is known and constant, otherwise return NULL. */
> +
> +static tree
> +constant_pointer_difference (tree p1, tree p2)
> +{
> + int i, j;
> + tree exps[2][5];
> + tree offs[2][5];
> + int cnt[2];
> +
> + for (i = 0; i < 2; i++)
> + {
> + tree p = i ? p1 : p2;
> + tree off = size_zero_node;
> + gimple stmt;
> + enum tree_code code;
> +
> + j = 0;
> + do
> + {
> + if (!POINTER_TYPE_P (TREE_TYPE (p)))
> + break;
> + if (TREE_CODE (p) == ADDR_EXPR)
> + {
> + tree q = TREE_OPERAND (p, 0);
> + HOST_WIDE_INT offset;
> + tree base = get_addr_base_and_unit_offset (q, &offset);
> + if (base)
> + {
> + q = base;
> + if (offset)
> + off = size_binop (PLUS_EXPR, off, size_int (offset));
> + }
> + if (TREE_CODE (q) == MEM_REF
> + && TREE_CODE (TREE_OPERAND (q, 0)) == SSA_NAME)
> + {
> + p = TREE_OPERAND (q, 0);
> + off = size_binop (PLUS_EXPR, off,
> + double_int_to_tree (sizetype,
> + mem_ref_offset (q)));
> + }
> + else
> + {
> + exps[i][j] = q;
> + offs[i][j++] = off;
> + break;
> + }
> + }
> + if (TREE_CODE (p) != SSA_NAME)
> + break;
> + exps[i][j] = p;
> + offs[i][j++] = off;
> + if (j == 5)
> + break;
> + stmt = SSA_NAME_DEF_STMT (p);
> + if (!is_gimple_assign (stmt) || gimple_assign_lhs (stmt) != p)
> + break;
> + code = gimple_assign_rhs_code (stmt);
> + if (code == POINTER_PLUS_EXPR)
> + {
> + if (TREE_CODE (gimple_assign_rhs2 (stmt)) != INTEGER_CST)
> + break;
> + off = size_binop (PLUS_EXPR, off, gimple_assign_rhs2 (stmt));
> + p = gimple_assign_rhs1 (stmt);
> + }
> + else if (code == ADDR_EXPR || code == NOP_EXPR)
> + p = gimple_assign_rhs1 (stmt);
> + else
> + break;
> + }
> + while (1);
> + cnt[i] = j;
> + }
> +
> + for (i = 0; i < cnt[0]; i++)
> + for (j = 0; j < cnt[1]; j++)
> + if (exps[0][i] == exps[1][j])
> + return size_binop (MINUS_EXPR, offs[0][i], offs[1][j]);
> +
> + return NULL_TREE;
> +}
> +
> +/* *GSI_P is a GIMPLE_CALL to a builtin function.
> + Optimize
> + memcpy (p, "abcd", 4);
> + memset (p + 4, ' ', 3);
> + into
> + memcpy (p, "abcd ", 7);
> + call if the latter can be stored by pieces during expansion. */
> +
> +static bool
> +simplify_builtin_call (gimple_stmt_iterator *gsi_p, tree callee2)
> +{
> + gimple stmt1, stmt2 = gsi_stmt (*gsi_p);
> + tree vuse = gimple_vuse (stmt2);
> + if (vuse == NULL)
> + return false;
> + stmt1 = SSA_NAME_DEF_STMT (vuse);
> +
> + switch (DECL_FUNCTION_CODE (callee2))
> + {
> + case BUILT_IN_MEMSET:
> + if (gimple_call_num_args (stmt2) != 3
> + || gimple_call_lhs (stmt2)
> + || CHAR_BIT != 8
> + || BITS_PER_UNIT != 8)
> + break;
> + else
> + {
> + tree callee1;
> + tree ptr1, src1, str1, off1, len1, lhs1;
> + tree ptr2 = gimple_call_arg (stmt2, 0);
> + tree val2 = gimple_call_arg (stmt2, 1);
> + tree len2 = gimple_call_arg (stmt2, 2);
> + tree diff, vdef, new_str_cst;
> + gimple use_stmt;
> + unsigned int ptr1_align;
> + unsigned HOST_WIDE_INT src_len;
> + char *src_buf;
> + use_operand_p use_p;
> +
> + if (!host_integerp (val2, 0)
> + || !host_integerp (len2, 1))
> + break;
> + if (is_gimple_call (stmt1))
> + {
> + /* If first stmt is a call, it needs to be memcpy
> + or mempcpy, with string literal as second argument and
> + constant length. */
> + callee1 = gimple_call_fndecl (stmt1);
> + if (callee1 == NULL_TREE
> + || DECL_BUILT_IN_CLASS (callee1) != BUILT_IN_NORMAL
> + || gimple_call_num_args (stmt1) != 3)
> + break;
> + if (DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMCPY
> + && DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMPCPY)
> + break;
> + ptr1 = gimple_call_arg (stmt1, 0);
> + src1 = gimple_call_arg (stmt1, 1);
> + len1 = gimple_call_arg (stmt1, 2);
> + lhs1 = gimple_call_lhs (stmt1);
> + if (!host_integerp (len1, 1))
> + break;
> + str1 = string_constant (src1, &off1);
> + if (str1 == NULL_TREE)
> + break;
> + if (!host_integerp (off1, 1)
> + || compare_tree_int (off1, TREE_STRING_LENGTH (str1) - 1) > 0
> + || compare_tree_int (len1, TREE_STRING_LENGTH (str1)
> + - tree_low_cst (off1, 1)) > 0
> + || TREE_CODE (TREE_TYPE (str1)) != ARRAY_TYPE
> + || TYPE_MODE (TREE_TYPE (TREE_TYPE (str1)))
> + != TYPE_MODE (char_type_node))
> + break;
> + }
> + else if (gimple_assign_single_p (stmt1))
> + {
> + /* Otherwise look for length 1 memcpy optimized into
> + assignment. */
> + ptr1 = gimple_assign_lhs (stmt1);
> + src1 = gimple_assign_rhs1 (stmt1);
> + if (TREE_CODE (ptr1) != MEM_REF
> + || TYPE_MODE (TREE_TYPE (ptr1)) != TYPE_MODE (char_type_node)
> + || !host_integerp (src1, 0))
> + break;
> + ptr1 = build_fold_addr_expr (ptr1);
> + callee1 = NULL_TREE;
> + len1 = size_one_node;
> + lhs1 = NULL_TREE;
> + off1 = size_zero_node;
> + str1 = NULL_TREE;
> + }
> + else
> + break;
> +
> + diff = constant_pointer_difference (ptr1, ptr2);
> + if (diff == NULL && lhs1 != NULL)
> + {
> + diff = constant_pointer_difference (lhs1, ptr2);
> + if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY
> + && diff != NULL)
> + diff = size_binop (PLUS_EXPR, diff,
> + fold_convert (sizetype, len1));
> + }
> + /* If the difference between the second and first destination pointer
> + is not constant, or is bigger than memcpy length, bail out. */
> + if (diff == NULL
> + || !host_integerp (diff, 1)
> + || tree_int_cst_lt (len1, diff))
> + break;
> +
> + /* Use maximum of difference plus memset length and memcpy length
> + as the new memcpy length, if it is too big, bail out. */
> + src_len = tree_low_cst (diff, 1);
> + src_len += tree_low_cst (len2, 1);
> + if (src_len < (unsigned HOST_WIDE_INT) tree_low_cst (len1, 1))
> + src_len = tree_low_cst (len1, 1);
> + if (src_len > 1024)
> + break;
> +
> + /* If mempcpy value is used elsewhere, bail out, as mempcpy
> + with bigger length will return different result. */
> + if (lhs1 != NULL_TREE
> + && DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY
> + && (TREE_CODE (lhs1) != SSA_NAME
> + || !single_imm_use (lhs1, &use_p, &use_stmt)
> + || use_stmt != stmt2))
> + break;
> +
> + /* If anything reads memory in between memcpy and memset
> + call, the modified memcpy call might change it. */
> + vdef = gimple_vdef (stmt1);
> + if (vdef != NULL
> + && (!single_imm_use (vdef, &use_p, &use_stmt)
> + || use_stmt != stmt2))
> + break;
> +
> + ptr1_align = get_pointer_alignment (ptr1, BIGGEST_ALIGNMENT);
> + /* Construct the new source string literal. */
> + src_buf = XALLOCAVEC (char, src_len + 1);
> + if (callee1)
> + memcpy (src_buf,
> + TREE_STRING_POINTER (str1) + tree_low_cst (off1, 1),
> + tree_low_cst (len1, 1));
> + else
> + src_buf[0] = tree_low_cst (src1, 0);
> + memset (src_buf + tree_low_cst (diff, 1),
> + tree_low_cst (val2, 1), tree_low_cst (len2, 1));
> + src_buf[src_len] = '\0';
> + /* Neither builtin_strncpy_read_str nor builtin_memcpy_read_str
> + handle embedded '\0's. */
> + if (strlen (src_buf) != src_len)
> + break;
> + rtl_profile_for_bb (gimple_bb (stmt2));
> + /* If the new memcpy wouldn't be emitted by storing the literal
> + by pieces, this optimization might enlarge .rodata too much,
> + as commonly used string literals couldn't be shared any
> + longer. */
> + if (!can_store_by_pieces (src_len,
> + builtin_strncpy_read_str,
> + src_buf, ptr1_align, false))
> + break;
> +
> + new_str_cst = build_string_literal (src_len, src_buf);
> + if (callee1)
> + {
> + /* If STMT1 is a mem{,p}cpy call, adjust it and remove
> + memset call. */
> + if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY)
> + gimple_call_set_lhs (stmt1, NULL_TREE);
> + gimple_call_set_arg (stmt1, 1, new_str_cst);
> + gimple_call_set_arg (stmt1, 2,
> + build_int_cst (TREE_TYPE (len1), src_len));
> + update_stmt (stmt1);
> + unlink_stmt_vdef (stmt2);
> + gsi_remove (gsi_p, true);
> + release_defs (stmt2);
> + return true;
> + }
> + else
> + {
> + /* Otherwise, if STMT1 is length 1 memcpy optimized into
> + assignment, remove STMT1 and change memset call into
> + memcpy call. */
> + gimple_stmt_iterator gsi = gsi_for_stmt (stmt1);
> +
> + gimple_call_set_fndecl (stmt2, built_in_decls [BUILT_IN_MEMCPY]);
> + gimple_call_set_arg (stmt2, 0, ptr1);
> + gimple_call_set_arg (stmt2, 1, new_str_cst);
> + gimple_call_set_arg (stmt2, 2,
> + build_int_cst (TREE_TYPE (len2), src_len));
> + unlink_stmt_vdef (stmt1);
> + gsi_remove (&gsi, true);
> + release_defs (stmt1);
> + update_stmt (stmt2);
> + return false;
> + }
> + }
> + break;
> + default:
> + break;
> + }
> + return false;
> +}
> +
> /* Run bitwise and assignments throug the folder. If the first argument is an
> ssa name that is itself a result of a typecast of an ADDR_EXPR to an
> integer, feed the ADDR_EXPR to the folder rather than the ssa name.
> @@ -1784,6 +2071,14 @@ tree_ssa_forward_propagate_single_use_va
> WARN_STRICT_OVERFLOW_CONDITIONAL);
> gsi_next (&gsi);
> }
> + else if (is_gimple_call (stmt))
> + {
> + tree callee = gimple_call_fndecl (stmt);
> + if (callee == NULL_TREE
> + || DECL_BUILT_IN_CLASS (callee) != BUILT_IN_NORMAL
> + || !simplify_builtin_call (&gsi, callee))
> + gsi_next (&gsi);
> + }
> else
> gsi_next (&gsi);
> }
> --- gcc/Makefile.in.jj 2010-10-12 08:23:50.948247432 +0200
> +++ gcc/Makefile.in 2010-10-12 13:08:25.096497334 +0200
> @@ -2407,7 +2407,7 @@ tree-ssa-dse.o : tree-ssa-dse.c $(CONFIG
> tree-ssa-forwprop.o : tree-ssa-forwprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
> - langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h
> + langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h $(EXPR_H)
> tree-ssa-phiprop.o : tree-ssa-phiprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
> --- gcc/testsuite/gcc.c-torture/execute/pr45636.c.jj 2010-10-12 13:08:25.098497519 +0200
> +++ gcc/testsuite/gcc.c-torture/execute/pr45636.c 2010-10-12 13:08:25.098497519 +0200
> @@ -0,0 +1,75 @@
> +/* PR fortran/45636 */
> +
> +typedef __SIZE_TYPE__ size_t;
> +void *memcpy (void *__restrict__, const void *__restrict__, size_t);
> +void *mempcpy (void *__restrict__, const void *__restrict__, size_t);
> +void *memset (void *, int, size_t);
> +int memcmp (const void *, const void *, size_t);
> +extern void abort (void);
> +
> +struct A { int i; char c[32]; } a[2];
> +
> +__attribute__((noinline, noclone)) int
> +f1 (char *p, int q, int z)
> +{
> + memcpy (p, "abcd", 4);
> + if (q)
> + z = z + 123;
> + else
> + z *= 114;
> + memset (p + 4, ' ', 2);
> + return z;
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (void)
> +{
> + char *p = mempcpy (&a[0].c[13], "123456", 4);
> + memset (p, '7', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (struct A *p)
> +{
> + p++;
> + char *q = &p->c[10];
> + memcpy (q + 4, "__1234567" + 2, 7);
> + memset (&p->c[21], '9', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (void)
> +{
> + memcpy (&a[0].c[10], "0123456789", 10);
> + memset (&a[0].c[13], ' ', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +check (const char *p, const char *str, size_t size)
> +{
> + const char *q;
> + for (q = (const char *) &a; q < p; q++)
> + if (*q)
> + abort ();
> + if (memcmp (p, str, size) != 0)
> + abort ();
> + for (q = p + size; q < (const char *) (&a[0] + 2); q++)
> + if (*q)
> + abort ();
> + memset (&a, '\0', sizeof a);
> +}
> +
> +int
> +main (void)
> +{
> + if (f1 (&a[0].c[7], 1, 2) != 125)
> + abort ();
> + check (&a[0].c[7], "abcd ", 6);
> + f2 ();
> + check (&a[0].c[13], "1234777", 7);
> + f3 (&a[0]);
> + check (&a[1].c[14], "1234567999", 10);
> + f4 ();
> + check (&a[0].c[10], "012 6789", 10);
> + return 0;
> +}
> --- gcc/testsuite/gfortran.dg/pr45636.f90.jj 2010-10-12 13:08:25.099615239 +0200
> +++ gcc/testsuite/gfortran.dg/pr45636.f90 2010-10-12 13:08:25.099615239 +0200
> @@ -0,0 +1,14 @@
> +! PR fortran/45636
> +! { dg-do compile }
> +! { dg-options "-O2 -fdump-tree-forwprop2" }
> +! PR 45636 - make sure no memset is needed for a short right-hand side.
> +program main
> + character(len=2), parameter :: x='a '
> + character(len=1), parameter :: y='b'
> + character(len=4) :: a, b
> + a = x
> + b = y
> + call sub(a, b)
> +end program main
> +! { dg-final { scan-tree-dump-times "memset" 0 "forwprop2" } }
> +! { dg-final { cleanup-tree-dump "forwprop2" } }
>
>
> Jakub
>
>
--
Richard Guenther <rguenther@suse.de>
Novell / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 - GF: Markus Rex