Bug 86017 - multiple consecutive calls to bzero/memset not merged
Summary: multiple consecutive calls to bzero/memset not merged
Status: RESOLVED DUPLICATE of bug 49872
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 8.0
Importance: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2018-05-31 17:16 UTC by Martin Sebor
Modified: 2021-08-21 23:57 UTC (History)
3 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2018-06-01 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Martin Sebor 2018-05-31 17:16:35 UTC
Even though the two functions defined in the test case below are equivalent, GCC emits considerably less efficient code for the one with multiple calls to memset than for the one with just a single call.  Clang emits the same optimally efficient code for both.

$ cat b.c && gcc -O2 -S -Wall -Wextra -fdump-tree-optimized=/dev/stdout -o/dev/stdout b.c
void f (void*);

void g (void)
{
  char a[8];
  __builtin_memset (a, 0, 8);

  f (a);
}

void h (void)
{
  char a[8];
  __builtin_memset (a, 0, 1);
  __builtin_memset (a + 1, 0, 1);
  __builtin_memset (a + 2, 0, 1);
  __builtin_memset (a + 3, 0, 1);
  __builtin_memset (a + 4, 0, 1);
  __builtin_memset (a + 5, 0, 1);
  __builtin_memset (a + 6, 0, 1);
  __builtin_memset (a + 7, 0, 1);

  f (a);
}

	.file	"b.c"
	.text

;; Function g (g, funcdef_no=0, decl_uid=1958, cgraph_uid=0, symbol_order=0)

g ()
{
  char a[8];

  <bb 2> [local count: 1073741825]:
  __builtin_memset (&a, 0, 8);
  f (&a);
  a ={v} {CLOBBER};
  return;

}


	.p2align 4,,15
	.globl	g
	.type	g, @function
g:
.LFB0:
	.cfi_startproc
	subq	$24, %rsp
	.cfi_def_cfa_offset 32
	movq	$0, 8(%rsp)
	leaq	8(%rsp), %rdi
	call	f
	addq	$24, %rsp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE0:
	.size	g, .-g

;; Function h (h, funcdef_no=1, decl_uid=1962, cgraph_uid=1, symbol_order=1)

h ()
{
  char a[8];

  <bb 2> [local count: 1073741825]:
  MEM[(void *)&a] = 0;
  __builtin_memset (&MEM[(void *)&a + 1B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 2B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 3B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 4B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 5B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 6B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 7B], 0, 1);
  f (&a);
  a ={v} {CLOBBER};
  return;

}


	.p2align 4,,15
	.globl	h
	.type	h, @function
h:
.LFB1:
	.cfi_startproc
	subq	$24, %rsp
	.cfi_def_cfa_offset 32
	leaq	8(%rsp), %rdi
	movb	$0, 8(%rsp)
	movb	$0, 9(%rsp)
	movb	$0, 10(%rsp)
	movb	$0, 11(%rsp)
	movb	$0, 12(%rsp)
	movb	$0, 13(%rsp)
	movb	$0, 14(%rsp)
	movb	$0, 15(%rsp)
	call	f
	addq	$24, %rsp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE1:
	.size	h, .-h
	.ident	"GCC: (GNU) 8.1.1 20180522"
	.section	.note.GNU-stack,"",@progbits
Comment 1 Martin Sebor 2018-05-31 17:18:27 UTC
See also bug 86010 for a related missed optimization (that one is a regression while this bug does not appear to be).
Comment 2 Richard Biener 2018-06-01 08:15:34 UTC
Confirmed.  A related issue is that we inline the first but not the adjacent memsets.  Fixing that makes us apply store-merging.  Testing a patch.
Comment 3 Richard Biener 2018-06-01 10:50:28 UTC
Author: rguenth
Date: Fri Jun  1 10:49:54 2018
New Revision: 261061

URL: https://gcc.gnu.org/viewcvs?rev=261061&root=gcc&view=rev
Log:
2018-06-01  Richard Biener  <rguenther@suse.de>

	PR middle-end/86017
	* gimple-fold.c (var_decl_component_p): Also allow offsetted
	vars wrapped in MEM_REFs.

	* gcc.dg/tree-ssa/pr86017.c: New testcase.

Added:
    trunk/gcc/testsuite/gcc.dg/tree-ssa/pr86017.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/gimple-fold.c
    trunk/gcc/testsuite/ChangeLog
Comment 4 Richard Biener 2018-06-01 10:53:04 UTC
The testcase is now fixed.  We're still not merging adjacent memset/bzero calls.
Modified testcase:

void f (void*);
void h (void)
{
  char a[8];
  __builtin_memset (a, 0, 1);
  __builtin_memset (a + 1, 0, 3);
  __builtin_memset (a + 4, 0, 4);

  f (a);
}

results in

;; Function h (h, funcdef_no=1, decl_uid=1962, cgraph_uid=1, symbol_order=1)

h ()
{
  char a[8];

  <bb 2> [local count: 1073741825]:
  MEM[(void *)&a] = 0;
  __builtin_memset (&MEM[(void *)&a + 1B], 0, 3);
  __builtin_memset (&MEM[(void *)&a + 4B], 0, 4);
  f (&a);
  a ={v} {CLOBBER};
  return;

}

Note we have to deal with merging with stores as well.  Given the
original testcase was ultimately solved by making only stores
available to store-merging, the obvious thing to do is to teach
store-merging about memset()/bzero() and consider memset() for
code-generation as well(?).

No longer mine.
Comment 5 Andrew Pinski 2021-08-21 23:57:14 UTC
The rest is a dup of bug 49872.

*** This bug has been marked as a duplicate of bug 49872 ***