86017 – multiple consecutive calls to bzero/memset not merged

Bug 86017 - multiple consecutive calls to bzero/memset not merged

Summary: multiple consecutive calls to bzero/memset not merged

Status:	RESOLVED DUPLICATE of bug 49872

Alias:	None

Product:	gcc
Classification:	Unclassified
Component:	tree-optimization (show other bugs)
Version:	8.0

Importance:	P3 normal
Target Milestone:	---
Assignee:	Not yet assigned to anyone

URL:
Keywords:	missed-optimization

Depends on:
Blocks:

Reported:	2018-05-31 17:16 UTC by Martin Sebor
Modified:	2021-08-21 23:57 UTC (History)
CC List:	3 users (show)

See Also:	86010
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed:	2018-06-01 00:00:00

Attachments
Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description Martin Sebor 2018-05-31 17:16:35 UTC

Even though the two functions defined in the test case below are equivalent, GCC emits considerably less efficient code for the one with multiple calls to memset than for the one with just a single call.  Clang emits the same optimally efficient code for both.

$ cat b.c && gcc -O2 -S -Wall -Wextra -fdump-tree-optimized=/dev/stdout -o/dev/stdout b.c
void f (void*);

void g (void)
{
  char a[8];
  __builtin_memset (a, 0, 8);

  f (a);
}

void h (void)
{
  char a[8];
  __builtin_memset (a, 0, 1);
  __builtin_memset (a + 1, 0, 1);
  __builtin_memset (a + 2, 0, 1);
  __builtin_memset (a + 3, 0, 1);
  __builtin_memset (a + 4, 0, 1);
  __builtin_memset (a + 5, 0, 1);
  __builtin_memset (a + 6, 0, 1);
  __builtin_memset (a + 7, 0, 1);

  f (a);
}

	.file	"b.c"
	.text

;; Function g (g, funcdef_no=0, decl_uid=1958, cgraph_uid=0, symbol_order=0)

g ()
{
  char a[8];

  <bb 2> [local count: 1073741825]:
  __builtin_memset (&a, 0, 8);
  f (&a);
  a ={v} {CLOBBER};
  return;

}


	.p2align 4,,15
	.globl	g
	.type	g, @function
g:
.LFB0:
	.cfi_startproc
	subq	$24, %rsp
	.cfi_def_cfa_offset 32
	movq	$0, 8(%rsp)
	leaq	8(%rsp), %rdi
	call	f
	addq	$24, %rsp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE0:
	.size	g, .-g

;; Function h (h, funcdef_no=1, decl_uid=1962, cgraph_uid=1, symbol_order=1)

h ()
{
  char a[8];

  <bb 2> [local count: 1073741825]:
  MEM[(void *)&a] = 0;
  __builtin_memset (&MEM[(void *)&a + 1B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 2B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 3B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 4B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 5B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 6B], 0, 1);
  __builtin_memset (&MEM[(void *)&a + 7B], 0, 1);
  f (&a);
  a ={v} {CLOBBER};
  return;

}


	.p2align 4,,15
	.globl	h
	.type	h, @function
h:
.LFB1:
	.cfi_startproc
	subq	$24, %rsp
	.cfi_def_cfa_offset 32
	leaq	8(%rsp), %rdi
	movb	$0, 8(%rsp)
	movb	$0, 9(%rsp)
	movb	$0, 10(%rsp)
	movb	$0, 11(%rsp)
	movb	$0, 12(%rsp)
	movb	$0, 13(%rsp)
	movb	$0, 14(%rsp)
	movb	$0, 15(%rsp)
	call	f
	addq	$24, %rsp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE1:
	.size	h, .-h
	.ident	"GCC: (GNU) 8.1.1 20180522"
	.section	.note.GNU-stack,"",@progbits

Comment 1 Martin Sebor 2018-05-31 17:18:27 UTC

See also bug 86010 for a related missed optimization (that one is a regression while this bug does not appear to be).

Comment 2 Richard Biener 2018-06-01 08:15:34 UTC

Confirmed.  A related issue is that we inline the first but not the adjacent memsets.  Fixing that makes us apply store-merging.  Testing a patch.

Comment 3 Richard Biener 2018-06-01 10:50:28 UTC

Author: rguenth
Date: Fri Jun  1 10:49:54 2018
New Revision: 261061

URL: https://gcc.gnu.org/viewcvs?rev=261061&root=gcc&view=rev
Log:
2018-06-01  Richard Biener  <rguenther@suse.de>

	PR middle-end/86017
	* gimple-fold.c (var_decl_component_p): Also allow offsetted
	vars wrapped in MEM_REFs.

	* gcc.dg/tree-ssa/pr86017.c: New testcase.

Added:
    trunk/gcc/testsuite/gcc.dg/tree-ssa/pr86017.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/gimple-fold.c
    trunk/gcc/testsuite/ChangeLog

Comment 4 Richard Biener 2018-06-01 10:53:04 UTC

The testcase is now fixed.  We're still not merging adjacent memset/bzero calls.
Modified testcase:

void f (void*);
void h (void)
{
  char a[8];
  __builtin_memset (a, 0, 1);
  __builtin_memset (a + 1, 0, 3);
  __builtin_memset (a + 4, 0, 4);

  f (a);
}

results in

;; Function h (h, funcdef_no=1, decl_uid=1962, cgraph_uid=1, symbol_order=1)

h ()
{
  char a[8];

  <bb 2> [local count: 1073741825]:
  MEM[(void *)&a] = 0;
  __builtin_memset (&MEM[(void *)&a + 1B], 0, 3);
  __builtin_memset (&MEM[(void *)&a + 4B], 0, 4);
  f (&a);
  a ={v} {CLOBBER};
  return;

}

Note that we have to deal with merging with stores as well.  Given that the
original testcase was ultimately solved by making only stores
available to store-merging, the obvious thing to do is to teach
store-merging about memset()/bzero() and consider memset() for
code-generation as well(?).

No longer mine.

Comment 5 Andrew Pinski 2021-08-21 23:57:14 UTC

The rest is a dup of bug 49872.

*** This bug has been marked as a duplicate of bug 49872 ***