Even though the two functions defined in the test case below are equivalent, GCC emits considerably less efficient code for the one with multiple calls to memset than for the one with just a single call. Clang emits the same optimally efficient code for both. $ cat b.c && gcc -O2 -S -Wall -Wextra -fdump-tree-optimized=/dev/stdout -o/dev/stdout b.c void f (void*); void g (void) { char a[8]; __builtin_memset (a, 0, 8); f (a); } void h (void) { char a[8]; __builtin_memset (a, 0, 1); __builtin_memset (a + 1, 0, 1); __builtin_memset (a + 2, 0, 1); __builtin_memset (a + 3, 0, 1); __builtin_memset (a + 4, 0, 1); __builtin_memset (a + 5, 0, 1); __builtin_memset (a + 6, 0, 1); __builtin_memset (a + 7, 0, 1); f (a); } .file "b.c" .text ;; Function g (g, funcdef_no=0, decl_uid=1958, cgraph_uid=0, symbol_order=0) g () { char a[8]; <bb 2> [local count: 1073741825]: __builtin_memset (&a, 0, 8); f (&a); a ={v} {CLOBBER}; return; } .p2align 4,,15 .globl g .type g, @function g: .LFB0: .cfi_startproc subq $24, %rsp .cfi_def_cfa_offset 32 movq $0, 8(%rsp) leaq 8(%rsp), %rdi call f addq $24, %rsp .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE0: .size g, .-g ;; Function h (h, funcdef_no=1, decl_uid=1962, cgraph_uid=1, symbol_order=1) h () { char a[8]; <bb 2> [local count: 1073741825]: MEM[(void *)&a] = 0; __builtin_memset (&MEM[(void *)&a + 1B], 0, 1); __builtin_memset (&MEM[(void *)&a + 2B], 0, 1); __builtin_memset (&MEM[(void *)&a + 3B], 0, 1); __builtin_memset (&MEM[(void *)&a + 4B], 0, 1); __builtin_memset (&MEM[(void *)&a + 5B], 0, 1); __builtin_memset (&MEM[(void *)&a + 6B], 0, 1); __builtin_memset (&MEM[(void *)&a + 7B], 0, 1); f (&a); a ={v} {CLOBBER}; return; } .p2align 4,,15 .globl h .type h, @function h: .LFB1: .cfi_startproc subq $24, %rsp .cfi_def_cfa_offset 32 leaq 8(%rsp), %rdi movb $0, 8(%rsp) movb $0, 9(%rsp) movb $0, 10(%rsp) movb $0, 11(%rsp) movb $0, 12(%rsp) movb $0, 13(%rsp) movb $0, 14(%rsp) movb $0, 15(%rsp) call f addq $24, %rsp .cfi_def_cfa_offset 8 ret .cfi_endproc 
.LFE1: .size h, .-h .ident "GCC: (GNU) 8.1.1 20180522" .section .note.GNU-stack,"",@progbits
See also bug 86010 for a related missed optimization (that one is a regression while this bug does not appear to be).
Confirmed. A related issue is that we inline the first but not the adjacent memsets. Fixing that makes us apply store-merging. Testing a patch.
Author: rguenth Date: Fri Jun 1 10:49:54 2018 New Revision: 261061 URL: https://gcc.gnu.org/viewcvs?rev=261061&root=gcc&view=rev Log: 2018-06-01 Richard Biener <rguenther@suse.de> PR middle-end/86017 * gimple-fold.c (var_decl_component_p): Also allow offsetted vars wrapped in MEM_REFs. * gcc.dg/tree-ssa/pr86017.c: New testcase. Added: trunk/gcc/testsuite/gcc.dg/tree-ssa/pr86017.c Modified: trunk/gcc/ChangeLog trunk/gcc/gimple-fold.c trunk/gcc/testsuite/ChangeLog
The testcase is now fixed. We're still not merging adjacent memset/bzero calls. Modified testcase: void f (void*); void h (void) { char a[8]; __builtin_memset (a, 0, 1); __builtin_memset (a + 1, 0, 3); __builtin_memset (a + 4, 0, 4); f (a); } results in ;; Function h (h, funcdef_no=1, decl_uid=1962, cgraph_uid=1, symbol_order=1) h () { char a[8]; <bb 2> [local count: 1073741825]: MEM[(void *)&a] = 0; __builtin_memset (&MEM[(void *)&a + 1B], 0, 3); __builtin_memset (&MEM[(void *)&a + 4B], 0, 4); f (&a); a ={v} {CLOBBER}; return; } Note that we have to deal with merging with stores as well. Given the original testcase was ultimately solved by making only stores available to store-merging the obvious thing to do is to teach store-merging about memset()/bzero() and consider memset() for code-generation as well(?). No longer mine.
The rest is a dup of bug 49872. *** This bug has been marked as a duplicate of bug 49872 ***