Bug 30442 - Expanded array initialization can use memset builtin function
Summary: Expanded array initialization can use memset builtin function
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: middle-end (show other bugs)
Version: 4.3.0
: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2007-01-11 19:53 UTC by Uroš Bizjak
Modified: 2022-01-28 23:47 UTC (History)
3 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2022-01-28 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Uroš Bizjak 2007-01-11 19:53:35 UTC
Array initialization could use memset builtin function. In following two testcases, array is initialized without use of memset:

--cut here--
long long foo(long long *);

long long test1(void)
{
  long long a[32];

  a[0] = 0;
  a[1] = 0;
  a[2] = 0;
  a[3] = 0;
  a[4] = 0;
  a[5] = 0;
  a[6] = 0;
  a[7] = 0;
  a[8] = 0;
  a[9] = 0;
  a[10] = 0;
  a[11] = 0;
  a[12] = 0;
  a[13] = 0;
  a[14] = 0;
  a[15] = 0;
  a[16] = 0;
  a[17] = 0;
  a[18] = 0;
  a[19] = 0;
  a[20] = 0;
  a[21] = 0;
  a[22] = 0;
  a[23] = 0;
  a[24] = 0;
  a[25] = 0;
  a[26] = 0;
  a[27] = 0;
  a[28] = 0;
  a[29] = 0;
  a[30] = 0;
  a[31] = 0;

  return foo(a);
}

long long test2(void)
{
  long long a[32];
  int i;

  for (i = 0; i < 32; i++)
    a[i] = 0;

  return foo(a);
}
--cut here--

gcc -O3 -m32 -S -fomit-frame-pointer

test1:
        subl    $268, %esp
        leal    8(%esp), %eax
        movl    $0, 8(%esp)
        movl    $0, 12(%esp)
        movl    $0, 16(%esp)
        movl    $0, 20(%esp)
        movl    $0, 24(%esp)
        movl    $0, 28(%esp)
        movl    $0, 32(%esp)
        movl    $0, 36(%esp)
        movl    $0, 40(%esp)
        movl    $0, 44(%esp)
        movl    $0, 48(%esp)
        movl    $0, 52(%esp)
        movl    $0, 56(%esp)
        movl    $0, 60(%esp)
        movl    $0, 64(%esp)
        movl    $0, 68(%esp)
        movl    $0, 72(%esp)
        movl    $0, 76(%esp)
        movl    $0, 80(%esp)
        movl    $0, 84(%esp)
        movl    $0, 88(%esp)
        movl    $0, 92(%esp)
        movl    $0, 96(%esp)
        movl    $0, 100(%esp)
        movl    $0, 104(%esp)
        movl    $0, 108(%esp)
        movl    $0, 112(%esp)
        movl    $0, 116(%esp)
        movl    $0, 120(%esp)
        movl    $0, 124(%esp)
        movl    $0, 128(%esp)
        movl    $0, 132(%esp)
        movl    $0, 136(%esp)
        movl    $0, 140(%esp)
        movl    $0, 144(%esp)
        movl    $0, 148(%esp)
        movl    $0, 152(%esp)
        movl    $0, 156(%esp)
        movl    $0, 160(%esp)
        movl    $0, 164(%esp)
        movl    $0, 168(%esp)
        movl    $0, 172(%esp)
        movl    $0, 176(%esp)
        movl    $0, 180(%esp)
        movl    $0, 184(%esp)
        movl    $0, 188(%esp)
        movl    $0, 192(%esp)
        movl    $0, 196(%esp)
        movl    $0, 200(%esp)
        movl    $0, 204(%esp)
        movl    $0, 208(%esp)
        movl    $0, 212(%esp)
        movl    $0, 216(%esp)
        movl    $0, 220(%esp)
        movl    $0, 224(%esp)
        movl    $0, 228(%esp)
        movl    $0, 232(%esp)
        movl    $0, 236(%esp)
        movl    $0, 240(%esp)
        movl    $0, 244(%esp)
        movl    $0, 248(%esp)
        movl    $0, 252(%esp)
        movl    $0, 256(%esp)
        movl    $0, 260(%esp)
        movl    %eax, (%esp)
        call    foo
        addl    $268, %esp
        ret

test2:
        subl    $268, %esp
        movl    $2, %eax
        movl    $0, 8(%esp)
        leal    8(%esp), %edx
        movl    $0, 12(%esp)
        .p2align 4,,7
.L2:
        movl    $0, -8(%edx,%eax,8)
        movl    $0, -4(%edx,%eax,8)
        addl    $1, %eax
        cmpl    $33, %eax
        jne     .L2
        movl    %edx, (%esp)
        call    foo
        addl    $268, %esp
        ret

IIRC, this optimization was recently implemented in gfrortran...
Comment 1 Andrew Pinski 2007-01-11 22:16:37 UTC
> IIRC, this optimization was recently implemented in gfrortran...

That is because in Fortran, you have "a = 0" and instead of lowering it into a loop, it lowers it to a memset instead.
Comment 2 Andrew Pinski 2007-01-14 05:01:04 UTC
This is a bit complex, we could use a loop reroller to roll this into a loop and then transform that loop into memset for test1.
For test2 it is simple and a patch was presented at last year's gcc summit (and I posted an older version of that patch a year or two ago).
Comment 3 Johan Walles 2007-11-09 13:09:08 UTC
This optimization would have made grep 2.5.3 30% faster in a real-world test case:

http://bugs.debian.org/450649
Comment 4 Uroš Bizjak 2008-03-12 11:05:19 UTC
This still happens on mainline.

I wonder if vectorizer infrastructure can be re-used here to detect unrolled and looped version of memset. In addition to loop that can be "vectorized", we have something resembling "vectorization" of straight code.

And looking at comment #3, the rewards from real-world code look really promising.
Comment 5 Ira Rosen 2008-03-13 06:51:14 UTC
(In reply to comment #4)
> This still happens on mainline.
> 
> I wonder if vectorizer infrastructure can be re-used here to detect unrolled
> and looped version of memset. In addition to loop that can be "vectorized", we
> have something resembling "vectorization" of straight code.

This code can be vectorized with basic block SLP, which is not implemented yet (currently only SLP in loops is implemented).

> And looking at comment #3, the rewards from real-world code look really
> promising.
> 
However, in comment #3 there is a loop:

               :  MALLOC(visited, int, d->tindex);
 48796 16.2440 :  for (i = 0; i < d->tindex; ++i)
 94442 31.4394 :    visited[i] = 0;

There was an effort to replace such loops with calls to builtins - http://gcc.gnu.org/ml/gcc-patches/2007-07/msg00054.html.

Ira
Comment 6 Uroš Bizjak 2012-02-06 21:40:50 UTC
Reconfirmed with 4.7.
Comment 7 Richard Biener 2012-06-05 10:50:22 UTC
The

long long test2(void)
{
  long long a[32];
  int i;

  for (i = 0; i < 32; i++)
    a[i] = 0;

  return foo(a);
}

loop is transformed to memset at -O3.  The unrolled version is not "re-rolled"
still, and basic-block vectorization does not catch it because of the
call in the basic-block.  I'm trying to fix that.
Comment 8 Richard Biener 2012-06-05 12:38:30 UTC
Author: rguenth
Date: Tue Jun  5 12:38:26 2012
New Revision: 188235

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=188235
Log:
2012-06-05  Richard Guenther  <rguenther@suse.de>

	PR tree-optimization/30442
	* tree-vect-data-refs.c (vect_analyze_data_refs): For basic-block
	vectorization stop analysis at the first stmt we cannot compute
	a data-reference for instead of giving up completely.

	* gcc.dg/vect/bb-slp-30.c: New testcase.

Added:
    trunk/gcc/testsuite/gcc.dg/vect/bb-slp-30.c
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/tree-vect-data-refs.c
Comment 9 Richard Biener 2012-06-05 12:39:29 UTC
So, $summary is still true but we now at least vectorize the initialization.