Bug 15036 - [3.4 regression] Performance and code size regression compared to 3.3
Summary: [3.4 regression] Performance and code size regression compared to 3.3
Status: RESOLVED WONTFIX
Alias: None
Product: gcc
Classification: Unclassified
Component: c++ (show other bugs)
Version: 3.4.0
Importance: P2 normal
Target Milestone: 4.0.0
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2004-04-21 08:32 UTC by Martin Reinecke
Modified: 2005-03-31 21:53 UTC (History)
4 users (show)

See Also:
Host: i686-pc-linux-gnu
Target: i686-pc-linux-gnu
Build: i686-pc-linux-gnu
Known to work: 3.3.3 tree-ssa 4.0.0
Known to fail: 3.4.0
Last reconfirmed: 2005-02-12 20:57:50


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Martin Reinecke 2004-04-21 08:32:02 UTC
When compiling the following code:

// Minimal complex-number-like class template used as the test case:
// two public components of type T and a scalar multiplication operator.
template<typename T> class xcomplex
  {
  public:
    // The two stored components (by their names, the real and imaginary parts).
    T re, im;

    // Construct from two explicit component values.
    xcomplex (const T &re_, const T &im_)
      : re(re_), im(im_) {}
    // Converting constructor: initializes both components from an
    // xcomplex with a different component type U.
    template<typename U> xcomplex (const xcomplex<U> &orig)
      : re(orig.re), im(orig.im) {}
    // Scalar multiplication: returns a new xcomplex with each component
    // multiplied by fact; the object itself is left unchanged (const member).
    xcomplex operator* (const T &fact) const
      { return xcomplex (re*fact,im*fact); }
  };

// Kernel whose generated code is compared between compiler versions.
// For each index m in [0, 1000), stores a[m] scaled by b[m] into c[m].
// The element count is hard-coded and no bounds are checked, so all three
// arrays must provide at least 1000 elements.
void foo (xcomplex<double> *a, double *b, xcomplex<double> *c)
  {
  for (int m=0; m<1000; ++m)
    // operator* returns a temporary xcomplex, which is then assigned to c[m].
    c[m]=a[m]*b[m];
  }


g++ 3.4 produces bigger (and slower) code than 3.3:

~/tmp>g++ -O2 -S -v perf.cc
Reading specs from
/afs/mpa/common/pdsoft/appl/gcc-3.3.2/lib/gcc-lib/i686-pc-linux-gnu/3.3.2/specs
Configured with: /scratch/gcc-3.3.2/configure
--prefix=/afs/mpa/common/pdsoft/appl/gcc-3.3.2 --enable-languages=c++
--disable-checking
Thread model: posix
gcc version 3.3.2
 /afs/mpa/common/pdsoft/appl/gcc-3.3.2/lib/gcc-lib/i686-pc-linux-gnu/3.3.2/cc1plus
-quiet -v -D__GNUC__=3 -D__GNUC_MINOR__=3 -D__GNUC_PATCHLEVEL__=2 -D_GNU_SOURCE
perf.cc -D__GNUG__=3 -quiet -dumpbase perf.cc -auxbase perf -O2 -version -o perf.s
GNU C++ version 3.3.2 (i686-pc-linux-gnu)
        compiled by GNU C version 3.3.2.
GGC heuristics: --param ggc-min-expand=63 --param ggc-min-heapsize=63318
ignoring nonexistent directory
"/afs/mpa/common/pdsoft/appl/gcc-3.3.2/i686-pc-linux-gnu/include"
#include "..." search starts here:
#include <...> search starts here:
 /afs/mpa/common/pdsoft/appl/gcc-3.3.2/include/c++/3.3.2
 /afs/mpa/common/pdsoft/appl/gcc-3.3.2/include/c++/3.3.2/i686-pc-linux-gnu
 /afs/mpa/common/pdsoft/appl/gcc-3.3.2/include/c++/3.3.2/backward
 /usr/local/include
 /afs/mpa/common/pdsoft/appl/gcc-3.3.2/include
 /afs/mpa/common/pdsoft/appl/gcc-3.3.2/lib/gcc-lib/i686-pc-linux-gnu/3.3.2/include
 /usr/include
End of search list.

~/tmp>cat perf.s
        .file   "perf.cc"
        .text
        .align 2
        .p2align 4,,15
.globl _Z3fooP8xcomplexIdEPdS1_
        .type   _Z3fooP8xcomplexIdEPdS1_, @function
_Z3fooP8xcomplexIdEPdS1_:
.LFB10:
        pushl   %ebp
.LCFI0:
        movl    %esp, %ebp
.LCFI1:
        pushl   %edi
.LCFI2:
        pushl   %esi
.LCFI3:
        xorl    %esi, %esi
        pushl   %ebx
.LCFI4:
        subl    $28, %esp
.LCFI5:
        movl    8(%ebp), %edi
        movl    16(%ebp), %ebx
        movl    12(%ebp), %ecx
        .p2align 4,,15
.L8:
        fldl    (%ecx)
        movl    %esi, %edx
        sall    $4, %edx
        leal    (%edx,%edi), %eax
        incl    %esi
        addl    $8, %ecx
        fldl    (%eax)
        cmpl    $999, %esi
        fmul    %st(1), %st
        fxch    %st(1)
        fmull   8(%eax)
        fxch    %st(1)
        fstpl   -40(%ebp)
        movl    -40(%ebp), %eax
        fstpl   -32(%ebp)
        movl    %eax, (%ebx,%edx)
        movl    -36(%ebp), %eax
        movl    %eax, 4(%ebx,%edx)
        movl    -32(%ebp), %eax
        movl    %eax, 8(%ebx,%edx)
        movl    -28(%ebp), %eax
        movl    %eax, 12(%ebx,%edx)
        jle     .L8
        addl    $28, %esp
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret
.LFE10:
        .size   _Z3fooP8xcomplexIdEPdS1_, .-_Z3fooP8xcomplexIdEPdS1_
        .ident  "GCC: (GNU) 3.3.2"


~/tmp>g++ -O2 -S -v perf.cc
Reading specs from /afs/mpa/data/martin/ugcc34/lib/gcc/i686-pc-linux-gnu/3.4.0/specs
Configured with: /scratch/gcc34/configure --prefix=/afs/mpa/data/martin/ugcc34
--enable-languages=c++ --disable-checking
Thread model: posix
gcc version 3.4.0 20040418 (prerelease)
 /afs/mpa/data/martin/ugcc34/libexec/gcc/i686-pc-linux-gnu/3.4.0/cc1plus -quiet
-v -D_GNU_SOURCE perf.cc -quiet -dumpbase perf.cc -mtune=pentiumpro -auxbase
perf -O2 -version -o perf.s
ignoring nonexistent directory
"/afs/mpa/data/martin/ugcc34/lib/gcc/i686-pc-linux-gnu/3.4.0/../../../../i686-pc-linux-gnu/include"
#include "..." search starts here:
#include <...> search starts here:
 /afs/mpa/data/martin/ugcc34/lib/gcc/i686-pc-linux-gnu/3.4.0/../../../../include/c++/3.4.0
 /afs/mpa/data/martin/ugcc34/lib/gcc/i686-pc-linux-gnu/3.4.0/../../../../include/c++/3.4.0/i686-pc-linux-gnu
 /afs/mpa/data/martin/ugcc34/lib/gcc/i686-pc-linux-gnu/3.4.0/../../../../include/c++/3.4.0/backward
 /usr/local/include
 /afs/mpa/data/martin/ugcc34/include
 /afs/mpa/data/martin/ugcc34/lib/gcc/i686-pc-linux-gnu/3.4.0/include
 /usr/include
End of search list.
GNU C++ version 3.4.0 20040418 (prerelease) (i686-pc-linux-gnu)
        compiled by GNU C version 3.4.0 20040418 (prerelease).
GGC heuristics: --param ggc-min-expand=63 --param ggc-min-heapsize=63318


~/tmp>cat perf.s
        .file   "perf.cc"
        .text
        .align 2
        .p2align 4,,15
.globl _Z3fooP8xcomplexIdEPdS1_
        .type   _Z3fooP8xcomplexIdEPdS1_, @function
_Z3fooP8xcomplexIdEPdS1_:
.LFB5:
        pushl   %ebp
.LCFI0:
        movl    %esp, %ebp
.LCFI1:
        pushl   %edi
.LCFI2:
        pushl   %esi
.LCFI3:
        pushl   %ebx
.LCFI4:
        subl    $60, %esp
.LCFI5:
        movl    16(%ebp), %edi
        movl    $0, -60(%ebp)
        movl    12(%ebp), %esi
        .p2align 4,,15
.L7:
        fldl    (%esi)
        addl    $8, %esi
        movl    -60(%ebp), %edx
        incl    -60(%ebp)
        movl    8(%ebp), %ecx
        sall    $4, %edx
        cmpl    $999, -60(%ebp)
        leal    (%edx,%ecx), %eax
        fldl    (%eax)
        fmul    %st(1), %st
        fxch    %st(1)
        fmull   8(%eax)
        fxch    %st(1)
        fstpl   -56(%ebp)
        movl    -56(%ebp), %ebx
        movl    -52(%ebp), %ecx
        fstpl   -48(%ebp)
        movl    -48(%ebp), %eax
        movl    %ebx, -40(%ebp)
        movl    %ecx, -36(%ebp)
        movl    %eax, -32(%ebp)
        movl    -44(%ebp), %eax
        movl    %eax, -28(%ebp)
        movl    %ebx, (%edx,%edi)
        movl    %ecx, 4(%edx,%edi)
        movl    -32(%ebp), %eax
        movl    %eax, 8(%edx,%edi)
        movl    -28(%ebp), %eax
        movl    %eax, 12(%edx,%edi)
        jle     .L7
        addl    $60, %esp
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret
.LFE5:
        .size   _Z3fooP8xcomplexIdEPdS1_, .-_Z3fooP8xcomplexIdEPdS1_
        .section        .note.GNU-stack,"",@progbits
        .ident  "GCC: (GNU) 3.4.0 20040418 (prerelease)"


The assembly generated by g++ 3.4 contains more movl instructions inside the
loop.
This problem is also present on mainline.
Comment 1 Andrew Pinski 2004-04-21 11:39:41 UTC
Confirmed.  I suspect that the multiplication operator is being created differently (but someone would need 
to look at the tree dumps).
Comment 2 Mark Mitchell 2004-06-05 20:35:25 UTC
Richard --

As the IA32 maintainer, would you take a look at this?

Thanks,

-- Mark
Comment 3 Mark Mitchell 2004-06-21 21:17:16 UTC
Postponed until GCC 3.4.2.
Comment 4 Andrew Pinski 2004-08-10 01:39:24 UTC
In reality I think this is a front-end regression.
Comment 5 Mark Mitchell 2004-08-29 18:51:52 UTC
Postponed until GCC 3.4.3.
Comment 6 Daniel S. 2004-09-16 18:40:47 UTC
I was benchmarking STL containers out of curiosity and it turned out that 
inserting keys in a set/map is up to 70% slower with 3.4.x than it was with 
3.3.4...

Adding 1 million keys to set<int> / hash_set<int> / map<int>:
1.10 / 0.68 / 1.11 billion ticks with 3.2.2 (P233MMX / -O2 -march=pentium)
1.09 / 0.72 / 1.11 with 3.3.4
1.54 / 0.94 / 1.90 with 3.4.1
1.52 / 0.93 / 2.01 with 3.4.2

Sample loop used for measurement: (using rdtsc to estimate the tick count, 
accuracy is within 1-5%)

for(int a=0; a < 5000000; a+=5)
	intSet.insert(a);

While this is not realistic everyday usage, it does appear to be 
representative of this issue and I am somewhat surprised to see it as being 
only a normal-normal issue.
Comment 7 Paolo Carlini 2004-09-16 18:47:59 UTC
Daniel S., could you please try rebuilding gcc3.4.2 passing at configure time
--enable-libstdcxx-allocator=pool ? The timings should definitely change.
Thanks, Paolo.
Comment 8 Daniel S. 2004-09-17 23:23:56 UTC
(In reply to comment #7)
Started the build this morning, tried my simplistic benchmark tonight with the 
new GCC build...

Thanks, this appears to have done a small miracle and made 3.4.2 5-10% faster 
than my 3.3.4 was... but it still lags ~5% behind 3.2.2 for hash_set though. 
(that 3.2.2 is from stock/up2date RH9 packages, the others are from sources 
with -O2 for GCC and -O2 -g for C/C++ libs.)

Does this mean that allocator pool used to be the default in 3.3.x and this 
behavior got dropped along the way to 3.4.x? (this certainly is what comparing 
the 3.3.4 c++configure.h file with the 3.4.2 version is hinting - I would be 
curious to read the reason(s).)
Comment 9 Paolo Carlini 2004-09-18 07:34:50 UTC
> Does this mean that allocator pool used to be the default in 3.3.x and this 
> behavior got dropped along the way to 3.4.x?

Yes. To summarize a long story, allocators are a work in progress and we are
currently concentrating on yet another one, called mt_allocator. Unfortunately,
we weren't able to forward-port the traditional pool allocator to the new
framework in time for 3.4.0 and, to be safe, we chose the "trivial" new-based
allocator as the default. Then, for ABI stability, we couldn't change that
default anymore for the next 3.4.x releases. However, for 3.4.2 and newer you
can always configure it yourself at build time to obtain a behavior very
similar to that of the 3.3.x allocator. Thanks, Paolo.
Comment 10 Mark Mitchell 2004-11-01 00:45:40 UTC
Postponed until GCC 3.4.4.
Comment 11 Richard Henderson 2005-03-31 21:53:33 UTC
The code generated by gcc 4.0 is significantly improved:

.L2:
        fldl    -8(%ebx,%ecx,8)
        incl    %ecx
        fld     %st(0)
        fmull   (%edx)
        fxch    %st(1)
        fmull   8(%edx)
        fxch    %st(1)
        addl    $16, %edx
        fstpl   (%eax)
        fstpl   8(%eax)
        addl    $16, %eax
        cmpl    $1001, %ecx
        jne     .L2

I have no plans to address this for the 3.4 branch.