[Bug c++/42209] New: missed optimization leads to several times slower code
gb-0001 at xsim dot com
gcc-bugzilla@gcc.gnu.org
Sun Nov 29 00:37:00 GMT 2009
This is not a correctness bug, this is a performance problem.
It appears constant propagation is inconsistent, causing a routine to be
inlined and simplified some places and not others. Code of the form
a(..., ~0x3)
for i=1..n
a(..., ~0)
a(..., 0x0fffffff)
inlines and simplifies the call with ~0 as an arg, but not the others.
Unfortunately, a() if inlined and simplified would reduce to about 4 machine
instructions, but the out-of-line call is about 80 static instructions with a
path length of about 20 instructions including an indirect branch. For small
values of 'n', the out-of-line call dominates execution time of this routine.
Following is complete source code (no headers) and the output of g++ -v. Note
that many small source changes cause the symptom to go away, such as manually
eliminating dead code (that otherwise GCC determines is dead and eliminates).
This "smells" like there is an optimization threshold and beyond some level of
source complexity the threshold is not reached.
#define CHAR_BIT (8)
template<typename T, typename size_t>
size_t bitsizeof() { return CHAR_BIT * sizeof(T); }
template<typename T>
unsigned bitsizeof() { return bitsizeof<T, unsigned>(); }
template<typename word_t>
word_t ALLONES() { return ~static_cast<word_t>(0); }
template<typename word_t, typename offset_t, typename bitlen_t>
static bool is_ltoh(bool might_overlap,
const word_t *dst, offset_t doff,
const word_t *src, offset_t soff,
bitlen_t bitlen) {
if (!might_overlap)
return true;
if (dst <= src) {
return true;
} else /* (dst > src) */ {
const unsigned WORDSIZE = bitsizeof<word_t>();
const word_t *srclast = src + (soff + bitlen) / WORDSIZE;
return srclast < dst;
}
}
typedef unsigned char op_t;
const op_t SRC = 0xC;
const op_t DST = 0xA;
const op_t SET = 0xF;
template<typename word_t>
static word_t switch_case(op_t op, word_t d, word_t s, word_t dst_mask) {
switch(op) {
case 0 & 0xF: d &= ~dst_mask; break; // 0
case ~(DST|SRC) & 0xF: d ^= ((~s | d) & dst_mask); break; // 1
case DST&~SRC & 0xF: d ^= (( s & d) & dst_mask); break; // 2
case ~SRC & 0xF: d ^= ((~s ^ d) & dst_mask); break; // 3
case ~DST&SRC & 0xF: d ^= (( s | d) & dst_mask); break; // 4
case ~DST & 0xF: d ^= dst_mask; break; // 5
case (DST^SRC) & 0xF: d ^= ( s & dst_mask); break; // 6
case ~(DST&SRC) & 0xF: d ^= (( s | ~d) & dst_mask); break; // 7
case DST&SRC & 0xF: d ^= ((~s & d) & dst_mask); break; // 8
case ~(DST^SRC) & 0xF: d ^= (~s & dst_mask); break; // 9
case DST & 0xF: break; // 10
case (DST|~SRC) & 0xF: d |= (~s & dst_mask); break; // 11
case SRC & 0xF: d ^= (( s ^ d) & dst_mask); break; // 12
case (~DST|SRC) & 0xF: d ^= (~(s & d) & dst_mask); break; // 13
case (DST|SRC) & 0xF: d |= (s & dst_mask); break; // 14
case SET & 0xF: d |= dst_mask; break; // 15
default:
; // NOT REACHED
}
return d;
}
template<typename word_t, typename offset_t, typename bitlen_t>
static void ltoh_MtoN_down(op_t op, word_t *di, const word_t *si, word_t
lo_mask, word_t hi_mask, unsigned ts, bitlen_t n1, bool more_src_words) {
unsigned bs = bitsizeof<word_t>() - ts;
word_t top = si[1];
word_t s = (top << ts) | (si[0] >> bs);
di[0] = switch_case(op, di[0], s, lo_mask);
for (bitlen_t i=1; i<n1; ++i) {
word_t bot = top;
top = si[i+1];
s = top << ts | bot >> bs;
di[i] = switch_case(op, di[i], s, ALLONES<word_t>());
}
s = top >> bs;
if (more_src_words)
s |= (si[n1+1] << ts);
di[n1] = switch_case(op, di[n1], s, hi_mask);
}
template<typename word_t, typename offset_t, typename bitlen_t>
static void htol_MtoN_down(op_t op, word_t *di, const word_t *si, word_t
lo_mask, word_t hi_mask, unsigned ts, bitlen_t n1, bool one_more_src_word) {
unsigned bs = bitsizeof<word_t>() - ts;
word_t bot = si[n1];
word_t s = bot >> bs;
if (one_more_src_word)
s |= si[n1+1] << ts;
di[n1] = switch_case(op, di[n1], s, hi_mask);
for (offset_t i=n1-1; i>0; --i) {
word_t top = bot;
bot = si[i];
s = top << ts | bot >> bs;
di[i] = switch_case(op, di[i], s, ALLONES<word_t>());
}
s = (bot << ts) | (si[0] >> bs);
di[0] = switch_case(op, di[0], s, lo_mask);
}
template<typename word_t, typename offset_t, typename bitlen_t>
void mem(word_t *di, offset_t doff, const word_t *si, offset_t soff,
bitlen_t bitlen, op_t op, bool might_overlap=true) {
const offset_t WS = bitsizeof<word_t,offset_t>();
const word_t WORDMASK = WS - 1;
di += doff / WS;
doff %= WS;
si += soff / WS;
soff %= WS;
const word_t ONES = ALLONES<word_t>();
word_t lo_mask = ONES << doff;
word_t hi_mask = ONES >> (WORDMASK - ((doff + bitlen - 1) & WORDMASK));
bool one_word = doff + static_cast<offset_t>(bitlen) <= WS;
if (soff == doff) {
} else if (soff < doff) {
} else /* if (soff > doff) */ {
unsigned ts = WS - (soff - doff);
if (one_word) {
} else {
offset_t n1dst = (doff+bitlen-1)/WS;
offset_t n1src = (soff+bitlen-1)/WS;
if (is_ltoh(might_overlap, di, doff, si, soff, bitlen))
ltoh_MtoN_down<word_t,offset_t,bitlen_t>(op, di, si, lo_mask, hi_mask,
ts, n1dst, n1src != n1dst);
else
htol_MtoN_down<word_t,offset_t,bitlen_t>(op, di, si, lo_mask, hi_mask,
ts, n1dst, n1src != n1dst);
}
}
}
void slow(unsigned *di, const unsigned *si) {
mem<unsigned>(di, 2, si, 10, 2042, SRC, false);
}
int main(int argc, char **argv) {;}
$ g++ -v -O4 -Wall -Werror x.cc -o x
Using built-in specs.
Target: i486-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.4.1-4ubuntu8'
--with-bugurl=file:///usr/share/doc/gcc-4.4/README.Bugs
--enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --enable-shared
--enable-multiarch --enable-linker-build-id --with-system-zlib
--libexecdir=/usr/lib --without-included-gettext --enable-threads=posix
--with-gxx-include-dir=/usr/include/c++/4.4 --program-suffix=-4.4 --enable-nls
--enable-clocale=gnu --enable-libstdcxx-debug --enable-objc-gc
--enable-targets=all --disable-werror --with-arch-32=i486 --with-tune=generic
--enable-checking=release --build=i486-linux-gnu --host=i486-linux-gnu
--target=i486-linux-gnu
Thread model: posix
gcc version 4.4.1 (Ubuntu 4.4.1-4ubuntu8)
COLLECT_GCC_OPTIONS='-v' '-O4' '-Wall' '-Werror' '-o' 'x' '-shared-libgcc'
'-mtune=generic' '-march=i486'
/usr/lib/gcc/i486-linux-gnu/4.4.1/cc1plus -quiet -v -D_GNU_SOURCE x.cc
-D_FORTIFY_SOURCE=2 -quiet -dumpbase x.cc -mtune=generic -march=i486 -auxbase x
-O4 -Wall -Werror -version -fstack-protector -o /tmp/ccdsdx0h.s
ignoring nonexistent directory "/usr/local/include/i486-linux-gnu"
ignoring nonexistent directory
"/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../i486-linux-gnu/include"
ignoring nonexistent directory "/usr/include/i486-linux-gnu"
#include "..." search starts here:
#include <...> search starts here:
/usr/include/c++/4.4
/usr/include/c++/4.4/i486-linux-gnu
/usr/include/c++/4.4/backward
/usr/local/include
/usr/lib/gcc/i486-linux-gnu/4.4.1/include
/usr/lib/gcc/i486-linux-gnu/4.4.1/include-fixed
/usr/include
End of search list.
GNU C++ (Ubuntu 4.4.1-4ubuntu8) version 4.4.1 (i486-linux-gnu)
compiled by GNU C version 4.4.1, GMP version 4.3.1, MPFR version
2.4.1-p2.
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
Compiler executable checksum: d2bfb4b66d7bb5211c5fc338cae297a2
COLLECT_GCC_OPTIONS='-v' '-O4' '-Wall' '-Werror' '-o' 'x' '-shared-libgcc'
'-mtune=generic' '-march=i486'
as -V -Qy -o /tmp/cc6Uy9PA.o /tmp/ccdsdx0h.s
GNU assembler version 2.20 (i486-linux-gnu) using BFD version (GNU Binutils for
Ubuntu) 2.20
COMPILER_PATH=/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/
LIBRARY_PATH=/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/:/lib/../lib/:/usr/lib/../lib/:/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../:/lib/:/usr/lib/:/usr/lib/i486-linux-gnu/
COLLECT_GCC_OPTIONS='-v' '-O4' '-Wall' '-Werror' '-o' 'x' '-shared-libgcc'
'-mtune=generic' '-march=i486'
/usr/lib/gcc/i486-linux-gnu/4.4.1/collect2 --build-id --eh-frame-hdr -m
elf_i386 --hash-style=both -dynamic-linker /lib/ld-linux.so.2 -o x -z relro
/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/crt1.o
/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/crti.o
/usr/lib/gcc/i486-linux-gnu/4.4.1/crtbegin.o
-L/usr/lib/gcc/i486-linux-gnu/4.4.1 -L/usr/lib/gcc/i486-linux-gnu/4.4.1
-L/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib -L/lib/../lib
-L/usr/lib/../lib -L/usr/lib/gcc/i486-linux-gnu/4.4.1/../../..
-L/usr/lib/i486-linux-gnu /tmp/cc6Uy9PA.o -lstdc++ -lm -lgcc_s -lgcc -lc
-lgcc_s -lgcc /usr/lib/gcc/i486-linux-gnu/4.4.1/crtend.o
/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/crtn.o
(This is a standard Ubuntu build of GCC.)
$ objdump -D x > x.dis
The code of interest is for "slow()": For the inner loop (80485be..80485d7),
"switch_case()" is inlined and simplifies. Since the mask is ~0, it reduces to
nothing. For the leading and trailing calls, the relevant values are pushed
and it calls switch_case() out-of-line. If the source is simplified (e.g., by
replacing the unused "else" call to "htol..." with a semicolon) the routine is
inlined and reduces to three instructions at each site. Where the inner loop
is called just a few times (which is common) the leading/trailing cost
dominates.
080484b0 <_Z11switch_caseIjET_hS0_S0_S0_>:
80484b0: 55 push %ebp
80484b1: 3c 0f cmp $0xf,%al
80484b3: 89 e5 mov %esp,%ebp
80484b5: 53 push %ebx
80484b6: 8b 5d 08 mov 0x8(%ebp),%ebx
80484b9: 77 15 ja 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
80484bb: 0f b6 c0 movzbl %al,%eax
80484be: ff 24 85 e0 86 04 08 jmp *0x80486e0(,%eax,4)
80484c5: 8d 76 00 lea 0x0(%esi),%esi
80484c8: f7 d3 not %ebx
80484ca: 21 da and %ebx,%edx
80484cc: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi
80484d0: 89 d0 mov %edx,%eax
80484d2: 5b pop %ebx
80484d3: 5d pop %ebp
80484d4: c3 ret
80484d5: 8d 76 00 lea 0x0(%esi),%esi
80484d8: f7 d1 not %ecx
80484da: 21 d9 and %ebx,%ecx
80484dc: 31 ca xor %ecx,%edx
80484de: 89 d0 mov %edx,%eax
80484e0: 5b pop %ebx
80484e1: 5d pop %ebp
80484e2: c3 ret
80484e3: 90 nop
80484e4: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi
80484e8: 89 d0 mov %edx,%eax
80484ea: f7 d0 not %eax
80484ec: 09 c8 or %ecx,%eax
80484ee: 21 d8 and %ebx,%eax
80484f0: 31 c2 xor %eax,%edx
80484f2: eb dc jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
80484f4: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi
80484f8: f7 d1 not %ecx
80484fa: 21 d9 and %ebx,%ecx
80484fc: 21 d1 and %edx,%ecx
80484fe: 31 ca xor %ecx,%edx
8048500: eb ce jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
8048502: 8d b6 00 00 00 00 lea 0x0(%esi),%esi
8048508: 09 da or %ebx,%edx
804850a: eb c4 jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
804850c: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi
8048510: f7 d1 not %ecx
8048512: 21 d9 and %ebx,%ecx
8048514: 09 ca or %ecx,%edx
8048516: eb b8 jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
8048518: 31 d1 xor %edx,%ecx
804851a: 21 d9 and %ebx,%ecx
804851c: 31 ca xor %ecx,%edx
804851e: eb b0 jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
8048520: 21 d1 and %edx,%ecx
8048522: f7 d1 not %ecx
8048524: 21 d9 and %ebx,%ecx
8048526: 31 ca xor %ecx,%edx
8048528: eb a6 jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
804852a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi
8048530: 21 d9 and %ebx,%ecx
8048532: 09 ca or %ecx,%edx
8048534: eb 9a jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
8048536: 66 90 xchg %ax,%ax
8048538: f7 d1 not %ecx
804853a: 09 d1 or %edx,%ecx
804853c: 21 d9 and %ebx,%ecx
804853e: 31 ca xor %ecx,%edx
8048540: eb 8e jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
8048542: 8d b6 00 00 00 00 lea 0x0(%esi),%esi
8048548: 21 d1 and %edx,%ecx
804854a: 21 d9 and %ebx,%ecx
804854c: 31 ca xor %ecx,%edx
804854e: eb 80 jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
8048550: f7 d1 not %ecx
8048552: 31 d1 xor %edx,%ecx
8048554: 21 d9 and %ebx,%ecx
8048556: 31 ca xor %ecx,%edx
8048558: e9 73 ff ff ff jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
804855d: 8d 76 00 lea 0x0(%esi),%esi
8048560: 09 d1 or %edx,%ecx
8048562: 21 d9 and %ebx,%ecx
8048564: 31 ca xor %ecx,%edx
8048566: e9 65 ff ff ff jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
804856b: 90 nop
804856c: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi
8048570: 31 da xor %ebx,%edx
8048572: e9 59 ff ff ff jmp 80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
8048577: 89 f6 mov %esi,%esi
8048579: 8d bc 27 00 00 00 00 lea 0x0(%edi,%eiz,1),%edi
08048580 <_Z4slowPjPKj>:
8048580: 55 push %ebp
8048581: 89 e5 mov %esp,%ebp
8048583: 83 ec 10 sub $0x10,%esp
8048586: 89 7d fc mov %edi,-0x4(%ebp)
8048589: 8b 7d 0c mov 0xc(%ebp),%edi
804858c: 89 75 f8 mov %esi,-0x8(%ebp)
804858f: 8b 75 08 mov 0x8(%ebp),%esi
8048592: 89 5d f4 mov %ebx,-0xc(%ebp)
8048595: 8b 5f 04 mov 0x4(%edi),%ebx
8048598: 8b 07 mov (%edi),%eax
804859a: c7 04 24 fc ff ff ff movl $0xfffffffc,(%esp)
80485a1: 8b 16 mov (%esi),%edx
80485a3: 89 d9 mov %ebx,%ecx
80485a5: c1 e8 08 shr $0x8,%eax
80485a8: c1 e1 18 shl $0x18,%ecx
80485ab: 09 c1 or %eax,%ecx
80485ad: b8 0c 00 00 00 mov $0xc,%eax
80485b2: e8 f9 fe ff ff call 80484b0
<_Z11switch_caseIjET_hS0_S0_S0_>
80485b7: 89 06 mov %eax,(%esi)
80485b9: b8 01 00 00 00 mov $0x1,%eax
80485be: 8b 54 87 04 mov 0x4(%edi,%eax,4),%edx
80485c2: c1 eb 08 shr $0x8,%ebx
80485c5: 89 d1 mov %edx,%ecx
80485c7: c1 e1 18 shl $0x18,%ecx
80485ca: 09 cb or %ecx,%ebx
80485cc: 89 1c 86 mov %ebx,(%esi,%eax,4)
80485cf: 83 c0 01 add $0x1,%eax
80485d2: 89 d3 mov %edx,%ebx
80485d4: 83 f8 3f cmp $0x3f,%eax
80485d7: 75 e5 jne 80485be <_Z4slowPjPKj+0x3e>
80485d9: 8b 87 00 01 00 00 mov 0x100(%edi),%eax
80485df: 89 d1 mov %edx,%ecx
80485e1: c7 04 24 ff ff ff 0f movl $0xfffffff,(%esp)
80485e8: 8b 96 fc 00 00 00 mov 0xfc(%esi),%edx
80485ee: c1 e9 08 shr $0x8,%ecx
80485f1: c1 e0 18 shl $0x18,%eax
80485f4: 09 c1 or %eax,%ecx
80485f6: b8 0c 00 00 00 mov $0xc,%eax
80485fb: e8 b0 fe ff ff call 80484b0
<_Z11switch_caseIjET_hS0_S0_S0_>
8048600: 89 86 fc 00 00 00 mov %eax,0xfc(%esi)
8048606: 8b 5d f4 mov -0xc(%ebp),%ebx
8048609: 8b 75 f8 mov -0x8(%ebp),%esi
804860c: 8b 7d fc mov -0x4(%ebp),%edi
804860f: 89 ec mov %ebp,%esp
8048611: 5d pop %ebp
8048612: c3 ret
--
Summary: missed optimization leads to several times slower code
Product: gcc
Version: 4.4.1
Status: UNCONFIRMED
Severity: minor
Priority: P3
Component: c++
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: gb-0001 at xsim dot com
GCC build triplet: i486-linux-gnu
GCC host triplet: i486-linux-gnu
GCC target triplet: i486-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42209
More information about the Gcc-bugs
mailing list