This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug c++/42209] New: missed optimization leads to several times slower code


This is not a correctness bug, this is a performance problem.

It appears constant propagation is inconsistent, causing a routine to be
inlined and simplified some places and not others.  Code of the form

a(..., ~0x3)
for i=1..n
  a(..., ~0)
a(..., 0x0fffffff)

inlines and simplifies the call with ~0 as an arg, but not the others. 
Unfortunately, a() if inlined and simplified would reduce to about 4 machine
instructions, but the out-of-line call is about 80 static instructions with a
path length of about 20 instructions including an indirect branch.  For small
values of 'n', the out-of-line call dominates execution time of this routine.

Following is complete source code (no headers) and the output of g++ -v.  Note
that many small source changes cause the symptom to go away, such as manually
eliminating dead code (that otherwise GCC determines is dead and eliminates). 
This "smells" like there is an optimization threshold and beyond some level of
source complexity the threshold is not reached.



#define CHAR_BIT  (8)

template<typename T, typename size_t>
size_t bitsizeof() { return CHAR_BIT * sizeof(T); }
template<typename T>
unsigned bitsizeof() { return bitsizeof<T, unsigned>(); }

template<typename word_t>
word_t ALLONES() { return ~static_cast<word_t>(0); }

template<typename word_t, typename offset_t, typename bitlen_t>
static bool is_ltoh(bool might_overlap,
                    const word_t *dst, offset_t doff,
                    const word_t *src, offset_t soff,
                    bitlen_t bitlen) {
  if (!might_overlap)
    return true;

  if (dst <= src) {
    return true;
  } else /* (dst > src) */ {
    const unsigned WORDSIZE = bitsizeof<word_t>();
    const word_t *srclast = src + (soff + bitlen) / WORDSIZE;
    return srclast < dst;
  }
}


typedef unsigned char op_t;
const op_t SRC = 0xC;
const op_t DST = 0xA;
const op_t SET = 0xF;


template<typename word_t>
static word_t switch_case(op_t op, word_t d, word_t s, word_t dst_mask) {
  switch(op) {
  case 0          & 0xF:   d &= ~dst_mask;                 break; //  0
  case ~(DST|SRC) & 0xF:   d ^= ((~s | d) & dst_mask);     break; //  1
  case DST&~SRC   & 0xF:   d ^= (( s & d) & dst_mask);     break; //  2
  case ~SRC       & 0xF:   d ^= ((~s ^ d) & dst_mask);     break; //  3
  case ~DST&SRC   & 0xF:   d ^= (( s | d) & dst_mask);     break; //  4
  case ~DST       & 0xF:   d ^= dst_mask;                  break; //  5
  case (DST^SRC)  & 0xF:   d ^= ( s & dst_mask);           break; //  6
  case ~(DST&SRC) & 0xF:   d ^= (( s | ~d) & dst_mask);    break; //  7
  case DST&SRC    & 0xF:   d ^= ((~s &  d) & dst_mask);    break; //  8
  case ~(DST^SRC) & 0xF:   d ^= (~s & dst_mask);           break; //  9
  case DST        & 0xF:                                   break; // 10
  case (DST|~SRC) & 0xF:   d |= (~s & dst_mask);           break; // 11
  case SRC        & 0xF:   d ^= (( s ^ d) & dst_mask);     break; // 12
  case (~DST|SRC) & 0xF:   d ^= (~(s & d) & dst_mask);     break; // 13
  case (DST|SRC)  & 0xF:   d |= (s & dst_mask);            break; // 14
  case SET        & 0xF:   d |= dst_mask;                  break; // 15
  default:
    ; // NOT REACHED
  }
  return d;
}


template<typename word_t, typename offset_t, typename bitlen_t>
static void ltoh_MtoN_down(op_t op, word_t *di, const word_t *si, word_t
lo_mask, word_t hi_mask, unsigned ts, bitlen_t n1, bool more_src_words) {
  unsigned bs = bitsizeof<word_t>() - ts;
  word_t top = si[1];
  word_t s = (top << ts) | (si[0] >> bs);
  di[0] = switch_case(op, di[0], s, lo_mask);

  for (bitlen_t i=1; i<n1; ++i) {
    word_t bot = top;
    top = si[i+1];
    s = top << ts | bot >> bs;
    di[i] = switch_case(op, di[i], s, ALLONES<word_t>());
  }

  s = top >> bs;
  if (more_src_words)
    s |= (si[n1+1] << ts);
  di[n1] = switch_case(op, di[n1], s, hi_mask);
}

template<typename word_t, typename offset_t, typename bitlen_t>
static void htol_MtoN_down(op_t op, word_t *di, const word_t *si, word_t
lo_mask, word_t hi_mask, unsigned ts, bitlen_t n1, bool one_more_src_word) {
  unsigned bs = bitsizeof<word_t>() - ts;
  word_t bot = si[n1];
  word_t s = bot >> bs;
  if (one_more_src_word)
    s |= si[n1+1] << ts;
  di[n1] = switch_case(op, di[n1], s, hi_mask);

  for (offset_t i=n1-1; i>0; --i) {
    word_t top = bot;
    bot = si[i];
    s = top << ts | bot >> bs;
    di[i] = switch_case(op, di[i], s, ALLONES<word_t>());
  }

  s = (bot << ts) | (si[0] >> bs);
  di[0] = switch_case(op, di[0], s, lo_mask);
}



template<typename word_t, typename offset_t, typename bitlen_t>
void mem(word_t *di, offset_t doff, const word_t *si, offset_t soff,
         bitlen_t bitlen, op_t op, bool might_overlap=true) {
  const offset_t WS = bitsizeof<word_t,offset_t>();
  const word_t WORDMASK = WS - 1;

  di += doff / WS;
  doff %= WS;
  si += soff / WS;
  soff %= WS;

  const word_t ONES = ALLONES<word_t>();
  word_t lo_mask = ONES << doff;
  word_t hi_mask = ONES >> (WORDMASK - ((doff + bitlen - 1) & WORDMASK));

  bool one_word = doff + static_cast<offset_t>(bitlen) <= WS;
  if (soff == doff) {
  } else if (soff < doff) {
  } else /* if (soff > doff) */ {
    unsigned ts = WS - (soff - doff);
    if (one_word) {
    } else {
      offset_t n1dst = (doff+bitlen-1)/WS;
      offset_t n1src = (soff+bitlen-1)/WS;
      if (is_ltoh(might_overlap, di, doff, si, soff, bitlen))
        ltoh_MtoN_down<word_t,offset_t,bitlen_t>(op, di, si, lo_mask, hi_mask,
ts, n1dst, n1src != n1dst);
      else
        htol_MtoN_down<word_t,offset_t,bitlen_t>(op, di, si, lo_mask, hi_mask,
ts, n1dst, n1src != n1dst);
    }
  }
}


void slow(unsigned *di, const unsigned *si) {
  mem<unsigned>(di, 2, si, 10, 2042, SRC, false);
}

int main(int argc, char **argv) {;}



$ g++ -v -O4 -Wall -Werror x.cc -o x
Using built-in specs.
Target: i486-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.4.1-4ubuntu8'
--with-bugurl=file:///usr/share/doc/gcc-4.4/README.Bugs
--enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --enable-shared
--enable-multiarch --enable-linker-build-id --with-system-zlib
--libexecdir=/usr/lib --without-included-gettext --enable-threads=posix
--with-gxx-include-dir=/usr/include/c++/4.4 --program-suffix=-4.4 --enable-nls
--enable-clocale=gnu --enable-libstdcxx-debug --enable-objc-gc
--enable-targets=all --disable-werror --with-arch-32=i486 --with-tune=generic
--enable-checking=release --build=i486-linux-gnu --host=i486-linux-gnu
--target=i486-linux-gnu
Thread model: posix
gcc version 4.4.1 (Ubuntu 4.4.1-4ubuntu8) 
COLLECT_GCC_OPTIONS='-v' '-O4' '-Wall' '-Werror' '-o' 'x' '-shared-libgcc'
'-mtune=generic' '-march=i486'
 /usr/lib/gcc/i486-linux-gnu/4.4.1/cc1plus -quiet -v -D_GNU_SOURCE x.cc
-D_FORTIFY_SOURCE=2 -quiet -dumpbase x.cc -mtune=generic -march=i486 -auxbase x
-O4 -Wall -Werror -version -fstack-protector -o /tmp/ccdsdx0h.s
ignoring nonexistent directory "/usr/local/include/i486-linux-gnu"
ignoring nonexistent directory
"/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../i486-linux-gnu/include"
ignoring nonexistent directory "/usr/include/i486-linux-gnu"
#include "..." search starts here:
#include <...> search starts here:
 /usr/include/c++/4.4
 /usr/include/c++/4.4/i486-linux-gnu
 /usr/include/c++/4.4/backward
 /usr/local/include
 /usr/lib/gcc/i486-linux-gnu/4.4.1/include
 /usr/lib/gcc/i486-linux-gnu/4.4.1/include-fixed
 /usr/include
End of search list.
GNU C++ (Ubuntu 4.4.1-4ubuntu8) version 4.4.1 (i486-linux-gnu)
        compiled by GNU C version 4.4.1, GMP version 4.3.1, MPFR version
2.4.1-p2.
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
Compiler executable checksum: d2bfb4b66d7bb5211c5fc338cae297a2
COLLECT_GCC_OPTIONS='-v' '-O4' '-Wall' '-Werror' '-o' 'x' '-shared-libgcc'
'-mtune=generic' '-march=i486'
 as -V -Qy -o /tmp/cc6Uy9PA.o /tmp/ccdsdx0h.s
GNU assembler version 2.20 (i486-linux-gnu) using BFD version (GNU Binutils for
Ubuntu) 2.20
COMPILER_PATH=/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/
LIBRARY_PATH=/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/:/lib/../lib/:/usr/lib/../lib/:/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../:/lib/:/usr/lib/:/usr/lib/i486-linux-gnu/
COLLECT_GCC_OPTIONS='-v' '-O4' '-Wall' '-Werror' '-o' 'x' '-shared-libgcc'
'-mtune=generic' '-march=i486'
 /usr/lib/gcc/i486-linux-gnu/4.4.1/collect2 --build-id --eh-frame-hdr -m
elf_i386 --hash-style=both -dynamic-linker /lib/ld-linux.so.2 -o x -z relro
/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/crt1.o
/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/crti.o
/usr/lib/gcc/i486-linux-gnu/4.4.1/crtbegin.o
-L/usr/lib/gcc/i486-linux-gnu/4.4.1 -L/usr/lib/gcc/i486-linux-gnu/4.4.1
-L/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib -L/lib/../lib
-L/usr/lib/../lib -L/usr/lib/gcc/i486-linux-gnu/4.4.1/../../..
-L/usr/lib/i486-linux-gnu /tmp/cc6Uy9PA.o -lstdc++ -lm -lgcc_s -lgcc -lc
-lgcc_s -lgcc /usr/lib/gcc/i486-linux-gnu/4.4.1/crtend.o
/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/crtn.o

(This is a standard Ubuntu build of GCC.)

$ objdump -D x > x.dis


The code of interest is for "slow()":  For the inner loop (80485be..80485d7),
"switch_case()" is inlined and simplifies.  Since the mask is ~0, it reduces to
nothing.  For the leading and trailing calls, the relevant values are pushed
and it calls switch_case() out-of-line.  If the source is simplified (e.g., by
replacing the unused "else" call to "htol..." with a semicolon) the routine is
inlined and reduces to three instructions at each site.  Where the inner loop
is called just a few times (which is common) the leading/trailing cost
dominates.


080484b0 <_Z11switch_caseIjET_hS0_S0_S0_>:
 80484b0:       55                      push   %ebp
 80484b1:       3c 0f                   cmp    $0xf,%al
 80484b3:       89 e5                   mov    %esp,%ebp
 80484b5:       53                      push   %ebx
 80484b6:       8b 5d 08                mov    0x8(%ebp),%ebx
 80484b9:       77 15                   ja     80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 80484bb:       0f b6 c0                movzbl %al,%eax
 80484be:       ff 24 85 e0 86 04 08    jmp    *0x80486e0(,%eax,4)
 80484c5:       8d 76 00                lea    0x0(%esi),%esi
 80484c8:       f7 d3                   not    %ebx
 80484ca:       21 da                   and    %ebx,%edx
 80484cc:       8d 74 26 00             lea    0x0(%esi,%eiz,1),%esi
 80484d0:       89 d0                   mov    %edx,%eax
 80484d2:       5b                      pop    %ebx
 80484d3:       5d                      pop    %ebp
 80484d4:       c3                      ret    
 80484d5:       8d 76 00                lea    0x0(%esi),%esi
 80484d8:       f7 d1                   not    %ecx
 80484da:       21 d9                   and    %ebx,%ecx
 80484dc:       31 ca                   xor    %ecx,%edx
 80484de:       89 d0                   mov    %edx,%eax
 80484e0:       5b                      pop    %ebx
 80484e1:       5d                      pop    %ebp
 80484e2:       c3                      ret    
 80484e3:       90                      nop
 80484e4:       8d 74 26 00             lea    0x0(%esi,%eiz,1),%esi
 80484e8:       89 d0                   mov    %edx,%eax
 80484ea:       f7 d0                   not    %eax
 80484ec:       09 c8                   or     %ecx,%eax
 80484ee:       21 d8                   and    %ebx,%eax
 80484f0:       31 c2                   xor    %eax,%edx
 80484f2:       eb dc                   jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 80484f4:       8d 74 26 00             lea    0x0(%esi,%eiz,1),%esi
 80484f8:       f7 d1                   not    %ecx
 80484fa:       21 d9                   and    %ebx,%ecx
 80484fc:       21 d1                   and    %edx,%ecx
 80484fe:       31 ca                   xor    %ecx,%edx
 8048500:       eb ce                   jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 8048502:       8d b6 00 00 00 00       lea    0x0(%esi),%esi
 8048508:       09 da                   or     %ebx,%edx
 804850a:       eb c4                   jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 804850c:       8d 74 26 00             lea    0x0(%esi,%eiz,1),%esi
 8048510:       f7 d1                   not    %ecx
 8048512:       21 d9                   and    %ebx,%ecx
 8048514:       09 ca                   or     %ecx,%edx
 8048516:       eb b8                   jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 8048518:       31 d1                   xor    %edx,%ecx
 804851a:       21 d9                   and    %ebx,%ecx
 804851c:       31 ca                   xor    %ecx,%edx
 804851e:       eb b0                   jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 8048520:       21 d1                   and    %edx,%ecx
 8048522:       f7 d1                   not    %ecx
 8048524:       21 d9                   and    %ebx,%ecx
 8048526:       31 ca                   xor    %ecx,%edx
 8048528:       eb a6                   jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 804852a:       8d b6 00 00 00 00       lea    0x0(%esi),%esi
 8048530:       21 d9                   and    %ebx,%ecx
 8048532:       09 ca                   or     %ecx,%edx
 8048534:       eb 9a                   jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 8048536:       66 90                   xchg   %ax,%ax
 8048538:       f7 d1                   not    %ecx
 804853a:       09 d1                   or     %edx,%ecx
 804853c:       21 d9                   and    %ebx,%ecx
 804853e:       31 ca                   xor    %ecx,%edx
 8048540:       eb 8e                   jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 8048542:       8d b6 00 00 00 00       lea    0x0(%esi),%esi
 8048548:       21 d1                   and    %edx,%ecx
 804854a:       21 d9                   and    %ebx,%ecx
 804854c:       31 ca                   xor    %ecx,%edx
 804854e:       eb 80                   jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 8048550:       f7 d1                   not    %ecx
 8048552:       31 d1                   xor    %edx,%ecx
 8048554:       21 d9                   and    %ebx,%ecx
 8048556:       31 ca                   xor    %ecx,%edx
 8048558:       e9 73 ff ff ff          jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 804855d:       8d 76 00                lea    0x0(%esi),%esi
 8048560:       09 d1                   or     %edx,%ecx
 8048562:       21 d9                   and    %ebx,%ecx
 8048564:       31 ca                   xor    %ecx,%edx
 8048566:       e9 65 ff ff ff          jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 804856b:       90                      nop
 804856c:       8d 74 26 00             lea    0x0(%esi,%eiz,1),%esi
 8048570:       31 da                   xor    %ebx,%edx
 8048572:       e9 59 ff ff ff          jmp    80484d0
<_Z11switch_caseIjET_hS0_S0_S0_+0x20>
 8048577:       89 f6                   mov    %esi,%esi
 8048579:       8d bc 27 00 00 00 00    lea    0x0(%edi,%eiz,1),%edi

08048580 <_Z4slowPjPKj>:
 8048580:       55                      push   %ebp
 8048581:       89 e5                   mov    %esp,%ebp
 8048583:       83 ec 10                sub    $0x10,%esp
 8048586:       89 7d fc                mov    %edi,-0x4(%ebp)
 8048589:       8b 7d 0c                mov    0xc(%ebp),%edi
 804858c:       89 75 f8                mov    %esi,-0x8(%ebp)
 804858f:       8b 75 08                mov    0x8(%ebp),%esi
 8048592:       89 5d f4                mov    %ebx,-0xc(%ebp)
 8048595:       8b 5f 04                mov    0x4(%edi),%ebx
 8048598:       8b 07                   mov    (%edi),%eax
 804859a:       c7 04 24 fc ff ff ff    movl   $0xfffffffc,(%esp)
 80485a1:       8b 16                   mov    (%esi),%edx
 80485a3:       89 d9                   mov    %ebx,%ecx
 80485a5:       c1 e8 08                shr    $0x8,%eax
 80485a8:       c1 e1 18                shl    $0x18,%ecx
 80485ab:       09 c1                   or     %eax,%ecx
 80485ad:       b8 0c 00 00 00          mov    $0xc,%eax
 80485b2:       e8 f9 fe ff ff          call   80484b0
<_Z11switch_caseIjET_hS0_S0_S0_>
 80485b7:       89 06                   mov    %eax,(%esi)
 80485b9:       b8 01 00 00 00          mov    $0x1,%eax
 80485be:       8b 54 87 04             mov    0x4(%edi,%eax,4),%edx
 80485c2:       c1 eb 08                shr    $0x8,%ebx
 80485c5:       89 d1                   mov    %edx,%ecx
 80485c7:       c1 e1 18                shl    $0x18,%ecx
 80485ca:       09 cb                   or     %ecx,%ebx
 80485cc:       89 1c 86                mov    %ebx,(%esi,%eax,4)
 80485cf:       83 c0 01                add    $0x1,%eax
 80485d2:       89 d3                   mov    %edx,%ebx
 80485d4:       83 f8 3f                cmp    $0x3f,%eax
 80485d7:       75 e5                   jne    80485be <_Z4slowPjPKj+0x3e>
 80485d9:       8b 87 00 01 00 00       mov    0x100(%edi),%eax
 80485df:       89 d1                   mov    %edx,%ecx
 80485e1:       c7 04 24 ff ff ff 0f    movl   $0xfffffff,(%esp)
 80485e8:       8b 96 fc 00 00 00       mov    0xfc(%esi),%edx
 80485ee:       c1 e9 08                shr    $0x8,%ecx
 80485f1:       c1 e0 18                shl    $0x18,%eax
 80485f4:       09 c1                   or     %eax,%ecx
 80485f6:       b8 0c 00 00 00          mov    $0xc,%eax
 80485fb:       e8 b0 fe ff ff          call   80484b0
<_Z11switch_caseIjET_hS0_S0_S0_>
 8048600:       89 86 fc 00 00 00       mov    %eax,0xfc(%esi)
 8048606:       8b 5d f4                mov    -0xc(%ebp),%ebx
 8048609:       8b 75 f8                mov    -0x8(%ebp),%esi
 804860c:       8b 7d fc                mov    -0x4(%ebp),%edi
 804860f:       89 ec                   mov    %ebp,%esp
 8048611:       5d                      pop    %ebp
 8048612:       c3                      ret


-- 
           Summary: missed optimization leads to several times slower code
           Product: gcc
           Version: 4.4.1
            Status: UNCONFIRMED
          Severity: minor
          Priority: P3
         Component: c++
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: gb-0001 at xsim dot com
 GCC build triplet: i486-linux-gnu
  GCC host triplet: i486-linux-gnu
GCC target triplet: i486-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42209


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]