Bug 85640 - [11/12/13/14 Regression] code size regression vs 7.x
Summary: [11/12/13/14 Regression] code size regression vs 7.x
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: target
Version: 8.1.1
Importance: P2 normal
Target Milestone: 11.5
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2018-05-03 21:21 UTC by petschy
Modified: 2024-02-22 21:52 UTC
CC: 4 users

See Also:
Host:
Target: x86_64-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2018-08-10 00:00:00


Attachments
source (425 bytes, text/plain)
2018-05-03 21:21 UTC, petschy

Description petschy 2018-05-03 21:21:28 UTC
Created attachment 44062 [details]
source

Attached is the source of a simple Adler32 checksum class. With 8.1.1 the Update() function is 32 bytes longer than the code generated by 7.3.1.
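
The attachment is not reproduced inline here; judging from the generated code (chunk limit 0x15b0 = 5552, modulus 0xfff1 = 65521, two 32-bit members at offsets 0 and 4 of *this), Update() presumably follows the classic zlib-style Adler-32 scheme. A minimal sketch, with member and parameter names guessed rather than taken from the attachment:

#include <cstdint>

class Adler32
{
public:
    void Update(const void* buf, unsigned int len)
    {
        const uint8_t* p = static_cast<const uint8_t*>(buf);
        while (len) {
            // Reduce mod 65521 at least every 5552 bytes so 'b' cannot overflow 32 bits.
            unsigned int n = len > 5552 ? 5552 : len;
            len -= n;
            while (n--) {
                m_a += *p++;
                m_b += m_a;
            }
            m_a %= 65521;
            m_b %= 65521;
        }
    }

private:
    uint32_t m_a = 1;   // offset 0 in the disassembly
    uint32_t m_b = 0;   // offset 4 in the disassembly
};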

Dump of assembler code for function Adler32::Update(void const*, unsigned int):
7.3.1 0x0000000000400500 <+0>:     test   %edx,%edx
7.3.1 0x0000000000400502 <+2>:     je     0x400578 <Adler32::Update(void const*, unsigned int)+120>
7.3.1 0x0000000000400504 <+4>:     mov    (%rdi),%ecx
7.3.1 0x0000000000400506 <+6>:     mov    0x4(%rdi),%r8d
7.3.1 0x000000000040050a <+10>:    mov    $0x80078071,%r10d
7.3.1 0x0000000000400510 <+16>:    xor    %r9d,%r9d
7.3.1 0x0000000000400513 <+19>:    cmp    $0x15af,%edx
7.3.1 0x0000000000400519 <+25>:    jbe    0x400527 <Adler32::Update(void const*, unsigned int)+39>
7.3.1 0x000000000040051b <+27>:    lea    -0x15b0(%rdx),%r9d
7.3.1 0x0000000000400522 <+34>:    mov    $0x15b0,%edx
7.3.1 0x0000000000400527 <+39>:    lea    -0x1(%rdx),%eax
7.3.1 0x000000000040052a <+42>:    lea    0x1(%rsi,%rax,1),%rdx
7.3.1 0x000000000040052f <+47>:    nop
7.3.1 0x0000000000400530 <+48>:    add    $0x1,%rsi
7.3.1 0x0000000000400534 <+52>:    movzbl -0x1(%rsi),%eax
7.3.1 0x0000000000400538 <+56>:    add    %eax,%ecx
7.3.1 0x000000000040053a <+58>:    add    %ecx,%r8d
7.3.1 0x000000000040053d <+61>:    cmp    %rdx,%rsi
7.3.1 0x0000000000400540 <+64>:    mov    %ecx,(%rdi)
7.3.1 0x0000000000400542 <+66>:    mov    %r8d,0x4(%rdi)
7.3.1 0x0000000000400546 <+70>:    jne    0x400530 <Adler32::Update(void const*, unsigned int)+48>
7.3.1 0x0000000000400548 <+72>:    mov    %ecx,%eax
7.3.1 0x000000000040054a <+74>:    mul    %r10d
7.3.1 0x000000000040054d <+77>:    mov    %r8d,%eax
7.3.1 0x0000000000400550 <+80>:    shr    $0xf,%edx
7.3.1 0x0000000000400553 <+83>:    imul   $0xfff1,%edx,%edx
7.3.1 0x0000000000400559 <+89>:    sub    %edx,%ecx
7.3.1 0x000000000040055b <+91>:    mul    %r10d
7.3.1 0x000000000040055e <+94>:    mov    %ecx,(%rdi)
7.3.1 0x0000000000400560 <+96>:    shr    $0xf,%edx
7.3.1 0x0000000000400563 <+99>:    imul   $0xfff1,%edx,%edx
7.3.1 0x0000000000400569 <+105>:   sub    %edx,%r8d
7.3.1 0x000000000040056c <+108>:   test   %r9d,%r9d
7.3.1 0x000000000040056f <+111>:   mov    %r9d,%edx
7.3.1 0x0000000000400572 <+114>:   mov    %r8d,0x4(%rdi)
7.3.1 0x0000000000400576 <+118>:   jne    0x400510 <Adler32::Update(void const*, unsigned int)+16>
7.3.1 0x0000000000400578 <+120>:   repz retq 

Dump of assembler code for function Adler32::Update(void const*, unsigned int):
8.1.1 0x0000000000400500 <+0>:     test   %edx,%edx
8.1.1 0x0000000000400502 <+2>:     je     0x400598 <Adler32::Update(void const*, unsigned int)+152>
8.1.1 0x0000000000400508 <+8>:     mov    (%rdi),%ecx
8.1.1 0x000000000040050a <+10>:    mov    0x4(%rdi),%r8d
8.1.1 0x000000000040050e <+14>:    push   %rbx
8.1.1 0x000000000040050f <+15>:    mov    $0x80078071,%ebx
8.1.1 0x0000000000400514 <+20>:    nopl   0x0(%rax)
8.1.1 0x0000000000400518 <+24>:    xor    %r11d,%r11d
8.1.1 0x000000000040051b <+27>:    cmp    $0x15af,%edx
8.1.1 0x0000000000400521 <+33>:    jbe    0x40052f <Adler32::Update(void const*, unsigned int)+47>
8.1.1 0x0000000000400523 <+35>:    lea    -0x15b0(%rdx),%r11d
8.1.1 0x000000000040052a <+42>:    mov    $0x15b0,%edx
8.1.1 0x000000000040052f <+47>:    mov    %edx,%r10d
8.1.1 0x0000000000400532 <+50>:    mov    %rsi,%rax
8.1.1 0x0000000000400535 <+53>:    add    %rsi,%r10
8.1.1 0x0000000000400538 <+56>:    nopl   0x0(%rax,%rax,1)
8.1.1 0x0000000000400540 <+64>:    add    $0x1,%rax
8.1.1 0x0000000000400544 <+68>:    movzbl -0x1(%rax),%r9d
8.1.1 0x0000000000400549 <+73>:    add    %r9d,%ecx
8.1.1 0x000000000040054c <+76>:    add    %ecx,%r8d
8.1.1 0x000000000040054f <+79>:    mov    %ecx,(%rdi)
8.1.1 0x0000000000400551 <+81>:    mov    %r8d,0x4(%rdi)
8.1.1 0x0000000000400555 <+85>:    cmp    %r10,%rax
8.1.1 0x0000000000400558 <+88>:    jne    0x400540 <Adler32::Update(void const*, unsigned int)+64>
8.1.1 0x000000000040055a <+90>:    lea    -0x1(%rdx),%eax
8.1.1 0x000000000040055d <+93>:    lea    0x1(%rsi,%rax,1),%rsi
8.1.1 0x0000000000400562 <+98>:    mov    %ecx,%eax
8.1.1 0x0000000000400564 <+100>:   mul    %ebx
8.1.1 0x0000000000400566 <+102>:   mov    %r8d,%eax
8.1.1 0x0000000000400569 <+105>:   shr    $0xf,%edx
8.1.1 0x000000000040056c <+108>:   imul   $0xfff1,%edx,%edx
8.1.1 0x0000000000400572 <+114>:   sub    %edx,%ecx
8.1.1 0x0000000000400574 <+116>:   mul    %ebx
8.1.1 0x0000000000400576 <+118>:   mov    %ecx,(%rdi)
8.1.1 0x0000000000400578 <+120>:   shr    $0xf,%edx
8.1.1 0x000000000040057b <+123>:   imul   $0xfff1,%edx,%edx
8.1.1 0x0000000000400581 <+129>:   sub    %edx,%r8d
8.1.1 0x0000000000400584 <+132>:   mov    %r11d,%edx
8.1.1 0x0000000000400587 <+135>:   mov    %r8d,0x4(%rdi)
8.1.1 0x000000000040058b <+139>:   test   %r11d,%r11d
8.1.1 0x000000000040058e <+142>:   jne    0x400518 <Adler32::Update(void const*, unsigned int)+24>
8.1.1 0x0000000000400590 <+144>:   pop    %rbx
8.1.1 0x0000000000400591 <+145>:   retq   
8.1.1 0x0000000000400592 <+146>:   nopw   0x0(%rax,%rax,1)
8.1.1 0x0000000000400598 <+152>:   retq   

Here is an interwoven version, hopefully easier to follow:
7.3.1 0x0000000000400500 <+0>:     test   %edx,%edx
7.3.1 0x0000000000400502 <+2>:     je     0x400578 <Adler32::Update(void const*, unsigned int)+120>
7.3.1 0x0000000000400504 <+4>:     mov    (%rdi),%ecx
7.3.1 0x0000000000400506 <+6>:     mov    0x4(%rdi),%r8d
7.3.1 0x000000000040050a <+10>:    mov    $0x80078071,%r10d

8.1.1 0x0000000000400500 <+0>:     test   %edx,%edx
8.1.1 0x0000000000400502 <+2>:     je     0x400598 <Adler32::Update(void const*, unsigned int)+152>
8.1.1 0x0000000000400508 <+8>:     mov    (%rdi),%ecx
8.1.1 0x000000000040050a <+10>:    mov    0x4(%rdi),%r8d
8.1.1 0x000000000040050e <+14>:    push   %rbx
8.1.1 0x000000000040050f <+15>:    mov    $0x80078071,%ebx
8.1.1 0x0000000000400514 <+20>:    nopl   0x0(%rax)

Two things so far:
- the je is 6 bytes in 8.1.1 vs 2 bytes in 7.3.1, because in 8.1.1 the jump offset no longer fits in a byte
- in 8.1.1 ebx is used for the modulo magic constant; ebx is callee-saved, so it has to be pushed before use
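
(For reference: the short form of je with an 8-bit displacement is encoded as 74 xx, 2 bytes, while the near form with a 32-bit displacement is 0f 84 xx xx xx xx, 6 bytes, hence the 4-byte difference.)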



7.3.1 0x0000000000400510 <+16>:    xor    %r9d,%r9d
7.3.1 0x0000000000400513 <+19>:    cmp    $0x15af,%edx
7.3.1 0x0000000000400519 <+25>:    jbe    0x400527 <Adler32::Update(void const*, unsigned int)+39>
7.3.1 0x000000000040051b <+27>:    lea    -0x15b0(%rdx),%r9d
7.3.1 0x0000000000400522 <+34>:    mov    $0x15b0,%edx
7.3.1 0x0000000000400527 <+39>:    lea    -0x1(%rdx),%eax
7.3.1 0x000000000040052a <+42>:    lea    0x1(%rsi,%rax,1),%rdx
7.3.1 0x000000000040052f <+47>:    nop

8.1.1 0x0000000000400518 <+24>:    xor    %r11d,%r11d
8.1.1 0x000000000040051b <+27>:    cmp    $0x15af,%edx
8.1.1 0x0000000000400521 <+33>:    jbe    0x40052f <Adler32::Update(void const*, unsigned int)+47>
8.1.1 0x0000000000400523 <+35>:    lea    -0x15b0(%rdx),%r11d
8.1.1 0x000000000040052a <+42>:    mov    $0x15b0,%edx
8.1.1 0x000000000040052f <+47>:    mov    %edx,%r10d
8.1.1 0x0000000000400532 <+50>:    mov    %rsi,%rax
8.1.1 0x0000000000400535 <+53>:    add    %rsi,%r10
8.1.1 0x0000000000400538 <+56>:    nopl   0x0(%rax,%rax,1)

This is the inner-loop initialization; the two versions are pretty similar.



7.3.1 0x0000000000400530 <+48>:    add    $0x1,%rsi
7.3.1 0x0000000000400534 <+52>:    movzbl -0x1(%rsi),%eax
7.3.1 0x0000000000400538 <+56>:    add    %eax,%ecx
7.3.1 0x000000000040053a <+58>:    add    %ecx,%r8d
7.3.1 0x000000000040053d <+61>:    cmp    %rdx,%rsi
7.3.1 0x0000000000400540 <+64>:    mov    %ecx,(%rdi)
7.3.1 0x0000000000400542 <+66>:    mov    %r8d,0x4(%rdi)
7.3.1 0x0000000000400546 <+70>:    jne    0x400530 <Adler32::Update(void const*, unsigned int)+48>

8.1.1 0x0000000000400540 <+64>:    add    $0x1,%rax
8.1.1 0x0000000000400544 <+68>:    movzbl -0x1(%rax),%r9d
8.1.1 0x0000000000400549 <+73>:    add    %r9d,%ecx
8.1.1 0x000000000040054c <+76>:    add    %ecx,%r8d
8.1.1 0x000000000040054f <+79>:    mov    %ecx,(%rdi)
8.1.1 0x0000000000400551 <+81>:    mov    %r8d,0x4(%rdi)
8.1.1 0x0000000000400555 <+85>:    cmp    %r10,%rax
8.1.1 0x0000000000400558 <+88>:    jne    0x400540 <Adler32::Update(void const*, unsigned int)+64>

These are essentially the same, except that the cmp comes before the two stores in 7.3.1 and after them in 8.1.1, and the movzbl and the first add of the loaded byte are each one byte cheaper in 7.3.1 (eax needs no REX prefix, r9d does).


7.3.1 0x0000000000400548 <+72>:    mov    %ecx,%eax
7.3.1 0x000000000040054a <+74>:    mul    %r10d
7.3.1 0x000000000040054d <+77>:    mov    %r8d,%eax
7.3.1 0x0000000000400550 <+80>:    shr    $0xf,%edx
7.3.1 0x0000000000400553 <+83>:    imul   $0xfff1,%edx,%edx
7.3.1 0x0000000000400559 <+89>:    sub    %edx,%ecx
7.3.1 0x000000000040055b <+91>:    mul    %r10d
7.3.1 0x000000000040055e <+94>:    mov    %ecx,(%rdi)
7.3.1 0x0000000000400560 <+96>:    shr    $0xf,%edx
7.3.1 0x0000000000400563 <+99>:    imul   $0xfff1,%edx,%edx
7.3.1 0x0000000000400569 <+105>:   sub    %edx,%r8d
7.3.1 0x000000000040056c <+108>:   test   %r9d,%r9d
7.3.1 0x000000000040056f <+111>:   mov    %r9d,%edx
7.3.1 0x0000000000400572 <+114>:   mov    %r8d,0x4(%rdi)
7.3.1 0x0000000000400576 <+118>:   jne    0x400510 <Adler32::Update(void const*, unsigned int)+16>
7.3.1 0x0000000000400578 <+120>:   repz retq 

8.1.1 0x000000000040055a <+90>:    lea    -0x1(%rdx),%eax
8.1.1 0x000000000040055d <+93>:    lea    0x1(%rsi,%rax,1),%rsi
8.1.1 0x0000000000400562 <+98>:    mov    %ecx,%eax
8.1.1 0x0000000000400564 <+100>:   mul    %ebx
8.1.1 0x0000000000400566 <+102>:   mov    %r8d,%eax
8.1.1 0x0000000000400569 <+105>:   shr    $0xf,%edx
8.1.1 0x000000000040056c <+108>:   imul   $0xfff1,%edx,%edx
8.1.1 0x0000000000400572 <+114>:   sub    %edx,%ecx
8.1.1 0x0000000000400574 <+116>:   mul    %ebx
8.1.1 0x0000000000400576 <+118>:   mov    %ecx,(%rdi)
8.1.1 0x0000000000400578 <+120>:   shr    $0xf,%edx
8.1.1 0x000000000040057b <+123>:   imul   $0xfff1,%edx,%edx
8.1.1 0x0000000000400581 <+129>:   sub    %edx,%r8d
8.1.1 0x0000000000400584 <+132>:   mov    %r11d,%edx
8.1.1 0x0000000000400587 <+135>:   mov    %r8d,0x4(%rdi)
8.1.1 0x000000000040058b <+139>:   test   %r11d,%r11d
8.1.1 0x000000000040058e <+142>:   jne    0x400518 <Adler32::Update(void const*, unsigned int)+24>
8.1.1 0x0000000000400590 <+144>:   pop    %rbx
8.1.1 0x0000000000400591 <+145>:   retq   
8.1.1 0x0000000000400592 <+146>:   nopw   0x0(%rax,%rax,1)
8.1.1 0x0000000000400598 <+152>:   retq   

The loop variables in 8.1.1 are handled a bit differently, which accounts for the minor size increase. The two lea's here in the 8.1.1 version are very similar to the ones before the inner loop in 7.3.1; those calculate the pointer range of the inner loop. The lea's here calculate the start pointer for the next inner-loop run; however, this pointer is already present in rax. I don't know whether this is some special optimization, or whether rax should have been used but the compiler missed it.
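
As a source-level illustration (my own sketch, not the actual attachment or GIMPLE; the names are made up), the 8.1.1 code behaves as if the outer-loop step were written roughly like this:

// Processes one chunk of n bytes (n >= 1) and returns the advanced buffer pointer.
static const unsigned char* inner_run(const unsigned char* p, unsigned int n,
                                      unsigned int& a, unsigned int& b)
{
    const unsigned char* end = p + n;   // end pointer, computed before the inner loop
    const unsigned char* it  = p;
    while (it != end) {                 // on exit, 'it' == 'end'
        a += *it++;
        b += a;
    }
    // 8.1.1 recomputes the same pointer as p + (n - 1) + 1 after the loop
    // (the two extra lea instructions) instead of reusing 'it'/'end'.
    return p + (n - 1) + 1;
}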

The double retq at the end is interesting, but maybe the debug info is to blame.

cmdline used to compile:
$ g++-7.3.1 -g -O3 -Wall a32.cpp
$ g++-8.1.1 -g -O3 -Wall a32.cpp

Platform is AMD64 (FX-8150), Debian 9.4

$ g++-7.3.1 -v
Using built-in specs.
COLLECT_GCC=g++-7.3.1
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-pc-linux-gnu/7.3.1/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --enable-languages=c,c++ --disable-multilib --program-suffix=-7.3.1 --disable-bootstrap CFLAGS='-O2 -march=native -mtune=native' CXXFLAGS='-O2 -march=native -mtune=native'
Thread model: posix
gcc version 7.3.1 20180429 (GCC)

$ g++-8.1.1 -v
Using built-in specs.
COLLECT_GCC=g++-8.1.1
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-pc-linux-gnu/8.1.1/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --enable-languages=c,c++ --disable-multilib --program-suffix=-8.1.1 --disable-bootstrap CFLAGS='-O2 -march=native -mtune=native' CXXFLAGS='-O2 -march=native -mtune=native'
Thread model: posix
gcc version 8.1.1 20180502 (GCC)
Comment 1 Jakub Jelinek 2018-07-26 11:19:51 UTC
GCC 8.2 has been released.
Comment 2 Alexander Monakov 2018-08-10 13:54:26 UTC
This is interesting. First scev-cprop emits the computation of the final value of 'buf' before the inner loop. Then ivopts emits the same, but *after* the loop. In gcc-8, ivopts uses a slightly more efficient form, though, so dom3 fails to unify the two and we end up with redundant computations. Previously the two final-value computations were identical and dom3 managed to clean them up.

Final value replacement is unnecessary in this case: 'buf' remains used in the loop, and the final value is non-constant, so it doesn't lead to further simplification. I believe we should try to throttle down scev-cprop (especially for -Os, but likely for -O2 as well).

Emitting the final value in the loop preheader instead of after the exit may also be useful, as it may reduce the number of variables that live across the loop.

(-fno-tree-scev-cprop leads to good code on the testcase)
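
For reference, applied to the command line from the original report, that workaround looks like:

$ g++-8.1.1 -g -O3 -Wall -fno-tree-scev-cprop a32.cpp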
Comment 3 Alexander Monakov 2018-08-13 14:53:08 UTC
(first paragraph of previous comment had 'before'/'after' swapped, sorry)

Also note that adjusting the testcase to use size_t rather than unsigned int for the 'len' variable (as production code probably should) avoids the issue: final value replacement doesn't need to special case len==0 then.
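
That is (the attachment's exact signature is an assumption on my part), the adjustment would be changing

  void Update(const void* buf, unsigned int len);

to

  void Update(const void* buf, size_t len);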
Comment 4 Richard Biener 2018-12-20 12:32:44 UTC
Looking at trunk we have

  # RANGE [1, 5552] NONZERO 8191
  # iftmp.2_14 = PHI <5552(6), len_29(5)>
  # prephitmp_53 = PHI <_52(6), 0(5)>
  _36 = (sizetype) iftmp.2_14;
  _37 = buf_11 + _36;

  <bb 8> [local count: 1073741824]:
  # buf_12 = PHI <buf_11(7), buf_21(11)>
...

  <bb 9> [local count: 118111601]:
  # _17 = PHI <_6(8)>
  # _59 = PHI <_8(8)>
  _46 = iftmp.2_14 + 4294967295;
  _45 = (sizetype) _46;
  _44 = _45 + 1;
  buf_47 = buf_11 + _44;

where the final-value replacement failed to elide the +-1 dance because

  sizetype _45;
  unsigned int _46;

IVOPTs is using tree-affine which performs some extra tricks such as
using range-info to prove it can elide the casts.  I think if we
teach the same to SCEV we'd get it optimized again.  I'm talking about

            /* If inner type has wrapping overflow behavior, fold conversion
               for below case:
                 (T1)(X - CST) -> (T1)X - (T1)CST
               if X - CST doesn't overflow by range information.  Also handle
               (T1)(X + CST) as (T1)(X - (-CST)).  */
            if (TYPE_UNSIGNED (itype)
...

Note that the plan is to remove the unconditional final-value replacement
and integrate it into passes that would benefit - first of all DCE which
can trigger it if the IV (or the whole loop) can be eliminated.
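
Spelled out with the range info from the dump above: iftmp.2_14 is in [1, 5552], so in unsigned int arithmetic

  _46 = iftmp.2_14 + 4294967295  ==  iftmp.2_14 - 1         (no wrap, range [0, 5551])
  _44 = (sizetype) _46 + 1       ==  (sizetype) iftmp.2_14  ==  _36

and hence buf_47 == _37, so once the cast is folded this way the second computation can be CSEd away.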
Comment 5 Jakub Jelinek 2019-02-22 15:21:37 UTC
GCC 8.3 has been released.
Comment 6 Jakub Jelinek 2020-03-04 09:37:29 UTC
GCC 8.4.0 has been released, adjusting target milestone.
Comment 7 Jakub Jelinek 2021-02-03 18:09:22 UTC
Bisection on size shows
r8-1162-gba00284cedbcb0b980af4e5e41f427581af64462
as the first size difference, then
r9-2350-gf80d6a178ed2043c6a9a4d8076cdc72d61b67b2b
for a smaller growth,
r11-3874-gec5e6467091ee0f8de2f894f0c1669465a8440f1
was apparently a temporary growth only,
r11-4569-g33c0f246f799b7403171e97f31276a8feddd05c9
grew the other Adler32 function.
Anyway, the first one seems to be the largest size difference.
Comment 8 Jakub Jelinek 2021-02-03 18:36:08 UTC
Anyway, the problem discussed in #c2 to #c4 doesn't seem to exist in 10/trunk anymore, since:
r10-2806-gdf7d46d925c7baca7bf9961aee900876d8aef225
And at least some of the changes that grew the size are desirable for speed (and -O3 is not a size optimization), so is there anything we want to do about this PR?
Comment 9 Jakub Jelinek 2021-05-14 09:50:26 UTC
GCC 8 branch is being closed.
Comment 10 Richard Biener 2021-06-01 08:11:30 UTC
GCC 9.4 is being released, retargeting bugs to GCC 9.5.
Comment 11 Richard Biener 2022-05-27 09:39:01 UTC
GCC 9 branch is being closed
Comment 12 Jakub Jelinek 2022-06-28 10:35:17 UTC
GCC 10.4 is being released, retargeting bugs to GCC 10.5.
Comment 13 Richard Biener 2023-07-07 10:33:51 UTC
GCC 10 branch is being closed.
Comment 14 Andrew Pinski 2024-02-22 21:52:07 UTC
Seems like it has been improved again for GCC 12.