Created attachment 44062 [details] source Attached the source of a simple Adler32 checksum class. The Update() fn is 32 bytes longer compared to the code generated with 7.3.1. Dump of assembler code for function Adler32::Update(void const*, unsigned int): 7.3.1 0x0000000000400500 <+0>: test %edx,%edx 7.3.1 0x0000000000400502 <+2>: je 0x400578 <Adler32::Update(void const*, unsigned int)+120> 7.3.1 0x0000000000400504 <+4>: mov (%rdi),%ecx 7.3.1 0x0000000000400506 <+6>: mov 0x4(%rdi),%r8d 7.3.1 0x000000000040050a <+10>: mov $0x80078071,%r10d 7.3.1 0x0000000000400510 <+16>: xor %r9d,%r9d 7.3.1 0x0000000000400513 <+19>: cmp $0x15af,%edx 7.3.1 0x0000000000400519 <+25>: jbe 0x400527 <Adler32::Update(void const*, unsigned int)+39> 7.3.1 0x000000000040051b <+27>: lea -0x15b0(%rdx),%r9d 7.3.1 0x0000000000400522 <+34>: mov $0x15b0,%edx 7.3.1 0x0000000000400527 <+39>: lea -0x1(%rdx),%eax 7.3.1 0x000000000040052a <+42>: lea 0x1(%rsi,%rax,1),%rdx 7.3.1 0x000000000040052f <+47>: nop 7.3.1 0x0000000000400530 <+48>: add $0x1,%rsi 7.3.1 0x0000000000400534 <+52>: movzbl -0x1(%rsi),%eax 7.3.1 0x0000000000400538 <+56>: add %eax,%ecx 7.3.1 0x000000000040053a <+58>: add %ecx,%r8d 7.3.1 0x000000000040053d <+61>: cmp %rdx,%rsi 7.3.1 0x0000000000400540 <+64>: mov %ecx,(%rdi) 7.3.1 0x0000000000400542 <+66>: mov %r8d,0x4(%rdi) 7.3.1 0x0000000000400546 <+70>: jne 0x400530 <Adler32::Update(void const*, unsigned int)+48> 7.3.1 0x0000000000400548 <+72>: mov %ecx,%eax 7.3.1 0x000000000040054a <+74>: mul %r10d 7.3.1 0x000000000040054d <+77>: mov %r8d,%eax 7.3.1 0x0000000000400550 <+80>: shr $0xf,%edx 7.3.1 0x0000000000400553 <+83>: imul $0xfff1,%edx,%edx 7.3.1 0x0000000000400559 <+89>: sub %edx,%ecx 7.3.1 0x000000000040055b <+91>: mul %r10d 7.3.1 0x000000000040055e <+94>: mov %ecx,(%rdi) 7.3.1 0x0000000000400560 <+96>: shr $0xf,%edx 7.3.1 0x0000000000400563 <+99>: imul $0xfff1,%edx,%edx 7.3.1 0x0000000000400569 <+105>: sub %edx,%r8d 7.3.1 0x000000000040056c <+108>: test %r9d,%r9d 7.3.1 
0x000000000040056f <+111>: mov %r9d,%edx 7.3.1 0x0000000000400572 <+114>: mov %r8d,0x4(%rdi) 7.3.1 0x0000000000400576 <+118>: jne 0x400510 <Adler32::Update(void const*, unsigned int)+16> 7.3.1 0x0000000000400578 <+120>: repz retq Dump of assembler code for function Adler32::Update(void const*, unsigned int): 8.1.1 0x0000000000400500 <+0>: test %edx,%edx 8.1.1 0x0000000000400502 <+2>: je 0x400598 <Adler32::Update(void const*, unsigned int)+152> 8.1.1 0x0000000000400508 <+8>: mov (%rdi),%ecx 8.1.1 0x000000000040050a <+10>: mov 0x4(%rdi),%r8d 8.1.1 0x000000000040050e <+14>: push %rbx 8.1.1 0x000000000040050f <+15>: mov $0x80078071,%ebx 8.1.1 0x0000000000400514 <+20>: nopl 0x0(%rax) 8.1.1 0x0000000000400518 <+24>: xor %r11d,%r11d 8.1.1 0x000000000040051b <+27>: cmp $0x15af,%edx 8.1.1 0x0000000000400521 <+33>: jbe 0x40052f <Adler32::Update(void const*, unsigned int)+47> 8.1.1 0x0000000000400523 <+35>: lea -0x15b0(%rdx),%r11d 8.1.1 0x000000000040052a <+42>: mov $0x15b0,%edx 8.1.1 0x000000000040052f <+47>: mov %edx,%r10d 8.1.1 0x0000000000400532 <+50>: mov %rsi,%rax 8.1.1 0x0000000000400535 <+53>: add %rsi,%r10 8.1.1 0x0000000000400538 <+56>: nopl 0x0(%rax,%rax,1) 8.1.1 0x0000000000400540 <+64>: add $0x1,%rax 8.1.1 0x0000000000400544 <+68>: movzbl -0x1(%rax),%r9d 8.1.1 0x0000000000400549 <+73>: add %r9d,%ecx 8.1.1 0x000000000040054c <+76>: add %ecx,%r8d 8.1.1 0x000000000040054f <+79>: mov %ecx,(%rdi) 8.1.1 0x0000000000400551 <+81>: mov %r8d,0x4(%rdi) 8.1.1 0x0000000000400555 <+85>: cmp %r10,%rax 8.1.1 0x0000000000400558 <+88>: jne 0x400540 <Adler32::Update(void const*, unsigned int)+64> 8.1.1 0x000000000040055a <+90>: lea -0x1(%rdx),%eax 8.1.1 0x000000000040055d <+93>: lea 0x1(%rsi,%rax,1),%rsi 8.1.1 0x0000000000400562 <+98>: mov %ecx,%eax 8.1.1 0x0000000000400564 <+100>: mul %ebx 8.1.1 0x0000000000400566 <+102>: mov %r8d,%eax 8.1.1 0x0000000000400569 <+105>: shr $0xf,%edx 8.1.1 0x000000000040056c <+108>: imul $0xfff1,%edx,%edx 8.1.1 0x0000000000400572 <+114>: sub 
%edx,%ecx 8.1.1 0x0000000000400574 <+116>: mul %ebx 8.1.1 0x0000000000400576 <+118>: mov %ecx,(%rdi) 8.1.1 0x0000000000400578 <+120>: shr $0xf,%edx 8.1.1 0x000000000040057b <+123>: imul $0xfff1,%edx,%edx 8.1.1 0x0000000000400581 <+129>: sub %edx,%r8d 8.1.1 0x0000000000400584 <+132>: mov %r11d,%edx 8.1.1 0x0000000000400587 <+135>: mov %r8d,0x4(%rdi) 8.1.1 0x000000000040058b <+139>: test %r11d,%r11d 8.1.1 0x000000000040058e <+142>: jne 0x400518 <Adler32::Update(void const*, unsigned int)+24> 8.1.1 0x0000000000400590 <+144>: pop %rbx 8.1.1 0x0000000000400591 <+145>: retq 8.1.1 0x0000000000400592 <+146>: nopw 0x0(%rax,%rax,1) 8.1.1 0x0000000000400598 <+152>: retq Here is an interwoven version, hopefully easier to follow: 7.3.1 0x0000000000400500 <+0>: test %edx,%edx 7.3.1 0x0000000000400502 <+2>: je 0x400578 <Adler32::Update(void const*, unsigned int)+120> 7.3.1 0x0000000000400504 <+4>: mov (%rdi),%ecx 7.3.1 0x0000000000400506 <+6>: mov 0x4(%rdi),%r8d 7.3.1 0x000000000040050a <+10>: mov $0x80078071,%r10d 8.1.1 0x0000000000400500 <+0>: test %edx,%edx 8.1.1 0x0000000000400502 <+2>: je 0x400598 <Adler32::Update(void const*, unsigned int)+152> 8.1.1 0x0000000000400508 <+8>: mov (%rdi),%ecx 8.1.1 0x000000000040050a <+10>: mov 0x4(%rdi),%r8d 8.1.1 0x000000000040050e <+14>: push %rbx 8.1.1 0x000000000040050f <+15>: mov $0x80078071,%ebx 8.1.1 0x0000000000400514 <+20>: nopl 0x0(%rax) Two things so far: - the je is 6 bytes in 8.1.1 vs 2 bytes in 7.3.1 because the jump offset can't fit in a byte - in 8.1.1 ebx is used for the modulo magic, which is callee saved, so have to push before use 7.3.1 0x0000000000400510 <+16>: xor %r9d,%r9d 7.3.1 0x0000000000400513 <+19>: cmp $0x15af,%edx 7.3.1 0x0000000000400519 <+25>: jbe 0x400527 <Adler32::Update(void const*, unsigned int)+39> 7.3.1 0x000000000040051b <+27>: lea -0x15b0(%rdx),%r9d 7.3.1 0x0000000000400522 <+34>: mov $0x15b0,%edx 7.3.1 0x0000000000400527 <+39>: lea -0x1(%rdx),%eax 7.3.1 0x000000000040052a <+42>: lea 
0x1(%rsi,%rax,1),%rdx 7.3.1 0x000000000040052f <+47>: nop 8.1.1 0x0000000000400518 <+24>: xor %r11d,%r11d 8.1.1 0x000000000040051b <+27>: cmp $0x15af,%edx 8.1.1 0x0000000000400521 <+33>: jbe 0x40052f <Adler32::Update(void const*, unsigned int)+47> 8.1.1 0x0000000000400523 <+35>: lea -0x15b0(%rdx),%r11d 8.1.1 0x000000000040052a <+42>: mov $0x15b0,%edx 8.1.1 0x000000000040052f <+47>: mov %edx,%r10d 8.1.1 0x0000000000400532 <+50>: mov %rsi,%rax 8.1.1 0x0000000000400535 <+53>: add %rsi,%r10 8.1.1 0x0000000000400538 <+56>: nopl 0x0(%rax,%rax,1) This is the inner loop init, pretty similar. 7.3.1 0x0000000000400530 <+48>: add $0x1,%rsi 7.3.1 0x0000000000400534 <+52>: movzbl -0x1(%rsi),%eax 7.3.1 0x0000000000400538 <+56>: add %eax,%ecx 7.3.1 0x000000000040053a <+58>: add %ecx,%r8d 7.3.1 0x000000000040053d <+61>: cmp %rdx,%rsi 7.3.1 0x0000000000400540 <+64>: mov %ecx,(%rdi) 7.3.1 0x0000000000400542 <+66>: mov %r8d,0x4(%rdi) 7.3.1 0x0000000000400546 <+70>: jne 0x400530 <Adler32::Update(void const*, unsigned int)+48> 8.1.1 0x0000000000400540 <+64>: add $0x1,%rax 8.1.1 0x0000000000400544 <+68>: movzbl -0x1(%rax),%r9d 8.1.1 0x0000000000400549 <+73>: add %r9d,%ecx 8.1.1 0x000000000040054c <+76>: add %ecx,%r8d 8.1.1 0x000000000040054f <+79>: mov %ecx,(%rdi) 8.1.1 0x0000000000400551 <+81>: mov %r8d,0x4(%rdi) 8.1.1 0x0000000000400555 <+85>: cmp %r10,%rax 8.1.1 0x0000000000400558 <+88>: jne 0x400540 <Adler32::Update(void const*, unsigned int)+64> These are the same, except the cmp is before/after the two stores, and the movzbl and the first add is one byte cheaper in 7.3.1. 
7.3.1 0x0000000000400548 <+72>: mov %ecx,%eax 7.3.1 0x000000000040054a <+74>: mul %r10d 7.3.1 0x000000000040054d <+77>: mov %r8d,%eax 7.3.1 0x0000000000400550 <+80>: shr $0xf,%edx 7.3.1 0x0000000000400553 <+83>: imul $0xfff1,%edx,%edx 7.3.1 0x0000000000400559 <+89>: sub %edx,%ecx 7.3.1 0x000000000040055b <+91>: mul %r10d 7.3.1 0x000000000040055e <+94>: mov %ecx,(%rdi) 7.3.1 0x0000000000400560 <+96>: shr $0xf,%edx 7.3.1 0x0000000000400563 <+99>: imul $0xfff1,%edx,%edx 7.3.1 0x0000000000400569 <+105>: sub %edx,%r8d 7.3.1 0x000000000040056c <+108>: test %r9d,%r9d 7.3.1 0x000000000040056f <+111>: mov %r9d,%edx 7.3.1 0x0000000000400572 <+114>: mov %r8d,0x4(%rdi) 7.3.1 0x0000000000400576 <+118>: jne 0x400510 <Adler32::Update(void const*, unsigned int)+16> 7.3.1 0x0000000000400578 <+120>: repz retq 8.1.1 0x000000000040055a <+90>: lea -0x1(%rdx),%eax 8.1.1 0x000000000040055d <+93>: lea 0x1(%rsi,%rax,1),%rsi 8.1.1 0x0000000000400562 <+98>: mov %ecx,%eax 8.1.1 0x0000000000400564 <+100>: mul %ebx 8.1.1 0x0000000000400566 <+102>: mov %r8d,%eax 8.1.1 0x0000000000400569 <+105>: shr $0xf,%edx 8.1.1 0x000000000040056c <+108>: imul $0xfff1,%edx,%edx 8.1.1 0x0000000000400572 <+114>: sub %edx,%ecx 8.1.1 0x0000000000400574 <+116>: mul %ebx 8.1.1 0x0000000000400576 <+118>: mov %ecx,(%rdi) 8.1.1 0x0000000000400578 <+120>: shr $0xf,%edx 8.1.1 0x000000000040057b <+123>: imul $0xfff1,%edx,%edx 8.1.1 0x0000000000400581 <+129>: sub %edx,%r8d 8.1.1 0x0000000000400584 <+132>: mov %r11d,%edx 8.1.1 0x0000000000400587 <+135>: mov %r8d,0x4(%rdi) 8.1.1 0x000000000040058b <+139>: test %r11d,%r11d 8.1.1 0x000000000040058e <+142>: jne 0x400518 <Adler32::Update(void const*, unsigned int)+24> 8.1.1 0x0000000000400590 <+144>: pop %rbx 8.1.1 0x0000000000400591 <+145>: retq 8.1.1 0x0000000000400592 <+146>: nopw 0x0(%rax,%rax,1) 8.1.1 0x0000000000400598 <+152>: retq The loop variables in 8.1.1 handled a bit differently, that's why the minor size increase. 
The two lea's here in the 8.1.1 version are very similar to the ones before the inner loop in 7.3.1, which calculate the ptr range of the inner loop. These lea's here calculate the start ptr for the next inner loop run; however, this ptr is already present in rax. I don't know if this is some special optimization, or rax should have been used but the compiler missed it. The double retq at the end is interesting, but maybe the debug info is to blame. cmdline used to compile: $ g++-7.3.1 -g -O3 -Wall a32.cpp $ g++-8.1.1 -g -O3 -Wall a32.cpp Platform is AMD64 (FX-8150), Debian 9.4 $ g++-7.3.1 -v Using built-in specs. COLLECT_GCC=g++-7.3.1 COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-pc-linux-gnu/7.3.1/lto-wrapper Target: x86_64-pc-linux-gnu Configured with: ../configure --enable-languages=c,c++ --disable-multilib --program-suffix=-7.3.1 --disable-bootstrap CFLAGS='-O2 -march=native -mtune=native' CXXFLAGS='-O2 -march=native -mtune=native' Thread model: posix gcc version 7.3.1 20180429 (GCC) $ g++-8.1.1 -v Using built-in specs. COLLECT_GCC=g++-8.1.1 COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-pc-linux-gnu/8.1.1/lto-wrapper Target: x86_64-pc-linux-gnu Configured with: ../configure --enable-languages=c,c++ --disable-multilib --program-suffix=-8.1.1 --disable-bootstrap CFLAGS='-O2 -march=native -mtune=native' CXXFLAGS='-O2 -march=native -mtune=native' Thread model: posix gcc version 8.1.1 20180502 (GCC)
GCC 8.2 has been released.
This is interesting. First scev-cprop emits computation of final value of 'buf' before the inner loop. Then ivopts emits the same, but *after* the loop. In gcc-8, ivopts uses a slightly more efficient form though, so dom3 fails to unify the two, and we end up with redundant computations. Previously the two final-value computations were identical and dom3 managed to clean them up. Final value replacement is unnecessary in this case: 'buf' remains used in the loop and the final value is non-constant, so it doesn't lead to further simplification. I believe we should try to throttle down scev-cprop (especially for -Os, but likely for -O2 as well). Emitting the final value in the loop preheader instead of after the exit may also be useful as it may reduce the number of variables that are live across the loop. (-fno-tree-scev-cprop leads to good code on the testcase)
(first paragraph of previous comment had 'before'/'after' swapped, sorry) Also note that adjusting the testcase to use size_t rather than unsigned int for the 'len' variable (as production code probably should) avoids the issue: final value replacement doesn't need to special case len==0 then.
Looking at trunk we have # RANGE [1, 5552] NONZERO 8191 # iftmp.2_14 = PHI <5552(6), len_29(5)> # prephitmp_53 = PHI <_52(6), 0(5)> _36 = (sizetype) iftmp.2_14; _37 = buf_11 + _36; <bb 8> [local count: 1073741824]: # buf_12 = PHI <buf_11(7), buf_21(11)> ... <bb 9> [local count: 118111601]: # _17 = PHI <_6(8)> # _59 = PHI <_8(8)> _46 = iftmp.2_14 + 4294967295; _45 = (sizetype) _46; _44 = _45 + 1; buf_47 = buf_11 + _44; where the final-value replacement failed to elide the +-1 dance because sizetype _45; unsigned int _46; IVOPTs is using tree-affine which performs some extra tricks such as using range-info to prove it can elide the casts. I think if we teach the same to SCEV we'd get it optimized again. I'm talking about /* If inner type has wrapping overflow behavior, fold conversion for below case: (T1)(X - CST) -> (T1)X - (T1)CST if X - CST doesn't overflow by range information. Also handle (T1)(X + CST) as (T1)(X - (-CST)). */ if (TYPE_UNSIGNED (itype) ... Note that the plan is to remove the unconditional final-value replacement and integrate it into passes that would benefit - first of all DCE which can trigger it if the IV (or the whole loop) can be eliminated.
GCC 8.3 has been released.
GCC 8.4.0 has been released, adjusting target milestone.
Bisection on size shows r8-1162-gba00284cedbcb0b980af4e5e41f427581af64462 as the first size difference, then r9-2350-gf80d6a178ed2043c6a9a4d8076cdc72d61b67b2b for a smaller growth, r11-3874-gec5e6467091ee0f8de2f894f0c1669465a8440f1 was apparently a temporary growth only, r11-4569-g33c0f246f799b7403171e97f31276a8feddd05c9 grew the other Adler32 function. Anyway, the first one seems to be the largest size difference.
Anyway, the problem discussed in #c2 to #c4 doesn't seem to exist in 10/trunk anymore, since: r10-2806-gdf7d46d925c7baca7bf9961aee900876d8aef225 And at least some of the changes that grew size are desirable for speed (and -O3 is not size optimization), so is there anything we want to do about this PR?
GCC 8 branch is being closed.
GCC 9.4 is being released, retargeting bugs to GCC 9.5.
GCC 9 branch is being closed.
GCC 10.4 is being released, retargeting bugs to GCC 10.5.
GCC 10 branch is being closed.
Seems like it has been improved again for GCC 12.