[638] % gcctk -v Using built-in specs. COLLECT_GCC=gcctk COLLECT_LTO_WRAPPER=/local/suz-local/software/local/gcc-trunk/libexec/gcc/x86_64-pc-linux-gnu/12.0.0/lto-wrapper Target: x86_64-pc-linux-gnu Configured with: ../gcc-trunk/configure --disable-bootstrap --prefix=/local/suz-local/software/local/gcc-trunk --enable-languages=c,c++ --disable-werror --enable-multilib --with-system-zlib Thread model: posix Supported LTO compression algorithms: zlib gcc version 12.0.0 20210422 (experimental) [master revision 3cf04d1afa8:0e51007a40c:d42088e453042f4f8ba9190a7e29efd937ea2181] (GCC) [639] % [639] % gcctk -O1 -S -o O1.s small.c [640] % gcctk -O3 -S -o O3.s small.c [641] % [641] % wc O1.s O3.s 62 135 857 O1.s 93 200 1337 O3.s 155 335 2194 total [642] % [642] % grep foo O1.s [643] % grep foo O3.s call foo [644] % [644] % cat small.c extern void foo(void); int b, c, d, e, *h; static int *f = &e; static int a() { return 1; } static void g() { if (!*f) for (; 1; d++) ; foo(); } static void i() { int j, l = 0, k[24] = {0}, *m[2] = {&k[4], &l}, n[27]; h = n; if (a() & n[0]) for (; c; c--) ; int p[8]; h = p; p[0] && (h = &j); e = 0; } static void o() { int *q, **r = &q, ***s[1]; s[0] = &r; i(); g(); } int main() { if (b) o(); return 0; }
-O3 changes the inlining parameters ever so slightly, and as a result we no longer inline as much: /* -O3 parameters. */ { OPT_LEVELS_3_PLUS, OPT__param_max_inline_insns_auto_, NULL, 30 }, { OPT_LEVELS_3_PLUS, OPT__param_early_inlining_insns_, NULL, 14 }, { OPT_LEVELS_3_PLUS, OPT__param_inline_heuristics_hint_percent_, NULL, 600 }, { OPT_LEVELS_3_PLUS, OPT__param_inline_min_speedup_, NULL, 15 }, { OPT_LEVELS_3_PLUS, OPT__param_max_inline_insns_single_, NULL, 200 },
Indeed, this is a similar interaction between inlining, static variable const promotion, and IPA-CP / inline heuristics.
Here the stack frame size of i is estimated at 244 bytes: void i () { int p[8]; int n[27]; int k[24]; int l; int j; int _1; int _2; int _3; int c.2_4; int _5; <bb 2> [local count: 236223200]: l = 0; k = {}; h = &n; _1 = n[0]; _2 = _1 & 1; if (_2 != 0) goto <bb 8>; [50.00%] else goto <bb 5>; [50.00%] <bb 8> [local count: 118111600]: goto <bb 4>; [100.00%] <bb 3> [local count: 955630225]: _3 = c.2_4 + -1; c = _3; <bb 4> [local count: 1073741824]: c.2_4 = c; if (c.2_4 != 0) goto <bb 3>; [89.00%] else goto <bb 5>; [11.00%] <bb 5> [local count: 236223200]: h = &p; _5 = p[0]; if (_5 != 0) goto <bb 6>; [50.00%] else goto <bb 7>; [50.00%] <bb 6> [local count: 118111600]: h = &j; <bb 7> [local count: 236223200]: e = 0; j ={v} {CLOBBER}; l ={v} {CLOBBER}; k ={v} {CLOBBER}; n ={v} {CLOBBER}; p ={v} {CLOBBER}; return; } So it indeed has larger arrays. k is initialized but never used (so that is a missed DSE). n is used in a stupid way: h = &n; _1 = n[0]; where h is a write-only static variable, but we do not know that during early opts (we could try our luck and schedule one extra write-only detection before the early optimization passes, but I am not sure it is worth it). I would say that the main issue is also the missed DSE.
Seems to be fixed in GCC 13.1.0.