Bug 93435 - [8/9 Regression] Hang with -O2 on innocuous looking code with GCC 8.3
Summary: [8/9 Regression] Hang with -O2 on innocuous looking code with GCC 8.3
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 10.0
Importance: P3 normal
Target Milestone: 8.5
Assignee: Not yet assigned to anyone
URL:
Keywords: compile-time-hog
Depends on:
Blocks:
 
Reported: 2020-01-25 22:05 UTC by Ryan Livingston
Modified: 2021-04-30 07:58 UTC (History)
3 users (show)

See Also:
Host:
Target:
Build:
Known to work: 8.4.1, 9.3.1
Known to fail:
Last reconfirmed: 2020-01-27 00:00:00


Attachments
Perf report (55.05 KB, text/plain)
2020-01-27 10:18 UTC, Martin Liška
Details

Note You need to log in before you can comment on or make changes to this bug.
Description Ryan Livingston 2020-01-25 22:05:02 UTC
We're working on upgrading our code base to gcc 8.3 and hit a test which hung. Upon further investigation the gcc invocation was hanging. A standalone repro from that code is:

/* Scalar aliases used by the reproducer. */
typedef signed char int8_T;
typedef int int32_T;

/* Leaf of the nesting: a single signed-char field. */
typedef struct {
  int8_T a;
} struct0_T;

/*
 * Each of the following types wraps an array of four elements of the
 * previous type, so the number of leaf struct0_T objects grows by a
 * factor of four per level (struct_T holds 4 leaves, b_struct_T 16, ...).
 */
typedef struct {
  struct0_T f10[4];
} struct_T;

typedef struct {
  struct_T f9[4];
} b_struct_T;

typedef struct {
  b_struct_T f8[4];
} c_struct_T;

typedef struct {
  c_struct_T f7[4];
} d_struct_T;

typedef struct {
  d_struct_T f6[4];
} e_struct_T;

typedef struct {
  e_struct_T f5[4];
} f_struct_T;

typedef struct {
  f_struct_T f4[4];
} g_struct_T;

typedef struct {
  g_struct_T f3[4];
} h_struct_T;

typedef struct {
  h_struct_T f2[4];
} i_struct_T;

typedef struct {
  i_struct_T f1[4];
} j_struct_T;

/*
 * Caller-provided storage for the largest aggregates: four j_struct_T
 * (ds21), four i_struct_T (ds20) and one scratch i_struct_T (r9), all
 * grouped under the single member f0.
 */
typedef struct {
  struct {
    j_struct_T ds21[4];
    i_struct_T ds20[4];
    i_struct_T r9;
  } f0;
} deep_struct_arraysStackData;

/* Function Definitions */
/*
 * Reproducer body: builds progressively larger aggregates by whole-struct
 * assignment, then reads and updates a few individual leaves.
 *
 *   SD      - caller-provided storage holding the largest aggregates.
 *   in1     - value stored in the leaf struct `r`, which is then replicated
 *             into every level; also added to one leaf on each loop pass.
 *   inCount - loop trip count (the loop runs while count < inCount); also
 *             selects whether 14 or 16 is written to a leaf of ds21.
 *   out1    - receives the first leaf of SD->f0.ds20[0].
 *   out2    - receives the last leaf of SD->f0.ds20[3].
 *   out3    - four-element output: elements 0..2 receive `r`, element 3
 *             receives the ds21 leaf written after the loop.
 *
 * Only ds20[0] and ds20[3] are assigned; ds20[1] and ds20[2] are not
 * written by this function.
 */
void deep_struct_arrays(deep_struct_arraysStackData *SD,
  int8_T in1, int8_T inCount, int8_T *out1, int8_T *out2, struct0_T out3[4])
{
  struct0_T r;
  struct_T r1;
  b_struct_T r2;
  c_struct_T r3;
  d_struct_T r4;
  e_struct_T r5;
  f_struct_T r6;
  g_struct_T r7;
  h_struct_T r8;
  int32_T count;
  int32_T i;

  /*  Check properties of input in1 */
  /*  Check properties of input inCount */
  /*  Copyright 2006 The MathWorks, Inc. */
  /* Seed the leaf, then fill all four slots of each level with a copy of
     the level below it (r -> r1 -> r2 -> ... -> r8). */
  r.a = in1;
  r1.f10[0] = r;
  r1.f10[1] = r;
  r1.f10[2] = r;
  r1.f10[3] = r;
  r2.f9[0] = r1;
  r2.f9[1] = r1;
  r2.f9[2] = r1;
  r2.f9[3] = r1;
  r3.f8[0] = r2;
  r3.f8[1] = r2;
  r3.f8[2] = r2;
  r3.f8[3] = r2;
  r4.f7[0] = r3;
  r4.f7[1] = r3;
  r4.f7[2] = r3;
  r4.f7[3] = r3;
  r5.f6[0] = r4;
  r5.f6[1] = r4;
  r5.f6[2] = r4;
  r5.f6[3] = r4;
  r6.f5[0] = r5;
  r6.f5[1] = r5;
  r6.f5[2] = r5;
  r6.f5[3] = r5;
  r7.f4[0] = r6;
  r7.f4[1] = r6;
  r7.f4[2] = r6;
  r7.f4[3] = r6;
  r8.f3[0] = r7;
  r8.f3[1] = r7;
  r8.f3[2] = r7;
  r8.f3[3] = r7;
  /* The last two levels live in the caller-provided storage. */
  SD->f0.r9.f2[0] = r8;
  SD->f0.r9.f2[1] = r8;
  SD->f0.r9.f2[2] = r8;
  SD->f0.r9.f2[3] = r8;
  SD->f0.ds20[0] = SD->f0.r9;
  SD->f0.ds20[3] = SD->f0.r9;
  count = 0;
  while (count < inCount) {
    /* Add in1 to the first leaf of ds20[0], clamped to [-128, 127]
       before narrowing back to int8_T. */
    i = in1 + SD->f0.ds20[0].f2[0].f3[0].f4[0].f5[0].f6[0].f7[0].f8[0].f9[0]
      .f10[0].a;
    if (i > 127) {
      i = 127;
    } else {
      if (i < -128) {
        i = -128;
      }
    }

    SD->f0.ds20[0].f2[0].f3[0].f4[0].f5[0].f6[0].f7[0].f8[0].f9[0].f10[0].a =
      (int8_T)i;
    /* Add 3 to the last leaf of ds20[3]; only the upper bound is clamped. */
    i = SD->f0.ds20[3].f2[3].f3[3].f4[3].f5[3].f6[3].f7[3].f8[3].f9[3].f10[3].a
      + 3;
    if (i > 127) {
      i = 127;
    }

    SD->f0.ds20[3].f2[3].f3[3].f4[3].f5[3].f6[3].f7[3].f8[3].f9[3].f10[3].a =
      (int8_T)i;
    count++;
  }

  /* Write a constant into one leaf of ds21, chosen by inCount. */
  if (inCount > 10) {
    SD->f0.ds21[0].f1[1].f2[2].f3[3].f4[3].f5[3].f6[3].f7[3].f8[3].f9[3].f10[3].
      a = 14;
  } else {
    SD->f0.ds21[0].f1[1].f2[2].f3[3].f4[3].f5[3].f6[3].f7[3].f8[3].f9[3].f10[3].
      a = 16;
  }

  /* Publish the two updated ds20 leaves, the original leaf struct, and the
     ds21 leaf written above. */
  *out1 = SD->f0.ds20[0].f2[0].f3[0].f4[0].f5[0].f6[0].f7[0].f8[0].f9[0].f10[0].
    a;
  *out2 = SD->f0.ds20[3].f2[3].f3[3].f4[3].f5[3].f6[3].f7[3].f8[3].f9[3].f10[3].
    a;
  out3[0] = r;
  out3[1] = r;
  out3[2] = r;
  out3[3] = SD->f0.ds21[0].f1[1].f2[2].f3[3].f4[3].f5[3].f6[3].f7[3].f8[3].f9[3]
    .f10[3];
}

That causes a hang when compiled with -O2. Adding in -Q gives:

gcc -O2 -Q -c foo.c                                                                     
 deep_struct_arrays
Analyzing compilation unit
Performing interprocedural optimizations
 <*free_lang_data> <visibility> <build_ssa_passes> <opt_local_passes> <targetclone> <free-fnsummary> <whole-program> <profile_estimate> <icf> <devirt> <cp> <fnsummary> <inline> <pure-const> <free-fnsummary> <static-var> <single-use> <comdats>Assembling functions:
 <materialize-all-clones> <simdclone> deep_struct_arrays {GC 401309k -> 221133k}

with no progress past that point. Adding in:

  -Wall -Wextra -fno-strict-aliasing -fwrapv 

shows no warnings and the same hang is observed.

gcc -v
Using built-in specs.
COLLECT_GCC=.../gcc-8.3.0/bin/gcc
COLLECT_LTO_WRAPPER=.../gcc-8.3.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/8.3.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: .../gcc-8.3/configure --with-gmp=.../gcc-8.3/gmp-4.3 --with-mpfr=../gcc-8.3/mpfr --with-mpc=...gcc-8.3/mpc --enable-languages=c,c++,fortran --enable-shared --enable-linker-build-id --enable-plugin --enable-checking=release --enable-multiarch --enable-gold --enable-ld=default --enable-libstdcxx-time=no --prefix=gcc-8.3.0 --with-pkgversion='MW GCC 8.3.0-gold' --with-tune=generic --with-system-zlib --enable-multilib --with-multilib-list=m32,m64 --with-arch-directory=amd64 --with-arch-32=i586 --with-abi=m64
Thread model: posix
gcc version 8.3.0 (GCC 8.3.0-gold)
Comment 1 Martin Liška 2020-01-27 10:18:05 UTC
Created attachment 47713 [details]
Perf report

Confirmed, also current master is affected.
We spend time in pass_dse.
Comment 2 Martin Liška 2020-01-27 10:21:15 UTC
Probably started with r6-3380-gd0f4e7fcdbde6134.
Comment 3 Jakub Jelinek 2020-01-27 10:33:43 UTC
I guess the main problem is that SRA turns those ~ 100 statements into ~350000 statements and e.g. tree DSE then hangs on that.
Comment 4 Richard Biener 2020-01-27 11:24:20 UTC
So SRA probably shouldn't do that.  Somehow it passed

  unsigned HOST_WIDE_INT max_scalarization_size
    = get_move_ratio (optimize_speed_p) * UNITS_PER_WORD;

which is a bit of an odd thing since we test it against

            if (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (var)))
                <= max_scalarization_size)

later (UNITS vs. bits).  MOVE_RATIO can be as big as 17 on x86_64 (which
is also the default), times 8 that's 136 bytes - but we test against
bits above, so I don't see how it qualifies...

Then tree DSE does have some rate-limiting of its walks as well.
Comment 5 Jeffrey A. Law 2020-02-27 19:20:43 UTC
Yea, we end up with ~260k stores in the key block.

The rate limiters in tree/gimple DSE are the number of queries we make per store and the size of object for which we'll do byte tracking.

Clearly with hundreds of thousands of stores we may want another limiter -- obviously a user could write code that triggers this problem directly.  I'll figure out a reasonable limiter for DSE.

It also seems like SRA went a little bananas here and that's probably more important to fix since it's taking fairly sensible code and exploding it beyond sensibility.
Comment 6 Jakub Jelinek 2020-03-04 09:44:39 UTC
GCC 8.4.0 has been released, adjusting target milestone.
Comment 7 Jakub Jelinek 2020-03-17 13:26:39 UTC
Martin, could you please have a look?
Comment 8 Martin Jambor 2020-03-19 13:31:40 UTC
The issue actually started with my r8-344-2bba75411e1 and it is
basically a perfect SRA bomb, it makes SRA sub-access propagation
across assignments create gazillions of accesses and then
replacements, because they facilitate forward propagation (and as the
ccp3 dump shows, they do).

I already have a patch that simply limits the number of replacements
to a param, defaulting to 128, which makes the testcase compilation
finish in about 9 seconds on my machine.  However, SRA analysis still
takes 7 seconds of that, so I'm looking at capping the propagation
earlier.  That takes more book-keeping, so at least for backports, I'd
like to use the simpler approach on released branches.
Comment 9 GCC Commits 2020-03-20 23:21:35 UTC
The master branch has been updated by Martin Jambor <jamborm@gcc.gnu.org>:

https://gcc.gnu.org/g:29f23ed79b60949fc60f6fdbbd931bd58090b241

commit r10-7309-g29f23ed79b60949fc60f6fdbbd931bd58090b241
Author: Martin Jambor <mjambor@suse.cz>
Date:   Sat Mar 21 00:21:02 2020 +0100

    sra: Cap number of sub-access propagations with a param (PR 93435)
    
    PR 93435 is a perfect SRA bomb.  It initializes an array of 16 chars
    element-wise, then uses that to initialize an aggregate that consists
    of four such arrays, that one to initialize one four times as big as
    the previous one all the way to an aggregate that has 64kb.
    
    This causes the sub-access propagation across assignments to create
    thousands of byte-sized artificial accesses which are then eligible to
    be replaced - they do facilitate forward propagation but there is
    enough of them for DSE to never finish.
    
    This patch avoids that situation by accounting how many of such
    replacements can be created per SRA candidate.  The default value of
    32 was just the largest power of two that did not slow down
    compilation of the testcase, but it should also hopefully be big
    enough for any reasonable input that might rely on the optimization.
    
    2020-03-20  Martin Jambor  <mjambor@suse.cz>
    
            PR tree-optimization/93435
            * params.opt (sra-max-propagations): New parameter.
            * tree-sra.c (propagation_budget): New variable.
            (budget_for_propagation_access): New function.
            (propagate_subaccesses_from_rhs): Use it.
            (propagate_subaccesses_from_lhs): Likewise.
            (propagate_all_subaccesses): Set up and destroy propagation_budget.
    
            gcc/testsuite/
            * gcc.dg/tree-ssa/pr93435.c: New test.
Comment 10 Martin Jambor 2020-03-20 23:24:50 UTC
Fixed on trunk with https://gcc.gnu.org/pipermail/gcc-patches/2020-March/542390.html
Comment 11 GCC Commits 2020-04-03 18:33:22 UTC
The releases/gcc-9 branch has been updated by Martin Jambor <jamborm@gcc.gnu.org>:

https://gcc.gnu.org/g:a1bb16994caed4dacf8c9ee1a33b177df140e9dc

commit r9-8449-ga1bb16994caed4dacf8c9ee1a33b177df140e9dc
Author: Martin Jambor <mjambor@suse.cz>
Date:   Fri Apr 3 20:32:44 2020 +0200

    gcc-9 sra: Cap number of sub-access propagations with a param (PR 93435)
    
    This is non-trivial but rather straightforward backport of
    29f23ed79b60949fc60f6fdbbd931bd58090b241 from master.  See
    https://gcc.gnu.org/pipermail/gcc-patches/2020-March/542390.html for
    more information.
    
    2020-04-02  Martin Jambor  <mjambor@suse.cz>
    
            PR tree-optimization/93435
            * params.def (PARAM_SRA_MAX_PROPAGATIONS): New parameter.
            * tree-sra.c (propagation_budget): New variable.
            (budget_for_propagation_access): New function.
            (propagate_subaccesses_across_link): Use it.
            (propagate_all_subaccesses): Set up and destroy propagation_budget.
            * doc/invoke.texi (sra-max-propagations): New.
    
            testsuite/
            * gcc.dg/tree-ssa/pr93435.c: New test.
Comment 12 GCC Commits 2020-04-03 20:01:46 UTC
The releases/gcc-8 branch has been updated by Martin Jambor <jamborm@gcc.gnu.org>:

https://gcc.gnu.org/g:b445ceec81ba3f4afad8c3ead1e58f14f1c2e146

commit r8-10163-gb445ceec81ba3f4afad8c3ead1e58f14f1c2e146
Author: Martin Jambor <mjambor@suse.cz>
Date:   Fri Apr 3 22:01:17 2020 +0200

    gcc-8 sra: Cap number of sub-access propagations with a param (PR 93435)
    
    This is non-trivial but rather straightforward backport of
    29f23ed79b60949fc60f6fdbbd931bd58090b241 from master.  See
    https://gcc.gnu.org/pipermail/gcc-patches/2020-March/542390.html for
    more information.
    
    Bootstrapped and tested on gcc-8 branch.
    
    2020-04-03  Martin Jambor  <mjambor@suse.cz>
    
            PR tree-optimization/93435
            * params.def (PARAM_SRA_MAX_PROPAGATIONS): New parameter.
            * tree-sra.c (propagation_budget): New variable.
            (budget_for_propagation_access): New function.
            (propagate_subaccesses_across_link): Use it.
            (propagate_all_subaccesses): Set up and destroy propagation_budget.
            * doc/invoke.texi (sra-max-propagations): New.
    
            testsuite/
            * gcc.dg/tree-ssa/pr93435.c: New test.
Comment 13 Martin Jambor 2020-04-03 20:09:54 UTC
The problematic behavior of SRA is now fixed on master and both open
release branches so I consider my work done here.

I'm leaving the bug open in case Jeff wants to add some DSE limiter
like he wrote in comment #5.
Comment 14 Richard Biener 2021-04-30 07:58:49 UTC
Let's close the bug.  The DSE issue should be tracked elsewhere with an actual testcase that still fails.