Bug 97821 - [9/10/11/12 Regression] wrong code with -ftree-vectorize at -O1 on x86_64-pc-linux-gnu by r6-3608
Summary: [9/10/11/12 Regression] wrong code with -ftree-vectorize at -O1 on x86_64-pc-...
Status: RESOLVED INVALID
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 11.0
Importance: P2 normal
Target Milestone: 9.5
Assignee: Richard Biener
URL:
Keywords: wrong-code
Depends on:
Blocks:
 
Reported: 2020-11-13 22:17 UTC by Zhendong Su
Modified: 2022-02-02 09:00 UTC (History)
7 users (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2022-01-29 00:00:00


Attachments
for the testsuite (1.22 KB, text/plain)
2020-11-16 10:26 UTC, Richard Biener
Details
somewhat reduced testcase (1.09 KB, text/plain)
2022-02-01 15:47 UTC, Richard Biener
Details

Note You need to log in before you can comment on or make changes to this bug.
Description Zhendong Su 2020-11-13 22:17:21 UTC
The code is valid, but it is hard to reduce, so still quite large.

[509] % gcctk -v
Using built-in specs.
COLLECT_GCC=gcctk
COLLECT_LTO_WRAPPER=/local/suz-local/software/local/gcc-trunk/libexec/gcc/x86_64-pc-linux-gnu/11.0.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../gcc-trunk/configure --disable-bootstrap --prefix=/local/suz-local/software/local/gcc-trunk --enable-languages=c,c++ --disable-werror --enable-multilib --with-system-zlib
Thread model: posix
Supported LTO compression algorithms: zlib
gcc version 11.0.0 20201113 (experimental) [master revision 54896b10dbe:c3a97a9df4b:a514934a0565255276adaa4fbd4aa35579ec33c6] (GCC) 
[510] % 
[510] % gcctk -O1 small.c; ./a.out
00005030-170
[511] % gcctk -O1 -ftree-vectorize small.c; ./a.out
00005030-176
[512] % 
[512] % cat small.c
/* printf is declared by hand; the testcase includes no headers.  */
int printf (const char *, ...);

/* File-scope state read and written by aq() and main().  */
static unsigned a, f, v;
int b, h, aa, ab, ac, ad, ae, y, z, af;
static long c, m, t, ag, ah = 3;
static signed d;
static char e, ai;
static short g, j = 1, o, w;
int *i, *s;
long long l;
/* aq() stores the address of one of its block-scope locals into n and
   later dereferences n.  */
static int *n;
char p;
static int q;
static int r;
static int u;
short x;
/* aj points at l; aq() stores through it.  */
long long *aj = &l;
static signed ak;
/* Volatile objects: the expression statements in aq() that merely name
   al or am are volatile accesses.  */
static volatile unsigned al = 5;
static volatile short am = 1;
/* Returns its second argument unchanged; the first argument is ignored.  */
int *an(int *ao, int *ap) { return ap; }
/* Body of the testcase.  The only return statement returns 0; the visible
   effects are the printf output and updates to file-scope variables.

   The first loop stores the address of its block-scope variable av into
   the file-scope pointer n (through ay, which points at n).  The second
   loop nest later reads *n into h, outside the block in which av is
   declared.  */
static int aq() {
  int ar[] = {2, 2, 2, 2, 2, 2};
  short *as = &x;
  int at[] = {0, 1, 0, 1};
  int au = ab = 0;
  for (; m <= 1; m++) {
    int av = 0, k, aw = e && u, ax = aw || ag;
    /* ay aliases the file-scope pointer n.  */
    int **ay = &n;
    for (; ab; ab++)
      ac = 0;
    for (; ac; ac++)
      am;
    u &&am;
    short az = am || a ^ w;
    unsigned bc = am & w | am || ag;
  /* Target of the `goto ba` in the second loop nest below; that jump
     enters this block without executing the declarations above.  */
  ba:
    aw = u;
    i = 0;
    for (; i; i++)
      b = a;
    printf("0");
    if (p) {
      printf("%ld", ag);
      continue;
    }
    if (ag) {
      printf("7");
      e = w | ag<e> c < ax;
    }
    if (w) {
      printf("%d", u);
      goto bb;
    }
    if (u)
      printf("%d", e);
    s = &k;
    u = aw;
    t = 0;
    for (; t <= 1; t++)
      /* an() returns its second argument, so this stores &av into n.  */
      *ay = an(&au, &av);
    e++;
  }
  for (; r >= 0;)
    for (; ag <= 5;) {
      signed bd[6];
      int be = 0, bf = am % al;
      for (; be < 6; be++)
        bd[0] = 9;
      h = 0;
      for (; h <= 5; h++)
        *aj = *as = aa;
      for (; w; w = d)
        ;
      short bg = d + j ^ e + r;
      al % am;
      int bi = bg & al >> am;
      am ^ al;
      am / al;
      am &al;
      al;
      am / al;
      if (c)
        if (q) {
          be = 0;
          for (; be; be++)
            z = 0;
        }
      am;
      int bj = 0;
      if (m || q) {
      bh:
        l = ad = c;
        int bm = al || q;
        al;
        al;
        char bn = al || q;
        al;
        al;
        bm = q;
        ae = a;
      bk:
        ai = h || q > d;
        ag = d;
        al;
        al;
        printf("%d", q);
        if (a > 1)
          break;
        if (q)
          printf("%d", d);
        if (q) {
          printf("3");
          h = d | bm > q;
          goto bk;
        }
        if (!ai || al && 0) {
          printf("%d", d);
          al;
          printf("%d", a);
          goto bb;
        }
        d = al;
        printf("%lld", l);
        m = q;
        if (ak) {
          printf("%ld", c);
          ad = c & q;
        }
        if (!ah) {
          printf("%d", q);
          goto bh;
        }
      }
      if (c)
        s = &bj;
      m = q = d && c;
      r = ~(e / j & al > r);
      f |= d = al;
      v |= am;
      al / al ^ am;
      ak = am + al | al;
      am / al + al ^ am;
      j = am;
      al;
    /* Also reached by the `goto bb` in the first loop, which enters this
       block without executing the declarations above.  */
    bb:
      if (c)
        g++;
      a = q || e & d;
      am || al;
      am;
      am;
      am;
      al &am;
      am;
      am;
    bl:
      am;
      if (q) {
        printf("%d", q);
        a = q - am;
        goto bl;
      }
      am;
      printf("%d", d);
      m = q & am;
      am;
      printf("%d", a);
      if (d < -41) {
        printf("%ld", ag);
        goto ba;
      }
      /* Reads through n.  When n still holds &av from the first loop, the
         block that declares av has already been exited at this point.  */
      h = *n;
      printf("3");
      c = e / d;
      printf("%ld", m);
      d = d << q / ag;
      o = 2;
      for (; o; o++)
        i = &be;
      x = m = e;
      printf("%d", r) && (ah = r) || (d = ak && e);
      printf("%d", ak);
      if (!bf) {
        printf("%d", e);
        *as = a;
        i = n;
        bi = ak / am > r;
        *n = 0;
        for (; n; n++)
          ;
      }
      y = bi;
    }
  return 0;
}
/* Driver: calls aq() while af < 6, setting d to 8 before each call, then
   prints the final value of h followed by a newline.  */
int main() {
  for (; af < 6; af++) {
    d = 8;
    aq();
  }
  printf("%d\n", h);
  return 0;
}
Comment 1 H.J. Lu 2020-11-14 18:47:06 UTC
It was caused by r6-3608.
Comment 2 Richard Biener 2020-11-16 10:26:15 UTC
Created attachment 49568 [details]
for the testsuite
Comment 3 Richard Biener 2020-11-16 10:48:17 UTC
Hmm.  It takes quite some time for us to elide the dead stores to bd, ar and at
(and only ar and at are vectorized).  I guess we run into some alias-walk
limits of DCE and vectorizing the stores fixes that.   Unfortunately those
limits are hard-coded:

          if (/* Constant but quadratic for small functions.  */
              total_chain > 128 * 128
              /* Linear in the number of may-defs.  */
              && total_chain > 32 * longest_chain
              /* Linear in the number of uses.  */
              && total_chain > nr_walks * 32)
            {

this points to the known issue of stack var coalescing with CLOBBERs and
indeed -fstack-reuse=none fixes the testcase, without actually spotting
the bad coalescing.

So related to that duplicate bug we have about this issue.
Comment 4 Jakub Jelinek 2021-02-03 16:25:04 UTC
Do you know the PR number of that other bug?
Comment 5 Richard Biener 2021-02-04 09:02:12 UTC
(In reply to Jakub Jelinek from comment #4)
> Do you know the PR number of that other bug?

PR90348
Comment 6 Jakub Jelinek 2021-05-14 09:54:06 UTC
GCC 8 branch is being closed.
Comment 7 Richard Biener 2021-06-01 08:18:54 UTC
GCC 9.4 is being released, retargeting bugs to GCC 9.5.
Comment 8 Andrew Pinski 2021-11-09 07:53:30 UTC
Hmm, the trunk fails even with -fno-tree-vectorize -O1 :).
Comment 9 Richard Biener 2022-01-31 14:35:12 UTC
(In reply to Andrew Pinski from comment #8)
> Hmm, the trunk fails even with -fno-tree-vectorize -O1 :).

Yep, and my fix for the stack slot sharing issue doesn't fix it :/  But I've now looked for an extended time and cannot figure out why the sharing should be invalid...

So maybe the bug is something different after all.

Disabling gimple DSE or DCE "fixes" it unfortunately.  Since the only hint that it is the same as PR90348 was that -fstack-reuse=none helps, I'm no longer sure it is.
Comment 10 Richard Biener 2022-02-01 15:22:11 UTC
OK, so meanwhile when adding more births when we add clobbers during late transforms I get to improve the situation (now it only fails at -Os and -O2 -flto).
Comment 11 Richard Biener 2022-02-01 15:47:06 UTC
Created attachment 52325 [details]
somewhat reduced testcase

Somewhat reduced testcase.

Note there are uninit diagnostics because we jump into the code skipping
initialization for 'bi', 'bf' and 'ax'.  Those might all be false positives
and the partitions we coalesce do not involve those variables (but their
init is conditionally skipped)

Partition 3: size 4 align 4
        bj      k
Partition 2: size 4 align 4
        be      av
Comment 12 Richard Biener 2022-02-02 09:00:04 UTC
Hmm, cvise reduced it to the following for me - clearly invalid in this case
since 'av' is no longer live at h = *n.

res[];
pos, h;
*n;
**aq_ay = &n;
main() {
  /* aq_ay points at n, so this stores &av into n; av's scope ends at the
     closing brace of this inner block.  */
  { int av = *aq_ay = &av; }
  /* n still holds the address of av, whose block has been exited.  */
  h = *n;
  res[pos] = h;
  __builtin_puts(res);
}

OK, so the original testcase has

  for (; m <= 1; m++) {
    int av = 0, k, aw = e && u, ax = aw || ag;
    int **ay = &n;
...
    for (; t <= 1; t++)
      *ay = an(&au, &av);
    e++;
  }

which effectively does

     n = &av;

and in the loop following that

  for (; r >= 0;)
    for (; ag <= 5;) {
...
      h = *n; 
    }

that's invalid since 'av' is no longer live here.

-fsanitize=address shows this:

=================================================================
==24252==ERROR: AddressSanitizer: stack-use-after-scope on address 0x7fffffffdd20 at pc 0x000000401827 bp 0x7fffffffdca0 sp 0x7fffffffdc98
READ of size 4 at 0x7fffffffdd20 thread T0
    #0 0x401826 in aq /tmp/t.c:173
    #1 0x401ba3 in main /tmp/t.c:200
    #2 0x7ffff6e4029c in __libc_start_main (/lib64/libc.so.6+0x3529c)
    #3 0x400859 in _start (/tmp/a.out+0x400859)

meh, I only tried -fsanitize=undefined so far :/

The symptom (bad stack slot sharing) is exactly what you'd expect from such
an error.