Bug 97139 - [11 Regression] Miscompare of foreman_test_baseline_encodelog.out in 464.h264ref since r11-3319-g48b0c1250a5c7d72be6b3fbbb1117d1cce43daee
Summary: [11 Regression] Miscompare of foreman_test_baseline_encodelog.out in 464.h264...
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 11.0
: P3 normal
Target Milestone: 11.0
Assignee: Not yet assigned to anyone
URL:
Keywords: wrong-code
Depends on:
Blocks: spec
  Show dependency treegraph
 
Reported: 2020-09-21 10:30 UTC by Martin Liška
Modified: 2020-09-21 14:54 UTC (History)
1 user (show)

See Also:
Host:
Target:
Build:
Known to work: 10.2.0
Known to fail: 11.0
Last reconfirmed: 2020-09-21 00:00:00


Attachments
patch I am testing (492 bytes, patch)
2020-09-21 13:54 UTC, Richard Biener
Details | Diff

Note You need to log in before you can comment on or make changes to this bug.
Description Martin Liška 2020-09-21 10:30:16 UTC
Since r11-3068-gfb51be60c8e7b697 the SPEC 2006 benchmark is miscompiled with:
-Ofast -march=znver2 -g -flto=128:

cat /home/marxin/Programming/cpu2006/benchspec/CPU2006/464.h264ref/run/run_peak_test_amd64-m64-mine.0004/foreman_test_baseline_encodelog.out.mis
0015:  0000(IDR)   21952 0 28  37.383  41.260  42.850        0       0     FRM    99
       0000(IDR)   21960 0 28  37.383  41.260  42.850        0       0     FRM    99
                       ^
0046:     149400    21952    21952 
          149400    21960    21960 
                        ^
0047:     186750    21952    21952 
          186750    21960    21960 
                        ^
0048:     224100    21952    21952 
          224100    21960    21960 
                        ^
0049:     261450    21952    21952 
          261450    21960    21960 
                        ^
0050:     298800    21952    21952 
          298800    21960    21960 
                        ^
0051:     336150    21952    21952 
          336150    21960    21960 
                        ^
0052:     373500    21952    21952 
          373500    21960    21960 
                        ^
0053:     410850    21952    21952 
          410850    21960    21960 
                        ^
0076:   Total bits                        : 124680 (I 21952, P 102552, NVB 176) 
        Total bits                        : 124688 (I 21960, P 102552, NVB 176) 
                                                 ^
Comment 1 Martin Liška 2020-09-21 12:07:38 UTC
So it started with r11-3319-g48b0c1250a5c7d72be6b3fbbb1117d1cce43daee (plus one needs patch from fb51be60c8e7b697).
Comment 2 Martin Liška 2020-09-21 13:18:37 UTC
So one can reproduce it with current master with:
-O3 -march=znver2 -g -flto=auto -flto-partition=one

where first bad debug counter value is:
-fdbg-cnt=vect_slp:357

The change is:

before:
  _648 = pix.x;
  _649 = _648 / 4;
  pix.x = _649;
  _650 = pix.y;
  _651 = _650 / 4;
  pix.y = _651;
  _652 = pix.pos_x;
  _653 = _652 / 4;
  pix.pos_x = _653;
  _654 = pix.pos_y;
  _655 = _654 / 4;
  pix.pos_y = _655;
...

after:

  vect__648.11467_1941 = MEM <vector(4) int> [(int *)&pix + 8B];
  _1946 = vect__648.11467_1941 < { 0, 0, 0, 0 };
  vect_patt_180.11468_1944 = VEC_COND_EXPR <_1946, { 3, 3, 3, 3 }, { 0, 0, 0, 0 }>;
  vect_patt_182.11469_1947 = vect__648.11467_1941 + vect_patt_180.11468_1944;
  vect_patt_185.11470_1966 = vect_patt_182.11469_1947 >> 2;
  _1967 = BIT_FIELD_REF <vect_patt_185.11470_1966, 32, 0>;
  _1968 = BIT_FIELD_REF <vect_patt_185.11470_1966, 32, 32>;
  _1950 = BIT_FIELD_REF <vect_patt_182.11469_1947, 32, 0>;
  _1965 = BIT_FIELD_REF <vect_patt_182.11469_1947, 32, 32>;
  _1943 = BIT_FIELD_REF <vect_patt_180.11468_1944, 32, 0>;
  _1942 = BIT_FIELD_REF <vect_patt_180.11468_1944, 32, 32>;
  _1945 = &pix.x + 4;
  _648 = pix.x;
  _649 = _648 / 4;
  _650 = pix.y;
  _651 = _650 / 4;
  _652 = pix.pos_x;
  _653 = _652 / 4;
  _654 = pix.pos_y;
  _655 = _654 / 4;
  MEM <vector(4) int> [(int *)&pix + 8B] = vect_patt_185.11470_1966;
...

So a vector shift by 2 instead of division by 4. It looks fine (assuming the trick for negative numbers is correct).
@Richi: can you please dig into it?
Comment 3 Martin Liška 2020-09-21 13:32:28 UTC
Ok, I've got the problem, it's bit later in the function:
The diff is:

diff -u before.txt after.txt
--- before.txt	2020-09-21 15:29:56.462394644 +0200
+++ after.txt	2020-09-21 15:29:48.086453128 +0200
@@ -1,15 +1,24 @@
+  vect__648.11467_1941 = MEM <vector(4) int> [(int *)&pix + 8B];
+  _1946 = vect__648.11467_1941 < { 0, 0, 0, 0 };
+  vect_patt_180.11468_1944 = VEC_COND_EXPR <_1946, { 3, 3, 3, 3 }, { 0, 0, 0, 0 }>;
+  vect_patt_182.11469_1947 = vect__648.11467_1941 + vect_patt_180.11468_1944;
+  vect_patt_185.11470_1966 = vect_patt_182.11469_1947 >> 2;
+  _1967 = BIT_FIELD_REF <vect_patt_185.11470_1966, 32, 0>;
+  _1968 = BIT_FIELD_REF <vect_patt_185.11470_1966, 32, 32>;
+  _1950 = BIT_FIELD_REF <vect_patt_182.11469_1947, 32, 0>;
+  _1965 = BIT_FIELD_REF <vect_patt_182.11469_1947, 32, 32>;
+  _1943 = BIT_FIELD_REF <vect_patt_180.11468_1944, 32, 0>;
+  _1942 = BIT_FIELD_REF <vect_patt_180.11468_1944, 32, 32>;
+  _1945 = &pix.x + 4;
   _648 = pix.x;
   _649 = _648 / 4;
-  pix.x = _649;
   _650 = pix.y;
   _651 = _650 / 4;
-  pix.y = _651;
   _652 = pix.pos_x;
   _653 = _652 / 4;
-  pix.pos_x = _653;
   _654 = pix.pos_y;
   _655 = _654 / 4;
-  pix.pos_y = _655;
+  MEM <vector(4) int> [(int *)&pix + 8B] = vect_patt_185.11470_1966;
   _492 = active_pps;
   _493 = _492->constrained_intra_pred_flag;
   pretmp_2084 = pix.mb_addr;
@@ -25,12 +34,12 @@
   _511 = *_510;
   _512 = subblock_x_278 / 2;
   _513 = _512 * 2;
-  _515 = _513 + _649;
+  _515 = _513 + _1943;
   _516 = (long unsigned int) _515;
   _517 = _516 * 8;
   _518 = _511 + _517;
   _519 = *_518;
-  _521 = _651 + 4;
+  _521 = _1942 + 4;
   _522 = (long unsigned int) _521;
   _523 = _522 * 4;
   _524 = _519 + _523;

The bad assignmed is
+  _515 = _513 + _1943;

where
+  _1943 = BIT_FIELD_REF <vect_patt_180.11468_1944, 32, 0>;

which is bogus VEC_COND_EXPR <_1946, { 3, 3, 3, 3 }, { 0, 0, 0, 0 }>;
while in the original code we do:
-  _515 = _513 + _649;
where _649 is pix.x / 4

So I think correct would be
+  _515 = _513 + _1967;
?
Comment 4 Richard Biener 2020-09-21 13:54:00 UTC
Created attachment 49245 [details]
patch I am testing

So the issue must be a bit subtle, I am testing the following patch on SPEC and will try to construct a testcase meanwhile.
Comment 5 Richard Biener 2020-09-21 14:03:36 UTC
#include "tree-vect.h"

int pix[4];

int __attribute__((noipa)) foo (void)
{
  pix[0] = pix[0] / 4;
  pix[1] = pix[1] / 4;
  pix[2] = pix[2] / 4;
  pix[3] = pix[3] / 4;
  return pix[0] + pix[1] + pix[2] + pix[3];
}

int main ()
{
  check_vect ();

  pix[0] = 8;
  pix[1] = 16;
  pix[2] = 32;
  pix[3] = 64;
  if (foo () != 30) 
    __builtin_abort ();
  return 0;
}
Comment 6 GCC Commits 2020-09-21 14:54:15 UTC
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:e6f58fb6196ba16ce070e3722451f040a13f963b

commit r11-3327-ge6f58fb6196ba16ce070e3722451f040a13f963b
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Sep 21 16:51:33 2020 +0200

    tree-optimization/97139 - fix BB SLP live lane extraction
    
    This fixes SLP live lane extraction with pattern stmts.
    
    2020-09-21  Richard Biener  <rguenther@suse.de>
    
            PR tree-optimization/97139
            * tree-vect-slp.c (vect_bb_slp_mark_live_stmts): Only mark the
            pattern root, track visited vectorized stmts.
    
            * gcc.dg/vect/pr97139.c: New testcase.
Comment 7 Richard Biener 2020-09-21 14:54:29 UTC
Fixed.