[Bug tree-optimization/98113] New: [11 Regression] popcnt is not vectorized on s390 since f5e18dd9c7da

iii at linux dot ibm.com gcc-bugzilla@gcc.gnu.org
Thu Dec 3 01:28:53 GMT 2020


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98113

            Bug ID: 98113
           Summary: [11 Regression] popcnt is not vectorized on s390 since
                    f5e18dd9c7da
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: iii at linux dot ibm.com
  Target Milestone: ---

The s390 test vxe/popcount-1.c began to fail after the fix for PR96789
(commit f5e18dd9c7da).

The reason is that for the following source code

uv4si __attribute__((noinline))
vpopctf (uv4si a)
{
  uv4si r;
  int i;

  for (i = 0; i < 4; i++)
    r[i] = __builtin_popcount (a[i]);

  return r;
}

FRE turned

  _4 = BIT_FIELD_REF <aD.2283, 32, 0>;
  _11 = __builtin_popcountD.1211 (_4);
  _18 = (unsigned intD.9) _11;
  BIT_FIELD_REF <rD.2286, 32, 0> = _18;
  i_20 = 1;
  ivtmp_21 = 3;
  _25 = VIEW_CONVERT_EXPR<unsigned intD.9[4]>(aD.2283)[i_20];
  _26 = __builtin_popcountD.1211 (_25);
  _27 = (unsigned intD.9) _26;
  VIEW_CONVERT_EXPR<unsigned intD.9[4]>(rD.2286)[i_20] = _27;
  i_29 = i_20 + 1;
  ivtmp_30 = ivtmp_21 + 4294967295;
  _34 = VIEW_CONVERT_EXPR<unsigned intD.9[4]>(aD.2283)[i_29];
  _35 = __builtin_popcountD.1211 (_34);
  _36 = (unsigned intD.9) _35;
  VIEW_CONVERT_EXPR<unsigned intD.9[4]>(rD.2286)[i_29] = _36;
  i_38 = i_29 + 1;
  ivtmp_39 = ivtmp_30 + 4294967295;
  _1 = VIEW_CONVERT_EXPR<unsigned intD.9[4]>(aD.2283)[i_38];
  _2 = __builtin_popcountD.1211 (_1);
  _3 = (unsigned intD.9) _2;
  VIEW_CONVERT_EXPR<unsigned intD.9[4]>(rD.2286)[i_38] = _3;
  i_10 = i_38 + 1;
  ivtmp_16 = ivtmp_39 + 4294967295;
  _7 = rD.2286;
  rD.2286 ={v} {CLOBBER};
  return _7;

into

  _4 = BIT_FIELD_REF <a_17(D), 32, 0>;
  _11 = __builtin_popcountD.1211 (_4);
  _18 = (unsigned intD.9) _11;
  r_14 = BIT_INSERT_EXPR <r_15(D), _18, 0 (32 bits)>;
  _25 = BIT_FIELD_REF <a_17(D), 32, 32>;
  _26 = __builtin_popcountD.1211 (_25);
  _27 = (unsigned intD.9) _26;
  r_33 = BIT_INSERT_EXPR <r_14, _27, 32 (32 bits)>;
  _34 = BIT_FIELD_REF <a_17(D), 32, 64>;
  _35 = __builtin_popcountD.1211 (_34);
  _36 = (unsigned intD.9) _35;
  r_32 = BIT_INSERT_EXPR <r_33, _36, 64 (32 bits)>;
  _1 = BIT_FIELD_REF <a_17(D), 32, 96>;
  _2 = __builtin_popcountD.1211 (_1);
  _3 = (unsigned intD.9) _2;
  r_31 = BIT_INSERT_EXPR <r_32, _3, 96 (32 bits)>;
  _7 = r_31;
  return _7;

That is, FRE replaced the sequence of stores into r with a sequence of
BIT_INSERT_EXPRs.

The slp1 pass now reports "missed:  not vectorized: no grouped stores in basic
block", presumably because it does not understand BIT_INSERT_EXPRs.


More information about the Gcc-bugs mailing list