Bug 95201 - Some x86 vector-extend patterns are not exercised.
Summary: Some x86 vector-extend patterns are not exercised.
Status: UNCONFIRMED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 11.0
: P3 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer 92658
  Show dependency treegraph
 
Reported: 2020-05-19 09:52 UTC by Uroš Bizjak
Modified: 2020-05-19 11:01 UTC (History)
1 user (show)

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed:


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Uroš Bizjak 2020-05-19 09:52:26 UTC
Some of x86 vector extend patterns are not exercised by middle end. Currently, they are XFAILed in gcc.target/i386/pr92658-*.c:


pr92658-avx2.c:/* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */
pr92658-sse4.c:/* { dg-final { scan-assembler-times "pmovzxbd" 2 { xfail *-*-* } } } */
pr92658-sse4.c:/* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */
pr92658-sse4.c:/* { dg-final { scan-assembler-times "pmovzxwq" 2 { xfail *-*-* } } } */

These correspond to:

-O2 -ftree-vectorize -mavx2 is required:

--cut here--
typedef unsigned char v32qi __attribute__((vector_size (32)));
typedef unsigned short v16hi __attribute__((vector_size (32)));
typedef unsigned int v8si __attribute__((vector_size (32)));
typedef unsigned long long v4di __attribute__((vector_size (32)));

void
foo_u8_u64 (v4di * dst, v32qi * __restrict src)
{
  unsigned long long tem[4];
  tem[0] = (*src)[0];
  tem[1] = (*src)[1];
  tem[2] = (*src)[2];
  tem[3] = (*src)[3];
  dst[0] = *(v4di *) tem;
}

void
bar_u8_u64 (v4di * dst, v32qi src)
{
  unsigned long long tem[4];
  tem[0] = src[0];
  tem[1] = src[1];
  tem[2] = src[2];
  tem[3] = src[3];
  dst[0] = *(v4di *) tem;
}

/* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */
--cut here--

-O2 -ftree-vectorize -msse4.1 is required:

--cut here--
void
foo_u8_u32 (v4si * dst, v16qi * __restrict src)
{
  unsigned int tem[4];
  tem[0] = (*src)[0];
  tem[1] = (*src)[1];
  tem[2] = (*src)[2];
  tem[3] = (*src)[3];
  dst[0] = *(v4si *) tem;
}

void
bar_u8_u32 (v4si * dst, v16qi src)
{
  unsigned int tem[4];
  tem[0] = src[0];
  tem[1] = src[1];
  tem[2] = src[2];
  tem[3] = src[3];
  dst[0] = *(v4si *) tem;
}

/* { dg-final { scan-assembler-times "pmovzxbd" 2 { xfail *-*-* } } } */

void
foo_u8_u64 (v2di * dst, v16qi * __restrict src)
{
  unsigned long long tem[2];
  tem[0] = (*src)[0];
  tem[1] = (*src)[1];
  dst[0] = *(v2di *) tem;
}

void
bar_u8_u64 (v2di * dst, v16qi src)
{
  unsigned long long tem[2];
  tem[0] = src[0];
  tem[1] = src[1];
  dst[0] = *(v2di *) tem;
}

/* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */

void
foo_u16_u64 (v2di * dst, v8hi * __restrict src)
{
  unsigned long long tem[2];
  tem[0] = (*src)[0];
  tem[1] = (*src)[1];
  dst[0] = *(v2di *) tem;
}

void
bar_u16_u64 (v2di * dst, v8hi src)
{
  unsigned long long tem[2];
  tem[0] = src[0];
  tem[1] = src[1];
  dst[0] = *(v2di *) tem;
}

/* { dg-final { scan-assembler-times "pmovzxwq" 2 { xfail *-*-* } } } */

Please note that these testcases fail to vectorize also in their loop forms, e.g.:

--cut here--
void
foo_u8_u64 (v4di * dst, v32qi * __restrict src)
{
  unsigned long long tem[4];

  for (int i = 0; i < 4; i++)
    tem[i] = (*src)[i];

  dst[0] = *(v4di *) tem;
}

void
bar_u8_u64 (v4di * dst, v32qi src)
{
  unsigned long long tem[4];

  for (int i = 0; i < 4; i++)
    tem[i] = src[i];

  dst[0] = *(v4di *) tem;
}
--cut here--

Please see also PR 92658#c8 for some analysis.