Some of the x86 vector extend patterns are not exercised by the middle end. Currently, they are XFAILed in gcc.target/i386/pr92658-*.c: pr92658-avx2.c:/* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */ pr92658-sse4.c:/* { dg-final { scan-assembler-times "pmovzxbd" 2 { xfail *-*-* } } } */ pr92658-sse4.c:/* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */ pr92658-sse4.c:/* { dg-final { scan-assembler-times "pmovzxwq" 2 { xfail *-*-* } } } */ These correspond to the following testcases. -O2 -ftree-vectorize -mavx2 is required: --cut here-- typedef unsigned char v32qi __attribute__((vector_size (32))); typedef unsigned short v16hi __attribute__((vector_size (32))); typedef unsigned int v8si __attribute__((vector_size (32))); typedef unsigned long long v4di __attribute__((vector_size (32))); void foo_u8_u64 (v4di * dst, v32qi * __restrict src) { unsigned long long tem[4]; tem[0] = (*src)[0]; tem[1] = (*src)[1]; tem[2] = (*src)[2]; tem[3] = (*src)[3]; dst[0] = *(v4di *) tem; } void bar_u8_u64 (v4di * dst, v32qi src) { unsigned long long tem[4]; tem[0] = src[0]; tem[1] = src[1]; tem[2] = src[2]; tem[3] = src[3]; dst[0] = *(v4di *) tem; } /* { dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */ --cut here-- -O2 -ftree-vectorize -msse4.1 is required: --cut here-- void foo_u8_u32 (v4si * dst, v16qi * __restrict src) { unsigned int tem[4]; tem[0] = (*src)[0]; tem[1] = (*src)[1]; tem[2] = (*src)[2]; tem[3] = (*src)[3]; dst[0] = *(v4si *) tem; } void bar_u8_u32 (v4si * dst, v16qi src) { unsigned int tem[4]; tem[0] = src[0]; tem[1] = src[1]; tem[2] = src[2]; tem[3] = src[3]; dst[0] = *(v4si *) tem; } /* { dg-final { scan-assembler-times "pmovzxbd" 2 { xfail *-*-* } } } */ void foo_u8_u64 (v2di * dst, v16qi * __restrict src) { unsigned long long tem[2]; tem[0] = (*src)[0]; tem[1] = (*src)[1]; dst[0] = *(v2di *) tem; } void bar_u8_u64 (v2di * dst, v16qi src) { unsigned long long tem[2]; tem[0] = src[0]; tem[1] = src[1]; dst[0] = *(v2di *) tem; } /* { 
dg-final { scan-assembler-times "pmovzxbq" 2 { xfail *-*-* } } } */ void foo_u16_u64 (v2di * dst, v8hi * __restrict src) { unsigned long long tem[2]; tem[0] = (*src)[0]; tem[1] = (*src)[1]; dst[0] = *(v2di *) tem; } void bar_u16_u64 (v2di * dst, v8hi src) { unsigned long long tem[2]; tem[0] = src[0]; tem[1] = src[1]; dst[0] = *(v2di *) tem; } /* { dg-final { scan-assembler-times "pmovzxwq" 2 { xfail *-*-* } } } */ Please note that these testcases also fail to vectorize in their loop forms, e.g.: --cut here-- void foo_u8_u64 (v4di * dst, v32qi * __restrict src) { unsigned long long tem[4]; for (int i = 0; i < 4; i++) tem[i] = (*src)[i]; dst[0] = *(v4di *) tem; } void bar_u8_u64 (v4di * dst, v32qi src) { unsigned long long tem[4]; for (int i = 0; i < 4; i++) tem[i] = src[i]; dst[0] = *(v4di *) tem; } --cut here-- Please also see PR 92658#c8 for some analysis.
This is now fixed.