[Bug tree-optimization/97428] -O3 is great for basic AoSoA packing of complex arrays, but horrible one step above the basic
already5chosen at yahoo dot com
gcc-bugzilla@gcc.gnu.org
Fri Oct 16 01:04:45 GMT 2020
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97428
--- Comment #5 from Michael_S <already5chosen at yahoo dot com> ---
(In reply to Richard Biener from comment #4)
> I have a fix that, with -mavx512f generates just
>
> .L3:
> vmovupd (%rcx,%rax), %zmm0
> vpermpd (%rsi,%rax), %zmm1, %zmm2
> vpermpd %zmm0, %zmm1, %zmm0
> vmovupd %zmm2, (%rdi,%rax,2)
> vmovupd %zmm0, 64(%rdi,%rax,2)
> addq $64, %rax
> cmpq %rax, %rdx
> jne .L3
>
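This particular kernel is less interesting on AVX512, because under AVX512 the
natural AoSoA layout is different: a 512-bit register holds eight doubles, so
the natural block is eight elements wide rather than the one used above: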
/* One complex double, stored as an interleaved (re, im) pair. */
typedef struct {
    double re;
    double im;
} dcmlx_t;

/* Eight complex doubles in split form: eight real parts, then eight imaginary parts. */
typedef struct {
    double re[8];
    double im[8];
} dcmlx8_t;

/*
 * Repack interleaved complex values into split blocks of eight.
 *
 * For every block index b in [0, n), reads src[b*8 .. b*8+7] and stores the
 * real parts in dst[b].re[0..7] and the imaginary parts in dst[b].im[0..7].
 * The indexing requires src to hold n*8 elements and dst to hold n elements;
 * the loop does not execute when n <= 0.
 *
 * All eight inputs of a block are loaded before any output is stored, and the
 * body is intentionally left fully unrolled so the straight-line load/store
 * pattern stays intact.
 */
void foo512(dcmlx8_t dst[], const dcmlx_t src[], int n)
{
    for (int blk = 0; blk < n; ++blk) {
        const dcmlx_t *in = &src[blk*8];
        dcmlx8_t *out = &dst[blk];

        /* Load phase: eight consecutive interleaved values. */
        const dcmlx_t c0 = in[0];
        const dcmlx_t c1 = in[1];
        const dcmlx_t c2 = in[2];
        const dcmlx_t c3 = in[3];
        const dcmlx_t c4 = in[4];
        const dcmlx_t c5 = in[5];
        const dcmlx_t c6 = in[6];
        const dcmlx_t c7 = in[7];

        /* Store phase: real lane first, then imaginary lane. */
        out->re[0] = c0.re;
        out->re[1] = c1.re;
        out->re[2] = c2.re;
        out->re[3] = c3.re;
        out->re[4] = c4.re;
        out->re[5] = c5.re;
        out->re[6] = c6.re;
        out->re[7] = c7.re;

        out->im[0] = c0.im;
        out->im[1] = c1.im;
        out->im[2] = c2.im;
        out->im[3] = c3.im;
        out->im[4] = c4.im;
        out->im[5] = c5.im;
        out->im[6] = c6.im;
        out->im[7] = c7.im;
    }
}
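And, respectively, the variant one step above the basic, which packs two
source streams (the second one starting at src[n*8]) per iteration: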
/* One complex double, stored as an interleaved (re, im) pair. */
typedef struct {
    double re;
    double im;
} dcmlx_t;

/* Eight complex doubles in split form: eight real parts, then eight imaginary parts. */
typedef struct {
    double re[8];
    double im[8];
} dcmlx8_t;

/*
 * Repack two interleaved complex streams into alternating split blocks.
 *
 * The first stream starts at src[0] and the second at src[n*8]. For every
 * block index b in [0, n):
 *   - src[b*8 .. b*8+7]             goes to dst[b*2+0] (re[] and im[]),
 *   - src[n*8 + b*8 .. n*8 + b*8+7] goes to dst[b*2+1] (re[] and im[]).
 * The indexing requires src to hold 2*n*8 elements and dst to hold 2*n
 * elements; the loop does not execute when n <= 0.
 *
 * All sixteen inputs of an iteration are loaded before any output is stored,
 * and the body is intentionally left fully unrolled so the straight-line
 * load/store pattern stays intact.
 */
void foo512_i2(dcmlx8_t dst[], const dcmlx_t src[], int n)
{
    for (int blk = 0; blk < n; ++blk) {
        const dcmlx_t *in0 = &src[blk*8];
        const dcmlx_t *in1 = &src[blk*8 + n*8];
        dcmlx8_t *out0 = &dst[blk*2 + 0];
        dcmlx8_t *out1 = &dst[blk*2 + 1];

        /* Load phase, first stream. */
        const dcmlx_t a0 = in0[0];
        const dcmlx_t a1 = in0[1];
        const dcmlx_t a2 = in0[2];
        const dcmlx_t a3 = in0[3];
        const dcmlx_t a4 = in0[4];
        const dcmlx_t a5 = in0[5];
        const dcmlx_t a6 = in0[6];
        const dcmlx_t a7 = in0[7];

        /* Load phase, second stream. */
        const dcmlx_t b0 = in1[0];
        const dcmlx_t b1 = in1[1];
        const dcmlx_t b2 = in1[2];
        const dcmlx_t b3 = in1[3];
        const dcmlx_t b4 = in1[4];
        const dcmlx_t b5 = in1[5];
        const dcmlx_t b6 = in1[6];
        const dcmlx_t b7 = in1[7];

        /* Store phase, even destination block: real lane, then imaginary lane. */
        out0->re[0] = a0.re;
        out0->re[1] = a1.re;
        out0->re[2] = a2.re;
        out0->re[3] = a3.re;
        out0->re[4] = a4.re;
        out0->re[5] = a5.re;
        out0->re[6] = a6.re;
        out0->re[7] = a7.re;

        out0->im[0] = a0.im;
        out0->im[1] = a1.im;
        out0->im[2] = a2.im;
        out0->im[3] = a3.im;
        out0->im[4] = a4.im;
        out0->im[5] = a5.im;
        out0->im[6] = a6.im;
        out0->im[7] = a7.im;

        /* Store phase, odd destination block: real lane, then imaginary lane. */
        out1->re[0] = b0.re;
        out1->re[1] = b1.re;
        out1->re[2] = b2.re;
        out1->re[3] = b3.re;
        out1->re[4] = b4.re;
        out1->re[5] = b5.re;
        out1->re[6] = b6.re;
        out1->re[7] = b7.re;

        out1->im[0] = b0.im;
        out1->im[1] = b1.im;
        out1->im[2] = b2.im;
        out1->im[3] = b3.im;
        out1->im[4] = b4.im;
        out1->im[5] = b5.im;
        out1->im[6] = b6.im;
        out1->im[7] = b7.im;
    }
}
More information about the Gcc-bugs
mailing list