[Bug target/97428] New: -O3 is great for basic AoSoA packing of complex arrays, but horrible one step above the basic

already5chosen at yahoo dot com gcc-bugzilla@gcc.gnu.org
Wed Oct 14 19:24:59 GMT 2020


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97428

            Bug ID: 97428
           Summary: -O3 is great for basic AoSoA packing of complex
                    arrays, but horrible one step above the basic
           Product: gcc
           Version: 10.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: already5chosen at yahoo dot com
  Target Milestone: ---

This is my next example of bad handling of the AoSoA layout by the gcc
optimizer/vectorizer.
For discussion of AoSoA see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97343

The issue at hand is the transformation (packing) of complex numbers from AoS
into AoSoA format.
Compiler used: gcc 10.2
Target: AVX2 (Skylake)

Part 1.
typedef struct { double re, im; } dcmlx_t;
typedef struct { double re[4], im[4]; } dcmlx4_t;

void foo(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  for (int i = 0; i < n; ++i) {
    dcmlx_t s00 = src[i*4+0];
    dcmlx_t s01 = src[i*4+1];
    dcmlx_t s02 = src[i*4+2];
    dcmlx_t s03 = src[i*4+3];

    dst[i].re[0] = s00.re;
    dst[i].re[1] = s01.re;
    dst[i].re[2] = s02.re;
    dst[i].re[3] = s03.re;
    dst[i].im[0] = s00.im;
    dst[i].im[1] = s01.im;
    dst[i].im[2] = s02.im;
    dst[i].im[3] = s03.im;
  }
}

-march=skylake -O2 produces the following inner loop:
.L3:
        vmovsd  (%rdx), %xmm7
        vmovsd  8(%rdx), %xmm3
        vmovsd  16(%rdx), %xmm6
        vmovsd  24(%rdx), %xmm2
        vmovsd  32(%rdx), %xmm5
        vmovsd  40(%rdx), %xmm1
        vmovsd  48(%rdx), %xmm4
        vmovsd  56(%rdx), %xmm0
        addq    $64, %rdx
        vmovsd  %xmm7, (%rcx)
        vmovsd  %xmm6, 8(%rcx)
        vmovsd  %xmm5, 16(%rcx)
        vmovsd  %xmm4, 24(%rcx)
        vmovsd  %xmm3, 32(%rcx)
        vmovsd  %xmm2, 40(%rcx)
        vmovsd  %xmm1, 48(%rcx)
        vmovsd  %xmm0, 56(%rcx)
        addq    $64, %rcx
        cmpq    %rax, %rdx
        jne     .L3


Quite reasonable for a non-vectorizing optimization level. One instruction
could be saved by using indexed addressing, but in the majority of situations
that wouldn't be faster.

-march=skylake -O3 inner loop:
.L3:
        vmovupd (%rdx,%rax), %ymm0
        vmovupd 32(%rdx,%rax), %ymm2
        vunpcklpd       %ymm2, %ymm0, %ymm1
        vunpckhpd       %ymm2, %ymm0, %ymm0
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm0, %ymm0
        vmovupd %ymm1, (%rcx,%rax)
        vmovupd %ymm0, 32(%rcx,%rax)
        addq    $64, %rax
        cmpq    %r8, %rax
        jne     .L3

That's excellent. It not only looks better: according to my measurements, with
the source array in external memory and the destination in L1/L2 cache it is
actually ~1.5x faster than -O2, which is no small feat.
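Such a throughput measurement might be arranged roughly as in the sketch below
(a minimal sketch, not the harness actually used for the number above; the
chunk size, source size, repetition count and timing code are assumptions,
chosen so that the source streams from memory while the reused destination
stays in L1/L2; foo() from Part 1 is assumed to be compiled in a separate
translation unit so the call is not optimized away):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef struct { double re, im; } dcmlx_t;
typedef struct { double re[4], im[4]; } dcmlx4_t;

void foo(dcmlx4_t dst[], const dcmlx_t src[], int n); /* Part 1, separate TU */

int main(void)
{
  /* Source well beyond LLC capacity; destination reused every call,
     so it stays resident in L1/L2. Sizes are illustrative only.     */
  enum { CHUNK = 256, N_SRC = 1 << 24, REPS = 20 };
  dcmlx_t  *src = calloc(N_SRC, sizeof(dcmlx_t));
  dcmlx4_t *dst = calloc(CHUNK, sizeof(dcmlx4_t));
  if (!src || !dst) return 1;

  struct timespec t0, t1;
  clock_gettime(CLOCK_MONOTONIC, &t0);
  for (int r = 0; r < REPS; ++r)
    for (long off = 0; off + 4L*CHUNK <= N_SRC; off += 4L*CHUNK)
      foo(dst, src + off, CHUNK);
  clock_gettime(CLOCK_MONOTONIC, &t1);

  double sec   = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
  double bytes = (double)REPS * N_SRC * sizeof(dcmlx_t);
  printf("%.2f GB/s read throughput\n", bytes / sec * 1e-9);
  free(src); free(dst);
  return 0;
}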

Part 2.
A slightly more involved case: now we want to interleave two rows of the
source matrix.
An interleaved layout is sometimes desirable because it improves locality of
access for the rest of the processing and can also reduce the pressure on the
GPRs used as pointers or indices.

typedef struct { double re, im; } dcmlx_t;
typedef struct { double re[4], im[4]; } dcmlx4_t;

void foo_i2(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  for (int i = 0; i < n; ++i) {
    dcmlx_t s00 = src[i*4+0];
    dcmlx_t s01 = src[i*4+1];
    dcmlx_t s02 = src[i*4+2];
    dcmlx_t s03 = src[i*4+3];

    dcmlx_t s10 = src[i*4+0+n];
    dcmlx_t s11 = src[i*4+1+n];
    dcmlx_t s12 = src[i*4+2+n];
    dcmlx_t s13 = src[i*4+3+n];

    dst[i*2+0].re[0] = s00.re;
    dst[i*2+0].re[1] = s01.re;
    dst[i*2+0].re[2] = s02.re;
    dst[i*2+0].re[3] = s03.re;
    dst[i*2+0].im[0] = s00.im;
    dst[i*2+0].im[1] = s01.im;
    dst[i*2+0].im[2] = s02.im;
    dst[i*2+0].im[3] = s03.im;

    dst[i*2+1].re[0] = s10.re;
    dst[i*2+1].re[1] = s11.re;
    dst[i*2+1].re[2] = s12.re;
    dst[i*2+1].re[3] = s13.re;
    dst[i*2+1].im[0] = s10.im;
    dst[i*2+1].im[1] = s11.im;
    dst[i*2+1].im[2] = s12.im;
    dst[i*2+1].im[3] = s13.im;
  }
}

-march=skylake -O2 produces the following inner loop:
.L3:
        vmovsd  (%rdx), %xmm15
        vmovsd  8(%rdx), %xmm11
        vmovsd  16(%rdx), %xmm14
        vmovsd  24(%rdx), %xmm10
        vmovsd  32(%rdx), %xmm13
        vmovsd  40(%rdx), %xmm9
        vmovsd  48(%rdx), %xmm12
        vmovsd  56(%rdx), %xmm8
        vmovsd  (%rax), %xmm7
        vmovsd  8(%rax), %xmm3
        vmovsd  16(%rax), %xmm6
        vmovsd  24(%rax), %xmm2
        vmovsd  32(%rax), %xmm5
        vmovsd  40(%rax), %xmm1
        vmovsd  48(%rax), %xmm4
        vmovsd  56(%rax), %xmm0
        subq    $-128, %rcx
        vmovsd  %xmm15, -128(%rcx)
        vmovsd  %xmm14, -120(%rcx)
        vmovsd  %xmm13, -112(%rcx)
        vmovsd  %xmm12, -104(%rcx)
        vmovsd  %xmm11, -96(%rcx)
        vmovsd  %xmm10, -88(%rcx)
        vmovsd  %xmm9, -80(%rcx)
        vmovsd  %xmm8, -72(%rcx)
        vmovsd  %xmm7, -64(%rcx)
        vmovsd  %xmm6, -56(%rcx)
        vmovsd  %xmm5, -48(%rcx)
        vmovsd  %xmm4, -40(%rcx)
        vmovsd  %xmm3, -32(%rcx)
        vmovsd  %xmm2, -24(%rcx)
        vmovsd  %xmm1, -16(%rcx)
        vmovsd  %xmm0, -8(%rcx)
        addq    $64, %rdx
        addq    $64, %rax
        cmpq    %rcx, %r8
        jne     .L3

Once again, in the absence of the vectorizer it is very reasonable.
But maybe the vectorizer can do better, as it did in Part 1?

-march=skylake -O3 inner loop:
.L4:
        vmovupd (%rcx), %ymm5
        vmovupd 64(%rcx), %ymm4
        vunpcklpd       32(%rcx), %ymm5, %ymm3
        vunpckhpd       32(%rcx), %ymm5, %ymm1
        vmovupd 128(%rcx), %ymm5
        vmovupd 192(%rcx), %ymm7
        vunpcklpd       160(%rcx), %ymm5, %ymm0
        vunpckhpd       160(%rcx), %ymm5, %ymm2
        vmovupd 192(%rcx), %ymm5
        vunpcklpd       96(%rcx), %ymm4, %ymm6
        vunpcklpd       224(%rcx), %ymm5, %ymm5
        vunpckhpd       96(%rcx), %ymm4, %ymm4
        vunpckhpd       224(%rcx), %ymm7, %ymm7
        vpermpd $216, %ymm5, %ymm5
        vpermpd $216, %ymm0, %ymm0
        vpermpd $216, %ymm3, %ymm3
        vpermpd $216, %ymm4, %ymm4
        vpermpd $216, %ymm7, %ymm7
        vunpcklpd       %ymm5, %ymm0, %ymm8
        vpermpd $216, %ymm1, %ymm1
        vunpckhpd       %ymm5, %ymm0, %ymm0
        vpermpd $216, %ymm6, %ymm6
        vpermpd $216, %ymm2, %ymm2
        vunpcklpd       %ymm6, %ymm3, %ymm15
        vunpcklpd       %ymm4, %ymm1, %ymm5
        vunpckhpd       %ymm6, %ymm3, %ymm6
        vunpckhpd       %ymm4, %ymm1, %ymm1
        vpermpd $216, %ymm0, %ymm3
        vunpcklpd       %ymm7, %ymm2, %ymm0
        vunpckhpd       %ymm7, %ymm2, %ymm2
        vpermpd $216, %ymm0, %ymm0
        vpermpd $216, %ymm2, %ymm2
        vpermpd $216, %ymm6, %ymm6
        vpermpd $216, %ymm5, %ymm5
        vpermpd $216, %ymm1, %ymm1
        vunpcklpd       %ymm0, %ymm5, %ymm13
        vunpcklpd       %ymm2, %ymm1, %ymm12
        vunpckhpd       %ymm0, %ymm5, %ymm5
        vunpckhpd       %ymm2, %ymm1, %ymm1
        vunpcklpd       %ymm3, %ymm6, %ymm0
        vmovupd (%rdx), %ymm2
        vunpckhpd       %ymm3, %ymm6, %ymm6
        vmovupd 64(%rdx), %ymm3
        vunpcklpd       32(%rdx), %ymm2, %ymm2
        vpermpd $216, %ymm1, %ymm4
        vunpcklpd       96(%rdx), %ymm3, %ymm1
        vmovupd 128(%rdx), %ymm3
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm2, %ymm2
        vunpcklpd       %ymm1, %ymm2, %ymm2
        vunpcklpd       160(%rdx), %ymm3, %ymm1
        vmovupd 192(%rdx), %ymm3
        vpermpd $216, %ymm1, %ymm1
        vunpcklpd       224(%rdx), %ymm3, %ymm3
        vmovupd 64(%rdx), %ymm7
        vpermpd $216, %ymm3, %ymm3
        vunpcklpd       %ymm3, %ymm1, %ymm1
        vmovupd (%rdx), %ymm3
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm2, %ymm2
        vmovupd %ymm4, (%rsp)
        vunpcklpd       %ymm1, %ymm2, %ymm2
        vunpckhpd       32(%rdx), %ymm3, %ymm4
        vunpckhpd       96(%rdx), %ymm7, %ymm1
        vmovupd 128(%rdx), %ymm3
        vmovupd 192(%rdx), %ymm7
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm4, %ymm4
        vunpcklpd       %ymm1, %ymm4, %ymm4
        vunpckhpd       160(%rdx), %ymm3, %ymm1
        vunpckhpd       224(%rdx), %ymm7, %ymm3
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm3, %ymm3
        vunpcklpd       %ymm3, %ymm1, %ymm1
        vmovupd (%r11), %ymm3
        vpermpd $216, %ymm1, %ymm1
        vunpcklpd       32(%r11), %ymm3, %ymm9
        vmovupd 64(%r11), %ymm7
        vpermpd $216, %ymm4, %ymm4
        vunpcklpd       %ymm1, %ymm4, %ymm4
        vunpcklpd       96(%r11), %ymm7, %ymm1
        vmovupd 128(%r11), %ymm3
        vmovupd 192(%r11), %ymm7
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm9, %ymm9
        vunpcklpd       %ymm1, %ymm9, %ymm9
        vunpcklpd       160(%r11), %ymm3, %ymm1
        vunpcklpd       224(%r11), %ymm7, %ymm3
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm3, %ymm3
        vmovupd 64(%r11), %ymm7
        vunpcklpd       %ymm3, %ymm1, %ymm1
        vmovupd (%r11), %ymm3
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm9, %ymm9
        vunpckhpd       32(%r11), %ymm3, %ymm3
        vunpcklpd       %ymm1, %ymm9, %ymm9
        vunpckhpd       96(%r11), %ymm7, %ymm1
        vmovupd 128(%r11), %ymm7
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm3, %ymm3
        vunpcklpd       %ymm1, %ymm3, %ymm3
        vunpckhpd       160(%r11), %ymm7, %ymm1
        vmovupd 192(%r11), %ymm7
        vpermpd $216, %ymm1, %ymm1
        vunpckhpd       224(%r11), %ymm7, %ymm7
        vpermpd $216, %ymm8, %ymm8
        vpermpd $216, %ymm7, %ymm7
        vunpcklpd       %ymm7, %ymm1, %ymm1
        vmovupd (%r10), %ymm7
        vpermpd $216, %ymm15, %ymm15
        vunpcklpd       %ymm8, %ymm15, %ymm10
        vunpckhpd       %ymm8, %ymm15, %ymm15
        vunpcklpd       32(%r10), %ymm7, %ymm8
        vmovupd 64(%r10), %ymm7
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm3, %ymm3
        vunpcklpd       %ymm1, %ymm3, %ymm3
        vunpcklpd       96(%r10), %ymm7, %ymm1
        vmovupd 128(%r10), %ymm7
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm8, %ymm8
        vunpcklpd       %ymm1, %ymm8, %ymm8
        vunpcklpd       160(%r10), %ymm7, %ymm1
        vmovupd 192(%r10), %ymm7
        vpermpd $216, %ymm1, %ymm1
        vunpcklpd       224(%r10), %ymm7, %ymm7
        vpermpd $216, %ymm8, %ymm8
        vpermpd $216, %ymm7, %ymm7
        vunpcklpd       %ymm7, %ymm1, %ymm1
        vmovupd (%r10), %ymm7
        vpermpd $216, %ymm1, %ymm1
        vunpcklpd       %ymm1, %ymm8, %ymm8
        vunpckhpd       32(%r10), %ymm7, %ymm1
        vmovupd 64(%r10), %ymm7
        vpermpd $216, %ymm1, %ymm1
        vunpckhpd       96(%r10), %ymm7, %ymm7
        vmovupd 192(%r10), %ymm11
        vpermpd $216, %ymm7, %ymm7
        vunpcklpd       %ymm7, %ymm1, %ymm1
        vmovupd 128(%r10), %ymm7
        vunpckhpd       224(%r10), %ymm11, %ymm11
        vunpckhpd       160(%r10), %ymm7, %ymm7
        vpermpd $216, %ymm11, %ymm11
        vpermpd $216, %ymm7, %ymm7
        vunpcklpd       %ymm11, %ymm7, %ymm7
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm7, %ymm7
        vunpcklpd       %ymm7, %ymm1, %ymm7
        vmovupd (%r9), %ymm1
        vpermpd $216, %ymm7, %ymm7
        vmovupd %ymm7, 32(%rsp)
        vunpcklpd       32(%r9), %ymm1, %ymm7
        vmovupd 64(%r9), %ymm1
        vpermpd $216, %ymm7, %ymm7
        vunpcklpd       96(%r9), %ymm1, %ymm1
        vmovupd 192(%r9), %ymm14
        vpermpd $216, %ymm1, %ymm1
        vunpcklpd       %ymm1, %ymm7, %ymm7
        vmovupd 128(%r9), %ymm1
        vunpcklpd       224(%r9), %ymm14, %ymm11
        vunpcklpd       160(%r9), %ymm1, %ymm1
        vpermpd $216, %ymm11, %ymm11
        vpermpd $216, %ymm1, %ymm1
        vunpcklpd       %ymm11, %ymm1, %ymm1
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm7, %ymm7
        vmovupd 64(%r9), %ymm11
        vunpcklpd       %ymm1, %ymm7, %ymm7
        vmovupd (%r9), %ymm1
        vunpckhpd       96(%r9), %ymm11, %ymm11
        vunpckhpd       32(%r9), %ymm1, %ymm1
        vmovupd 128(%r9), %ymm14
        vpermpd $216, %ymm11, %ymm11
        vpermpd $216, %ymm1, %ymm1
        vunpcklpd       %ymm11, %ymm1, %ymm1
        vunpckhpd       160(%r9), %ymm14, %ymm11
        vmovupd 192(%r9), %ymm14
        vpermpd $216, %ymm11, %ymm11
        vunpckhpd       224(%r9), %ymm14, %ymm14
        vpermpd $216, %ymm10, %ymm10
        vpermpd $216, %ymm14, %ymm14
        vunpcklpd       %ymm14, %ymm11, %ymm11
        vpermpd $216, %ymm2, %ymm2
        vpermpd $216, %ymm11, %ymm11
        vpermpd $216, %ymm1, %ymm1
        vpermpd $68, %ymm10, %ymm14
        vpermpd $216, %ymm0, %ymm0
        vpermpd $216, %ymm9, %ymm9
        vunpcklpd       %ymm11, %ymm1, %ymm1
        vpermpd $238, %ymm10, %ymm10
        vpermpd $68, %ymm2, %ymm11
        vpermpd $238, %ymm2, %ymm2
        vshufpd $12, %ymm11, %ymm14, %ymm11
        vshufpd $12, %ymm2, %ymm10, %ymm2
        vpermpd $216, %ymm15, %ymm15
        vpermpd $68, %ymm0, %ymm10
        vpermpd $216, %ymm8, %ymm8
        vpermpd $68, %ymm9, %ymm14
        vpermpd $238, %ymm0, %ymm0
        vpermpd $238, %ymm9, %ymm9
        vshufpd $12, %ymm9, %ymm0, %ymm0
        vpermpd $216, %ymm6, %ymm6
        vmovupd %ymm0, 64(%rsp)
        vpermpd $68, %ymm15, %ymm9
        vpermpd $216, %ymm7, %ymm7
        vpermpd $68, %ymm8, %ymm0
        vshufpd $12, %ymm14, %ymm10, %ymm14
        vshufpd $12, %ymm0, %ymm9, %ymm0
        vpermpd $216, %ymm13, %ymm13
        vpermpd $68, %ymm7, %ymm9
        vpermpd $216, %ymm4, %ymm4
        vpermpd $238, %ymm8, %ymm8
        vpermpd $68, %ymm6, %ymm10
        vpermpd $238, %ymm7, %ymm7
        vpermpd $238, %ymm6, %ymm6
        vpermpd $238, %ymm15, %ymm15
        vshufpd $12, %ymm8, %ymm15, %ymm15
        vshufpd $12, %ymm9, %ymm10, %ymm10
        vshufpd $12, %ymm7, %ymm6, %ymm8
        vmovupd %ymm10, 128(%rsp)
        vpermpd $68, %ymm4, %ymm6
        vpermpd $68, %ymm13, %ymm10
        vshufpd $12, %ymm6, %ymm10, %ymm10
        vpermpd $216, %ymm12, %ymm12
        vmovupd 32(%rsp), %ymm6
        vpermpd $216, %ymm3, %ymm3
        vpermpd $238, %ymm13, %ymm13
        vpermpd $238, %ymm4, %ymm4
        vmovupd %ymm15, 96(%rsp)
        vmovupd %ymm8, 160(%rsp)
        vshufpd $12, %ymm4, %ymm13, %ymm15
        vpermpd $216, %ymm5, %ymm5
        vpermpd $68, %ymm3, %ymm4
        vpermpd $68, %ymm12, %ymm8
        vpermpd $238, %ymm3, %ymm3
        vpermpd $238, %ymm12, %ymm12
        vshufpd $12, %ymm4, %ymm8, %ymm8
        vshufpd $12, %ymm3, %ymm12, %ymm12
        vpermpd $68, %ymm5, %ymm4
        vpermpd $68, %ymm6, %ymm3
        vpermpd $238, %ymm5, %ymm5
        vpermpd $238, %ymm6, %ymm6
        vshufpd $12, %ymm6, %ymm5, %ymm6
        vmovupd (%rsp), %ymm5
        vpermpd $216, %ymm1, %ymm1
        vshufpd $12, %ymm3, %ymm4, %ymm3
        vpermpd $68, %ymm5, %ymm7
        vpermpd $68, %ymm1, %ymm4
        vpermpd $238, %ymm5, %ymm5
        vpermpd $238, %ymm1, %ymm1
        vshufpd $12, %ymm1, %ymm5, %ymm5
        vshufpd $12, %ymm4, %ymm7, %ymm7
        vpermpd $68, %ymm10, %ymm1
        vpermpd $68, %ymm11, %ymm4
        vpermpd $68, %ymm2, %ymm9
        vshufpd $12, %ymm1, %ymm4, %ymm4
        vpermpd $238, %ymm10, %ymm10
        vpermpd $68, %ymm15, %ymm1
        vpermpd $238, %ymm2, %ymm2
        vpermpd $238, %ymm15, %ymm15
        vpermpd $238, %ymm11, %ymm11
        vshufpd $12, %ymm10, %ymm11, %ymm11
        vshufpd $12, %ymm1, %ymm9, %ymm1
        vshufpd $12, %ymm15, %ymm2, %ymm10
        vpermpd $68, %ymm14, %ymm9
        vpermpd $68, %ymm8, %ymm2
        vpermpd $238, %ymm14, %ymm14
        vpermpd $238, %ymm8, %ymm8
        vshufpd $12, %ymm8, %ymm14, %ymm8
        vmovupd 64(%rsp), %ymm14
        vshufpd $12, %ymm2, %ymm9, %ymm2
        vpermpd $68, %ymm14, %ymm9
        vmovupd %ymm2, 32(%rsp)
        vpermpd $68, %ymm12, %ymm2
        vshufpd $12, %ymm2, %ymm9, %ymm13
        vpermpd $238, %ymm12, %ymm12
        vpermpd $238, %ymm14, %ymm9
        vmovupd 96(%rsp), %ymm15
        vshufpd $12, %ymm12, %ymm9, %ymm14
        vpermpd $68, %ymm3, %ymm2
        vpermpd $68, %ymm0, %ymm9
        vpermpd $238, %ymm3, %ymm3
        vpermpd $238, %ymm0, %ymm0
        vshufpd $12, %ymm3, %ymm0, %ymm12
        vmovupd %ymm13, 64(%rsp)
        vpermpd $68, %ymm6, %ymm0
        vpermpd $238, %ymm6, %ymm13
        vmovupd 128(%rsp), %ymm6
        vpermpd $68, %ymm15, %ymm3
        vpermpd $238, %ymm15, %ymm15
        vshufpd $12, %ymm2, %ymm9, %ymm2
        vshufpd $12, %ymm13, %ymm15, %ymm13
        vpermpd $238, %ymm6, %ymm9
        vpermpd $68, %ymm6, %ymm15
        vmovupd 160(%rsp), %ymm6
        vshufpd $12, %ymm0, %ymm3, %ymm3
        vpermpd $68, %ymm7, %ymm0
        vpermpd $238, %ymm7, %ymm7
        vshufpd $12, %ymm7, %ymm9, %ymm9
        vshufpd $12, %ymm0, %ymm15, %ymm15
        vpermpd $68, %ymm6, %ymm7
        vpermpd $68, %ymm5, %ymm0
        vshufpd $12, %ymm0, %ymm7, %ymm7
        vmovupd %ymm14, 192(%rsp)
        vpermpd $68, %ymm2, %ymm0
        vpermpd $238, %ymm5, %ymm14
        vpermpd $238, %ymm2, %ymm2
        vpermpd $68, %ymm4, %ymm5
        vpermpd $238, %ymm4, %ymm4
        vshufpd $12, %ymm0, %ymm5, %ymm5
        vshufpd $12, %ymm2, %ymm4, %ymm4
        vpermpd $68, %ymm12, %ymm0
        vpermpd $68, %ymm3, %ymm2
        vmovupd %ymm5, (%rsp)
        vmovupd %ymm4, 96(%rsp)
        vpermpd $68, %ymm11, %ymm5
        vpermpd $68, %ymm1, %ymm4
        vpermpd $238, %ymm3, %ymm3
        vpermpd $238, %ymm1, %ymm1
        vshufpd $12, %ymm0, %ymm5, %ymm5
        vshufpd $12, %ymm2, %ymm4, %ymm4
        vshufpd $12, %ymm3, %ymm1, %ymm1
        vpermpd $68, %ymm13, %ymm2
        vpermpd $238, %ymm12, %ymm0
        vpermpd $68, %ymm10, %ymm3
        vmovupd 32(%rsp), %ymm12
        vshufpd $12, %ymm2, %ymm3, %ymm3
        vpermpd $238, %ymm6, %ymm6
        vpermpd $238, %ymm10, %ymm2
        vpermpd $238, %ymm13, %ymm13
        vshufpd $12, %ymm14, %ymm6, %ymm14
        vshufpd $12, %ymm13, %ymm2, %ymm2
        vpermpd $68, %ymm15, %ymm6
        vpermpd $68, %ymm12, %ymm13
        vpermpd $238, %ymm15, %ymm15
        vpermpd $238, %ymm12, %ymm12
        vshufpd $12, %ymm15, %ymm12, %ymm12
        vpermpd $238, %ymm11, %ymm11
        vmovupd 64(%rsp), %ymm15
        vshufpd $12, %ymm0, %ymm11, %ymm0
        vpermpd $238, %ymm9, %ymm10
        vpermpd $68, %ymm8, %ymm11
        vpermpd $238, %ymm8, %ymm8
        vshufpd $12, %ymm6, %ymm13, %ymm13
        vshufpd $12, %ymm10, %ymm8, %ymm10
        vpermpd $68, %ymm9, %ymm6
        vpermpd $238, %ymm15, %ymm8
        vpermpd $68, %ymm15, %ymm9
        vmovupd 192(%rsp), %ymm15
        vshufpd $12, %ymm6, %ymm11, %ymm11
        vpermpd $68, %ymm7, %ymm6
        vpermpd $238, %ymm7, %ymm7
        vshufpd $12, %ymm6, %ymm9, %ymm9
        vshufpd $12, %ymm7, %ymm8, %ymm8
        vpermpd $68, %ymm14, %ymm6
        vpermpd $68, %ymm15, %ymm7
        vshufpd $12, %ymm6, %ymm7, %ymm7
        vpermpd $238, %ymm14, %ymm14
        vpermpd $238, %ymm15, %ymm6
        vshufpd $12, %ymm14, %ymm6, %ymm6
        vpermpd $68, (%rsp), %ymm14
        vpermpd $68, %ymm13, %ymm15
        vshufpd $12, %ymm15, %ymm14, %ymm14
        vmovupd 96(%rsp), %ymm15
        vmovupd %ymm14, (%rax)
        vpermpd $238, (%rsp), %ymm14
        vpermpd $238, %ymm13, %ymm13
        vshufpd $12, %ymm13, %ymm14, %ymm13
        vpermpd $68, %ymm12, %ymm14
        vmovupd %ymm13, 32(%rax)
        vpermpd $68, %ymm15, %ymm13
        vshufpd $12, %ymm14, %ymm13, %ymm13
        vpermpd $238, %ymm12, %ymm12
        vmovupd %ymm13, 64(%rax)
        vpermpd $238, %ymm15, %ymm13
        vshufpd $12, %ymm12, %ymm13, %ymm12
        vpermpd $68, %ymm11, %ymm13
        vmovupd %ymm12, 96(%rax)
        vpermpd $238, %ymm11, %ymm11
        vpermpd $68, %ymm5, %ymm12
        vpermpd $238, %ymm5, %ymm5
        vshufpd $12, %ymm11, %ymm5, %ymm5
        vpermpd $68, %ymm10, %ymm11
        vmovupd %ymm5, 160(%rax)
        vpermpd $238, %ymm10, %ymm10
        vpermpd $68, %ymm0, %ymm5
        vpermpd $238, %ymm0, %ymm0
        vshufpd $12, %ymm11, %ymm5, %ymm5
        vshufpd $12, %ymm10, %ymm0, %ymm0
        vmovupd %ymm5, 192(%rax)
        vmovupd %ymm0, 224(%rax)
        vpermpd $68, %ymm9, %ymm5
        vpermpd $68, %ymm4, %ymm0
        vpermpd $238, %ymm9, %ymm9
        vpermpd $238, %ymm4, %ymm4
        vshufpd $12, %ymm5, %ymm0, %ymm0
        vshufpd $12, %ymm9, %ymm4, %ymm4
        vmovupd %ymm0, 256(%rax)
        vmovupd %ymm4, 288(%rax)
        vpermpd $68, %ymm1, %ymm0
        vpermpd $68, %ymm8, %ymm4
        vpermpd $238, %ymm1, %ymm1
        vpermpd $238, %ymm8, %ymm8
        vshufpd $12, %ymm4, %ymm0, %ymm0
        vshufpd $12, %ymm8, %ymm1, %ymm1
        vmovupd %ymm0, 320(%rax)
        vmovupd %ymm1, 352(%rax)
        vpermpd $68, %ymm3, %ymm0
        vpermpd $68, %ymm7, %ymm1
        vshufpd $12, %ymm1, %ymm0, %ymm0
        vpermpd $238, %ymm3, %ymm3
        vmovupd %ymm0, 384(%rax)
        vpermpd $68, %ymm6, %ymm1
        vpermpd $68, %ymm2, %ymm0
        vpermpd $238, %ymm7, %ymm7
        vpermpd $238, %ymm2, %ymm2
        vpermpd $238, %ymm6, %ymm6
        addq    $256, %rcx
        vshufpd $12, %ymm13, %ymm12, %ymm12
        vshufpd $12, %ymm7, %ymm3, %ymm3
        vmovupd %ymm12, 128(%rax)
        vmovupd %ymm3, 416(%rax)
        vshufpd $12, %ymm1, %ymm0, %ymm0
        vshufpd $12, %ymm6, %ymm2, %ymm2
        vmovupd %ymm0, 448(%rax)
        vmovupd %ymm2, 480(%rax)
        addq    $256, %rdx
        addq    $256, %r11
        addq    $256, %r10
        addq    $256, %r9
        addq    $512, %rax
        cmpq    %rcx, %rbp
        jne     .L4

I am not kidding.
gcc 10.2 -O3 really generates code that is approximately 3 times slower than
the scalar output of -O2 and maybe 4-4.5 times slower than good SIMD code
similar to what was generated in Part 1.
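For reference, "good SIMD code similar to Part 1" could look roughly like the
hand-written AVX2 intrinsics sketch below. This is my own illustration, not
compiler output; the names pack4 and foo_i2_avx2 are made up. It simply
applies the unpcklpd/unpckhpd + vpermpd $216 pattern that gcc itself emits
for Part 1, once per source row (compile with -mavx2):

#include <immintrin.h>

typedef struct { double re, im; } dcmlx_t;
typedef struct { double re[4], im[4]; } dcmlx4_t;

/* Pack 4 consecutive AoS complex numbers into one re[4]/im[4] pair,
   using the same shuffle sequence gcc emits for Part 1 at -O3.      */
static void pack4(double *re_out, double *im_out, const dcmlx_t *s)
{
  __m256d a  = _mm256_loadu_pd(&s[0].re);               /* re0 im0 re1 im1 */
  __m256d b  = _mm256_loadu_pd(&s[2].re);               /* re2 im2 re3 im3 */
  __m256d lo = _mm256_unpacklo_pd(a, b);                /* re0 re2 re1 re3 */
  __m256d hi = _mm256_unpackhi_pd(a, b);                /* im0 im2 im1 im3 */
  _mm256_storeu_pd(re_out, _mm256_permute4x64_pd(lo, 0xD8)); /* re0..re3 */
  _mm256_storeu_pd(im_out, _mm256_permute4x64_pd(hi, 0xD8)); /* im0..im3 */
}

void foo_i2_avx2(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  for (int i = 0; i < n; ++i) {
    pack4(dst[i*2+0].re, dst[i*2+0].im, &src[i*4+0]);     /* row 0 */
    pack4(dst[i*2+1].re, dst[i*2+1].im, &src[i*4+0+n]);   /* row 1 */
  }
}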

My guess is that, once again, as in nearly all my complaints of recent months,
it's a case of an earlier optimization phase producing a mess that totally
confuses a later stage. I just can't guess which stage is at fault this time.
You have so many.

