Bug 97832 - AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
Summary: AoSoA complex caxpy-like loops: AVX2+FMA -Ofast 7 times slower than -O3
Status: ASSIGNED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 10.2.0
Importance: P3 normal
Target Milestone: ---
Assignee: Richard Biener
URL:
Keywords: missed-optimization
Depends on:
Blocks: vectorizer 99412
 
Reported: 2020-11-14 20:44 UTC by Michael_S
Modified: 2021-03-08 08:32 UTC
CC: 4 users

See Also:
Host:
Target: x86_64-*-* i?86-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2020-11-16 00:00:00


Attachments
prototype (4.11 KB, patch)
2020-11-18 13:23 UTC, Richard Biener

Description Michael_S 2020-11-14 20:44:21 UTC
I am reporting under 'target' because AVX2+FMA is the only 256-bit SIMD platform I have to play with. If it's really tree-optimization, please change the component.

void foo(double* restrict y, const double* restrict x0, const double* restrict x1, int clen)
{
  int xi = clen & 2;
  double f00_re = x0[0+xi+0];
  double f10_re = x1[0+xi+0];
  double f01_re = x0[0+xi+1];
  double f11_re = x1[0+xi+1];
  double f00_im = x0[4+xi+0];
  double f10_im = x1[4+xi+0];
  double f01_im = x0[4+xi+1];
  double f11_im = x1[4+xi+1];
  int clen2 = (clen+xi) * 2;
  double* y0 = &y[0];
  double* y1 = &y[clen2];
  #pragma GCC unroll 0
  for (int c = 0; c < clen2; c += 8) {
    // y0[c] = y0[c] - x0[c]*conj(f00) - x1[c]*conj(f10);
    // y1[c] = y1[c] - x0[c]*conj(f01) - x1[c]*conj(f11);
    #pragma GCC unroll 4
    for (int k = 0; k < 4; ++k) {
      double x0_re = x0[c+0+k];
      double x0_im = x0[c+4+k];
      double y0_re = y0[c+0+k];
      double y0_im = y0[c+4+k];
      double y1_re = y1[c+0+k];
      double y1_im = y1[c+4+k];
      y0_re = y0_re - x0_re * f00_re - x0_im * f00_im;
      y0_im = y0_im + x0_re * f00_im - x0_im * f00_re;
      y1_re = y1_re - x0_re * f01_re - x0_im * f01_im;
      y1_im = y1_im + x0_re * f01_im - x0_im * f01_re;
      double x1_re = x1[c+0+k];
      double x1_im = x1[c+4+k];
      y0_re = y0_re - x1_re * f10_re - x1_im * f10_im;
      y0_im = y0_im + x1_re * f10_im - x1_im * f10_re;
      y1_re = y1_re - x1_re * f11_re - x1_im * f11_im;
      y1_im = y1_im + x1_re * f11_im - x1_im * f11_re;
      y0[c+0+k] = y0_re;
      y0[c+4+k] = y0_im;
      y1[c+0+k] = y1_re;
      y1[c+4+k] = y1_im;
    }
  }
}

When compiled with 'gcc 10.2 -march=skylake -O3' it produces pretty decent code. The only problem is over-aggressive load+op combining similar to what we already discussed in PR 97127. It seems this problem can't be solved without a major overhaul of the gcc optimizer architecture, but luckily the impact is quite minor.
But when we compile with 'gcc 10.2 -march=skylake -Ofast' the fun begins:

.L5:
	vmovupd	(%r9), %ymm7
	vmovupd	64(%r9), %ymm6
	vunpcklpd	32(%r9), %ymm7, %ymm2
	vunpckhpd	32(%r9), %ymm7, %ymm0
	vmovupd	64(%r9), %ymm7
	vmovupd	192(%r9), %ymm4
	vunpckhpd	96(%r9), %ymm7, %ymm5
	vmovupd	128(%r9), %ymm7
	vunpcklpd	96(%r9), %ymm6, %ymm6
	vunpcklpd	160(%r9), %ymm7, %ymm3
	vunpckhpd	160(%r9), %ymm7, %ymm1
	vmovupd	192(%r9), %ymm7
	vunpcklpd	224(%r9), %ymm4, %ymm4
	vunpckhpd	224(%r9), %ymm7, %ymm8
	vpermpd	$216, %ymm6, %ymm6
	vpermpd	$216, %ymm5, %ymm5
	vpermpd	$216, %ymm4, %ymm4
	vpermpd	$216, %ymm8, %ymm8
	vpermpd	$216, %ymm2, %ymm2
	vpermpd	$216, %ymm0, %ymm0
	vpermpd	$216, %ymm3, %ymm3
	vpermpd	$216, %ymm1, %ymm1
	vunpcklpd	%ymm6, %ymm2, %ymm7
	vunpckhpd	%ymm6, %ymm2, %ymm2
	vunpcklpd	%ymm4, %ymm3, %ymm6
	vunpckhpd	%ymm4, %ymm3, %ymm3
	vunpcklpd	%ymm5, %ymm0, %ymm4
	vunpckhpd	%ymm5, %ymm0, %ymm0
	vunpcklpd	%ymm8, %ymm1, %ymm5
	vpermpd	$216, %ymm5, %ymm5
	vpermpd	$216, %ymm4, %ymm4
	vpermpd	$216, %ymm3, %ymm3
	vunpcklpd	%ymm5, %ymm4, %ymm11
	vpermpd	$216, %ymm2, %ymm2
	vunpckhpd	%ymm5, %ymm4, %ymm4
	vunpckhpd	%ymm8, %ymm1, %ymm1
	vpermpd	$216, %ymm0, %ymm0
	vpermpd	$216, %ymm4, %ymm8
	vpermpd	$216, %ymm1, %ymm1
	vunpcklpd	%ymm3, %ymm2, %ymm4
	vunpckhpd	%ymm3, %ymm2, %ymm2
	vpermpd	$216, %ymm2, %ymm5
	vunpcklpd	%ymm1, %ymm0, %ymm2
	vpermpd	$216, %ymm4, %ymm10
	vpermpd	$216, %ymm2, %ymm4
	vmovupd	64(%rax), %ymm2
	vmovupd	(%rax), %ymm3
	vmovupd	%ymm4, 448(%rsp)
	vunpckhpd	96(%rax), %ymm2, %ymm4
	vmovupd	128(%rax), %ymm2
	vpermpd	$216, %ymm6, %ymm6
	vunpckhpd	%ymm1, %ymm0, %ymm1
	vpermpd	$216, %ymm7, %ymm7
	vunpcklpd	32(%rax), %ymm3, %ymm9
	vunpckhpd	32(%rax), %ymm3, %ymm14
	vunpckhpd	160(%rax), %ymm2, %ymm0
	vmovupd	64(%rax), %ymm3
	vunpcklpd	%ymm6, %ymm7, %ymm12
	vunpckhpd	%ymm6, %ymm7, %ymm7
	vpermpd	$216, %ymm1, %ymm6
	vunpcklpd	160(%rax), %ymm2, %ymm1
	vmovupd	192(%rax), %ymm2
	vunpcklpd	96(%rax), %ymm3, %ymm3
	vmovupd	%ymm5, 416(%rsp)
	vunpcklpd	224(%rax), %ymm2, %ymm5
	vunpckhpd	224(%rax), %ymm2, %ymm2
	vpermpd	$216, %ymm3, %ymm3
	vpermpd	$216, %ymm5, %ymm5
	vpermpd	$216, %ymm9, %ymm9
	vpermpd	$216, %ymm1, %ymm1
	vpermpd	$216, %ymm4, %ymm4
	vpermpd	$216, %ymm0, %ymm0
	vmovupd	%ymm10, 384(%rsp)
	vpermpd	$216, %ymm14, %ymm14
	vunpcklpd	%ymm3, %ymm9, %ymm10
	vpermpd	$216, %ymm2, %ymm2
	vunpckhpd	%ymm3, %ymm9, %ymm9
	vunpcklpd	%ymm5, %ymm1, %ymm3
	vpermpd	$216, %ymm3, %ymm3
	vmovupd	%ymm8, 288(%rsp)
	vpermpd	$216, %ymm10, %ymm10
	vunpcklpd	%ymm4, %ymm14, %ymm8
	vunpckhpd	%ymm4, %ymm14, %ymm14
	vunpcklpd	%ymm2, %ymm0, %ymm4
	vpermpd	$216, %ymm4, %ymm4
	vpermpd	$216, %ymm8, %ymm8
	vunpckhpd	%ymm2, %ymm0, %ymm2
	vunpcklpd	%ymm3, %ymm10, %ymm0
	vpermpd	$216, %ymm0, %ymm13
	vunpcklpd	%ymm4, %ymm8, %ymm0
	vunpckhpd	%ymm4, %ymm8, %ymm8
	vpermpd	$216, %ymm2, %ymm2
	vunpckhpd	%ymm3, %ymm10, %ymm10
	vpermpd	$216, %ymm14, %ymm14
	vpermpd	$216, %ymm0, %ymm3
	vpermpd	$216, %ymm8, %ymm0
	vmovupd	%ymm6, 480(%rsp)
	vunpckhpd	%ymm5, %ymm1, %ymm1
	vmovupd	%ymm3, 512(%rsp)
	vmovupd	(%rsi), %ymm3
	vmovupd	%ymm0, 544(%rsp)
	vunpcklpd	%ymm2, %ymm14, %ymm0
	vpermpd	$216, %ymm1, %ymm1
	vpermpd	$216, %ymm0, %ymm4
	vpermpd	$216, %ymm9, %ymm9
	vunpcklpd	%ymm1, %ymm9, %ymm6
	vmovupd	%ymm4, 640(%rsp)
	vunpckhpd	%ymm1, %ymm9, %ymm9
	vunpcklpd	32(%rsi), %ymm3, %ymm4
	vunpckhpd	32(%rsi), %ymm3, %ymm1
	vmovupd	64(%rsi), %ymm3
	vunpckhpd	%ymm2, %ymm14, %ymm14
	vunpcklpd	96(%rsi), %ymm3, %ymm8
	vunpckhpd	96(%rsi), %ymm3, %ymm5
	vmovupd	128(%rsi), %ymm3
	vpermpd	$216, %ymm14, %ymm2
	vunpckhpd	160(%rsi), %ymm3, %ymm0
	vmovupd	%ymm2, 672(%rsp)
	vunpcklpd	160(%rsi), %ymm3, %ymm2
	vmovupd	192(%rsi), %ymm3
	vmovupd	192(%rsi), %ymm14
	vunpcklpd	224(%rsi), %ymm3, %ymm3
	vpermpd	$216, %ymm9, %ymm9
	vmovupd	%ymm9, 608(%rsp)
	vunpckhpd	224(%rsi), %ymm14, %ymm9
	vpermpd	$216, %ymm8, %ymm8
	vpermpd	$216, %ymm3, %ymm3
	vpermpd	$216, %ymm6, %ymm6
	vpermpd	$216, %ymm4, %ymm4
	vpermpd	$216, %ymm2, %ymm2
	vpermpd	$216, %ymm5, %ymm5
	vpermpd	$216, %ymm9, %ymm9
	vmovupd	%ymm6, 576(%rsp)
	vpermpd	$216, %ymm1, %ymm1
	vunpcklpd	%ymm8, %ymm4, %ymm6
	vpermpd	$216, %ymm0, %ymm0
	vunpckhpd	%ymm8, %ymm4, %ymm4
	vunpcklpd	%ymm3, %ymm2, %ymm8
	vpermpd	$216, %ymm8, %ymm8
	vpermpd	$216, %ymm6, %ymm6
	vunpckhpd	%ymm3, %ymm2, %ymm2
	vunpcklpd	%ymm5, %ymm1, %ymm3
	vunpckhpd	%ymm5, %ymm1, %ymm1
	vunpcklpd	%ymm9, %ymm0, %ymm5
	vpermpd	$216, %ymm2, %ymm2
	vpermpd	$216, %ymm5, %ymm5
	vunpcklpd	%ymm8, %ymm6, %ymm14
	vpermpd	$216, %ymm4, %ymm4
	vunpckhpd	%ymm8, %ymm6, %ymm6
	vpermpd	$216, %ymm3, %ymm3
	vunpckhpd	%ymm9, %ymm0, %ymm0
	vpermpd	$216, %ymm6, %ymm9
	vunpcklpd	%ymm5, %ymm3, %ymm6
	vunpckhpd	%ymm5, %ymm3, %ymm3
	vunpcklpd	%ymm2, %ymm4, %ymm5
	vunpckhpd	%ymm2, %ymm4, %ymm4
	vpermpd	$216, %ymm0, %ymm0
	vpermpd	$216, %ymm4, %ymm2
	vpermpd	$216, %ymm1, %ymm1
	vmovupd	%ymm2, 832(%rsp)
	vunpcklpd	%ymm0, %ymm1, %ymm2
	vunpckhpd	%ymm0, %ymm1, %ymm1
	vpermpd	$216, %ymm1, %ymm0
	vmovupd	%ymm0, 896(%rsp)
	vmovupd	(%rbx), %ymm0
	vpermpd	$216, %ymm2, %ymm4
	vunpckhpd	32(%rbx), %ymm0, %ymm1
	vunpcklpd	32(%rbx), %ymm0, %ymm2
	vmovupd	64(%rbx), %ymm0
	vpermpd	$216, %ymm5, %ymm5
	vmovupd	%ymm5, 800(%rsp)
	vmovupd	%ymm4, 864(%rsp)
	vunpcklpd	96(%rbx), %ymm0, %ymm5
	vunpckhpd	96(%rbx), %ymm0, %ymm4
	vmovupd	128(%rbx), %ymm0
	vpermpd	$216, %ymm6, %ymm6
	vpermpd	$216, %ymm3, %ymm3
	vmovupd	%ymm9, 704(%rsp)
	vmovupd	%ymm6, 736(%rsp)
	vmovupd	%ymm3, 768(%rsp)
	vunpcklpd	160(%rbx), %ymm0, %ymm3
	vmovupd	192(%rbx), %ymm8
	vunpckhpd	160(%rbx), %ymm0, %ymm0
	vunpcklpd	224(%rbx), %ymm8, %ymm6
	vunpckhpd	224(%rbx), %ymm8, %ymm9
	vpermpd	$216, %ymm5, %ymm5
	vpermpd	$216, %ymm4, %ymm4
	vpermpd	$216, %ymm6, %ymm6
	vpermpd	$216, %ymm9, %ymm9
	vpermpd	$216, %ymm2, %ymm2
	vpermpd	$216, %ymm1, %ymm1
	vpermpd	$216, %ymm3, %ymm3
	vpermpd	$216, %ymm0, %ymm0
	vunpcklpd	%ymm5, %ymm2, %ymm8
	vunpckhpd	%ymm5, %ymm2, %ymm2
	vunpcklpd	%ymm6, %ymm3, %ymm5
	vunpckhpd	%ymm6, %ymm3, %ymm3
	vunpcklpd	%ymm4, %ymm1, %ymm6
	vunpckhpd	%ymm4, %ymm1, %ymm1
	vunpcklpd	%ymm9, %ymm0, %ymm4
	vunpckhpd	%ymm9, %ymm0, %ymm0
	vpermpd	$216, %ymm5, %ymm5
	vpermpd	$216, %ymm3, %ymm3
	vpermpd	$216, %ymm4, %ymm4
	vpermpd	$216, %ymm0, %ymm0
	vpermpd	$216, %ymm8, %ymm8
	vpermpd	$216, %ymm2, %ymm2
	vpermpd	$216, %ymm6, %ymm6
	vpermpd	$216, %ymm1, %ymm1
	vunpcklpd	%ymm5, %ymm8, %ymm9
	vunpckhpd	%ymm5, %ymm8, %ymm8
	vunpcklpd	%ymm4, %ymm6, %ymm5
	vunpckhpd	%ymm4, %ymm6, %ymm6
	vunpcklpd	%ymm3, %ymm2, %ymm4
	vunpckhpd	%ymm3, %ymm2, %ymm2
	vunpcklpd	%ymm0, %ymm1, %ymm3
	vunpckhpd	%ymm0, %ymm1, %ymm1
	vpermpd	$216, %ymm9, %ymm9
	vpermpd	$216, %ymm8, %ymm8
	vpermpd	$216, %ymm1, %ymm0
	vpermpd	$216, %ymm10, %ymm15
	vmovupd	%ymm0, 240(%rsp)
	vmulpd	320(%rsp), %ymm9, %ymm10
	vmulpd	64(%rsp), %ymm8, %ymm0
	vmovupd	(%rsp), %ymm1
	vpermpd	$216, %ymm12, %ymm12
	vpermpd	$216, %ymm7, %ymm7
	vfmadd231pd	176(%rsp), %ymm12, %ymm10
	vfmadd231pd	%ymm1, %ymm7, %ymm0
	vpermpd	$216, %ymm14, %ymm14
	vpermpd	$216, %ymm11, %ymm11
	vpermpd	$216, %ymm6, %ymm6
	vpermpd	$216, %ymm5, %ymm5
	vaddpd	%ymm10, %ymm0, %ymm0
	vmulpd	64(%rsp), %ymm9, %ymm10
	vpermpd	$216, %ymm2, %ymm2
	vsubpd	%ymm0, %ymm13, %ymm0
	vmulpd	320(%rsp), %ymm8, %ymm13
	vpermpd	$216, %ymm4, %ymm4
	vfmadd231pd	%ymm1, %ymm12, %ymm10
	vmovupd	%ymm0, 352(%rsp)
	vmovupd	208(%rsp), %ymm0
	vfmadd231pd	176(%rsp), %ymm7, %ymm13
	vpermpd	$216, %ymm3, %ymm3
	addq	$256, %r9
	addq	$256, %rax
	addq	$256, %rsi
	vsubpd	%ymm13, %ymm10, %ymm10
	vmulpd	%ymm0, %ymm9, %ymm13
	vmulpd	96(%rsp), %ymm9, %ymm9
	vaddpd	%ymm15, %ymm10, %ymm10
	vmulpd	96(%rsp), %ymm8, %ymm15
	vmulpd	%ymm0, %ymm8, %ymm8
	vmovupd	%ymm10, 928(%rsp)
	vmovupd	128(%rsp), %ymm10
	vfmadd231pd	32(%rsp), %ymm12, %ymm13
	vfmadd231pd	%ymm10, %ymm12, %ymm9
	vmovupd	32(%rsp), %ymm12
	vfmadd231pd	%ymm10, %ymm7, %ymm15
	vfmadd231pd	%ymm12, %ymm7, %ymm8
	vmovupd	(%rsp), %ymm7
	addq	$256, %rbx
	addq	$256, %r11
	vaddpd	%ymm15, %ymm13, %ymm13
	vsubpd	%ymm8, %ymm9, %ymm9
	vmovapd	%ymm10, %ymm15
	vmovupd	288(%rsp), %ymm10
	vsubpd	%ymm13, %ymm14, %ymm1
	vaddpd	704(%rsp), %ymm9, %ymm13
	vmulpd	%ymm15, %ymm10, %ymm9
	vmulpd	%ymm10, %ymm7, %ymm7
	vmovupd	176(%rsp), %ymm14
	vmovupd	320(%rsp), %ymm15
	vmovupd	%ymm1, 960(%rsp)
	vfmadd231pd	%ymm12, %ymm11, %ymm9
	vmovupd	64(%rsp), %ymm12
	vfmadd231pd	%ymm14, %ymm11, %ymm7
	vmulpd	%ymm12, %ymm6, %ymm8
	vmovupd	512(%rsp), %ymm1
	vfmadd231pd	%ymm15, %ymm5, %ymm8
	vaddpd	%ymm8, %ymm7, %ymm7
	vmulpd	%ymm12, %ymm5, %ymm8
	vmulpd	%ymm15, %ymm6, %ymm12
	vsubpd	%ymm7, %ymm1, %ymm7
	vfmadd231pd	(%rsp), %ymm11, %ymm8
	vfmadd231pd	%ymm14, %ymm10, %ymm12
	vsubpd	%ymm12, %ymm8, %ymm8
	vmulpd	96(%rsp), %ymm6, %ymm12
	vmulpd	%ymm0, %ymm6, %ymm6
	vaddpd	544(%rsp), %ymm8, %ymm8
	vmovupd	736(%rsp), %ymm1
	vmovupd	288(%rsp), %ymm10
	vfmadd231pd	%ymm0, %ymm5, %ymm12
	vmulpd	96(%rsp), %ymm5, %ymm5
	vmovupd	416(%rsp), %ymm0
	vaddpd	%ymm12, %ymm9, %ymm9
	vmovupd	32(%rsp), %ymm12
	vsubpd	%ymm9, %ymm1, %ymm9
	vmovupd	128(%rsp), %ymm1
	vfmadd231pd	%ymm12, %ymm10, %ymm6
	vfmadd231pd	%ymm1, %ymm11, %ymm5
	vmovupd	384(%rsp), %ymm10
	vmovupd	%ymm9, 512(%rsp)
	vsubpd	%ymm6, %ymm5, %ymm11
	vmulpd	%ymm1, %ymm0, %ymm5
	vmovupd	(%rsp), %ymm6
	vmovupd	576(%rsp), %ymm1
	vmulpd	%ymm0, %ymm6, %ymm9
	vaddpd	768(%rsp), %ymm11, %ymm11
	vfmadd231pd	%ymm12, %ymm10, %ymm5
	vmovupd	64(%rsp), %ymm12
	vmulpd	%ymm12, %ymm2, %ymm6
	vfmadd231pd	%ymm10, %ymm14, %ymm9
	vfmadd231pd	%ymm15, %ymm4, %ymm6
	vaddpd	%ymm9, %ymm6, %ymm6
	vmulpd	%ymm12, %ymm4, %ymm9
	vmulpd	%ymm15, %ymm2, %ymm12
	vsubpd	%ymm6, %ymm1, %ymm6
	vmovupd	800(%rsp), %ymm1
	vfmadd231pd	(%rsp), %ymm10, %ymm9
	vfmadd231pd	%ymm14, %ymm0, %ymm12
	vsubpd	%ymm12, %ymm9, %ymm9
	vmulpd	96(%rsp), %ymm2, %ymm12
	vmulpd	208(%rsp), %ymm2, %ymm2
	vaddpd	608(%rsp), %ymm9, %ymm9
	vfmadd231pd	208(%rsp), %ymm4, %ymm12
	vmulpd	96(%rsp), %ymm4, %ymm4
	vaddpd	%ymm12, %ymm5, %ymm5
	vfmadd231pd	128(%rsp), %ymm10, %ymm4
	vmovupd	480(%rsp), %ymm10
	vsubpd	%ymm5, %ymm1, %ymm5
	vmovapd	%ymm0, %ymm1
	vmovupd	32(%rsp), %ymm0
	vfmadd231pd	%ymm0, %ymm1, %ymm2
	vmovupd	448(%rsp), %ymm1
	vsubpd	%ymm2, %ymm4, %ymm4
	vmovupd	(%rsp), %ymm2
	vmulpd	%ymm10, %ymm2, %ymm12
	vmulpd	128(%rsp), %ymm10, %ymm2
	vaddpd	832(%rsp), %ymm4, %ymm4
	vfmadd231pd	%ymm1, %ymm14, %ymm12
	vfmadd231pd	%ymm0, %ymm1, %ymm2
	vmovupd	240(%rsp), %ymm0
	vmulpd	64(%rsp), %ymm0, %ymm14
	vfmadd231pd	%ymm15, %ymm3, %ymm14
	vmulpd	%ymm0, %ymm15, %ymm15
	vaddpd	%ymm14, %ymm12, %ymm12
	vmovupd	640(%rsp), %ymm14
	vfmadd231pd	176(%rsp), %ymm10, %ymm15
	vsubpd	%ymm12, %ymm14, %ymm12
	vmulpd	64(%rsp), %ymm3, %ymm14
	vfmadd231pd	(%rsp), %ymm1, %ymm14
	vsubpd	%ymm15, %ymm14, %ymm14
	vaddpd	672(%rsp), %ymm14, %ymm14
	vmulpd	96(%rsp), %ymm0, %ymm15
	vmovupd	208(%rsp), %ymm0
	vfmadd231pd	%ymm0, %ymm3, %ymm15
	vmulpd	96(%rsp), %ymm3, %ymm3
	vaddpd	%ymm15, %ymm2, %ymm2
	vmovupd	864(%rsp), %ymm15
	vfmadd231pd	128(%rsp), %ymm1, %ymm3
	vsubpd	%ymm2, %ymm15, %ymm2
	vmovupd	240(%rsp), %ymm15
	vmulpd	%ymm0, %ymm15, %ymm1
	vpermpd	$68, 352(%rsp), %ymm15
	vpermpd	$238, 352(%rsp), %ymm0
	vfmadd231pd	32(%rsp), %ymm10, %ymm1
	vmovupd	928(%rsp), %ymm10
	vsubpd	%ymm1, %ymm3, %ymm1
	vpermpd	$68, %ymm10, %ymm3
	vpermpd	$238, %ymm10, %ymm10
	vshufpd	$12, %ymm3, %ymm15, %ymm3
	vshufpd	$12, %ymm10, %ymm0, %ymm10
	vpermpd	$68, %ymm7, %ymm15
	vpermpd	$68, %ymm8, %ymm0
	vpermpd	$238, %ymm7, %ymm7
	vpermpd	$238, %ymm8, %ymm8
	vshufpd	$12, %ymm0, %ymm15, %ymm15
	vshufpd	$12, %ymm8, %ymm7, %ymm7
	vpermpd	$68, %ymm9, %ymm0
	vpermpd	$68, %ymm6, %ymm8
	vshufpd	$12, %ymm0, %ymm8, %ymm8
	vpermpd	$238, %ymm6, %ymm6
	vpermpd	$238, %ymm9, %ymm0
	vshufpd	$12, %ymm0, %ymm6, %ymm0
	vpermpd	$68, %ymm12, %ymm9
	vpermpd	$68, %ymm14, %ymm6
	vpermpd	$238, %ymm12, %ymm12
	vpermpd	$238, %ymm14, %ymm14
	vshufpd	$12, %ymm6, %ymm9, %ymm6
	vshufpd	$12, %ymm14, %ymm12, %ymm12
	vpermpd	$68, %ymm8, %ymm9
	vpermpd	$68, %ymm3, %ymm14
	vpermpd	$238, %ymm8, %ymm8
	vpermpd	$238, %ymm3, %ymm3
	vshufpd	$12, %ymm9, %ymm14, %ymm9
	vshufpd	$12, %ymm8, %ymm3, %ymm8
	vpermpd	$68, %ymm10, %ymm14
	vpermpd	$68, %ymm0, %ymm3
	vpermpd	$238, %ymm10, %ymm10
	vpermpd	$238, %ymm0, %ymm0
	vshufpd	$12, %ymm3, %ymm14, %ymm3
	vshufpd	$12, %ymm0, %ymm10, %ymm0
	vpermpd	$68, %ymm15, %ymm14
	vpermpd	$68, %ymm6, %ymm10
	vpermpd	$238, %ymm15, %ymm15
	vpermpd	$238, %ymm6, %ymm6
	vshufpd	$12, %ymm10, %ymm14, %ymm10
	vshufpd	$12, %ymm6, %ymm15, %ymm15
	vpermpd	$68, %ymm7, %ymm14
	vpermpd	$68, %ymm12, %ymm6
	vpermpd	$238, %ymm7, %ymm7
	vpermpd	$238, %ymm12, %ymm12
	vshufpd	$12, %ymm6, %ymm14, %ymm6
	vshufpd	$12, %ymm12, %ymm7, %ymm7
	vpermpd	$68, %ymm10, %ymm14
	vpermpd	$68, %ymm9, %ymm12
	vpermpd	$238, %ymm10, %ymm10
	vpermpd	$238, %ymm9, %ymm9
	vshufpd	$12, %ymm10, %ymm9, %ymm9
	vpermpd	$68, %ymm15, %ymm10
	vmovupd	%ymm9, -224(%rax)
	vpermpd	$238, %ymm15, %ymm15
	vpermpd	$68, %ymm8, %ymm9
	vpermpd	$238, %ymm8, %ymm8
	vshufpd	$12, %ymm10, %ymm9, %ymm9
	vshufpd	$12, %ymm15, %ymm8, %ymm8
	vmovupd	%ymm9, -192(%rax)
	vmovupd	%ymm8, -160(%rax)
	vpermpd	$68, %ymm6, %ymm9
	vpermpd	$68, %ymm3, %ymm8
	vpermpd	$238, %ymm6, %ymm6
	vpermpd	$238, %ymm3, %ymm3
	vshufpd	$12, %ymm6, %ymm3, %ymm3
	vpermpd	$68, %ymm7, %ymm6
	vmovupd	%ymm3, -96(%rax)
	vpermpd	$238, %ymm7, %ymm7
	vpermpd	$68, %ymm0, %ymm3
	vpermpd	$238, %ymm0, %ymm0
	vshufpd	$12, %ymm7, %ymm0, %ymm0
	vmovupd	960(%rsp), %ymm7
	vshufpd	$12, %ymm6, %ymm3, %ymm3
	vshufpd	$12, %ymm14, %ymm12, %ymm12
	vmovupd	%ymm3, -64(%rax)
	vpermpd	$238, %ymm7, %ymm14
	vpermpd	$68, %ymm7, %ymm3
	vmovupd	512(%rsp), %ymm7
	vmovupd	%ymm0, -32(%rax)
	vaddpd	896(%rsp), %ymm1, %ymm1
	vpermpd	$68, %ymm13, %ymm0
	vshufpd	$12, %ymm0, %ymm3, %ymm3
	vpermpd	$68, %ymm7, %ymm6
	vpermpd	$68, %ymm11, %ymm0
	vshufpd	$12, %ymm9, %ymm8, %ymm8
	vshufpd	$12, %ymm0, %ymm6, %ymm6
	vmovupd	%ymm8, -128(%rax)
	vpermpd	$238, %ymm7, %ymm9
	vpermpd	$68, %ymm4, %ymm0
	vpermpd	$68, %ymm5, %ymm8
	vpermpd	$238, %ymm11, %ymm11
	vshufpd	$12, %ymm11, %ymm9, %ymm11
	vshufpd	$12, %ymm0, %ymm8, %ymm8
	vpermpd	$68, %ymm2, %ymm9
	vpermpd	$68, %ymm1, %ymm0
	vshufpd	$12, %ymm0, %ymm9, %ymm9
	vpermpd	$238, %ymm5, %ymm5
	vpermpd	$68, %ymm8, %ymm0
	vpermpd	$68, %ymm3, %ymm7
	vpermpd	$238, %ymm13, %ymm13
	vpermpd	$238, %ymm4, %ymm4
	vshufpd	$12, %ymm4, %ymm5, %ymm4
	vshufpd	$12, %ymm0, %ymm7, %ymm7
	vpermpd	$238, %ymm2, %ymm2
	vpermpd	$68, %ymm4, %ymm0
	vshufpd	$12, %ymm13, %ymm14, %ymm13
	vpermpd	$238, %ymm4, %ymm4
	vpermpd	$68, %ymm13, %ymm10
	vpermpd	$238, %ymm1, %ymm1
	vpermpd	$238, %ymm13, %ymm13
	vshufpd	$12, %ymm1, %ymm2, %ymm1
	vshufpd	$12, %ymm0, %ymm10, %ymm10
	vpermpd	$68, %ymm9, %ymm2
	vshufpd	$12, %ymm4, %ymm13, %ymm0
	vpermpd	$68, %ymm6, %ymm4
	vshufpd	$12, %ymm2, %ymm4, %ymm4
	vpermpd	$238, %ymm8, %ymm8
	vpermpd	$68, %ymm1, %ymm2
	vpermpd	$68, %ymm11, %ymm5
	vpermpd	$238, %ymm3, %ymm3
	vshufpd	$12, %ymm8, %ymm3, %ymm3
	vshufpd	$12, %ymm2, %ymm5, %ymm5
	vpermpd	$68, %ymm4, %ymm8
	vpermpd	$68, %ymm7, %ymm2
	vpermpd	$238, %ymm4, %ymm4
	vpermpd	$238, %ymm6, %ymm6
	vpermpd	$238, %ymm9, %ymm9
	vpermpd	$238, %ymm7, %ymm7
	vmovupd	%ymm12, -256(%rax)
	vshufpd	$12, %ymm9, %ymm6, %ymm6
	vshufpd	$12, %ymm8, %ymm2, %ymm2
	vshufpd	$12, %ymm4, %ymm7, %ymm7
	vmovupd	%ymm2, -256(%r11)
	vpermpd	$68, %ymm6, %ymm4
	vpermpd	$68, %ymm3, %ymm2
	vpermpd	$238, %ymm6, %ymm6
	vpermpd	$238, %ymm3, %ymm3
	vshufpd	$12, %ymm4, %ymm2, %ymm2
	vshufpd	$12, %ymm6, %ymm3, %ymm3
	vmovupd	%ymm2, -192(%r11)
	vmovupd	%ymm3, -160(%r11)
	vpermpd	$68, %ymm10, %ymm2
	vpermpd	$68, %ymm5, %ymm3
	vpermpd	$238, %ymm11, %ymm11
	vpermpd	$238, %ymm1, %ymm1
	vshufpd	$12, %ymm3, %ymm2, %ymm2
	vshufpd	$12, %ymm1, %ymm11, %ymm1
	vmovupd	%ymm2, -128(%r11)
	vpermpd	$68, %ymm1, %ymm3
	vpermpd	$68, %ymm0, %ymm2
	vpermpd	$238, %ymm10, %ymm10
	vpermpd	$238, %ymm5, %ymm5
	vpermpd	$238, %ymm0, %ymm0
	vpermpd	$238, %ymm1, %ymm1
	vshufpd	$12, %ymm5, %ymm10, %ymm5
	vshufpd	$12, %ymm3, %ymm2, %ymm2
	vshufpd	$12, %ymm1, %ymm0, %ymm1
	vmovupd	%ymm7, -224(%r11)
	vmovupd	%ymm5, -96(%r11)
	vmovupd	%ymm2, -64(%r11)
	vmovupd	%ymm1, -32(%r11)
	cmpq	%r9, %rdi
	jne	.L5

That's almost 7 times slower than -O3, 2.4 times slower than the scalar code generated by -O2, and twice as slow as clang -Ofast.
Being twice as slow as clang is no small feat.

I knew about this bug several weeks ago, but somehow didn't realize that 11.0 was so near, so I was too lazy to report it at the time.
Now I am sorry about that.

Sources and compilation scripts for a bigger, more real-world testbench are here:
https://github.com/already5chosen/others/tree/master/cholesky_solver/gcc-badopt-aosoa-caxpy2x2
Comment 1 Richard Biener 2020-11-16 07:21:22 UTC
Let me do some initial analysis.
Comment 2 Richard Biener 2020-11-16 11:11:56 UTC
It's again reassociation making a mess out of the natural SLP opportunity (and thus SLP discovery fails miserably).

One idea worth playing with would be to change reassociation to rank references
from the same load group (as later vectorization would discover) the same.

That said, further analysis and maybe a smaller testcase to look at would be useful
here.  There is, after all, the opportunity to turn "bad" association at the
source level into good association for vectorization when -ffast-math is enabled as well.
Comment 3 Michael_S 2020-11-16 20:11:37 UTC
(In reply to Richard Biener from comment #2)
> It's again reassociation making a mess out of the natural SLP opportunity
> (and thus SLP discovery fails miserably).
> 
> One idea worth playing with would be to change reassociation to rank
> references
> from the same load group (as later vectorization would discover) the same.
> 
> That said, further analysis and maybe a smaller testcase to look at is useful
> here.  There is, after all, the opportunity to turn "bad" association at the
> source level to good for vectorization when -ffast-math is enabled as well.

It turns out a much simpler kernel suffers from the same problem.

void foo1x1(double* restrict y, const double* restrict x, int clen)
{
  int xi = clen & 2;
  double f_re = x[0+xi+0];
  double f_im = x[4+xi+0];
  int clen2 = (clen+xi) * 2;
  #pragma GCC unroll 0
  for (int c = 0; c < clen2; c += 8) {
    // y[c] = y[c] - x[c]*conj(f);
    #pragma GCC unroll 4
    for (int k = 0; k < 4; ++k) {
      double x_re = x[c+0+k];
      double x_im = x[c+4+k];
      double y_re = y[c+0+k];
      double y_im = y[c+4+k];
      y_re = y_re - x_re * f_re - x_im * f_im;
      y_im = y_im + x_re * f_im - x_im * f_re;
      y[c+0+k] = y_re;
      y[c+4+k] = y_im;
    }
  }
}

Maybe it's possible to simplify further, but probably not by much.
Comment 4 Richard Biener 2020-11-17 09:21:11 UTC
Ah, thanks - that helps.  So we're re-associating from

  *_89 = (((*_89) - (f_re_34 * x_re_82)) - (f_im_35 * x_im_88));
  *_91 = (((*_91) + (f_im_35 * x_re_82)) - (f_re_34 * x_im_88));

to

  *_89 = ((*_89) - ((f_re_34 * x_re_82) + (f_im_35 * x_im_88)));
  *_91 = (((*_91) + (f_im_35 * x_re_82)) - (f_re_34 * x_im_88));

which makes the operations unbalanced.  This is (a - b) - c -> a - (b + c),
as we're optimizing this as a + -b + -c.

Even smaller testcase:

double a[1024], b[1024], c[1024];

void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = a[2*i] + b[2*i] - c[2*i];
      a[2*i+1] = a[2*i+1] - b[2*i+1] - c[2*i+1];
    }
}

here ranks end up associating the expr as (-b + -c) + a, and negate
re-propagation goes (-b - c) + a -> -(b + c) + a -> a - (b + c),
which is all sensible in isolation.

You could say that associating as (-b + -c) + a is worse than
(a + -b) + -c in this respect.  Ranks are

Rank for _8 is 327683 (a)
Rank for _13 is 327684 (-b)
Rank for _21 is 327684 (-c)

where the rank is one more for the negated values because of the
negate operation.  While heuristically ignoring negates for rank
propagation to make all ranks equal helps this new testcase, it
doesn't help for the larger two.

It might still be a generally sound heuristic improvement though.

For the effects on vectorization I think we need to do something in the
vectorizer itself, for example linearizing expressions.  The
first reassoc pass is supposed to do this, but then negate
re-propagation undoes it in this case - which maybe points to
that being what needs fixing: somehow associating a non-negated
operand first.
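
To make the two shapes concrete, here is the second store of the testcase
above written out in both forms (an illustration only, not compiler output):

/* Shape after negate re-propagation: lane 1 no longer matches the
   (a + b) - c chain of lane 0, so SLP discovery fails.  */
a[2*i+1] = a[2*i+1] - (b[2*i+1] + c[2*i+1]);

/* Linear shape that keeps the chain parallel to lane 0 and which the
   vectorizer could match lane-wise.  */
a[2*i+1] = (a[2*i+1] - b[2*i+1]) - c[2*i+1];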
Comment 5 Richard Biener 2020-11-17 10:18:09 UTC
OK, so I have a patch to keep the association linear, which IMHO is good.  It fixes the smaller testcase and my testcase, but not the original one, which now is
linear but still not homogeneous.  The store groups are as follows

  *_115 = (((((*_115) - (f00_re_68 * x0_re_108)) - (f10_re_70 * x1_re_140)) - (f00_im_73 * x0_im_114)) - (f10_im_74 * x1_im_142));
  *_117 = (((((*_117) + (f00_im_73 * x0_re_108)) + (f10_im_74 * x1_re_140)) - (f00_re_68 * x0_im_114)) - (f10_re_70 * x1_im_142));
  *_119 = (((((*_119) - (f01_re_71 * x0_re_108)) - (f11_re_72 * x1_re_140)) - (f01_im_75 * x0_im_114)) - (f11_im_76 * x1_im_142));
  *_121 = (((((*_121) + (f01_im_75 * x0_re_108)) + (f11_im_76 * x1_re_140)) - (f01_re_71 * x0_im_114)) - (f11_re_72 * x1_im_142));

(good)

  *_177 = (((((*_177) - (f00_re_68 * x0_re_170)) - (f00_im_73 * x0_im_176)) - (f10_re_70 * x1_re_202)) - (f10_im_74 * x1_im_204));
  *_179 = (((((f00_im_73 * x0_re_170) + (f10_im_74 * x1_re_202)) + (*_179)) - (f00_re_68 * x0_im_176)) - (f10_re_70 * x1_im_204));
  *_181 = (((((*_181) - (f01_re_71 * x0_re_170)) - (f01_im_75 * x0_im_176)) - (f11_re_72 * x1_re_202)) - (f11_im_76 * x1_im_204));
  *_183 = (((((f01_im_75 * x0_re_170) + (f11_im_76 * x1_re_202)) + (*_183)) - (f01_re_71 * x0_im_176)) - (f11_re_72 * x1_im_204));

already bad.  Now, this is something to tackle in the vectorizer, which ideally
should not try to match up individual adds during SLP discovery but
instead (if association is allowed) the whole addition chain, commutating
within the whole chain rather than just swapping individual add operands.

I still think the reassoc change I came up with is good since it avoids
the need to linearize in the vectorizer.  So I am testing that now.
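
As an illustration of what commutating within the whole chain would buy here
(reconstructed from the chains above, not a compiler dump), the second store
of the bad group can be rewritten as

  *_179 = (((((*_179) + (f00_im_73 * x0_re_170)) + (f10_im_74 * x1_re_202)) - (f00_re_68 * x0_im_176)) - (f10_re_70 * x1_im_204));

which has exactly the shape of *_117 in the good group, so the loads and
products line up lane-wise - a form that per-operand swapping during SLP
discovery cannot reach.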
Comment 6 Richard Biener 2020-11-18 08:53:15 UTC
So for example we'd like to vectorize with SLP when reassociation is permitted
(thus with -Ofast for example):

double a[1024], b[1024], c[1024];

void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = 1. - a[2*i] + b[2*i];
      a[2*i+1] = a[2*i+1] + b[2*i+1] + 1.;
    }
}

it again works when written as follows and with -fno-tree-reassoc

double a[1024], b[1024], c[1024];

void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = 1. - a[2*i] + b[2*i];
      a[2*i+1] = 1 + a[2*i+1] + b[2*i+1];
    }
}
Comment 7 Richard Biener 2020-11-18 09:15:07 UTC
Or

double a[1024], b[1024], c[1024];

void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = 1. - a[2*i] + b[2*i];
      a[2*i+1] = 1 + a[2*i+1] - b[2*i+1];
    }
}

which early folding breaks unless we add -fno-associative-math.  We then
end up with

  a[_1] = (((b[_1]) - (a[_1])) + 1.0e+0);
  a[_6] = (((a[_6]) - (b[_6])) + 1.0e+0);

where SLP operator swapping cannot bring the grouped loads into
the same lanes.

So the idea is to look at single-use chains of plus/minus operations and
handle those as wide associated SLP nodes with flags denoting which lanes
need negation.  We'd have three children and each child has a per-lane
spec of whether to add or subtract.
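
A minimal sketch of that representation (names and layout invented for
illustration; this is not GCC's actual data structure):

/* A wide plus/minus-chain SLP node for the two-lane example above:
   three children and, for each child, a per-lane sign.  */
enum lane_sign { LANE_PLUS, LANE_MINUS };

struct pm_chain_node {
  int nlanes;                 /* 2 lanes: a[2*i] and a[2*i+1]  */
  int nchildren;              /* 3 operands in the chain       */
  enum lane_sign sign[3][2];  /* sign[child][lane]             */
};

/* For
     a[2*i]   = 1. - a[2*i]   + b[2*i];
     a[2*i+1] = 1  + a[2*i+1] - b[2*i+1];
   the children are {1., 1.}, {a[2*i], a[2*i+1]} and {b[2*i], b[2*i+1]}
   with per-lane signs {+, +}, {-, +} and {+, -} respectively.  */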
Comment 8 Richard Biener 2020-11-18 13:23:52 UTC
Created attachment 49586 [details]
prototype

This is a prototype patch which can serve as a proof of concept.  It needs cleanup plus better handling of hybrid SLP discovery.

It depends on https://gcc.gnu.org/pipermail/gcc-patches/2020-November/559347.html to fix the testcase in this PR (which is included in the patch).
Comment 9 Richard Biener 2020-11-18 13:39:57 UTC
There's then also a permute optimization left on the plate:

t.c:16:3: note:   node 0x3a19590 (max_nunits=4, refcnt=2)
t.c:16:3: note:         stmt 0 _153 = f11_im_76 * x1_im_142;
t.c:16:3: note:         stmt 1 _213 = f11_re_72 * x1_re_202;
t.c:16:3: note:         stmt 2 _275 = f11_re_72 * x1_re_264;
t.c:16:3: note:         stmt 3 _337 = f11_re_72 * x1_re_326;
t.c:16:3: note:         stmt 4 _155 = f11_im_76 * x1_re_140;
t.c:16:3: note:         stmt 5 _217 = f11_im_76 * x1_re_202;
t.c:16:3: note:         stmt 6 _279 = f11_im_76 * x1_re_264;
t.c:16:3: note:         stmt 7 _341 = f11_im_76 * x1_re_326;
t.c:16:3: note:         children 0x3a19600 0x3a19670
t.c:16:3: note:   node (external) 0x3a19600 (max_nunits=1, refcnt=1)
t.c:16:3: note:         { f11_im_76, f11_re_72, f11_re_72, f11_re_72, f11_im_76, f11_im_76, f11_im_76, f11_im_76 }
t.c:16:3: note:   node 0x3a19670 (max_nunits=4, refcnt=1)
t.c:16:3: note:         stmt 0 x1_im_142 = *_141;
t.c:16:3: note:         stmt 1 x1_re_202 = *_201;
t.c:16:3: note:         stmt 2 x1_re_264 = *_263;
t.c:16:3: note:         stmt 3 x1_re_326 = *_325;
t.c:16:3: note:         stmt 4 x1_re_140 = *_139;
t.c:16:3: note:         stmt 5 x1_re_202 = *_201;
t.c:16:3: note:         stmt 6 x1_re_264 = *_263;
t.c:16:3: note:         stmt 7 x1_re_326 = *_325;
t.c:16:3: note:         load permutation { 4 1 2 3 0 1 2 3 }

which we currently do not handle (there's a FIXME about permuting externals;
currently we only handle splats as transparent for permutes).
Comment 10 Michael_S 2020-11-19 19:55:08 UTC
I lost track of what you're talking about a long time ago.
But that's OK.
Comment 11 Richard Biener 2020-11-20 07:10:57 UTC
(In reply to Michael_S from comment #10)
> I lost track of what you're talking about a long time ago.
> But that's OK.

No problem - difficult PRs tend to be used as media to brain-dump and record work progress.