I am reporting under 'target' because AVX2+FMA is the only 256-bit SIMD platform I have to play with. If it's really tree-optimization, please change the component.

void foo(double* restrict y, const double* restrict x0, const double* restrict x1, int clen)
{
  int xi = clen & 2;
  double f00_re = x0[0+xi+0];
  double f10_re = x1[0+xi+0];
  double f01_re = x0[0+xi+1];
  double f11_re = x1[0+xi+1];

  double f00_im = x0[4+xi+0];
  double f10_im = x1[4+xi+0];
  double f01_im = x0[4+xi+1];
  double f11_im = x1[4+xi+1];

  int clen2 = (clen+xi) * 2;
  double* y0 = &y[0];
  double* y1 = &y[clen2];
  #pragma GCC unroll 0
  for (int c = 0; c < clen2; c += 8) {
    // y0[c] = y0[c] - x0[c]*conj(f00) - x1[c]*conj(f10);
    // y1[c] = y1[c] - x0[c]*conj(f01) - x1[c]*conj(f11);
    #pragma GCC unroll 4
    for (int k = 0; k < 4; ++k) {
      double x0_re = x0[c+0+k];
      double x0_im = x0[c+4+k];
      double y0_re = y0[c+0+k];
      double y0_im = y0[c+4+k];
      double y1_re = y1[c+0+k];
      double y1_im = y1[c+4+k];

      y0_re = y0_re - x0_re * f00_re - x0_im * f00_im;
      y0_im = y0_im + x0_re * f00_im - x0_im * f00_re;
      y1_re = y1_re - x0_re * f01_re - x0_im * f01_im;
      y1_im = y1_im + x0_re * f01_im - x0_im * f01_re;

      double x1_re = x1[c+0+k];
      double x1_im = x1[c+4+k];

      y0_re = y0_re - x1_re * f10_re - x1_im * f10_im;
      y0_im = y0_im + x1_re * f10_im - x1_im * f10_re;
      y1_re = y1_re - x1_re * f11_re - x1_im * f11_im;
      y1_im = y1_im + x1_re * f11_im - x1_im * f11_re;

      y0[c+0+k] = y0_re;
      y0[c+4+k] = y0_im;
      y1[c+0+k] = y1_re;
      y1[c+4+k] = y1_im;
    }
  }
}

When compiled with 'gcc 10.2 -march=skylake -O3' it produces pretty decent code. The only problem is over-aggressive load+op combining, similar to what we already discussed in PR 97127. It seems this problem can't be solved without a major overhaul of the gcc optimizer architecture, but luckily the impact is quite minor. But when we compile with 'gcc 10.2
-march=skylake -Ofast' the fun begins: .L5: vmovupd (%r9), %ymm7 vmovupd 64(%r9), %ymm6 vunpcklpd 32(%r9), %ymm7, %ymm2 vunpckhpd 32(%r9), %ymm7, %ymm0 vmovupd 64(%r9), %ymm7 vmovupd 192(%r9), %ymm4 vunpckhpd 96(%r9), %ymm7, %ymm5 vmovupd 128(%r9), %ymm7 vunpcklpd 96(%r9), %ymm6, %ymm6 vunpcklpd 160(%r9), %ymm7, %ymm3 vunpckhpd 160(%r9), %ymm7, %ymm1 vmovupd 192(%r9), %ymm7 vunpcklpd 224(%r9), %ymm4, %ymm4 vunpckhpd 224(%r9), %ymm7, %ymm8 vpermpd $216, %ymm6, %ymm6 vpermpd $216, %ymm5, %ymm5 vpermpd $216, %ymm4, %ymm4 vpermpd $216, %ymm8, %ymm8 vpermpd $216, %ymm2, %ymm2 vpermpd $216, %ymm0, %ymm0 vpermpd $216, %ymm3, %ymm3 vpermpd $216, %ymm1, %ymm1 vunpcklpd %ymm6, %ymm2, %ymm7 vunpckhpd %ymm6, %ymm2, %ymm2 vunpcklpd %ymm4, %ymm3, %ymm6 vunpckhpd %ymm4, %ymm3, %ymm3 vunpcklpd %ymm5, %ymm0, %ymm4 vunpckhpd %ymm5, %ymm0, %ymm0 vunpcklpd %ymm8, %ymm1, %ymm5 vpermpd $216, %ymm5, %ymm5 vpermpd $216, %ymm4, %ymm4 vpermpd $216, %ymm3, %ymm3 vunpcklpd %ymm5, %ymm4, %ymm11 vpermpd $216, %ymm2, %ymm2 vunpckhpd %ymm5, %ymm4, %ymm4 vunpckhpd %ymm8, %ymm1, %ymm1 vpermpd $216, %ymm0, %ymm0 vpermpd $216, %ymm4, %ymm8 vpermpd $216, %ymm1, %ymm1 vunpcklpd %ymm3, %ymm2, %ymm4 vunpckhpd %ymm3, %ymm2, %ymm2 vpermpd $216, %ymm2, %ymm5 vunpcklpd %ymm1, %ymm0, %ymm2 vpermpd $216, %ymm4, %ymm10 vpermpd $216, %ymm2, %ymm4 vmovupd 64(%rax), %ymm2 vmovupd (%rax), %ymm3 vmovupd %ymm4, 448(%rsp) vunpckhpd 96(%rax), %ymm2, %ymm4 vmovupd 128(%rax), %ymm2 vpermpd $216, %ymm6, %ymm6 vunpckhpd %ymm1, %ymm0, %ymm1 vpermpd $216, %ymm7, %ymm7 vunpcklpd 32(%rax), %ymm3, %ymm9 vunpckhpd 32(%rax), %ymm3, %ymm14 vunpckhpd 160(%rax), %ymm2, %ymm0 vmovupd 64(%rax), %ymm3 vunpcklpd %ymm6, %ymm7, %ymm12 vunpckhpd %ymm6, %ymm7, %ymm7 vpermpd $216, %ymm1, %ymm6 vunpcklpd 160(%rax), %ymm2, %ymm1 vmovupd 192(%rax), %ymm2 vunpcklpd 96(%rax), %ymm3, %ymm3 vmovupd %ymm5, 416(%rsp) vunpcklpd 224(%rax), %ymm2, %ymm5 vunpckhpd 224(%rax), %ymm2, %ymm2 vpermpd $216, %ymm3, %ymm3 vpermpd $216, %ymm5, %ymm5 vpermpd $216, %ymm9, %ymm9 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm4, %ymm4 vpermpd $216, %ymm0, %ymm0 vmovupd %ymm10, 384(%rsp) vpermpd $216, %ymm14, %ymm14 vunpcklpd %ymm3, %ymm9, %ymm10 vpermpd $216, %ymm2, %ymm2 vunpckhpd %ymm3, %ymm9, %ymm9 vunpcklpd %ymm5, %ymm1, %ymm3 vpermpd $216, %ymm3, %ymm3 vmovupd %ymm8, 288(%rsp) vpermpd $216, %ymm10, %ymm10 vunpcklpd %ymm4, %ymm14, %ymm8 vunpckhpd %ymm4, %ymm14, %ymm14 vunpcklpd %ymm2, %ymm0, %ymm4 vpermpd $216, %ymm4, %ymm4 vpermpd $216, %ymm8, %ymm8 vunpckhpd %ymm2, %ymm0, %ymm2 vunpcklpd %ymm3, %ymm10, %ymm0 vpermpd $216, %ymm0, %ymm13 vunpcklpd %ymm4, %ymm8, %ymm0 vunpckhpd %ymm4, %ymm8, %ymm8 vpermpd $216, %ymm2, %ymm2 vunpckhpd %ymm3, %ymm10, %ymm10 vpermpd $216, %ymm14, %ymm14 vpermpd $216, %ymm0, %ymm3 vpermpd $216, %ymm8, %ymm0 vmovupd %ymm6, 480(%rsp) vunpckhpd %ymm5, %ymm1, %ymm1 vmovupd %ymm3, 512(%rsp) vmovupd (%rsi), %ymm3 vmovupd %ymm0, 544(%rsp) vunpcklpd %ymm2, %ymm14, %ymm0 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm0, %ymm4 vpermpd $216, %ymm9, %ymm9 vunpcklpd %ymm1, %ymm9, %ymm6 vmovupd %ymm4, 640(%rsp) vunpckhpd %ymm1, %ymm9, %ymm9 vunpcklpd 32(%rsi), %ymm3, %ymm4 vunpckhpd 32(%rsi), %ymm3, %ymm1 vmovupd 64(%rsi), %ymm3 vunpckhpd %ymm2, %ymm14, %ymm14 vunpcklpd 96(%rsi), %ymm3, %ymm8 vunpckhpd 96(%rsi), %ymm3, %ymm5 vmovupd 128(%rsi), %ymm3 vpermpd $216, %ymm14, %ymm2 vunpckhpd 160(%rsi), %ymm3, %ymm0 vmovupd %ymm2, 672(%rsp) vunpcklpd 160(%rsi), %ymm3, %ymm2 vmovupd 192(%rsi), %ymm3 vmovupd 192(%rsi), %ymm14 vunpcklpd 224(%rsi), %ymm3, %ymm3 vpermpd $216, %ymm9, 
%ymm9 vmovupd %ymm9, 608(%rsp) vunpckhpd 224(%rsi), %ymm14, %ymm9 vpermpd $216, %ymm8, %ymm8 vpermpd $216, %ymm3, %ymm3 vpermpd $216, %ymm6, %ymm6 vpermpd $216, %ymm4, %ymm4 vpermpd $216, %ymm2, %ymm2 vpermpd $216, %ymm5, %ymm5 vpermpd $216, %ymm9, %ymm9 vmovupd %ymm6, 576(%rsp) vpermpd $216, %ymm1, %ymm1 vunpcklpd %ymm8, %ymm4, %ymm6 vpermpd $216, %ymm0, %ymm0 vunpckhpd %ymm8, %ymm4, %ymm4 vunpcklpd %ymm3, %ymm2, %ymm8 vpermpd $216, %ymm8, %ymm8 vpermpd $216, %ymm6, %ymm6 vunpckhpd %ymm3, %ymm2, %ymm2 vunpcklpd %ymm5, %ymm1, %ymm3 vunpckhpd %ymm5, %ymm1, %ymm1 vunpcklpd %ymm9, %ymm0, %ymm5 vpermpd $216, %ymm2, %ymm2 vpermpd $216, %ymm5, %ymm5 vunpcklpd %ymm8, %ymm6, %ymm14 vpermpd $216, %ymm4, %ymm4 vunpckhpd %ymm8, %ymm6, %ymm6 vpermpd $216, %ymm3, %ymm3 vunpckhpd %ymm9, %ymm0, %ymm0 vpermpd $216, %ymm6, %ymm9 vunpcklpd %ymm5, %ymm3, %ymm6 vunpckhpd %ymm5, %ymm3, %ymm3 vunpcklpd %ymm2, %ymm4, %ymm5 vunpckhpd %ymm2, %ymm4, %ymm4 vpermpd $216, %ymm0, %ymm0 vpermpd $216, %ymm4, %ymm2 vpermpd $216, %ymm1, %ymm1 vmovupd %ymm2, 832(%rsp) vunpcklpd %ymm0, %ymm1, %ymm2 vunpckhpd %ymm0, %ymm1, %ymm1 vpermpd $216, %ymm1, %ymm0 vmovupd %ymm0, 896(%rsp) vmovupd (%rbx), %ymm0 vpermpd $216, %ymm2, %ymm4 vunpckhpd 32(%rbx), %ymm0, %ymm1 vunpcklpd 32(%rbx), %ymm0, %ymm2 vmovupd 64(%rbx), %ymm0 vpermpd $216, %ymm5, %ymm5 vmovupd %ymm5, 800(%rsp) vmovupd %ymm4, 864(%rsp) vunpcklpd 96(%rbx), %ymm0, %ymm5 vunpckhpd 96(%rbx), %ymm0, %ymm4 vmovupd 128(%rbx), %ymm0 vpermpd $216, %ymm6, %ymm6 vpermpd $216, %ymm3, %ymm3 vmovupd %ymm9, 704(%rsp) vmovupd %ymm6, 736(%rsp) vmovupd %ymm3, 768(%rsp) vunpcklpd 160(%rbx), %ymm0, %ymm3 vmovupd 192(%rbx), %ymm8 vunpckhpd 160(%rbx), %ymm0, %ymm0 vunpcklpd 224(%rbx), %ymm8, %ymm6 vunpckhpd 224(%rbx), %ymm8, %ymm9 vpermpd $216, %ymm5, %ymm5 vpermpd $216, %ymm4, %ymm4 vpermpd $216, %ymm6, %ymm6 vpermpd $216, %ymm9, %ymm9 vpermpd $216, %ymm2, %ymm2 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm3, %ymm3 vpermpd $216, %ymm0, %ymm0 vunpcklpd %ymm5, %ymm2, %ymm8 vunpckhpd %ymm5, %ymm2, %ymm2 vunpcklpd %ymm6, %ymm3, %ymm5 vunpckhpd %ymm6, %ymm3, %ymm3 vunpcklpd %ymm4, %ymm1, %ymm6 vunpckhpd %ymm4, %ymm1, %ymm1 vunpcklpd %ymm9, %ymm0, %ymm4 vunpckhpd %ymm9, %ymm0, %ymm0 vpermpd $216, %ymm5, %ymm5 vpermpd $216, %ymm3, %ymm3 vpermpd $216, %ymm4, %ymm4 vpermpd $216, %ymm0, %ymm0 vpermpd $216, %ymm8, %ymm8 vpermpd $216, %ymm2, %ymm2 vpermpd $216, %ymm6, %ymm6 vpermpd $216, %ymm1, %ymm1 vunpcklpd %ymm5, %ymm8, %ymm9 vunpckhpd %ymm5, %ymm8, %ymm8 vunpcklpd %ymm4, %ymm6, %ymm5 vunpckhpd %ymm4, %ymm6, %ymm6 vunpcklpd %ymm3, %ymm2, %ymm4 vunpckhpd %ymm3, %ymm2, %ymm2 vunpcklpd %ymm0, %ymm1, %ymm3 vunpckhpd %ymm0, %ymm1, %ymm1 vpermpd $216, %ymm9, %ymm9 vpermpd $216, %ymm8, %ymm8 vpermpd $216, %ymm1, %ymm0 vpermpd $216, %ymm10, %ymm15 vmovupd %ymm0, 240(%rsp) vmulpd 320(%rsp), %ymm9, %ymm10 vmulpd 64(%rsp), %ymm8, %ymm0 vmovupd (%rsp), %ymm1 vpermpd $216, %ymm12, %ymm12 vpermpd $216, %ymm7, %ymm7 vfmadd231pd 176(%rsp), %ymm12, %ymm10 vfmadd231pd %ymm1, %ymm7, %ymm0 vpermpd $216, %ymm14, %ymm14 vpermpd $216, %ymm11, %ymm11 vpermpd $216, %ymm6, %ymm6 vpermpd $216, %ymm5, %ymm5 vaddpd %ymm10, %ymm0, %ymm0 vmulpd 64(%rsp), %ymm9, %ymm10 vpermpd $216, %ymm2, %ymm2 vsubpd %ymm0, %ymm13, %ymm0 vmulpd 320(%rsp), %ymm8, %ymm13 vpermpd $216, %ymm4, %ymm4 vfmadd231pd %ymm1, %ymm12, %ymm10 vmovupd %ymm0, 352(%rsp) vmovupd 208(%rsp), %ymm0 vfmadd231pd 176(%rsp), %ymm7, %ymm13 vpermpd $216, %ymm3, %ymm3 addq $256, %r9 addq $256, %rax addq $256, %rsi vsubpd %ymm13, %ymm10, %ymm10 vmulpd %ymm0, %ymm9, %ymm13 
vmulpd 96(%rsp), %ymm9, %ymm9 vaddpd %ymm15, %ymm10, %ymm10 vmulpd 96(%rsp), %ymm8, %ymm15 vmulpd %ymm0, %ymm8, %ymm8 vmovupd %ymm10, 928(%rsp) vmovupd 128(%rsp), %ymm10 vfmadd231pd 32(%rsp), %ymm12, %ymm13 vfmadd231pd %ymm10, %ymm12, %ymm9 vmovupd 32(%rsp), %ymm12 vfmadd231pd %ymm10, %ymm7, %ymm15 vfmadd231pd %ymm12, %ymm7, %ymm8 vmovupd (%rsp), %ymm7 addq $256, %rbx addq $256, %r11 vaddpd %ymm15, %ymm13, %ymm13 vsubpd %ymm8, %ymm9, %ymm9 vmovapd %ymm10, %ymm15 vmovupd 288(%rsp), %ymm10 vsubpd %ymm13, %ymm14, %ymm1 vaddpd 704(%rsp), %ymm9, %ymm13 vmulpd %ymm15, %ymm10, %ymm9 vmulpd %ymm10, %ymm7, %ymm7 vmovupd 176(%rsp), %ymm14 vmovupd 320(%rsp), %ymm15 vmovupd %ymm1, 960(%rsp) vfmadd231pd %ymm12, %ymm11, %ymm9 vmovupd 64(%rsp), %ymm12 vfmadd231pd %ymm14, %ymm11, %ymm7 vmulpd %ymm12, %ymm6, %ymm8 vmovupd 512(%rsp), %ymm1 vfmadd231pd %ymm15, %ymm5, %ymm8 vaddpd %ymm8, %ymm7, %ymm7 vmulpd %ymm12, %ymm5, %ymm8 vmulpd %ymm15, %ymm6, %ymm12 vsubpd %ymm7, %ymm1, %ymm7 vfmadd231pd (%rsp), %ymm11, %ymm8 vfmadd231pd %ymm14, %ymm10, %ymm12 vsubpd %ymm12, %ymm8, %ymm8 vmulpd 96(%rsp), %ymm6, %ymm12 vmulpd %ymm0, %ymm6, %ymm6 vaddpd 544(%rsp), %ymm8, %ymm8 vmovupd 736(%rsp), %ymm1 vmovupd 288(%rsp), %ymm10 vfmadd231pd %ymm0, %ymm5, %ymm12 vmulpd 96(%rsp), %ymm5, %ymm5 vmovupd 416(%rsp), %ymm0 vaddpd %ymm12, %ymm9, %ymm9 vmovupd 32(%rsp), %ymm12 vsubpd %ymm9, %ymm1, %ymm9 vmovupd 128(%rsp), %ymm1 vfmadd231pd %ymm12, %ymm10, %ymm6 vfmadd231pd %ymm1, %ymm11, %ymm5 vmovupd 384(%rsp), %ymm10 vmovupd %ymm9, 512(%rsp) vsubpd %ymm6, %ymm5, %ymm11 vmulpd %ymm1, %ymm0, %ymm5 vmovupd (%rsp), %ymm6 vmovupd 576(%rsp), %ymm1 vmulpd %ymm0, %ymm6, %ymm9 vaddpd 768(%rsp), %ymm11, %ymm11 vfmadd231pd %ymm12, %ymm10, %ymm5 vmovupd 64(%rsp), %ymm12 vmulpd %ymm12, %ymm2, %ymm6 vfmadd231pd %ymm10, %ymm14, %ymm9 vfmadd231pd %ymm15, %ymm4, %ymm6 vaddpd %ymm9, %ymm6, %ymm6 vmulpd %ymm12, %ymm4, %ymm9 vmulpd %ymm15, %ymm2, %ymm12 vsubpd %ymm6, %ymm1, %ymm6 vmovupd 800(%rsp), %ymm1 vfmadd231pd (%rsp), %ymm10, %ymm9 vfmadd231pd %ymm14, %ymm0, %ymm12 vsubpd %ymm12, %ymm9, %ymm9 vmulpd 96(%rsp), %ymm2, %ymm12 vmulpd 208(%rsp), %ymm2, %ymm2 vaddpd 608(%rsp), %ymm9, %ymm9 vfmadd231pd 208(%rsp), %ymm4, %ymm12 vmulpd 96(%rsp), %ymm4, %ymm4 vaddpd %ymm12, %ymm5, %ymm5 vfmadd231pd 128(%rsp), %ymm10, %ymm4 vmovupd 480(%rsp), %ymm10 vsubpd %ymm5, %ymm1, %ymm5 vmovapd %ymm0, %ymm1 vmovupd 32(%rsp), %ymm0 vfmadd231pd %ymm0, %ymm1, %ymm2 vmovupd 448(%rsp), %ymm1 vsubpd %ymm2, %ymm4, %ymm4 vmovupd (%rsp), %ymm2 vmulpd %ymm10, %ymm2, %ymm12 vmulpd 128(%rsp), %ymm10, %ymm2 vaddpd 832(%rsp), %ymm4, %ymm4 vfmadd231pd %ymm1, %ymm14, %ymm12 vfmadd231pd %ymm0, %ymm1, %ymm2 vmovupd 240(%rsp), %ymm0 vmulpd 64(%rsp), %ymm0, %ymm14 vfmadd231pd %ymm15, %ymm3, %ymm14 vmulpd %ymm0, %ymm15, %ymm15 vaddpd %ymm14, %ymm12, %ymm12 vmovupd 640(%rsp), %ymm14 vfmadd231pd 176(%rsp), %ymm10, %ymm15 vsubpd %ymm12, %ymm14, %ymm12 vmulpd 64(%rsp), %ymm3, %ymm14 vfmadd231pd (%rsp), %ymm1, %ymm14 vsubpd %ymm15, %ymm14, %ymm14 vaddpd 672(%rsp), %ymm14, %ymm14 vmulpd 96(%rsp), %ymm0, %ymm15 vmovupd 208(%rsp), %ymm0 vfmadd231pd %ymm0, %ymm3, %ymm15 vmulpd 96(%rsp), %ymm3, %ymm3 vaddpd %ymm15, %ymm2, %ymm2 vmovupd 864(%rsp), %ymm15 vfmadd231pd 128(%rsp), %ymm1, %ymm3 vsubpd %ymm2, %ymm15, %ymm2 vmovupd 240(%rsp), %ymm15 vmulpd %ymm0, %ymm15, %ymm1 vpermpd $68, 352(%rsp), %ymm15 vpermpd $238, 352(%rsp), %ymm0 vfmadd231pd 32(%rsp), %ymm10, %ymm1 vmovupd 928(%rsp), %ymm10 vsubpd %ymm1, %ymm3, %ymm1 vpermpd $68, %ymm10, %ymm3 vpermpd $238, %ymm10, %ymm10 vshufpd $12, %ymm3, 
%ymm15, %ymm3 vshufpd $12, %ymm10, %ymm0, %ymm10 vpermpd $68, %ymm7, %ymm15 vpermpd $68, %ymm8, %ymm0 vpermpd $238, %ymm7, %ymm7 vpermpd $238, %ymm8, %ymm8 vshufpd $12, %ymm0, %ymm15, %ymm15 vshufpd $12, %ymm8, %ymm7, %ymm7 vpermpd $68, %ymm9, %ymm0 vpermpd $68, %ymm6, %ymm8 vshufpd $12, %ymm0, %ymm8, %ymm8 vpermpd $238, %ymm6, %ymm6 vpermpd $238, %ymm9, %ymm0 vshufpd $12, %ymm0, %ymm6, %ymm0 vpermpd $68, %ymm12, %ymm9 vpermpd $68, %ymm14, %ymm6 vpermpd $238, %ymm12, %ymm12 vpermpd $238, %ymm14, %ymm14 vshufpd $12, %ymm6, %ymm9, %ymm6 vshufpd $12, %ymm14, %ymm12, %ymm12 vpermpd $68, %ymm8, %ymm9 vpermpd $68, %ymm3, %ymm14 vpermpd $238, %ymm8, %ymm8 vpermpd $238, %ymm3, %ymm3 vshufpd $12, %ymm9, %ymm14, %ymm9 vshufpd $12, %ymm8, %ymm3, %ymm8 vpermpd $68, %ymm10, %ymm14 vpermpd $68, %ymm0, %ymm3 vpermpd $238, %ymm10, %ymm10 vpermpd $238, %ymm0, %ymm0 vshufpd $12, %ymm3, %ymm14, %ymm3 vshufpd $12, %ymm0, %ymm10, %ymm0 vpermpd $68, %ymm15, %ymm14 vpermpd $68, %ymm6, %ymm10 vpermpd $238, %ymm15, %ymm15 vpermpd $238, %ymm6, %ymm6 vshufpd $12, %ymm10, %ymm14, %ymm10 vshufpd $12, %ymm6, %ymm15, %ymm15 vpermpd $68, %ymm7, %ymm14 vpermpd $68, %ymm12, %ymm6 vpermpd $238, %ymm7, %ymm7 vpermpd $238, %ymm12, %ymm12 vshufpd $12, %ymm6, %ymm14, %ymm6 vshufpd $12, %ymm12, %ymm7, %ymm7 vpermpd $68, %ymm10, %ymm14 vpermpd $68, %ymm9, %ymm12 vpermpd $238, %ymm10, %ymm10 vpermpd $238, %ymm9, %ymm9 vshufpd $12, %ymm10, %ymm9, %ymm9 vpermpd $68, %ymm15, %ymm10 vmovupd %ymm9, -224(%rax) vpermpd $238, %ymm15, %ymm15 vpermpd $68, %ymm8, %ymm9 vpermpd $238, %ymm8, %ymm8 vshufpd $12, %ymm10, %ymm9, %ymm9 vshufpd $12, %ymm15, %ymm8, %ymm8 vmovupd %ymm9, -192(%rax) vmovupd %ymm8, -160(%rax) vpermpd $68, %ymm6, %ymm9 vpermpd $68, %ymm3, %ymm8 vpermpd $238, %ymm6, %ymm6 vpermpd $238, %ymm3, %ymm3 vshufpd $12, %ymm6, %ymm3, %ymm3 vpermpd $68, %ymm7, %ymm6 vmovupd %ymm3, -96(%rax) vpermpd $238, %ymm7, %ymm7 vpermpd $68, %ymm0, %ymm3 vpermpd $238, %ymm0, %ymm0 vshufpd $12, %ymm7, %ymm0, %ymm0 vmovupd 960(%rsp), %ymm7 vshufpd $12, %ymm6, %ymm3, %ymm3 vshufpd $12, %ymm14, %ymm12, %ymm12 vmovupd %ymm3, -64(%rax) vpermpd $238, %ymm7, %ymm14 vpermpd $68, %ymm7, %ymm3 vmovupd 512(%rsp), %ymm7 vmovupd %ymm0, -32(%rax) vaddpd 896(%rsp), %ymm1, %ymm1 vpermpd $68, %ymm13, %ymm0 vshufpd $12, %ymm0, %ymm3, %ymm3 vpermpd $68, %ymm7, %ymm6 vpermpd $68, %ymm11, %ymm0 vshufpd $12, %ymm9, %ymm8, %ymm8 vshufpd $12, %ymm0, %ymm6, %ymm6 vmovupd %ymm8, -128(%rax) vpermpd $238, %ymm7, %ymm9 vpermpd $68, %ymm4, %ymm0 vpermpd $68, %ymm5, %ymm8 vpermpd $238, %ymm11, %ymm11 vshufpd $12, %ymm11, %ymm9, %ymm11 vshufpd $12, %ymm0, %ymm8, %ymm8 vpermpd $68, %ymm2, %ymm9 vpermpd $68, %ymm1, %ymm0 vshufpd $12, %ymm0, %ymm9, %ymm9 vpermpd $238, %ymm5, %ymm5 vpermpd $68, %ymm8, %ymm0 vpermpd $68, %ymm3, %ymm7 vpermpd $238, %ymm13, %ymm13 vpermpd $238, %ymm4, %ymm4 vshufpd $12, %ymm4, %ymm5, %ymm4 vshufpd $12, %ymm0, %ymm7, %ymm7 vpermpd $238, %ymm2, %ymm2 vpermpd $68, %ymm4, %ymm0 vshufpd $12, %ymm13, %ymm14, %ymm13 vpermpd $238, %ymm4, %ymm4 vpermpd $68, %ymm13, %ymm10 vpermpd $238, %ymm1, %ymm1 vpermpd $238, %ymm13, %ymm13 vshufpd $12, %ymm1, %ymm2, %ymm1 vshufpd $12, %ymm0, %ymm10, %ymm10 vpermpd $68, %ymm9, %ymm2 vshufpd $12, %ymm4, %ymm13, %ymm0 vpermpd $68, %ymm6, %ymm4 vshufpd $12, %ymm2, %ymm4, %ymm4 vpermpd $238, %ymm8, %ymm8 vpermpd $68, %ymm1, %ymm2 vpermpd $68, %ymm11, %ymm5 vpermpd $238, %ymm3, %ymm3 vshufpd $12, %ymm8, %ymm3, %ymm3 vshufpd $12, %ymm2, %ymm5, %ymm5 vpermpd $68, %ymm4, %ymm8 vpermpd $68, %ymm7, %ymm2 vpermpd $238, %ymm4, %ymm4 
vpermpd $238, %ymm6, %ymm6 vpermpd $238, %ymm9, %ymm9 vpermpd $238, %ymm7, %ymm7 vmovupd %ymm12, -256(%rax) vshufpd $12, %ymm9, %ymm6, %ymm6 vshufpd $12, %ymm8, %ymm2, %ymm2 vshufpd $12, %ymm4, %ymm7, %ymm7 vmovupd %ymm2, -256(%r11) vpermpd $68, %ymm6, %ymm4 vpermpd $68, %ymm3, %ymm2 vpermpd $238, %ymm6, %ymm6 vpermpd $238, %ymm3, %ymm3 vshufpd $12, %ymm4, %ymm2, %ymm2 vshufpd $12, %ymm6, %ymm3, %ymm3 vmovupd %ymm2, -192(%r11) vmovupd %ymm3, -160(%r11) vpermpd $68, %ymm10, %ymm2 vpermpd $68, %ymm5, %ymm3 vpermpd $238, %ymm11, %ymm11 vpermpd $238, %ymm1, %ymm1 vshufpd $12, %ymm3, %ymm2, %ymm2 vshufpd $12, %ymm1, %ymm11, %ymm1 vmovupd %ymm2, -128(%r11) vpermpd $68, %ymm1, %ymm3 vpermpd $68, %ymm0, %ymm2 vpermpd $238, %ymm10, %ymm10 vpermpd $238, %ymm5, %ymm5 vpermpd $238, %ymm0, %ymm0 vpermpd $238, %ymm1, %ymm1 vshufpd $12, %ymm5, %ymm10, %ymm5 vshufpd $12, %ymm3, %ymm2, %ymm2 vshufpd $12, %ymm1, %ymm0, %ymm1 vmovupd %ymm7, -224(%r11) vmovupd %ymm5, -96(%r11) vmovupd %ymm2, -64(%r11) vmovupd %ymm1, -32(%r11) cmpq %r9, %rdi jne .L5 That's almost 7 times slower than -O3, 2.4 times slower than scalar code, generated by -O2 and twice slower than clang -Ofast. Being twice slower than clang is not a small fit. I knew about this bug several weeks ago, but somehow didn't realize that 11.0 is so near, so was lazy to report at time. Now I am sorry. Sources and compilation scripts for bigger, more real-world testbench here: https://github.com/already5chosen/others/tree/master/cholesky_solver/gcc-badopt-aosoa-caxpy2x2
Let me do some initial analysis.
It's again reassociation making a mess out of the natural SLP opportunity (and thus SLP discovery fails miserably). One idea worth playing with would be to change reassociation to rank references from the same load group (as later vectorization would discover) the same. That said, further analysis and maybe a smaller testcase to look at is useful here. There is, after all, the opportunity to turn "bad" association at the source level to good for vectorization when -ffast-math is enabled as well.
(In reply to Richard Biener from comment #2)
> It's again reassociation making a mess out of the natural SLP opportunity
> (and thus SLP discovery fails miserably).
>
> One idea worth playing with would be to change reassociation to rank
> references from the same load group (as later vectorization would discover)
> the same.
>
> That said, further analysis and maybe a smaller testcase to look at is useful
> here. There is, after all, the opportunity to turn "bad" association at the
> source level to good for vectorization when -ffast-math is enabled as well.

It turns out a much simpler kernel suffers from the same problem.

void foo1x1(double* restrict y, const double* restrict x, int clen)
{
  int xi = clen & 2;
  double f_re = x[0+xi+0];
  double f_im = x[4+xi+0];
  int clen2 = (clen+xi) * 2;
  #pragma GCC unroll 0
  for (int c = 0; c < clen2; c += 8) {
    // y[c] = y[c] - x[c]*conj(f);
    #pragma GCC unroll 4
    for (int k = 0; k < 4; ++k) {
      double x_re = x[c+0+k];
      double x_im = x[c+4+k];
      double y_re = y[c+0+k];
      double y_im = y[c+4+k];
      y_re = y_re - x_re * f_re - x_im * f_im;
      y_im = y_im + x_re * f_im - x_im * f_re;
      y[c+0+k] = y_re;
      y[c+4+k] = y_im;
    }
  }
}

Maybe it's possible to simplify further, but probably not by much.
Ah, thanks - that helps. So we're re-associating from

  *_89 = (((*_89) - (f_re_34 * x_re_82)) - (f_im_35 * x_im_88));
  *_91 = (((*_91) + (f_im_35 * x_re_82)) - (f_re_34 * x_im_88));

to

  *_89 = ((*_89) - ((f_re_34 * x_re_82) + (f_im_35 * x_im_88)));
  *_91 = (((*_91) + (f_im_35 * x_re_82)) - (f_re_34 * x_im_88));

and that makes the operations unbalanced. This is (a - b) - c -> a - (b + c), as we're optimizing this as a + -b + -c.

Even smaller testcase:

double a[1024], b[1024], c[1024];
void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = a[2*i] + b[2*i] - c[2*i];
      a[2*i+1] = a[2*i+1] - b[2*i+1] - c[2*i+1];
    }
}

Here the ranks end up associating the expr as (-b + -c) + a and negate re-propagation goes (-b - c) + a -> -(b + c) + a -> a - (b + c), which is all sensible in isolation. You could say that associating as (-b + -c) + a is worse than (a + -b) + -c in this respect. Ranks are

  Rank for _8 is 327683 (a)
  Rank for _13 is 327684 (-b)
  Rank for _21 is 327684 (-c)

where the rank is one more for the negated values because of the negate operation. While heuristically ignoring negates for rank propagation to make all ranks equal helps this new testcase, it doesn't help for the larger two. It might still be a generally sound heuristic improvement though.

For the effects on vectorization I think we need to do sth in the vectorizer itself, for example linearizing expressions. The first reassoc pass is supposed to do this but then negate re-propagation undoes it in this case - which maybe points to that being what needs fixing, somehow associating a non-negated operand first.
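To see the lane mismatch at the source level, here is a scalar paraphrase of the two re-associated statements above (an illustration only; the function and its name are mine, not part of any testcase in this PR): after folding, lane 0 subtracts a single combined sum while lane 1 is still a linear add/sub chain, so the two lanes of one complex element no longer have matching expression shapes for SLP to pair.

static void lanes_after_reassoc (double *y_re, double *y_im,
                                 double x_re, double x_im,
                                 double f_re, double f_im)
{
  /* lane 0: (a - b) - c was folded to a - (b + c)  */
  *y_re = *y_re - (f_re * x_re + f_im * x_im);
  /* lane 1: still the linear chain (a + b) - c     */
  *y_im = (*y_im + f_im * x_re) - f_re * x_im;
}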
OK, so I have a patch to keep the association linear which IMHO is good. It fixes the smaller and my testcase but not the original one, which now is linear but still not homogeneous. The store groups are as follows:

  *_115 = (((((*_115) - (f00_re_68 * x0_re_108)) - (f10_re_70 * x1_re_140)) - (f00_im_73 * x0_im_114)) - (f10_im_74 * x1_im_142));
  *_117 = (((((*_117) + (f00_im_73 * x0_re_108)) + (f10_im_74 * x1_re_140)) - (f00_re_68 * x0_im_114)) - (f10_re_70 * x1_im_142));
  *_119 = (((((*_119) - (f01_re_71 * x0_re_108)) - (f11_re_72 * x1_re_140)) - (f01_im_75 * x0_im_114)) - (f11_im_76 * x1_im_142));
  *_121 = (((((*_121) + (f01_im_75 * x0_re_108)) + (f11_im_76 * x1_re_140)) - (f01_re_71 * x0_im_114)) - (f11_re_72 * x1_im_142));

(good)

  *_177 = (((((*_177) - (f00_re_68 * x0_re_170)) - (f00_im_73 * x0_im_176)) - (f10_re_70 * x1_re_202)) - (f10_im_74 * x1_im_204));
  *_179 = (((((f00_im_73 * x0_re_170) + (f10_im_74 * x1_re_202)) + (*_179)) - (f00_re_68 * x0_im_176)) - (f10_re_70 * x1_im_204));
  *_181 = (((((*_181) - (f01_re_71 * x0_re_170)) - (f01_im_75 * x0_im_176)) - (f11_re_72 * x1_re_202)) - (f11_im_76 * x1_im_204));
  *_183 = (((((f01_im_75 * x0_re_170) + (f11_im_76 * x1_re_202)) + (*_183)) - (f01_re_71 * x0_im_176)) - (f11_re_72 * x1_im_204));

already bad. Now, this is something to tackle in the vectorizer, which ideally should not try to match up individual adds during SLP discovery but instead (if association is allowed) the whole addition chain, commutating within the whole chain rather than just swapping individual add operands.

I still think the reassoc change I came up with is good since it avoids the need to linearize in the vectorizer. So testing that now.
So for example we'd like to vectorize with SLP when reassociation is permitted (thus with -Ofast for example):

double a[1024], b[1024], c[1024];
void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = 1. - a[2*i] + b[2*i];
      a[2*i+1] = a[2*i+1] + b[2*i+1] + 1.;
    }
}

It again works when written as follows and with -fno-tree-reassoc:

double a[1024], b[1024], c[1024];
void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = 1. - a[2*i] + b[2*i];
      a[2*i+1] = 1 + a[2*i+1] + b[2*i+1];
    }
}
Or

double a[1024], b[1024], c[1024];
void foo()
{
  for (int i = 0; i < 256; ++i)
    {
      a[2*i] = 1. - a[2*i] + b[2*i];
      a[2*i+1] = 1 + a[2*i+1] - b[2*i+1];
    }
}

which early folding breaks unless we add -fno-associative-math. We then end up with

  a[_1] = (((b[_1]) - (a[_1])) + 1.0e+0);
  a[_6] = (((a[_6]) - (b[_6])) + 1.0e+0);

where SLP operator swapping cannot bring the grouped loads into the same lanes.

So the idea is to look at single-use chains of plus/minus operations and handle those as wide associated SLP nodes with flags denoting which lanes need negation. We'd have three children and each child has a per-lane spec whether to add or subtract. A sketch of that representation follows.
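A minimal sketch of the per-lane add/subtract idea (the struct and field names below are made up for illustration, not the actual GCC data structures), using the two-lane example above after early folding:

#include <stdbool.h>

#define N_CHILDREN 3   /* the constant 1.0, the a[] loads, the b[] loads */
#define N_LANES    2   /* a[2*i] and a[2*i+1]                            */

struct chain_child {
  const char *op[N_LANES];  /* scalar operand feeding each lane          */
  bool subtract[N_LANES];   /* true: this lane subtracts the operand     */
};

/* a[_1] = (b[_1] - a[_1]) + 1.0;   <- lane 0
   a[_6] = (a[_6] - b[_6]) + 1.0;   <- lane 1
   One wide chain node, three children, per-lane add/subtract flags.     */
static const struct chain_child chain[N_CHILDREN] = {
  { { "1.0",   "1.0"   }, { false, false } },  /* invariant constant     */
  { { "a[_1]", "a[_6]" }, { true,  false } },  /* load group from a[]    */
  { { "b[_1]", "b[_6]" }, { false, true  } },  /* load group from b[]    */
};

With the signs carried per lane, both load-group children keep their natural lane order, and discovery no longer has to swap individual add operands to make the lanes line up.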
Created attachment 49586 [details]
prototype

This is a prototype patch which can serve as proof-of-concept. It needs cleanup plus better handling of hybrid SLP discovery. It depends on https://gcc.gnu.org/pipermail/gcc-patches/2020-November/559347.html to fix the testcase in this PR (which is included in the patch).
There's then also a permute optimization left on the plate:

t.c:16:3: note:   node 0x3a19590 (max_nunits=4, refcnt=2)
t.c:16:3: note:   stmt 0 _153 = f11_im_76 * x1_im_142;
t.c:16:3: note:   stmt 1 _213 = f11_re_72 * x1_re_202;
t.c:16:3: note:   stmt 2 _275 = f11_re_72 * x1_re_264;
t.c:16:3: note:   stmt 3 _337 = f11_re_72 * x1_re_326;
t.c:16:3: note:   stmt 4 _155 = f11_im_76 * x1_re_140;
t.c:16:3: note:   stmt 5 _217 = f11_im_76 * x1_re_202;
t.c:16:3: note:   stmt 6 _279 = f11_im_76 * x1_re_264;
t.c:16:3: note:   stmt 7 _341 = f11_im_76 * x1_re_326;
t.c:16:3: note:   children 0x3a19600 0x3a19670
t.c:16:3: note:   node (external) 0x3a19600 (max_nunits=1, refcnt=1)
t.c:16:3: note:   { f11_im_76, f11_re_72, f11_re_72, f11_re_72, f11_im_76, f11_im_76, f11_im_76, f11_im_76 }
t.c:16:3: note:   node 0x3a19670 (max_nunits=4, refcnt=1)
t.c:16:3: note:   stmt 0 x1_im_142 = *_141;
t.c:16:3: note:   stmt 1 x1_re_202 = *_201;
t.c:16:3: note:   stmt 2 x1_re_264 = *_263;
t.c:16:3: note:   stmt 3 x1_re_326 = *_325;
t.c:16:3: note:   stmt 4 x1_re_140 = *_139;
t.c:16:3: note:   stmt 5 x1_re_202 = *_201;
t.c:16:3: note:   stmt 6 x1_re_264 = *_263;
t.c:16:3: note:   stmt 7 x1_re_326 = *_325;
t.c:16:3: note:   load permutation { 4 1 2 3 0 1 2 3 }

which we currently do not handle (there's a FIXME as to permute externals, currently we only handle splats as transparent for permutes).
I lost track of what you're talking about a long time ago. But that's o.k.
(In reply to Michael_S from comment #10)
> I lost track of what you're talking about a long time ago.
> But that's o.k.

No problem - difficult PRs tend to be used as media to brain-dump and record work progress.
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:ce670e4faafb296d1f1a7828d20f8c8ba4686797

commit r12-1329-gce670e4faafb296d1f1a7828d20f8c8ba4686797
Author: Richard Biener <rguenther@suse.de>
Date:   Wed Nov 18 14:17:34 2020 +0100

    tree-optimization/97832 - handle associatable chains in SLP discovery

    This makes SLP discovery handle associatable (including mixed plus/minus)
    chains better by swapping operands across the whole chain.  To work this
    adds caching of the 'matches' lanes for failed SLP discovery attempts,
    thereby fixing a failed SLP discovery for the slp-pr98855.cc testcase
    which results in building an operand from scalars as expected.
    Unfortunately this makes us trip over the cost threshold so I'm XFAILing
    the testcase for now.  For BB vectorization all this doesn't work because
    we have no way to distinguish good from bad associations as we
    eventually build operands from scalars and thus not fail in the
    classical sense.

    2021-05-31  Richard Biener  <rguenther@suse.de>

            PR tree-optimization/97832
            * tree-vectorizer.h (_slp_tree::failed): New.
            * tree-vect-slp.c (_slp_tree::_slp_tree): Initialize failed
            member.
            (_slp_tree::~_slp_tree): Free failed.
            (vect_build_slp_tree): Retain failed nodes and record matches
            in them, copying that back out when running into a cached fail.
            Dump start and end of discovery.
            (dt_sort_cmp): New.
            (vect_build_slp_tree_2): Handle associatable chains together
            doing more aggressive operand swapping.
            * gcc.dg/vect/pr97832-1.c: New testcase.
            * gcc.dg/vect/pr97832-2.c: Likewise.
            * gcc.dg/vect/pr97832-3.c: Likewise.
            * g++.dg/vect/slp-pr98855.cc: XFAIL.
Fixed (hopefully), for GCC 12.
I tested the smaller test bench from comment 3 with gcc trunk on godbolt. The issue appears to be only partially fixed. The -Ofast result is no longer the horror it was before, but it is still not as good as -O3 or -O2. The -Ofast code generation is still strange and there are a few vblendpd instructions that serve no useful purpose. And -O2/-O3 is still not as good as it should be, or as good as icc. But, as mentioned in my original post, over-aggressive load+op combining is a separate problem.
I can confirm we get

.L3:
        vmovupd (%rsi), %ymm1
        vmovupd 32(%rsi), %ymm0
        addl    $1, %eax
        addq    $64, %rdi
        addq    $64, %rsi
        vblendpd        $14, %ymm1, %ymm0, %ymm3
        vblendpd        $14, %ymm0, %ymm1, %ymm2
        vfnmadd213pd    -64(%rdi), %ymm5, %ymm3
        vfmadd213pd     -32(%rdi), %ymm7, %ymm1
        vfnmadd132pd    %ymm4, %ymm3, %ymm2
        vfnmadd132pd    %ymm6, %ymm1, %ymm0
        vmovupd %ymm2, -64(%rdi)
        vmovupd %ymm0, -32(%rdi)
        cmpl    %edx, %eax
        jb      .L3

instead of

.L3:
        vmovupd (%rdx), %ymm1
        vmovupd (%rdx), %ymm0
        addl    $1, %ecx
        addq    $64, %rax
        vfmadd213pd     -32(%rax), %ymm3, %ymm1
        vfnmadd213pd    -64(%rax), %ymm2, %ymm0
        addq    $64, %rdx
        vfnmadd231pd    -32(%rdx), %ymm3, %ymm0
        vfnmadd231pd    -32(%rdx), %ymm2, %ymm1
        vmovupd %ymm0, -64(%rax)
        vmovupd %ymm1, -32(%rax)
        cmpl    %esi, %ecx
        jb      .L3

The good case sees

  <bb 4> [local count: 214748368]:
  # ivtmp.27_211 = PHI <ivtmp.27_210(4), 0(3)>
  # ivtmp.32_209 = PHI <ivtmp.32_208(4), ivtmp.32_212(3)>
  # ivtmp.34_28 = PHI <ivtmp.34_51(4), ivtmp.34_52(3)>
  _53 = (void *) ivtmp.34_28;
  vect_x_re_54.13_193 = MEM <const vector(4) double> [(const double *)_53];
  vect_x_im_60.21_176 = MEM <const vector(4) double> [(const double *)_53 + 32B];
  _54 = (void *) ivtmp.32_209;
  vect_y_re_62.9_200 = MEM <vector(4) double> [(double *)_54];
  vect_y_re_62.10_198 = MEM <vector(4) double> [(double *)_54 + 32B];
  vect__154.17_185 = .FMA (vect_x_re_54.13_193, _197, vect_y_re_62.10_198);
  vect__66.16_188 = .FNMA (vect_x_re_54.13_193, _196, vect_y_re_62.9_200);
  vect_y_re_68.23_173 = .FNMA (vect_x_im_60.21_176, _197, vect__66.16_188);
  vect_y_re_68.23_172 = .FNMA (vect_x_im_60.21_176, _196, vect__154.17_185);
  MEM <vector(4) double> [(double *)_54] = vect_y_re_68.23_173;
  MEM <vector(4) double> [(double *)_54 + 32B] = vect_y_re_68.23_172;
  ivtmp.27_210 = ivtmp.27_211 + 1;
  ivtmp.32_208 = ivtmp.32_209 + 64;
  ivtmp.34_51 = ivtmp.34_28 + 64;
  if (bnd.6_207 > ivtmp.27_210)
    goto <bb 4>; [90.00%]

while the bad has

  <bb 4> [local count: 214748368]:
  # ivtmp.31_65 = PHI <ivtmp.31_64(4), 0(3)>
  # ivtmp.36_63 = PHI <ivtmp.36_62(4), ivtmp.36_204(3)>
  # ivtmp.38_203 = PHI <ivtmp.38_59(4), ivtmp.38_60(3)>
  _61 = (void *) ivtmp.38_203;
  vect_x_im_60.13_211 = MEM <const vector(4) double> [(const double *)_61];
  vect_x_im_60.14_209 = MEM <const vector(4) double> [(const double *)_61 + 32B];
  vect_x_re_54.15_208 = VEC_PERM_EXPR <vect_x_im_60.14_209, vect_x_im_60.13_211, { 0, 5, 6, 7 }>;
  vect_x_re_54.23_192 = VEC_PERM_EXPR <vect_x_im_60.13_211, vect_x_im_60.14_209, { 0, 5, 6, 7 }>;
  _58 = (void *) ivtmp.36_63;
  vect_y_re_62.9_218 = MEM <vector(4) double> [(double *)_58];
  vect_y_re_62.10_216 = MEM <vector(4) double> [(double *)_58 + 32B];
  vect__41.18_202 = .FMA (vect_x_im_60.13_211, _215, vect_y_re_62.10_216);
  vect_y_re_68.17_205 = .FNMA (vect_x_re_54.15_208, _214, vect_y_re_62.9_218);
  vect_y_re_68.25_189 = .FNMA (vect_x_re_54.23_192, _198, vect_y_re_68.17_205);
  vect_y_re_68.25_188 = .FNMA (_199, vect_x_im_60.14_209, vect__41.18_202);
  MEM <vector(4) double> [(double *)_58] = vect_y_re_68.25_189;
  MEM <vector(4) double> [(double *)_58 + 32B] = vect_y_re_68.25_188;
  ivtmp.31_64 = ivtmp.31_65 + 1;
  ivtmp.36_62 = ivtmp.36_63 + 64;
  ivtmp.38_59 = ivtmp.38_203 + 64;
  if (ivtmp.31_64 < bnd.6_225)
    goto <bb 4>; [90.00%]

The blends do not look like no-ops so I wonder if this is really computing the same thing ... (it swaps lane 0 from the two loads from x but not the stores)
On an unrelated note, why does the loop overhead use so many instructions? Assuming that I am as misguided as gcc about load-op combining, I would write it as:

        sub     %rax, %rdx
.L3:
        vmovupd (%rdx,%rax), %ymm1
        vmovupd 32(%rdx,%rax), %ymm0
        vfmadd213pd     32(%rax), %ymm3, %ymm1
        vfnmadd213pd    (%rax), %ymm2, %ymm0
        vfnmadd231pd    32(%rdx,%rax), %ymm3, %ymm0
        vfnmadd231pd    (%rdx,%rax), %ymm2, %ymm1
        vmovupd %ymm0, (%rax)
        vmovupd %ymm1, 32(%rax)
        addq    $64, %rax
        decl    %esi
        jb      .L3

The loop overhead in my variant is 3 x86 instructions == 2 macro-ops, vs 5 x86 instructions == 4 macro-ops in the gcc variant. Also, in the gcc variant all memory accesses have a displacement, which makes them 1 byte longer. In my variant only half of the accesses have a displacement.

I think in the past I have seen cases where gcc generates optimal or near-optimal code sequences for loop overhead. I wonder why it cannot do it here.
(In reply to Michael_S from comment #16)
> On an unrelated note, why does the loop overhead use so many instructions?
> Assuming that I am as misguided as gcc about load-op combining, I would
> write it as:
>
>         sub     %rax, %rdx
> .L3:
>         vmovupd (%rdx,%rax), %ymm1
>         vmovupd 32(%rdx,%rax), %ymm0
>         vfmadd213pd     32(%rax), %ymm3, %ymm1
>         vfnmadd213pd    (%rax), %ymm2, %ymm0
>         vfnmadd231pd    32(%rdx,%rax), %ymm3, %ymm0
>         vfnmadd231pd    (%rdx,%rax), %ymm2, %ymm1
>         vmovupd %ymm0, (%rax)
>         vmovupd %ymm1, 32(%rax)
>         addq    $64, %rax
>         decl    %esi
>         jb      .L3
>
> The loop overhead in my variant is 3 x86 instructions == 2 macro-ops,
> vs 5 x86 instructions == 4 macro-ops in the gcc variant.
> Also, in the gcc variant all memory accesses have a displacement, which
> makes them 1 byte longer. In my variant only half of the accesses have a
> displacement.
>
> I think in the past I have seen cases where gcc generates optimal or
> near-optimal code sequences for loop overhead. I wonder why it cannot do
> it here.

I don't think we currently consider IVs based on the difference of two addresses. The cost benefit of no displacement is only size, otherwise I have no idea why we have biased the %rax accesses by -32. Why we fail to consider decrement-to-zero for the counter IV is probably because IVCANON would add such an IV but the vectorizer replaces that and IVOPTs doesn't consider re-adding it.
The apparent 'bias' is introduced by instruction scheduling: haifa-sched lifts a +64 increment over memory accesses, transforming +0 and +32 displacements to -64 and -32. Sometimes this helps a little bit even on modern x86 CPUs. Also note that 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' would be 'unlaminated' (turned to 2 uops before renaming), so selecting independent IVs for the two arrays actually helps on this testcase.
(In reply to Alexander Monakov from comment #18)
> The apparent 'bias' is introduced by instruction scheduling: haifa-sched
> lifts a +64 increment over memory accesses, transforming +0 and +32
> displacements to -64 and -32. Sometimes this helps a little bit even on
> modern x86 CPUs.

I don't think it ever helps on Intel Sandy Bridge or later, or on AMD Zen1 or later.

> Also note that 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' would be
> 'unlaminated' (turned to 2 uops before renaming), so selecting independent
> IVs for the two arrays actually helps on this testcase.

Both 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' and 'vfnmadd231pd 32(%rdx), %ymm3, %ymm0' would be turned into 2 uops.

Misuse of load+op is a far bigger problem in this particular test case than the sub-optimal loop overhead. Assuming execution on Intel Skylake, it turns a loop that can potentially run at 3 clocks per iteration into a loop of 4+ clocks per iteration. But I consider it a separate issue. I reported a similar issue in PR 97127, but here it is more serious. It looks to me that the issue is not soluble within the existing gcc optimization framework. The only chance is if you accept my old and simple advice - within inner loops, pretend that AVX is RISC, i.e. generate code as if the load-op forms of AVX instructions didn't exist.
(In reply to Richard Biener from comment #17) > (In reply to Michael_S from comment #16) > > On unrelated note, why loop overhead uses so many instructions? > > Assuming that I am as misguided as gcc about load-op combining, I would > > write it as: > > sub %rax, %rdx > > .L3: > > vmovupd (%rdx,%rax), %ymm1 > > vmovupd 32(%rdx,%rax), %ymm0 > > vfmadd213pd 32(%rax), %ymm3, %ymm1 > > vfnmadd213pd (%rax), %ymm2, %ymm0 > > vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0 > > vfnmadd231pd (%rdx,%rax), %ymm2, %ymm1 > > vmovupd %ymm0, (%rax) > > vmovupd %ymm1, 32(%rax) > > addq $64, %rax > > decl %esi > > jb .L3 > > > > The loop overhead in my variant is 3 x86 instructions==2 macro-ops, > > vs 5 x86 instructions==4 macro-ops in gcc variant. > > Also, in gcc variant all memory accesses have displacement that makes them > > 1 byte longer. In my variant only half of accesses have displacement. > > > > I think, in the past I had seen cases where gcc generates optimal or > > near-optimal > > code sequences for loop overhead. I wonder why it can not do it here. > > I don't think we currently consider IVs based on the difference of two > addresses. It seems to me that I had seen you doing it. But, may be, I confuse gcc with clang. > The cost benefit of no displacement is only size, Size is pretty important in high-IPC SIMD loops. Esp. on Intel and when # of iterations is small, because Intel has 16-byte fetch out of L1I cache. SIMD instructions tend to be long and not many instructions fit within 16 bytes even when memory accesses have no offsets. Offset adds impact to the injury. > otherwise > I have no idea why we have biased the %rax accesses by -32. Why we > fail to consider decrement-to-zero for the counter IV is probably because > IVCANON would add such IV but the vectorizer replaces that and IVOPTs > doesn't consider re-adding that. Sorry, I have no idea about the meaning of IVCANON.
(In reply to Michael_S from comment #19)
> > Also note that 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' would be
> > 'unlaminated' (turned to 2 uops before renaming), so selecting independent
> > IVs for the two arrays actually helps on this testcase.
>
> Both 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' and 'vfnmadd231pd 32(%rdx),
> %ymm3, %ymm0' would be turned into 2 uops.

The difference is at which point in the pipeline. The latter goes through renaming as one fused uop.

> Misuse of load+op is a far bigger problem in this particular test case than
> the sub-optimal loop overhead. Assuming execution on Intel Skylake, it turns
> a loop that can potentially run at 3 clocks per iteration into a loop of 4+
> clocks per iteration.

Sorry, which assembler output does this refer to?

> But I consider it a separate issue. I reported a similar issue in PR 97127,
> but here it is more serious. It looks to me that the issue is not soluble
> within the existing gcc optimization framework. The only chance is if you
> accept my old and simple advice - within inner loops, pretend that AVX is
> RISC, i.e. generate code as if the load-op forms of AVX instructions didn't
> exist.

In bug 97127 the best explanation we have so far is that we don't optimally handle the case where the non-memory inputs of an fma are reused, so we can't combine a load with an fma without causing an extra register copy (PR 97127 comment 16 demonstrates what I mean). I cannot imagine such trouble arising with more common commutative operations like mul/add, especially with the non-destructive VEX encoding. If you hit such examples, I would suggest reporting them as well, because their root cause might be different.

In general load-op combining should be very helpful on x86, because it reduces the number of uops flowing through the renaming stage, which is one of the narrowest points in the pipeline.
(In reply to Alexander Monakov from comment #21) > (In reply to Michael_S from comment #19) > > > Also note that 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' would be > > > 'unlaminated' (turned to 2 uops before renaming), so selecting independent > > > IVs for the two arrays actually helps on this testcase. > > > > Both 'vfnmadd231pd 32(%rdx,%rax), %ymm3, %ymm0' and 'vfnmadd231pd 32(%rdx), > > %ymm3, %ymm0' would be turned into 2 uops. > > The difference is at which point in the pipeline. The latter goes through > renaming as one fused uop. > Intel never documents such fine details in their Optimization Reference manuals. But I believe you. > > Misuse of load+op is far bigger problem in this particular test case than > > sub-optimal loop overhead. Assuming execution on Intel Skylake, it turns > > loop that can potentially run at 3 clocks per iteration into loop of 4+ > > clocks per iteration. > > Sorry, which assembler output this refers to? > gcc12 -O3 -mavx2 -mfma gcc12 -O3 -march=skylake does not suffer from this problem. I still think that RISC-style icc code will be a little faster on Skylake, but here we are arguing about 1/4th of the cycle per iteration rather than a full cycle. https://godbolt.org/z/nfa7c9se3 > > But I consider it a separate issue. I reported similar issue in 97127, but > > here it is more serious. It looks to me that the issue is not soluble within > > existing gcc optimization framework. The only chance is if you accept my old > > and simple advice - within inner loops pretend that AVX is RISC, i.e. > > generate code as if load-op form of AVX instructions weren't existing. > > In bug 97127 the best explanation we have so far is we don't optimally > handle the case where non-memory inputs of an fma are reused, so we can't > combine a load with an fma without causing an extra register copy (PR 97127 > comment 16 demonstrates what I mean). I cannot imagine such trouble arising > with more common commutative operations like mul/add, especially with > non-destructive VEX encoding. If you hit such examples, I would suggest to > report them also, because their root cause might be different. > > In general load-op combining should be very helpful on x86, because it > reduces the number of uops flowing through the renaming stage, which is one > of the narrowest points in the pipeline. If compilers were perfect, AVX load-op combining would be somewhat helpful. I have my doubts about very helpful. But compilers are not perfect. For none-AVX case, where every op is destructive and repeated loads are on average cheaper than on AVX, combined load-ops is far more profitable.
> the blends do not look like no-ops so I wonder if this is really computing > the same thing ... (it swaps lane 0 from the two loads from x but not the > stores) They're computing the same thing since we also do the same "permutation" for the invariants: f_re and f_imm, can we eliminate that in the vectorizer? _232 = {f_im_36, f_im_36, f_im_36, f_im_36}; _231 = {f_im_36, f_re_35, f_re_35, f_re_35}; ------- here _216 = {f_re_35, f_re_35, f_re_35, f_re_35}; _215 = {f_re_35, f_im_36, f_im_36, f_im_36}; ------ and here. ivtmp.36_221 = (unsigned long) y_41(D); ivtmp.38_61 = (unsigned long) x_33(D); <bb 4> [local count: 214748368]: # ivtmp.32_66 = PHI <ivtmp.32_65(4), 0(3)> # ivtmp.36_64 = PHI <ivtmp.36_63(4), ivtmp.36_221(3)> # ivtmp.38_220 = PHI <ivtmp.38_60(4), ivtmp.38_61(3)> # DEBUG c => NULL # DEBUG k => 0 # DEBUG BEGIN_STMT # DEBUG BEGIN_STMT # DEBUG D#78 => D#79 * 8 # DEBUG D#77 => x_33(D) + D#78 _62 = (void *) ivtmp.38_220; vect_x_im_61.13_228 = MEM <const vector(4) double> [(const double *)_62]; vect_x_im_61.14_226 = MEM <const vector(4) double> [(const double *)_62 + 32B]; vect_x_re_55.15_225 = VEC_PERM_EXPR <vect_x_im_61.14_226, vect_x_im_61.13_228, { 0, 5, 6, 7 }>; vect_x_re_55.23_209 = VEC_PERM_EXPR <vect_x_im_61.13_228, vect_x_im_61.14_226, { 0, 5, 6, 7 }>; # DEBUG D#76 => *D#77 # DEBUG x_re => D#76 # DEBUG BEGIN_STMT # DEBUG D#74 => (long unsigned int) D#75 # DEBUG D#73 => D#74 * 8 # DEBUG D#72 => x_33(D) + D#73 # DEBUG D#71 => *D#72 # DEBUG x_im => D#71 # DEBUG BEGIN_STMT # DEBUG D#70 => y_41(D) + D#78 _59 = (void *) ivtmp.36_64; vect_y_re_63.9_235 = MEM <vector(4) double> [(double *)_59]; vect_y_re_63.10_233 = MEM <vector(4) double> [(double *)_59 + 32B]; vect__42.18_219 = .FMA (vect_x_im_61.13_228, _232, vect_y_re_63.10_233); vect_y_re_69.17_222 = .FNMA (vect_x_re_55.15_225, _231, vect_y_re_63.9_235); vect_y_re_69.25_206 = .FNMA (vect_x_re_55.23_209, _215, vect_y_re_69.17_222); vect_y_re_69.25_205 = .FNMA (_216, vect_x_im_61.14_226, vect__42.18_219);
  _233 = {f_im_36, f_re_35, f_re_35, f_re_35};
  _217 = {f_re_35, f_im_36, f_im_36, f_im_36};
  ...
  vect_x_re_55.15_227 = VEC_PERM_EXPR <vect_x_im_61.14_228, vect_x_im_61.13_230, { 0, 5, 6, 7 }>;
  vect_x_re_55.23_211 = VEC_PERM_EXPR <vect_x_im_61.13_230, vect_x_im_61.14_228, { 0, 5, 6, 7 }>;
  ...
  vect_y_re_69.17_224 = .FNMA (vect_x_re_55.15_227, _233, vect_y_re_63.9_237);
  vect_y_re_69.25_208 = .FNMA (vect_x_re_55.23_211, _217, vect_y_re_69.17_224);

is equal to

  _233 = {f_im_36, f_im_36, f_im_36, f_im_36};
  _217 = {f_re_35, f_re_35, f_re_35, f_re_35};
  ...
  vect_y_re_69.17_224 = .FNMA (vect_x_im_61.14_228, _233, vect_y_re_63.9_237);
  vect_y_re_69.25_208 = .FNMA (vect_x_im_61.13_230, _217, vect_y_re_69.17_224);

A simplification in match.pd?
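A quick scalar sanity check of this equivalence (a standalone illustration; x13/x14 and the values below are arbitrary stand-ins for the two x vectors): each lane accumulates the same two products, the permuted and unpermuted forms merely subtract them in a different order, which the reassociation -Ofast already permits.

#include <stdio.h>

int main (void)
{
  double x13[4] = { 1.5, 2.5, 3.5, 4.5 };   /* stands in for vect_x_im_61.13 */
  double x14[4] = { 5.5, 6.5, 7.5, 8.5 };   /* stands in for vect_x_im_61.14 */
  double y[4]   = { 9.0, 10.0, 11.0, 12.0 };
  double f_re = 1.25, f_im = -0.5;

  for (int i = 0; i < 4; i++)
    {
      /* permuted form: lane 0 swaps which x vector (and which invariant) goes first */
      double a_x = (i == 0) ? x14[i] : x13[i];
      double a_f = (i == 0) ? f_im   : f_re;
      double b_x = (i == 0) ? x13[i] : x14[i];
      double b_f = (i == 0) ? f_re   : f_im;
      double permuted   = (y[i] - a_x * a_f) - b_x * b_f;

      /* unpermuted form suggested above */
      double unpermuted = (y[i] - x14[i] * f_im) - x13[i] * f_re;

      printf ("lane %d: %g vs %g\n", i, permuted, unpermuted);
    }
  return 0;
}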
On Mon, 28 Nov 2022, crazylht at gmail dot com wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97832
>
> --- Comment #24 from Hongtao.liu <crazylht at gmail dot com> ---
>   _233 = {f_im_36, f_re_35, f_re_35, f_re_35};
>   _217 = {f_re_35, f_im_36, f_im_36, f_im_36};
>   ...
>   vect_x_re_55.15_227 = VEC_PERM_EXPR <vect_x_im_61.14_228, vect_x_im_61.13_230, { 0, 5, 6, 7 }>;
>   vect_x_re_55.23_211 = VEC_PERM_EXPR <vect_x_im_61.13_230, vect_x_im_61.14_228, { 0, 5, 6, 7 }>;
>   ...
>   vect_y_re_69.17_224 = .FNMA (vect_x_re_55.15_227, _233, vect_y_re_63.9_237);
>   vect_y_re_69.25_208 = .FNMA (vect_x_re_55.23_211, _217, vect_y_re_69.17_224);
>
> is equal to
>
>   _233 = {f_im_36, f_im_36, f_im_36, f_im_36};
>   _217 = {f_re_35, f_re_35, f_re_35, f_re_35};
>   ...
>   vect_y_re_69.17_224 = .FNMA (vect_x_im_61.14_228, _233, vect_y_re_63.9_237);
>   vect_y_re_69.25_208 = .FNMA (vect_x_im_61.13_230, _217, vect_y_re_69.17_224);
>
> A simplification in match.pd?

I guess that's possible but the SLP vectorizer has a permute optimization phase (and SLP discovery itself), so it would be nice to see why the former doesn't elide the permutes here.
> I guess that's possible but the SLP vectorizer has a permute optimization
> phase (and SLP discovery itself), so it would be nice to see why the former
> doesn't elide the permutes here.

I've opened PR107891 for it.