[Bug tree-optimization/54939] Very poor vectorization of loops with complex arithmetic
rguenth at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Thu Jan 3 12:07:00 GMT 2019
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54939
Richard Biener <rguenth at gcc dot gnu.org> changed:
           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |segher at gcc dot gnu.org,
                   |                            |uros at gcc dot gnu.org
--- Comment #8 from Richard Biener <rguenth at gcc dot gnu.org> ---
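(For reference, the loop in question is essentially a complex multiply-accumulate. A minimal C sketch of that kind of loop follows; the function name and signature are illustrative, not the exact PR testcase.)

#include <complex.h>

void
f (double _Complex *restrict b, const double _Complex *restrict a,
   double _Complex x, long n)
{
  /* Complex multiply-accumulate: the real/imaginary shuffles and the
     addsub/fma instructions in the dumps below come from expanding
     x * a[i].  */
  for (long i = 0; i < n; i++)
    b[i] += x * a[i];
}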
On trunk with -Ofast -msse4.2 we get:
.L3:
movupd (%rdx,%rax), %xmm0
movupd (%rdx,%rax), %xmm1
movupd (%rcx,%rax), %xmm4
mulpd %xmm3, %xmm1
palignr $8, %xmm0, %xmm0
mulpd %xmm2, %xmm0
addsubpd %xmm0, %xmm1
addpd %xmm4, %xmm1
movups %xmm1, (%rcx,%rax)
addq $16, %rax
cmpq %rsi, %rax
jne .L3
ICC unrolls the body once more. With -mavx2 we get:
.L4:
vmovupd (%rdx,%rsi), %xmm6
vinsertf128 $0x1, 16(%rdx,%rsi), %ymm6, %ymm1
vmovupd (%rcx,%rsi), %xmm7
vmulpd %ymm5, %ymm1, %ymm2
vpermpd $177, %ymm1, %ymm1
vmulpd %ymm4, %ymm1, %ymm1
vaddsubpd %ymm1, %ymm2, %ymm1
vinsertf128 $0x1, 16(%rcx,%rsi), %ymm7, %ymm2
vaddpd %ymm2, %ymm1, %ymm1
vmovups %xmm1, (%rcx,%rsi)
vextractf128 $0x1, %ymm1, 16(%rcx,%rsi)
addq $32, %rsi
cmpq %rsi, %rdi
jne .L4
If I add -mfma, for example via -march=core-avx2, I again get:
.L4:
vpermpd $177, (%rdx,%rsi), %ymm2
vmovapd %ymm4, %ymm1
vmulpd %ymm5, %ymm2, %ymm2
vfmsub132pd (%rdx,%rsi), %ymm2, %ymm1
vfmadd231pd (%rdx,%rsi), %ymm4, %ymm2
vshufpd $10, %ymm2, %ymm1, %ymm1
vaddpd (%rcx,%rsi), %ymm1, %ymm1
vmovupd %ymm1, (%rcx,%rsi)
addq $32, %rsi
cmpq %rsi, %rdi
jne .L4
showing that we either lack vfmaddsub patterns or the ones present do not
work properly. combine sees:
37: r109:V4DF={r127:V4DF*r105:V4DF+-r108:V4DF}
38: r110:V4DF={r127:V4DF*r105:V4DF+r108:V4DF}
REG_DEAD r127:V4DF
REG_DEAD r108:V4DF
39: r129:V4DF=vec_select(vec_concat(r109:V4DF,r110:V4DF),parallel)
REG_DEAD r110:V4DF
REG_DEAD r109:V4DF
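(Insns 37 and 38 compute an fms and an fma of the same operands, and insn 39 blends even lanes of the first with odd lanes of the second, matching the vshufpd $10 in the asm above, which is exactly what vfmaddsubpd does. A scalar C model of that combined operation, purely for illustration:)

/* Even lanes get a*b - c, odd lanes get a*b + c: vfmaddsubpd semantics.  */
static void
fmaddsub_v4df (double r[4], const double a[4], const double b[4],
               const double c[4])
{
  for (int i = 0; i < 4; i++)
    r[i] = a[i] * b[i] + ((i & 1) ? c[i] : -c[i]);
}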
Unfortunately the fmaddsub patterns all use UNSPECs, with the comment:
;; It would be possible to represent these without the UNSPEC as
;;
;; (vec_merge
;; (fma op1 op2 op3)
;; (fma op1 op2 (neg op3))
;; (merge-const))
;;
;; But this doesn't seem useful in practice.
The AVX512 ones do not seem to suffer from this, but using AVX512 via
-march=knl also only results in:
.L4:
vmovupd (%rdx,%rax), %zmm1
vpermpd $177, %zmm1, %zmm2
vmovapd %zmm1, %zmm0
vmulpd %zmm4, %zmm2, %zmm2
vfmsub132pd %zmm3, %zmm2, %zmm0
vfmadd132pd %zmm3, %zmm2, %zmm1
vshufpd $170, %zmm1, %zmm0, %zmm0
vaddpd (%rcx,%rax), %zmm0, %zmm0
vmovupd %zmm0, (%rcx,%rax)
leaq 64(%rax), %rax
cmpq %rax, %rsi
jne .L4
but maybe KNL doesn't have fmaddsub. Ah, with AVX512 we end up with
vec_merge (like with SSE2), but the 256-bit AVX case above shows
concat/select (avx_shufpd256_1).
Maybe combine itself could try both variants, given the duality
between (vec_merge ...) and (vec_select (vec_concat ...))?
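(A C model of that duality, purely illustrative: an even/odd blend of two four-lane vectors can be written either as a per-lane masked merge or as a select from the concatenation of the two inputs.)

/* Even lanes from x, odd lanes from y, written as a per-lane merge.  */
static void
blend_merge (double r[4], const double x[4], const double y[4])
{
  for (int i = 0; i < 4; i++)
    r[i] = (i & 1) ? y[i] : x[i];
}

/* The same blend written as a select from concat(x, y); the index
   vector {0, 5, 2, 7} plays the role of the vec_select parallel.  */
static void
blend_select_concat (double r[4], const double x[4], const double y[4])
{
  static const int sel[4] = { 0, 5, 2, 7 };
  double xy[8];
  for (int i = 0; i < 4; i++)
    {
      xy[i] = x[i];
      xy[4 + i] = y[i];
    }
  for (int i = 0; i < 4; i++)
    r[i] = xy[sel[i]];
}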