[Bug c/79336] New: Poor vectorisation of additive reduction of complex array
drraph at gmail dot com
gcc-bugzilla@gcc.gnu.org
Thu Feb 2 10:44:00 GMT 2017
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79336
Bug ID: 79336
Summary: Poor vectorisation of additive reduction of complex
array
Product: gcc
Version: 7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: drraph at gmail dot com
Target Milestone: ---
Consider this code:
#include <complex.h>
/* Reproducer: returns 1.0 plus the sum of x[0] .. x[31].
 * The loop reads exactly 32 elements, so x must provide at least that many. */
complex float f(complex float x[]) {
/* The accumulator starts at 1.0 (imaginary part 0), not at zero. */
complex float p = 1.0;
/* Fixed trip count of 32; each iteration is one complex addition. */
for (int i = 0; i < 32; i++)
p += x[i];
return p;
}
gcc 7 with -march=core-avx2 -ffast-math gives
# gcc 7 output for f (Intel syntax). rdi is the only register read as an
# input; assuming the SysV AMD64 ABI, it holds the pointer x.
f:
# Prologue: round rsp down to a multiple of 32. r10 keeps the incoming
# rsp+8 so the epilogue can restore rsp; the qword at [r10-8] (the slot rsp
# pointed to on entry) is pushed again above the new frame.
# NOTE(review): the only stack accesses below are the 4-byte/8-byte ones at
# [rbp-24] and [rbp-20]; nothing visible here needs 32-byte stack alignment.
lea r10, [rsp+8]
and rsp, -32
push QWORD PTR [r10-8]
push rbp
mov rbp, rsp
push r10
# Vertical accumulation: the eight 32-byte chunks at [rdi+0] .. [rdi+224]
# (256 bytes = 32 elements of 8 bytes each) are summed lane-wise into ymm0.
# .LC0 is a 32-byte constant whose contents are not shown in this listing;
# presumably it carries the initial value of p (1.0) -- TODO confirm.
vmovups ymm0, YMMWORD PTR [rdi+64]
vmovaps ymm1, YMMWORD PTR .LC0[rip]
vaddps ymm0, ymm0, YMMWORD PTR [rdi+32]
vaddps ymm1, ymm1, YMMWORD PTR [rdi]
vaddps ymm0, ymm0, ymm1
vmovups ymm1, YMMWORD PTR [rdi+128]
vaddps ymm1, ymm1, YMMWORD PTR [rdi+96]
vaddps ymm0, ymm0, ymm1
vmovups ymm1, YMMWORD PTR [rdi+192]
vaddps ymm1, ymm1, YMMWORD PTR [rdi+160]
vaddps ymm0, ymm0, ymm1
vaddps ymm0, ymm0, YMMWORD PTR [rdi+224]
# Horizontal reduction of the 8 float lanes of ymm0, done with scalar adds.
# Assuming the usual interleaved (real, imaginary) layout of complex float,
# even lanes hold real partial sums and odd lanes imaginary partial sums.
# Low 128 bits: bring lane 2, lane 3 and lane 1 down to element 0.
vunpckhps xmm3, xmm0, xmm0
vshufps xmm2, xmm0, xmm0, 255
vshufps xmm1, xmm0, xmm0, 85
# xmm1[0] = lane 3 + lane 1; xmm3[0] = lane 2 + lane 0.
vaddss xmm1, xmm2, xmm1
vaddss xmm3, xmm3, xmm0
# High 128 bits (lanes 4..7): same pattern after extracting them into xmm0.
vextractf128 xmm0, ymm0, 0x1
vunpckhps xmm4, xmm0, xmm0
vshufps xmm2, xmm0, xmm0, 85
# xmm4[0] = lane 6 + lane 4.
vaddss xmm4, xmm4, xmm0
vshufps xmm0, xmm0, xmm0, 255
# xmm0[0] = lane 5 + lane 7.
vaddss xmm0, xmm2, xmm0
# Combine both halves: xmm3[0] = sum of even lanes, xmm1[0] = sum of odd lanes.
vaddss xmm3, xmm3, xmm4
vaddss xmm1, xmm1, xmm0
# The two scalars are stored to adjacent stack slots and reloaded as one
# 8-byte value into xmm0 (the result register). These slots lie below the
# current rsp (rsp = rbp-8 here); presumably this relies on the SysV red zone.
vmovss DWORD PTR [rbp-24], xmm3
vmovss DWORD PTR [rbp-20], xmm1
vzeroupper
vmovq xmm0, QWORD PTR [rbp-24]
# Epilogue: undo the realignment; r10-8 is the value rsp had on entry.
pop r10
pop rbp
lea rsp, [r10-8]
ret
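The loop is vectorised, but the generated code appears to contain a number of unnecessary instructions: the stack is realigned to 32 bytes, the final horizontal reduction is done with scalar shuffles and adds, and the result is stored to the stack and reloaded.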
By contrast, icc using the same options gives:
# icc output for the same function (Intel syntax). The trailing "#N.M"
# annotations are part of the quoted compiler output; they look like source
# line.column positions -- TODO confirm. No stack frame is set up and no
# stack memory is touched.
f:
# Load the 32-byte chunks at offsets 0, 64, 128 and 192 from rdi.
vmovups ymm1, YMMWORD PTR [rdi] #5.10
vmovups ymm2, YMMWORD PTR [64+rdi] #5.10
vmovups ymm5, YMMWORD PTR [128+rdi] #5.10
vmovups ymm6, YMMWORD PTR [192+rdi] #5.10
# 8-byte constant whose contents are not shown in this listing; presumably
# the initial value of p (1.0 + 0.0i) -- TODO confirm.
vmovsd xmm0, QWORD PTR p.152.0.0.1[rip] #3.19
# Pair each loaded chunk with its neighbour (offsets 32, 96, 160, 224), so
# all eight chunks are covered by four independent adds.
vaddps ymm3, ymm1, YMMWORD PTR [32+rdi] #3.19
vaddps ymm4, ymm2, YMMWORD PTR [96+rdi] #3.19
vaddps ymm7, ymm5, YMMWORD PTR [160+rdi] #3.19
vaddps ymm8, ymm6, YMMWORD PTR [224+rdi] #3.19
# Tree reduction of the four partial sums into ymm11.
vaddps ymm9, ymm3, ymm4 #3.19
vaddps ymm10, ymm7, ymm8 #3.19
vaddps ymm11, ymm9, ymm10 #3.19
# Horizontal reduction with packed adds: fold the upper 128 bits onto the
# lower 128 bits, then fold the upper 64 bits onto the lower 64 bits. The
# low two lanes of xmm15 then hold the sums of the even and odd lanes.
vextractf128 xmm12, ymm11, 1 #3.19
vaddps xmm13, xmm11, xmm12 #3.19
vmovhlps xmm14, xmm13, xmm13 #3.19
vaddps xmm15, xmm13, xmm14 #3.19
# Add the constant loaded earlier; the result is left in xmm0.
vaddps xmm0, xmm15, xmm0 #3.19
vzeroupper #6.10
ret
More information about the Gcc-bugs
mailing list