[Bug target/79722] New: Missed opportunity for fused multiply/add with avx2
tkoenig at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Mon Feb 27 11:54:00 GMT 2017
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79722
Bug ID: 79722
Summary: Missed opportunity for fused multiply/add with avx2
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: tkoenig at gcc dot gnu.org
Target Milestone: ---
Created attachment 40835
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=40835&action=edit
Output of gcc -Ofast -mavx2 -S -o bar-gcc.s bar.c
The test case is the same as PR 79709:
/* 32-byte vector of doubles (GCC vector extension). */
typedef double v4do __attribute__((vector_size (32)));
/* 32-byte vector of long ints; four lanes when long is 8 bytes. */
typedef long int v4i __attribute__((vector_size (32)));
/*
 * Assign val to lanes 0..3 of vect.  Wrapped in do { } while (0) so that it
 * behaves as a single statement.  Note that val is expanded four times.
 * The definition is continued with a backslash: the mail archive had wrapped
 * it, leaving "} while (0)" outside the macro.
 */
#define VSET(vect,val) do { vect[0]=val; vect[1]=val; vect[2]=val; vect[3]=val; \
} while (0)
/*
 * Reproducer kernel for the missed-FMA report.
 *
 * Starting from x = cx, y = cy, repeatedly computes
 *   xn = x*x - y*y + cx
 *   yn = two*x*y + cy
 * on all vector lanes at once, accumulates the lane-wise comparison result
 * into res, and stores res through r once the exit test fires.
 *
 * cx, cy: per-lane constants, also used as the initial x and y.
 * r:      output; receives the accumulated res vector.
 *
 * The code is kept exactly as filed; see the NOTE(review) comments below
 * before "correcting" anything.
 */
void foo(v4do cx, v4do cy, v4i *r)
{
v4do x, y, xn, yn;
v4i add, res;
v4do two, four;
/* done is assigned once below and never read afterwards. */
long int done;
/* Broadcast the scalar constants to every lane. */
VSET(res, 0L);
VSET(two, 2.0);
VSET(four, 4.0);
x = cx;
y = cy;
done = 0;
while (1)
{
/* These multiply/add(sub) pairs are the candidates for fused multiply-add. */
xn = x*x - y*y + cx;
yn = two*x*y + cy;
/*
 * Lane-wise compare.  Per the GCC vector extensions, each result lane is
 * -1 where the comparison holds and 0 where it does not.
 * NOTE(review): the first term is xn+xn (a sum), not xn*xn.
 */
add = xn+xn + yn*yn < four;
/* Accumulate the comparison result lane by lane. */
res += add;
/*
 * NOTE(review): lanes 0 and 1 are tested for zero, lanes 2 and 3 for
 * non-zero (no "== 0").  Presumably unintended, but left as filed.
 */
if (add[0] == 0 || add[1] == 0 || add[2] || add[3])
break;
x = xn;
y = yn;
}
*r = res;
}
With gcc, the inner loop is translated into
.L13:
vpextrq $1, %xmm2, %rax
testq %rax, %rax
je .L2
vextracti128 $0x1, %ymm2, %xmm2
vmovq %xmm2, %rax
testq %rax, %rax
jne .L2
vpextrq $1, %xmm2, %rax
vmovapd %ymm4, %ymm3
testq %rax, %rax
jne .L2
.L3:
vmulpd %ymm3, %ymm3, %ymm4
vmulpd %ymm9, %ymm3, %ymm3
vaddpd %ymm0, %ymm4, %ymm4
vmulpd %ymm7, %ymm3, %ymm3
vsubpd %ymm10, %ymm4, %ymm4
vaddpd %ymm1, %ymm3, %ymm9
vaddpd %ymm4, %ymm4, %ymm2
vmulpd %ymm9, %ymm9, %ymm10
vaddpd %ymm10, %ymm2, %ymm2
vcmpltpd %ymm6, %ymm2, %ymm2
vmovq %xmm2, %rax
vpaddq %ymm2, %ymm8, %ymm8
testq %rax, %rax
jne .L13
icc -O3 -march=core-avx2 -S results in
..B1.6: # Preds ..B1.5
# Execution count [6.94e-01]
vmovdqa %ymm11, %ymm5 #27.7
# LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7
..B1.2: # Preds ..B1.6 ..B1.1
# Execution count [1.69e+00]
vmovaps %ymm4, %ymm11 #21.24
vfmsub213pd %ymm6, %ymm4, %ymm11 #21.24
vfmsub231pd %ymm5, %ymm5, %ymm11 #21.24
vmulpd %ymm3, %ymm5, %ymm5 #22.16
vaddpd %ymm11, %ymm11, %ymm8 #23.16
vfmadd213pd %ymm7, %ymm5, %ymm4 #22.22
vfmadd231pd %ymm4, %ymm4, %ymm8 #23.24
vcmpltpd %ymm2, %ymm8, %ymm9 #23.29
vandpd %ymm9, %ymm1, %ymm10 #23.29
vmovups %ymm10, -64(%rsp) #23.7
vpaddq %ymm10, %ymm0, %ymm0 #24.7
cmpq $0, -64(%rsp) #25.21
je ..B1.7 # Prob 20% #25.21
# LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11
..B1.3: # Preds ..B1.2
# Execution count [1.36e+00]
cmpq $0, -56(%rsp) #25.36
je ..B1.7 # Prob 20% #25.36
# LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11
..B1.4: # Preds ..B1.3
# Execution count [1.08e+00]
cmpq $0, -48(%rsp) #25.45
jne ..B1.7 # Prob 20% #25.45
# LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11
..B1.5: # Preds ..B1.4
# Execution count [8.67e-01]
cmpq $0, -40(%rsp) #25.55
je ..B1.6 # Prob 80% #25.55
# LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11
where icc uses eight double precision floating point operations
vs. gcc's ten.
More information about the Gcc-bugs
mailing list