[Bug target/79709] New: Suboptimal code with -mavx and explicit vector
tkoenig at gcc dot gnu.org
gcc-bugzilla@gcc.gnu.org
Fri Feb 24 20:30:00 GMT 2017
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79709
Bug ID: 79709
Summary: Suboptimal code with -mavx and explicit vector
Product: gcc
Version: 7.0.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: tkoenig at gcc dot gnu.org
Target Milestone: ---
For the following code
/* 32-byte vector types (GCC vector extension): four lanes of double, and
   four lanes of long int on targets where long is 64 bits wide. */
typedef double v4do __attribute__((vector_size (32)));
typedef long int v4i __attribute__((vector_size (32)));
/* Assign val to lanes 0..3 of vect.  The do/while (0) wrapper makes the
   expansion behave as a single statement.  The trailing backslash keeps the
   macro body on one logical line; without it the closing "} while (0)"
   falls outside the #define and the file does not compile. */
#define VSET(vect,val) do { vect[0]=val; vect[1]=val; vect[2]=val; vect[3]=val; \
} while (0)
/* Reproducer for the code-generation report.
   Starting from x = cx, y = cy, repeatedly computes new vectors xn, yn,
   compares a per-lane expression against 4.0, accumulates the comparison
   mask into res, and stores res through r once the exit test fires.
   The source is kept exactly as reported: the quoted assembly listings
   were generated from these statements. */
void foo(v4do cx, v4do cy, v4i *r)
{
v4do x, y, xn, yn;
v4i add, res;
v4do two, four;
/* done is assigned below but never read. */
long int done;
VSET(res, 0L);
VSET(two, 2.0);
VSET(four, 4.0);
x = cx;
y = cy;
done = 0;
while (1)
{
xn = x*x - y*y + cx;
yn = two*x*y + cy;
/* GCC vector comparison: each lane of add becomes -1 where the test holds
   and 0 where it does not.
   NOTE(review): the first term is xn+xn, not xn*xn; this matches the
   vaddpd of a register with itself in the listings, so it is left as is. */
add = xn+xn + yn*yn < four;
/* Adds the mask, i.e. subtracts 1 from each lane whose test held. */
res += add;
/* NOTE(review): lanes 0 and 1 break when zero, lanes 2 and 3 break when
   non-zero.  The asymmetry may be unintended, but the je/jne exits in the
   GCC listing correspond to this exact condition. */
if (add[0] == 0 || add[1] == 0 || add[2] || add[3])
break;
x = xn;
y = yn;
}
*r = res;
}
gcc generates strange code. The loop is translated with 7.0.1 20170212
with "gcc -O3 -S -mavx v.c" into
.L14:
vpextrq $1, %xmm2, %rax
testq %rax, %rax
je .L2
vmovdqa -48(%rbp), %ymm5
vextractf128 $0x1, %ymm5, %xmm2
vmovq %xmm2, %rax
testq %rax, %rax
jne .L2
vpextrq $1, %xmm2, %rax
vmovapd %ymm3, %ymm5
testq %rax, %rax
jne .L2
.L3:
vmulpd %ymm5, %ymm5, %ymm3
vmulpd %ymm8, %ymm5, %ymm5
vsubpd %ymm6, %ymm3, %ymm3
vmulpd %ymm4, %ymm5, %ymm4
vaddpd %ymm0, %ymm3, %ymm3
vaddpd %ymm1, %ymm4, %ymm4
vaddpd %ymm3, %ymm3, %ymm2
vmulpd %ymm4, %ymm4, %ymm6
vaddpd %ymm6, %ymm2, %ymm2
vcmpltpd %ymm7, %ymm2, %ymm5
vmovapd %ymm5, -48(%rbp)
vmovdqa -48(%rbp), %xmm5
vpaddq -112(%rbp), %xmm5, %xmm5
vmovaps %xmm5, -80(%rbp)
vmovdqa -32(%rbp), %xmm5
vpaddq -96(%rbp), %xmm5, %xmm2
vmovaps %xmm2, -64(%rbp)
vmovdqa -80(%rbp), %ymm2
vmovdqa %ymm2, -112(%rbp)
vmovdqa -48(%rbp), %xmm2
vmovq %xmm2, %rax
testq %rax, %rax
jne .L14
which contains quite a few unnecessary instructions for moving stuff around.
By comparison, clang translates the inner loop to
.LBB0_1: # =>This Inner Loop Header: Depth=1
vmulpd %ymm5, %ymm5, %ymm6
vmulpd %ymm4, %ymm4, %ymm7
vsubpd %ymm7, %ymm6, %ymm6
vaddpd %ymm5, %ymm5, %ymm7
vaddpd %ymm0, %ymm6, %ymm5
vmulpd %ymm7, %ymm4, %ymm4
vaddpd %ymm1, %ymm4, %ymm4
vaddpd %ymm5, %ymm5, %ymm6
vmulpd %ymm4, %ymm4, %ymm7
vaddpd %ymm7, %ymm6, %ymm6
vcmpltpd %ymm8, %ymm6, %ymm6
vextractf128 $1, %ymm6, %xmm7
vextractf128 $1, %ymm2, %xmm3
vpaddq %xmm3, %xmm7, %xmm3
vpaddq %xmm2, %xmm6, %xmm2
vinsertf128 $1, %xmm3, %ymm2, %ymm2
vmovq %xmm7, %rax
vpextrq $1, %xmm7, %rcx
orq %rax, %rcx
jne .LBB0_4
# BB#2: # in Loop: Header=BB0_1 Depth=1
vpextrq $1, %xmm6, %rax
testq %rax, %rax
je .LBB0_4
# BB#3: # in Loop: Header=BB0_1 Depth=1
vmovq %xmm6, %rax
testq %rax, %rax
jne .LBB0_1
which looks much more straightforward, and should be faster.
More information about the Gcc-bugs
mailing list