[Bug target/88494] [9 Regression] polyhedron 10% mdbx runtime regression
rguenther at suse dot de
gcc-bugzilla@gcc.gnu.org
Fri Feb 1 11:06:00 GMT 2019
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88494
--- Comment #7 from rguenther at suse dot de <rguenther at suse dot de> ---
On Fri, 1 Feb 2019, peter at cordes dot ca wrote:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88494
>
> --- Comment #5 from Peter Cordes <peter at cordes dot ca> ---
> IF ( xij.GT.+HALf ) xij = xij - PBCx
> IF ( xij.LT.-HALf ) xij = xij + PBCx
>
> For code like this, *if we can prove only one of the IF() conditions will be
> true*, we can implement it more efficiently, I think, by checking the magnitude
> of xij to see if a SUB is needed, and if so figuring out the sign to apply to
> PBCx.
...
> I think this is better than IF-conversion of both IFs separately, but I haven't
> really looked. It should be much better for *latency*. But it's only
> equivalent if subtracting PBCx can't possibly make xij negative and the next IF
> condition also true.
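(To sketch what I understand the suggestion to be -- assuming HALf is
exactly half of PBCx, so that at most one correction is ever needed --
the two IFs would collapse into a single magnitude test plus a SIGN
intrinsic that applies the sign of xij to the correction:

      ! hypothetical rewrite of the two IFs: at most one wrap of xij
      IF ( ABS(xij).GT.HALf ) xij = xij - SIGN(PBCx, xij)

SIGN(PBCx, xij) is PBCx with the sign of xij copied onto it, so the
subtraction moves xij toward zero in either case.)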
That is probably the hard thing to prove, but probably also the reason why
at least part of the branches are quite well predicted.  Note HALf is
in a /common/ but initialized to 0.5 and PBCx to 1, so both
branches are never true at the same time.  With -fwhole-program we figure
out some of these bits and end up vectorizing the loops:
.L243:
vmovdqa 192(%rsp), %ymm6
vmovapd .LC49(%rip), %ymm2
addq $64, %rax
vmovapd .LC55(%rip), %ymm7
vmovapd .LC49(%rip), %ymm3
vmovdqa %ymm6, %ymm13
vpaddd .LC82(%rip), %ymm13, %ymm0
vpaddd .LC48(%rip), %ymm6, %ymm6
vmovdqa %ymm6, 192(%rsp)
vcvtdq2pd %xmm0, %ymm1
vextracti128 $0x1, %ymm0, %xmm0
vfmadd213pd .LC50(%rip), %ymm1, %ymm3
vcvtdq2pd %xmm0, %ymm0
vmulpd .LC51(%rip), %ymm3, %ymm5
vfmadd213pd .LC50(%rip), %ymm0, %ymm2
vmulpd .LC51(%rip), %ymm2, %ymm4
vaddpd .LC52(%rip), %ymm5, %ymm9
vaddpd .LC52(%rip), %ymm4, %ymm8
vfmadd132pd .LC54(%rip), %ymm7, %ymm4
vfmadd132pd .LC54(%rip), %ymm7, %ymm5
vmulpd %ymm3, %ymm3, %ymm15
vmulpd %ymm2, %ymm2, %ymm14
vfmadd213pd .LC53(%rip), %ymm15, %ymm9
vfmadd213pd .LC53(%rip), %ymm14, %ymm8
vmulpd %ymm4, %ymm2, %ymm6
vmulpd .LC56(%rip), %ymm2, %ymm4
vaddpd .LC57(%rip), %ymm4, %ymm7
vmulpd %ymm5, %ymm3, %ymm5
vfmadd213pd .LC52(%rip), %ymm2, %ymm7
vfmadd213pd .LC53(%rip), %ymm7, %ymm14
vmovapd %ymm6, 160(%rsp)
vmulpd .LC56(%rip), %ymm3, %ymm6
vaddpd .LC57(%rip), %ymm6, %ymm12
vfmadd213pd .LC52(%rip), %ymm3, %ymm12
vfmadd213pd .LC53(%rip), %ymm12, %ymm15
vmovapd .LC59(%rip), %ymm12
vfmadd132pd .LC58(%rip), %ymm12, %ymm6
vfmadd132pd .LC58(%rip), %ymm12, %ymm4
vfmadd213pd .LC55(%rip), %ymm3, %ymm6
vmovapd %ymm14, 96(%rsp)
vmovapd %ymm15, 128(%rsp)
vfmadd213pd .LC55(%rip), %ymm2, %ymm4
vmulpd %ymm6, %ymm3, %ymm3
vmulpd %ymm4, %ymm2, %ymm4
vmovapd .LC60(%rip), %ymm2
vfmadd132pd .LC49(%rip), %ymm2, %ymm1
vfmadd132pd .LC49(%rip), %ymm2, %ymm0
vmulpd .LC61(%rip), %ymm0, %ymm2
vmovapd %ymm3, 64(%rsp)
vmulpd .LC61(%rip), %ymm1, %ymm3
vaddpd .LC62(%rip), %ymm2, %ymm6
vaddpd .LC62(%rip), %ymm3, %ymm12
vfmadd213pd .LC63(%rip), %ymm0, %ymm6
vfmadd213pd .LC63(%rip), %ymm1, %ymm12
vmovapd %ymm4, 32(%rsp)
vmovapd .LC66(%rip), %ymm4
vfmadd132pd .LC58(%rip), %ymm4, %ymm3
vfmadd132pd .LC58(%rip), %ymm4, %ymm2
vfmadd213pd .LC64(%rip), %ymm1, %ymm12
vfmadd213pd .LC64(%rip), %ymm0, %ymm6
vfmadd213pd .LC67(%rip), %ymm1, %ymm3
vfmadd213pd .LC67(%rip), %ymm0, %ymm2
vfmadd213pd .LC65(%rip), %ymm0, %ymm6
vfmadd213pd .LC65(%rip), %ymm1, %ymm12
vfmadd213pd .LC64(%rip), %ymm1, %ymm3
vfmadd213pd .LC64(%rip), %ymm2, %ymm0
vpaddd .LC68(%rip), %ymm13, %ymm1
vpminud .LC69(%rip), %ymm1, %ymm2
vpcmpgtd .LC70(%rip), %ymm13, %ymm4
vpcmpeqd %ymm2, %ymm1, %ymm1
vpminsd .LC71(%rip), %ymm13, %ymm2
vpmovsxdq %xmm4, %ymm14
vextracti128 $0x1, %ymm4, %xmm4
vpcmpeqd %ymm2, %ymm13, %ymm2
vpmovsxdq %xmm4, %ymm4
vpmovsxdq %xmm1, %ymm15
vextracti128 $0x1, %ymm1, %xmm1
vblendvpd %ymm15, 128(%rsp), %ymm9, %ymm7
vpmovsxdq %xmm1, %ymm1
vblendvpd %ymm15, 64(%rsp), %ymm5, %ymm15
vpmovsxdq %xmm2, %ymm13
vblendvpd %ymm14, %ymm9, %ymm7, %ymm7
vextracti128 $0x1, %ymm2, %xmm2
vblendvpd %ymm13, %ymm12, %ymm7, %ymm7
vpmovsxdq %xmm2, %ymm2
vblendvpd %ymm14, %ymm5, %ymm15, %ymm5
vmovupd %ymm7, -64(%rax)
vblendvpd %ymm1, 96(%rsp), %ymm8, %ymm7
vblendvpd %ymm13, %ymm3, %ymm5, %ymm3
vblendvpd %ymm4, %ymm8, %ymm7, %ymm7
vblendvpd %ymm2, %ymm6, %ymm7, %ymm7
vmovupd %ymm7, -32(%rax)
vmovapd %ymm3, 15944(%rax)
vmovapd 160(%rsp), %ymm6
vblendvpd %ymm1, 32(%rsp), %ymm6, %ymm1
vblendvpd %ymm4, %ymm6, %ymm1, %ymm4
vblendvpd %ymm2, %ymm0, %ymm4, %ymm0
vmovapd %ymm0, 15976(%rax)
cmpq %rax, %rdx
jne .L243
and in the end it's not faster than w/o vectorization ... :/ (on Haswell again)