[Bug target/88494] [9 Regression] polyhedron 10% mdbx runtime regression

rguenther at suse dot de gcc-bugzilla@gcc.gnu.org
Fri Feb 1 11:06:00 GMT 2019


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88494

--- Comment #7 from rguenther at suse dot de <rguenther at suse dot de> ---
On Fri, 1 Feb 2019, peter at cordes dot ca wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88494
> 
> --- Comment #5 from Peter Cordes <peter at cordes dot ca> ---
>                IF ( xij.GT.+HALf ) xij = xij - PBCx
>                IF ( xij.LT.-HALf ) xij = xij + PBCx
> 
> For code like this, *if we can prove only one of the IF() conditions will be
> true*, we can implement it more efficiently, I think, by checking the magnitude
> of xij to see if a SUB is needed, and if so figuring out the sign to apply to
> PBCx.

...

> I think this is better than IF-conversion of both IFs separately, but I haven't
> really looked.  It should be much better for *latency*.  But it's only
> equivalent if subtracting PBCx can't possibly make xij negative and the next IF
> condition also true.
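
A minimal sketch of the combined form described above, assuming the two
conditions are mutually exclusive; ABS/SIGN stand in for the "check the
magnitude, then pick the sign to apply to PBCx" idea, and the variable
values are illustrative only:

      PROGRAM wrapdemo
      DOUBLE PRECISION xij, HALf, PBCx
      HALf = 0.5D0
      PBCx = 1.0D0
      xij  = 0.7D0
C     Original pair of guards:
C       IF ( xij.GT.+HALf ) xij = xij - PBCx
C       IF ( xij.LT.-HALf ) xij = xij + PBCx
C     Combined form: one magnitude test; SIGN(PBCx,xij) is PBCx
C     carrying the sign of xij, so one compare feeds one subtract.
      IF ( ABS(xij).GT.HALf ) xij = xij - SIGN(PBCx, xij)
      PRINT *, xij
      END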

That's probably the hard thing to prove, but it is probably also the
reason why at least part of the branches are quite well predicted.
Note HALf is in a /common/ block but is initialized to 0.5, and PBCx
is 1, so the two conditions can never both be true (if xij > 0.5,
then xij - 1 > -0.5, so the second test cannot fire).  With
-fwhole-program we figure out some of this and end up vectorizing the
loops:

.L243:
        vmovdqa 192(%rsp), %ymm6
        vmovapd .LC49(%rip), %ymm2
        addq    $64, %rax
        vmovapd .LC55(%rip), %ymm7
        vmovapd .LC49(%rip), %ymm3
        vmovdqa %ymm6, %ymm13
        vpaddd  .LC82(%rip), %ymm13, %ymm0
        vpaddd  .LC48(%rip), %ymm6, %ymm6
        vmovdqa %ymm6, 192(%rsp)
        vcvtdq2pd       %xmm0, %ymm1
        vextracti128    $0x1, %ymm0, %xmm0
        vfmadd213pd     .LC50(%rip), %ymm1, %ymm3
        vfmadd213pd     .LC50(%rip), %ymm1, %ymm3
        vcvtdq2pd       %xmm0, %ymm0
        vmulpd  .LC51(%rip), %ymm3, %ymm5
        vfmadd213pd     .LC50(%rip), %ymm0, %ymm2
        vmulpd  .LC51(%rip), %ymm2, %ymm4
        vaddpd  .LC52(%rip), %ymm5, %ymm9
        vaddpd  .LC52(%rip), %ymm4, %ymm8
        vfmadd132pd     .LC54(%rip), %ymm7, %ymm4
        vfmadd132pd     .LC54(%rip), %ymm7, %ymm5
        vmulpd  %ymm3, %ymm3, %ymm15
        vmulpd  %ymm2, %ymm2, %ymm14
        vfmadd213pd     .LC53(%rip), %ymm15, %ymm9
        vfmadd213pd     .LC53(%rip), %ymm14, %ymm8
        vmulpd  %ymm4, %ymm2, %ymm6
        vmulpd  .LC56(%rip), %ymm2, %ymm4
        vaddpd  .LC57(%rip), %ymm4, %ymm7
        vmulpd  %ymm5, %ymm3, %ymm5
        vfmadd213pd     .LC52(%rip), %ymm2, %ymm7
        vfmadd213pd     .LC53(%rip), %ymm7, %ymm14
        vmovapd %ymm6, 160(%rsp)
        vmulpd  .LC56(%rip), %ymm3, %ymm6
        vaddpd  .LC57(%rip), %ymm6, %ymm12
        vfmadd213pd     .LC52(%rip), %ymm3, %ymm12
        vfmadd213pd     .LC53(%rip), %ymm12, %ymm15
        vmovapd .LC59(%rip), %ymm12
        vfmadd132pd     .LC58(%rip), %ymm12, %ymm6
        vfmadd132pd     .LC58(%rip), %ymm12, %ymm4
        vfmadd213pd     .LC55(%rip), %ymm3, %ymm6
        vmovapd %ymm14, 96(%rsp)
        vmovapd %ymm15, 128(%rsp)
        vfmadd213pd     .LC55(%rip), %ymm2, %ymm4
        vmulpd  %ymm6, %ymm3, %ymm3
        vmulpd  %ymm4, %ymm2, %ymm4
        vmovapd .LC60(%rip), %ymm2
        vfmadd132pd     .LC49(%rip), %ymm2, %ymm1
        vfmadd132pd     .LC49(%rip), %ymm2, %ymm0
        vmulpd  .LC61(%rip), %ymm0, %ymm2
        vmovapd %ymm3, 64(%rsp)
        vmulpd  .LC61(%rip), %ymm1, %ymm3
        vaddpd  .LC62(%rip), %ymm2, %ymm6
        vaddpd  .LC62(%rip), %ymm3, %ymm12
        vfmadd213pd     .LC63(%rip), %ymm0, %ymm6
        vfmadd213pd     .LC63(%rip), %ymm1, %ymm12
        vmovapd %ymm4, 32(%rsp)
        vmovapd .LC66(%rip), %ymm4
        vfmadd132pd     .LC58(%rip), %ymm4, %ymm3
        vfmadd132pd     .LC58(%rip), %ymm4, %ymm2
        vfmadd213pd     .LC64(%rip), %ymm1, %ymm12
        vfmadd213pd     .LC64(%rip), %ymm0, %ymm6
        vfmadd213pd     .LC67(%rip), %ymm1, %ymm3
        vfmadd213pd     .LC67(%rip), %ymm0, %ymm2
        vfmadd213pd     .LC65(%rip), %ymm0, %ymm6
        vfmadd213pd     .LC65(%rip), %ymm1, %ymm12
        vfmadd213pd     .LC64(%rip), %ymm1, %ymm3
        vfmadd213pd     .LC64(%rip), %ymm2, %ymm0
        vpaddd  .LC68(%rip), %ymm13, %ymm1
        vpminud .LC69(%rip), %ymm1, %ymm2
        vpcmpgtd        .LC70(%rip), %ymm13, %ymm4
        vpcmpeqd        %ymm2, %ymm1, %ymm1
        vpminsd .LC71(%rip), %ymm13, %ymm2
        vpmovsxdq       %xmm4, %ymm14
        vextracti128    $0x1, %ymm4, %xmm4
        vpcmpeqd        %ymm2, %ymm13, %ymm2
        vpmovsxdq       %xmm4, %ymm4
        vpmovsxdq       %xmm1, %ymm15
        vextracti128    $0x1, %ymm1, %xmm1
        vblendvpd       %ymm15, 128(%rsp), %ymm9, %ymm7
        vpmovsxdq       %xmm1, %ymm1
        vblendvpd       %ymm15, 64(%rsp), %ymm5, %ymm15
        vpmovsxdq       %xmm2, %ymm13
        vblendvpd       %ymm14, %ymm9, %ymm7, %ymm7
        vextracti128    $0x1, %ymm2, %xmm2
        vblendvpd       %ymm13, %ymm12, %ymm7, %ymm7
        vpmovsxdq       %xmm2, %ymm2
        vblendvpd       %ymm14, %ymm5, %ymm15, %ymm5
        vmovupd %ymm7, -64(%rax)
        vblendvpd       %ymm1, 96(%rsp), %ymm8, %ymm7
        vblendvpd       %ymm13, %ymm3, %ymm5, %ymm3
        vblendvpd       %ymm4, %ymm8, %ymm7, %ymm7
        vblendvpd       %ymm2, %ymm6, %ymm7, %ymm7
        vmovupd %ymm7, -32(%rax)
        vmovapd %ymm3, 15944(%rax)
        vmovapd 160(%rsp), %ymm6
        vblendvpd       %ymm1, 32(%rsp), %ymm6, %ymm1
        vblendvpd       %ymm4, %ymm6, %ymm1, %ymm4
        vblendvpd       %ymm2, %ymm0, %ymm4, %ymm0
        vmovapd %ymm0, 15976(%rax)
        cmpq    %rax, %rdx
        jne     .L243

and in the end it's not faster than without vectorization ... :/ (on Haswell again)
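
For comparison, if-converting each IF separately (the baseline mentioned
in comment #5) corresponds roughly to the MERGE form below; a hedged
source-level sketch, not necessarily what the vectorized loop above
literally does.  Both arms of each MERGE are evaluated and the two
selects chain one after the other, which is where the latency argument
for the combined ABS/SIGN form comes from.

      PROGRAM mergedemo
      DOUBLE PRECISION xij, HALf, PBCx
      HALf = 0.5D0
      PBCx = 1.0D0
      xij  = 0.7D0
C     Each IF if-converted on its own: both arms are computed and a
C     select picks the result, twice in a row.
      xij = MERGE(xij - PBCx, xij, xij.GT.+HALf)
      xij = MERGE(xij + PBCx, xij, xij.LT.-HALf)
      PRINT *, xij
      END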

