Bug 64716 - Missed vectorization in a hot code of SPEC2000 ammp
Summary: Missed vectorization in a hot code of SPEC2000 ammp
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 5.0
Importance: P3 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks: spec vectorizer 84613
 
Reported: 2015-01-21 16:58 UTC by Vladimir Makarov
Modified: 2021-07-21 03:28 UTC
CC List: 1 user

See Also:
Host:
Target:
Build:
Known to work:
Known to fail:
Last reconfirmed: 2015-01-21 00:00:00


Attachments
Preprocessed rectmm.c from SPEC2000 ammp (15.48 KB, text/plain)
2015-01-21 16:58 UTC, Vladimir Makarov
rectmm.c code annotated by gcov to see other hot code parts (11.39 KB, text/plain)
2015-01-21 17:36 UTC, Vladimir Makarov

Description Vladimir Makarov 2015-01-21 16:58:08 UTC
Created attachment 34521 [details]
Preprocessed rectmm.c from SPEC2000 ammp

GCC does not vectorize one of the hottest code regions in SPECFP2000 ammp
(function mm_fv_update_nonbon in file rectmm.c) on x86-64 when -Ofast
-march=core-avx2 -ffast-math -fno-schedule-insns2 is used.  The
preprocessed rectmm.i is attached.

The source code under consideration is:

 r0 = 1./(*vector)[j+3];
 r = r0*r0;
 r = r*r*r;
 xt = a1->q*a2->q*dielectric*r0;
 yt = a1->a*a2->a*r;
 zt = a1->b*a2->b*r*r;
 k = xt - yt + zt;
 xt = xt*r0; yt = yt*r0; zt = zt*r0;
 k1 = xt - yt*6. + zt*12.;
 xt = xt*r0; yt = yt*r0; zt = zt*r0;
 k2 = xt*3.; ka2 = - yt*6.*8.; kb2 = zt*12.*14;




 k1 = -k1;
 xt = (*vector)[j]*r0 ;
 yt = (*vector)[j+1]*r0 ;
 zt = (*vector)[j+2] *r0;





 a1->VP += k;
 a2->dpx -= k1*xt;
 a1->dpx += k1*xt;
 a2->dpy -= k1*yt;
 a1->dpy += k1*yt;
 a2->dpz -= k1*zt;
 a1->dpz += k1*zt;
 xt2 = xt*xt; yt2 = yt*yt; zt2 = zt*zt;
 a2->qxx -= k2*(xt2 - 1./3) + ka2*(xt2 - 1./8)+kb2*(xt2-1./14) ;
 a1->qxx -= k2*(xt2 - 1./3) + ka2*(xt2 - 1./8)+kb2*(xt2-1./14) ;
 a2->qxy -= (k2+ka2+kb2)*yt*xt;
 a1->qxy -= (k2+ka2+kb2)*yt*xt;
 a2->qxz -= (k2+ka2+kb2)*zt*xt;
 a1->qxz -= (k2+ka2+kb2)*zt*xt;
 a2->qyy -= k2*(yt2 - 1./3) + ka2*(yt2 - 1./8)+kb2*(yt2-1./14) ;
 a1->qyy -= k2*(yt2 - 1./3) + ka2*(yt2 - 1./8)+kb2*(yt2-1./14) ;
 a2->qyz -= (k2+ka2+kb2)*yt*zt;
 a1->qyz -= (k2+ka2+kb2)*yt*zt;
 a2->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8)+kb2*(zt2-1./14) ;
 a1->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8)+kb2*(zt2-1./14) ;

GCC on the trunk generates 118 insns:

.L85:
        .cfi_restore_state
        vmovsd  .LC12(%rip), %xmm7
        vdivsd  %xmm0, %xmm7, %xmm6
        vmulsd  %xmm6, %xmm6, %xmm0
        vmulsd  %xmm0, %xmm0, %xmm10
        vmulsd  %xmm10, %xmm0, %xmm0
        vmovsd  56(%rbx), %xmm12
        vmulsd  56(%rdi), %xmm12, %xmm12
        vmulsd  %xmm4, %xmm12, %xmm12
        vmulsd  %xmm12, %xmm6, %xmm12
        vmovsd  64(%rbx), %xmm10
        vmulsd  64(%rdi), %xmm10, %xmm10
        vmulsd  %xmm10, %xmm0, %xmm11
        vmovsd  72(%rbx), %xmm10
        vmulsd  72(%rdi), %xmm10, %xmm10
        vmulsd  %xmm10, %xmm0, %xmm10
        vmulsd  %xmm0, %xmm10, %xmm10
        vmulsd  %xmm12, %xmm6, %xmm0
        vmulsd  %xmm11, %xmm6, %xmm1
        vmulsd  %xmm10, %xmm6, %xmm2
        vmulsd  .LC22(%rip), %xmm2, %xmm8
        vfnmadd231sd    %xmm9, %xmm1, %xmm8
        vaddsd  %xmm8, %xmm0, %xmm8
        vmulsd  .LC21(%rip), %xmm6, %xmm5
        vmulsd  %xmm0, %xmm5, %xmm5
        vmulsd  %xmm1, %xmm6, %xmm0
        vxorpd  %xmm15, %xmm0, %xmm0
        vmulsd  .LC24(%rip), %xmm0, %xmm3
        vmulsd  .LC25(%rip), %xmm6, %xmm7
        vmulsd  %xmm2, %xmm7, %xmm7
        vxorpd  %xmm15, %xmm8, %xmm8
        movslq  %esi, %rax
        vmulsd  (%r12,%rax,8), %xmm6, %xmm2
        leal    1(%rsi), %eax
        cltq
        vmulsd  (%r12,%rax,8), %xmm6, %xmm1
        leal    2(%rsi), %eax
        cltq
        vmulsd  (%r12,%rax,8), %xmm6, %xmm0
        vaddsd  208(%rbx), %xmm12, %xmm12
        vaddsd  %xmm12, %xmm10, %xmm10
        vsubsd  %xmm11, %xmm10, %xmm10
        vmovsd  %xmm10, 208(%rbx)
        vmovapd %xmm8, %xmm6
        vfnmadd213sd    240(%rdi), %xmm2, %xmm6
        vmovsd  %xmm6, 240(%rdi)
        vmovapd %xmm8, %xmm6
        vfmadd213sd     240(%rbx), %xmm2, %xmm6
        vmovsd  %xmm6, 240(%rbx)
        vmovapd %xmm8, %xmm6
        vfnmadd213sd    248(%rdi), %xmm1, %xmm6
        vmovsd  %xmm6, 248(%rdi)
        vmovapd %xmm8, %xmm6
        vfmadd213sd     248(%rbx), %xmm1, %xmm6
        vmovsd  %xmm6, 248(%rbx)
        vmovapd %xmm8, %xmm6
        vfnmadd213sd    256(%rdi), %xmm0, %xmm6
        vmovsd  %xmm6, 256(%rdi)
        vfmadd213sd     256(%rbx), %xmm0, %xmm8
        vmovsd  %xmm8, 256(%rbx)
        vmovsd  .LC26(%rip), %xmm8
        vmovapd %xmm2, %xmm11
        vfnmadd132sd    %xmm2, %xmm8, %xmm11
        vmulsd  %xmm11, %xmm5, %xmm11
        vmovsd  .LC27(%rip), %xmm6
        vmovapd %xmm2, %xmm10
        vfnmadd132sd    %xmm2, %xmm6, %xmm10
        vmovapd %xmm10, %xmm12
        vfmadd132sd     %xmm7, %xmm11, %xmm12
        vmovsd  .LC28(%rip), %xmm10
        vmovapd %xmm2, %xmm11
        vfnmadd132sd    %xmm2, %xmm10, %xmm11
        vfmadd132sd     %xmm3, %xmm12, %xmm11
        vaddsd  264(%rdi), %xmm11, %xmm12
        vmovsd  %xmm12, 264(%rdi)
        vaddsd  264(%rbx), %xmm11, %xmm11
        vmovsd  %xmm11, 264(%rbx)
        vaddsd  %xmm7, %xmm5, %xmm12
        vaddsd  %xmm12, %xmm3, %xmm12
        vmulsd  %xmm12, %xmm1, %xmm11
        vmovapd %xmm2, %xmm13
        vfnmadd213sd    272(%rdi), %xmm11, %xmm13
        vmovsd  %xmm13, 272(%rdi)
        vmovapd %xmm2, %xmm13
        vfnmadd213sd    272(%rbx), %xmm11, %xmm13
        vmovsd  %xmm13, 272(%rbx)
        vmulsd  %xmm0, %xmm2, %xmm2
        vmovapd %xmm12, %xmm13
        vfnmadd213sd    280(%rdi), %xmm2, %xmm13
        vmovsd  %xmm13, 280(%rdi)
        vfnmadd213sd    280(%rbx), %xmm12, %xmm2
        vmovsd  %xmm2, 280(%rbx)
        vmovapd %xmm1, %xmm2
        vfnmadd132sd    %xmm1, %xmm8, %xmm2
        vmulsd  %xmm2, %xmm5, %xmm12
        vmovapd %xmm1, %xmm2
        vfnmadd132sd    %xmm1, %xmm6, %xmm2
        vfmadd132sd     %xmm7, %xmm12, %xmm2
        vfnmadd132sd    %xmm1, %xmm10, %xmm1
        vfmadd132sd     %xmm3, %xmm2, %xmm1
        vaddsd  288(%rdi), %xmm1, %xmm2
        vmovsd  %xmm2, 288(%rdi)
        vaddsd  288(%rbx), %xmm1, %xmm1
        vmovsd  %xmm1, 288(%rbx)
        vmovapd %xmm0, %xmm1
        vfnmadd213sd    296(%rdi), %xmm11, %xmm1
        vmovsd  %xmm1, 296(%rdi)
        vfnmadd213sd    296(%rbx), %xmm0, %xmm11
        vmovsd  %xmm11, 296(%rbx)
        vfnmadd231sd    %xmm0, %xmm0, %xmm8
        vmulsd  %xmm8, %xmm5, %xmm5
        vfnmadd231sd    %xmm0, %xmm0, %xmm6
        vfmadd132sd     %xmm6, %xmm5, %xmm7
        vfnmadd132sd    %xmm0, %xmm10, %xmm0
        vfmadd132sd     %xmm3, %xmm7, %xmm0
        vaddsd  304(%rdi), %xmm0, %xmm1
        vmovsd  %xmm1, 304(%rdi)
        vaddsd  304(%rbx), %xmm0, %xmm0
        vmovsd  %xmm0, 304(%rbx)

LLVM-3.5 with -Ofast -ffast-math -march=core-avx2 generates
107 insns (10% less than GCC!):

.LBB0_135:                              # %if.then1703
                                        #   in Loop: Header=BB0_132 Depth=3
         leal    (,%r15,4), %eax
        vmovsd  .LCPI0_4(%rip), %xmm1
        vdivsd  %xmm0, %xmm1, %xmm1
        vmulsd  %xmm1, %xmm1, %xmm0
        vmulsd  %xmm0, %xmm0, %xmm2
        vmulsd  %xmm2, %xmm0, %xmm0
        vmovsd  56(%r13), %xmm2
        vmovsd  64(%r13), %xmm3
        vmulsd  56(%rcx), %xmm2, %xmm2
        vmovsd  368(%rsp), %xmm4        # 8-byte Reload
        vmulsd  %xmm2, %xmm4, %xmm2
        vmulsd  %xmm2, %xmm1, %xmm2
        vmulsd  64(%rcx), %xmm3, %xmm3
        vmulsd  %xmm3, %xmm0, %xmm3
        vmovsd  72(%r13), %xmm4
        vmulsd  72(%rcx), %xmm4, %xmm4
        vmulsd  %xmm0, %xmm0, %xmm0
        vmulsd  %xmm4, %xmm0, %xmm0
        vsubsd  %xmm3, %xmm2, %xmm4
        vaddsd  %xmm0, %xmm4, %xmm5
        vmulsd  %xmm2, %xmm1, %xmm2
        vmulsd  %xmm3, %xmm1, %xmm3
        vmulsd  %xmm0, %xmm1, %xmm0
        vmovsd  .LCPI0_9(%rip), %xmm4
        vfmsub213sd     %xmm2, %xmm3, %xmm4
        vmovsd  .LCPI0_10(%rip), %xmm6
        vfmadd213sd     %xmm4, %xmm0, %xmm6
        vmulsd  %xmm2, %xmm1, %xmm2
        vmulsd  %xmm3, %xmm1, %xmm4
        vmulsd  %xmm0, %xmm1, %xmm0
        vmulsd  .LCPI0_11(%rip), %xmm2, %xmm11
        vmulsd  .LCPI0_12(%rip), %xmm4, %xmm14
        vmulsd  .LCPI0_13(%rip), %xmm0, %xmm10
        cltq
        vpermilpd       $0, %xmm1, %xmm0 # xmm0 = xmm1[0,0]
        vmulpd  (%r11,%rax,8), %xmm0, %xmm0
        orl     $2, %eax
        cltq
        vmulsd  (%r11,%rax,8), %xmm1, %xmm9
        vaddsd  208(%r13), %xmm5, %xmm5
        vmovsd  %xmm5, 208(%r13)
        vpermilpd       $0, %xmm6, %xmm5 # xmm5 = xmm6[0,0]
        vmulpd  %xmm0, %xmm5, %xmm5
        vmovupd 240(%rcx), %xmm7
        vsubpd  %xmm5, %xmm7, %xmm7
        vmovupd %xmm7, 240(%rcx)
        vaddpd  240(%r13), %xmm5, %xmm5
        vmovupd %xmm5, 240(%r13)
        vmulsd  %xmm6, %xmm9, %xmm5
        vmovsd  256(%rcx), %xmm6
        vsubsd  %xmm5, %xmm6, %xmm6
        vmovsd  %xmm6, 256(%rcx)
        vaddsd  256(%r13), %xmm5, %xmm5
        vmovsd  %xmm5, 256(%r13)
        vmulsd  %xmm0, %xmm0, %xmm5
        vunpckhpd       %xmm0, %xmm0, %xmm8 # xmm8 = xmm0[1,1]
        vmulsd  %xmm8, %xmm8, %xmm15
        vmulsd  %xmm9, %xmm9, %xmm7
        vmovsd  .LCPI0_14(%rip), %xmm3
        vaddsd  %xmm3, %xmm5, %xmm1
        vmovsd  .LCPI0_15(%rip), %xmm4
        vaddsd  %xmm4, %xmm5, %xmm2
        vmulsd  %xmm2, %xmm14, %xmm2
        vfmadd213sd     %xmm2, %xmm11, %xmm1
        vmovsd  .LCPI0_16(%rip), %xmm6
        vaddsd  %xmm6, %xmm5, %xmm5
        vfmadd213sd     %xmm1, %xmm10, %xmm5
        vaddsd  %xmm3, %xmm15, %xmm1
        vaddsd  %xmm4, %xmm15, %xmm2
        vmulsd  %xmm2, %xmm14, %xmm2
        vfmadd213sd     %xmm2, %xmm11, %xmm1
        vaddsd  %xmm3, %xmm7, %xmm2
        vaddsd  %xmm4, %xmm7, %xmm3
        vmulsd  %xmm3, %xmm14, %xmm3
        vfmadd213sd     %xmm3, %xmm11, %xmm2
        vaddsd  %xmm14, %xmm11, %xmm3
        vaddsd  %xmm6, %xmm15, %xmm4
        vfmadd213sd     %xmm1, %xmm10, %xmm4
        vaddsd  %xmm6, %xmm7, %xmm1
        vfmadd213sd     %xmm2, %xmm10, %xmm1
        vaddsd  %xmm10, %xmm3, %xmm2
        vmulsd  %xmm2, %xmm8, %xmm3
        vmulsd  %xmm3, %xmm0, %xmm6
        vunpcklpd       %xmm6, %xmm5, %xmm5 # xmm5 = xmm5[0],xmm6[0]
        vmovupd 264(%rcx), %xmm6
        vsubpd  %xmm5, %xmm6, %xmm6
        vmovupd %xmm6, 264(%rcx)
        vmovupd 264(%r13), %xmm6
        vsubpd  %xmm5, %xmm6, %xmm5
        vmovupd %xmm5, 264(%r13)
        vmulsd  %xmm2, %xmm9, %xmm2
        vmulsd  %xmm2, %xmm0, %xmm0
        vunpcklpd       %xmm4, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm4[0]
        vmovupd 280(%rcx), %xmm2
        vsubpd  %xmm0, %xmm2, %xmm2
        vmovupd %xmm2, 280(%rcx)
        vmovupd 280(%r13), %xmm2
        vsubpd  %xmm0, %xmm2, %xmm0
        vmovupd %xmm0, 280(%r13)
        vmulsd  %xmm3, %xmm9, %xmm0
        vunpcklpd       %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm1[0]
        vmovupd 296(%rcx), %xmm1
        vsubpd  %xmm0, %xmm1, %xmm1
        vmovupd %xmm1, 296(%rcx)
        vmovupd 296(%r13), %xmm1
        vsubpd  %xmm0, %xmm1, %xmm0
        vmovupd %xmm0, 296(%r13)

This is achieved by vectorization; note the vsubpd and vmulpd instructions in
the LLVM-generated code.
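
For reference, the packed dpx/dpy update in the LLVM code above (the
vmulpd/vsubpd/vaddpd sequence around offsets 240/248) corresponds roughly to
the following intrinsics sketch.  The helper and its parameter names are
illustrative only, and it assumes dpx and dpy are adjacent double fields, as
the field offsets in the assembly suggest:

#include <immintrin.h>

/* Rough sketch (not from the bug) of the 2-wide SLP form LLVM emits for the
   dpx/dpy updates; a1_dpx and a2_dpx point at the dpx field of the two
   atoms, vec_j at (*vector)[j].  */
static void update_dpx_dpy (double *a1_dpx, double *a2_dpx,
                            const double *vec_j, double r0, double k1)
{
  __m128d r0v  = _mm_set1_pd (r0);
  __m128d k1v  = _mm_set1_pd (k1);
  __m128d xtyt = _mm_mul_pd (_mm_loadu_pd (vec_j), r0v);        /* xt, yt */
  __m128d d    = _mm_mul_pd (k1v, xtyt);                  /* k1*xt, k1*yt */
  _mm_storeu_pd (a2_dpx, _mm_sub_pd (_mm_loadu_pd (a2_dpx), d)); /* a2 -= */
  _mm_storeu_pd (a1_dpx, _mm_add_pd (_mm_loadu_pd (a1_dpx), d)); /* a1 += */
}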
Comment 1 Vladimir Makarov 2015-01-21 17:36:53 UTC
Created attachment 34523 [details]
rectmm.c code annotated by gcov to see other hot code parts
Comment 2 Jakub Jelinek 2015-01-21 19:37:46 UTC
This is about SLP vectorization: for some reason we treat the whole basic block as one item to SLP-vectorize, rather than trying to vectorize just the individual statements where that is beneficial.
So we end up with:
pr64716.c:2633:12: note: Build SLP failed: unrolling required in basic block SLP
pr64716.c:2633:12: note: Failed to SLP the basic block.
pr64716.c:2633:12: note: not vectorized: failed to find SLP opportunities in basic block.

where 2633 in my copy is the r0 = 1./(*vector)[j+3]; line; group_size is 6 on that stmt and nunits is 4 (for AVX2) or 2 (for 128-bit vectors).
Comment 3 Jakub Jelinek 2015-01-21 19:54:33 UTC
Say on:
 a2->qyz -= (k2+ka2+kb2)*yt*zt;
 a1->qyz -= (k2+ka2+kb2)*yt*zt;
 a2->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8)+kb2*(zt2-1./14) ;
 a1->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8)+kb2*(zt2-1./14) ;
it seems that
temp1 = (k2+ka2+kb2)*yt*zt
and
temp2 = k2*(zt2 - 1./3) + ka2*(zt2 - 1./8)+kb2*(zt2-1./14)
are computed in scalar code, then combined into a V2DFmode vector, and the
a1->qyz -= temp1;
a1->qzz -= temp2;
a2->qyz -= temp1;
a2->qzz -= temp2;
part is already performed using vectorized code.  We'd need to carefully analyze the costs to decide whether putting the scalars into a vector is beneficial, but presumably it is if the cost-model score says so.
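
A minimal intrinsics sketch of that shape (the helper is hypothetical; it
assumes qyz and qzz are adjacent double fields, as the 296/304 offsets in the
GCC assembly suggest):

#include <immintrin.h>

/* temp1 and temp2 stay scalar; the four -= updates are done 2-wide.  */
static void update_qyz_qzz (double *a1_qyz, double *a2_qyz,
                            double k2, double ka2, double kb2,
                            double yt, double zt, double zt2)
{
  double temp1 = (k2 + ka2 + kb2) * yt * zt;
  double temp2 = k2 * (zt2 - 1./3) + ka2 * (zt2 - 1./8) + kb2 * (zt2 - 1./14);
  __m128d t = _mm_set_pd (temp2, temp1);             /* { temp1, temp2 } */
  _mm_storeu_pd (a1_qyz, _mm_sub_pd (_mm_loadu_pd (a1_qyz), t));
  _mm_storeu_pd (a2_qyz, _mm_sub_pd (_mm_loadu_pd (a2_qyz), t));
}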

Or the:
                      xt = (*vector)[j] * r0;
                      yt = (*vector)[j + 1] * r0;
                      zt = (*vector)[j + 2] * r0;
                      a2->dpx -= k1 * xt;
                      a1->dpx += k1 * xt;
                      a2->dpy -= k1 * yt;
                      a1->dpy += k1 * yt;
                      a2->dpz -= k1 * zt;
                      a1->dpz += k1 * zt;
part shows that even though this would ideally be vectorized with V3DFmode vectors, it can be vectorized using V2DFmode plus a scalar for the *z* elements.
Or, say, for a group of 6 we could consider vectorizing with a 4-unit vector plus a 2-unit vector for the remainder (perhaps split apart the SLP instance for that and analyze each part individually?).
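
A hypothetical sketch of that 4 + 2 split on a group of six adjacent doubles
(the function and its names are made up for illustration, not taken from
rectmm.c):

#include <immintrin.h>

/* Handle elements 0..3 with one 256-bit operation and the 4..5 remainder
   with one 128-bit operation, instead of giving up on the whole group.  */
static void axpy6 (double *dst, const double *src, double scale)
{
  __m256d s4 = _mm256_set1_pd (scale);
  __m128d s2 = _mm_set1_pd (scale);

  _mm256_storeu_pd (dst,
                    _mm256_add_pd (_mm256_loadu_pd (dst),
                                   _mm256_mul_pd (s4, _mm256_loadu_pd (src))));
  _mm_storeu_pd (dst + 4,
                 _mm_add_pd (_mm_loadu_pd (dst + 4),
                             _mm_mul_pd (s2, _mm_loadu_pd (src + 4))));
}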
Comment 4 vekumar 2016-06-10 12:32:48 UTC
Tried to see if there is an improvement when allowing group stores to be split at the VF boundary.
 
A small improvement was noted with a slightly older trunk:
gcc version 7.0.0 20160524 (experimental) (GCC)

rectmm.c:520:2: note: Basic block will be vectorized using SLP


(Snip)
a1-> px = a1->x + lambda*a1->dx;
a1-> py = a1->y + lambda*a1->dy;
a1-> pz = a1->z + lambda*a1->dz;
(Snip)

---SLP dump---
rectmm.c:520:2: note: Detected interleaving load a1_944->xD.4701 and a1_944->yD.4702
rectmm.c:520:2: note: Detected interleaving load a1_944->xD.4701 and a1_944->zD.4703
rectmm.c:520:2: note: Detected interleaving load a1_944->xD.4701 and a1_944->dxD.4721
rectmm.c:520:2: note: Detected interleaving load a1_944->xD.4701 and a1_944->dyD.4722
rectmm.c:520:2: note: Detected interleaving load a1_944->xD.4701 and a1_944->dzD.4723
rectmm.c:520:2: note: Detected interleaving store a1_944->pxD.4728 and a1_944->pyD.4729
rectmm.c:520:2: note: Detected interleaving store a1_944->pxD.4728 and a1_944->pzD.4730

rectmm.c:520:2: note: Split group into 2 and 1

rectmm.c:520:2: note: Basic block will be vectorized using SLP
rectmm.c:520:2: note: SLPing BB part
rectmm.c:520:2: note: ------>vectorizing SLP node starting from: # VUSE <.MEM_1752>
_672 = a1_944->dxD.4721;
---SLP dump---
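
The "Split group into 2 and 1" above means the px/py stores get a 2-wide
vector while pz stays scalar, roughly as in this illustrative helper (it
assumes x/y/z, dx/dy/dz and px/py/pz are each groups of adjacent double
fields; it is not the vectorizer's actual output):

#include <immintrin.h>

/* px/py: one 128-bit multiply-add; pz: scalar remainder.  */
static void update_p (double *px, const double *x, const double *dx,
                      double lambda)
{
  __m128d l2 = _mm_set1_pd (lambda);
  _mm_storeu_pd (px, _mm_add_pd (_mm_loadu_pd (x),
                                 _mm_mul_pd (l2, _mm_loadu_pd (dx))));
  px[2] = x[2] + lambda * dx[2];
}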