Created attachment 34521 [details]
Preprocessed rectmm.c from SPEC2000 ammp

GCC does not vectorize one of the hottest pieces of code in SPECFP2000 ammp (function mm_fv_update_nonbon in file rectmm.c) on x86-64 when -Ofast -march=core-avx2 -ffast-math -fno-schedule-insns2 is used.  The preprocessed rectmm.i is in the attachment.

The source code under consideration is

    r0 = 1./(*vector)[j+3];
    r = r0*r0;
    r = r*r*r;
    xt = a1->q*a2->q*dielectric*r0;
    yt = a1->a*a2->a*r;
    zt = a1->b*a2->b*r*r;
    k = xt - yt + zt;
    xt = xt*r0; yt = yt*r0; zt = zt*r0;
    k1 = xt - yt*6. + zt*12.;
    xt = xt*r0; yt = yt*r0; zt = zt*r0;
    k2 = xt*3.;
    ka2 = - yt*6.*8.;
    kb2 = zt*12.*14;
    k1 = -k1;
    xt = (*vector)[j]*r0;
    yt = (*vector)[j+1]*r0;
    zt = (*vector)[j+2]*r0;
    a1->VP += k;
    a2->dpx -= k1*xt;
    a1->dpx += k1*xt;
    a2->dpy -= k1*yt;
    a1->dpy += k1*yt;
    a2->dpz -= k1*zt;
    a1->dpz += k1*zt;
    xt2 = xt*xt; yt2 = yt*yt; zt2 = zt*zt;
    a2->qxx -= k2*(xt2 - 1./3) + ka2*(xt2 - 1./8) + kb2*(xt2 - 1./14);
    a1->qxx -= k2*(xt2 - 1./3) + ka2*(xt2 - 1./8) + kb2*(xt2 - 1./14);
    a2->qxy -= (k2+ka2+kb2)*yt*xt;
    a1->qxy -= (k2+ka2+kb2)*yt*xt;
    a2->qxz -= (k2+ka2+kb2)*zt*xt;
    a1->qxz -= (k2+ka2+kb2)*zt*xt;
    a2->qyy -= k2*(yt2 - 1./3) + ka2*(yt2 - 1./8) + kb2*(yt2 - 1./14);
    a1->qyy -= k2*(yt2 - 1./3) + ka2*(yt2 - 1./8) + kb2*(yt2 - 1./14);
    a2->qyz -= (k2+ka2+kb2)*yt*zt;
    a1->qyz -= (k2+ka2+kb2)*yt*zt;
    a2->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8) + kb2*(zt2 - 1./14);
    a1->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8) + kb2*(zt2 - 1./14);

GCC on the trunk generates 118 insns:

.L85:
        .cfi_restore_state
        vmovsd .LC12(%rip), %xmm7
        vdivsd %xmm0, %xmm7, %xmm6
        vmulsd %xmm6, %xmm6, %xmm0
        vmulsd %xmm0, %xmm0, %xmm10
        vmulsd %xmm10, %xmm0, %xmm0
        vmovsd 56(%rbx), %xmm12
        vmulsd 56(%rdi), %xmm12, %xmm12
        vmulsd %xmm4, %xmm12, %xmm12
        vmulsd %xmm12, %xmm6, %xmm12
        vmovsd 64(%rbx), %xmm10
        vmulsd 64(%rdi), %xmm10, %xmm10
        vmulsd %xmm10, %xmm0, %xmm11
        vmovsd 72(%rbx), %xmm10
        vmulsd 72(%rdi), %xmm10, %xmm10
        vmulsd %xmm10, %xmm0, %xmm10
        vmulsd %xmm0, %xmm10, %xmm10
        vmulsd %xmm12, %xmm6, %xmm0
        vmulsd %xmm11, %xmm6, %xmm1
        vmulsd %xmm10, %xmm6, %xmm2
        vmulsd .LC22(%rip), %xmm2, %xmm8
        vfnmadd231sd %xmm9, %xmm1, %xmm8
        vaddsd %xmm8, %xmm0, %xmm8
        vmulsd .LC21(%rip), %xmm6, %xmm5
        vmulsd %xmm0, %xmm5, %xmm5
        vmulsd %xmm1, %xmm6, %xmm0
        vxorpd %xmm15, %xmm0, %xmm0
        vmulsd .LC24(%rip), %xmm0, %xmm3
        vmulsd .LC25(%rip), %xmm6, %xmm7
        vmulsd %xmm2, %xmm7, %xmm7
        vxorpd %xmm15, %xmm8, %xmm8
        movslq %esi, %rax
        vmulsd (%r12,%rax,8), %xmm6, %xmm2
        leal 1(%rsi), %eax
        cltq
        vmulsd (%r12,%rax,8), %xmm6, %xmm1
        leal 2(%rsi), %eax
        cltq
        vmulsd (%r12,%rax,8), %xmm6, %xmm0
        vaddsd 208(%rbx), %xmm12, %xmm12
        vaddsd %xmm12, %xmm10, %xmm10
        vsubsd %xmm11, %xmm10, %xmm10
        vmovsd %xmm10, 208(%rbx)
        vmovapd %xmm8, %xmm6
        vfnmadd213sd 240(%rdi), %xmm2, %xmm6
        vmovsd %xmm6, 240(%rdi)
        vmovapd %xmm8, %xmm6
        vfmadd213sd 240(%rbx), %xmm2, %xmm6
        vmovsd %xmm6, 240(%rbx)
        vmovapd %xmm8, %xmm6
        vfnmadd213sd 248(%rdi), %xmm1, %xmm6
        vmovsd %xmm6, 248(%rdi)
        vmovapd %xmm8, %xmm6
        vfmadd213sd 248(%rbx), %xmm1, %xmm6
        vmovsd %xmm6, 248(%rbx)
        vmovapd %xmm8, %xmm6
        vfnmadd213sd 256(%rdi), %xmm0, %xmm6
        vmovsd %xmm6, 256(%rdi)
        vfmadd213sd 256(%rbx), %xmm0, %xmm8
        vmovsd %xmm8, 256(%rbx)
        vmovsd .LC26(%rip), %xmm8
        vmovapd %xmm2, %xmm11
        vfnmadd132sd %xmm2, %xmm8, %xmm11
        vmulsd %xmm11, %xmm5, %xmm11
        vmovsd .LC27(%rip), %xmm6
        vmovapd %xmm2, %xmm10
        vfnmadd132sd %xmm2, %xmm6, %xmm10
        vmovapd %xmm10, %xmm12
        vfmadd132sd %xmm7, %xmm11, %xmm12
        vmovsd .LC28(%rip), %xmm10
        vmovapd %xmm2, %xmm11
        vfnmadd132sd %xmm2, %xmm10, %xmm11
        vfmadd132sd %xmm3, %xmm12, %xmm11
        vaddsd 264(%rdi), %xmm11, %xmm12
        vmovsd %xmm12, 264(%rdi)
        vaddsd 264(%rbx), %xmm11, %xmm11
        vmovsd %xmm11, 264(%rbx)
        vaddsd %xmm7, %xmm5, %xmm12
        vaddsd %xmm12, %xmm3, %xmm12
        vmulsd %xmm12, %xmm1, %xmm11
        vmovapd %xmm2, %xmm13
        vfnmadd213sd 272(%rdi), %xmm11, %xmm13
        vmovsd %xmm13, 272(%rdi)
        vmovapd %xmm2, %xmm13
        vfnmadd213sd 272(%rbx), %xmm11, %xmm13
        vmovsd %xmm13, 272(%rbx)
        vmulsd %xmm0, %xmm2, %xmm2
        vmovapd %xmm12, %xmm13
        vfnmadd213sd 280(%rdi), %xmm2, %xmm13
        vmovsd %xmm13, 280(%rdi)
        vfnmadd213sd 280(%rbx), %xmm12, %xmm2
        vmovsd %xmm2, 280(%rbx)
        vmovapd %xmm1, %xmm2
        vfnmadd132sd %xmm1, %xmm8, %xmm2
        vmulsd %xmm2, %xmm5, %xmm12
        vmovapd %xmm1, %xmm2
        vfnmadd132sd %xmm1, %xmm6, %xmm2
        vfmadd132sd %xmm7, %xmm12, %xmm2
        vfnmadd132sd %xmm1, %xmm10, %xmm1
        vfmadd132sd %xmm3, %xmm2, %xmm1
        vaddsd 288(%rdi), %xmm1, %xmm2
        vmovsd %xmm2, 288(%rdi)
        vaddsd 288(%rbx), %xmm1, %xmm1
        vmovsd %xmm1, 288(%rbx)
        vmovapd %xmm0, %xmm1
        vfnmadd213sd 296(%rdi), %xmm11, %xmm1
        vmovsd %xmm1, 296(%rdi)
        vfnmadd213sd 296(%rbx), %xmm0, %xmm11
        vmovsd %xmm11, 296(%rbx)
        vfnmadd231sd %xmm0, %xmm0, %xmm8
        vmulsd %xmm8, %xmm5, %xmm5
        vfnmadd231sd %xmm0, %xmm0, %xmm6
        vfmadd132sd %xmm6, %xmm5, %xmm7
        vfnmadd132sd %xmm0, %xmm10, %xmm0
        vfmadd132sd %xmm3, %xmm7, %xmm0
        vaddsd 304(%rdi), %xmm0, %xmm1
        vmovsd %xmm1, 304(%rdi)
        vaddsd 304(%rbx), %xmm0, %xmm0
        vmovsd %xmm0, 304(%rbx)

LLVM-3.5 with -Ofast -ffast-math -march=core-avx2 generates 107 insns (10% fewer than GCC!):

.LBB0_135:                              # %if.then1703
                                        #   in Loop: Header=BB0_132 Depth=3
        leal (,%r15,4), %eax
        vmovsd .LCPI0_4(%rip), %xmm1
        vdivsd %xmm0, %xmm1, %xmm1
        vmulsd %xmm1, %xmm1, %xmm0
        vmulsd %xmm0, %xmm0, %xmm2
        vmulsd %xmm2, %xmm0, %xmm0
        vmovsd 56(%r13), %xmm2
        vmovsd 64(%r13), %xmm3
        vmulsd 56(%rcx), %xmm2, %xmm2
        vmovsd 368(%rsp), %xmm4         # 8-byte Reload
        vmulsd %xmm2, %xmm4, %xmm2
        vmulsd %xmm2, %xmm1, %xmm2
        vmulsd 64(%rcx), %xmm3, %xmm3
        vmulsd %xmm3, %xmm0, %xmm3
        vmovsd 72(%r13), %xmm4
        vmulsd 72(%rcx), %xmm4, %xmm4
        vmulsd %xmm0, %xmm0, %xmm0
        vmulsd %xmm4, %xmm0, %xmm0
        vsubsd %xmm3, %xmm2, %xmm4
        vaddsd %xmm0, %xmm4, %xmm5
        vmulsd %xmm2, %xmm1, %xmm2
        vmulsd %xmm3, %xmm1, %xmm3
        vmulsd %xmm0, %xmm1, %xmm0
        vmovsd .LCPI0_9(%rip), %xmm4
        vfmsub213sd %xmm2, %xmm3, %xmm4
        vmovsd .LCPI0_10(%rip), %xmm6
        vfmadd213sd %xmm4, %xmm0, %xmm6
        vmulsd %xmm2, %xmm1, %xmm2
        vmulsd %xmm3, %xmm1, %xmm4
        vmulsd %xmm0, %xmm1, %xmm0
        vmulsd .LCPI0_11(%rip), %xmm2, %xmm11
        vmulsd .LCPI0_12(%rip), %xmm4, %xmm14
        vmulsd .LCPI0_13(%rip), %xmm0, %xmm10
        cltq
        vpermilpd $0, %xmm1, %xmm0      # xmm0 = xmm1[0,0]
        vmulpd (%r11,%rax,8), %xmm0, %xmm0
        orl $2, %eax
        cltq
        vmulsd (%r11,%rax,8), %xmm1, %xmm9
        vaddsd 208(%r13), %xmm5, %xmm5
        vmovsd %xmm5, 208(%r13)
        vpermilpd $0, %xmm6, %xmm5      # xmm5 = xmm6[0,0]
        vmulpd %xmm0, %xmm5, %xmm5
        vmovupd 240(%rcx), %xmm7
        vsubpd %xmm5, %xmm7, %xmm7
        vmovupd %xmm7, 240(%rcx)
        vaddpd 240(%r13), %xmm5, %xmm5
        vmovupd %xmm5, 240(%r13)
        vmulsd %xmm6, %xmm9, %xmm5
        vmovsd 256(%rcx), %xmm6
        vsubsd %xmm5, %xmm6, %xmm6
        vmovsd %xmm6, 256(%rcx)
        vaddsd 256(%r13), %xmm5, %xmm5
        vmovsd %xmm5, 256(%r13)
        vmulsd %xmm0, %xmm0, %xmm5
        vunpckhpd %xmm0, %xmm0, %xmm8   # xmm8 = xmm0[1,1]
        vmulsd %xmm8, %xmm8, %xmm15
        vmulsd %xmm9, %xmm9, %xmm7
        vmovsd .LCPI0_14(%rip), %xmm3
        vaddsd %xmm3, %xmm5, %xmm1
        vmovsd .LCPI0_15(%rip), %xmm4
        vaddsd %xmm4, %xmm5, %xmm2
        vmulsd %xmm2, %xmm14, %xmm2
        vfmadd213sd %xmm2, %xmm11, %xmm1
        vmovsd .LCPI0_16(%rip), %xmm6
        vaddsd %xmm6, %xmm5, %xmm5
        vfmadd213sd %xmm1, %xmm10, %xmm5
        vaddsd %xmm3, %xmm15, %xmm1
        vaddsd %xmm4, %xmm15, %xmm2
        vmulsd %xmm2, %xmm14, %xmm2
        vfmadd213sd %xmm2, %xmm11, %xmm1
        vaddsd %xmm3, %xmm7, %xmm2
        vaddsd %xmm4, %xmm7, %xmm3
        vmulsd %xmm3, %xmm14, %xmm3
        vfmadd213sd %xmm3, %xmm11, %xmm2
        vaddsd %xmm14, %xmm11, %xmm3
        vaddsd %xmm6, %xmm15, %xmm4
        vfmadd213sd %xmm1, %xmm10, %xmm4
        vaddsd %xmm6, %xmm7, %xmm1
        vfmadd213sd %xmm2, %xmm10, %xmm1
        vaddsd %xmm10, %xmm3, %xmm2
        vmulsd %xmm2, %xmm8, %xmm3
        vmulsd %xmm3, %xmm0, %xmm6
        vunpcklpd %xmm6, %xmm5, %xmm5   # xmm5 = xmm5[0],xmm6[0]
        vmovupd 264(%rcx), %xmm6
        vsubpd %xmm5, %xmm6, %xmm6
        vmovupd %xmm6, 264(%rcx)
        vmovupd 264(%r13), %xmm6
        vsubpd %xmm5, %xmm6, %xmm5
        vmovupd %xmm5, 264(%r13)
        vmulsd %xmm2, %xmm9, %xmm2
        vmulsd %xmm2, %xmm0, %xmm0
        vunpcklpd %xmm4, %xmm0, %xmm0   # xmm0 = xmm0[0],xmm4[0]
        vmovupd 280(%rcx), %xmm2
        vsubpd %xmm0, %xmm2, %xmm2
        vmovupd %xmm2, 280(%rcx)
        vmovupd 280(%r13), %xmm2
        vsubpd %xmm0, %xmm2, %xmm0
        vmovupd %xmm0, 280(%r13)
        vmulsd %xmm3, %xmm9, %xmm0
        vunpcklpd %xmm1, %xmm0, %xmm0   # xmm0 = xmm0[0],xmm1[0]
        vmovupd 296(%rcx), %xmm1
        vsubpd %xmm0, %xmm1, %xmm1
        vmovupd %xmm1, 296(%rcx)
        vmovupd 296(%r13), %xmm1
        vsubpd %xmm0, %xmm1, %xmm0
        vmovupd %xmm0, 296(%r13)

This is achieved by vectorization; see the vsubpd and vmulpd instructions in the LLVM-generated code.
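The key difference is visible in how LLVM touches adjacent double fields: sequences such as vmovupd 240(%rcx) / vsubpd / vmovupd update two neighbouring members (e.g. dpx and dpy) as one unaligned 2-double vector.  Below is only an illustrative sketch of that pattern using SSE2 intrinsics; the struct layout and names are assumptions made for the example, not the real ATOM definition from ammp:

#include <emmintrin.h>

/* Hypothetical layout: dpx and dpy are adjacent doubles, as the
   240/248 byte offsets in the dumps above suggest.  */
struct atom { double dpx, dpy, dpz; };

void
sub_pair (struct atom *a2, double k1, double xt, double yt)
{
  __m128d delta = _mm_set_pd (k1 * yt, k1 * xt);  /* { k1*xt, k1*yt } */
  __m128d v = _mm_loadu_pd (&a2->dpx);            /* vmovupd */
  v = _mm_sub_pd (v, delta);                      /* vsubpd  */
  _mm_storeu_pd (&a2->dpx, v);                    /* vmovupd */
}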
Created attachment 34523 [details] rectmm.c code annotated by gcov to see other hot code parts
This is about SLP vectorization, and for some reason we treat the whole BB as one item to SLP-vectorize, rather than trying to vectorize just individual statements where beneficial.  So we end up with:

pr64716.c:2633:12: note: Build SLP failed: unrolling required in basic block SLP
pr64716.c:2633:12: note: Failed to SLP the basic block.
pr64716.c:2633:12: note: not vectorized: failed to find SLP opportunities in basic block.

where 2633 in my copy is that

    r0 = 1./(*vector)[j+3];

line - group_size is 6 on that stmt and nunits is 4 (for AVX2) or 2 (for 128-bit vectors).
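For reference, a minimal sketch of the same shape of problem (not a reduced testcase from the PR; the struct and names are made up): six adjacent double stores form one store group, which cannot be covered by whole V4DF vectors without the "unrolling" that basic-block SLP refuses to do:

struct acc { double dpx, dpy, dpz, qxx, qyy, qzz; };

void
update (struct acc *p, double k, double x, double y, double z)
{
  /* Six grouped stores -> group_size == 6, while nunits is 4 (V4DF)
     or 2 (V2DF).  */
  p->dpx -= k * x;
  p->dpy -= k * y;
  p->dpz -= k * z;
  p->qxx -= k * x * x;
  p->qyy -= k * y * y;
  p->qzz -= k * z * z;
}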
Say on:

    a2->qyz -= (k2+ka2+kb2)*yt*zt;
    a1->qyz -= (k2+ka2+kb2)*yt*zt;
    a2->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8) + kb2*(zt2 - 1./14);
    a1->qzz -= k2*(zt2 - 1./3) + ka2*(zt2 - 1./8) + kb2*(zt2 - 1./14);

it seems that temp1 = (k2+ka2+kb2)*yt*zt and temp2 = k2*(zt2 - 1./3) + ka2*(zt2 - 1./8) + kb2*(zt2 - 1./14) are computed in scalar code, then combined into a V2DFmode vector, and the

    a1->qyz -= temp1; a1->qzz -= temp2;
    a2->qyz -= temp1; a2->qzz -= temp2;

part is already performed using vectorized code.  We'd need to carefully analyze the costs to decide whether putting the scalars into the vector is beneficial, but supposedly it is if the cost-model score shows that.  Or the

    xt = (*vector)[j] * r0;
    yt = (*vector)[j + 1] * r0;
    zt = (*vector)[j + 2] * r0;
    a2->dpx -= k1 * xt;
    a1->dpx += k1 * xt;
    a2->dpy -= k1 * yt;
    a1->dpy += k1 * yt;
    a2->dpz -= k1 * zt;
    a1->dpz += k1 * zt;

part shows that even though this would ideally be vectorized with V3DFmode vectors, it can be vectorized using V2DFmode plus a scalar for the *z* elements.  Or, say, for a group of 6 we could consider vectorizing with a 4-unit vector and a 2-unit vector for the remainder (perhaps split the SLP instance apart for that and analyze each part individually?).
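As a rough illustration of the V2DFmode-plus-scalar idea for the dpx/dpy/dpz part, here is a sketch using GCC's generic vector extension; the struct layout, names and the memcpy-based loads are assumptions made for the example, not the actual ammp code:

typedef double v2df __attribute__ ((vector_size (16)));

struct atom { double dpx, dpy, dpz; };

void
accumulate (struct atom *a1, struct atom *a2, double k1,
            double xt, double yt, double zt)
{
  v2df kxy = { k1 * xt, k1 * yt };      /* lanes for dpx and dpy */
  v2df a1xy, a2xy;
  __builtin_memcpy (&a1xy, &a1->dpx, sizeof a1xy);
  __builtin_memcpy (&a2xy, &a2->dpx, sizeof a2xy);
  a2xy -= kxy;                          /* one V2DF subtraction */
  a1xy += kxy;                          /* one V2DF addition */
  __builtin_memcpy (&a1->dpx, &a1xy, sizeof a1xy);
  __builtin_memcpy (&a2->dpx, &a2xy, sizeof a2xy);
  a2->dpz -= k1 * zt;                   /* leftover *z* element stays scalar */
  a1->dpz += k1 * zt;
}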
Tried to see if there is an improvement when allowing the group of stores to be split at the VF boundary.  A small improvement is noted with a slightly older trunk gcc, version 7.0.0 20160524 (experimental) (GCC):

rectmm.c:520:2: note: Basic block will be vectorized using SLP

(Snip)
    a1->px = a1->x + lambda*a1->dx;
    a1->py = a1->y + lambda*a1->dy;
    a1->pz = a1->z + lambda*a1->dz;
(Snip)

---SLP dump---
rectmm.c:520:2: note: Detected interleaving load a1_944->xD.4701 and a1_944->yD.4702
rectmm.c:520:2: note: Detected interleaving load a1_944->xD.4701 and a1_944->zD.4703
rectmm.c:520:2: note: Detected interleaving load a1_944->xD.4701 and a1_944->dxD.4721
rectmm.c:520:2: note: Detected interleaving load a1_944->xD.4701 and a1_944->dyD.4722
rectmm.c:520:2: note: Detected interleaving load a1_944->xD.4701 and a1_944->dzD.4723
rectmm.c:520:2: note: Detected interleaving store a1_944->pxD.4728 and a1_944->pyD.4729
rectmm.c:520:2: note: Detected interleaving store a1_944->pxD.4728 and a1_944->pzD.4730
rectmm.c:520:2: note: Split group into 2 and 1
rectmm.c:520:2: note: Basic block will be vectorized using SLP
rectmm.c:520:2: note: SLPing BB part
rectmm.c:520:2: note: ------>vectorizing SLP node starting from: # VUSE <.MEM_1752>
_672 = a1_944->dxD.4721;
---SLP dump---
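For completeness, a self-contained sketch of the statement group that gets split (field names and layout are assumed to mirror the ammp ATOM structure, not copied from it); with the split at the VF boundary, the px/py stores become a 2-element SLP group and pz remains scalar:

struct atom { double x, y, z, dx, dy, dz, px, py, pz; };

void
tpac_like (struct atom *a1, double lambda)
{
  a1->px = a1->x + lambda * a1->dx;   /* px/py: the "2" part of the group */
  a1->py = a1->y + lambda * a1->dy;
  a1->pz = a1->z + lambda * a1->dz;   /* pz: the "1" left after "Split group into 2 and 1" */
}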