[Bug fortran/88713] _gfortran_internal_pack@PLT prevents vectorization
elrodc at gmail dot com
gcc-bugzilla@gcc.gnu.org
Sun Jan 6 18:24:00 GMT 2019
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88713
--- Comment #7 from Chris Elrod <elrodc at gmail dot com> ---
Created attachment 45357
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=45357&action=edit
Assembly generated by Flang compiler on the original version of the code.
This is the main loop body in the Flang compiled version of the original code
(starts line 132):
.LBB1_8: # %vector.body
# =>This Inner Loop Header: Depth=1
leaq (%rsi,%rbx,4), %r12
vmovups (%rcx,%r12), %zmm2
addq %rcx, %r12
leaq (%r12,%rcx), %rbp
vmovups (%r11,%rbp), %zmm3
addq %r11, %rbp
leaq (%rcx,%rbp), %r13
leaq (%rcx,%r13), %r8
leaq (%r8,%rcx), %r10
leaq (%r10,%rcx), %r14
vmovups (%rcx,%r14), %zmm4
vrsqrt14ps %zmm4, %zmm5
vmulps %zmm5, %zmm4, %zmm4
vfmadd213ps %zmm0, %zmm5, %zmm4 # zmm4 = (zmm5 * zmm4) + zmm0
vmulps %zmm1, %zmm5, %zmm5
vmulps %zmm4, %zmm5, %zmm4
.Ltmp1:
.loc 1 31 1 is_stmt 1 # vectorization_test.f90:31:1
vmulps (%rcx,%r8), %zmm4, %zmm5
.loc 1 32 1 # vectorization_test.f90:32:1
vmulps (%rcx,%r10), %zmm4, %zmm6
vmovups (%rcx,%r13), %zmm7
.loc 1 33 1 # vectorization_test.f90:33:1
vfnmadd231ps %zmm6, %zmm6, %zmm7 # zmm7 = -(zmm6 * zmm6) + zmm7
vrsqrt14ps %zmm7, %zmm8
vmulps %zmm8, %zmm7, %zmm7
vfmadd213ps %zmm0, %zmm8, %zmm7 # zmm7 = (zmm8 * zmm7) + zmm0
vmulps %zmm1, %zmm8, %zmm8
vmulps %zmm7, %zmm8, %zmm7
vmovups (%rcx,%rbp), %zmm8
.loc 1 35 1 # vectorization_test.f90:35:1
vfnmadd231ps %zmm5, %zmm6, %zmm8 # zmm8 = -(zmm6 * zmm5) + zmm8
vmulps %zmm8, %zmm7, %zmm8
vmulps %zmm5, %zmm5, %zmm9
vfmadd231ps %zmm8, %zmm8, %zmm9 # zmm9 = (zmm8 * zmm8) + zmm9
vsubps %zmm9, %zmm3, %zmm3
vrsqrt14ps %zmm3, %zmm9
vmulps %zmm9, %zmm3, %zmm3
vfmadd213ps %zmm0, %zmm9, %zmm3 # zmm3 = (zmm9 * zmm3) + zmm0
vmulps %zmm1, %zmm9, %zmm9
vmulps %zmm3, %zmm9, %zmm3
.loc 1 39 1 # vectorization_test.f90:39:1
vmulps %zmm8, %zmm7, %zmm8
.loc 1 40 1 # vectorization_test.f90:40:1
vmulps (%rcx,%r12), %zmm4, %zmm4
.loc 1 39 1 # vectorization_test.f90:39:1
vmulps %zmm3, %zmm8, %zmm8
.loc 1 41 1 # vectorization_test.f90:41:1
vmulps %zmm8, %zmm2, %zmm9
vfmsub231ps (%rsi,%rbx,4), %zmm3, %zmm9 # zmm9 = (zmm3 * mem) - zmm9
vmulps %zmm5, %zmm3, %zmm3
vfmsub231ps %zmm8, %zmm6, %zmm3 # zmm3 = (zmm6 * zmm8) - zmm3
vfmadd213ps %zmm9, %zmm4, %zmm3 # zmm3 = (zmm4 * zmm3) + zmm9
.loc 1 42 1 # vectorization_test.f90:42:1
vmulps %zmm4, %zmm6, %zmm5
vmulps %zmm5, %zmm7, %zmm5
vfmsub231ps %zmm7, %zmm2, %zmm5 # zmm5 = (zmm2 * zmm7) - zmm5
.Ltmp2:
.loc 1 15 1 # vectorization_test.f90:15:1
vmovups %zmm3, (%rdi,%rbx,4)
movq -16(%rsp), %rbp # 8-byte Reload
vmovups %zmm5, (%rbp,%rbx,4)
vmovups %zmm4, (%rax,%rbx,4)
addq $16, %rbx
cmpq %rbx, %rdx
jne .LBB1_8
zmm registers are 64-byte registers. The loop loads (vmovups) from memory into
registers, performs a series of arithmetic operations and inverse square roots
on them, and then stores (vmovups) three of these 64-byte registers back to
memory.
That is the most efficient memory access pattern (as demonstrated empirically
via benchmarks).
More information about the gcc-bugs
mailing list