This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug target/79946] New: Suboptimal code with AVX2 copying all arguments to stack
- From: "tkoenig at gcc dot gnu.org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Tue, 07 Mar 2017 19:05:38 +0000
- Subject: [Bug target/79946] New: Suboptimal code with AVX2 copying all arguments to stack
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79946
Bug ID: 79946
Summary: Suboptimal code with AVX2 copying all arguments to
stack
Product: gcc
Version: 7.0.1
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: tkoenig at gcc dot gnu.org
Target Milestone: ---
Created attachment 40916
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=40916&action=edit
Assembly output for gfortran
I looked at a reduced version of the code in PR 79930:
! Reduced test case: a module holding a 3-component vector type and one
! function that contracts a 4x4 grid of such vectors with two weight vectors.
module foo
use, intrinsic :: iso_fortran_env
implicit none
integer, parameter :: dp = real64 ! KIND for double precision
! Vector with three real(dp) components.
type Vect3D
real(dp) :: x,y,z
end type
contains
! Computes, separately for each component c in {x, y, z}, the bilinear form
!   sum_i sum_j NU(i) * D(i,j)%c * NV(j)
! and returns the three sums as a Vect3D.
!   NU, NV : weight vectors of length 4 (intent(in))
!   D      : 4x4 array of Vect3D (intent(in))
!   result : tensorproduct, whose x/y/z hold the three contracted values
type(Vect3D) recursive pure function TP_LEFT(NU, D, NV) result(tensorproduct)
real(dp), intent(in) :: NU(4), NV(4)
type(Vect3D), intent(in) :: D(4,4)
real(dp) :: Dx(4,4), Dy(4,4), Dz(4,4)
real(dp) :: tmp(4)
! Copy each component of D into its own plain real(dp) 4x4 array.
Dx = D%x
Dy = D%y
Dz = D%z
! matmul of the rank-1 NU with a 4x4 matrix gives tmp(j) = sum_i NU(i)*M(i,j);
! the dot product with NV then completes the contraction for that component.
tmp = matmul(NU,Dx);
tensorproduct%x = dot_product(tmp,NV)
tmp = matmul(NU,Dy);
tensorproduct%y = dot_product(tmp,NV)
tmp = matmul(NU,Dz);
tensorproduct%z = dot_product(tmp,NV)
end function
end module foo
Translating with
$ gfortran -mavx2 -mfma -S -Ofast -o tp_o_gfortran.s tp_o.f90
led to code at the beginning of the function where all of the
arguments appear to be copied to the stack:
_foo_MOD_tp_left:
.LFB0:
.cfi_startproc
leaq 8(%rsp), %r10
.cfi_def_cfa 10, 0
andq $-32, %rsp
movq %rdi, %rax
pushq -8(%r10)
pushq %rbp
.cfi_escape 0x10,0x6,0x2,0x76,0
movq %rsp, %rbp
pushq %r10
.cfi_escape 0xf,0x3,0x76,0x78,0x6
subq $304, %rsp
vmovsd (%rdx), %xmm0
vmovsd %xmm0, -400(%rbp)
vmovsd 24(%rdx), %xmm0
vmovsd %xmm0, -392(%rbp)
vmovsd 48(%rdx), %xmm0
vmovsd %xmm0, -384(%rbp)
vmovsd 72(%rdx), %xmm0
vmovsd %xmm0, -376(%rbp)
vmovsd 96(%rdx), %xmm0
vmovsd %xmm0, -368(%rbp)
vmovsd 120(%rdx), %xmm0
vmovsd %xmm0, -360(%rbp)
vmovsd 144(%rdx), %xmm0
... and so on. The code appears to unload all of the arguments onto
the stack, then operate from there.
This results in 96 vmovsd instructions and a total of 211 instructions
overall.
By comparison, ifort starts out its code with
pushq %rbp #12.40
movq %rsp, %rbp #12.40
andq $-32, %rsp #12.40
movq %rdi, %rax #27.3
vmovups (%rdx), %xmm4 #18.5
vmovups 16(%rdx), %xmm10 #18.5
vmovups 32(%rdx), %xmm11 #18.5
vinsertf128 $1, 48(%rdx), %ymm4, %ymm15 #18.5
vinsertf128 $1, 64(%rdx), %ymm10, %ymm1 #18.5
vblendpd $10, %ymm1, %ymm15, %ymm2 #18.5
resulting in hardly any memory use and only 136 instructions.