This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug target/79946] New: Suboptimal code with AVX2 copying all arguments to stack


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79946

            Bug ID: 79946
           Summary: Suboptimal code with AVX2 copying all arguments to
                    stack
           Product: gcc
           Version: 7.0.1
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tkoenig at gcc dot gnu.org
  Target Milestone: ---

Created attachment 40916
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=40916&action=edit
Assembly output for gfortran

I looked at a reduced version of the code in PR 79930:

! Module defining a three-component vector type and a function that
! contracts a 4x4 array of such vectors with two length-4 weight vectors.
module foo
  use, intrinsic :: iso_fortran_env
  implicit none
  integer, parameter :: dp = real64 ! KIND for double precision
  ! Vector with three real(dp) components x, y and z.
  type Vect3D
    real(dp) :: x,y,z
  end type
contains
  ! Compute, separately for each of the components x, y and z,
  !   sum over i,j of NU(i) * D(i,j)%c * NV(j)
  ! and return the three sums as a Vect3D.
  !
  ! Arguments:
  !   NU  - length-4 weights applied along the first index of D
  !   D   - 4x4 array of Vect3D values
  !   NV  - length-4 weights applied along the second index of D
  ! Result:
  !   tensorproduct - Vect3D holding the x, y and z contractions
  type(Vect3D) recursive pure function TP_LEFT(NU, D, NV) result(tensorproduct)
    real(dp),     intent(in) :: NU(4), NV(4)
    type(Vect3D), intent(in) :: D(4,4)
    real(dp)                 :: Dx(4,4), Dy(4,4), Dz(4,4)
    real(dp)                 :: tmp(4)
    ! Copy each component of D into its own plain 4x4 real array.
    Dx = D%x
    Dy = D%y
    Dz = D%z
    ! For a rank-1 first argument, matmul yields tmp(j) = sum_i NU(i)*Dc(i,j);
    ! dot_product with NV then reduces tmp to the scalar for that component.
    tmp = matmul(NU,Dx);
    tensorproduct%x = dot_product(tmp,NV)
    tmp = matmul(NU,Dy);
    tensorproduct%y = dot_product(tmp,NV)
    tmp = matmul(NU,Dz);
    tensorproduct%z = dot_product(tmp,NV)
  end function
end module foo

Translating with 

$ gfortran -mavx2 -mfma -S -Ofast -o tp_o_gfortran.s tp_o.f90

led to code at the beginning of the function where all of the
arguments appear to be copied to the stack:

_foo_MOD_tp_left:
.LFB0:
        .cfi_startproc
        leaq    8(%rsp), %r10
        .cfi_def_cfa 10, 0
        andq    $-32, %rsp
        movq    %rdi, %rax
        pushq   -8(%r10)
        pushq   %rbp
        .cfi_escape 0x10,0x6,0x2,0x76,0
        movq    %rsp, %rbp
        pushq   %r10
        .cfi_escape 0xf,0x3,0x76,0x78,0x6
        subq    $304, %rsp
        vmovsd  (%rdx), %xmm0
        vmovsd  %xmm0, -400(%rbp)
        vmovsd  24(%rdx), %xmm0
        vmovsd  %xmm0, -392(%rbp)
        vmovsd  48(%rdx), %xmm0
        vmovsd  %xmm0, -384(%rbp)
        vmovsd  72(%rdx), %xmm0
        vmovsd  %xmm0, -376(%rbp)
        vmovsd  96(%rdx), %xmm0
        vmovsd  %xmm0, -368(%rbp)
        vmovsd  120(%rdx), %xmm0
        vmovsd  %xmm0, -360(%rbp)
        vmovsd  144(%rdx), %xmm0

... and so on. The code appears to unload all of the arguments onto
the stack, then operate from there.

This results in 96 vmovsd instructions and a total of 211 instructions
overall.

By comparison, ifort starts out its code with

        pushq     %rbp                                          #12.40
        movq      %rsp, %rbp                                    #12.40
        andq      $-32, %rsp                                    #12.40
        movq      %rdi, %rax                                    #27.3
        vmovups   (%rdx), %xmm4                                 #18.5
        vmovups   16(%rdx), %xmm10                              #18.5
        vmovups   32(%rdx), %xmm11                              #18.5
        vinsertf128 $1, 48(%rdx), %ymm4, %ymm15                 #18.5
        vinsertf128 $1, 64(%rdx), %ymm10, %ymm1                 #18.5
        vblendpd  $10, %ymm1, %ymm15, %ymm2                     #18.5

resulting in hardly any memory use and only 136 instructions.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]