[Bug c++/91897] New: Very poor optimization on large attribute vector_size

warp at iki dot fi gcc-bugzilla@gcc.gnu.org
Wed Sep 25 11:59:00 GMT 2019


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91897

            Bug ID: 91897
           Summary: Very poor optimization on large attribute vector_size
           Product: gcc
           Version: 9.2.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: warp at iki dot fi
  Target Milestone: ---

Consider the following code:

//-----------------------------------------------------------
typedef double Double16 __attribute__((vector_size(8*16)));

Double16 mult(const Double16& v1, const Double16& v2)
{
    return v1 * v2;
}
//-----------------------------------------------------------

With the compiler options "-Ofast -march=skylake", clang 9.0.0 produces the
following output for it:

//-----------------------------------------------------------
  vmovapd ymm0, ymmword ptr [rsi]
  vmovapd ymm1, ymmword ptr [rsi + 32]
  vmovapd ymm2, ymmword ptr [rsi + 64]
  vmovapd ymm3, ymmword ptr [rsi + 96]
  vmulpd ymm0, ymm0, ymmword ptr [rdi]
  vmulpd ymm1, ymm1, ymmword ptr [rdi + 32]
  vmulpd ymm2, ymm2, ymmword ptr [rdi + 64]
  vmulpd ymm3, ymm3, ymmword ptr [rdi + 96]
  ret
//-----------------------------------------------------------

However, gcc 9.2 produces the following output:

//-----------------------------------------------------------
  push rbp
  mov rax, rdi
  mov rbp, rsp
  and rsp, -128
  sub rsp, 392
  vmovdqa xmm5, XMMWORD PTR [rsi]
  vmovdqa xmm6, XMMWORD PTR [rsi+16]
  vmovdqa xmm7, XMMWORD PTR [rsi+32]
  vmovdqa xmm1, XMMWORD PTR [rsi+48]
  vmovdqa xmm2, XMMWORD PTR [rsi+64]
  vmovdqa xmm3, XMMWORD PTR [rsi+80]
  vmovdqa xmm4, XMMWORD PTR [rsi+96]
  vmovaps XMMWORD PTR [rsp+8], xmm5
  vmovaps XMMWORD PTR [rsp+24], xmm6
  vmovdqa xmm5, XMMWORD PTR [rsi+112]
  vmovdqa xmm6, XMMWORD PTR [rdx]
  vmovaps XMMWORD PTR [rsp+40], xmm7
  vmovaps XMMWORD PTR [rsp+56], xmm1
  vmovdqa xmm7, XMMWORD PTR [rdx+16]
  vmovdqa xmm1, XMMWORD PTR [rdx+32]
  vmovaps XMMWORD PTR [rsp+72], xmm2
  vmovaps XMMWORD PTR [rsp+88], xmm3
  vmovdqa xmm2, XMMWORD PTR [rdx+48]
  vmovdqa xmm3, XMMWORD PTR [rdx+64]
  vmovaps XMMWORD PTR [rsp+104], xmm4
  vmovdqa xmm4, XMMWORD PTR [rdx+80]
  vmovaps XMMWORD PTR [rsp+136], xmm6
  vmovaps XMMWORD PTR [rsp+152], xmm7
  vmovaps XMMWORD PTR [rsp+168], xmm1
  vmovaps XMMWORD PTR [rsp+184], xmm2
  vmovaps XMMWORD PTR [rsp+200], xmm3
  vmovaps XMMWORD PTR [rsp+216], xmm4
  vmovaps XMMWORD PTR [rsp+120], xmm5
  vmovdqa xmm5, XMMWORD PTR [rdx+96]
  vmovapd ymm7, YMMWORD PTR [rsp+8]
  vmovapd ymm1, YMMWORD PTR [rsp+40]
  vmulpd ymm0, ymm7, YMMWORD PTR [rsp+136]
  vmovapd ymm2, YMMWORD PTR [rsp+72]
  vmovdqa xmm6, XMMWORD PTR [rdx+112]
  vmovaps XMMWORD PTR [rsp+232], xmm5
  vmovapd ymm5, YMMWORD PTR [rsp+104]
  vmovdqa xmm4, xmm0
  vmovapd YMMWORD PTR [rsp-120], ymm0
  vmulpd ymm0, ymm1, YMMWORD PTR [rsp+168]
  vmovaps XMMWORD PTR [rsp+248], xmm6
  vmovaps XMMWORD PTR [rdi], xmm4
  vmovdqa xmm4, XMMWORD PTR [rsp-104]
  vmovdqa xmm3, xmm0
  vmovapd YMMWORD PTR [rsp-88], ymm0
  vmulpd ymm0, ymm2, YMMWORD PTR [rsp+200]
  vmovaps XMMWORD PTR [rdi+32], xmm3
  vmovdqa xmm3, XMMWORD PTR [rsp-72]
  vmovaps XMMWORD PTR [rdi+16], xmm4
  vmovaps XMMWORD PTR [rdi+48], xmm3
  vmovdqa xmm2, xmm0
  vmovapd YMMWORD PTR [rsp-56], ymm0
  vmulpd ymm0, ymm5, YMMWORD PTR [rsp+232]
  vmovdqa xmm6, XMMWORD PTR [rsp-40]
  vmovaps XMMWORD PTR [rdi+64], xmm2
  vmovaps XMMWORD PTR [rdi+80], xmm6
  vmovapd YMMWORD PTR [rsp-24], ymm0
  vmovdqa xmm7, XMMWORD PTR [rsp-8]
  vmovaps XMMWORD PTR [rdi+96], xmm0
  vmovaps XMMWORD PTR [rdi+112], xmm7
  vzeroupper
  leave
  ret
//-----------------------------------------------------------

Curiously, the current trunk version of gcc available on godbolt at the time
of writing produces the following instead:

//-----------------------------------------------------------
  push rbp
  mov rax, rdi
  mov rbp, rsp
  and rsp, -32
  sub rsp, 8
  vmovapd ymm0, YMMWORD PTR [rsi]
  vmovapd ymm2, YMMWORD PTR [rsi+32]
  vmulpd ymm7, ymm0, YMMWORD PTR [rdx]
  vmulpd ymm1, ymm2, YMMWORD PTR [rdx+32]
  vmovapd ymm4, YMMWORD PTR [rsi+64]
  vmovapd ymm6, YMMWORD PTR [rsi+96]
  vmulpd ymm3, ymm4, YMMWORD PTR [rdx+64]
  vmovapd YMMWORD PTR [rsp-120], ymm7
  mov rcx, QWORD PTR [rsp-112]
  vmovdqa xmm0, XMMWORD PTR [rsp-120]
  mov QWORD PTR [rdi+8], rcx
  mov rcx, QWORD PTR [rsp-104]
  vmulpd ymm5, ymm6, YMMWORD PTR [rdx+96]
  vmovapd YMMWORD PTR [rsp-24], ymm1
  mov QWORD PTR [rdi+16], rcx
  vmovq QWORD PTR [rdi], xmm0
  mov rcx, QWORD PTR [rsp-16]
  mov rdi, QWORD PTR [rsp-96]
  mov QWORD PTR [rax+40], rcx
  mov QWORD PTR [rax+24], rdi
  mov rcx, QWORD PTR [rsp]
  mov rdi, QWORD PTR [rsp-8]
  vmovdqa xmm0, XMMWORD PTR [rsp-24]
  vmovapd YMMWORD PTR [rsp-56], ymm3
  mov QWORD PTR [rax+48], rdi
  mov QWORD PTR [rax+56], rcx
  vmovapd YMMWORD PTR [rsp-88], ymm5
  vmovq QWORD PTR [rax+32], xmm0
  vmovdqa xmm0, XMMWORD PTR [rsp-56]
  mov rdi, QWORD PTR [rsp-48]
  mov rdx, QWORD PTR [rsp-40]
  mov QWORD PTR [rax+72], rdi
  mov QWORD PTR [rax+80], rdx
  mov rcx, QWORD PTR [rsp-32]
  mov rsi, QWORD PTR [rsp-80]
  mov rdi, QWORD PTR [rsp-72]
  mov rdx, QWORD PTR [rsp-64]
  vmovq QWORD PTR [rax+64], xmm0
  vmovdqa xmm0, XMMWORD PTR [rsp-88]
  mov QWORD PTR [rax+88], rcx
  mov QWORD PTR [rax+104], rsi
  mov QWORD PTR [rax+112], rdi
  mov QWORD PTR [rax+120], rdx
  vmovq QWORD PTR [rax+96], xmm0
  vzeroupper
  leave
  ret
//-----------------------------------------------------------


More information about the Gcc-bugs mailing list