[Bug c++/91897] New: Very poor optimization on large attribute vector_size
warp at iki dot fi
gcc-bugzilla@gcc.gnu.org
Wed Sep 25 11:59:00 GMT 2019
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91897
Bug ID: 91897
Summary: Very poor optimization on large attribute vector_size
Product: gcc
Version: 9.2.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: warp at iki dot fi
Target Milestone: ---
Consider the following code:
//-----------------------------------------------------------
typedef double Double16 __attribute__((vector_size(8*16)));
Double16 mult(const Double16& v1, const Double16& v2)
{
return v1 * v2;
}
//-----------------------------------------------------------
With the compiler options "-Ofast -march=skylake", clang 9.0.0 produces the
following output for it:
//-----------------------------------------------------------
vmovapd ymm0, ymmword ptr [rsi]
vmovapd ymm1, ymmword ptr [rsi + 32]
vmovapd ymm2, ymmword ptr [rsi + 64]
vmovapd ymm3, ymmword ptr [rsi + 96]
vmulpd ymm0, ymm0, ymmword ptr [rdi]
vmulpd ymm1, ymm1, ymmword ptr [rdi + 32]
vmulpd ymm2, ymm2, ymmword ptr [rdi + 64]
vmulpd ymm3, ymm3, ymmword ptr [rdi + 96]
ret
//-----------------------------------------------------------
However, gcc 9.2 produces the following output:
//-----------------------------------------------------------
push rbp
mov rax, rdi
mov rbp, rsp
and rsp, -128
sub rsp, 392
vmovdqa xmm5, XMMWORD PTR [rsi]
vmovdqa xmm6, XMMWORD PTR [rsi+16]
vmovdqa xmm7, XMMWORD PTR [rsi+32]
vmovdqa xmm1, XMMWORD PTR [rsi+48]
vmovdqa xmm2, XMMWORD PTR [rsi+64]
vmovdqa xmm3, XMMWORD PTR [rsi+80]
vmovdqa xmm4, XMMWORD PTR [rsi+96]
vmovaps XMMWORD PTR [rsp+8], xmm5
vmovaps XMMWORD PTR [rsp+24], xmm6
vmovdqa xmm5, XMMWORD PTR [rsi+112]
vmovdqa xmm6, XMMWORD PTR [rdx]
vmovaps XMMWORD PTR [rsp+40], xmm7
vmovaps XMMWORD PTR [rsp+56], xmm1
vmovdqa xmm7, XMMWORD PTR [rdx+16]
vmovdqa xmm1, XMMWORD PTR [rdx+32]
vmovaps XMMWORD PTR [rsp+72], xmm2
vmovaps XMMWORD PTR [rsp+88], xmm3
vmovdqa xmm2, XMMWORD PTR [rdx+48]
vmovdqa xmm3, XMMWORD PTR [rdx+64]
vmovaps XMMWORD PTR [rsp+104], xmm4
vmovdqa xmm4, XMMWORD PTR [rdx+80]
vmovaps XMMWORD PTR [rsp+136], xmm6
vmovaps XMMWORD PTR [rsp+152], xmm7
vmovaps XMMWORD PTR [rsp+168], xmm1
vmovaps XMMWORD PTR [rsp+184], xmm2
vmovaps XMMWORD PTR [rsp+200], xmm3
vmovaps XMMWORD PTR [rsp+216], xmm4
vmovaps XMMWORD PTR [rsp+120], xmm5
vmovdqa xmm5, XMMWORD PTR [rdx+96]
vmovapd ymm7, YMMWORD PTR [rsp+8]
vmovapd ymm1, YMMWORD PTR [rsp+40]
vmulpd ymm0, ymm7, YMMWORD PTR [rsp+136]
vmovapd ymm2, YMMWORD PTR [rsp+72]
vmovdqa xmm6, XMMWORD PTR [rdx+112]
vmovaps XMMWORD PTR [rsp+232], xmm5
vmovapd ymm5, YMMWORD PTR [rsp+104]
vmovdqa xmm4, xmm0
vmovapd YMMWORD PTR [rsp-120], ymm0
vmulpd ymm0, ymm1, YMMWORD PTR [rsp+168]
vmovaps XMMWORD PTR [rsp+248], xmm6
vmovaps XMMWORD PTR [rdi], xmm4
vmovdqa xmm4, XMMWORD PTR [rsp-104]
vmovdqa xmm3, xmm0
vmovapd YMMWORD PTR [rsp-88], ymm0
vmulpd ymm0, ymm2, YMMWORD PTR [rsp+200]
vmovaps XMMWORD PTR [rdi+32], xmm3
vmovdqa xmm3, XMMWORD PTR [rsp-72]
vmovaps XMMWORD PTR [rdi+16], xmm4
vmovaps XMMWORD PTR [rdi+48], xmm3
vmovdqa xmm2, xmm0
vmovapd YMMWORD PTR [rsp-56], ymm0
vmulpd ymm0, ymm5, YMMWORD PTR [rsp+232]
vmovdqa xmm6, XMMWORD PTR [rsp-40]
vmovaps XMMWORD PTR [rdi+64], xmm2
vmovaps XMMWORD PTR [rdi+80], xmm6
vmovapd YMMWORD PTR [rsp-24], ymm0
vmovdqa xmm7, XMMWORD PTR [rsp-8]
vmovaps XMMWORD PTR [rdi+96], xmm0
vmovaps XMMWORD PTR [rdi+112], xmm7
vzeroupper
leave
ret
//-----------------------------------------------------------
Curiously, the current trunk version of gcc available on godbolt (as of this
writing) produces the following instead:
//-----------------------------------------------------------
push rbp
mov rax, rdi
mov rbp, rsp
and rsp, -32
sub rsp, 8
vmovapd ymm0, YMMWORD PTR [rsi]
vmovapd ymm2, YMMWORD PTR [rsi+32]
vmulpd ymm7, ymm0, YMMWORD PTR [rdx]
vmulpd ymm1, ymm2, YMMWORD PTR [rdx+32]
vmovapd ymm4, YMMWORD PTR [rsi+64]
vmovapd ymm6, YMMWORD PTR [rsi+96]
vmulpd ymm3, ymm4, YMMWORD PTR [rdx+64]
vmovapd YMMWORD PTR [rsp-120], ymm7
mov rcx, QWORD PTR [rsp-112]
vmovdqa xmm0, XMMWORD PTR [rsp-120]
mov QWORD PTR [rdi+8], rcx
mov rcx, QWORD PTR [rsp-104]
vmulpd ymm5, ymm6, YMMWORD PTR [rdx+96]
vmovapd YMMWORD PTR [rsp-24], ymm1
mov QWORD PTR [rdi+16], rcx
vmovq QWORD PTR [rdi], xmm0
mov rcx, QWORD PTR [rsp-16]
mov rdi, QWORD PTR [rsp-96]
mov QWORD PTR [rax+40], rcx
mov QWORD PTR [rax+24], rdi
mov rcx, QWORD PTR [rsp]
mov rdi, QWORD PTR [rsp-8]
vmovdqa xmm0, XMMWORD PTR [rsp-24]
vmovapd YMMWORD PTR [rsp-56], ymm3
mov QWORD PTR [rax+48], rdi
mov QWORD PTR [rax+56], rcx
vmovapd YMMWORD PTR [rsp-88], ymm5
vmovq QWORD PTR [rax+32], xmm0
vmovdqa xmm0, XMMWORD PTR [rsp-56]
mov rdi, QWORD PTR [rsp-48]
mov rdx, QWORD PTR [rsp-40]
mov QWORD PTR [rax+72], rdi
mov QWORD PTR [rax+80], rdx
mov rcx, QWORD PTR [rsp-32]
mov rsi, QWORD PTR [rsp-80]
mov rdi, QWORD PTR [rsp-72]
mov rdx, QWORD PTR [rsp-64]
vmovq QWORD PTR [rax+64], xmm0
vmovdqa xmm0, XMMWORD PTR [rsp-88]
mov QWORD PTR [rax+88], rcx
mov QWORD PTR [rax+104], rsi
mov QWORD PTR [rax+112], rdi
mov QWORD PTR [rax+120], rdx
vmovq QWORD PTR [rax+96], xmm0
vzeroupper
leave
ret
//-----------------------------------------------------------
More information about the Gcc-bugs mailing list