[Bug middle-end/100104] std::transform is 1.5 times faster than std::copy with -O3
hewillk at gmail dot com
gcc-bugzilla@gcc.gnu.org
Fri Apr 16 01:43:26 GMT 2021
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100104
--- Comment #3 from 康桓瑋 <hewillk at gmail dot com> ---
Build "copy" with -O2 on x86-64 (https://godbolt.org/z/Gja6xrq9G):
# Compiler listing of copy(const std::vector<double>&) (Intel syntax, demangled
# symbols). Each 8-byte double of the input range is converted to a 32-bit
# integer with truncation and appended to the result object; the buffer-growth
# path is expanded inline in this function.
#
# Register roles, as used by the code below:
#   r12 = result object pointer (incoming rdi, handed back in rax)
#   rbx = read cursor into the input, r13 = end of the input range
#   rcx = write cursor into the output buffer, r8 = end of allocated output storage
#   ebp = the converted element, r15 = old buffer base, r14 = bytes in use in it
# The qwords at [r12], [r12+8], [r12+16] are used as buffer base, write cursor
# and end of storage -- presumably the vector's begin/end/end-of-storage
# pointers (TODO confirm against the libstdc++ layout).
# Argument/return registers look like the System V AMD64 convention (assumed;
# the listing itself does not state the ABI).

# Message passed to std::__throw_length_error at .L32.
.LC0:
.string "vector::_M_realloc_insert"
copy(std::vector<double, std::allocator<double> > const&):
push r15
# xmm0 = 0, used below to clear the first 16 bytes of the result object.
pxor xmm0, xmm0
push r14
push r13
push r12
mov r12, rdi
push rbp
push rbx
# Six pushes + 40 bytes + return address = 96 bytes, so rsp is 16-byte aligned
# from here on if it was aligned at the call site; the movaps/movdqa accesses
# at [rsp+16] depend on that.
sub rsp, 40
# r13 = input end, rbx = input begin (second and first qword of the argument).
mov r13, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
# Zero all 24 bytes of the result object.
movups XMMWORD PTR [rdi], xmm0
mov QWORD PTR [rdi+16], 0
# Empty input: return the zeroed result.
cmp rbx, r13
je .L1
# Start with write cursor == end of storage == 0, so the first element takes
# the growth path.
xor r8d, r8d
xor ecx, ecx
jmp .L13
# Fast path: there is room, store the element and advance both cursors.
.L31:
mov DWORD PTR [rcx], ebp
add rbx, 8
add rcx, 4
# The write cursor is stored back to the result object on every element.
mov QWORD PTR [r12+8], rcx
cmp r13, rbx
je .L1
# Loop head: convert the next double (truncating) and check for free space.
.L13:
cvttsd2si ebp, QWORD PTR [rbx]
cmp rcx, r8
jne .L31
# Growth path. rax = 2^61 - 1, the element count at which .L32 throws.
movabs rax, 2305843009213693951
mov r15, QWORD PTR [r12]
# rcx = r14 = bytes currently in use; rdx = element count (bytes / 4).
sub rcx, r15
mov rdx, rcx
mov r14, rcx
sar rdx, 2
cmp rdx, rax
je .L32
# New count = count + (count != 0 ? count : 1), i.e. double, or 1 when empty.
test rdx, rdx
mov eax, 1
cmovne rax, rdx
add rax, rdx
# Carry out of the addition: request the maximum size instead (.L7).
jc .L7
test rax, rax
jne .L33
# New count of zero: no allocation, null buffer and null end of storage.
xor r8d, r8d
xor edi, edi
# Here rdi = new buffer base and r8 = new end of storage.
.L9:
# rcx = new write cursor, one element past the old contents.
lea rcx, [rdi+4+r14]
movq xmm0, rdi
# Place the new element directly after where the old contents will be copied.
mov DWORD PTR [rdi+r14], ebp
movq xmm1, rcx
# xmm0 = {new base, new write cursor}, written with one 16-byte store at .L12.
punpcklqdq xmm0, xmm1
# Old contents present: copy them (.L34). Otherwise free the old buffer only
# if it is non-null (.L35).
test r14, r14
jg .L34
test r15, r15
jne .L35
# Publish the new buffer in the result object and continue the loop.
.L12:
add rbx, 8
mov QWORD PTR [r12+16], r8
movups XMMWORD PTR [r12], xmm0
cmp r13, rbx
jne .L13
# Epilogue: return the result object pointer in rax.
.L1:
add rsp, 40
mov rax, r12
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
# memmove(new base in rdi, old base, bytes in use). r8, rcx and xmm0 are
# spilled around the call and reloaded afterwards.
.L34:
mov rsi, r15
mov rdx, r14
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call memmove
# [r12+16] still holds the old end of storage; rsi = old capacity in bytes.
mov rsi, QWORD PTR [r12+16]
mov rcx, QWORD PTR [rsp]
mov r8, QWORD PTR [rsp+8]
movdqa xmm0, XMMWORD PTR [rsp+16]
sub rsi, r15
# operator delete(old base, old capacity in bytes), again spilling r8/rcx/xmm0.
.L11:
mov rdi, r15
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call operator delete(void*, unsigned long)
movdqa xmm0, XMMWORD PTR [rsp+16]
mov r8, QWORD PTR [rsp+8]
mov rcx, QWORD PTR [rsp]
jmp .L12
# Nothing to copy but the old buffer is non-null: compute its size and free it.
.L35:
mov rsi, QWORD PTR [r12+16]
sub rsi, r15
jmp .L11
# Clamp the new count to 2^61 - 1 (unsigned), then convert it to bytes (* 4).
.L33:
movabs rdx, 2305843009213693951
cmp rax, rdx
cmova rax, rdx
sal rax, 2
# The byte count is kept at [rsp] across the call to operator new.
mov QWORD PTR [rsp], rax
mov rdi, rax
.L8:
call operator new(unsigned long)
# rdi = new base, r8 = new base + byte count = new end of storage.
mov r8, QWORD PTR [rsp]
mov rdi, rax
add r8, rax
jmp .L9
# Overflow case: request (2^61 - 1) * 4 bytes.
.L7:
movabs rax, 9223372036854775804
mov QWORD PTR [rsp], rax
mov rdi, rax
jmp .L8
# Element count already at the maximum: throw length_error with .LC0.
.L32:
mov edi, OFFSET FLAT:.LC0
call std::__throw_length_error(char const*)
# rax is saved in rbp and control goes to .L15, which is not part of this
# listing (presumably exception cleanup in the .cold clone -- TODO confirm).
mov rbp, rax
jmp .L15
# Label of the cold-section clone; its body is not included in the listing.
copy(std::vector<double, std::allocator<double> > const&) [clone .cold]:
===========================================================================
The same "copy" function built with -O3:
# Compiler listing of copy(const std::vector<double>&) (Intel syntax, demangled
# symbols). Each 8-byte double of the input range is converted to a 32-bit
# integer with truncation and appended to the result object; the buffer-growth
# path is expanded inline in this function.
#
# Register roles, as used by the code below:
#   r12 = result object pointer (incoming rdi, handed back in rax)
#   rbx = read cursor into the input, r13 = end of the input range
#   rcx = write cursor into the output buffer, r8 = end of allocated output storage
#   ebp = the converted element, r15 = old buffer base, r14 = bytes in use in it
# The qwords at [r12], [r12+8], [r12+16] are used as buffer base, write cursor
# and end of storage -- presumably the vector's begin/end/end-of-storage
# pointers (TODO confirm against the libstdc++ layout).
# Argument/return registers look like the System V AMD64 convention (assumed;
# the listing itself does not state the ABI).

# Message passed to std::__throw_length_error at .L32.
.LC0:
.string "vector::_M_realloc_insert"
copy(std::vector<double, std::allocator<double> > const&):
push r15
# xmm0 = 0, used below to clear the first 16 bytes of the result object.
pxor xmm0, xmm0
push r14
push r13
push r12
mov r12, rdi
push rbp
push rbx
# Six pushes + 40 bytes + return address = 96 bytes, so rsp is 16-byte aligned
# from here on if it was aligned at the call site; the movaps/movdqa accesses
# at [rsp+16] depend on that.
sub rsp, 40
# r13 = input end, rbx = input begin (second and first qword of the argument).
mov r13, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
# Zero all 24 bytes of the result object.
movups XMMWORD PTR [rdi], xmm0
mov QWORD PTR [rdi+16], 0
# Empty input: return the zeroed result.
cmp rbx, r13
je .L1
# Start with write cursor == end of storage == 0, so the first element takes
# the growth path.
xor r8d, r8d
xor ecx, ecx
jmp .L13
# Fast path: there is room, store the element and advance both cursors.
.L31:
mov DWORD PTR [rcx], ebp
add rbx, 8
add rcx, 4
# The write cursor is stored back to the result object on every element.
mov QWORD PTR [r12+8], rcx
cmp r13, rbx
je .L1
# Loop head: convert the next double (truncating) and check for free space.
.L13:
cvttsd2si ebp, QWORD PTR [rbx]
cmp rcx, r8
jne .L31
# Growth path. rax = 2^61 - 1, the element count at which .L32 throws.
movabs rax, 2305843009213693951
mov r15, QWORD PTR [r12]
# rcx = r14 = bytes currently in use; rdx = element count (bytes / 4).
sub rcx, r15
mov rdx, rcx
mov r14, rcx
sar rdx, 2
cmp rdx, rax
je .L32
# New count = count + (count != 0 ? count : 1), i.e. double, or 1 when empty.
test rdx, rdx
mov eax, 1
cmovne rax, rdx
add rax, rdx
# Carry out of the addition: request the maximum size instead (.L7).
jc .L7
test rax, rax
jne .L33
# New count of zero: no allocation, null buffer and null end of storage.
xor r8d, r8d
xor edi, edi
# Here rdi = new buffer base and r8 = new end of storage.
.L9:
# rcx = new write cursor, one element past the old contents.
lea rcx, [rdi+4+r14]
movq xmm0, rdi
# Place the new element directly after where the old contents will be copied.
mov DWORD PTR [rdi+r14], ebp
movq xmm1, rcx
# xmm0 = {new base, new write cursor}, written with one 16-byte store at .L12.
punpcklqdq xmm0, xmm1
# Old contents present: copy them (.L34). Otherwise free the old buffer only
# if it is non-null (.L35).
test r14, r14
jg .L34
test r15, r15
jne .L35
# Publish the new buffer in the result object and continue the loop.
.L12:
add rbx, 8
mov QWORD PTR [r12+16], r8
movups XMMWORD PTR [r12], xmm0
cmp r13, rbx
jne .L13
# Epilogue: return the result object pointer in rax.
.L1:
add rsp, 40
mov rax, r12
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
# memmove(new base in rdi, old base, bytes in use). r8, rcx and xmm0 are
# spilled around the call and reloaded afterwards.
.L34:
mov rsi, r15
mov rdx, r14
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call memmove
# [r12+16] still holds the old end of storage; rsi = old capacity in bytes.
mov rsi, QWORD PTR [r12+16]
mov rcx, QWORD PTR [rsp]
mov r8, QWORD PTR [rsp+8]
movdqa xmm0, XMMWORD PTR [rsp+16]
sub rsi, r15
# operator delete(old base, old capacity in bytes), again spilling r8/rcx/xmm0.
.L11:
mov rdi, r15
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call operator delete(void*, unsigned long)
movdqa xmm0, XMMWORD PTR [rsp+16]
mov r8, QWORD PTR [rsp+8]
mov rcx, QWORD PTR [rsp]
jmp .L12
# Nothing to copy but the old buffer is non-null: compute its size and free it.
.L35:
mov rsi, QWORD PTR [r12+16]
sub rsi, r15
jmp .L11
# Clamp the new count to 2^61 - 1 (unsigned), then convert it to bytes (* 4).
.L33:
movabs rdx, 2305843009213693951
cmp rax, rdx
cmova rax, rdx
sal rax, 2
# The byte count is kept at [rsp] across the call to operator new.
mov QWORD PTR [rsp], rax
mov rdi, rax
.L8:
call operator new(unsigned long)
# rdi = new base, r8 = new base + byte count = new end of storage.
mov r8, QWORD PTR [rsp]
mov rdi, rax
add r8, rax
jmp .L9
# Overflow case: request (2^61 - 1) * 4 bytes.
.L7:
movabs rax, 9223372036854775804
mov QWORD PTR [rsp], rax
mov rdi, rax
jmp .L8
# Element count already at the maximum: throw length_error with .LC0.
.L32:
mov edi, OFFSET FLAT:.LC0
call std::__throw_length_error(char const*)
# rax is saved in rbp and control goes to .L15, which is not part of this
# listing (presumably exception cleanup in the .cold clone -- TODO confirm).
mov rbp, rax
jmp .L15
# Label of the cold-section clone; its body is not included in the listing.
copy(std::vector<double, std::allocator<double> > const&) [clone .cold]:
===========================================================================
Build "transform" with -O2 on x86-64 (https://godbolt.org/z/YTEfWEbcq):
# Compiler listing of transform(const std::vector<double>&) (Intel syntax,
# demangled symbols). Each 8-byte double of the input range is converted to a
# 32-bit integer with truncation and appended to the result object. Unlike an
# inlined growth path, a full buffer is handled by an out-of-line call to
# std::vector<int>::_M_realloc_insert<int>.
#
# Register roles, as used by the code below:
#   r12 = result object pointer (incoming rdi, handed back in rax)
#   rbx = read cursor into the input, rbp = end of the input range
#   rsi = write cursor into the output buffer, rdx = end of allocated output storage
#   eax = the converted element (also spilled to [rsp+12])
# The qwords at [r12+8] and [r12+16] are used as write cursor and end of
# storage -- presumably the vector's end/end-of-storage pointers (TODO confirm
# against the libstdc++ layout).
# Argument/return registers look like the System V AMD64 convention (assumed;
# the listing itself does not state the ABI).

# String constant; nothing in the visible part of this listing refers to it.
.LC0:
.string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
push r12
mov r12, rdi
push rbp
push rbx
# Three pushes + 16 bytes + return address = 48 bytes, so rsp is 16-byte
# aligned at the call below if it was aligned at the call site.
sub rsp, 16
# rbp = input end, rbx = input begin (second and first qword of the argument).
mov rbp, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
# Zero all 24 bytes of the result object.
mov QWORD PTR [rdi], 0
mov QWORD PTR [rdi+8], 0
mov QWORD PTR [rdi+16], 0
# Empty input: return the zeroed result.
cmp rbx, rbp
je .L19
# Start with write cursor == end of storage == 0, so the first element takes
# the reallocation path.
xor edx, edx
xor esi, esi
jmp .L23
# Fast path: there is room, store the element and advance both cursors.
.L31:
mov DWORD PTR [rsi], eax
add rbx, 8
add rsi, 4
mov QWORD PTR [r12+8], rsi
cmp rbp, rbx
je .L19
# Both paths reload the write cursor and end of storage from the result object
# before the next element.
.L32:
mov rsi, QWORD PTR [r12+8]
mov rdx, QWORD PTR [r12+16]
# Loop head: convert the next double (truncating); the value is also stored to
# the stack so its address can be passed to _M_realloc_insert.
.L23:
cvttsd2si eax, QWORD PTR [rbx]
mov DWORD PTR [rsp+12], eax
cmp rsi, rdx
jne .L31
# Buffer full: _M_realloc_insert(result in rdi, position in rsi, &value in rdx).
lea rdx, [rsp+12]
mov rdi, r12
# The mailer had wrapped this single call instruction over three lines; it is
# rejoined here so the operand reads as one symbol.
call void std::vector<int, std::allocator<int> >::_M_realloc_insert<int>(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > >, int&&)
add rbx, 8
cmp rbp, rbx
jne .L32
# Epilogue: return the result object pointer in rax.
.L19:
add rsp, 16
mov rax, r12
pop rbx
pop rbp
pop r12
ret
# Unlabelled code after ret: rax is saved in rbp and control goes to .L24,
# which is not part of this listing (presumably exception cleanup in the .cold
# clone -- TODO confirm).
mov rbp, rax
jmp .L24
# Label of the cold-section clone; its body is not included in the listing.
transform(std::vector<double, std::allocator<double> > const&) [clone .cold]:
===========================================================================
The same "transform" function built with -O3:
# Compiler listing of transform(const std::vector<double>&) (Intel syntax,
# demangled symbols). Each 8-byte double of the input range is converted to a
# 32-bit integer with truncation and appended to the result object; the
# buffer-growth path is expanded inline in this function.
#
# Register roles, as used by the code below:
#   r12 = result object pointer (incoming rdi, handed back in rax)
#   rbx = read cursor into the input, r13 = end of the input range
#   rcx = write cursor into the output buffer, r8 = end of allocated output storage
#   ebp = the converted element, r15 = old buffer base, r14 = bytes in use in it
# The qwords at [r12], [r12+8], [r12+16] are used as buffer base, write cursor
# and end of storage -- presumably the vector's begin/end/end-of-storage
# pointers (TODO confirm against the libstdc++ layout).
# Argument/return registers look like the System V AMD64 convention (assumed;
# the listing itself does not state the ABI).

# Message passed to std::__throw_length_error at .L32.
.LC0:
.string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
push r15
# xmm0 = 0, used below to clear the first 16 bytes of the result object.
pxor xmm0, xmm0
push r14
push r13
push r12
mov r12, rdi
push rbp
push rbx
# Six pushes + 40 bytes + return address = 96 bytes, so rsp is 16-byte aligned
# from here on if it was aligned at the call site; the movaps/movdqa accesses
# at [rsp+16] depend on that.
sub rsp, 40
# r13 = input end, rbx = input begin (second and first qword of the argument).
mov r13, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
# Zero all 24 bytes of the result object.
movups XMMWORD PTR [rdi], xmm0
mov QWORD PTR [rdi+16], 0
# Empty input: return the zeroed result.
cmp rbx, r13
je .L1
# Start with write cursor == end of storage == 0, so the first element takes
# the growth path.
xor r8d, r8d
xor ecx, ecx
jmp .L13
# Fast path: there is room, store the element and advance both cursors.
.L31:
mov DWORD PTR [rcx], ebp
add rbx, 8
add rcx, 4
# The write cursor is stored back to the result object on every element.
mov QWORD PTR [r12+8], rcx
cmp r13, rbx
je .L1
# Loop head: convert the next double (truncating) and check for free space.
.L13:
cvttsd2si ebp, QWORD PTR [rbx]
cmp rcx, r8
jne .L31
# Growth path. rax = 2^61 - 1, the element count at which .L32 throws.
movabs rax, 2305843009213693951
mov r15, QWORD PTR [r12]
# rcx = r14 = bytes currently in use; rdx = element count (bytes / 4).
sub rcx, r15
mov rdx, rcx
mov r14, rcx
sar rdx, 2
cmp rdx, rax
je .L32
# New count = count + (count != 0 ? count : 1), i.e. double, or 1 when empty.
test rdx, rdx
mov eax, 1
cmovne rax, rdx
add rax, rdx
# Carry out of the addition: request the maximum size instead (.L7).
jc .L7
test rax, rax
jne .L33
# New count of zero: no allocation, null buffer and null end of storage.
xor r8d, r8d
xor edi, edi
# Here rdi = new buffer base and r8 = new end of storage.
.L9:
# rcx = new write cursor, one element past the old contents.
lea rcx, [rdi+4+r14]
movq xmm0, rdi
# Place the new element directly after where the old contents will be copied.
mov DWORD PTR [rdi+r14], ebp
movq xmm1, rcx
# xmm0 = {new base, new write cursor}, written with one 16-byte store at .L12.
punpcklqdq xmm0, xmm1
# Old contents present: copy them (.L34). Otherwise free the old buffer only
# if it is non-null (.L35).
test r14, r14
jg .L34
test r15, r15
jne .L35
# Publish the new buffer in the result object and continue the loop.
.L12:
add rbx, 8
mov QWORD PTR [r12+16], r8
movups XMMWORD PTR [r12], xmm0
cmp r13, rbx
jne .L13
# Epilogue: return the result object pointer in rax.
.L1:
add rsp, 40
mov rax, r12
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
# memmove(new base in rdi, old base, bytes in use). r8, rcx and xmm0 are
# spilled around the call and reloaded afterwards.
.L34:
mov rsi, r15
mov rdx, r14
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call memmove
# [r12+16] still holds the old end of storage; rsi = old capacity in bytes.
mov rsi, QWORD PTR [r12+16]
mov rcx, QWORD PTR [rsp]
mov r8, QWORD PTR [rsp+8]
movdqa xmm0, XMMWORD PTR [rsp+16]
sub rsi, r15
# operator delete(old base, old capacity in bytes), again spilling r8/rcx/xmm0.
.L11:
mov rdi, r15
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call operator delete(void*, unsigned long)
movdqa xmm0, XMMWORD PTR [rsp+16]
mov r8, QWORD PTR [rsp+8]
mov rcx, QWORD PTR [rsp]
jmp .L12
# Nothing to copy but the old buffer is non-null: compute its size and free it.
.L35:
mov rsi, QWORD PTR [r12+16]
sub rsi, r15
jmp .L11
# Clamp the new count to 2^61 - 1 (unsigned), then convert it to bytes (* 4).
.L33:
movabs rdx, 2305843009213693951
cmp rax, rdx
cmova rax, rdx
sal rax, 2
# The byte count is kept at [rsp] across the call to operator new.
mov QWORD PTR [rsp], rax
mov rdi, rax
.L8:
call operator new(unsigned long)
# rdi = new base, r8 = new base + byte count = new end of storage.
mov r8, QWORD PTR [rsp]
mov rdi, rax
add r8, rax
jmp .L9
# Overflow case: request (2^61 - 1) * 4 bytes.
.L7:
movabs rax, 9223372036854775804
mov QWORD PTR [rsp], rax
mov rdi, rax
jmp .L8
# Element count already at the maximum: throw length_error with .LC0.
.L32:
mov edi, OFFSET FLAT:.LC0
call std::__throw_length_error(char const*)
# rax is saved in rbp and control goes to .L15, which is not part of this
# listing (presumably exception cleanup in the .cold clone -- TODO confirm).
mov rbp, rax
jmp .L15
# Label of the cold-section clone; its body is not included in the listing.
transform(std::vector<double, std::allocator<double> > const&) [clone .cold]:
More information about the Gcc-bugs
mailing list