This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug ipa/80317] New: Missed optimization for common standard library case
- From: "antoshkka at gmail dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Tue, 04 Apr 2017 19:40:24 +0000
- Subject: [Bug ipa/80317] New: Missed optimization for common standard library case
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80317
Bug ID: 80317
Summary: Missed optimization for common standard library case
Product: gcc
Version: 7.0.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: ipa
Assignee: unassigned at gcc dot gnu.org
Reporter: antoshkka at gmail dot com
Target Milestone: ---
It's a common case in standard library and in user code to move objects to new
location and then destroy object at the old location. For example GCC's
std::vector::reserve has the following code:
pointer __tmp = _M_allocate_and_copy(__n,
_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_start),
_GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_finish));
std::_Destroy(this->_M_impl._M_start, this->_M_impl._M_finish,
_M_get_Tp_allocator());
The problem is that traversing exactly the same data twice is not well
optimized. Minimal example:
///////////////
#include <new>
template <class T>
struct shared_ptr_like {
T* ptr;
shared_ptr_like(shared_ptr_like&& v) noexcept : ptr(v.ptr) { v.ptr = 0; }
~shared_ptr_like() { if (ptr) { delete ptr; } }
};
typedef shared_ptr_like<int> test_type;
void relocate(test_type* new_buffer, test_type* old_buffer, std::size_t size) {
for (std::size_t i = 0; i != size; ++i) {
new (new_buffer + i) test_type{ static_cast<test_type&&>(old_buffer[i])
};
}
for (std::size_t i = 0; i != size; ++i) {
old_buffer[i].~test_type();
}
}
///////////////
Produces assembly that traverses the loop twice, writes zeros to memory and
compares memory with zeros:
relocate(shared_ptr_like<int>*, shared_ptr_like<int>*, unsigned long):
test rdx, rdx
je .L15
push rbp
sal rdx, 3
push rbx
lea r8, [rdi+rdx]
mov rbx, rsi
mov rax, rsi
sub rsp, 8
.L3:
mov rcx, QWORD PTR [rax]
add rdi, 8
add rax, 8
mov QWORD PTR [rdi-8], rcx
mov QWORD PTR [rax-8], 0
cmp rdi, r8
jne .L3
lea rbp, [rsi+rdx]
.L5:
mov rdi, QWORD PTR [rbx]
test rdi, rdi
je .L4
mov esi, 4
call operator delete(void*, unsigned long)
.L4:
add rbx, 8
cmp rbp, rbx
jne .L5
add rsp, 8
pop rbx
pop rbp
ret
.L15:
rep ret
Meanwhile optimal assembly for that function would be
relocate(shared_ptr_like<int>*, shared_ptr_like<int>*, unsigned long):
test rdx, rdx
je .L1
lea rdx, [rdi+rdx*8]
.L3:
mov rax, QWORD PTR [rsi]
add rdi, 8
add rsi, 8
mov QWORD PTR [rdi-8], rax
cmp rdi, rdx
jne .L3
.L1:
rep ret
Note: Compiling the following code produces the optimal assembly from above:
#include <new>
template <class T>
struct shared_ptr_like {
T* ptr;
shared_ptr_like(shared_ptr_like&& v) noexcept : ptr(v.ptr) { v.ptr = 0; }
~shared_ptr_like() { if (ptr) { delete ptr; } }
};
typedef shared_ptr_like<int> test_type;
void relocate(test_type* new_buffer, test_type* old_buffer, std::size_t size) {
for (std::size_t i = 0; i != size; ++i) {
new (new_buffer + i) test_type{ static_cast<test_type&&>(old_buffer[i])
};
old_buffer[i].~test_type();
}
}
Checked on GCC 7.0.1 20170330 with flags -std=c++17 -O2