This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug ipa/80317] New: Missed optimization for common standard library case


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80317

            Bug ID: 80317
           Summary: Missed optimization for common standard library case
           Product: gcc
           Version: 7.0.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: ipa
          Assignee: unassigned at gcc dot gnu.org
          Reporter: antoshkka at gmail dot com
  Target Milestone: ---

It's a common case in standard library and in user code to move objects to new
location and then destroy object at the old location. For example GCC's
std::vector::reserve has the following code:

  pointer __tmp = _M_allocate_and_copy(__n,
    _GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_start),
    _GLIBCXX_MAKE_MOVE_IF_NOEXCEPT_ITERATOR(this->_M_impl._M_finish));
  std::_Destroy(this->_M_impl._M_start, this->_M_impl._M_finish,
                _M_get_Tp_allocator()); 


The problem is that traversing exactly the same data twice is not well
optimized. Minimal example:

///////////////

#include <new>

template <class T>
struct shared_ptr_like {
    T* ptr;
    shared_ptr_like(shared_ptr_like&& v) noexcept : ptr(v.ptr) { v.ptr = 0; }
    ~shared_ptr_like() { if (ptr) { delete ptr; } }
};

typedef shared_ptr_like<int> test_type;
void relocate(test_type* new_buffer, test_type* old_buffer, std::size_t size) {
    for (std::size_t i = 0; i != size; ++i) {
        new (new_buffer + i) test_type{ static_cast<test_type&&>(old_buffer[i])
};
    }
    for (std::size_t i = 0; i != size; ++i) {
        old_buffer[i].~test_type();
    }
}

///////////////

Produces assembly that traverses the loop twice, writes zeros to memory and
compares memory with zeros:

relocate(shared_ptr_like<int>*, shared_ptr_like<int>*, unsigned long):
        test    rdx, rdx
        je      .L15
        push    rbp
        sal     rdx, 3
        push    rbx
        lea     r8, [rdi+rdx]
        mov     rbx, rsi
        mov     rax, rsi
        sub     rsp, 8
.L3:
        mov     rcx, QWORD PTR [rax]
        add     rdi, 8
        add     rax, 8
        mov     QWORD PTR [rdi-8], rcx
        mov     QWORD PTR [rax-8], 0
        cmp     rdi, r8
        jne     .L3
        lea     rbp, [rsi+rdx]
.L5:
        mov     rdi, QWORD PTR [rbx]
        test    rdi, rdi
        je      .L4
        mov     esi, 4
        call    operator delete(void*, unsigned long)
.L4:
        add     rbx, 8
        cmp     rbp, rbx
        jne     .L5
        add     rsp, 8
        pop     rbx
        pop     rbp
        ret
.L15:
        rep ret



Meanwhile optimal assembly for that function would be


relocate(shared_ptr_like<int>*, shared_ptr_like<int>*, unsigned long):
        test    rdx, rdx
        je      .L1
        lea     rdx, [rdi+rdx*8]
.L3:
        mov     rax, QWORD PTR [rsi]
        add     rdi, 8
        add     rsi, 8
        mov     QWORD PTR [rdi-8], rax
        cmp     rdi, rdx
        jne     .L3
.L1:
        rep ret


Note: Compiling the following code produces the optimal assembly from above:

#include <new>

template <class T>
struct shared_ptr_like {
    T* ptr;
    shared_ptr_like(shared_ptr_like&& v) noexcept : ptr(v.ptr) { v.ptr = 0; }
    ~shared_ptr_like() { if (ptr) { delete ptr; } }
};

typedef shared_ptr_like<int> test_type;
void relocate(test_type* new_buffer, test_type* old_buffer, std::size_t size) {
    for (std::size_t i = 0; i != size; ++i) {
        new (new_buffer + i) test_type{ static_cast<test_type&&>(old_buffer[i])
};
        old_buffer[i].~test_type();
    }
}


Checked on GCC 7.0.1 20170330 with flags -std=c++17 -O2

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]