[Bug middle-end/100104] std::transform is 1.5 times faster than std::copy with -O3

hewillk at gmail dot com gcc-bugzilla@gcc.gnu.org
Fri Apr 16 01:43:26 GMT 2021


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100104

--- Comment #3 from 康桓瑋 <hewillk at gmail dot com> ---
Build "copy" with -O2 on x86-64 (https://godbolt.org/z/Gja6xrq9G):

.LC0:
        .string "vector::_M_realloc_insert"
copy(std::vector<double, std::allocator<double> > const&):
        push    r15
        pxor    xmm0, xmm0
        push    r14
        push    r13
        push    r12
        mov     r12, rdi
        push    rbp
        push    rbx
        sub     rsp, 40
        mov     r13, QWORD PTR [rsi+8]
        mov     rbx, QWORD PTR [rsi]
        movups  XMMWORD PTR [rdi], xmm0
        mov     QWORD PTR [rdi+16], 0
        cmp     rbx, r13
        je      .L1
        xor     r8d, r8d
        xor     ecx, ecx
        jmp     .L13
.L31:
        mov     DWORD PTR [rcx], ebp
        add     rbx, 8
        add     rcx, 4
        mov     QWORD PTR [r12+8], rcx
        cmp     r13, rbx
        je      .L1
.L13:
        cvttsd2si       ebp, QWORD PTR [rbx]
        cmp     rcx, r8
        jne     .L31
        movabs  rax, 2305843009213693951
        mov     r15, QWORD PTR [r12]
        sub     rcx, r15
        mov     rdx, rcx
        mov     r14, rcx
        sar     rdx, 2
        cmp     rdx, rax
        je      .L32
        test    rdx, rdx
        mov     eax, 1
        cmovne  rax, rdx
        add     rax, rdx
        jc      .L7
        test    rax, rax
        jne     .L33
        xor     r8d, r8d
        xor     edi, edi
.L9:
        lea     rcx, [rdi+4+r14]
        movq    xmm0, rdi
        mov     DWORD PTR [rdi+r14], ebp
        movq    xmm1, rcx
        punpcklqdq      xmm0, xmm1
        test    r14, r14
        jg      .L34
        test    r15, r15
        jne     .L35
.L12:
        add     rbx, 8
        mov     QWORD PTR [r12+16], r8
        movups  XMMWORD PTR [r12], xmm0
        cmp     r13, rbx
        jne     .L13
.L1:
        add     rsp, 40
        mov     rax, r12
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
.L34:
        mov     rsi, r15
        mov     rdx, r14
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    memmove
        mov     rsi, QWORD PTR [r12+16]
        mov     rcx, QWORD PTR [rsp]
        mov     r8, QWORD PTR [rsp+8]
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        sub     rsi, r15
.L11:
        mov     rdi, r15
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    operator delete(void*, unsigned long)
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        mov     r8, QWORD PTR [rsp+8]
        mov     rcx, QWORD PTR [rsp]
        jmp     .L12
.L35:
        mov     rsi, QWORD PTR [r12+16]
        sub     rsi, r15
        jmp     .L11
.L33:
        movabs  rdx, 2305843009213693951
        cmp     rax, rdx
        cmova   rax, rdx
        sal     rax, 2
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
.L8:
        call    operator new(unsigned long)
        mov     r8, QWORD PTR [rsp]
        mov     rdi, rax
        add     r8, rax
        jmp     .L9
.L7:
        movabs  rax, 9223372036854775804
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
        jmp     .L8
.L32:
        mov     edi, OFFSET FLAT:.LC0
        call    std::__throw_length_error(char const*)
        mov     rbp, rax
        jmp     .L15
copy(std::vector<double, std::allocator<double> > const&) [clone .cold]:


===========================================================================


Build "copy" with -O3:

.LC0:
        .string "vector::_M_realloc_insert"
copy(std::vector<double, std::allocator<double> > const&):
        push    r15
        pxor    xmm0, xmm0
        push    r14
        push    r13
        push    r12
        mov     r12, rdi
        push    rbp
        push    rbx
        sub     rsp, 40
        mov     r13, QWORD PTR [rsi+8]
        mov     rbx, QWORD PTR [rsi]
        movups  XMMWORD PTR [rdi], xmm0
        mov     QWORD PTR [rdi+16], 0
        cmp     rbx, r13
        je      .L1
        xor     r8d, r8d
        xor     ecx, ecx
        jmp     .L13
.L31:
        mov     DWORD PTR [rcx], ebp
        add     rbx, 8
        add     rcx, 4
        mov     QWORD PTR [r12+8], rcx
        cmp     r13, rbx
        je      .L1
.L13:
        cvttsd2si       ebp, QWORD PTR [rbx]
        cmp     rcx, r8
        jne     .L31
        movabs  rax, 2305843009213693951
        mov     r15, QWORD PTR [r12]
        sub     rcx, r15
        mov     rdx, rcx
        mov     r14, rcx
        sar     rdx, 2
        cmp     rdx, rax
        je      .L32
        test    rdx, rdx
        mov     eax, 1
        cmovne  rax, rdx
        add     rax, rdx
        jc      .L7
        test    rax, rax
        jne     .L33
        xor     r8d, r8d
        xor     edi, edi
.L9:
        lea     rcx, [rdi+4+r14]
        movq    xmm0, rdi
        mov     DWORD PTR [rdi+r14], ebp
        movq    xmm1, rcx
        punpcklqdq      xmm0, xmm1
        test    r14, r14
        jg      .L34
        test    r15, r15
        jne     .L35
.L12:
        add     rbx, 8
        mov     QWORD PTR [r12+16], r8
        movups  XMMWORD PTR [r12], xmm0
        cmp     r13, rbx
        jne     .L13
.L1:
        add     rsp, 40
        mov     rax, r12
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
.L34:
        mov     rsi, r15
        mov     rdx, r14
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    memmove
        mov     rsi, QWORD PTR [r12+16]
        mov     rcx, QWORD PTR [rsp]
        mov     r8, QWORD PTR [rsp+8]
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        sub     rsi, r15
.L11:
        mov     rdi, r15
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    operator delete(void*, unsigned long)
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        mov     r8, QWORD PTR [rsp+8]
        mov     rcx, QWORD PTR [rsp]
        jmp     .L12
.L35:
        mov     rsi, QWORD PTR [r12+16]
        sub     rsi, r15
        jmp     .L11
.L33:
        movabs  rdx, 2305843009213693951
        cmp     rax, rdx
        cmova   rax, rdx
        sal     rax, 2
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
.L8:
        call    operator new(unsigned long)
        mov     r8, QWORD PTR [rsp]
        mov     rdi, rax
        add     r8, rax
        jmp     .L9
.L7:
        movabs  rax, 9223372036854775804
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
        jmp     .L8
.L32:
        mov     edi, OFFSET FLAT:.LC0
        call    std::__throw_length_error(char const*)
        mov     rbp, rax
        jmp     .L15
copy(std::vector<double, std::allocator<double> > const&) [clone .cold]:


===========================================================================


Build "transform" with -O2 on x86-64 (https://godbolt.org/z/YTEfWEbcq):

.LC0:
        .string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
        push    r12
        mov     r12, rdi
        push    rbp
        push    rbx
        sub     rsp, 16
        mov     rbp, QWORD PTR [rsi+8]
        mov     rbx, QWORD PTR [rsi]
        mov     QWORD PTR [rdi], 0
        mov     QWORD PTR [rdi+8], 0
        mov     QWORD PTR [rdi+16], 0
        cmp     rbx, rbp
        je      .L19
        xor     edx, edx
        xor     esi, esi
        jmp     .L23
.L31:
        mov     DWORD PTR [rsi], eax
        add     rbx, 8
        add     rsi, 4
        mov     QWORD PTR [r12+8], rsi
        cmp     rbp, rbx
        je      .L19
.L32:
        mov     rsi, QWORD PTR [r12+8]
        mov     rdx, QWORD PTR [r12+16]
.L23:
        cvttsd2si       eax, QWORD PTR [rbx]
        mov     DWORD PTR [rsp+12], eax
        cmp     rsi, rdx
        jne     .L31
        lea     rdx, [rsp+12]
        mov     rdi, r12
        call    void std::vector<int, std::allocator<int> >::_M_realloc_insert<int>(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > >, int&&)
        add     rbx, 8
        cmp     rbp, rbx
        jne     .L32
.L19:
        add     rsp, 16
        mov     rax, r12
        pop     rbx
        pop     rbp
        pop     r12
        ret
        mov     rbp, rax
        jmp     .L24
transform(std::vector<double, std::allocator<double> > const&) [clone .cold]:


===========================================================================


Build "transform" with -O3:

.LC0:
        .string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
        push    r15
        pxor    xmm0, xmm0
        push    r14
        push    r13
        push    r12
        mov     r12, rdi
        push    rbp
        push    rbx
        sub     rsp, 40
        mov     r13, QWORD PTR [rsi+8]
        mov     rbx, QWORD PTR [rsi]
        movups  XMMWORD PTR [rdi], xmm0
        mov     QWORD PTR [rdi+16], 0
        cmp     rbx, r13
        je      .L1
        xor     r8d, r8d
        xor     ecx, ecx
        jmp     .L13
.L31:
        mov     DWORD PTR [rcx], ebp
        add     rbx, 8
        add     rcx, 4
        mov     QWORD PTR [r12+8], rcx
        cmp     r13, rbx
        je      .L1
.L13:
        cvttsd2si       ebp, QWORD PTR [rbx]
        cmp     rcx, r8
        jne     .L31
        movabs  rax, 2305843009213693951
        mov     r15, QWORD PTR [r12]
        sub     rcx, r15
        mov     rdx, rcx
        mov     r14, rcx
        sar     rdx, 2
        cmp     rdx, rax
        je      .L32
        test    rdx, rdx
        mov     eax, 1
        cmovne  rax, rdx
        add     rax, rdx
        jc      .L7
        test    rax, rax
        jne     .L33
        xor     r8d, r8d
        xor     edi, edi
.L9:
        lea     rcx, [rdi+4+r14]
        movq    xmm0, rdi
        mov     DWORD PTR [rdi+r14], ebp
        movq    xmm1, rcx
        punpcklqdq      xmm0, xmm1
        test    r14, r14
        jg      .L34
        test    r15, r15
        jne     .L35
.L12:
        add     rbx, 8
        mov     QWORD PTR [r12+16], r8
        movups  XMMWORD PTR [r12], xmm0
        cmp     r13, rbx
        jne     .L13
.L1:
        add     rsp, 40
        mov     rax, r12
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
.L34:
        mov     rsi, r15
        mov     rdx, r14
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    memmove
        mov     rsi, QWORD PTR [r12+16]
        mov     rcx, QWORD PTR [rsp]
        mov     r8, QWORD PTR [rsp+8]
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        sub     rsi, r15
.L11:
        mov     rdi, r15
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    operator delete(void*, unsigned long)
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        mov     r8, QWORD PTR [rsp+8]
        mov     rcx, QWORD PTR [rsp]
        jmp     .L12
.L35:
        mov     rsi, QWORD PTR [r12+16]
        sub     rsi, r15
        jmp     .L11
.L33:
        movabs  rdx, 2305843009213693951
        cmp     rax, rdx
        cmova   rax, rdx
        sal     rax, 2
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
.L8:
        call    operator new(unsigned long)
        mov     r8, QWORD PTR [rsp]
        mov     rdi, rax
        add     r8, rax
        jmp     .L9
.L7:
        movabs  rax, 9223372036854775804
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
        jmp     .L8
.L32:
        mov     edi, OFFSET FLAT:.LC0
        call    std::__throw_length_error(char const*)
        mov     rbp, rax
        jmp     .L15
transform(std::vector<double, std::allocator<double> > const&) [clone .cold]:


More information about the Gcc-bugs mailing list