This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug tree-optimization/46186] Clang creates code running 1600 times faster than gcc's


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46186

Raphael C <drraph at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |drraph at gmail dot com

--- Comment #26 from Raphael C <drraph at gmail dot com> ---
If I understood this PR correctly, this simpler code shows the same issue:

unsigned long f(unsigned long a)
{
    unsigned long sum = 0;
    for (; a <1000000000; a++)
        sum += a;
    return sum;
}

In gcc 7.1 with -O3 -march=native I get:


f:
        cmp     rdi, 999999999
        ja      .L7
        mov     eax, 999999999
        mov     ecx, 1000000000
        sub     rax, rdi
        sub     rcx, rdi
        cmp     rax, 7
        jbe     .L8
        vmovq   xmm3, rdi
        mov     rdx, rcx
        vpxor   xmm0, xmm0, xmm0
        xor     eax, eax
        vpbroadcastq    ymm1, xmm3
        vmovdqa ymm2, YMMWORD PTR .LC1[rip]
        vpaddq  ymm1, ymm1, YMMWORD PTR .LC0[rip]
        shr     rdx, 2
.L4:
        add     rax, 1
        vpaddq  ymm0, ymm0, ymm1
        vpaddq  ymm1, ymm1, ymm2
        cmp     rax, rdx
        jb      .L4
        vpxor   xmm1, xmm1, xmm1
        mov     rdx, rcx
        vperm2i128      ymm2, ymm0, ymm1, 33
        and     rdx, -4
        vpaddq  ymm0, ymm0, ymm2
        add     rdi, rdx
        vperm2i128      ymm1, ymm0, ymm1, 33
        vpalignr        ymm1, ymm1, ymm0, 8
        vpaddq  ymm0, ymm0, ymm1
        vmovq   rax, xmm0
        cmp     rcx, rdx
        je      .L33
        vzeroupper
.L3:
        lea     rdx, [rdi+1]
        add     rax, rdi
        cmp     rdx, 1000000000
        je      .L31
        add     rax, rdx
        lea     rdx, [rdi+2]
        cmp     rdx, 1000000000
        je      .L31
        add     rax, rdx
        lea     rdx, [rdi+3]
        cmp     rdx, 1000000000
        je      .L31
        add     rax, rdx
        lea     rdx, [rdi+4]
        cmp     rdx, 1000000000
        je      .L31
        add     rax, rdx
        lea     rdx, [rdi+5]
        cmp     rdx, 1000000000
        je      .L31
        add     rax, rdx
        lea     rdx, [rdi+6]
        cmp     rdx, 1000000000
        je      .L31
        add     rax, rdx
        add     rdi, 7
        lea     rdx, [rax+rdi]
        cmp     rdi, 1000000000
        cmovne  rax, rdx
        ret
.L7:
        xor     eax, eax
.L31:
        ret
.L33:
        vzeroupper
        ret
.L8:
        xor     eax, eax
        jmp     .L3


However, with clang I get:

f:                                      # @f
        cmp     rdi, 999999999
        ja      .LBB0_1
        mov     eax, 999999999
        sub     rax, rdi
        lea     rcx, [rdi + 1]
        imul    rcx, rax
        mov     edx, 999999998
        sub     rdx, rdi
        mul     rdx
        shl     rdx, 63
        shr     rax
        or      rax, rdx
        add     rcx, rdi
        add     rcx, rax
        mov     rax, rcx
        ret
.LBB0_1:
        xor     ecx, ecx
        mov     rax, rcx
        ret

which is much simpler and avoids looping altogether by computing the sum in closed form.

What is the current status of this (very old) PR?  Do people think it is worth
addressing?

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]