This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/46186] Clang creates code running 1600 times faster than gcc's
- From: "drraph at gmail dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Wed, 24 May 2017 13:36:31 +0000
- Subject: [Bug tree-optimization/46186] Clang creates code running 1600 times faster than gcc's
- Auto-submitted: auto-generated
- References: <bug-46186-4@http.gcc.gnu.org/bugzilla/>
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46186
Raphael C <drraph at gmail dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |drraph at gmail dot com
--- Comment #26 from Raphael C <drraph at gmail dot com> ---
If I understood this PR correctly, this simpler code shows the same issue:
/*
 * Reproducer: sums every integer from a up to 999999999 inclusive.
 * Returns 0 when a >= 1000000000, because the loop body never runs.
 * The loop has no side effects other than accumulating into sum.
 */
unsigned long f(unsigned long a)
{
unsigned long sum = 0;
/* a itself is the induction variable; the upper bound is a compile-time constant */
for (; a <1000000000; a++)
sum += a;
return sum;
}
In gcc 7.1 with -O3 -march=native I get:
# gcc 7.1 -O3 -march=native output for: unsigned long f(unsigned long a)
# Intel syntax. In: rdi = a. Out: rax = sum of a..999999999 (0 if a > 999999999).
# Shape: range check -> AVX2 loop summing 4 values per iteration ->
# horizontal reduction -> unrolled scalar tail (.L3).
f:
# a > 999999999 (unsigned): the C loop runs zero times, return 0
cmp rdi, 999999999
ja .L7
mov eax, 999999999
mov ecx, 1000000000
# rax = 999999999 - a
sub rax, rdi
# rcx = 1000000000 - a = number of loop iterations
sub rcx, rdi
# 8 or fewer iterations: skip the vector loop and use the scalar code only
cmp rax, 7
jbe .L8
vmovq xmm3, rdi
mov rdx, rcx
# ymm0 = vector accumulator, cleared
vpxor xmm0, xmm0, xmm0
# rax = vector-loop iteration counter
xor eax, eax
# ymm1 = a replicated into all four 64-bit lanes
vpbroadcastq ymm1, xmm3
# .LC0/.LC1 are not shown in this listing; presumably .LC0 = {0,1,2,3}
# (per-lane offsets) and .LC1 = {4,4,4,4} (per-iteration step) - TODO confirm
vmovdqa ymm2, YMMWORD PTR .LC1[rip]
vpaddq ymm1, ymm1, YMMWORD PTR .LC0[rip]
# rdx = iteration count / 4 = number of vector iterations
shr rdx, 2
.L4:
add rax, 1
# accumulate the current four values, then advance them by the step vector
vpaddq ymm0, ymm0, ymm1
vpaddq ymm1, ymm1, ymm2
cmp rax, rdx
jb .L4
# Horizontal reduction: fold the four 64-bit lanes of ymm0 into its low qword.
vpxor xmm1, xmm1, xmm1
mov rdx, rcx
# imm 33 (0x21): low half = high half of ymm0, high half = low half of ymm1 (zero)
vperm2i128 ymm2, ymm0, ymm1, 33
# rdx = iteration count rounded down to a multiple of 4 (values the vector loop consumed)
and rdx, -4
vpaddq ymm0, ymm0, ymm2
# rdi = first value not yet summed
add rdi, rdx
vperm2i128 ymm1, ymm0, ymm1, 33
# shift by 8 bytes so the second qword lines up with the first
vpalignr ymm1, ymm1, ymm0, 8
vpaddq ymm0, ymm0, ymm1
# rax = low qword of ymm0 = sum produced by the vector loop
vmovq rax, xmm0
# count was a multiple of 4: nothing left for the scalar tail
cmp rcx, rdx
je .L33
vzeroupper
# Scalar tail, fully unrolled for up to 8 values.
# On entry: rax = running sum, rdi = next value to add.
# Each step adds one value and returns as soon as the following value
# would be 1000000000.
.L3:
lea rdx, [rdi+1]
add rax, rdi
cmp rdx, 1000000000
je .L31
add rax, rdx
lea rdx, [rdi+2]
cmp rdx, 1000000000
je .L31
add rax, rdx
lea rdx, [rdi+3]
cmp rdx, 1000000000
je .L31
add rax, rdx
lea rdx, [rdi+4]
cmp rdx, 1000000000
je .L31
add rax, rdx
lea rdx, [rdi+5]
cmp rdx, 1000000000
je .L31
add rax, rdx
lea rdx, [rdi+6]
cmp rdx, 1000000000
je .L31
add rax, rdx
# eighth value (rdi+7): add it branchlessly only if it is still below the bound
add rdi, 7
lea rdx, [rax+rdi]
cmp rdi, 1000000000
cmovne rax, rdx
ret
# a was out of range on entry: return 0
.L7:
xor eax, eax
# common return for the scalar tail (rax already holds the sum)
.L31:
ret
# vector loop consumed everything: clear upper YMM state, then return
.L33:
vzeroupper
ret
# short-range entry: start the scalar tail with sum = 0
.L8:
xor eax, eax
jmp .L3
However in clang I get:
# clang output for: unsigned long f(unsigned long a)
# Intel syntax. In: rdi = a. Out: rax = sum of a..999999999 (0 if a > 999999999).
# No loop: with n = 999999999 - a the code evaluates
#   (a + 1) * n + a + (n * (n - 1)) / 2
# which equals a + (a+1) + ... + (a+n).
f: # @f
# a > 999999999 (unsigned): return 0
cmp rdi, 999999999
ja .LBB0_1
mov eax, 999999999
# rax = n = 999999999 - a
sub rax, rdi
# rcx = (a + 1) * n
lea rcx, [rdi + 1]
imul rcx, rax
mov edx, 999999998
# rdx = n - 1
sub rdx, rdi
# rdx:rax = n * (n - 1), full 128-bit unsigned product
mul rdx
# shift the 128-bit product right by one: bit 0 of the high half becomes
# bit 63 of the low half, giving the low 64 bits of n * (n - 1) / 2 in rax
shl rdx, 63
shr rax
or rax, rdx
# rcx = (a + 1) * n + a
add rcx, rdi
# rcx += n * (n - 1) / 2
add rcx, rax
mov rax, rcx
ret
.LBB0_1:
xor ecx, ecx
mov rax, rcx
ret
which is much simpler and avoids looping altogether.
What is the current status of this (very old) PR? Do people think it is worth
addressing?