This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/85375] New: possible missed optimisation / regression from 6.3 with while (__builtin_ffs(x) && x)
- From: "vegard.nossum at oracle dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: Thu, 12 Apr 2018 11:16:14 +0000
- Subject: [Bug tree-optimization/85375] New: possible missed optimisation / regression from 6.3 with while (__builtin_ffs(x) && x)
- Auto-submitted: auto-generated
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85375
Bug ID: 85375
Summary: possible missed optimisation / regression from 6.3
with while (__builtin_ffs(x) && x)
Product: gcc
Version: 8.0.1
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: vegard.nossum at oracle dot com
Target Milestone: ---
Input:
extern int a;
int f(int x)
{
while (__builtin_ffs(x) && x)
x -= a;
return x;
}
gcc 6.3.0 with -O3 compiled this as:
f(int):
movl %edi, %eax
movl a(%rip), %esi
movl $-1, %ecx
jmp .L3
.L11:
testl %eax, %eax
je .L2
subl %esi, %eax
.L3:
bsfl %eax, %edx
cmove %ecx, %edx
cmpl $-1, %edx
jne .L11
.L2:
rep ret
whereas current trunk (also with -O3) compiles it as:
f(int):
movl $-1, %ecx
bsfl %edi, %eax
cmove %ecx, %eax
cmpl %ecx, %eax
je .L5
testl %edi, %edi
je .L6
movl a(%rip), %esi
movl %edi, %eax
jmp .L3
.L4:
testl %eax, %eax
je .L1
.L3:
subl %esi, %eax
bsfl %eax, %edx
cmove %ecx, %edx
cmpl $-1, %edx
jne .L4
ret
.L6:
xorl %eax, %eax
.L1:
ret
.L5:
movl %edi, %eax
ret
There are fewer instructions overall for the case where x is 0 on entry, but
trunk still has longer code overall even if we change the while-condition to
__builtin_expect(!!__builtin_ffs(x) && x, 1) which ideally should pessimise
this case.
For the same input, clang trunk with -O3 gives:
f(int): # @f(int)
test edi, edi
je .LBB0_3
mov eax, dword ptr [rip + a]
neg edi
.LBB0_2: # =>This Inner Loop Header: Depth=1
add edi, eax
jne .LBB0_2
.LBB0_3:
xor eax, eax
ret
This seems to rely simply on the fact that (__builtin_ffs(x) == 0) and (x == 0)
are equivalent.
If you simplify the while-condition to simply __builtin_ffs(x), then the
difference is smaller but still there:
6.3.0:
f(int):
movl %edi, %eax
movl a(%rip), %esi
movl $-1, %ecx
jmp .L3
.L5:
subl %esi, %eax
.L3:
bsfl %eax, %edx
cmove %ecx, %edx
cmpl $-1, %edx
jne .L5
rep ret
trunk:
f(int):
movl $-1, %ecx
bsfl %edi, %eax
cmove %ecx, %eax
cmpl %ecx, %eax
je .L4
movl a(%rip), %esi
movl %edi, %eax
.L3:
subl %esi, %eax
bsfl %eax, %edx
cmove %ecx, %edx
cmpl $-1, %edx
jne .L3
ret
.L4:
movl %edi, %eax
ret