[Bug tree-optimization/21485] [5/6/7 Regression] missed load PRE, PRE makes i?86/7 suck
law at redhat dot com
gcc-bugzilla@gcc.gnu.org
Tue Feb 14 16:52:00 GMT 2017
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=21485
--- Comment #63 from Jeffrey A. Law <law at redhat dot com> ---
So going back to the gcc-3.4 code, the major change is that we make much more use
of the complex addressing modes on x86:
NumSift:
.LFB2:
pushq %rbx
.LCFI0:
movq %rsi, %r8
.p2align 4,,7
.L11:
leaq (%r8,%r8), %rcx
cmpq %rdx, %rcx
ja .L10
.L14:
cmpq %rdx, %rcx
movq %rcx, %rsi
jae .L4
movq 8(%rdi,%rcx,8), %rbx
cmpq %rbx, (%rdi,%rcx,8)
leaq 1(%rcx), %rax
cmovl %rax, %rsi
.L4:
movq (%rdi,%r8,8), %rax
movq (%rdi,%rsi,8), %rcx
cmpq %rcx, %rax
jge .L6
movq %rax, (%rdi,%rsi,8)
movq %rcx, (%rdi,%r8,8)
movq %rsi, %r8
leaq (%r8,%r8), %rcx
cmpq %rdx, %rcx
jbe .L14
.L10:
popq %rbx
ret
.p2align 4,,7
.L6:
leaq 1(%rdx), %r8
jmp .L11
Contrast the loop body to what we generate now:
NumSift:
.LFB0:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
leaq 1(%rdx), %rbx
.L2:
leaq (%rsi,%rsi), %rcx
cmpq %rdx, %rcx
ja .L9
.L6:
movq %rsi, %rax
salq $4, %rax
addq %rdi, %rax
cmpq %rdx, %rcx
movq (%rax), %r8
jnb .L3
leaq 1(%rcx), %r9
leaq (%rdi,%r9,8), %r10
movq (%r10), %r11
cmpq %r8, %r11
jle .L3
movq %r11, %r8
movq %r10, %rax
movq %r9, %rcx
.L3:
leaq (%rdi,%rsi,8), %r9
movq %rbx, %rsi
movq (%r9), %r10
cmpq %r8, %r10
jge .L2
movq %rcx, %rsi
movq %r10, (%rax)
movq %r8, (%r9)
leaq (%rsi,%rsi), %rcx
cmpq %rdx, %rcx
jbe .L6
.L9:
popq %rbx
I haven't looked deeply at the dumps, but I suspect that CSE/PRE on the address
arithmetic spoils our ability to utilize the complex addressing modes. Which
would tend to match the findings from c#62 where a change in a conditional
inhibits PRE and we end up with better code.
More information about the Gcc-bugs
mailing list