[Bug regression/33928] 33% performance slowdown from 4.2.2 to 4.3.0 in floating-point code with computed gotos
rguenth at gcc dot gnu dot org
gcc-bugzilla@gcc.gnu.org
Sun Oct 28 16:38:00 GMT 2007
------- Comment #9 from rguenth at gcc dot gnu dot org 2007-10-28 16:38 -------
The main difference I see is that 4.2 avoids re-using %rax as the index register:
.L34:
movq %r11, %rdi
addq 8(%r10), %rdi
movq 8(%r10), %rsi
movq 8(%r10), %rdx
movq 40(%r10), %rax
leaq 4(%r11), %rbx
addq %rdi, %rsi
leaq 4(%rdi), %r9
movq %rdi, -8(%r10)
addq %rsi, %rdx
leaq 4(%rsi), %r8
movq %rsi, -24(%r10)
leaq 4(%rdx), %rcx
movq %r9, -16(%r10)
movq %rdx, -40(%r10)
movq %r8, -32(%r10)
addq $7, %rax
movq %rcx, -48(%r10)
movsd (%rax,%rcx,2), %xmm12
leaq (%rbx,%rbx), %rcx
movsd (%rax,%rdx,2), %xmm3
leaq (%rax,%r11,2), %rdx
addq $8, %r11
movsd (%rax,%r8,2), %xmm14
cmpq %r11, %r13
movsd (%rax,%rsi,2), %xmm13
movsd (%rax,%r9,2), %xmm11
movsd (%rax,%rdi,2), %xmm10
movsd (%rax,%rcx), %xmm8
...
while 4.3 always re-loads %rax as index:
.L26:
leaq 4(%rdi), %rdx
movq %rdi, %rax
movq %rdx, -8(%rsp)
addq (%r8), %rax
movq %rax, (%r9)
addq $4, %rax
movq %rax, (%rbp)
movq (%r9), %rax
addq (%r8), %rax
movq %rax, (%r10)
addq $4, %rax
movq %rax, (%rbx)
movq (%r10), %rax
addq (%r8), %rax
movq %rax, (%r11)
movq -64(%rsp), %rcx
addq $4, %rax
movq %rax, (%rcx)
movq (%rsi), %rdx
movq -8(%rsp), %rcx
addq $7, %rdx
movsd (%rdx,%rax,2), %xmm13
movq (%r11), %rax
addq %rcx, %rcx
movsd (%rdx,%rcx), %xmm8
movsd (%rdx,%rax,2), %xmm3
movq (%rbx), %rax
movsd (%rdx,%rax,2), %xmm14
movq (%r10), %rax
movsd (%rdx,%rax,2), %xmm12
movq (%rbp), %rax
movsd (%rdx,%rax,2), %xmm11
movq (%r9), %rax
movsd (%rdx,%rax,2), %xmm10
movq (%r12), %rax
leaq (%rdx,%rdi,2), %rdx
...
The root cause still needs to be investigated.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928
More information about the Gcc-bugs
mailing list