[Bug other/47167] Performance regression in numerical code
ubizjak at gmail dot com
gcc-bugzilla@gcc.gnu.org
Wed Jan 5 17:38:00 GMT 2011
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47167
--- Comment #2 from Uros Bizjak <ubizjak at gmail dot com> 2011-01-05 17:31:20 UTC ---
The only difference in the hot loop is that 4.6 uses two registers in the store addresses, e.g. (%rsi,%rcx), where 4.5.1 uses a single register, e.g. (%rcx):
4.5.1:
.L142:
movapd %xmm0, (%rcx)
mulpd %xmm6, %xmm2
addq $32, %rbx
movapd %xmm1, %xmm6
mulpd %xmm0, %xmm6
movsd (%rax), %xmm1
movsd 8(%rax), %xmm3
unpcklpd %xmm1, %xmm1
subpd %xmm2, %xmm6
unpcklpd %xmm3, %xmm3
mulpd %xmm9, %xmm1
mulpd %xmm0, %xmm3
movapd %xmm6, 16(%rcx)
addq $32, %rcx
movapd %xmm1, %xmm0
movsd 16(%rax), %xmm1
mulpd %xmm6, %xmm0
unpcklpd %xmm1, %xmm1
movsd 24(%rax), %xmm2
addq $32, %rax
cmpq %rsi, %rbx
unpcklpd %xmm2, %xmm2
subpd %xmm3, %xmm0
mulpd %xmm9, %xmm1
jne .L142
4.6:
.L167:
movapd %xmm0, %xmm10
.L143:
mulpd %xmm2, %xmm6
movapd %xmm3, %xmm2
movapd %xmm10, (%rsi,%rcx)
mulpd %xmm10, %xmm2
movsd (%rdx), %xmm0
movsd 8(%rdx), %xmm1
subpd %xmm6, %xmm2
unpcklpd %xmm0, %xmm0
unpcklpd %xmm1, %xmm1
mulpd %xmm11, %xmm0
movapd %xmm2, 16(%rsi,%rcx)
mulpd %xmm10, %xmm1
addq $32, %rcx
mulpd %xmm2, %xmm0
movsd 16(%rdx), %xmm3
movsd 24(%rdx), %xmm6
addq $32, %rdx
cmpq %rdi, %rcx
unpcklpd %xmm3, %xmm3
unpcklpd %xmm6, %xmm6
subpd %xmm1, %xmm0
mulpd %xmm11, %xmm3
jne .L167
Given the comment above ix86_address_cost:
/* Return cost of the memory address x.
For i386, it is better to use a complex address than let gcc copy
the address into a reg and make a new pseudo. But not if the address
requires to two regs - that would mean more pseudos with longer
lifetimes. */
this could be the reason for the slowdown.
More information about the Gcc-bugs
mailing list