This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475
- From: "lucier at math dot purdue dot edu" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 6 May 2009 03:43:37 -0000
- Subject: [Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475
- References: <bug-33928-271@http.gcc.gnu.org/bugzilla/>
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
------- Comment #53 from lucier at math dot purdue dot edu 2009-05-06 03:43 -------
I posted a possible fix to gcc-patches with the subject line
"Possible fix for 30% performance regression in PR 33928".
Here's the assembly for the main loop after the changes I proposed:
# Main loop of the routine under discussion (x86-64, AT&T syntax, scalar
# double-precision SSE arithmetic). Only the loop is shown; the enclosing
# function is not part of this excerpt.
#
# Register roles visible in the loop:
#   %r11  running index, advanced by 8 each iteration
#   %r13  bound; the loop repeats while %r13 > %r11 (signed compare)
#   %r10  base of a memory area: values are read from 8, 24 and 40(%r10),
#         and -8 .. -48(%r10) are used as spill slots for computed indices
#   %rbx  %r11 + 4, using the value of %r11 at the top of the iteration
# NOTE(review): the odd displacements 7/15/23/31 look like offsets adjusted
# for a tagged pointer -- confirm against the code generator.
.L4230:
# Index computation. With s = 8(%r10) (loaded three separate times):
#   %rdi = %r11 + s      %r9 = %rdi + 4
#   %rsi = %rdi + s      %r8 = %rsi + 4
#   %rdx = %rsi + s      %rcx = %rdx + 4
# Each of the six values is also spilled to a slot below %r10:
#   %rdi -> -8, %r9 -> -16, %rsi -> -24, %r8 -> -32, %rdx -> -40, %rcx -> -48.
movq %r11, %rdi
addq 8(%r10), %rdi
movq 8(%r10), %rsi
movq 8(%r10), %rdx
movq 40(%r10), %rax
leaq 4(%r11), %rbx
addq %rdi, %rsi
leaq 4(%rdi), %r9
movq %rdi, -8(%r10)
addq %rsi, %rdx
leaq 4(%rsi), %r8
movq %rsi, -24(%r10)
leaq 4(%rdx), %rcx
movq %r9, -16(%r10)
movq %rdx, -40(%r10)
movq %r8, -32(%r10)
# %rax = 40(%r10) + 7 is the base address for the eight data loads below;
# each element is addressed as base + 2*index.
addq $7, %rax
movq %rcx, -48(%r10)
movsd (%rax,%rcx,2), %xmm12
# %rcx is reused: it now holds 2*(%r11 + 4), a byte offset rather than an index.
leaq (%rbx,%rbx), %rcx
movsd (%rax,%rdx,2), %xmm3
# %rdx is reused: it now holds the address base + 2*%r11 (old %r11), which is
# kept until the first store at the end of the iteration.
leaq (%rax,%r11,2), %rdx
addq $8, %r11
movsd (%rax,%r8,2), %xmm14
# Loop test is done early: the flags set here are consumed by the jg at the
# bottom. Every instruction in between is a mov/lea or an SSE operation, none
# of which writes the integer flags.
cmpq %r11, %r13
movsd (%rax,%rsi,2), %xmm13
movsd (%rax,%r9,2), %xmm11
movsd (%rax,%rdi,2), %xmm10
movsd (%rax,%rcx), %xmm8
# Second array: %rax = 24(%r10). Four doubles are read from it at
# displacements 7, 15, 23 and 31 into %xmm1, %xmm2, %xmm5 and %xmm9.
movq 24(%r10), %rax
movsd (%rdx), %xmm7
movsd 15(%rax), %xmm2
movsd 7(%rax), %xmm1
movapd %xmm2, %xmm0
movsd 31(%rax), %xmm9
# First stage. Writing w1 = 7(%rax) and w2 = 15(%rax), the code forms
#   %xmm4 = w1*x3 - w2*x12      %xmm6 = w1*x12 + w2*x3
#   %xmm1 = w1*x13 - w2*x14     %xmm3 = w1*x14 + w2*x13
# where xN is the value originally loaded into %xmmN. These have the shape of
# the real and imaginary parts of a complex product -- presumably that is what
# the source computes; confirm against the original program.
movapd %xmm1, %xmm6
mulsd %xmm3, %xmm0
movapd %xmm1, %xmm4
mulsd %xmm12, %xmm6
mulsd %xmm3, %xmm4
movapd %xmm1, %xmm3
mulsd %xmm13, %xmm1
mulsd %xmm14, %xmm3
addsd %xmm0, %xmm6
movapd %xmm2, %xmm0
movsd 23(%rax), %xmm5
mulsd %xmm12, %xmm0
movapd %xmm7, %xmm12
subsd %xmm0, %xmm4
movapd %xmm2, %xmm0
mulsd %xmm14, %xmm2
movapd %xmm8, %xmm14
mulsd %xmm13, %xmm0
# Sum/difference pairs: each of %xmm11, %xmm10, %xmm7 and %xmm8 is combined
# with one first-stage result, keeping the sum in the original register and
# the difference in a copy (%xmm13, %xmm2, %xmm12 and %xmm14 respectively).
movapd %xmm11, %xmm13
addsd %xmm6, %xmm11
subsd %xmm6, %xmm13
subsd %xmm2, %xmm1
movapd %xmm10, %xmm2
addsd %xmm0, %xmm3
movapd %xmm5, %xmm0
subsd %xmm4, %xmm2
addsd %xmm4, %xmm10
subsd %xmm1, %xmm12
addsd %xmm1, %xmm7
# Second stage uses the other two loaded values, %xmm5 (23(%rax)) and
# %xmm9 (31(%rax)). %xmm1 becomes a copy of %xmm9 XORed with the constant
# .LC5, whose definition is not shown -- presumably a sign-bit mask that
# negates the value; verify.
movapd %xmm9, %xmm1
subsd %xmm3, %xmm14
mulsd %xmm2, %xmm0
xorpd .LC5(%rip), %xmm1
addsd %xmm3, %xmm8
movapd %xmm1, %xmm3
mulsd %xmm2, %xmm1
movapd %xmm5, %xmm2
mulsd %xmm13, %xmm3
mulsd %xmm11, %xmm2
addsd %xmm0, %xmm3
movapd %xmm5, %xmm0
mulsd %xmm10, %xmm5
mulsd %xmm13, %xmm0
subsd %xmm0, %xmm1
movapd %xmm9, %xmm0
mulsd %xmm11, %xmm9
mulsd %xmm10, %xmm0
subsd %xmm9, %xmm5
addsd %xmm0, %xmm2
# Write-back of the eight results. The first store goes through the address
# kept in %rdx. Every later store reloads the array base from 40(%r10), and
# the last six also reload their index from the spill slot written at the top
# of the iteration, instead of reusing values already held in registers.
movapd %xmm7, %xmm0
addsd %xmm5, %xmm0
subsd %xmm5, %xmm7
movsd %xmm0, (%rdx)
movapd %xmm8, %xmm0
movq 40(%r10), %rax
subsd %xmm2, %xmm8
addsd %xmm2, %xmm0
movsd %xmm0, 7(%rcx,%rax)
movq -8(%r10), %rdx
movq 40(%r10), %rax
movapd %xmm12, %xmm0
subsd %xmm1, %xmm12
movsd %xmm7, 7(%rax,%rdx,2)
movq -16(%r10), %rdx
movq 40(%r10), %rax
addsd %xmm1, %xmm0
movsd %xmm8, 7(%rax,%rdx,2)
movq -24(%r10), %rdx
movq 40(%r10), %rax
movsd %xmm0, 7(%rax,%rdx,2)
movapd %xmm14, %xmm0
movq -32(%r10), %rdx
movq 40(%r10), %rax
subsd %xmm3, %xmm14
addsd %xmm3, %xmm0
movsd %xmm0, 7(%rax,%rdx,2)
movq -40(%r10), %rdx
movq 40(%r10), %rax
movsd %xmm12, 7(%rax,%rdx,2)
movq -48(%r10), %rdx
movq 40(%r10), %rax
movsd %xmm14, 7(%rax,%rdx,2)
# Branch on the flags from the earlier cmpq: continue while %r13 > %r11.
jg .L4230
# Loop exit: %r13 receives %rbx, i.e. (%r11 at the top of the last iteration) + 4.
movq %rbx, %r13
# The code following this label is not included in the excerpt.
.L4228:
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928