This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug regression/39914] New: 96% performance regression in floating point code; part of the problem started 2009/03/12-13
- From: "lucier at math dot purdue dot edu" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 26 Apr 2009 18:23:51 -0000
- Subject: [Bug regression/39914] New: 96% performance regression in floating point code; part of the problem started 2009/03/12-13
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
With this compiler:
gcc version 4.4.0 20090312 (experimental) [trunk revision 144801] (GCC)
running the test in
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928
(same .i file, same instructions for reproducing, same compiler options, same
everything)
gives a time of
132 ms cpu time (132 user, 0 system)
with assembly code in the main loop of
.L2958:
movq %rdx, %rcx
addq (%r11), %rcx
leaq 4(%rdx), %r14
movq %rcx, (%rdi)
addq $4, %rcx
movq %rcx, (%r10)
movq (%r11), %rcx
addq (%rdi), %rcx
movq %rcx, (%rsi)
addq $4, %rcx
movq %rcx, (%r9)
movq (%r11), %r12
addq (%rsi), %r12
movq %r12, (%rbp)
addq $4, %r12
movq %r12, (%r15)
movq (%rax), %rcx
addq $7, %rcx
movsd (%rcx,%r12,2), %xmm7
movq (%rbp), %r12
leaq (%rcx,%rdx,2), %r13
addq $8, %rdx
movsd (%r13), %xmm4
movsd (%rcx,%r12,2), %xmm10
movq (%r9), %r12
movsd (%rcx,%r12,2), %xmm5
movq (%rsi), %r12
movsd (%rcx,%r12,2), %xmm6
movq (%r10), %r12
movsd (%rcx,%r12,2), %xmm13
movq (%rdi), %r12
movsd (%rcx,%r12,2), %xmm11
leaq (%r14,%r14), %r12
movsd (%rcx,%r12), %xmm9
movq 24(%r8), %rcx
movapd %xmm11, %xmm14
movsd 15(%rcx), %xmm1
movsd 7(%rcx), %xmm2
movapd %xmm1, %xmm8
movsd 31(%rcx), %xmm3
movapd %xmm2, %xmm12
mulsd %xmm10, %xmm8
mulsd %xmm7, %xmm12
mulsd %xmm2, %xmm10
mulsd %xmm1, %xmm7
movsd 23(%rcx), %xmm0
addsd %xmm8, %xmm12
movapd %xmm2, %xmm8
mulsd %xmm6, %xmm2
subsd %xmm7, %xmm10
movapd %xmm1, %xmm7
mulsd %xmm5, %xmm1
mulsd %xmm6, %xmm7
movapd %xmm4, %xmm6
mulsd %xmm5, %xmm8
movapd %xmm9, %xmm5
subsd %xmm10, %xmm14
subsd %xmm1, %xmm2
movapd %xmm3, %xmm1
addsd %xmm11, %xmm10
xorpd .LC5(%rip), %xmm1
addsd %xmm7, %xmm8
movapd %xmm13, %xmm7
subsd %xmm2, %xmm6
subsd %xmm12, %xmm7
subsd %xmm8, %xmm5
addsd %xmm4, %xmm2
movapd %xmm0, %xmm4
addsd %xmm9, %xmm8
movapd %xmm1, %xmm9
mulsd %xmm14, %xmm4
addsd %xmm13, %xmm12
mulsd %xmm7, %xmm9
mulsd %xmm1, %xmm14
movapd %xmm3, %xmm1
mulsd %xmm0, %xmm7
mulsd %xmm10, %xmm1
mulsd %xmm0, %xmm10
addsd %xmm9, %xmm4
subsd %xmm7, %xmm14
movapd %xmm0, %xmm7
movapd %xmm2, %xmm0
mulsd %xmm12, %xmm7
mulsd %xmm3, %xmm12
addsd %xmm1, %xmm7
subsd %xmm12, %xmm10
addsd %xmm10, %xmm0
subsd %xmm10, %xmm2
movsd %xmm0, (%r13)
movapd %xmm8, %xmm0
movq (%rax), %rcx
subsd %xmm7, %xmm8
addsd %xmm7, %xmm0
movsd %xmm0, 7(%r12,%rcx)
movq (%rdi), %r12
movq (%rax), %rcx
movapd %xmm6, %xmm0
subsd %xmm14, %xmm6
movsd %xmm2, 7(%rcx,%r12,2)
movq (%r10), %r12
movq (%rax), %rcx
addsd %xmm14, %xmm0
movsd %xmm8, 7(%rcx,%r12,2)
movq (%rsi), %r12
movq (%rax), %rcx
movsd %xmm0, 7(%rcx,%r12,2)
movapd %xmm5, %xmm0
movq (%r9), %r12
movq (%rax), %rcx
subsd %xmm4, %xmm5
addsd %xmm4, %xmm0
movsd %xmm0, 7(%rcx,%r12,2)
movq (%rbp), %r12
movq (%rax), %rcx
movsd %xmm6, 7(%rcx,%r12,2)
movq (%r15), %r12
movq (%rax), %rcx
movsd %xmm5, 7(%rcx,%r12,2)
cmpq %rdx, -104(%rsp)
jg .L2958
movq %r14, -104(%rsp)
With this compiler
/pkgs/gcc-mainline/bin/gcc -v
Using built-in specs.
Target: x86_64-unknown-linux-gnu
Configured with: /tmp/lucier/gcc/mainline/configure --enable-checking=release
--prefix=/pkgs/gcc-mainline --enable-languages=c
--enable-gather-detailed-mem-stats
Thread model: posix
gcc version 4.4.0 20090313 (experimental) [trunk revision 144829] (GCC)
one gets a time of
212 ms cpu time (212 user, 0 system)
and the assembly language for the main loop is
.L2946:
movq %rbx, %rdx
addq (%r11), %rdx
leaq 4(%rbx), %rbp
movq %rdx, (%rsi)
addq $4, %rdx
movq %rdx, (%r10)
movq (%r11), %rdx
addq (%rsi), %rdx
movq %rdx, (%rcx)
addq $4, %rdx
movq %rdx, (%r9)
movq (%r11), %r13
addq (%rcx), %r13
movq %r13, (%r8)
addq $4, %r13
movq %r13, (%r15)
movq (%rax), %rdx
addq $7, %rdx
movsd (%rdx,%r13,2), %xmm0
leaq (%rdx,%rbx,2), %r14
addq $8, %rbx
movsd %xmm0, -48(%rsp)
movq (%r8), %r13
movsd (%rdx,%r13,2), %xmm0
movsd %xmm0, -56(%rsp)
movq (%r9), %r13
movsd (%rdx,%r13,2), %xmm0
movsd %xmm0, -64(%rsp)
movq (%rcx), %r13
movsd (%rdx,%r13,2), %xmm0
movsd %xmm0, -72(%rsp)
movq (%r10), %r13
movsd (%rdx,%r13,2), %xmm0
movsd %xmm0, -80(%rsp)
movq (%rsi), %r13
movsd (%rdx,%r13,2), %xmm0
leaq (%rbp,%rbp), %r13
movsd %xmm0, -104(%rsp)
movsd (%rdx,%r13), %xmm0
movsd %xmm0, -88(%rsp)
movq 24(%rdi), %rdx
movsd 31(%rdx), %xmm0
movsd %xmm0, -32(%rsp)
movsd 23(%rdx), %xmm0
movsd %xmm0, -40(%rsp)
movsd 15(%rdx), %xmm0
movsd %xmm0, -112(%rsp)
movsd 7(%rdx), %xmm0
movsd %xmm0, -120(%rsp)
movapd %xmm0, %xmm1
movsd -112(%rsp), %xmm0
mulsd -48(%rsp), %xmm1
mulsd -56(%rsp), %xmm0
addsd %xmm0, %xmm1
movsd -112(%rsp), %xmm0
mulsd -48(%rsp), %xmm0
movsd %xmm1, -8(%rsp)
movsd -120(%rsp), %xmm1
mulsd -56(%rsp), %xmm1
subsd %xmm0, %xmm1
movsd -112(%rsp), %xmm0
mulsd -72(%rsp), %xmm0
movsd %xmm1, -16(%rsp)
movsd -120(%rsp), %xmm1
mulsd -64(%rsp), %xmm1
addsd %xmm0, %xmm1
movsd -112(%rsp), %xmm0
mulsd -64(%rsp), %xmm0
movsd %xmm1, -24(%rsp)
movsd -120(%rsp), %xmm1
mulsd -72(%rsp), %xmm1
subsd %xmm0, %xmm1
movsd -80(%rsp), %xmm0
subsd -8(%rsp), %xmm0
movsd %xmm1, -120(%rsp)
movsd %xmm0, -48(%rsp)
movsd -104(%rsp), %xmm0
subsd -16(%rsp), %xmm0
movsd %xmm0, -112(%rsp)
movsd -88(%rsp), %xmm0
subsd -24(%rsp), %xmm0
movsd %xmm0, -56(%rsp)
movsd (%r14), %xmm0
subsd %xmm1, %xmm0
movsd %xmm0, -64(%rsp)
movsd -80(%rsp), %xmm0
addsd -8(%rsp), %xmm0
movsd %xmm0, -80(%rsp)
movsd -104(%rsp), %xmm0
addsd -16(%rsp), %xmm0
movsd %xmm0, -104(%rsp)
movsd -88(%rsp), %xmm0
addsd -24(%rsp), %xmm0
movsd %xmm0, -88(%rsp)
movsd (%r14), %xmm0
addsd %xmm1, %xmm0
movsd %xmm0, -96(%rsp)
movsd -32(%rsp), %xmm0
xorpd .LC5(%rip), %xmm0
movsd %xmm0, -120(%rsp)
movapd %xmm0, %xmm1
movsd -40(%rsp), %xmm0
mulsd -48(%rsp), %xmm1
mulsd -112(%rsp), %xmm0
addsd %xmm0, %xmm1
movsd -40(%rsp), %xmm0
mulsd -48(%rsp), %xmm0
movsd %xmm1, -72(%rsp)
movsd -120(%rsp), %xmm1
mulsd -112(%rsp), %xmm1
subsd %xmm0, %xmm1
movsd -32(%rsp), %xmm0
mulsd -104(%rsp), %xmm0
movsd %xmm1, -112(%rsp)
movsd -40(%rsp), %xmm1
mulsd -80(%rsp), %xmm1
addsd %xmm0, %xmm1
movsd -32(%rsp), %xmm0
mulsd -80(%rsp), %xmm0
movsd %xmm1, -120(%rsp)
movsd -40(%rsp), %xmm1
mulsd -104(%rsp), %xmm1
subsd %xmm0, %xmm1
movsd %xmm1, -104(%rsp)
movsd -96(%rsp), %xmm0
addsd %xmm1, %xmm0
movsd %xmm0, (%r14)
movq (%rax), %rdx
movsd -88(%rsp), %xmm0
addsd -120(%rsp), %xmm0
movsd %xmm0, 7(%r13,%rdx)
movq (%rsi), %r13
movq (%rax), %rdx
movsd -96(%rsp), %xmm0
subsd -104(%rsp), %xmm0
movsd %xmm0, 7(%rdx,%r13,2)
movq (%r10), %r13
movq (%rax), %rdx
movsd -88(%rsp), %xmm0
subsd -120(%rsp), %xmm0
movsd %xmm0, 7(%rdx,%r13,2)
movq (%rcx), %r13
movq (%rax), %rdx
movsd -64(%rsp), %xmm0
addsd -112(%rsp), %xmm0
movsd %xmm0, 7(%rdx,%r13,2)
movq (%r9), %r13
movq (%rax), %rdx
movsd -56(%rsp), %xmm0
addsd -72(%rsp), %xmm0
movsd %xmm0, 7(%rdx,%r13,2)
movq (%r8), %r13
movq (%rax), %rdx
movsd -64(%rsp), %xmm0
subsd -112(%rsp), %xmm0
movsd %xmm0, 7(%rdx,%r13,2)
movq (%r15), %r13
movq (%rax), %rdx
movsd -56(%rsp), %xmm0
subsd -72(%rsp), %xmm0
movsd %xmm0, 7(%rdx,%r13,2)
cmpq %rbx, (%rsp)
jg .L2946
movq %rbp, (%rsp)
I'm reporting this separately because it doesn't have the same cause as the
previous PR 33928
BTW, with 4.2.4 this test runs in 108 ms on this machine, hence the total
regression amount noted in the subject line. This part itself causes about 60%
performance regression; the rest is accounted for by
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928
Brad
--
Summary: 96% performance regression in floating point code; part
of the problem started 2009/03/12-13
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: regression
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: lucier at math dot purdue dot edu
GCC build triplet: x86_64-unknown-linux-gnu
GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: x86_64-unknown-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39914