Bug 39914 - [4.4 Regression] 96% performance regression in floating point code; part of the problem started 2009/03/12-13
Summary: [4.4 Regression] 96% performance regression in floating point code; part of t...
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization (show other bugs)
Version: 4.4.0
: P3 normal
Target Milestone: 4.4.1
Assignee: Uroš Bizjak
URL:
Keywords:
Depends on: 39432
Blocks:
  Show dependency treegraph
 
Reported: 2009-04-26 18:23 UTC by lucier
Modified: 2009-05-03 19:41 UTC (History)
1 user (show)

See Also:
Host: x86_64-unknown-linux-gnu
Target: x86_64-unknown-linux-gnu
Build: x86_64-unknown-linux-gnu
Known to work: 4.3.3
Known to fail: 4.4.0 4.5.0
Last reconfirmed: 2009-04-28 16:19:32


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description lucier 2009-04-26 18:23:50 UTC
With this compiler:

gcc version 4.4.0 20090312 (experimental) [trunk revision 144801] (GCC) 

running the test in

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928

(same .i file, same instructions for reproducing, same compiler options, same everything)

gives a time of

    132 ms cpu time (132 user, 0 system)

with assembly code in the main loop of

.L2958:
        movq    %rdx, %rcx
        addq    (%r11), %rcx
        leaq    4(%rdx), %r14
        movq    %rcx, (%rdi)
        addq    $4, %rcx
        movq    %rcx, (%r10)
        movq    (%r11), %rcx
        addq    (%rdi), %rcx
        movq    %rcx, (%rsi)
        addq    $4, %rcx
        movq    %rcx, (%r9)
        movq    (%r11), %r12
        addq    (%rsi), %r12
        movq    %r12, (%rbp)
        addq    $4, %r12
        movq    %r12, (%r15)
        movq    (%rax), %rcx
        addq    $7, %rcx
        movsd   (%rcx,%r12,2), %xmm7
        movq    (%rbp), %r12
        leaq    (%rcx,%rdx,2), %r13
        addq    $8, %rdx
        movsd   (%r13), %xmm4
        movsd   (%rcx,%r12,2), %xmm10
        movq    (%r9), %r12
        movsd   (%rcx,%r12,2), %xmm5
        movq    (%rsi), %r12
        movsd   (%rcx,%r12,2), %xmm6
        movq    (%r10), %r12
        movsd   (%rcx,%r12,2), %xmm13
        movq    (%rdi), %r12
        movsd   (%rcx,%r12,2), %xmm11
        leaq    (%r14,%r14), %r12
        movsd   (%rcx,%r12), %xmm9
        movq    24(%r8), %rcx
        movapd  %xmm11, %xmm14
        movsd   15(%rcx), %xmm1
        movsd   7(%rcx), %xmm2
        movapd  %xmm1, %xmm8
        movsd   31(%rcx), %xmm3
        movapd  %xmm2, %xmm12
        mulsd   %xmm10, %xmm8
        mulsd   %xmm7, %xmm12
        mulsd   %xmm2, %xmm10
        mulsd   %xmm1, %xmm7
        movsd   23(%rcx), %xmm0
        addsd   %xmm8, %xmm12
        movapd  %xmm2, %xmm8
        mulsd   %xmm6, %xmm2
        subsd   %xmm7, %xmm10
        movapd  %xmm1, %xmm7
        mulsd   %xmm5, %xmm1
        mulsd   %xmm6, %xmm7
        movapd  %xmm4, %xmm6
        mulsd   %xmm5, %xmm8
        movapd  %xmm9, %xmm5
        subsd   %xmm10, %xmm14
        subsd   %xmm1, %xmm2
        movapd  %xmm3, %xmm1
        addsd   %xmm11, %xmm10
        xorpd   .LC5(%rip), %xmm1
        addsd   %xmm7, %xmm8
        movapd  %xmm13, %xmm7
        subsd   %xmm2, %xmm6
        subsd   %xmm12, %xmm7
        subsd   %xmm8, %xmm5
        addsd   %xmm4, %xmm2
        movapd  %xmm0, %xmm4
        addsd   %xmm9, %xmm8
        movapd  %xmm1, %xmm9
        mulsd   %xmm14, %xmm4
        addsd   %xmm13, %xmm12
        mulsd   %xmm7, %xmm9
        mulsd   %xmm1, %xmm14
        movapd  %xmm3, %xmm1
        mulsd   %xmm0, %xmm7
        mulsd   %xmm10, %xmm1
        mulsd   %xmm0, %xmm10
        addsd   %xmm9, %xmm4
        subsd   %xmm7, %xmm14
        movapd  %xmm0, %xmm7
        movapd  %xmm2, %xmm0
        mulsd   %xmm12, %xmm7
        mulsd   %xmm3, %xmm12
        addsd   %xmm1, %xmm7
        subsd   %xmm12, %xmm10
        addsd   %xmm10, %xmm0
        subsd   %xmm10, %xmm2
        movsd   %xmm0, (%r13)
        movapd  %xmm8, %xmm0
        movq    (%rax), %rcx
        subsd   %xmm7, %xmm8
        addsd   %xmm7, %xmm0
        movsd   %xmm0, 7(%r12,%rcx)
        movq    (%rdi), %r12
        movq    (%rax), %rcx
        movapd  %xmm6, %xmm0
        subsd   %xmm14, %xmm6
        movsd   %xmm2, 7(%rcx,%r12,2)
        movq    (%r10), %r12
        movq    (%rax), %rcx
        addsd   %xmm14, %xmm0
        movsd   %xmm8, 7(%rcx,%r12,2)
        movq    (%rsi), %r12
        movq    (%rax), %rcx
        movsd   %xmm0, 7(%rcx,%r12,2)
        movapd  %xmm5, %xmm0
        movq    (%r9), %r12
        movq    (%rax), %rcx
        subsd   %xmm4, %xmm5
        addsd   %xmm4, %xmm0
        movsd   %xmm0, 7(%rcx,%r12,2)
        movq    (%rbp), %r12
        movq    (%rax), %rcx
        movsd   %xmm6, 7(%rcx,%r12,2)
        movq    (%r15), %r12
        movq    (%rax), %rcx
        movsd   %xmm5, 7(%rcx,%r12,2)
        cmpq    %rdx, -104(%rsp)
        jg      .L2958
        movq    %r14, -104(%rsp)

With this compiler

/pkgs/gcc-mainline/bin/gcc -v
Using built-in specs.
Target: x86_64-unknown-linux-gnu
Configured with: /tmp/lucier/gcc/mainline/configure --enable-checking=release --prefix=/pkgs/gcc-mainline --enable-languages=c --enable-gather-detailed-mem-stats
Thread model: posix
gcc version 4.4.0 20090313 (experimental) [trunk revision 144829] (GCC) 

one gets a time of

    212 ms cpu time (212 user, 0 system)

and the assembly language for the main loop is

.L2946:
        movq    %rbx, %rdx
        addq    (%r11), %rdx
        leaq    4(%rbx), %rbp
        movq    %rdx, (%rsi)
        addq    $4, %rdx
        movq    %rdx, (%r10)
        movq    (%r11), %rdx
        addq    (%rsi), %rdx
        movq    %rdx, (%rcx)
        addq    $4, %rdx
        movq    %rdx, (%r9)
        movq    (%r11), %r13
        addq    (%rcx), %r13
        movq    %r13, (%r8)
        addq    $4, %r13
        movq    %r13, (%r15)
        movq    (%rax), %rdx
        addq    $7, %rdx
        movsd   (%rdx,%r13,2), %xmm0
        leaq    (%rdx,%rbx,2), %r14
        addq    $8, %rbx
        movsd   %xmm0, -48(%rsp)
        movq    (%r8), %r13
        movsd   (%rdx,%r13,2), %xmm0
        movsd   %xmm0, -56(%rsp)
        movq    (%r9), %r13
        movsd   (%rdx,%r13,2), %xmm0
        movsd   %xmm0, -64(%rsp)
        movq    (%rcx), %r13
        movsd   (%rdx,%r13,2), %xmm0
        movsd   %xmm0, -72(%rsp)
        movq    (%r10), %r13
        movsd   (%rdx,%r13,2), %xmm0
        movsd   %xmm0, -80(%rsp)
        movq    (%rsi), %r13
        movsd   (%rdx,%r13,2), %xmm0
        leaq    (%rbp,%rbp), %r13
        movsd   %xmm0, -104(%rsp)
        movsd   (%rdx,%r13), %xmm0
        movsd   %xmm0, -88(%rsp)
        movq    24(%rdi), %rdx
        movsd   31(%rdx), %xmm0
        movsd   %xmm0, -32(%rsp)
        movsd   23(%rdx), %xmm0
        movsd   %xmm0, -40(%rsp)
        movsd   15(%rdx), %xmm0
        movsd   %xmm0, -112(%rsp)
        movsd   7(%rdx), %xmm0
        movsd   %xmm0, -120(%rsp)
        movapd  %xmm0, %xmm1
        movsd   -112(%rsp), %xmm0
        mulsd   -48(%rsp), %xmm1
        mulsd   -56(%rsp), %xmm0
        addsd   %xmm0, %xmm1
        movsd   -112(%rsp), %xmm0
        mulsd   -48(%rsp), %xmm0
        movsd   %xmm1, -8(%rsp)
        movsd   -120(%rsp), %xmm1
        mulsd   -56(%rsp), %xmm1
        subsd   %xmm0, %xmm1
        movsd   -112(%rsp), %xmm0
        mulsd   -72(%rsp), %xmm0
        movsd   %xmm1, -16(%rsp)
        movsd   -120(%rsp), %xmm1
        mulsd   -64(%rsp), %xmm1
        addsd   %xmm0, %xmm1
        movsd   -112(%rsp), %xmm0
        mulsd   -64(%rsp), %xmm0
        movsd   %xmm1, -24(%rsp)
        movsd   -120(%rsp), %xmm1
        mulsd   -72(%rsp), %xmm1
        subsd   %xmm0, %xmm1
        movsd   -80(%rsp), %xmm0
        subsd   -8(%rsp), %xmm0
        movsd   %xmm1, -120(%rsp)
        movsd   %xmm0, -48(%rsp)
        movsd   -104(%rsp), %xmm0
        subsd   -16(%rsp), %xmm0
        movsd   %xmm0, -112(%rsp)
        movsd   -88(%rsp), %xmm0
        subsd   -24(%rsp), %xmm0
        movsd   %xmm0, -56(%rsp)
        movsd   (%r14), %xmm0
        subsd   %xmm1, %xmm0
        movsd   %xmm0, -64(%rsp)
        movsd   -80(%rsp), %xmm0
        addsd   -8(%rsp), %xmm0
        movsd   %xmm0, -80(%rsp)
        movsd   -104(%rsp), %xmm0
        addsd   -16(%rsp), %xmm0
        movsd   %xmm0, -104(%rsp)
        movsd   -88(%rsp), %xmm0
        addsd   -24(%rsp), %xmm0
        movsd   %xmm0, -88(%rsp)
        movsd   (%r14), %xmm0
        addsd   %xmm1, %xmm0
        movsd   %xmm0, -96(%rsp)
        movsd   -32(%rsp), %xmm0
        xorpd   .LC5(%rip), %xmm0
        movsd   %xmm0, -120(%rsp)
        movapd  %xmm0, %xmm1
        movsd   -40(%rsp), %xmm0
        mulsd   -48(%rsp), %xmm1
        mulsd   -112(%rsp), %xmm0
        addsd   %xmm0, %xmm1
        movsd   -40(%rsp), %xmm0
        mulsd   -48(%rsp), %xmm0
        movsd   %xmm1, -72(%rsp)
        movsd   -120(%rsp), %xmm1
        mulsd   -112(%rsp), %xmm1
        subsd   %xmm0, %xmm1
        movsd   -32(%rsp), %xmm0
        mulsd   -104(%rsp), %xmm0
        movsd   %xmm1, -112(%rsp)
        movsd   -40(%rsp), %xmm1
        mulsd   -80(%rsp), %xmm1
        addsd   %xmm0, %xmm1
        movsd   -32(%rsp), %xmm0
        mulsd   -80(%rsp), %xmm0
        movsd   %xmm1, -120(%rsp)
        movsd   -40(%rsp), %xmm1
        mulsd   -104(%rsp), %xmm1
        subsd   %xmm0, %xmm1
        movsd   %xmm1, -104(%rsp)
        movsd   -96(%rsp), %xmm0
        addsd   %xmm1, %xmm0
        movsd   %xmm0, (%r14)
        movq    (%rax), %rdx
        movsd   -88(%rsp), %xmm0
        addsd   -120(%rsp), %xmm0
        movsd   %xmm0, 7(%r13,%rdx)
        movq    (%rsi), %r13
        movq    (%rax), %rdx
        movsd   -96(%rsp), %xmm0
        subsd   -104(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        movq    (%r10), %r13
        movq    (%rax), %rdx
        movsd   -88(%rsp), %xmm0
        subsd   -120(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        movq    (%rcx), %r13
        movq    (%rax), %rdx
        movsd   -64(%rsp), %xmm0
        addsd   -112(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        movq    (%r9), %r13
        movq    (%rax), %rdx
        movsd   -56(%rsp), %xmm0
        addsd   -72(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        movq    (%r8), %r13
        movq    (%rax), %rdx
        movsd   -64(%rsp), %xmm0
        subsd   -112(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        movq    (%r15), %r13
        movq    (%rax), %rdx
        movsd   -56(%rsp), %xmm0
        subsd   -72(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        cmpq    %rbx, (%rsp)
        jg      .L2946
        movq    %rbp, (%rsp)

I'm reporting this separately because it doesn't have the same cause as the previous PR 33928.

BTW, with 4.2.4 this test runs in 108 ms on this machine, hence the total regression amount noted in the subject line.  This part itself causes about a 60% performance regression; the rest is accounted for by

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928

Brad
Comment 1 Uroš Bizjak 2009-04-26 18:43:20 UTC
There are a couple of possible candidates in this range:

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=144812
Log:
2009-03-12  Vladimir Makarov  <vmakarov@redhat.com>

	PR debug/39432
	* ira-int.h (struct allocno): Fix comment for calls_crossed_num.
	* ira-conflicts.c (ira_build_conflicts): Prohibit call used
	registers for allocnos created from user-defined variables.

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=144817
Log:
2009-03-12  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/38824
	* config/i386/i386.md: Compare REGNO on the new peephole2
	patterns.

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=144823
Log:
gcc/

2009-03-12  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/39445
	* config/i386/i386.c (ix86_expand_push): Don't set memory
	alignment.
Comment 2 Uroš Bizjak 2009-04-27 08:16:20 UTC
(In reply to comment #0)

> (same .i file, same instructions for reproducing, same compiler options, same
> everything)

I guess that this is direct.i compiled with -O1?

Trunk, revision: 146825 -O1 on x86_64 linux gives:

.L27:
	leaq	4(%rbx), %rbp
	movq	%rbx, %rdx
	addq	(%r11), %rdx
	movq	%rdx, (%rsi)
	addq	$4, %rdx
	movq	%rdx, (%r10)
	movq	(%r11), %rdx
	addq	(%rsi), %rdx
	movq	%rdx, (%rcx)
	addq	$4, %rdx
	movq	%rdx, (%r9)
	movq	(%r11), %r12
	addq	(%rcx), %r12
	movq	%r12, (%r8)
	addq	$4, %r12
	movq	%r12, (%r15)
	movq	(%rax), %rdx
	addq	$7, %rdx
	movsd	(%rdx,%r12,2), %xmm2
	movsd	%xmm2, -96(%rsp)
	movq	(%r8), %r12
	movsd	(%rdx,%r12,2), %xmm2
	movsd	%xmm2, -64(%rsp)
	movq	(%r9), %r12
	movsd	(%rdx,%r12,2), %xmm2
	movsd	%xmm2, -56(%rsp)
	movq	(%rcx), %r12
	movsd	(%rdx,%r12,2), %xmm2
	movsd	%xmm2, -48(%rsp)
	movq	(%r10), %r12
	movsd	(%rdx,%r12,2), %xmm2
	movsd	%xmm2, -104(%rsp)
	movq	(%rsi), %r12
	movsd	(%rdx,%r12,2), %xmm2
	movsd	%xmm2, -88(%rsp)
	leaq	(%rbp,%rbp), %r12
	movsd	(%r12,%rdx), %xmm2
	movsd	%xmm2, -80(%rsp)
	leaq	(%rdx,%rbx,2), %r14
	movq	24(%rdi), %rdx
	movsd	31(%rdx), %xmm2
	movsd	%xmm2, -32(%rsp)
	movsd	23(%rdx), %xmm2
	movsd	%xmm2, -40(%rsp)
	movsd	15(%rdx), %xmm2
	movsd	%xmm2, -120(%rsp)
	movsd	7(%rdx), %xmm2
	movsd	%xmm2, -112(%rsp)
	movapd	%xmm2, %xmm3
	mulsd	-96(%rsp), %xmm3
	movsd	-120(%rsp), %xmm2
	mulsd	-64(%rsp), %xmm2
	addsd	%xmm2, %xmm3
	movsd	%xmm3, -24(%rsp)
	movsd	-112(%rsp), %xmm3
	mulsd	-64(%rsp), %xmm3
	movsd	-120(%rsp), %xmm2
	mulsd	-96(%rsp), %xmm2
	subsd	%xmm2, %xmm3
	movsd	%xmm3, -96(%rsp)
	movsd	-112(%rsp), %xmm3
	mulsd	-56(%rsp), %xmm3
	movsd	-120(%rsp), %xmm2
	mulsd	-48(%rsp), %xmm2
	addsd	%xmm2, %xmm3
	movsd	%xmm3, -64(%rsp)
	movsd	-112(%rsp), %xmm3
	mulsd	-48(%rsp), %xmm3
	movsd	-120(%rsp), %xmm2
	mulsd	-56(%rsp), %xmm2
	subsd	%xmm2, %xmm3
	movsd	%xmm3, -120(%rsp)
	movsd	-104(%rsp), %xmm2
	subsd	-24(%rsp), %xmm2
	movsd	%xmm2, -112(%rsp)
	movsd	-88(%rsp), %xmm2
	subsd	-96(%rsp), %xmm2
	movsd	%xmm2, -56(%rsp)
	movsd	-80(%rsp), %xmm2
	subsd	-64(%rsp), %xmm2
	movsd	%xmm2, -48(%rsp)
	movsd	(%r14), %xmm2
	subsd	%xmm3, %xmm2
	movsd	%xmm2, -16(%rsp)
	movsd	-104(%rsp), %xmm2
	addsd	-24(%rsp), %xmm2
	movsd	%xmm2, -104(%rsp)
	movsd	-88(%rsp), %xmm2
	addsd	-96(%rsp), %xmm2
	movsd	%xmm2, -88(%rsp)
	movsd	-80(%rsp), %xmm2
	addsd	-64(%rsp), %xmm2
	movsd	%xmm2, -80(%rsp)
	movsd	(%r14), %xmm2
	addsd	%xmm3, %xmm2
	movsd	%xmm2, -72(%rsp)
	movsd	-32(%rsp), %xmm2
	xorpd	%xmm0, %xmm2
	movsd	%xmm2, -120(%rsp)
	movapd	%xmm2, %xmm3
	mulsd	-112(%rsp), %xmm3
	movsd	-40(%rsp), %xmm2
	mulsd	-56(%rsp), %xmm2
	addsd	%xmm2, %xmm3
	movsd	%xmm3, -96(%rsp)
	movsd	-120(%rsp), %xmm3
	mulsd	-56(%rsp), %xmm3
	movsd	-40(%rsp), %xmm2
	mulsd	-112(%rsp), %xmm2
	subsd	%xmm2, %xmm3
	movsd	%xmm3, -120(%rsp)
	movsd	-40(%rsp), %xmm3
	mulsd	-104(%rsp), %xmm3
	movsd	-32(%rsp), %xmm2
	mulsd	-88(%rsp), %xmm2
	addsd	%xmm2, %xmm3
	movsd	%xmm3, -112(%rsp)
	movsd	-40(%rsp), %xmm3
	mulsd	-88(%rsp), %xmm3
	movsd	-32(%rsp), %xmm2
	mulsd	-104(%rsp), %xmm2
	subsd	%xmm2, %xmm3
	movsd	%xmm3, -104(%rsp)
	movsd	-72(%rsp), %xmm2
	addsd	%xmm3, %xmm2
	movsd	%xmm2, (%r14)
	movq	(%rax), %rdx
	movsd	-80(%rsp), %xmm2
	addsd	-112(%rsp), %xmm2
	movsd	%xmm2, 7(%r12,%rdx)
	movq	(%rsi), %r12
	movq	(%rax), %rdx
	movsd	-72(%rsp), %xmm2
	subsd	-104(%rsp), %xmm2
	movsd	%xmm2, 7(%rdx,%r12,2)
	movq	(%r10), %r12
	movq	(%rax), %rdx
	movsd	-80(%rsp), %xmm2
	subsd	-112(%rsp), %xmm2
	movsd	%xmm2, 7(%rdx,%r12,2)
	movq	(%rcx), %r12
	movq	(%rax), %rdx
	movsd	-16(%rsp), %xmm2
	addsd	-120(%rsp), %xmm2
	movsd	%xmm2, 7(%rdx,%r12,2)
	movq	(%r9), %r12
	movq	(%rax), %rdx
	movsd	-48(%rsp), %xmm2
	addsd	-96(%rsp), %xmm2
	movsd	%xmm2, 7(%rdx,%r12,2)
	movq	(%r8), %r12
	movq	(%rax), %rdx
	movsd	-16(%rsp), %xmm2
	subsd	-120(%rsp), %xmm2
	movsd	%xmm2, 7(%rdx,%r12,2)
	movq	(%r15), %r12
	movq	(%rax), %rdx
	movsd	-48(%rsp), %xmm2
	subsd	-96(%rsp), %xmm2
	movsd	%xmm2, 7(%rdx,%r12,2)
	addq	$8, %rbx
	cmpq	%rbx, -8(%rsp)
	jg	.L27

The code above looks similar to your gcc version 4.4.0 20090313 code.

Using -O2, I get:

.L27:
	movq	-96(%rsp), %r14
	leaq	(%rax,%rcx,2), %rdi
	leaq	-8(%rax,%rcx,2), %rbp
	leaq	(%rax,%rsi,2), %r8
	leaq	-8(%rax,%rsi,2), %r9
	leaq	8(%rax,%rdx,2), %r12
	movsd	(%rdi), %xmm2
	leaq	8(%rax,%rbx,2), %r10
	movsd	(%r14), %xmm4
	movq	-88(%rsp), %r14
	movsd	(%rbp), %xmm6
	leaq	(%rax,%rbx,2), %r11
	movsd	(%r8), %xmm9
	leaq	(%rax,%rdx,2), %r13
	movsd	(%r14), %xmm1
	movq	-120(%rsp), %r14
	movsd	(%r9), %xmm10
	movq	%rcx, -80(%rsp)
	movapd	%xmm1, %xmm14
	addq	$8, %rdx
	movsd	(%r14), %xmm5
	addq	$8, %rcx
	mulsd	%xmm6, %xmm14
	addq	$8, %rsi
	addq	$8, %rbx
	movapd	%xmm5, %xmm7
	mulsd	%xmm5, %xmm6
	movsd	(%r12), %xmm11
	cmpq	%rdx, -112(%rsp)
	mulsd	%xmm2, %xmm7
	mulsd	%xmm1, %xmm2
	movsd	(%r15), %xmm8
	movsd	(%r11), %xmm3
	addsd	%xmm14, %xmm7
	movapd	%xmm1, %xmm14
	subsd	%xmm2, %xmm6
	movapd	%xmm5, %xmm2
	mulsd	%xmm10, %xmm14
	mulsd	%xmm9, %xmm2
	mulsd	%xmm9, %xmm1
	movapd	%xmm11, %xmm9
	mulsd	%xmm10, %xmm5
	movsd	(%r10), %xmm15
	addsd	%xmm14, %xmm2
	movsd	(%r13), %xmm0
	movapd	%xmm15, %xmm14
	subsd	%xmm1, %xmm5
	movapd	%xmm3, %xmm1
	subsd	%xmm7, %xmm14
	movapd	%xmm0, %xmm10
	subsd	%xmm2, %xmm9
	addsd	%xmm2, %xmm11
	movapd	%xmm8, %xmm2
	subsd	%xmm6, %xmm1
	xorpd	%xmm12, %xmm2
	subsd	%xmm5, %xmm10
	addsd	%xmm3, %xmm6
	movapd	%xmm4, %xmm3
	addsd	%xmm0, %xmm5
	movapd	%xmm2, %xmm0
	mulsd	%xmm1, %xmm3
	addsd	%xmm15, %xmm7
	mulsd	%xmm2, %xmm1
	mulsd	%xmm14, %xmm0
	movapd	%xmm4, %xmm2
	mulsd	%xmm4, %xmm14
	mulsd	%xmm7, %xmm2
	addsd	%xmm3, %xmm0
	movapd	%xmm8, %xmm3
	mulsd	%xmm8, %xmm7
	subsd	%xmm14, %xmm1
	mulsd	%xmm6, %xmm3
	addsd	%xmm3, %xmm2
	movapd	%xmm4, %xmm3
	movapd	%xmm5, %xmm4
	mulsd	%xmm6, %xmm3
	subsd	%xmm7, %xmm3
	addsd	%xmm3, %xmm4
	subsd	%xmm3, %xmm5
	movsd	%xmm4, (%r13)
	movapd	%xmm11, %xmm4
	subsd	%xmm2, %xmm11
	addsd	%xmm2, %xmm4
	movapd	%xmm10, %xmm2
	subsd	%xmm1, %xmm10
	addsd	%xmm1, %xmm2
	movsd	%xmm4, (%r12)
	movsd	%xmm5, (%r11)
	movsd	%xmm11, (%r10)
	movsd	%xmm2, (%r9)
	movapd	%xmm9, %xmm2
	subsd	%xmm0, %xmm9
	addsd	%xmm0, %xmm2
	movsd	%xmm2, (%r8)
	movsd	%xmm10, (%rbp)
	movsd	%xmm9, (%rdi)
	jg	.L27

It is not clear from your report, if -O1 flag is problematic, -O2 code looks good to me.
Comment 3 lucier 2009-04-27 15:07:37 UTC
Subject: Re:  96% performance regression in floating
 point code; part of the problem started 2009/03/12-13

On Sun, 2009-04-26 at 18:43 +0000, ubizjak at gmail dot com wrote:
> 
> 
> ------- Comment #1 from ubizjak at gmail dot com  2009-04-26 18:43 -------
> There are a couple of possible candidates in this range:
> 
> URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=144812
> Log:
> 2009-03-12  Vladimir Makarov  <vmakarov@redhat.com>
> 
>         PR debug/39432
>         * ira-int.h (struct allocno): Fix comment for calls_crossed_num.
>         * ira-conflicts.c (ira_build_conflicts): Prohibit call used
>         registers for allocnos created from user-defined variables.

The problem exists in 

gcc version 4.4.0 20090312 (experimental) [trunk revision 144812] (GCC) 

So perhaps it's this checkin.

Brad

Comment 4 lucier 2009-04-27 15:11:36 UTC
Subject: Re:  96% performance regression in floating
 point code; part of the problem started 2009/03/12-13

On Mon, 2009-04-27 at 08:16 +0000, ubizjak at gmail dot com wrote:
> 
> 
> ------- Comment #2 from ubizjak at gmail dot com  2009-04-27 08:16 -------
> (In reply to comment #0)
> 
> > (same .i file, same instructions for reproducing, same compiler options, same
> > everything)
> 
> I guess that this is direct.i compiled with -O1?
> 

Yes, the compile flags are

-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp

> It is not clear from your report, if -O1 flag is problematic, -O2 code looks
> good to me.

Yes, the -O2 code looks good to me, too.

I've used the above list of options (starting with -O1) on this code
instead of -O2 because the above list (a) has generally given faster
performance, and (b) has required much less compile time and memory to
compile the C code generated by the Gambit Scheme->C compiler.  I have
not yet seen any evidence that -O2 generates better code (overall) than
those set of options above.

Brad

Comment 5 Andrew Pinski 2009-04-27 15:26:09 UTC
This is by design -O1 is way slower than -O2 now.
Comment 6 lucier 2009-04-27 15:32:28 UTC
Subject: Re:  96% performance regression in floating
 point code; part of the problem started 2009/03/12-13

On Mon, 2009-04-27 at 15:26 +0000, pinskia at gcc dot gnu dot org wrote:

> This is by design -O1 is way slower than -O2 now.

I have seen no general discussion that -O1 should be destroyed as a
useful compilation option.

Comment 7 lucier 2009-04-27 15:35:33 UTC
Subject: Re:  96% performance regression in floating
 point code; part of the problem started 2009/03/12-13

On Mon, 2009-04-27 at 15:32 +0000, lucier at math dot purdue dot edu
wrote:


> On Mon, 2009-04-27 at 15:26 +0000, pinskia at gcc dot gnu dot org wrote:
> 
> > This is by design -O1 is way slower than -O2 now.
> 
> I have seen no general discussion that -O1 should be destroyed as a
> useful compilation option.

Perhaps I should also point out that code generated by -O2 is not
generally much faster than before, so if you believe that -O1 is much
slower than -O2 now by design, it is only by making code generated by
-O1 much slower.

BTW, this code runs in 108 ms when compiled with gcc-4.2.4 with the
given options (including -O1).

Brad

Comment 8 lucier 2009-04-27 16:29:37 UTC
I hadn't noticed before that Andrew had marked it as "RESOLVED INVALID".

I'm reopening it, as I believe that resolving it as INVALID should require a more general discussion than a one-line dismissal of the bug.

Brad
Comment 9 Uroš Bizjak 2009-04-27 18:21:03 UTC
Following patch should fix the performance hit with -O1:

--cut here--
Index: ira-conflicts.c
===================================================================
--- ira-conflicts.c	(revision 146825)
+++ ira-conflicts.c	(working copy)
@@ -806,7 +806,7 @@ ira_build_conflicts (void)
       if ((! flag_caller_saves && ALLOCNO_CALLS_CROSSED_NUM (a) != 0)
 	  /* For debugging purposes don't put user defined variables in
 	     callee-clobbered registers.  */
-	  || (optimize <= 1
+	  || (optimize == 0
 	      && (attrs = REG_ATTRS (regno_reg_rtx [ALLOCNO_REGNO (a)])) != NULL
 	      && (decl = attrs->decl) != NULL
 	      && VAR_OR_FUNCTION_DECL_P (decl)
--cut here--

IMO, such a performance hit is not acceptable with -O1, we want to _optimize_ the code, we have -O0 to achieve full debug functionality.
Comment 10 Paolo Bonzini 2009-04-27 19:04:12 UTC
Yeah, it's basically destroying caller-save optimization.
Comment 11 lucier 2009-04-27 20:37:58 UTC
As far as I can tell, the patch proposed by Uros restores the performance of code generated by

gcc version 4.4.0 20090312 (experimental) [trunk revision 144812] (GCC) 

In particular, the assembly code for the main loop is identical for code generated by

gcc version 4.4.0 20090312 (experimental) [trunk revision 144801] (GCC) 

and by

gcc version 4.4.0 20090312 (experimental) [trunk revision 144812] (GCC) 

after his patch.

Thanks for getting to this so quickly.

Brad
Comment 12 lucier 2009-04-28 01:39:41 UTC
I tried to build and check with this patch, but I got stopped with:

/tmp/lucier/gcc/objdirs/mainline/./prev-gcc/xgcc -B/tmp/lucier/gcc/objdirs/mainline/./prev-gcc/ -B/pkgs/gcc-mainline/x86_64-unknown-linux-gnu/bin/ -c  -g -O2 -DIN_GCC   -W -Wall -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes -Wcast-qual -Wold-style-definition -Wc++-compat -Wmissing-format-attribute -pedantic -Wno-long-long -Wno-variadic-macros -Wno-overlength-strings -Werror -fno-common  -DHAVE_CONFIG_H -DGENERATOR_FILE -I. -Ibuild -I../../../mainline/gcc -I../../../mainline/gcc/build -I../../../mainline/gcc/../include -I../../../mainline/gcc/../libcpp/include -I/tmp/lucier/gcc/objdirs/mainline/./gmp -I/tmp/lucier/gcc/mainline/gmp -I/tmp/lucier/gcc/objdirs/mainline/./mpfr -I/tmp/lucier/gcc/mainline/mpfr  -I../../../mainline/gcc/../libdecnumber -I../../../mainline/gcc/../libdecnumber/bid -I../libdecnumber    -o build/vec.o ../../../mainline/gcc/vec.c
cc1: warnings being treated as errors
../../../mainline/gcc/vec.c: In function ‘vec_descriptor’:
../../../mainline/gcc/vec.c:116: error: enum conversion when passing argument 3 of ‘htab_find_slot’ is invalid in C++
../../../mainline/gcc/../include/hashtab.h:172: note: expected ‘enum insert_option’ but argument is of type ‘int’
make[3]: *** [build/vec.o] Error 1
Comment 13 uros 2009-04-28 16:18:44 UTC
Subject: Bug 39914

Author: uros
Date: Tue Apr 28 16:18:17 2009
New Revision: 146904

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=146904
Log:
	PR rtl-optimization/39914
	* ira-conflicts.c (ira_build_conflicts): Prohibit call used
	registers for allocnos created from user-defined variables only
	when not optimizing.


Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/ira-conflicts.c

Comment 14 Uroš Bizjak 2009-04-28 16:19:32 UTC
Fixed on the trunk so far.
Comment 15 uros 2009-05-03 19:40:46 UTC
Subject: Bug 39914

Author: uros
Date: Sun May  3 19:40:35 2009
New Revision: 147081

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=147081
Log:
	Backport from mainline:
	2009-04-28  Uros Bizjak  <ubizjak@gmail.com>

	PR rtl-optimization/39914
	* ira-conflicts.c (ira_build_conflicts): Prohibit call used
	registers for allocnos created from user-defined variables only
	when not optimizing.


Modified:
    branches/gcc-4_4-branch/gcc/ChangeLog
    branches/gcc-4_4-branch/gcc/ira-conflicts.c

Comment 16 Uroš Bizjak 2009-05-03 19:41:28 UTC
Fixed.