With this compiler: gcc version 4.4.0 20090312 (experimental) [trunk revision 144801] (GCC) running the test in http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928 (same .i file, same instructions for reproducing, same compiler options, same everything) gives a time of 132 ms cpu time (132 user, 0 system) with assembly code in the main loop of .L2958: movq %rdx, %rcx addq (%r11), %rcx leaq 4(%rdx), %r14 movq %rcx, (%rdi) addq $4, %rcx movq %rcx, (%r10) movq (%r11), %rcx addq (%rdi), %rcx movq %rcx, (%rsi) addq $4, %rcx movq %rcx, (%r9) movq (%r11), %r12 addq (%rsi), %r12 movq %r12, (%rbp) addq $4, %r12 movq %r12, (%r15) movq (%rax), %rcx addq $7, %rcx movsd (%rcx,%r12,2), %xmm7 movq (%rbp), %r12 leaq (%rcx,%rdx,2), %r13 addq $8, %rdx movsd (%r13), %xmm4 movsd (%rcx,%r12,2), %xmm10 movq (%r9), %r12 movsd (%rcx,%r12,2), %xmm5 movq (%rsi), %r12 movsd (%rcx,%r12,2), %xmm6 movq (%r10), %r12 movsd (%rcx,%r12,2), %xmm13 movq (%rdi), %r12 movsd (%rcx,%r12,2), %xmm11 leaq (%r14,%r14), %r12 movsd (%rcx,%r12), %xmm9 movq 24(%r8), %rcx movapd %xmm11, %xmm14 movsd 15(%rcx), %xmm1 movsd 7(%rcx), %xmm2 movapd %xmm1, %xmm8 movsd 31(%rcx), %xmm3 movapd %xmm2, %xmm12 mulsd %xmm10, %xmm8 mulsd %xmm7, %xmm12 mulsd %xmm2, %xmm10 mulsd %xmm1, %xmm7 movsd 23(%rcx), %xmm0 addsd %xmm8, %xmm12 movapd %xmm2, %xmm8 mulsd %xmm6, %xmm2 subsd %xmm7, %xmm10 movapd %xmm1, %xmm7 mulsd %xmm5, %xmm1 mulsd %xmm6, %xmm7 movapd %xmm4, %xmm6 mulsd %xmm5, %xmm8 movapd %xmm9, %xmm5 subsd %xmm10, %xmm14 subsd %xmm1, %xmm2 movapd %xmm3, %xmm1 addsd %xmm11, %xmm10 xorpd .LC5(%rip), %xmm1 addsd %xmm7, %xmm8 movapd %xmm13, %xmm7 subsd %xmm2, %xmm6 subsd %xmm12, %xmm7 subsd %xmm8, %xmm5 addsd %xmm4, %xmm2 movapd %xmm0, %xmm4 addsd %xmm9, %xmm8 movapd %xmm1, %xmm9 mulsd %xmm14, %xmm4 addsd %xmm13, %xmm12 mulsd %xmm7, %xmm9 mulsd %xmm1, %xmm14 movapd %xmm3, %xmm1 mulsd %xmm0, %xmm7 mulsd %xmm10, %xmm1 mulsd %xmm0, %xmm10 addsd %xmm9, %xmm4 subsd %xmm7, %xmm14 movapd %xmm0, %xmm7 movapd %xmm2, %xmm0 mulsd %xmm12, 
%xmm7 mulsd %xmm3, %xmm12 addsd %xmm1, %xmm7 subsd %xmm12, %xmm10 addsd %xmm10, %xmm0 subsd %xmm10, %xmm2 movsd %xmm0, (%r13) movapd %xmm8, %xmm0 movq (%rax), %rcx subsd %xmm7, %xmm8 addsd %xmm7, %xmm0 movsd %xmm0, 7(%r12,%rcx) movq (%rdi), %r12 movq (%rax), %rcx movapd %xmm6, %xmm0 subsd %xmm14, %xmm6 movsd %xmm2, 7(%rcx,%r12,2) movq (%r10), %r12 movq (%rax), %rcx addsd %xmm14, %xmm0 movsd %xmm8, 7(%rcx,%r12,2) movq (%rsi), %r12 movq (%rax), %rcx movsd %xmm0, 7(%rcx,%r12,2) movapd %xmm5, %xmm0 movq (%r9), %r12 movq (%rax), %rcx subsd %xmm4, %xmm5 addsd %xmm4, %xmm0 movsd %xmm0, 7(%rcx,%r12,2) movq (%rbp), %r12 movq (%rax), %rcx movsd %xmm6, 7(%rcx,%r12,2) movq (%r15), %r12 movq (%rax), %rcx movsd %xmm5, 7(%rcx,%r12,2) cmpq %rdx, -104(%rsp) jg .L2958 movq %r14, -104(%rsp) With this compiler /pkgs/gcc-mainline/bin/gcc -v Using built-in specs. Target: x86_64-unknown-linux-gnu Configured with: /tmp/lucier/gcc/mainline/configure --enable-checking=release --prefix=/pkgs/gcc-mainline --enable-languages=c --enable-gather-detailed-mem-stats Thread model: posix gcc version 4.4.0 20090313 (experimental) [trunk revision 144829] (GCC) one gets a time of 212 ms cpu time (212 user, 0 system) and the assembly language for the main loop is .L2946: movq %rbx, %rdx addq (%r11), %rdx leaq 4(%rbx), %rbp movq %rdx, (%rsi) addq $4, %rdx movq %rdx, (%r10) movq (%r11), %rdx addq (%rsi), %rdx movq %rdx, (%rcx) addq $4, %rdx movq %rdx, (%r9) movq (%r11), %r13 addq (%rcx), %r13 movq %r13, (%r8) addq $4, %r13 movq %r13, (%r15) movq (%rax), %rdx addq $7, %rdx movsd (%rdx,%r13,2), %xmm0 leaq (%rdx,%rbx,2), %r14 addq $8, %rbx movsd %xmm0, -48(%rsp) movq (%r8), %r13 movsd (%rdx,%r13,2), %xmm0 movsd %xmm0, -56(%rsp) movq (%r9), %r13 movsd (%rdx,%r13,2), %xmm0 movsd %xmm0, -64(%rsp) movq (%rcx), %r13 movsd (%rdx,%r13,2), %xmm0 movsd %xmm0, -72(%rsp) movq (%r10), %r13 movsd (%rdx,%r13,2), %xmm0 movsd %xmm0, -80(%rsp) movq (%rsi), %r13 movsd (%rdx,%r13,2), %xmm0 leaq (%rbp,%rbp), %r13 movsd %xmm0, 
-104(%rsp) movsd (%rdx,%r13), %xmm0 movsd %xmm0, -88(%rsp) movq 24(%rdi), %rdx movsd 31(%rdx), %xmm0 movsd %xmm0, -32(%rsp) movsd 23(%rdx), %xmm0 movsd %xmm0, -40(%rsp) movsd 15(%rdx), %xmm0 movsd %xmm0, -112(%rsp) movsd 7(%rdx), %xmm0 movsd %xmm0, -120(%rsp) movapd %xmm0, %xmm1 movsd -112(%rsp), %xmm0 mulsd -48(%rsp), %xmm1 mulsd -56(%rsp), %xmm0 addsd %xmm0, %xmm1 movsd -112(%rsp), %xmm0 mulsd -48(%rsp), %xmm0 movsd %xmm1, -8(%rsp) movsd -120(%rsp), %xmm1 mulsd -56(%rsp), %xmm1 subsd %xmm0, %xmm1 movsd -112(%rsp), %xmm0 mulsd -72(%rsp), %xmm0 movsd %xmm1, -16(%rsp) movsd -120(%rsp), %xmm1 mulsd -64(%rsp), %xmm1 addsd %xmm0, %xmm1 movsd -112(%rsp), %xmm0 mulsd -64(%rsp), %xmm0 movsd %xmm1, -24(%rsp) movsd -120(%rsp), %xmm1 mulsd -72(%rsp), %xmm1 subsd %xmm0, %xmm1 movsd -80(%rsp), %xmm0 subsd -8(%rsp), %xmm0 movsd %xmm1, -120(%rsp) movsd %xmm0, -48(%rsp) movsd -104(%rsp), %xmm0 subsd -16(%rsp), %xmm0 movsd %xmm0, -112(%rsp) movsd -88(%rsp), %xmm0 subsd -24(%rsp), %xmm0 movsd %xmm0, -56(%rsp) movsd (%r14), %xmm0 subsd %xmm1, %xmm0 movsd %xmm0, -64(%rsp) movsd -80(%rsp), %xmm0 addsd -8(%rsp), %xmm0 movsd %xmm0, -80(%rsp) movsd -104(%rsp), %xmm0 addsd -16(%rsp), %xmm0 movsd %xmm0, -104(%rsp) movsd -88(%rsp), %xmm0 addsd -24(%rsp), %xmm0 movsd %xmm0, -88(%rsp) movsd (%r14), %xmm0 addsd %xmm1, %xmm0 movsd %xmm0, -96(%rsp) movsd -32(%rsp), %xmm0 xorpd .LC5(%rip), %xmm0 movsd %xmm0, -120(%rsp) movapd %xmm0, %xmm1 movsd -40(%rsp), %xmm0 mulsd -48(%rsp), %xmm1 mulsd -112(%rsp), %xmm0 addsd %xmm0, %xmm1 movsd -40(%rsp), %xmm0 mulsd -48(%rsp), %xmm0 movsd %xmm1, -72(%rsp) movsd -120(%rsp), %xmm1 mulsd -112(%rsp), %xmm1 subsd %xmm0, %xmm1 movsd -32(%rsp), %xmm0 mulsd -104(%rsp), %xmm0 movsd %xmm1, -112(%rsp) movsd -40(%rsp), %xmm1 mulsd -80(%rsp), %xmm1 addsd %xmm0, %xmm1 movsd -32(%rsp), %xmm0 mulsd -80(%rsp), %xmm0 movsd %xmm1, -120(%rsp) movsd -40(%rsp), %xmm1 mulsd -104(%rsp), %xmm1 subsd %xmm0, %xmm1 movsd %xmm1, -104(%rsp) movsd -96(%rsp), %xmm0 addsd %xmm1, %xmm0 movsd 
%xmm0, (%r14) movq (%rax), %rdx movsd -88(%rsp), %xmm0 addsd -120(%rsp), %xmm0 movsd %xmm0, 7(%r13,%rdx) movq (%rsi), %r13 movq (%rax), %rdx movsd -96(%rsp), %xmm0 subsd -104(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) movq (%r10), %r13 movq (%rax), %rdx movsd -88(%rsp), %xmm0 subsd -120(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) movq (%rcx), %r13 movq (%rax), %rdx movsd -64(%rsp), %xmm0 addsd -112(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) movq (%r9), %r13 movq (%rax), %rdx movsd -56(%rsp), %xmm0 addsd -72(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) movq (%r8), %r13 movq (%rax), %rdx movsd -64(%rsp), %xmm0 subsd -112(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) movq (%r15), %r13 movq (%rax), %rdx movsd -56(%rsp), %xmm0 subsd -72(%rsp), %xmm0 movsd %xmm0, 7(%rdx,%r13,2) cmpq %rbx, (%rsp) jg .L2946 movq %rbp, (%rsp) I'm reporting this separately because it doesn't have the same cause as the previous PR 33928. BTW, with 4.2.4 this test runs in 108 ms on this machine, hence the total regression amount noted in the subject line. This part itself causes about 60% performance regression, the rest is accounted for by http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928 Brad
There are a couple of possible candidates in this range: URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=144812 Log: 2009-03-12 Vladimir Makarov <vmakarov@redhat.com> PR debug/39432 * ira-int.h (struct allocno): Fix comment for calls_crossed_num. * ira-conflicts.c (ira_build_conflicts): Prohibit call used registers for allocnos created from user-defined variables. URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=144817 Log: 2009-03-12 H.J. Lu <hongjiu.lu@intel.com> PR target/38824 * config/i386/i386.md: Compare REGNO on the new peephole2 patterns. URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=144823 Log: gcc/ 2009-03-12 H.J. Lu <hongjiu.lu@intel.com> PR target/39445 * config/i386/i386.c (ix86_expand_push): Don't set memory alignment.
(In reply to comment #0) > (same .i file, same instructions for reproducing, same compiler options, same > everything) I guess that this is direct.i compiled with -O1? Trunk, revision: 146825 -O1 on x86_64 linux gives: .L27: leaq 4(%rbx), %rbp movq %rbx, %rdx addq (%r11), %rdx movq %rdx, (%rsi) addq $4, %rdx movq %rdx, (%r10) movq (%r11), %rdx addq (%rsi), %rdx movq %rdx, (%rcx) addq $4, %rdx movq %rdx, (%r9) movq (%r11), %r12 addq (%rcx), %r12 movq %r12, (%r8) addq $4, %r12 movq %r12, (%r15) movq (%rax), %rdx addq $7, %rdx movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -96(%rsp) movq (%r8), %r12 movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -64(%rsp) movq (%r9), %r12 movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -56(%rsp) movq (%rcx), %r12 movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -48(%rsp) movq (%r10), %r12 movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -104(%rsp) movq (%rsi), %r12 movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -88(%rsp) leaq (%rbp,%rbp), %r12 movsd (%r12,%rdx), %xmm2 movsd %xmm2, -80(%rsp) leaq (%rdx,%rbx,2), %r14 movq 24(%rdi), %rdx movsd 31(%rdx), %xmm2 movsd %xmm2, -32(%rsp) movsd 23(%rdx), %xmm2 movsd %xmm2, -40(%rsp) movsd 15(%rdx), %xmm2 movsd %xmm2, -120(%rsp) movsd 7(%rdx), %xmm2 movsd %xmm2, -112(%rsp) movapd %xmm2, %xmm3 mulsd -96(%rsp), %xmm3 movsd -120(%rsp), %xmm2 mulsd -64(%rsp), %xmm2 addsd %xmm2, %xmm3 movsd %xmm3, -24(%rsp) movsd -112(%rsp), %xmm3 mulsd -64(%rsp), %xmm3 movsd -120(%rsp), %xmm2 mulsd -96(%rsp), %xmm2 subsd %xmm2, %xmm3 movsd %xmm3, -96(%rsp) movsd -112(%rsp), %xmm3 mulsd -56(%rsp), %xmm3 movsd -120(%rsp), %xmm2 mulsd -48(%rsp), %xmm2 addsd %xmm2, %xmm3 movsd %xmm3, -64(%rsp) movsd -112(%rsp), %xmm3 mulsd -48(%rsp), %xmm3 movsd -120(%rsp), %xmm2 mulsd -56(%rsp), %xmm2 subsd %xmm2, %xmm3 movsd %xmm3, -120(%rsp) movsd -104(%rsp), %xmm2 subsd -24(%rsp), %xmm2 movsd %xmm2, -112(%rsp) movsd -88(%rsp), %xmm2 subsd -96(%rsp), %xmm2 movsd %xmm2, -56(%rsp) movsd -80(%rsp), %xmm2 subsd -64(%rsp), %xmm2 movsd %xmm2, -48(%rsp) movsd (%r14), %xmm2 subsd %xmm3, 
%xmm2 movsd %xmm2, -16(%rsp) movsd -104(%rsp), %xmm2 addsd -24(%rsp), %xmm2 movsd %xmm2, -104(%rsp) movsd -88(%rsp), %xmm2 addsd -96(%rsp), %xmm2 movsd %xmm2, -88(%rsp) movsd -80(%rsp), %xmm2 addsd -64(%rsp), %xmm2 movsd %xmm2, -80(%rsp) movsd (%r14), %xmm2 addsd %xmm3, %xmm2 movsd %xmm2, -72(%rsp) movsd -32(%rsp), %xmm2 xorpd %xmm0, %xmm2 movsd %xmm2, -120(%rsp) movapd %xmm2, %xmm3 mulsd -112(%rsp), %xmm3 movsd -40(%rsp), %xmm2 mulsd -56(%rsp), %xmm2 addsd %xmm2, %xmm3 movsd %xmm3, -96(%rsp) movsd -120(%rsp), %xmm3 mulsd -56(%rsp), %xmm3 movsd -40(%rsp), %xmm2 mulsd -112(%rsp), %xmm2 subsd %xmm2, %xmm3 movsd %xmm3, -120(%rsp) movsd -40(%rsp), %xmm3 mulsd -104(%rsp), %xmm3 movsd -32(%rsp), %xmm2 mulsd -88(%rsp), %xmm2 addsd %xmm2, %xmm3 movsd %xmm3, -112(%rsp) movsd -40(%rsp), %xmm3 mulsd -88(%rsp), %xmm3 movsd -32(%rsp), %xmm2 mulsd -104(%rsp), %xmm2 subsd %xmm2, %xmm3 movsd %xmm3, -104(%rsp) movsd -72(%rsp), %xmm2 addsd %xmm3, %xmm2 movsd %xmm2, (%r14) movq (%rax), %rdx movsd -80(%rsp), %xmm2 addsd -112(%rsp), %xmm2 movsd %xmm2, 7(%r12,%rdx) movq (%rsi), %r12 movq (%rax), %rdx movsd -72(%rsp), %xmm2 subsd -104(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) movq (%r10), %r12 movq (%rax), %rdx movsd -80(%rsp), %xmm2 subsd -112(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) movq (%rcx), %r12 movq (%rax), %rdx movsd -16(%rsp), %xmm2 addsd -120(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) movq (%r9), %r12 movq (%rax), %rdx movsd -48(%rsp), %xmm2 addsd -96(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) movq (%r8), %r12 movq (%rax), %rdx movsd -16(%rsp), %xmm2 subsd -120(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) movq (%r15), %r12 movq (%rax), %rdx movsd -48(%rsp), %xmm2 subsd -96(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) addq $8, %rbx cmpq %rbx, -8(%rsp) jg .L27 The code above looks similar to your gcc version 4.4.0 20090313 code. 
Using -O2, I get: .L27: movq -96(%rsp), %r14 leaq (%rax,%rcx,2), %rdi leaq -8(%rax,%rcx,2), %rbp leaq (%rax,%rsi,2), %r8 leaq -8(%rax,%rsi,2), %r9 leaq 8(%rax,%rdx,2), %r12 movsd (%rdi), %xmm2 leaq 8(%rax,%rbx,2), %r10 movsd (%r14), %xmm4 movq -88(%rsp), %r14 movsd (%rbp), %xmm6 leaq (%rax,%rbx,2), %r11 movsd (%r8), %xmm9 leaq (%rax,%rdx,2), %r13 movsd (%r14), %xmm1 movq -120(%rsp), %r14 movsd (%r9), %xmm10 movq %rcx, -80(%rsp) movapd %xmm1, %xmm14 addq $8, %rdx movsd (%r14), %xmm5 addq $8, %rcx mulsd %xmm6, %xmm14 addq $8, %rsi addq $8, %rbx movapd %xmm5, %xmm7 mulsd %xmm5, %xmm6 movsd (%r12), %xmm11 cmpq %rdx, -112(%rsp) mulsd %xmm2, %xmm7 mulsd %xmm1, %xmm2 movsd (%r15), %xmm8 movsd (%r11), %xmm3 addsd %xmm14, %xmm7 movapd %xmm1, %xmm14 subsd %xmm2, %xmm6 movapd %xmm5, %xmm2 mulsd %xmm10, %xmm14 mulsd %xmm9, %xmm2 mulsd %xmm9, %xmm1 movapd %xmm11, %xmm9 mulsd %xmm10, %xmm5 movsd (%r10), %xmm15 addsd %xmm14, %xmm2 movsd (%r13), %xmm0 movapd %xmm15, %xmm14 subsd %xmm1, %xmm5 movapd %xmm3, %xmm1 subsd %xmm7, %xmm14 movapd %xmm0, %xmm10 subsd %xmm2, %xmm9 addsd %xmm2, %xmm11 movapd %xmm8, %xmm2 subsd %xmm6, %xmm1 xorpd %xmm12, %xmm2 subsd %xmm5, %xmm10 addsd %xmm3, %xmm6 movapd %xmm4, %xmm3 addsd %xmm0, %xmm5 movapd %xmm2, %xmm0 mulsd %xmm1, %xmm3 addsd %xmm15, %xmm7 mulsd %xmm2, %xmm1 mulsd %xmm14, %xmm0 movapd %xmm4, %xmm2 mulsd %xmm4, %xmm14 mulsd %xmm7, %xmm2 addsd %xmm3, %xmm0 movapd %xmm8, %xmm3 mulsd %xmm8, %xmm7 subsd %xmm14, %xmm1 mulsd %xmm6, %xmm3 addsd %xmm3, %xmm2 movapd %xmm4, %xmm3 movapd %xmm5, %xmm4 mulsd %xmm6, %xmm3 subsd %xmm7, %xmm3 addsd %xmm3, %xmm4 subsd %xmm3, %xmm5 movsd %xmm4, (%r13) movapd %xmm11, %xmm4 subsd %xmm2, %xmm11 addsd %xmm2, %xmm4 movapd %xmm10, %xmm2 subsd %xmm1, %xmm10 addsd %xmm1, %xmm2 movsd %xmm4, (%r12) movsd %xmm5, (%r11) movsd %xmm11, (%r10) movsd %xmm2, (%r9) movapd %xmm9, %xmm2 subsd %xmm0, %xmm9 addsd %xmm0, %xmm2 movsd %xmm2, (%r8) movsd %xmm10, (%rbp) movsd %xmm9, (%rdi) jg .L27 It is not clear from your report, if 
the -O1 flag is what is problematic; the -O2 code looks good to me.
Subject: Re: 96% performance regression in floating point code; part of the problem started 2009/03/12-13 On Sun, 2009-04-26 at 18:43 +0000, ubizjak at gmail dot com wrote: > > > ------- Comment #1 from ubizjak at gmail dot com 2009-04-26 18:43 ------- > There are a couple of possible candidates in this range: > > URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=144812 > Log: > 2009-03-12 Vladimir Makarov <vmakarov@redhat.com> > > PR debug/39432 > * ira-int.h (struct allocno): Fix comment for calls_crossed_num. > * ira-conflicts.c (ira_build_conflicts): Prohibit call used > registers for allocnos created from user-defined variables. The problem exists in gcc version 4.4.0 20090312 (experimental) [trunk revision 144812] (GCC) So perhaps it's this checkin. Brad
Subject: Re: 96% performance regression in floating point code; part of the problem started 2009/03/12-13 On Mon, 2009-04-27 at 08:16 +0000, ubizjak at gmail dot com wrote: > > > ------- Comment #2 from ubizjak at gmail dot com 2009-04-27 08:16 ------- > (In reply to comment #0) > > > (same .i file, same instructions for reproducing, same compiler options, same > > everything) > > I guess that this is direct.i compiled with -O1? > Yes, the compile flags are -Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp > It is not clear from your report, if -O1 flag is problematic, -O2 code looks > good to me. Yes, the -O2 code looks good to me, too. I've used the above list of options (starting with -O1) on this code instead of -O2 because the above list (a) has generally given faster performance, and (b) has required much less compile time and memory to compile the C code generated by the Gambit Scheme->C compiler. I have not yet seen any evidence that -O2 generates better code (overall) than that set of options above. Brad
This is by design; -O1 is way slower than -O2 now.
Subject: Re: 96% performance regression in floating point code; part of the problem started 2009/03/12-13 On Mon, 2009-04-27 at 15:26 +0000, pinskia at gcc dot gnu dot org wrote: > This is by design -O1 is way slower than -O2 now. I have seen no general discussion that -O1 should be destroyed as a useful compilation option.
Subject: Re: 96% performance regression in floating point code; part of the problem started 2009/03/12-13 On Mon, 2009-04-27 at 15:32 +0000, lucier at math dot purdue dot edu wrote: > On Mon, 2009-04-27 at 15:26 +0000, pinskia at gcc dot gnu dot org wrote: > > > This is by design -O1 is way slower than -O2 now. > > I have seen no general discussion that -O1 should be destroyed as a > useful compilation option. Perhaps I should also point out that code generated by -O2 is not generally much faster than before, so if you believe that -O1 is much slower than -O2 now by design, it is only by making code generated by -O1 much slower. BTW, this code runs in 108 ms when compiled with gcc-4.2.4 with the given options (including -O1). Brad
I hadn't noticed before that Andrew had marked it as "RESOLVED INVALID". I'm reopening it, as I believe that resolving it as INVALID should require a more general discussion than a one-line dismissal of the bug. Brad
The following patch should fix the performance hit with -O1: --cut here-- Index: ira-conflicts.c =================================================================== --- ira-conflicts.c (revision 146825) +++ ira-conflicts.c (working copy) @@ -806,7 +806,7 @@ ira_build_conflicts (void) if ((! flag_caller_saves && ALLOCNO_CALLS_CROSSED_NUM (a) != 0) /* For debugging purposes don't put user defined variables in callee-clobbered registers. */ - || (optimize <= 1 + || (optimize == 0 && (attrs = REG_ATTRS (regno_reg_rtx [ALLOCNO_REGNO (a)])) != NULL && (decl = attrs->decl) != NULL && VAR_OR_FUNCTION_DECL_P (decl) --cut here-- IMO, such a performance hit is not acceptable with -O1: we want to _optimize_ the code, and we have -O0 to achieve full debug functionality.
Yeah, it's basically destroying caller-save optimization.
As far as I can tell, the patch proposed by Uros restores the performance of code generated by gcc version 4.4.0 20090312 (experimental) [trunk revision 144812] (GCC) In particular, the assembly code for the main loop is identical for code generated by gcc version 4.4.0 20090312 (experimental) [trunk revision 144801] (GCC) and by gcc version 4.4.0 20090312 (experimental) [trunk revision 144812] (GCC) after his patch. Thanks for getting to this so quickly. Brad
I tried to build and check with this patch, but I got stopped with: /tmp/lucier/gcc/objdirs/mainline/./prev-gcc/xgcc -B/tmp/lucier/gcc/objdirs/mainline/./prev-gcc/ -B/pkgs/gcc-mainline/x86_64-unknown-linux-gnu/bin/ -c -g -O2 -DIN_GCC -W -Wall -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes -Wcast-qual -Wold-style-definition -Wc++-compat -Wmissing-format-attribute -pedantic -Wno-long-long -Wno-variadic-macros -Wno-overlength-strings -Werror -fno-common -DHAVE_CONFIG_H -DGENERATOR_FILE -I. -Ibuild -I../../../mainline/gcc -I../../../mainline/gcc/build -I../../../mainline/gcc/../include -I../../../mainline/gcc/../libcpp/include -I/tmp/lucier/gcc/objdirs/mainline/./gmp -I/tmp/lucier/gcc/mainline/gmp -I/tmp/lucier/gcc/objdirs/mainline/./mpfr -I/tmp/lucier/gcc/mainline/mpfr -I../../../mainline/gcc/../libdecnumber -I../../../mainline/gcc/../libdecnumber/bid -I../libdecnumber -o build/vec.o ../../../mainline/gcc/vec.c cc1: warnings being treated as errors ../../../mainline/gcc/vec.c: In function ‘vec_descriptor’: ../../../mainline/gcc/vec.c:116: error: enum conversion when passing argument 3 of ‘htab_find_slot’ is invalid in C++ ../../../mainline/gcc/../include/hashtab.h:172: note: expected ‘enum insert_option’ but argument is of type ‘int’ make[3]: *** [build/vec.o] Error 1
Subject: Bug 39914 Author: uros Date: Tue Apr 28 16:18:17 2009 New Revision: 146904 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=146904 Log: PR rtl-optimization/39914 * ira-conflicts.c (ira_build_conflicts): Prohibit call used registers for allocnos created from user-defined variables only when not optimizing. Modified: trunk/gcc/ChangeLog trunk/gcc/ira-conflicts.c
Fixed on the trunk so far.
Subject: Bug 39914 Author: uros Date: Sun May 3 19:40:35 2009 New Revision: 147081 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=147081 Log: Backport from mainline: 2009-04-28 Uros Bizjak <ubizjak@gmail.com> PR rtl-optimization/39914 * ira-conflicts.c (ira_build_conflicts): Prohibit call used registers for allocnos created from user-defined variables only when not optimizing. Modified: branches/gcc-4_4-branch/gcc/ChangeLog branches/gcc-4_4-branch/gcc/ira-conflicts.c
Fixed.