This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


[Bug regression/39914] New: 96% performance regression in floating point code; part of the problem started 2009/03/12-13


With this compiler:

gcc version 4.4.0 20090312 (experimental) [trunk revision 144801] (GCC) 

running the test in

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928

(same .i file, same instructions for reproducing, same compiler options, same
everything)

gives a time of

    132 ms cpu time (132 user, 0 system)

with assembly code in the main loop of

.L2958:
        movq    %rdx, %rcx
        addq    (%r11), %rcx
        leaq    4(%rdx), %r14
        movq    %rcx, (%rdi)
        addq    $4, %rcx
        movq    %rcx, (%r10)
        movq    (%r11), %rcx
        addq    (%rdi), %rcx
        movq    %rcx, (%rsi)
        addq    $4, %rcx
        movq    %rcx, (%r9)
        movq    (%r11), %r12
        addq    (%rsi), %r12
        movq    %r12, (%rbp)
        addq    $4, %r12
        movq    %r12, (%r15)
        movq    (%rax), %rcx
        addq    $7, %rcx
        movsd   (%rcx,%r12,2), %xmm7
        movq    (%rbp), %r12
        leaq    (%rcx,%rdx,2), %r13
        addq    $8, %rdx
        movsd   (%r13), %xmm4
        movsd   (%rcx,%r12,2), %xmm10
        movq    (%r9), %r12
        movsd   (%rcx,%r12,2), %xmm5
        movq    (%rsi), %r12
        movsd   (%rcx,%r12,2), %xmm6
        movq    (%r10), %r12
        movsd   (%rcx,%r12,2), %xmm13
        movq    (%rdi), %r12
        movsd   (%rcx,%r12,2), %xmm11
        leaq    (%r14,%r14), %r12
        movsd   (%rcx,%r12), %xmm9
        movq    24(%r8), %rcx
        movapd  %xmm11, %xmm14
        movsd   15(%rcx), %xmm1
        movsd   7(%rcx), %xmm2
        movapd  %xmm1, %xmm8
        movsd   31(%rcx), %xmm3
        movapd  %xmm2, %xmm12
        mulsd   %xmm10, %xmm8
        mulsd   %xmm7, %xmm12
        mulsd   %xmm2, %xmm10
        mulsd   %xmm1, %xmm7
        movsd   23(%rcx), %xmm0
        addsd   %xmm8, %xmm12
        movapd  %xmm2, %xmm8
        mulsd   %xmm6, %xmm2
        subsd   %xmm7, %xmm10
        movapd  %xmm1, %xmm7
        mulsd   %xmm5, %xmm1
        mulsd   %xmm6, %xmm7
        movapd  %xmm4, %xmm6
        mulsd   %xmm5, %xmm8
        movapd  %xmm9, %xmm5
        subsd   %xmm10, %xmm14
        subsd   %xmm1, %xmm2
        movapd  %xmm3, %xmm1
        addsd   %xmm11, %xmm10
        xorpd   .LC5(%rip), %xmm1
        addsd   %xmm7, %xmm8
        movapd  %xmm13, %xmm7
        subsd   %xmm2, %xmm6
        subsd   %xmm12, %xmm7
        subsd   %xmm8, %xmm5
        addsd   %xmm4, %xmm2
        movapd  %xmm0, %xmm4
        addsd   %xmm9, %xmm8
        movapd  %xmm1, %xmm9
        mulsd   %xmm14, %xmm4
        addsd   %xmm13, %xmm12
        mulsd   %xmm7, %xmm9
        mulsd   %xmm1, %xmm14
        movapd  %xmm3, %xmm1
        mulsd   %xmm0, %xmm7
        mulsd   %xmm10, %xmm1
        mulsd   %xmm0, %xmm10
        addsd   %xmm9, %xmm4
        subsd   %xmm7, %xmm14
        movapd  %xmm0, %xmm7
        movapd  %xmm2, %xmm0
        mulsd   %xmm12, %xmm7
        mulsd   %xmm3, %xmm12
        addsd   %xmm1, %xmm7
        subsd   %xmm12, %xmm10
        addsd   %xmm10, %xmm0
        subsd   %xmm10, %xmm2
        movsd   %xmm0, (%r13)
        movapd  %xmm8, %xmm0
        movq    (%rax), %rcx
        subsd   %xmm7, %xmm8
        addsd   %xmm7, %xmm0
        movsd   %xmm0, 7(%r12,%rcx)
        movq    (%rdi), %r12
        movq    (%rax), %rcx
        movapd  %xmm6, %xmm0
        subsd   %xmm14, %xmm6
        movsd   %xmm2, 7(%rcx,%r12,2)
        movq    (%r10), %r12
        movq    (%rax), %rcx
        addsd   %xmm14, %xmm0
        movsd   %xmm8, 7(%rcx,%r12,2)
        movq    (%rsi), %r12
        movq    (%rax), %rcx
        movsd   %xmm0, 7(%rcx,%r12,2)
        movapd  %xmm5, %xmm0
        movq    (%r9), %r12
        movq    (%rax), %rcx
        subsd   %xmm4, %xmm5
        addsd   %xmm4, %xmm0
        movsd   %xmm0, 7(%rcx,%r12,2)
        movq    (%rbp), %r12
        movq    (%rax), %rcx
        movsd   %xmm6, 7(%rcx,%r12,2)
        movq    (%r15), %r12
        movq    (%rax), %rcx
        movsd   %xmm5, 7(%rcx,%r12,2)
        cmpq    %rdx, -104(%rsp)
        jg      .L2958
        movq    %r14, -104(%rsp)

With this compiler:

/pkgs/gcc-mainline/bin/gcc -v
Using built-in specs.
Target: x86_64-unknown-linux-gnu
Configured with: /tmp/lucier/gcc/mainline/configure --enable-checking=release
--prefix=/pkgs/gcc-mainline --enable-languages=c
--enable-gather-detailed-mem-stats
Thread model: posix
gcc version 4.4.0 20090313 (experimental) [trunk revision 144829] (GCC) 

one gets a time of

    212 ms cpu time (212 user, 0 system)

and the assembly language for the main loop is

.L2946:
        movq    %rbx, %rdx
        addq    (%r11), %rdx
        leaq    4(%rbx), %rbp
        movq    %rdx, (%rsi)
        addq    $4, %rdx
        movq    %rdx, (%r10)
        movq    (%r11), %rdx
        addq    (%rsi), %rdx
        movq    %rdx, (%rcx)
        addq    $4, %rdx
        movq    %rdx, (%r9)
        movq    (%r11), %r13
        addq    (%rcx), %r13
        movq    %r13, (%r8)
        addq    $4, %r13
        movq    %r13, (%r15)
        movq    (%rax), %rdx
        addq    $7, %rdx
        movsd   (%rdx,%r13,2), %xmm0
        leaq    (%rdx,%rbx,2), %r14
        addq    $8, %rbx
        movsd   %xmm0, -48(%rsp)
        movq    (%r8), %r13
        movsd   (%rdx,%r13,2), %xmm0
        movsd   %xmm0, -56(%rsp)
        movq    (%r9), %r13
        movsd   (%rdx,%r13,2), %xmm0
        movsd   %xmm0, -64(%rsp)
        movq    (%rcx), %r13
        movsd   (%rdx,%r13,2), %xmm0
        movsd   %xmm0, -72(%rsp)
        movq    (%r10), %r13
        movsd   (%rdx,%r13,2), %xmm0
        movsd   %xmm0, -80(%rsp)
        movq    (%rsi), %r13
        movsd   (%rdx,%r13,2), %xmm0
        leaq    (%rbp,%rbp), %r13
        movsd   %xmm0, -104(%rsp)
        movsd   (%rdx,%r13), %xmm0
        movsd   %xmm0, -88(%rsp)
        movq    24(%rdi), %rdx
        movsd   31(%rdx), %xmm0
        movsd   %xmm0, -32(%rsp)
        movsd   23(%rdx), %xmm0
        movsd   %xmm0, -40(%rsp)
        movsd   15(%rdx), %xmm0
        movsd   %xmm0, -112(%rsp)
        movsd   7(%rdx), %xmm0
        movsd   %xmm0, -120(%rsp)
        movapd  %xmm0, %xmm1
        movsd   -112(%rsp), %xmm0
        mulsd   -48(%rsp), %xmm1
        mulsd   -56(%rsp), %xmm0
        addsd   %xmm0, %xmm1
        movsd   -112(%rsp), %xmm0
        mulsd   -48(%rsp), %xmm0
        movsd   %xmm1, -8(%rsp)
        movsd   -120(%rsp), %xmm1
        mulsd   -56(%rsp), %xmm1
        subsd   %xmm0, %xmm1
        movsd   -112(%rsp), %xmm0
        mulsd   -72(%rsp), %xmm0
        movsd   %xmm1, -16(%rsp)
        movsd   -120(%rsp), %xmm1
        mulsd   -64(%rsp), %xmm1
        addsd   %xmm0, %xmm1
        movsd   -112(%rsp), %xmm0
        mulsd   -64(%rsp), %xmm0
        movsd   %xmm1, -24(%rsp)
        movsd   -120(%rsp), %xmm1
        mulsd   -72(%rsp), %xmm1
        subsd   %xmm0, %xmm1
        movsd   -80(%rsp), %xmm0
        subsd   -8(%rsp), %xmm0
        movsd   %xmm1, -120(%rsp)
        movsd   %xmm0, -48(%rsp)
        movsd   -104(%rsp), %xmm0
        subsd   -16(%rsp), %xmm0
        movsd   %xmm0, -112(%rsp)
        movsd   -88(%rsp), %xmm0
        subsd   -24(%rsp), %xmm0
        movsd   %xmm0, -56(%rsp)
        movsd   (%r14), %xmm0
        subsd   %xmm1, %xmm0
        movsd   %xmm0, -64(%rsp)
        movsd   -80(%rsp), %xmm0
        addsd   -8(%rsp), %xmm0
        movsd   %xmm0, -80(%rsp)
        movsd   -104(%rsp), %xmm0
        addsd   -16(%rsp), %xmm0
        movsd   %xmm0, -104(%rsp)
        movsd   -88(%rsp), %xmm0
        addsd   -24(%rsp), %xmm0
        movsd   %xmm0, -88(%rsp)
        movsd   (%r14), %xmm0
        addsd   %xmm1, %xmm0
        movsd   %xmm0, -96(%rsp)
        movsd   -32(%rsp), %xmm0
        xorpd   .LC5(%rip), %xmm0
        movsd   %xmm0, -120(%rsp)
        movapd  %xmm0, %xmm1
        movsd   -40(%rsp), %xmm0
        mulsd   -48(%rsp), %xmm1
        mulsd   -112(%rsp), %xmm0
        addsd   %xmm0, %xmm1
        movsd   -40(%rsp), %xmm0
        mulsd   -48(%rsp), %xmm0
        movsd   %xmm1, -72(%rsp)
        movsd   -120(%rsp), %xmm1
        mulsd   -112(%rsp), %xmm1
        subsd   %xmm0, %xmm1
        movsd   -32(%rsp), %xmm0
        mulsd   -104(%rsp), %xmm0
        movsd   %xmm1, -112(%rsp)
        movsd   -40(%rsp), %xmm1
        mulsd   -80(%rsp), %xmm1
        addsd   %xmm0, %xmm1
        movsd   -32(%rsp), %xmm0
        mulsd   -80(%rsp), %xmm0
        movsd   %xmm1, -120(%rsp)
        movsd   -40(%rsp), %xmm1
        mulsd   -104(%rsp), %xmm1
        subsd   %xmm0, %xmm1
        movsd   %xmm1, -104(%rsp)
        movsd   -96(%rsp), %xmm0
        addsd   %xmm1, %xmm0
        movsd   %xmm0, (%r14)
        movq    (%rax), %rdx
        movsd   -88(%rsp), %xmm0
        addsd   -120(%rsp), %xmm0
        movsd   %xmm0, 7(%r13,%rdx)
        movq    (%rsi), %r13
        movq    (%rax), %rdx
        movsd   -96(%rsp), %xmm0
        subsd   -104(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        movq    (%r10), %r13
        movq    (%rax), %rdx
        movsd   -88(%rsp), %xmm0
        subsd   -120(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        movq    (%rcx), %r13
        movq    (%rax), %rdx
        movsd   -64(%rsp), %xmm0
        addsd   -112(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        movq    (%r9), %r13
        movq    (%rax), %rdx
        movsd   -56(%rsp), %xmm0
        addsd   -72(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        movq    (%r8), %r13
        movq    (%rax), %rdx
        movsd   -64(%rsp), %xmm0
        subsd   -112(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        movq    (%r15), %r13
        movq    (%rax), %rdx
        movsd   -56(%rsp), %xmm0
        subsd   -72(%rsp), %xmm0
        movsd   %xmm0, 7(%rdx,%r13,2)
        cmpq    %rbx, (%rsp)
        jg      .L2946
        movq    %rbp, (%rsp)

I'm reporting this separately because it doesn't have the same cause as the
previous PR 33928.

BTW, with 4.2.4 this test runs in 108 ms on this machine, hence the total
regression amount noted in the subject line.  This part by itself accounts for
about a 60% performance regression; the rest is accounted for by

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928
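
For reference, those percentages follow directly from the timings above:

    212 ms / 108 ms = 1.96   (total regression vs. 4.2.4, about 96%)
    212 ms / 132 ms = 1.61   (the part reported here, about 61%)
    132 ms / 108 ms = 1.22   (the part covered by PR 33928, about 22%)

(the two parts compound: 1.61 x 1.22 = 1.96)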

Brad


-- 
           Summary: 96% performance regression in floating point code; part
                    of the problem started 2009/03/12-13
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: regression
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: lucier at math dot purdue dot edu
 GCC build triplet: x86_64-unknown-linux-gnu
  GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: x86_64-unknown-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39914

