[Bug rtl-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

Thu May 7 15:58:00 GMT 2009


------- Comment #69 from lucier at math dot purdue dot edu  2009-05-07 15:57 -------
    Well, adding -frename-registers by itself to -O1, without
-fforward-propagate and -fno-move-loop-invariants, doesn't help (the loop is
given below, along with the complete compile options); the time is

        140 ms cpu time (140 user, 0 system)

    Adding -frename-registers and -fno-move-loop-invariants without
-fforward-propagate doesn't help either (the loop is again given below); it gets

        140 ms cpu time (140 user, 0 system)

    Adding all three gives a very consistent time this morning of

        120 ms cpu time (120 user, 0 system)

    which is the same as the 4.2.4 time without any of these options (this
morning).

    But -fforward-propagate is not a viable option in general for this type of
code; here are some times for the testcase from PR 31957 with various options
on a 2.something GHz Xeon server:

    pythagoras-45% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I.
-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-frename-registers -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -c compiler.i
-ftime-report -fmem-report >& rename-report
    252.987u 9.592s 4:23.20 99.7%   0+0k 0+0io 0pf+0w
    pythagoras-46% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I.
-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -c compiler.i -ftime-report
-fmem-report > & no-rename-report
    249.875u 10.544s 4:21.73 99.4%  0+0k 0+0io 0pf+0w
    pythagoras-47% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I.
-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-frename-registers -fno-move-loop-invariants -DHAVE_CONFIG_H -D___PRIMAL
-D___LIBRARY -c compiler.i -ftime-report -fmem-report > &
rename-no-move-loop-invariants-report
    246.663u 10.484s 4:18.30 99.5%  0+0k 0+0io 0pf+0w
    pythagoras-48% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I.
-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-frename-registers -fno-move-loop-invariants -fforward-propagate
-DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -c compiler.i -ftime-report
-fmem-report > & rename-no-move-loop-invariants-forward-propagate-report
    357.830u 28.417s 6:27.81 99.5%  0+0k 0+0io 11pf+0w

    With -fforward-propagate the memory required went up to at least 21GB.

    I'll attach the time reports for the various options, but the compiler
wasn't configured to provide detailed memory reports.

    Brad


    Loop with -frename-registers

    /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W
-Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-frename-registers  -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY
-D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\"" -D___SYS_TYPE_CPU="\"x86_64\""
-D___SYS_TYPE_VENDOR="\"unknown\"" -D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c


            movq    %rdx, %r12
            addq    (%r11), %r12
            leaq    4(%rdx), %r14
            movq    %r12, (%rsi)
            addq    $4, %r12
            movq    %r12, (%r10)
            movq    (%r11), %rcx
            addq    (%rsi), %rcx
            movq    %rcx, (%rbx)
            addq    $4, %rcx
            movq    %rcx, (%r9)
            movq    (%r11), %r13
            addq    (%rbx), %r13
            movq    %r13, (%r8)
            addq    $4, %r13
            movq    %r13, (%r15)
            movq    (%rax), %rcx
            movq    (%r8), %r12
            addq    $7, %rcx
            movsd   (%rcx,%r12,2), %xmm10
            movq    (%rbx), %r12
            movsd   (%rcx,%r13,2), %xmm13
            movq    (%r9), %r13
            movsd   (%rcx,%r12,2), %xmm6
            movq    (%rsi), %r12
            movsd   (%rcx,%r13,2), %xmm5
            movq    (%r10), %r13
            movsd   (%rcx,%r12,2), %xmm9
            leaq    (%r14,%r14), %r12
            movsd   (%rcx,%r13,2), %xmm11
            leaq    (%rcx,%rdx,2), %r13
            movsd   (%rcx,%r12), %xmm3
            movq    24(%rdi), %rcx
            movsd   (%r13), %xmm4
            addq    $8, %rdx
            movsd   15(%rcx), %xmm14
            movsd   7(%rcx), %xmm15
            movapd  %xmm14, %xmm8
            movapd  %xmm14, %xmm7
            movapd  %xmm15, %xmm12
            mulsd   %xmm10, %xmm8
            mulsd   %xmm13, %xmm12
            mulsd   %xmm15, %xmm10
            mulsd   %xmm14, %xmm13
            movsd   31(%rcx), %xmm2
            addsd   %xmm8, %xmm12
            movapd  %xmm15, %xmm8
            mulsd   %xmm6, %xmm7
            mulsd   %xmm5, %xmm14
            subsd   %xmm13, %xmm10
            mulsd   %xmm5, %xmm8
            movapd  %xmm2, %xmm13
            mulsd   %xmm6, %xmm15
            movapd  %xmm4, %xmm6
            xorpd   .LC5(%rip), %xmm13
            movapd  %xmm3, %xmm5
            addsd   %xmm7, %xmm8
            movapd  %xmm11, %xmm7
            subsd   %xmm14, %xmm15
            movapd  %xmm9, %xmm14
            movsd   23(%rcx), %xmm0
            subsd   %xmm12, %xmm7
            subsd   %xmm10, %xmm14
            movapd  %xmm13, %xmm1
            addsd   %xmm11, %xmm12
            movapd  %xmm2, %xmm11
            subsd   %xmm15, %xmm6
            addsd   %xmm4, %xmm15
            movapd  %xmm0, %xmm4
            mulsd   %xmm7, %xmm1
            addsd   %xmm9, %xmm10
            mulsd   %xmm14, %xmm4
            subsd   %xmm8, %xmm5
            mulsd   %xmm0, %xmm7
            addsd   %xmm3, %xmm8
            mulsd   %xmm13, %xmm14
            movapd  %xmm15, %xmm9
            mulsd   %xmm10, %xmm11
            mulsd   %xmm0, %xmm10
            addsd   %xmm1, %xmm4
            movapd  %xmm8, %xmm3
            movapd  %xmm5, %xmm1
            subsd   %xmm7, %xmm14
            movapd  %xmm0, %xmm7
            mulsd   %xmm12, %xmm7
            addsd   %xmm4, %xmm1
            mulsd   %xmm2, %xmm12
            movapd  %xmm6, %xmm2
            subsd   %xmm14, %xmm6
            addsd   %xmm14, %xmm2
            addsd   %xmm11, %xmm7
            subsd   %xmm12, %xmm10
            subsd   %xmm4, %xmm5
            addsd   %xmm7, %xmm3
            addsd   %xmm10, %xmm9
            subsd   %xmm10, %xmm15
            subsd   %xmm7, %xmm8
            movsd   %xmm9, (%r13)
            movq    (%rax), %rcx
            movsd   %xmm3, 7(%r12,%rcx)
            movq    (%rsi), %r13
            movq    (%rax), %rcx
            movsd   %xmm15, 7(%rcx,%r13,2)
            movq    (%r10), %r12
            movq    (%rax), %r13
            movsd   %xmm8, 7(%r13,%r12,2)
            movq    (%rbx), %rcx
            movq    (%rax), %r13
            movsd   %xmm2, 7(%r13,%rcx,2)
            movq    (%r9), %r12
            movq    (%rax), %rcx
            movsd   %xmm1, 7(%rcx,%r12,2)
            movq    (%r8), %r13
            movq    (%rax), %rcx
            movsd   %xmm6, 7(%rcx,%r13,2)
            movq    (%r15), %r12
            movq    (%rax), %r13
            movsd   %xmm5, 7(%r13,%r12,2)
            cmpq    %rdx, -104(%rsp)
            jg      .L2941

    Loop with -frename-registers -fno-move-loop-invariants

    /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W
-Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-frename-registers -fno-move-loop-invariants -DHAVE_CONFIG_H -D___PRIMAL
-D___LIBRARY -D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\""
-D___SYS_TYPE_CPU="\"x86_64\"" -D___SYS_TYPE_VENDOR="\"unknown\""
-D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c

    .L2755:
            leaq    8(%rax), %rdx
            movq    %rcx, %r13
            leaq    -16(%rax), %r9
            leaq    -8(%rax), %r10
            leaq    -24(%rax), %r8
            leaq    -32(%rax), %rdi
            addq    (%rdx), %r13
            leaq    4(%rcx), %r14
            leaq    4(%r13), %rsi
            movq    %r13, (%r10)
            movq    %rsi, (%r9)
            addq    (%rdx), %r13
            leaq    -40(%rax), %rsi
            leaq    4(%r13), %r11
            movq    %r13, (%r8)
            movq    %r11, (%rdi)
            addq    (%rdx), %r13
            leaq    -48(%rax), %r11
            leaq    40(%rax), %rdx
            movq    %r13, (%rsi)
            addq    $4, %r13
            movq    %r13, (%r11)
            movq    (%rdx), %rbx
            movq    (%rsi), %r12
            addq    $7, %rbx
            movsd   (%rbx,%r12,2), %xmm11
            movq    (%r8), %r12
            movsd   (%rbx,%r13,2), %xmm9
            movq    (%rdi), %r13
            movsd   (%rbx,%r12,2), %xmm7
            movq    (%r10), %r12
            movsd   (%rbx,%r13,2), %xmm5
            movq    (%r9), %r13
            movsd   (%rbx,%r12,2), %xmm6
            leaq    (%r14,%r14), %r12
            movsd   (%rbx,%r13,2), %xmm14
            leaq    (%rbx,%rcx,2), %r13
            movsd   (%rbx,%r12), %xmm8
            movq    24(%rax), %rbx
            movapd  %xmm6, %xmm13
            addq    $8, %rcx
            movsd   (%r13), %xmm4
            cmpq    %rcx, %r15
            movsd   15(%rbx), %xmm1
            movsd   7(%rbx), %xmm2
            movapd  %xmm1, %xmm3
            movsd   31(%rbx), %xmm0
            movapd  %xmm2, %xmm10
            mulsd   %xmm11, %xmm3
            movapd  %xmm2, %xmm12
            mulsd   %xmm9, %xmm10
            mulsd   %xmm2, %xmm11
            mulsd   %xmm1, %xmm9
            mulsd   %xmm7, %xmm2
            addsd   %xmm10, %xmm3
            mulsd   %xmm5, %xmm12
            movapd  %xmm14, %xmm10
            movsd   23(%rbx), %xmm15
            subsd   %xmm9, %xmm11
            movapd  %xmm1, %xmm9
            mulsd   %xmm5, %xmm1
            movapd  %xmm8, %xmm5
            mulsd   %xmm7, %xmm9
            subsd   %xmm3, %xmm10
            movapd  %xmm4, %xmm7
            subsd   %xmm11, %xmm13
            addsd   %xmm6, %xmm11
            movsd   .LC5(%rip), %xmm6
            subsd   %xmm1, %xmm2
            xorpd   %xmm0, %xmm6
            addsd   %xmm14, %xmm3
            addsd   %xmm12, %xmm9
            movapd  %xmm15, %xmm14
            movapd  %xmm0, %xmm12
            subsd   %xmm2, %xmm7
            mulsd   %xmm13, %xmm14
            addsd   %xmm4, %xmm2
            movapd  %xmm6, %xmm4
            subsd   %xmm9, %xmm5
            mulsd   %xmm3, %xmm0
            addsd   %xmm8, %xmm9
            mulsd   %xmm10, %xmm4
            movapd  %xmm15, %xmm8
            mulsd   %xmm15, %xmm10
            mulsd   %xmm11, %xmm15
            movapd  %xmm7, %xmm1
            mulsd   %xmm13, %xmm6
            mulsd   %xmm3, %xmm8
            movapd  %xmm9, %xmm3
            mulsd   %xmm11, %xmm12
            addsd   %xmm14, %xmm4
            subsd   %xmm0, %xmm15
            movapd  %xmm5, %xmm0
            subsd   %xmm10, %xmm6
            movapd  %xmm2, %xmm10
            addsd   %xmm12, %xmm8
            addsd   %xmm15, %xmm10
            subsd   %xmm15, %xmm2
            addsd   %xmm6, %xmm1
            addsd   %xmm8, %xmm3
            movsd   %xmm10, (%r13)
            movq    (%rdx), %rbx
            subsd   %xmm8, %xmm9
            addsd   %xmm4, %xmm0
            subsd   %xmm6, %xmm7
            movsd   %xmm3, 7(%r12,%rbx)
            movq    (%r10), %r10
            movq    (%rdx), %r13
            subsd   %xmm4, %xmm5
            movsd   %xmm2, 7(%r13,%r10,2)
            movq    (%r9), %rbx
            movq    (%rdx), %r12
            movsd   %xmm9, 7(%r12,%rbx,2)
            movq    (%r8), %r13
            movq    (%rdx), %r10
            movsd   %xmm1, 7(%r10,%r13,2)
            movq    (%rdi), %r9
            movq    (%rdx), %rbx
            movsd   %xmm0, 7(%rbx,%r9,2)
            movq    (%rsi), %rsi
            movq    (%rdx), %r8
            movsd   %xmm7, 7(%r8,%rsi,2)
            movq    (%r11), %rdi
            movq    (%rdx), %r12
            movsd   %xmm5, 7(%r12,%rdi,2)
            jg      .L2755


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928