[Bug rtl-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475
lucier at math dot purdue dot edu
gcc-bugzilla@gcc.gnu.org
Thu May 7 15:58:00 GMT 2009
------- Comment #69 from lucier at math dot purdue dot edu 2009-05-07 15:57 -------
Well, adding -frename-registers by itself to -O1, without
-fforward-propagate or -fno-move-loop-invariants, doesn't help (the loop is
given below, along with the complete compile options); the time is
140 ms cpu time (140 user, 0 system)
and adding -frename-registers and -fno-move-loop-invariants without
-fforward-propagate doesn't help either (the loop is again given below); it gets
140 ms cpu time (140 user, 0 system)
Adding all three gives a very consistent time this morning of
120 ms cpu time (120 user, 0 system)
which is the same as the 4.2.4 time without any of these options (this
morning).
But -fforward-propagate is not a viable option in general for this type of
code; here are some times for the testcase from PR 31957 with various options
on a 2.something GHz Xeon server:
pythagoras-45% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I.
-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-frename-registers -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -c compiler.i
-ftime-report -fmem-report >& rename-report
252.987u 9.592s 4:23.20 99.7% 0+0k 0+0io 0pf+0w
pythagoras-46% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I.
-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -c compiler.i -ftime-report
-fmem-report >& no-rename-report
249.875u 10.544s 4:21.73 99.4% 0+0k 0+0io 0pf+0w
pythagoras-47% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I.
-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-frename-registers -fno-move-loop-invariants -DHAVE_CONFIG_H -D___PRIMAL
-D___LIBRARY -c compiler.i -ftime-report -fmem-report >&
rename-no-move-loop-invariants-report
246.663u 10.484s 4:18.30 99.5% 0+0k 0+0io 0pf+0w
pythagoras-48% time /pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I.
-Wall -W -Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-frename-registers -fno-move-loop-invariants -fforward-propagate
-DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY -c compiler.i -ftime-report
-fmem-report >& rename-no-move-loop-invariants-forward-propagate-report
357.830u 28.417s 6:27.81 99.5% 0+0k 0+0io 11pf+0w
With -fforward-propagate the memory required went up to at least 21GB.
I'll attach the time reports for the various options, but the compiler
wasn't configured to provide detailed memory reports.
Brad
Loop with -frename-registers
/pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W
-Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-frename-registers -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY
-D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\"" -D___SYS_TYPE_CPU="\"x86_64\""
-D___SYS_TYPE_VENDOR="\"unknown\"" -D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c
movq %rdx, %r12
addq (%r11), %r12
leaq 4(%rdx), %r14
movq %r12, (%rsi)
addq $4, %r12
movq %r12, (%r10)
movq (%r11), %rcx
addq (%rsi), %rcx
movq %rcx, (%rbx)
addq $4, %rcx
movq %rcx, (%r9)
movq (%r11), %r13
addq (%rbx), %r13
movq %r13, (%r8)
addq $4, %r13
movq %r13, (%r15)
movq (%rax), %rcx
movq (%r8), %r12
addq $7, %rcx
movsd (%rcx,%r12,2), %xmm10
movq (%rbx), %r12
movsd (%rcx,%r13,2), %xmm13
movq (%r9), %r13
movsd (%rcx,%r12,2), %xmm6
movq (%rsi), %r12
movsd (%rcx,%r13,2), %xmm5
movq (%r10), %r13
movsd (%rcx,%r12,2), %xmm9
leaq (%r14,%r14), %r12
movsd (%rcx,%r13,2), %xmm11
leaq (%rcx,%rdx,2), %r13
movsd (%rcx,%r12), %xmm3
movq 24(%rdi), %rcx
movsd (%r13), %xmm4
addq $8, %rdx
movsd 15(%rcx), %xmm14
movsd 7(%rcx), %xmm15
movapd %xmm14, %xmm8
movapd %xmm14, %xmm7
movapd %xmm15, %xmm12
mulsd %xmm10, %xmm8
mulsd %xmm13, %xmm12
mulsd %xmm15, %xmm10
mulsd %xmm14, %xmm13
movsd 31(%rcx), %xmm2
addsd %xmm8, %xmm12
movapd %xmm15, %xmm8
mulsd %xmm6, %xmm7
mulsd %xmm5, %xmm14
subsd %xmm13, %xmm10
mulsd %xmm5, %xmm8
movapd %xmm2, %xmm13
mulsd %xmm6, %xmm15
movapd %xmm4, %xmm6
xorpd .LC5(%rip), %xmm13
movapd %xmm3, %xmm5
addsd %xmm7, %xmm8
movapd %xmm11, %xmm7
subsd %xmm14, %xmm15
movapd %xmm9, %xmm14
movsd 23(%rcx), %xmm0
subsd %xmm12, %xmm7
subsd %xmm10, %xmm14
movapd %xmm13, %xmm1
addsd %xmm11, %xmm12
movapd %xmm2, %xmm11
subsd %xmm15, %xmm6
addsd %xmm4, %xmm15
movapd %xmm0, %xmm4
mulsd %xmm7, %xmm1
addsd %xmm9, %xmm10
mulsd %xmm14, %xmm4
subsd %xmm8, %xmm5
mulsd %xmm0, %xmm7
addsd %xmm3, %xmm8
mulsd %xmm13, %xmm14
movapd %xmm15, %xmm9
mulsd %xmm10, %xmm11
mulsd %xmm0, %xmm10
addsd %xmm1, %xmm4
movapd %xmm8, %xmm3
movapd %xmm5, %xmm1
subsd %xmm7, %xmm14
movapd %xmm0, %xmm7
mulsd %xmm12, %xmm7
addsd %xmm4, %xmm1
mulsd %xmm2, %xmm12
movapd %xmm6, %xmm2
subsd %xmm14, %xmm6
addsd %xmm14, %xmm2
addsd %xmm11, %xmm7
subsd %xmm12, %xmm10
subsd %xmm4, %xmm5
addsd %xmm7, %xmm3
addsd %xmm10, %xmm9
subsd %xmm10, %xmm15
subsd %xmm7, %xmm8
movsd %xmm9, (%r13)
movq (%rax), %rcx
movsd %xmm3, 7(%r12,%rcx)
movq (%rsi), %r13
movq (%rax), %rcx
movsd %xmm15, 7(%rcx,%r13,2)
movq (%r10), %r12
movq (%rax), %r13
movsd %xmm8, 7(%r13,%r12,2)
movq (%rbx), %rcx
movq (%rax), %r13
movsd %xmm2, 7(%r13,%rcx,2)
movq (%r9), %r12
movq (%rax), %rcx
movsd %xmm1, 7(%rcx,%r12,2)
movq (%r8), %r13
movq (%rax), %rcx
movsd %xmm6, 7(%rcx,%r13,2)
movq (%r15), %r12
movq (%rax), %r13
movsd %xmm5, 7(%r13,%r12,2)
cmpq %rdx, -104(%rsp)
jg .L2941
Loop with -frename-registers -fno-move-loop-invariants
/pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W
-Wno-unused -O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math
-fno-strict-aliasing -fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp
-frename-registers -fno-move-loop-invariants -DHAVE_CONFIG_H -D___PRIMAL
-D___LIBRARY -D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\""
-D___SYS_TYPE_CPU="\"x86_64\"" -D___SYS_TYPE_VENDOR="\"unknown\""
-D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c
.L2755:
leaq 8(%rax), %rdx
movq %rcx, %r13
leaq -16(%rax), %r9
leaq -8(%rax), %r10
leaq -24(%rax), %r8
leaq -32(%rax), %rdi
addq (%rdx), %r13
leaq 4(%rcx), %r14
leaq 4(%r13), %rsi
movq %r13, (%r10)
movq %rsi, (%r9)
addq (%rdx), %r13
leaq -40(%rax), %rsi
leaq 4(%r13), %r11
movq %r13, (%r8)
movq %r11, (%rdi)
addq (%rdx), %r13
leaq -48(%rax), %r11
leaq 40(%rax), %rdx
movq %r13, (%rsi)
addq $4, %r13
movq %r13, (%r11)
movq (%rdx), %rbx
movq (%rsi), %r12
addq $7, %rbx
movsd (%rbx,%r12,2), %xmm11
movq (%r8), %r12
movsd (%rbx,%r13,2), %xmm9
movq (%rdi), %r13
movsd (%rbx,%r12,2), %xmm7
movq (%r10), %r12
movsd (%rbx,%r13,2), %xmm5
movq (%r9), %r13
movsd (%rbx,%r12,2), %xmm6
leaq (%r14,%r14), %r12
movsd (%rbx,%r13,2), %xmm14
leaq (%rbx,%rcx,2), %r13
movsd (%rbx,%r12), %xmm8
movq 24(%rax), %rbx
movapd %xmm6, %xmm13
addq $8, %rcx
movsd (%r13), %xmm4
cmpq %rcx, %r15
movsd 15(%rbx), %xmm1
movsd 7(%rbx), %xmm2
movapd %xmm1, %xmm3
movsd 31(%rbx), %xmm0
movapd %xmm2, %xmm10
mulsd %xmm11, %xmm3
movapd %xmm2, %xmm12
mulsd %xmm9, %xmm10
mulsd %xmm2, %xmm11
mulsd %xmm1, %xmm9
mulsd %xmm7, %xmm2
addsd %xmm10, %xmm3
mulsd %xmm5, %xmm12
movapd %xmm14, %xmm10
movsd 23(%rbx), %xmm15
subsd %xmm9, %xmm11
movapd %xmm1, %xmm9
mulsd %xmm5, %xmm1
movapd %xmm8, %xmm5
mulsd %xmm7, %xmm9
subsd %xmm3, %xmm10
movapd %xmm4, %xmm7
subsd %xmm11, %xmm13
addsd %xmm6, %xmm11
movsd .LC5(%rip), %xmm6
subsd %xmm1, %xmm2
xorpd %xmm0, %xmm6
addsd %xmm14, %xmm3
addsd %xmm12, %xmm9
movapd %xmm15, %xmm14
movapd %xmm0, %xmm12
subsd %xmm2, %xmm7
mulsd %xmm13, %xmm14
addsd %xmm4, %xmm2
movapd %xmm6, %xmm4
subsd %xmm9, %xmm5
mulsd %xmm3, %xmm0
addsd %xmm8, %xmm9
mulsd %xmm10, %xmm4
movapd %xmm15, %xmm8
mulsd %xmm15, %xmm10
mulsd %xmm11, %xmm15
movapd %xmm7, %xmm1
mulsd %xmm13, %xmm6
mulsd %xmm3, %xmm8
movapd %xmm9, %xmm3
mulsd %xmm11, %xmm12
addsd %xmm14, %xmm4
subsd %xmm0, %xmm15
movapd %xmm5, %xmm0
subsd %xmm10, %xmm6
movapd %xmm2, %xmm10
addsd %xmm12, %xmm8
addsd %xmm15, %xmm10
subsd %xmm15, %xmm2
addsd %xmm6, %xmm1
addsd %xmm8, %xmm3
movsd %xmm10, (%r13)
movq (%rdx), %rbx
subsd %xmm8, %xmm9
addsd %xmm4, %xmm0
subsd %xmm6, %xmm7
movsd %xmm3, 7(%r12,%rbx)
movq (%r10), %r10
movq (%rdx), %r13
subsd %xmm4, %xmm5
movsd %xmm2, 7(%r13,%r10,2)
movq (%r9), %rbx
movq (%rdx), %r12
movsd %xmm9, 7(%r12,%rbx,2)
movq (%r8), %r13
movq (%rdx), %r10
movsd %xmm1, 7(%r10,%r13,2)
movq (%rdi), %r9
movq (%rdx), %rbx
movsd %xmm0, 7(%rbx,%r9,2)
movq (%rsi), %rsi
movq (%rdx), %r8
movsd %xmm7, 7(%r8,%rsi,2)
movq (%r11), %rdi
movq (%rdx), %r12
movsd %xmm5, 7(%r12,%rdi,2)
jg .L2755
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928
More information about the Gcc-bugs
mailing list