[Bug rtl-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475
lucier at math dot purdue dot edu
gcc-bugzilla@gcc.gnu.org
Thu May 7 05:27:00 GMT 2009
------- Comment #66 from lucier at math dot purdue dot edu 2009-05-07 05:27 -------
Adding -frename-registers gives a significant speedup (sometimes as fast as
4.1.2 on this shared machine, i.e., it sometimes hits 108 ms instead of
132-140 ms). The command line with -fforward-propagate
-fno-move-loop-invariants -frename-registers is
/pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W -Wno-unused
-O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing
-fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -fforward-propagate
-fno-move-loop-invariants -frename-registers -DHAVE_CONFIG_H -D___PRIMAL
-D___LIBRARY -D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\""
-D___SYS_TYPE_CPU="\"x86_64\"" -D___SYS_TYPE_VENDOR="\"unknown\""
-D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c
and the loop is
.L2752:
movq %rcx, %r12
addq 8(%rax), %r12
leaq 4(%rcx), %rdi
movq %r12, -8(%rax)
leaq 4(%r12), %r8
addq 8(%rax), %r12
movq %r8, -16(%rax)
movq -8(%rax), %r8
movq -16(%rax), %rdx
movq %r12, -24(%rax)
leaq 4(%r12), %rbx
addq 8(%rax), %r12
movq -24(%rax), %r9
movq %rbx, -32(%rax)
movq 24(%rax), %rbx
movq -32(%rax), %r10
leaq 4(%r12), %r11
movq %r12, -40(%rax)
movq 40(%rax), %r12
movq -40(%rax), %r14
movq %r11, -48(%rax)
movsd 15(%rbx), %xmm1
movsd 7(%rbx), %xmm2
movsd 7(%r12,%r11,2), %xmm9
movapd %xmm1, %xmm3
movsd 7(%r12,%r14,2), %xmm11
leaq 7(%r12,%rcx,2), %r11
movapd %xmm2, %xmm10
leaq (%rdi,%rdi), %r14
mulsd %xmm11, %xmm3
movapd %xmm2, %xmm12
mulsd %xmm9, %xmm10
addq $8, %rcx
mulsd %xmm1, %xmm9
cmpq %rcx, %r13
mulsd %xmm2, %xmm11
movsd 7(%r12,%r10,2), %xmm5
movsd 7(%r12,%r9,2), %xmm7
addsd %xmm10, %xmm3
movsd 7(%r12,%r8,2), %xmm6
subsd %xmm9, %xmm11
mulsd %xmm7, %xmm2
movapd %xmm1, %xmm9
mulsd %xmm5, %xmm1
movapd %xmm6, %xmm13
movsd 7(%r12,%rdx,2), %xmm14
mulsd %xmm5, %xmm12
mulsd %xmm7, %xmm9
subsd %xmm11, %xmm13
movsd 31(%rbx), %xmm0
addsd %xmm6, %xmm11
movsd .LC5(%rip), %xmm6
subsd %xmm1, %xmm2
movsd (%r11), %xmm4
movapd %xmm14, %xmm10
xorpd %xmm0, %xmm6
addsd %xmm12, %xmm9
movsd 7(%r14,%r12), %xmm8
subsd %xmm3, %xmm10
movapd %xmm4, %xmm7
addsd %xmm14, %xmm3
movsd 23(%rbx), %xmm15
subsd %xmm2, %xmm7
movapd %xmm8, %xmm5
addsd %xmm4, %xmm2
movapd %xmm6, %xmm4
subsd %xmm9, %xmm5
movapd %xmm15, %xmm14
addsd %xmm8, %xmm9
mulsd %xmm10, %xmm4
movapd %xmm15, %xmm8
mulsd %xmm15, %xmm10
movapd %xmm0, %xmm12
mulsd %xmm11, %xmm15
mulsd %xmm3, %xmm0
movapd %xmm7, %xmm1
mulsd %xmm13, %xmm6
mulsd %xmm3, %xmm8
movapd %xmm9, %xmm3
mulsd %xmm11, %xmm12
subsd %xmm0, %xmm15
mulsd %xmm13, %xmm14
subsd %xmm10, %xmm6
movapd %xmm2, %xmm10
movapd %xmm5, %xmm0
addsd %xmm12, %xmm8
addsd %xmm15, %xmm10
subsd %xmm15, %xmm2
addsd %xmm14, %xmm4
addsd %xmm8, %xmm3
movsd %xmm10, (%r11)
movq 40(%rax), %r10
subsd %xmm8, %xmm9
addsd %xmm6, %xmm1
addsd %xmm4, %xmm0
movsd %xmm3, 7(%r14,%r10)
movq -8(%rax), %r9
movq 40(%rax), %rdx
subsd %xmm6, %xmm7
subsd %xmm4, %xmm5
movsd %xmm2, 7(%rdx,%r9,2)
movq -16(%rax), %r8
movq 40(%rax), %r12
movsd %xmm9, 7(%r12,%r8,2)
movq -24(%rax), %rbx
movq 40(%rax), %r11
movsd %xmm1, 7(%r11,%rbx,2)
movq -32(%rax), %r14
movq 40(%rax), %r10
movsd %xmm0, 7(%r10,%r14,2)
movq -40(%rax), %r9
movq 40(%rax), %rdx
movsd %xmm7, 7(%rdx,%r9,2)
movq -48(%rax), %r8
movq 40(%rax), %r12
movsd %xmm5, 7(%r12,%r8,2)
jg .L2752
With -fforward-propagate -fno-move-loop-invariants -fweb instead of
-fforward-propagate -fno-move-loop-invariants -frename-registers, i.e., with
the compile line
/pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W -Wno-unused
-O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing
-fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -fforward-propagate
-fno-move-loop-invariants -fweb -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY
-D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\"" -D___SYS_TYPE_CPU="\"x86_64\""
-D___SYS_TYPE_VENDOR="\"unknown\"" -D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c
the time is not so good (consistently 128 ms), and the loop is
.L2752:
movq %rcx, %rdx
addq 8(%rax), %rdx
leaq 4(%rcx), %rdi
movq %rdx, -8(%rax)
leaq 4(%rdx), %rbx
addq 8(%rax), %rdx
movq %rbx, -16(%rax)
movq %rdx, -24(%rax)
leaq 4(%rdx), %rbx
addq 8(%rax), %rdx
movq %rbx, -32(%rax)
movq %rdx, -40(%rax)
leaq 4(%rdx), %rbx
movq 40(%rax), %rdx
movq %rbx, -48(%rax)
movsd 7(%rdx,%rbx,2), %xmm9
movq -40(%rax), %rbx
leaq 7(%rdx,%rcx,2), %r8
addq $8, %rcx
movsd (%r8), %xmm4
cmpq %rcx, %r13
movsd 7(%rdx,%rbx,2), %xmm11
movq -32(%rax), %rbx
movsd 7(%rdx,%rbx,2), %xmm5
movq -24(%rax), %rbx
movsd 7(%rdx,%rbx,2), %xmm7
movq -16(%rax), %rbx
movsd 7(%rdx,%rbx,2), %xmm14
movq -8(%rax), %rbx
movsd 7(%rdx,%rbx,2), %xmm6
leaq (%rdi,%rdi), %rbx
movsd 7(%rbx,%rdx), %xmm8
movq 24(%rax), %rdx
movapd %xmm6, %xmm13
movsd 15(%rdx), %xmm1
movsd 7(%rdx), %xmm2
movapd %xmm1, %xmm10
movsd 31(%rdx), %xmm3
movapd %xmm2, %xmm12
mulsd %xmm11, %xmm10
mulsd %xmm9, %xmm12
mulsd %xmm2, %xmm11
mulsd %xmm1, %xmm9
movsd 23(%rdx), %xmm0
addsd %xmm12, %xmm10
movapd %xmm2, %xmm12
mulsd %xmm7, %xmm2
subsd %xmm9, %xmm11
movapd %xmm1, %xmm9
mulsd %xmm5, %xmm12
mulsd %xmm5, %xmm1
movapd %xmm8, %xmm5
mulsd %xmm7, %xmm9
movapd %xmm4, %xmm7
subsd %xmm11, %xmm13
addsd %xmm6, %xmm11
movsd .LC5(%rip), %xmm6
subsd %xmm1, %xmm2
movapd %xmm0, %xmm1
addsd %xmm12, %xmm9
movapd %xmm14, %xmm12
xorpd %xmm3, %xmm6
subsd %xmm10, %xmm12
mulsd %xmm13, %xmm1
subsd %xmm2, %xmm7
addsd %xmm4, %xmm2
movapd %xmm6, %xmm4
addsd %xmm14, %xmm10
mulsd %xmm13, %xmm6
mulsd %xmm12, %xmm4
subsd %xmm9, %xmm5
mulsd %xmm0, %xmm12
addsd %xmm8, %xmm9
movapd %xmm0, %xmm8
mulsd %xmm11, %xmm0
addsd %xmm1, %xmm4
movapd %xmm3, %xmm1
mulsd %xmm10, %xmm3
subsd %xmm12, %xmm6
mulsd %xmm11, %xmm1
mulsd %xmm10, %xmm8
subsd %xmm3, %xmm0
addsd %xmm1, %xmm8
movapd %xmm2, %xmm1
addsd %xmm0, %xmm1
subsd %xmm0, %xmm2
movapd %xmm7, %xmm0
subsd %xmm6, %xmm7
addsd %xmm6, %xmm0
movsd %xmm1, (%r8)
movapd %xmm9, %xmm1
movq 40(%rax), %rdx
subsd %xmm8, %xmm9
addsd %xmm8, %xmm1
movsd %xmm1, 7(%rbx,%rdx)
movq -8(%rax), %rbx
movq 40(%rax), %rdx
movsd %xmm2, 7(%rdx,%rbx,2)
movq -16(%rax), %rbx
movq 40(%rax), %rdx
movsd %xmm9, 7(%rdx,%rbx,2)
movq -24(%rax), %rbx
movq 40(%rax), %rdx
movsd %xmm0, 7(%rdx,%rbx,2)
movapd %xmm5, %xmm0
movq -32(%rax), %rbx
movq 40(%rax), %rdx
subsd %xmm4, %xmm5
addsd %xmm4, %xmm0
movsd %xmm0, 7(%rdx,%rbx,2)
movq -40(%rax), %rbx
movq 40(%rax), %rdx
movsd %xmm7, 7(%rdx,%rbx,2)
movq -48(%rax), %rbx
movq 40(%rax), %rdx
movsd %xmm5, 7(%rdx,%rbx,2)
jg .L2752
And I still count 117 instructions in the loop in comment 64 (whether that
matters, I don't know).
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928
More information about the Gcc-bugs
mailing list