This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.
[Bug tree-optimization/25621] Missed optimization when unrolling the loop (splitting up the sum) (only with -ffast-math)
- From: "jv244 at cam dot ac dot uk" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 27 Apr 2010 18:25:13 -0000
- Subject: [Bug tree-optimization/25621] Missed optimization when unrolling the loop (splitting up the sum) (only with -ffast-math)
- References: <bug-25621-6642@http.gcc.gnu.org/bugzilla/>
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
------- Comment #11 from jv244 at cam dot ac dot uk 2010-04-27 18:25 -------
The original loop now (with 4.6.0) gets vectorized and matches the performance of
the 'hand optimized loop' (which does not get vectorized); a sketch of both variants follows the timings:
> ./a.out
default loop 0.88005500000000003
hand optimized loop 0.86005399999999987
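For reference, the two timed variants look roughly like the following. This is
a sketch reconstructed from the asm below, not the attached testcase; the name
s31 and n = 1024 are taken from the generated code, and s31_split is a
hypothetical name for the hand-optimized version:

subroutine s31(a, b, s, n)         ! "default loop": one accumulator
  integer :: n, i
  real(8) :: a(n), b(n), s
  s = 0
  do i = 1, n
     s = s + a(i)*b(i)
  end do
end subroutine s31

subroutine s31_split(a, b, s, n)   ! "hand optimized loop": sum split into
  integer :: n, i                  ! two independent partial sums
  real(8) :: a(n), b(n), s, s1, s2
  s1 = 0
  s2 = 0
  do i = 1, n, 2                   ! n assumed even for brevity
     s1 = s1 + a(i)*b(i)
     s2 = s2 + a(i+1)*b(i+1)
  end do
  s = s1 + s2
end subroutine s31_split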
It is still not quite as fast as the ifort code:
ifort -fno-inline -O3 -xT -static t.f90
> ~/a.out
default loop 0.444028000000000
hand optimized loop 0.964060000000000
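(The gcc timings at the top aren't accompanied by a command line in this
comment; something like the following should reproduce them, the bug title
saying only that -ffast-math is required for the reassociation:

gfortran -fno-inline -O3 -ffast-math t.f90
)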
ifort's asm looks good; the inner loop handles 8 doubles per trip and splits the sum across two vector accumulators (xmm0/xmm1):
# -- Begin s31_
# mark_begin;
        .align  16,0x90
        .globl s31_
s31_:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %rcx
..B2.1:                                 # Preds ..B2.0
..___tag_value_s31_.10:                 #3.12
        xorps   %xmm1, %xmm1            #9.2
        movaps  %xmm1, %xmm0            #9.2
        xorl    %eax, %eax              #9.2
        # LOE rax rdx rbx rbp rsi rdi r12 r13 r14 r15 xmm0 xmm1
..B2.2:                                 # Preds ..B2.2 ..B2.1
        movaps  (%rdi,%rax,8), %xmm2    #10.8
        movaps  16(%rdi,%rax,8), %xmm3  #10.8
        movaps  32(%rdi,%rax,8), %xmm4  #10.8
        movaps  48(%rdi,%rax,8), %xmm5  #10.8
        mulpd   (%rsi,%rax,8), %xmm2    #10.12
        mulpd   16(%rsi,%rax,8), %xmm3  #10.12
        mulpd   32(%rsi,%rax,8), %xmm4  #10.12
        mulpd   48(%rsi,%rax,8), %xmm5  #10.12
        addpd   %xmm2, %xmm0            #10.4
        addq    $8, %rax                #9.2
        cmpq    $1024, %rax             #9.2
        addpd   %xmm3, %xmm1            #10.4
        addpd   %xmm4, %xmm0            #10.4
        addpd   %xmm5, %xmm1            #10.4
        jb      ..B2.2                  # Prob 82% #9.2
        # LOE rax rdx rbx rbp rsi rdi r12 r13 r14 r15 xmm0 xmm1
..B2.3:                                 # Preds ..B2.2
        addpd   %xmm1, %xmm0            #9.2
        haddpd  %xmm0, %xmm0            #9.2
        movsd   %xmm0, (%rdx)           #10.4
        ret                             #12.1
        .align  16,0x90
..___tag_value_s31_.11:                 #
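In source terms, the ifort inner loop amounts to something like the sketch
below (hypothetical, written out with scalar partial sums; the lane-to-register
mapping is read off the asm above): 8 doubles per trip, accumulated into two
independent sums, so the addpd dependency chain is half as long as with a
single accumulator.

subroutine s31_like_ifort(a, b, s)
  integer :: i
  real(8) :: a(1024), b(1024), s, s0, s1
  s0 = 0
  s1 = 0
  do i = 1, 1024, 8
     s0 = s0 + a(i)*b(i)     + a(i+1)*b(i+1) &   ! xmm2, xmm4 -> xmm0
             + a(i+4)*b(i+4) + a(i+5)*b(i+5)
     s1 = s1 + a(i+2)*b(i+2) + a(i+3)*b(i+3) &   ! xmm3, xmm5 -> xmm1
             + a(i+6)*b(i+6) + a(i+7)*b(i+7)
  end do
  s = s0 + s1
end subroutine s31_like_ifort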
while gcc's asm is more complicated-looking, with a single vector accumulator, split unaligned loads, and a scalar tail loop:
        .globl  s31_
        .type   s31_, @function
s31_:
.LFB0:
        movl    (%rcx), %r9d      # load n (passed by reference in %rcx)
        movq    $0, (%rdx)        # zero the result up front
        testl   %r9d, %r9d
        jle     .L9               # nothing to do if n <= 0
        movl    %r9d, %r8d
        shrl    %r8d              # r8d = n/2 = vector trip count
        cmpl    $4, %r9d
        leal    (%r8,%r8), %r10d  # r10d = elements covered by the vector loop
        jbe     .L15              # small n: take the scalar loop instead
        testl   %r10d, %r10d
        je      .L15
        xorl    %eax, %eax
        xorl    %ecx, %ecx
        xorpd   %xmm1, %xmm1      # the single vector accumulator
        .p2align 4,,10
        .p2align 3
.L12:                             # vector loop: 2 doubles per trip
        movsd   (%rsi,%rax), %xmm2    # unaligned 128-bit loads pieced
        movsd   (%rdi,%rax), %xmm3    # together from two 64-bit halves,
        movhpd  8(%rsi,%rax), %xmm2   # since alignment is unknown
        movhpd  8(%rdi,%rax), %xmm3
        movapd  %xmm2, %xmm0
        incl    %ecx
        mulpd   %xmm3, %xmm0
        addq    $16, %rax
        addpd   %xmm0, %xmm1      # every trip feeds the same accumulator
        cmpl    %ecx, %r8d
        ja      .L12
        haddpd  %xmm1, %xmm1      # horizontal reduction
        leal    1(%r10), %eax     # 1-based index of the first leftover element
        cmpl    %r9d, %r10d
        je      .L13              # no leftover element
.L11:                             # scalar epilogue for the remainder
        movslq  %eax, %rcx
        subl    %eax, %r9d
        leaq    -8(,%rcx,8), %rcx
        xorl    %eax, %eax
        addq    %rcx, %rdi
        addq    %rcx, %rsi
        leaq    8(,%r9,8), %rcx
        .p2align 4,,10
        .p2align 3
.L14:
        movsd   (%rsi), %xmm0
        addq    $8, %rax
        mulsd   (%rdi), %xmm0
        addq    $8, %rsi
        addq    $8, %rdi
        addsd   %xmm0, %xmm1
        cmpq    %rcx, %rax
        jne     .L14
.L13:
        movsd   %xmm1, (%rdx)     # store the final sum
.L9:
        rep
        ret
.L15:                             # fallback: run everything through the scalar loop
        xorpd   %xmm1, %xmm1
        movl    $1, %eax
        jmp     .L11
.LFE0:
        .size   s31_, .-s31_
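A back-of-the-envelope check (assuming a 3-cycle addpd latency on this
Core2-class machine, which is an assumption) is consistent with the ~2x gap:

  gcc:   one chain  -> 2 doubles every 3 cycles = 0.67 doubles/cycle
  ifort: two chains -> 4 doubles every 3 cycles = 1.33 doubles/cycle

i.e. roughly a factor of two, matching 0.88 s vs 0.44 s. The remaining missed
optimization is that the vectorizer splits the sum across vector lanes but not
across multiple vector accumulators.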
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25621