This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Re: GCC Benchmarks (coybench), AMD64 and i686, 14 August 2004
Are we sure that ICC does 2*sin(x)*cos(x) -> sin(2*x)? It seems to me
that just being able to produce x < PI2 ? 2*sin(x)*cos(x) : 0.0 gives
most of the available speedup. Such a transformation should be
achievable
and preserves the result.
It doesn't even do that. This is ICC 8.0.
# ICC 8.0 listing (32-bit x86, AT&T syntax): SSE2 path that calls the
# combined sincos builtin. Computes (s + s) * c on the clamped input,
# where s and c are the two values the builtin leaves in xmm0 and xmm1.
..B3.1: # Preds ..B3.0
# Prologue: save %ebx and keep the incoming %esp in it, so the argument
# stays addressable after the stack pointer is realigned.
pushl %ebx #62.1
movl %esp, %ebx #62.1
# Round %esp down to an 8-byte boundary, then reserve 8 bytes of scratch.
andl $-8, %esp #62.1
subl $8, %esp #62.1
# Load the double at 8(%ebx) -- presumably the function's first argument.
movsd 8(%ebx), %xmm0 #61.15
# xmm0 = min(xmm0, PI2): the clamp is applied unconditionally, no branch.
minsd PI2, %xmm0 #63.28
# Single library call; the code below consumes results from xmm0 and xmm1
# (presumably sine in xmm0 and cosine in xmm1 -- confirm against libm docs).
call __libm_sse2_sincos #63.18
# LOE ebp esi edi xmm0 xmm1
..B3.4: # Preds ..B3.1
# xmm0 = xmm0 + xmm0: the multiply by two is done as an add.
addsd %xmm0, %xmm0 #63.18
# xmm0 = xmm0 * xmm1
mulsd %xmm1, %xmm0 #63.47
# Bounce the result through the stack scratch slot to get it onto the x87
# stack (presumably because the 32-bit ABI returns doubles in st(0)).
movsd %xmm0, (%esp) #63.47
fldl (%esp) #63.47
# Epilogue: restore the original %esp and the saved %ebx.
movl %ebx, %esp #63.47
popl %ebx #63.47
ret #63.47
.align 4,0x90
Even before it transformed to this builtin, it was just doing a normal
(sin + sin) * cos, at all levels of its transformations.
About the only interesting thing it does to the code is to use a = sin;
b = cos; return (a + a) * b instead of return 2 * sin * cos (i.e., it
removes a floating-point multiply by two in favor of a floating-point
add).
Note that it only uses this builtin if you specify -xN.
If you remove the -xN, the runtime doubles from 5 seconds to 10 seconds
on my machine.
Without -xN, it does:
# ICC 8.0 listing (32-bit x86, AT&T syntax): pure x87 path.
# Computes (s + s) * c with s and c produced by one fsincos on min(x, PI2).
# parameter 1: 8 + %ebx
..B3.1: # Preds ..B3.0
# Prologue: save %ebx, keep the incoming %esp in it, align %esp to 8 bytes.
pushl %ebx #62.1
movl %esp, %ebx #62.1
andl $-8, %esp #62.1
# st(0) = x (parameter 1)
fldl 8(%ebx) #61.15
# st(0) = PI2, st(1) = x
fldl PI2 #63.28
# Compare st(0) (PI2) against st(1) (x) and copy the x87 condition codes
# into EFLAGS through %ax so a normal conditional jump can test them.
fcom %st(1) #63.28
fnstsw %ax #63.28
sahf #63.28
# Taken when PI2 > x: x is already the smaller value, leave st(1) alone.
ja .L9 # Prob 50% #63.28
# Otherwise overwrite st(1) with PI2, so st(1) holds the clamped value.
fst %st(1) #63.28
.L9: #
# Pop the PI2 copy; st(0) is now the clamped input.
fstp %st(0) #63.28
# fsincos replaces st(0) with the sine and pushes the cosine:
# st(0) = cos, st(1) = sin.
fsincos #63.18
# Swap so that st(0) = sin, st(1) = cos.
fxch %st(1) #63.18
# st(0) = sin + sin: the multiply by two is done as an add.
fadd %st(0), %st #63.18
# st(1) = st(1) * st(0), then pop: st(0) = (sin + sin) * cos, left on the
# x87 stack at return.
fmulp %st, %st(1) #63.47
# Epilogue: restore the original %esp and the saved %ebx.
movl %ebx, %esp #63.47
popl %ebx #63.47
ret #63.47
.align 4,0x90
# LOE
If you use -xN -nolib_inline, to stop it from generating the intrinsic
call, it does:
# ICC 8.0 listing (32-bit x86, AT&T syntax): SSE2 arithmetic with separate
# library calls to sin and cos. The clamp min(x, PI2) is recomputed before
# each call, and the result is (sin + sin) * cos.
dv:
# parameter 1: 8 + %ebx
..B3.1: # Preds ..B3.0
# Prologue: save %ebx, keep the incoming %esp in it, align %esp to
# 64 bytes and reserve a 64-byte frame.
pushl %ebx #62.1
movl %esp, %ebx #62.1
andl $-64, %esp #62.1
subl $64, %esp #62.1
# xmm1 = min(x, PI2), stored at (%esp) as the stack argument for sin.
movsd PI2, %xmm0 #63.28
movsd 8(%ebx), %xmm1 #63.28
minsd %xmm0, %xmm1 #63.28
movsd %xmm1, (%esp) #63.28
call sin #63.18
# LOE ebp esi edi f1
..B3.6: # Preds ..B3.1
# Pop the x87 result of sin into 8(%esp), reload it into an SSE register,
# and park it in 24(%esp) so it survives the upcoming call to cos.
fstpl 8(%esp) #63.18
movsd 8(%esp), %xmm0 #63.18
movsd %xmm0, 24(%esp) #63.18
# LOE ebp esi edi
..B3.2: # Preds ..B3.6
# Recompute min(x, PI2) from scratch and pass it to cos the same way.
movsd PI2, %xmm0 #63.57
movsd 8(%ebx), %xmm1 #63.57
minsd %xmm0, %xmm1 #63.57
movsd %xmm1, (%esp) #63.57
call cos #63.47
# LOE ebp esi edi f1
..B3.7: # Preds ..B3.2
# Pop the x87 result of cos into 16(%esp) and reload it: xmm0 = cos.
fstpl 16(%esp) #63.47
movsd 16(%esp), %xmm0 #63.47
# LOE ebp esi edi xmm0
..B3.3: # Preds ..B3.7
# xmm1 = saved sin; xmm1 = sin + sin; xmm1 = (sin + sin) * cos.
movsd 24(%esp), %xmm1 #63.18
addsd %xmm1, %xmm1 #63.18
mulsd %xmm0, %xmm1 #63.47
# Bounce the result through 16(%esp) to get it onto the x87 stack
# (presumably because the 32-bit ABI returns doubles in st(0)).
movsd %xmm1, 16(%esp) #63.47
fldl 16(%esp) #63.47
# Epilogue: restore the original %esp and the saved %ebx.
movl %ebx, %esp #63.47
popl %ebx #63.47
ret #63.47
Again, no short-circuiting.
--Dan