This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: GCC Benchmarks (coybench), AMD64 and i686, 14 August 2004


Are we sure that ICC does 2*sin(x)*cos(x) -> sin(2*x)? It seems to me
that just being able to produce x < PI2 ? 2*sin(x)*cos(x) : 0.0 gives
most of the available speedup. Such a transformation should be achievable
and preserves the result.


It doesn't even do that. This is ICC 8.0.

# ICC 8.0 listing built with -xN: sin and cos both come from one call to
# __libm_sse2_sincos, and the comparison against PI2 is a branch-free minsd.
..B3.1:                         # Preds ..B3.0
        pushl     %ebx                                          #62.1
        # Keep the incoming %esp in %ebx so the argument stays addressable
        # after %esp is realigned: 0(%ebx) = saved %ebx, 4(%ebx) = return
        # address, 8(%ebx) = first stack argument.
        movl      %esp, %ebx                                    #62.1
        # Round %esp down to an 8-byte boundary and reserve one 8-byte slot.
        andl      $-8, %esp                                     #62.1
        subl      $8, %esp                                      #62.1
        # xmm0 = the double argument.
        movsd     8(%ebx), %xmm0                                #61.15
        # xmm0 = the smaller of the argument and PI2 (no branch).
        minsd     PI2, %xmm0                                    #63.28
        call      __libm_sse2_sincos                            #63.18
                                # LOE ebp esi edi xmm0 xmm1
        # The LOE note above lists xmm0 and xmm1 as live after the call, i.e.
        # the two results. Presumably xmm0 = sin and xmm1 = cos, matching the
        # "(sin + sin) * cos" description -- confirm against the libm routine.
..B3.4:                         # Preds ..B3.1
        # xmm0 + xmm0 replaces a multiply by 2.0.
        addsd     %xmm0, %xmm0                                  #63.18
        mulsd     %xmm1, %xmm0                                  #63.47
        # Bounce the SSE result through the stack slot into x87 st(0);
        # presumably because the 32-bit calling convention returns a double
        # in st(0).
        movsd     %xmm0, (%esp)                                 #63.47
        fldl      (%esp)                                        #63.47
        # Undo the realignment by restoring the %esp saved at entry.
        movl      %ebx, %esp                                    #63.47
        popl      %ebx                                          #63.47
        ret                                                     #63.47
        .align    4,0x90

Even before it transformed to this builtin, it was just doing a normal (sin + sin) * cos, at all levels of its transformations.
About the only interesting thing it does to the code is to use a = sin; b = cos; return (a + a) * b instead of return 2 * sin * cos (i.e., it removes a floating point multiply by two in favor of a floating point add).


Note that it only uses this builtin if you specify -xN.
If you remove -xN, the runtime doubles from 5 seconds to 10 seconds on my machine.


Without -xN, it does:
# Listing produced without -xN: pure x87 code. The comparison against PI2 is
# done with fcom plus a conditional branch, and sin/cos come from one fsincos.
# parameter 1: 8 + %ebx
..B3.1:                         # Preds ..B3.0
        pushl     %ebx                                          #62.1
        # %ebx keeps the pre-alignment %esp so 8(%ebx) still reaches the
        # argument.
        movl      %esp, %ebx                                    #62.1
        andl      $-8, %esp                                     #62.1
        # st(0) = x
        fldl      8(%ebx)                                       #61.15
        # st(0) = PI2, st(1) = x
        fldl      PI2                                           #63.28
        # Compare st(0) (PI2) with st(1) (x), then copy the x87 condition
        # codes into EFLAGS via %ax so an integer branch can test them.
        fcom      %st(1)                                        #63.28
        fnstsw    %ax                                           #63.28
        sahf                                                    #63.28
        # Taken when PI2 > x: x is already the smaller value, skip the copy.
        ja        .L9           # Prob 50%                      #63.28
        # Otherwise overwrite x with PI2: st(1) = PI2.
        fst       %st(1)                                        #63.28
.L9:                                                            #
        # Pop the PI2 copy; st(0) now holds the smaller of x and PI2.
        fstp      %st(0)                                        #63.28
        # fsincos replaces st(0) with its sine and then pushes its cosine:
        # st(0) = cos, st(1) = sin.
        fsincos                                                 #63.18
        # Swap so that st(0) = sin, st(1) = cos.
        fxch      %st(1)                                        #63.18
        # st(0) = sin + sin (an add instead of a multiply by 2.0).
        fadd      %st(0), %st                                   #63.18
        # st(1) = st(1) * st(0), then pop: st(0) = (sin + sin) * cos.
        fmulp     %st, %st(1)                                   #63.47
        # Restore the %esp saved at entry; the result is left in st(0).
        movl      %ebx, %esp                                    #63.47
        popl      %ebx                                          #63.47
        ret                                                     #63.47
        .align    4,0x90
                                # LOE

If you use -xN -nolib_inline, to stop it from generating the intrinsic call, it does:

# Listing produced with -xN -nolib_inline: sin and cos are two separate
# library calls, and the smaller-of-(x, PI2) argument is recomputed before
# each one. Results come back on the x87 stack and are spilled to memory so
# the final arithmetic can be done in SSE2 registers.
dv:
# parameter 1: 8 + %ebx
..B3.1:                         # Preds ..B3.0
        pushl     %ebx                                          #62.1
        # %ebx keeps the pre-alignment %esp so 8(%ebx) still reaches the
        # argument.
        movl      %esp, %ebx                                    #62.1
        # Round %esp down to a 64-byte boundary and reserve 64 bytes for the
        # outgoing argument slot and the spill slots used below.
        andl      $-64, %esp                                    #62.1
        subl      $64, %esp                                     #62.1
        # xmm1 = the smaller of x and PI2, stored at (%esp) as the argument
        # to sin.
        movsd     PI2, %xmm0                                    #63.28
        movsd     8(%ebx), %xmm1                                #63.28
        minsd     %xmm0, %xmm1                                  #63.28
        movsd     %xmm1, (%esp)                                 #63.28
        call      sin                                           #63.18
                                # LOE ebp esi edi f1
..B3.6:                         # Preds ..B3.1
        # Pop the x87 result of sin into 8(%esp), then copy it through xmm0
        # to 24(%esp), from where it is reloaded after the cos call.
        fstpl     8(%esp)                                       #63.18
        movsd     8(%esp), %xmm0                                #63.18
        movsd     %xmm0, 24(%esp)                               #63.18
                                # LOE ebp esi edi
..B3.2:                         # Preds ..B3.6
        # Same argument computation again, this time for cos.
        movsd     PI2, %xmm0                                    #63.57
        movsd     8(%ebx), %xmm1                                #63.57
        minsd     %xmm0, %xmm1                                  #63.57
        movsd     %xmm1, (%esp)                                 #63.57
        call      cos                                           #63.47
                                # LOE ebp esi edi f1
..B3.7:                         # Preds ..B3.2
        # Pop the x87 result of cos into 16(%esp) and load it into xmm0.
        fstpl     16(%esp)                                      #63.47
        movsd     16(%esp), %xmm0                               #63.47
                                # LOE ebp esi edi xmm0
..B3.3:                         # Preds ..B3.7
        # xmm1 = saved sin result; xmm1 = (sin + sin) * cos.
        movsd     24(%esp), %xmm1                               #63.18
        addsd     %xmm1, %xmm1                                  #63.18
        mulsd     %xmm0, %xmm1                                  #63.47
        # Bounce the SSE result through memory into x87 st(0) before
        # returning.
        movsd     %xmm1, 16(%esp)                               #63.47
        fldl      16(%esp)                                      #63.47
        # Restore the %esp saved at entry.
        movl      %ebx, %esp                                    #63.47
        popl      %ebx                                          #63.47
        ret                                                     #63.47


Again, no short-circuiting.


--Dan




Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]