This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Re: -finline-functions tuning (Using on ET)
- To: Kurt Garloff <kurt at garloff dot de>, GCC Mailing List <gcc at gcc dot gnu dot org>
- Subject: Re: -finline-functions tuning (Using on ET)
- From: Olaf Petzold <opetzold at wit dot regiocom dot net>
- Date: Fri, 31 Aug 2001 08:18:38 +0200
- References: <20010830131025.A28458@gum01m.etpnet.phys.tue.nl>
Hello Kurt,
as I promised, here are some results with blitz-20001213. The test file is:
---- ray.cc ----
#include <iostream>
#include <cmath>
#include <blitz/tinyvec.h>
typedef blitz::TinyVector<double, 3> vector3d;
// Stores in `reflection` the mirror image of `ray` about `surfaceNormal`:
//   reflection = ray - 2 * (ray . surfaceNormal) * surfaceNormal
// NOTE(review): this formula is a true reflection only for a unit-length
// surfaceNormal; nothing here normalizes it.
void reflect(vector3d& reflection, const vector3d& ray, const vector3d& surfaceNormal)
{
using blitz::dot;
// The single statement under test: the assembly listings quoted in this
// message are the code each compiler generated for this expression.
reflection = ray - 2 * dot(ray,surfaceNormal) * surfaceNormal;
}
// Test driver: reflects y = (1, 0, -1) about z = (0, 0, 1) and prints the result.
int main() {
vector3d x, y, z;
y[0] = 1; y[1] = 0; y[2] = -1;
z[0] = 0; z[1] = 0; z[2] = 1;
// dot(y, z) = -1, so x = y - 2 * (-1) * z = (1, 0, 1).
reflect(x, y, z);
std::cout << "Reflected ray is: [ " << x[0] << " " << x[1] << " " << x[2]
<< " ]" << std::endl;
}
----------------
The answer is late because yesterday my floppy disk had read errors, so I had
to copy the results once more. Fortunately, I was able to test with the
v3 patch and the brand-new gcc-inline-func-acct-v1 patch. Anyway, here are the
results:
code for reflect produced by g++-2.96-0.48mdk:
# g++ -O2 -I. ray.cc -S
----8<----
# reflect(reflection, ray, surfaceNormal) as emitted by g++ 2.96 -O2 (i386, AT&T syntax).
# Stack arguments: 8(%ebp) = &reflection, 12(%ebp) = &ray, 16(%ebp) = &surfaceNormal.
# The three doubles of each vector are accessed at offsets 0, 8 and 16.
# Below, r = ray, n = surfaceNormal; the whole expression runs on the x87 stack, no calls.
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp, %ebp
.LCFI1:
subl $280, %esp    # frame space is reserved but never accessed in this listing
.LCFI2:
movl 8(%ebp), %ecx    # ecx = &reflection
movl 12(%ebp), %edx    # edx = &ray
movl 16(%ebp), %eax    # eax = &surfaceNormal
movl %ebp, %esp    # frame released early; only registers are used from here on
fldl 8(%edx)    # st0 = r[1]
fldl 16(%edx)    # st0 = r[2], st1 = r[1]
fxch %st(1)    # st0 = r[1], st1 = r[2]
fmull 8(%eax)    # st0 = r[1]*n[1]
fldl (%edx)    # st0 = r[0], st1 = r[1]*n[1], st2 = r[2]
fxch %st(2)    # st0 = r[2], st2 = r[0]
fmull 16(%eax)    # st0 = r[2]*n[2]
fxch %st(2)    # st0 = r[0], st2 = r[2]*n[2]
fmull (%eax)    # st0 = r[0]*n[0]
fxch %st(1)    # st0 = r[1]*n[1], st1 = r[0]*n[0]
faddp %st, %st(2)    # st0 = r[0]*n[0], st1 = r[1]*n[1] + r[2]*n[2]
faddp %st, %st(1)    # st0 = dot(r, n)
fadd %st(0), %st    # st0 = 2*dot
fld %st(0)    # duplicate 2*dot for component 0
fmull (%eax)    # st0 = 2*dot*n[0]
fsubrl (%edx)    # st0 = r[0] - 2*dot*n[0]
fstpl (%ecx)    # reflection[0] = st0; pop
fld %st(0)    # duplicate 2*dot for component 1
fmull 8(%eax)    # st0 = 2*dot*n[1]
fsubrl 8(%edx)    # st0 = r[1] - 2*dot*n[1]
fstpl 8(%ecx)    # reflection[1] = st0; pop
fmull 16(%eax)    # last copy of 2*dot is consumed: st0 = 2*dot*n[2]
fsubrl 16(%edx)    # st0 = r[2] - 2*dot*n[2]
fstpl 16(%ecx)    # reflection[2] = st0; pop (x87 stack is empty again)
popl %ebp
ret
---->8----
Looks good. The code produced by -O is not worth showing...
# /opt/gcc/bin/g++ --version
2.95.3
# /opt/gcc/bin/g++ -O -I. ray.cc -S
----8<----
# reflect(reflection, ray, surfaceNormal) as emitted by g++ 2.95.3 -O (i386, AT&T syntax).
# The arithmetic is the same as in the source expression, but copies of 2*dot and of the
# two operand pointers are also written to many frame slots. Only -88(%ebp), -104(%ebp)
# and -116(%ebp) are read back within this listing; the other stores are never reloaded here.
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp,%ebp
.LCFI1:
subl $184,%esp
.LCFI2:
movl 8(%ebp),%ecx    # ecx = &reflection
movl 12(%ebp),%edx    # edx = &ray
movl 16(%ebp),%eax    # eax = &surfaceNormal
# dot(ray, surfaceNormal), accumulated from component 2 down to component 0
fldl 16(%edx)
fmull 16(%eax)
fldl 8(%edx)
fmull 8(%eax)
faddp %st,%st(1)
fldl (%edx)
fmull (%eax)
faddp %st,%st(1)
fadd %st(0),%st    # st0 = 2*dot
# copies of 2*dot (fstl) and of the operand pointers (movl) go to the frame
fstl -48(%ebp)
movl %eax,-52(%ebp)
fstl -40(%ebp)
movl %eax,-32(%ebp)
fstl -28(%ebp)
movl %eax,-20(%ebp)
movl %edx,-72(%ebp)
fstl -84(%ebp)
movl %eax,-76(%ebp)
movl %edx,-68(%ebp)
fstl -64(%ebp)
movl %eax,-56(%ebp)
movl %edx,-16(%ebp)
fstl -12(%ebp)
movl %eax,-4(%ebp)
movl %edx,-100(%ebp)
fstl -96(%ebp)
movl %eax,-88(%ebp)
movl %edx,%eax
movl %eax,-116(%ebp)
fld %st(0)
fstl -112(%ebp)
# pointers are reloaded from the frame; from here on eax = &ray and edx = &surfaceNormal
movl -88(%ebp),%eax
movl %eax,-104(%ebp)
movl -116(%ebp),%eax
movl -104(%ebp),%edx
fmull (%edx)
fsubrl (%eax)
fstpl (%ecx)    # reflection[0]
movl %eax,-132(%ebp)
fstl -128(%ebp)
movl %edx,-120(%ebp)
fld %st(0)
fmull 8(%edx)
fsubrl 8(%eax)
fstpl 8(%ecx)    # reflection[1]
movl %eax,-148(%ebp)
fstl -144(%ebp)
movl %edx,-136(%ebp)
fld %st(0)
fmull 16(%edx)
fsubrl 16(%eax)
fstpl 16(%ecx)    # reflection[2]
movl %eax,-164(%ebp)
fstpl -160(%ebp)    # pops the remaining copy of 2*dot; x87 stack is empty again
movl %edx,-152(%ebp)
movl %ebp,%esp
popl %ebp
ret
---->8----
# /opt/gcc/bin/g++ -O2 -I. ray.cc -S
----8<----
# reflect(reflection, ray, surfaceNormal) as emitted by g++ 2.95.3 -O2 (i386, AT&T syntax).
# The result stores to (%ecx), 8(%ecx) and 16(%ecx) are interleaved with stores of 2*dot
# (fstl/fstpl) and of the operand pointers (movl) into frame slots.
# None of those frame slots is read back within this listing.
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp,%ebp
.LCFI1:
subl $184,%esp
.LCFI2:
movl 12(%ebp),%edx    # edx = &ray
movl 16(%ebp),%eax    # eax = &surfaceNormal
fldl 16(%edx)
fmull 16(%eax)
fldl 8(%edx)
fmull 8(%eax)
faddp %st,%st(1)
fldl (%edx)
fldl (%eax)
fld %st(1)
fmul %st(1),%st
movl 8(%ebp),%ecx    # ecx = &reflection
movl %eax,-52(%ebp)
faddp %st,%st(3)    # completes the dot product
fxch %st(2)
movl %eax,-32(%ebp)
movl %eax,-20(%ebp)
fadd %st(0),%st    # st0 = 2*dot
movl %edx,-72(%ebp)
movl %eax,-76(%ebp)
fld %st(0)
fmulp %st,%st(3)
fxch %st(1)
movl %edx,-68(%ebp)
movl %eax,-56(%ebp)
movl %edx,-16(%ebp)
movl %eax,-4(%ebp)
movl %edx,-100(%ebp)
movl %eax,-88(%ebp)
fsubp %st,%st(2)
movl %edx,-116(%ebp)
movl %eax,-104(%ebp)
# eight copies of 2*dot written to the frame
fstl -48(%ebp)
fstl -40(%ebp)
fstl -28(%ebp)
fstl -84(%ebp)
fstl -64(%ebp)
fstl -12(%ebp)
fstl -96(%ebp)
fstl -112(%ebp)
fld %st(0)
fxch %st(2)
fstpl (%ecx)    # reflection[0]
movl %edx,-132(%ebp)
fstl -128(%ebp)
fxch %st(1)
movl %eax,-120(%ebp)
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%ecx)    # reflection[1]
movl %edx,-148(%ebp)
movl %eax,-136(%ebp)
fstl -144(%ebp)
fld %st(0)
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%ecx)    # reflection[2]
movl %edx,-164(%ebp)
movl %eax,-152(%ebp)
fstpl -160(%ebp)    # pops the remaining copy of 2*dot; x87 stack is empty again
movl %ebp,%esp
popl %ebp
ret
---->8----
Slightly better, but still a lot of indirect addressing.
Let's start with the g++-3.0.1-inline-heuristic-v2
# /opt/gcc3/bin/g++ --version
3.0.1
# /opt/gcc3/bin/g++ -O2 -I. ray.cc -S
----8<----
# reflect(reflection, ray, surfaceNormal) as emitted by g++ 3.0.1 (inline-heuristic-v2) -O2.
# The dot product and 2*dot are still computed inline, but five constructors of blitz
# expression objects are called out of line (the "C1E...RK..." tail of each mangled name
# denotes a constructor taking a const reference). Each call gets two pushed arguments,
# which the caller pops again afterwards.
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp, %ebp
.LCFI1:
pushl %edi
.LCFI2:
pushl %esi
.LCFI3:
movl 12(%ebp), %edx    # edx = &ray
pushl %ebx
.LCFI4:
subl $132, %esp
.LCFI5:
movl 16(%ebp), %eax    # eax = &surfaceNormal
fldl 8(%edx)
fldl 16(%edx)
fxch %st(1)
fmull 8(%eax)
fldl (%edx)
fxch %st(2)
fmull 16(%eax)
fxch %st(2)
leal -104(%ebp), %ebx
leal -88(%ebp), %edi
fmull (%eax)
fxch %st(1)
faddp %st, %st(2)
leal 4(%ebx), %ecx
movl %eax, -48(%ebp)
movl %eax, -64(%ebp)
movl %eax, -92(%ebp)
faddp %st, %st(1)    # st0 = dot(ray, surfaceNormal)
leal -36(%ebp), %eax
pushl %ecx
movl %edx, -104(%ebp)
movl %edx, -40(%ebp)
fadd %st(0), %st    # st0 = 2*dot
pushl %eax
leal -72(%ebp), %esi
fstl -88(%ebp)
fstl -56(%ebp)
fstl -72(%ebp)
fstpl -100(%ebp)    # last store pops 2*dot: the x87 stack is empty across the calls
movl %edx, -88(%ebp)
.LCFI6:
# call 1 of 5: constructor of the "constant * vector" expression object
call _ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ERKS7_
popl %ecx
movl -32(%ebp), %edx
popl %eax
movl %edx, -96(%ebp)
movl -40(%ebp), %eax
movl %eax, -104(%ebp)
pushl %ebx
movl -36(%ebp), %eax
pushl %edi
movl %eax, -100(%ebp)
movl -28(%ebp), %eax
movl %eax, -92(%ebp)
movb $0, -105(%ebp)
# call 2 of 5: constructor of the "vector - (constant * vector)" expression object
call _ZN5blitz13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS_11_bz_VecExprINS0_INS_19_bz_VecExprConstantIdEES2_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEC1ERKSC_
popl %eax
movl -76(%ebp), %eax
fldl -84(%ebp)
popl %edx
fmull (%eax)
movl -88(%ebp), %eax
fsubrl (%eax)
movl 8(%ebp), %eax    # eax = &reflection
fstpl (%eax)    # reflection[0]
pushl %edi
pushl %esi
# call 3 of 5: constructor of the wrapping _bz_VecExpr object
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
fldl -68(%ebp)
movl -60(%ebp), %eax
popl %ecx
popl %ebx
fmull 8(%eax)
movl -72(%ebp), %eax
fsubrl 8(%eax)
movl 8(%ebp), %eax
fstpl 8(%eax)    # reflection[1]
pushl %esi
leal -56(%ebp), %eax
pushl %eax
# call 4 of 5: same constructor as call 3
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
popl %eax
movl -44(%ebp), %eax
fldl -52(%ebp)
popl %edx
fmull 16(%eax)
movl -56(%ebp), %eax
fsubrl 16(%eax)
movl 8(%ebp), %eax
fstpl 16(%eax)    # reflection[2]
leal -56(%ebp), %eax
pushl %eax
leal -136(%ebp), %eax
pushl %eax
# call 5 of 5: same constructor again; the object at -136(%ebp) is not used afterwards in this listing
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
addl $16, %esp
leal -12(%ebp), %esp    # point esp at the three saved registers pushed in the prologue
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
[....]
# Out-of-line constructor that reflect() calls in this build (g++ 3.0.1 -O2).
# It copies three 32-bit words (12 bytes) from the object whose address is at 12(%ebp)
# into the object whose address is at 8(%ebp) - i.e. a plain member-wise copy.
_ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ERKS7_:
.LFB28:
pushl %ebp
.LCFI194:
movl %esp, %ebp
.LCFI195:
pushl %ebx    # ebx is used as the destination pointer, so it is saved and restored
.LCFI196:
movl 12(%ebp), %ecx    # ecx = source object
movl 8(%ebp), %ebx    # ebx = destination object
movl (%ecx), %eax
movl 4(%ecx), %edx
movl %edx, 4(%ebx)    # bytes 4..7
movl %eax, (%ebx)    # bytes 0..3
movl 8(%ecx), %eax
movl %eax, 8(%ebx)    # bytes 8..11
popl %ebx
popl %ebp
ret
[...etc...]
---->8----
# /opt/gcc3/bin/g++ -O3 -I. ray.cc -S
----8<----
# reflect(reflection, ray, surfaceNormal) as emitted by g++ 3.0.1 (inline-heuristic-v2) -O3.
# Raising the level to -O3 does not remove the out-of-line work: the dot product and 2*dot
# are computed inline, but five constructors of blitz expression objects are still called
# (the "C1E...RK..." tail of each mangled name denotes a constructor taking a const reference).
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB2:
pushl %ebp
.LCFI4:
movl %esp, %ebp
.LCFI5:
pushl %edi
.LCFI6:
pushl %esi
.LCFI7:
movl 12(%ebp), %edx    # edx = &ray
pushl %ebx
.LCFI8:
subl $132, %esp
.LCFI9:
movl 16(%ebp), %eax    # eax = &surfaceNormal
fldl 8(%edx)
fldl 16(%edx)
fxch %st(1)
fmull 8(%eax)
fldl (%edx)
fxch %st(2)
fmull 16(%eax)
fxch %st(2)
leal -104(%ebp), %ebx
leal -88(%ebp), %edi
fmull (%eax)
fxch %st(1)
faddp %st, %st(2)
leal 4(%ebx), %ecx
movl %edx, -104(%ebp)
movl %edx, -40(%ebp)
pushl %ecx
faddp %st, %st(1)    # st0 = dot(ray, surfaceNormal)
movl %eax, -48(%ebp)
movl %eax, -64(%ebp)
movl %eax, -92(%ebp)
leal -72(%ebp), %esi
fadd %st(0), %st    # st0 = 2*dot
fstl -88(%ebp)
movl %edx, -88(%ebp)
leal -36(%ebp), %edx
fstl -56(%ebp)
fstl -72(%ebp)
fstpl -100(%ebp)    # last store pops 2*dot: the x87 stack is empty across the calls
pushl %edx
.LCFI10:
# call 1 of 5: constructor of the "constant * vector" expression object
call _ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ERKS7_
popl %ecx
movl -32(%ebp), %ecx
popl %eax
movl %ecx, -96(%ebp)
movl -40(%ebp), %eax
movl %eax, -104(%ebp)
pushl %ebx
movl -36(%ebp), %eax
pushl %edi
movl %eax, -100(%ebp)
movl -28(%ebp), %eax
movl %eax, -92(%ebp)
movb $0, -105(%ebp)
# call 2 of 5: constructor of the "vector - (constant * vector)" expression object
call _ZN5blitz13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS_11_bz_VecExprINS0_INS_19_bz_VecExprConstantIdEES2_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEC1ERKSC_
popl %eax
movl -76(%ebp), %eax
fldl -84(%ebp)
popl %edx
fmull (%eax)
movl -88(%ebp), %eax
fsubrl (%eax)
movl 8(%ebp), %eax    # eax = &reflection
fstpl (%eax)    # reflection[0]
pushl %edi
pushl %esi
# call 3 of 5: constructor of the wrapping _bz_VecExpr object
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
fldl -68(%ebp)
movl -60(%ebp), %eax
popl %ebx
popl %edi
leal -56(%ebp), %ecx
fmull 8(%eax)
movl -72(%ebp), %eax
fsubrl 8(%eax)
movl 8(%ebp), %eax
fstpl 8(%eax)    # reflection[1]
pushl %esi
pushl %ecx
# call 4 of 5: same constructor as call 3
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
fldl -52(%ebp)
movl -44(%ebp), %eax
popl %edx
popl %ecx
leal -56(%ebp), %edx
fmull 16(%eax)
movl -56(%ebp), %eax
fsubrl 16(%eax)
movl 8(%ebp), %eax
fstpl 16(%eax)    # reflection[2]
pushl %edx
leal -136(%ebp), %edx
pushl %edx
# call 5 of 5: same constructor again; the object at -136(%ebp) is not used afterwards in this listing
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
addl $16, %esp
leal -12(%ebp), %esp    # point esp at the three saved registers pushed in the prologue
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
---->8----
Some Xpr (expression) functions aren't inlined, even with -O3 and the inline keyword.
# /opt/gcc3/bin/g++ -O3 -finline-limit=3000 -I. ray.cc -S
----8<----
# reflect(reflection, ray, surfaceNormal) as emitted by g++ 3.0.1 -O3 -finline-limit=3000.
# No call instructions remain. A few stores into the frame are still emitted
# (-89, -32, -48, -72, -40 and -56(%ebp)); none of those slots is read back in this listing.
# Below, r = ray and n = surfaceNormal.
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB2:
pushl %ebp
.LCFI4:
movl %esp, %ebp
.LCFI5:
subl $120, %esp
.LCFI6:
movl 12(%ebp), %edx    # edx = &ray
movl 16(%ebp), %eax    # eax = &surfaceNormal
movb $0, -89(%ebp)
fldl 8(%edx)    # st0 = r[1]
fldl 16(%edx)    # st0 = r[2], st1 = r[1]
fxch %st(1)
fmull 8(%eax)    # st0 = r[1]*n[1]
fldl (%edx)
fxch %st(2)
fmull 16(%eax)    # st0 = r[2]*n[2]
fxch %st(2)
movl 8(%ebp), %ecx    # ecx = &reflection
fmull (%eax)    # st0 = r[0]*n[0]
fxch %st(1)
faddp %st, %st(2)
movl %eax, -32(%ebp)
movl %eax, -48(%ebp)
faddp %st, %st(1)    # st0 = dot(r, n)
fadd %st(0), %st    # st0 = 2*dot
fld %st(0)    # st0 = st1 = 2*dot
fstl -72(%ebp)
fstl -40(%ebp)
fstl -56(%ebp)
fxch %st(1)
fmull (%eax)    # st0 = 2*dot*n[0], st1 = 2*dot
fld %st(1)    # one more copy of 2*dot for the later components
fxch %st(1)
fsubrl (%edx)    # st0 = r[0] - 2*dot*n[0]
fstpl (%ecx)    # reflection[0]
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%ecx)    # reflection[1]
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%ecx)    # reflection[2]; x87 stack is empty again
movl %ebp, %esp
popl %ebp
ret
---->8----
Some indirect calls are still there (but better code than before).
# /opt/gcc/bin/g++ -O2 -finline-limit=3000 -I. ray.cc -S
doesn't make any difference for reflect. The inline-limit threshold seems
to be between 1500 and 2000 for this case. I haven't checked your ideas yet.
Well, let's start with the g++-rec-inline-heuristics-v3 and
gcc-inline-func-acct-v1 patches applied (the next day):
# /opt/gcc3/bin/g++ -O2 -I. ray.cc -S
----8<----
# reflect(reflection, ray, surfaceNormal) as emitted by g++ 3.0.1 -O2 with the
# g++-rec-inline-heuristics-v3 and gcc-inline-func-acct-v1 patches applied.
# Component 0 of the result is now computed entirely inline, but three constructors of
# blitz expression objects are still called out of line before components 1 and 2 and
# once more at the end.
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp, %ebp
.LCFI1:
pushl %edi
.LCFI2:
pushl %esi
.LCFI3:
movl 12(%ebp), %edx    # edx = &ray
pushl %ebx
.LCFI4:
subl $132, %esp
.LCFI5:
movl 16(%ebp), %eax    # eax = &surfaceNormal
fldl 8(%edx)
fldl 16(%edx)
fxch %st(1)
fmull 8(%eax)
fldl (%edx)
fxch %st(2)
fmull 16(%eax)
fxch %st(2)
movb $0, -105(%ebp)
fmull (%eax)
fxch %st(1)
faddp %st, %st(2)
movl %edx, -40(%ebp)
movl %edx, -104(%ebp)
movl %eax, -48(%ebp)
movl %eax, -64(%ebp)
faddp %st, %st(1)    # st0 = dot(ray, surfaceNormal)
movl %eax, -28(%ebp)
movl %eax, -92(%ebp)
movl %eax, -76(%ebp)
movl 8(%ebp), %ebx    # ebx = &reflection, kept across the calls
fadd %st(0), %st    # st0 = 2*dot
leal -72(%ebp), %esi
leal -84(%ebp), %ecx
leal -56(%ebp), %edi
# six copies of 2*dot written to the frame
fstl -88(%ebp)
fstl -72(%ebp)
fstl -56(%ebp)
fstl -36(%ebp)
fstl -100(%ebp)
fstl -84(%ebp)
movl %edx, -88(%ebp)
fmull (%eax)
leal 4(%esi), %eax
fsubrl (%edx)
fstpl (%ebx)    # reflection[0]; x87 stack is empty across the calls
movl %edx, -72(%ebp)
pushl %ecx
pushl %eax
.LCFI6:
# call 1 of 3: constructor of the "constant * vector" expression object
call _ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ERKS7_
popl %ecx
popl %eax
movl -60(%ebp), %eax
fldl -68(%ebp)
fmull 8(%eax)
movl -72(%ebp), %eax
fsubrl 8(%eax)
fstpl 8(%ebx)    # reflection[1]
pushl %esi
pushl %edi
# call 2 of 3: constructor of the wrapping _bz_VecExpr object
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
popl %eax
movl -44(%ebp), %eax
fldl -52(%ebp)
popl %edx
fmull 16(%eax)
movl -56(%ebp), %eax
fsubrl 16(%eax)
leal -136(%ebp), %eax
fstpl 16(%ebx)    # reflection[2]
pushl %edi
pushl %eax
# call 3 of 3: same constructor as call 2; the object at -136(%ebp) is not used afterwards in this listing
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
addl $16, %esp
leal -12(%ebp), %esp    # point esp at the three saved registers pushed in the prologue
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
---->8----
Doesn't look good. Similar results with
# /opt/gcc3/bin/g++ -O3 -I. ray.cc -S
So let's move on to
# /opt/gcc/bin/g++ -O2 -finline-limit=3000 -I. ray.cc -S
cc1plus: Invalid option `-finline-limit=3000'
Oops! Has something gone wrong?
Well, the info pages list -fkeep-inline-functions as well:
# /opt/gcc/bin/g++ -O2 -fkeep-inline-functions -I. ray.cc -S
----8<----
# reflect(reflection, ray, surfaceNormal) as emitted by g++ 2.95.3 -O2 -fkeep-inline-functions.
# No calls; the result stores to (%ecx), 8(%ecx) and 16(%ecx) are interleaved with stores of
# 2*dot (fstl/fstpl) and of the operand pointers (movl) into frame slots.
# None of those frame slots is read back within this listing.
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp,%ebp
.LCFI1:
subl $184,%esp
.LCFI2:
movl 12(%ebp),%edx    # edx = &ray
movl 16(%ebp),%eax    # eax = &surfaceNormal
fldl 16(%edx)
fmull 16(%eax)
fldl 8(%edx)
fmull 8(%eax)
faddp %st,%st(1)
fldl (%edx)
fldl (%eax)
fld %st(1)
fmul %st(1),%st
movl 8(%ebp),%ecx    # ecx = &reflection
movl %eax,-52(%ebp)
faddp %st,%st(3)    # completes the dot product
fxch %st(2)
movl %eax,-32(%ebp)
movl %eax,-20(%ebp)
fadd %st(0),%st    # st0 = 2*dot
movl %edx,-72(%ebp)
movl %eax,-76(%ebp)
fld %st(0)
fmulp %st,%st(3)
fxch %st(1)
movl %edx,-68(%ebp)
movl %eax,-56(%ebp)
movl %edx,-16(%ebp)
movl %eax,-4(%ebp)
movl %edx,-100(%ebp)
movl %eax,-88(%ebp)
fsubp %st,%st(2)
movl %edx,-116(%ebp)
movl %eax,-104(%ebp)
# eight copies of 2*dot written to the frame
fstl -48(%ebp)
fstl -40(%ebp)
fstl -28(%ebp)
fstl -84(%ebp)
fstl -64(%ebp)
fstl -12(%ebp)
fstl -96(%ebp)
fstl -112(%ebp)
fld %st(0)
fxch %st(2)
fstpl (%ecx)    # reflection[0]
movl %edx,-132(%ebp)
fstl -128(%ebp)
fxch %st(1)
movl %eax,-120(%ebp)
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%ecx)    # reflection[1]
movl %edx,-148(%ebp)
movl %eax,-136(%ebp)
fstl -144(%ebp)
fld %st(0)
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%ecx)    # reflection[2]
movl %edx,-164(%ebp)
movl %eax,-152(%ebp)
fstpl -160(%ebp)    # pops the remaining copy of 2*dot; x87 stack is empty again
movl %ebp,%esp
popl %ebp
ret
---->8----
Wow, the optimizer should keep all functions declared inline as they are!
(Giving a penalty?) A lot of indirect addressing is still there 8(
# /opt/gcc/bin/g++ -O3 -fkeep-inline-functions -foptimize-register-move -fexpensive-optimizations -I. ray.cc -S
----8<----
# reflect(reflection, ray, surfaceNormal) as emitted by g++ 2.95.3 with
# -O3 -fkeep-inline-functions -foptimize-register-move -fexpensive-optimizations.
# No calls; the result stores to (%ecx), 8(%ecx) and 16(%ecx) are interleaved with stores of
# 2*dot (fstl/fstpl) and of the operand pointers (movl) into frame slots.
# None of those frame slots is read back within this listing.
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp,%ebp
.LCFI1:
subl $184,%esp
.LCFI2:
movl 12(%ebp),%edx    # edx = &ray
movl 16(%ebp),%eax    # eax = &surfaceNormal
fldl 16(%edx)
fmull 16(%eax)
fldl 8(%edx)
fmull 8(%eax)
faddp %st,%st(1)
fldl (%edx)
fldl (%eax)
fld %st(1)
fmul %st(1),%st
movl 8(%ebp),%ecx    # ecx = &reflection
movl %eax,-52(%ebp)
faddp %st,%st(3)    # completes the dot product
fxch %st(2)
movl %eax,-32(%ebp)
movl %eax,-20(%ebp)
fadd %st(0),%st    # st0 = 2*dot
movl %edx,-72(%ebp)
movl %eax,-76(%ebp)
fld %st(0)
fmulp %st,%st(3)
fxch %st(1)
movl %edx,-68(%ebp)
movl %eax,-56(%ebp)
movl %edx,-16(%ebp)
movl %eax,-4(%ebp)
movl %edx,-100(%ebp)
movl %eax,-88(%ebp)
fsubp %st,%st(2)
movl %edx,-116(%ebp)
movl %eax,-104(%ebp)
# eight copies of 2*dot written to the frame
fstl -48(%ebp)
fstl -40(%ebp)
fstl -28(%ebp)
fstl -84(%ebp)
fstl -64(%ebp)
fstl -12(%ebp)
fstl -96(%ebp)
fstl -112(%ebp)
fld %st(0)
fxch %st(2)
fstpl (%ecx)    # reflection[0]
movl %edx,-132(%ebp)
fstl -128(%ebp)
fxch %st(1)
movl %eax,-120(%ebp)
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%ecx)    # reflection[1]
movl %edx,-148(%ebp)
movl %eax,-136(%ebp)
fstl -144(%ebp)
fld %st(0)
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%ecx)    # reflection[2]
movl %edx,-164(%ebp)
movl %eax,-152(%ebp)
fstpl -160(%ebp)    # pops the remaining copy of 2*dot; x87 stack is empty again
movl %ebp,%esp
popl %ebp
ret
---->8----
Similar with fewer options. Where do the indirect calls come from? Inlining seems
to be only one of the problems, IMHO.
Regards
Olaf