This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Re: -finline-functions tuning (Using on ET)


Hello Kurt,

As I promised, here are some results with blitz-20001213. The test file is:

---- ray.cc ----
#include <iostream>
#include <cmath>
#include <blitz/tinyvec.h>

typedef blitz::TinyVector<double, 3> vector3d;

void reflect(vector3d& reflection, const vector3d& ray, const vector3d& surfaceNormal)
{
  using blitz::dot;
  reflection = ray - 2 * dot(ray,surfaceNormal) * surfaceNormal;
}

int main() {

  vector3d x, y, z;

  y[0] = 1;  y[1] = 0;  y[2] = -1;
  z[0] = 0;  z[1] = 0;  z[2] = 1;  
  reflect(x, y, z);
  std::cout << "Reflected ray is: [ " << x[0] << " " << x[1] << " " << x[2]
	    << " ]" << std::endl;
  
}
----------------

This answer is late because my floppy disk had read errors yesterday, so I had
to copy the results once more. Fortunately, I was able to test with the
v3 patch and the brand-new gcc-inline-func-acct-v1 patch. Anyway, here are the
results:


Code for reflect produced by g++-2.96-0.48mdk:
# g++ -O2 -I. ray.cc -S
----8<----
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
	pushl	%ebp
.LCFI0:
	movl	%esp, %ebp
.LCFI1:
	subl	$280, %esp
.LCFI2:
	movl	8(%ebp), %ecx
	movl	12(%ebp), %edx
	movl	16(%ebp), %eax
	movl	%ebp, %esp
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	faddp	%st, %st(1)
	fadd	%st(0), %st
	fld	%st(0)
	fmull	(%eax)
	fsubrl	(%edx)
	fstpl	(%ecx)
	fld	%st(0)
	fmull	8(%eax)
	fsubrl	8(%edx)
	fstpl	8(%ecx)
	fmull	16(%eax)
	fsubrl	16(%edx)
	fstpl	16(%ecx)
	popl	%ebp
	ret
---->8----
Looks good. The code produced by -O is not worth showing...

# /opt/gcc/bin/g++ --version
2.95.3 
# /opt/gcc/bin/g++ -O -I. ray.cc -S

----8<----
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
	pushl %ebp
.LCFI0:
	movl %esp,%ebp
.LCFI1:
	subl $184,%esp
.LCFI2:
	movl 8(%ebp),%ecx
	movl 12(%ebp),%edx
	movl 16(%ebp),%eax
	fldl 16(%edx)
	fmull 16(%eax)
	fldl 8(%edx)
	fmull 8(%eax)
	faddp %st,%st(1)
	fldl (%edx)
	fmull (%eax)
	faddp %st,%st(1)
	fadd %st(0),%st
	fstl -48(%ebp)
	movl %eax,-52(%ebp)
	fstl -40(%ebp)
	movl %eax,-32(%ebp)
	fstl -28(%ebp)
	movl %eax,-20(%ebp)
	movl %edx,-72(%ebp)
	fstl -84(%ebp)
	movl %eax,-76(%ebp)
	movl %edx,-68(%ebp)
	fstl -64(%ebp)
	movl %eax,-56(%ebp)
	movl %edx,-16(%ebp)
	fstl -12(%ebp)
	movl %eax,-4(%ebp)
	movl %edx,-100(%ebp)
	fstl -96(%ebp)
	movl %eax,-88(%ebp)
	movl %edx,%eax
	movl %eax,-116(%ebp)
	fld %st(0)
	fstl -112(%ebp)
	movl -88(%ebp),%eax
	movl %eax,-104(%ebp)
	movl -116(%ebp),%eax
	movl -104(%ebp),%edx
	fmull (%edx)
	fsubrl (%eax)
	fstpl (%ecx)
	movl %eax,-132(%ebp)
	fstl -128(%ebp)
	movl %edx,-120(%ebp)
	fld %st(0)
	fmull 8(%edx)
	fsubrl 8(%eax)
	fstpl 8(%ecx)
	movl %eax,-148(%ebp)
	fstl -144(%ebp)
	movl %edx,-136(%ebp)
	fld %st(0)
	fmull 16(%edx)
	fsubrl 16(%eax)
	fstpl 16(%ecx)
	movl %eax,-164(%ebp)
	fstpl -160(%ebp)
	movl %edx,-152(%ebp)
	movl %ebp,%esp
	popl %ebp
	ret
---->8----
# /opt/gcc/bin/g++ -O2 -I. ray.cc -S

----8<----
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
	pushl %ebp
.LCFI0:
	movl %esp,%ebp
.LCFI1:
	subl $184,%esp
.LCFI2:
	movl 12(%ebp),%edx
	movl 16(%ebp),%eax
	fldl 16(%edx)
	fmull 16(%eax)
	fldl 8(%edx)
	fmull 8(%eax)
	faddp %st,%st(1)
	fldl (%edx)
	fldl (%eax)
	fld %st(1)
	fmul %st(1),%st
	movl 8(%ebp),%ecx
	movl %eax,-52(%ebp)
	faddp %st,%st(3)
	fxch %st(2)
	movl %eax,-32(%ebp)
	movl %eax,-20(%ebp)
	fadd %st(0),%st
	movl %edx,-72(%ebp)
	movl %eax,-76(%ebp)
	fld %st(0)
	fmulp %st,%st(3)
	fxch %st(1)
	movl %edx,-68(%ebp)
	movl %eax,-56(%ebp)
	movl %edx,-16(%ebp)
	movl %eax,-4(%ebp)
	movl %edx,-100(%ebp)
	movl %eax,-88(%ebp)
	fsubp %st,%st(2)
	movl %edx,-116(%ebp)
	movl %eax,-104(%ebp)
	fstl -48(%ebp)
	fstl -40(%ebp)
	fstl -28(%ebp)
	fstl -84(%ebp)
	fstl -64(%ebp)
	fstl -12(%ebp)
	fstl -96(%ebp)
	fstl -112(%ebp)
	fld %st(0)
	fxch %st(2)
	fstpl (%ecx)
	movl %edx,-132(%ebp)
	fstl -128(%ebp)
	fxch %st(1)
	movl %eax,-120(%ebp)
	fmull 8(%eax)
	fsubrl 8(%edx)
	fstpl 8(%ecx)
	movl %edx,-148(%ebp)
	movl %eax,-136(%ebp)
	fstl -144(%ebp)
	fld %st(0)
	fmull 16(%eax)
	fsubrl 16(%edx)
	fstpl 16(%ecx)
	movl %edx,-164(%ebp)
	movl %eax,-152(%ebp)
	fstpl -160(%ebp)
	movl %ebp,%esp
	popl %ebp
	ret
---->8----

Slightly better, but there is still a lot of indirect addressing.
Let's start with g++-3.0.1-inline-heuristic-v2:

# /opt/gcc3/bin/g++ --version
3.0.1
# /opt/gcc3/bin/g++ -O2 -I. ray.cc -S

----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB1:
	pushl	%ebp
.LCFI0:
	movl	%esp, %ebp
.LCFI1:
	pushl	%edi
.LCFI2:
	pushl	%esi
.LCFI3:
	movl	12(%ebp), %edx
	pushl	%ebx
.LCFI4:
	subl	$132, %esp
.LCFI5:
	movl	16(%ebp), %eax
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	leal	-104(%ebp), %ebx
	leal	-88(%ebp), %edi
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	leal	4(%ebx), %ecx
	movl	%eax, -48(%ebp)
	movl	%eax, -64(%ebp)
	movl	%eax, -92(%ebp)
	faddp	%st, %st(1)
	leal	-36(%ebp), %eax
	pushl	%ecx
	movl	%edx, -104(%ebp)
	movl	%edx, -40(%ebp)
	fadd	%st(0), %st
	pushl	%eax
	leal	-72(%ebp), %esi
	fstl	-88(%ebp)
	fstl	-56(%ebp)
	fstl	-72(%ebp)
	fstpl	-100(%ebp)
	movl	%edx, -88(%ebp)
.LCFI6:
	call	_ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ERKS7_
	popl	%ecx
	movl	-32(%ebp), %edx
	popl	%eax
	movl	%edx, -96(%ebp)
	movl	-40(%ebp), %eax
	movl	%eax, -104(%ebp)
	pushl	%ebx
	movl	-36(%ebp), %eax
	pushl	%edi
	movl	%eax, -100(%ebp)
	movl	-28(%ebp), %eax
	movl	%eax, -92(%ebp)
	movb	$0, -105(%ebp)
	call	_ZN5blitz13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS_11_bz_VecExprINS0_INS_19_bz_VecExprConstantIdEES2_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEC1ERKSC_
	popl	%eax
	movl	-76(%ebp), %eax
	fldl	-84(%ebp)
	popl	%edx
	fmull	(%eax)
	movl	-88(%ebp), %eax
	fsubrl	(%eax)
	movl	8(%ebp), %eax
	fstpl	(%eax)
	pushl	%edi
	pushl	%esi
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	fldl	-68(%ebp)
	movl	-60(%ebp), %eax
	popl	%ecx
	popl	%ebx
	fmull	8(%eax)
	movl	-72(%ebp), %eax
	fsubrl	8(%eax)
	movl	8(%ebp), %eax
	fstpl	8(%eax)
	pushl	%esi
	leal	-56(%ebp), %eax
	pushl	%eax
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	popl	%eax
	movl	-44(%ebp), %eax
	fldl	-52(%ebp)
	popl	%edx
	fmull	16(%eax)
	movl	-56(%ebp), %eax
	fsubrl	16(%eax)
	movl	8(%ebp), %eax
	fstpl	16(%eax)
	leal	-56(%ebp), %eax
	pushl	%eax
	leal	-136(%ebp), %eax
	pushl	%eax
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	addl	$16, %esp
	leal	-12(%ebp), %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

[....]
_ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ERKS7_:
.LFB28:
	pushl	%ebp
.LCFI194:
	movl	%esp, %ebp
.LCFI195:
	pushl	%ebx
.LCFI196:
	movl	12(%ebp), %ecx
	movl	8(%ebp), %ebx
	movl	(%ecx), %eax
	movl	4(%ecx), %edx
	movl	%edx, 4(%ebx)
	movl	%eax, (%ebx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%ebx)
	popl	%ebx
	popl	%ebp
	ret
[...etc...]
---->8----

# /opt/gcc3/bin/g++ -O3 -I. ray.cc -S

----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB2:
	pushl	%ebp
.LCFI4:
	movl	%esp, %ebp
.LCFI5:
	pushl	%edi
.LCFI6:
	pushl	%esi
.LCFI7:
	movl	12(%ebp), %edx
	pushl	%ebx
.LCFI8:
	subl	$132, %esp
.LCFI9:
	movl	16(%ebp), %eax
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	leal	-104(%ebp), %ebx
	leal	-88(%ebp), %edi
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	leal	4(%ebx), %ecx
	movl	%edx, -104(%ebp)
	movl	%edx, -40(%ebp)
	pushl	%ecx
	faddp	%st, %st(1)
	movl	%eax, -48(%ebp)
	movl	%eax, -64(%ebp)
	movl	%eax, -92(%ebp)
	leal	-72(%ebp), %esi
	fadd	%st(0), %st
	fstl	-88(%ebp)
	movl	%edx, -88(%ebp)
	leal	-36(%ebp), %edx
	fstl	-56(%ebp)
	fstl	-72(%ebp)
	fstpl	-100(%ebp)
	pushl	%edx
.LCFI10:
	call	_ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ERKS7_
	popl	%ecx
	movl	-32(%ebp), %ecx
	popl	%eax
	movl	%ecx, -96(%ebp)
	movl	-40(%ebp), %eax
	movl	%eax, -104(%ebp)
	pushl	%ebx
	movl	-36(%ebp), %eax
	pushl	%edi
	movl	%eax, -100(%ebp)
	movl	-28(%ebp), %eax
	movl	%eax, -92(%ebp)
	movb	$0, -105(%ebp)
	call	_ZN5blitz13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS_11_bz_VecExprINS0_INS_19_bz_VecExprConstantIdEES2_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEC1ERKSC_
	popl	%eax
	movl	-76(%ebp), %eax
	fldl	-84(%ebp)
	popl	%edx
	fmull	(%eax)
	movl	-88(%ebp), %eax
	fsubrl	(%eax)
	movl	8(%ebp), %eax
	fstpl	(%eax)
	pushl	%edi
	pushl	%esi
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	fldl	-68(%ebp)
	movl	-60(%ebp), %eax
	popl	%ebx
	popl	%edi
	leal	-56(%ebp), %ecx
	fmull	8(%eax)
	movl	-72(%ebp), %eax
	fsubrl	8(%eax)
	movl	8(%ebp), %eax
	fstpl	8(%eax)
	pushl	%esi
	pushl	%ecx
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	fldl	-52(%ebp)
	movl	-44(%ebp), %eax
	popl	%edx
	popl	%ecx
	leal	-56(%ebp), %edx
	fmull	16(%eax)
	movl	-56(%ebp), %eax
	fsubrl	16(%eax)
	movl	8(%ebp), %eax
	fstpl	16(%eax)
	pushl	%edx
	leal	-136(%ebp), %edx
	pushl	%edx
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	addl	$16, %esp
	leal	-12(%ebp), %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
---->8----

Some expression (Xpr) constructors aren't inlined, even with -O3 and the inline keyword.

# /opt/gcc3/bin/g++ -O3 -finline-limit=3000 -I. ray.cc -S

----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB2:
	pushl	%ebp
.LCFI4:
	movl	%esp, %ebp
.LCFI5:
	subl	$120, %esp
.LCFI6:
	movl	12(%ebp), %edx
	movl	16(%ebp), %eax
	movb	$0, -89(%ebp)
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	movl	8(%ebp), %ecx
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	movl	%eax, -32(%ebp)
	movl	%eax, -48(%ebp)
	faddp	%st, %st(1)
	fadd	%st(0), %st
	fld	%st(0)
	fstl	-72(%ebp)
	fstl	-40(%ebp)
	fstl	-56(%ebp)
	fxch	%st(1)
	fmull	(%eax)
	fld	%st(1)
	fxch	%st(1)
	fsubrl	(%edx)
	fstpl	(%ecx)
	fmull	8(%eax)
	fsubrl	8(%edx)
	fstpl	8(%ecx)
	fmull	16(%eax)
	fsubrl	16(%edx)
	fstpl	16(%ecx)
	movl	%ebp, %esp
	popl	%ebp
	ret
---->8----

Some indirect calls are still there (but better code than before).

# /opt/gcc/bin/g++ -O2 -finline-limit=3000 -I. ray.cc -S
doesn't make any difference with respect to reflect. The inline limit seems
to be between 1500 and 2000 for this case. I haven't checked your ideas yet.

Well, let's start with the g++-rec-inline-heuristics-v3 and
gcc-inline-func-acct-v1 patches applied (the next day):

# /opt/gcc3/bin/g++ -O2  -I. ray.cc -S

----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB1:
	pushl	%ebp
.LCFI0:
	movl	%esp, %ebp
.LCFI1:
	pushl	%edi
.LCFI2:
	pushl	%esi
.LCFI3:
	movl	12(%ebp), %edx
	pushl	%ebx
.LCFI4:
	subl	$132, %esp
.LCFI5:
	movl	16(%ebp), %eax
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	movb	$0, -105(%ebp)
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	movl	%edx, -40(%ebp)
	movl	%edx, -104(%ebp)
	movl	%eax, -48(%ebp)
	movl	%eax, -64(%ebp)
	faddp	%st, %st(1)
	movl	%eax, -28(%ebp)
	movl	%eax, -92(%ebp)
	movl	%eax, -76(%ebp)
	movl	8(%ebp), %ebx
	fadd	%st(0), %st
	leal	-72(%ebp), %esi
	leal	-84(%ebp), %ecx
	leal	-56(%ebp), %edi
	fstl	-88(%ebp)
	fstl	-72(%ebp)
	fstl	-56(%ebp)
	fstl	-36(%ebp)
	fstl	-100(%ebp)
	fstl	-84(%ebp)
	movl	%edx, -88(%ebp)
	fmull	(%eax)
	leal	4(%esi), %eax
	fsubrl	(%edx)
	fstpl	(%ebx)
	movl	%edx, -72(%ebp)
	pushl	%ecx
	pushl	%eax
.LCFI6:
	call	_ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ERKS7_
	popl	%ecx
	popl	%eax
	movl	-60(%ebp), %eax
	fldl	-68(%ebp)
	fmull	8(%eax)
	movl	-72(%ebp), %eax
	fsubrl	8(%eax)
	fstpl	8(%ebx)
	pushl	%esi
	pushl	%edi
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	popl	%eax
	movl	-44(%ebp), %eax
	fldl	-52(%ebp)
	popl	%edx
	fmull	16(%eax)
	movl	-56(%ebp), %eax
	fsubrl	16(%eax)
	leal	-136(%ebp), %eax
	fstpl	16(%ebx)
	pushl	%edi
	pushl	%eax
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	addl	$16, %esp
	leal	-12(%ebp), %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
---->8----
This doesn't look good. The results are similar with
# /opt/gcc3/bin/g++ -O3  -I. ray.cc -S

So let's move on with

# /opt/gcc/bin/g++ -O2 -finline-limit=3000 -I. ray.cc -S
cc1plus: Invalid option `-finline-limit=3000'

Oops! Has something gone wrong?

Well, the info pages show -fkeep-inline-functions as well:
# /opt/gcc/bin/g++ -O2 -fkeep-inline-functions  -I. ray.cc -S

----8<----
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
	pushl %ebp
.LCFI0:
	movl %esp,%ebp
.LCFI1:
	subl $184,%esp
.LCFI2:
	movl 12(%ebp),%edx
	movl 16(%ebp),%eax
	fldl 16(%edx)
	fmull 16(%eax)
	fldl 8(%edx)
	fmull 8(%eax)
	faddp %st,%st(1)
	fldl (%edx)
	fldl (%eax)
	fld %st(1)
	fmul %st(1),%st
	movl 8(%ebp),%ecx
	movl %eax,-52(%ebp)
	faddp %st,%st(3)
	fxch %st(2)
	movl %eax,-32(%ebp)
	movl %eax,-20(%ebp)
	fadd %st(0),%st
	movl %edx,-72(%ebp)
	movl %eax,-76(%ebp)
	fld %st(0)
	fmulp %st,%st(3)
	fxch %st(1)
	movl %edx,-68(%ebp)
	movl %eax,-56(%ebp)
	movl %edx,-16(%ebp)
	movl %eax,-4(%ebp)
	movl %edx,-100(%ebp)
	movl %eax,-88(%ebp)
	fsubp %st,%st(2)
	movl %edx,-116(%ebp)
	movl %eax,-104(%ebp)
	fstl -48(%ebp)
	fstl -40(%ebp)
	fstl -28(%ebp)
	fstl -84(%ebp)
	fstl -64(%ebp)
	fstl -12(%ebp)
	fstl -96(%ebp)
	fstl -112(%ebp)
	fld %st(0)
	fxch %st(2)
	fstpl (%ecx)
	movl %edx,-132(%ebp)
	fstl -128(%ebp)
	fxch %st(1)
	movl %eax,-120(%ebp)
	fmull 8(%eax)
	fsubrl 8(%edx)
	fstpl 8(%ecx)
	movl %edx,-148(%ebp)
	movl %eax,-136(%ebp)
	fstl -144(%ebp)
	fld %st(0)
	fmull 16(%eax)
	fsubrl 16(%edx)
	fstpl 16(%ecx)
	movl %edx,-164(%ebp)
	movl %eax,-152(%ebp)
	fstpl -160(%ebp)
	movl %ebp,%esp
	popl %ebp
	ret
---->8----

Wow, the optimizer should keep all functions declared inline as they are!
(At some penalty?) A lot of indirect addressing is still there 8(

# /opt/gcc/bin/g++ -O3 -fkeep-inline-functions -foptimize-register-move -fexpensive-optimizations -I. ray.cc -S

----8<----
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
	pushl %ebp
.LCFI0:
	movl %esp,%ebp
.LCFI1:
	subl $184,%esp
.LCFI2:
	movl 12(%ebp),%edx
	movl 16(%ebp),%eax
	fldl 16(%edx)
	fmull 16(%eax)
	fldl 8(%edx)
	fmull 8(%eax)
	faddp %st,%st(1)
	fldl (%edx)
	fldl (%eax)
	fld %st(1)
	fmul %st(1),%st
	movl 8(%ebp),%ecx
	movl %eax,-52(%ebp)
	faddp %st,%st(3)
	fxch %st(2)
	movl %eax,-32(%ebp)
	movl %eax,-20(%ebp)
	fadd %st(0),%st
	movl %edx,-72(%ebp)
	movl %eax,-76(%ebp)
	fld %st(0)
	fmulp %st,%st(3)
	fxch %st(1)
	movl %edx,-68(%ebp)
	movl %eax,-56(%ebp)
	movl %edx,-16(%ebp)
	movl %eax,-4(%ebp)
	movl %edx,-100(%ebp)
	movl %eax,-88(%ebp)
	fsubp %st,%st(2)
	movl %edx,-116(%ebp)
	movl %eax,-104(%ebp)
	fstl -48(%ebp)
	fstl -40(%ebp)
	fstl -28(%ebp)
	fstl -84(%ebp)
	fstl -64(%ebp)
	fstl -12(%ebp)
	fstl -96(%ebp)
	fstl -112(%ebp)
	fld %st(0)
	fxch %st(2)
	fstpl (%ecx)
	movl %edx,-132(%ebp)
	fstl -128(%ebp)
	fxch %st(1)
	movl %eax,-120(%ebp)
	fmull 8(%eax)
	fsubrl 8(%edx)
	fstpl 8(%ecx)
	movl %edx,-148(%ebp)
	movl %eax,-136(%ebp)
	fstl -144(%ebp)
	fld %st(0)
	fmull 16(%eax)
	fsubrl 16(%edx)
	fstpl 16(%ecx)
	movl %edx,-164(%ebp)
	movl %eax,-152(%ebp)
	fstpl -160(%ebp)
	movl %ebp,%esp
	popl %ebp
	ret
---->8----
The result is similar with fewer options. Where do the indirect calls come from? Inlining seems
to be only one of the problems, IMHO.

Regards
Olaf


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]