This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Re: -finline-functions tuning (Using on ET)


Hello Kurts,

once more 8)

Let's recall the code produced by Mandrake's g++:

# g++ -I. -O2 -S ray.cc
----8<----
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
	pushl	%ebp
.LCFI0:
	movl	%esp, %ebp
.LCFI1:
	subl	$280, %esp
.LCFI2:
	movl	8(%ebp), %ecx
	movl	12(%ebp), %edx
	movl	16(%ebp), %eax
	movl	%ebp, %esp
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	faddp	%st, %st(1)
	fadd	%st(0), %st
	fld	%st(0)
	fmull	(%eax)
	fsubrl	(%edx)
	fstpl	(%ecx)
	fld	%st(0)
	fmull	8(%eax)
	fsubrl	8(%edx)
	fstpl	8(%ecx)
	fmull	16(%eax)
	fsubrl	16(%edx)
	fstpl	16(%ecx)
	popl	%ebp
	ret
---->8----

And now check gcc-3.0.1 with the patch:

# /opt/gcc3/bin/g++ -I. -O2 -S ray.cc

----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB1:
	pushl	%ebp
.LCFI0:
	movl	%esp, %ebp
.LCFI1:
	pushl	%edi
.LCFI2:
	pushl	%esi
.LCFI3:
	movl	12(%ebp), %edx
	pushl	%ebx
.LCFI4:
	subl	$132, %esp
.LCFI5:
	movl	16(%ebp), %eax
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	movb	$0, -105(%ebp)
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	movl	%edx, -40(%ebp)
	movl	%edx, -104(%ebp)
	movl	%eax, -48(%ebp)
	movl	%eax, -64(%ebp)
	faddp	%st, %st(1)
	movl	%eax, -28(%ebp)
	movl	%eax, -92(%ebp)
	movl	%eax, -76(%ebp)
	movl	8(%ebp), %ebx
	fadd	%st(0), %st
	leal	-72(%ebp), %esi
	leal	-84(%ebp), %ecx
	leal	-56(%ebp), %edi
	fstl	-88(%ebp)
	fstl	-72(%ebp)
	fstl	-56(%ebp)
	fstl	-36(%ebp)
	fstl	-100(%ebp)
	fstl	-84(%ebp)
	movl	%edx, -88(%ebp)
	fmull	(%eax)
	leal	4(%esi), %eax
	fsubrl	(%edx)
	fstpl	(%ebx)
	movl	%edx, -72(%ebp)
	pushl	%ecx
	pushl	%eax
.LCFI6:
	call	_ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ERKS7_
	popl	%ecx
	popl	%eax
	movl	-60(%ebp), %eax
	fldl	-68(%ebp)
	fmull	8(%eax)
	movl	-72(%ebp), %eax
	fsubrl	8(%eax)
	fstpl	8(%ebx)
	pushl	%esi
	pushl	%edi
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	popl	%eax
	movl	-44(%ebp), %eax
	fldl	-52(%ebp)
	popl	%edx
	fmull	16(%eax)
	movl	-56(%ebp), %eax
	fsubrl	16(%eax)
	leal	-136(%ebp), %eax
	fstpl	16(%ebx)
	pushl	%edi
	pushl	%eax
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	addl	$16, %esp
	leal	-12(%ebp), %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
---->8----

# /opt/gcc3/bin/g++ -I. -O3 -S ray.cc 

----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB2:
	pushl	%ebp
.LCFI4:
	movl	%esp, %ebp
.LCFI5:
	pushl	%edi
.LCFI6:
	pushl	%esi
.LCFI7:
	pushl	%ebx
.LCFI8:
	movl	16(%ebp), %eax
	movl	12(%ebp), %ebx
	addl	$-128, %esp
.LCFI9:
	leal	-104(%ebp), %esi
	leal	-88(%ebp), %edi
	fldl	8(%ebx)
	fldl	16(%ebx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%ebx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	pushl	%esi
	movl	%eax, -104(%ebp)
	pushl	%edi
	leal	-72(%ebp), %edi
	faddp	%st, %st(1)
	pushl	%edi
	fadd	%st(0), %st
	fstpl	-88(%ebp)
.LCFI10:
	call	_ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ES2_S4_
	leal	-56(%ebp), %edx
	popl	%ecx
	popl	%eax
	pushl	%edi
	pushl	%edx
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEEEC1ES8_
	popl	%eax
	leal	-56(%ebp), %ecx
	popl	%edx
	movl	%ebx, -88(%ebp)
	pushl	%ecx
	pushl	%edi
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEEEC1ERKS9_
	addl	$12, %esp
	leal	-88(%ebp), %edx
	pushl	%edi
	pushl	%edx
	pushl	%esi
	call	_ZN5blitz13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS_11_bz_VecExprINS0_INS_19_bz_VecExprConstantIdEES2_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEC1ES2_S9_
	popl	%ecx
	popl	%ebx
	leal	-40(%ebp), %ebx
	pushl	%esi
	pushl	%ebx
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ESC_
	popl	%eax
	popl	%edx
	pushl	%ebx
	pushl	%esi
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	popl	%ecx
	popl	%ebx
	leal	-88(%ebp), %ebx
	movb	$0, -105(%ebp)
	pushl	%esi
	leal	-88(%ebp), %esi
	pushl	%esi
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	popl	%eax
	movl	-76(%ebp), %eax
	fldl	-84(%ebp)
	popl	%edx
	fmull	(%eax)
	movl	-88(%ebp), %eax
	fsubrl	(%eax)
	movl	8(%ebp), %eax
	fstpl	(%eax)
	pushl	%ebx
	pushl	%edi
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	fldl	-68(%ebp)
	movl	-60(%ebp), %eax
	popl	%ebx
	popl	%esi
	leal	-56(%ebp), %ecx
	fmull	8(%eax)
	movl	-72(%ebp), %eax
	fsubrl	8(%eax)
	movl	8(%ebp), %eax
	fstpl	8(%eax)
	pushl	%edi
	pushl	%ecx
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	fldl	-52(%ebp)
	movl	-44(%ebp), %eax
	popl	%edx
	popl	%ecx
	leal	-56(%ebp), %edx
	fmull	16(%eax)
	movl	-56(%ebp), %eax
	fsubrl	16(%eax)
	movl	8(%ebp), %eax
	fstpl	16(%eax)
	pushl	%edx
	leal	-136(%ebp), %edx
	pushl	%edx
	call	_ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
	addl	$16, %esp
	leal	-12(%ebp), %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
---->8----
Hmm, something seems to be wrong with -O3. There are more calls than with -O2!
Never mind the effect on code size.

# /opt/gcc3/bin/g++ -I. -O2 -finline-limit=2000 -S ray.cc

----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB1:
	pushl	%ebp
.LCFI0:
	movl	%esp, %ebp
.LCFI1:
	subl	$120, %esp
.LCFI2:
	movl	12(%ebp), %edx
	movl	16(%ebp), %eax
	movb	$0, -89(%ebp)
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	movl	8(%ebp), %ecx
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	movl	%eax, -32(%ebp)
	movl	%eax, -48(%ebp)
	faddp	%st, %st(1)
	fadd	%st(0), %st
	fld	%st(0)
	fstl	-72(%ebp)
	fstl	-40(%ebp)
	fstl	-56(%ebp)
	fxch	%st(1)
	fmull	(%eax)
	fsubrl	(%edx)
	fstpl	(%ecx)
	fld	%st(0)
	fmull	8(%eax)
	fsubrl	8(%edx)
	fstpl	8(%ecx)
	fmull	16(%eax)
	fsubrl	16(%edx)
	fstpl	16(%ecx)
	movl	%ebp, %esp
	popl	%ebp
	ret

---->8----

Looks even better. Let's try

# /opt/gcc3/bin/g++ -I. -O -finline-limit=2000 -S ray.cc

----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB1:
	pushl	%ebp
.LCFI0:
	movl	%esp, %ebp
.LCFI1:
	pushl	%esi
.LCFI2:
	pushl	%ebx
.LCFI3:
	subl	$112, %esp
.LCFI4:
	movl	12(%ebp), %edx
	movl	16(%ebp), %eax
	movl	8(%ebp), %esi
	fldl	(%edx)
	fmull	(%eax)
	fldl	8(%edx)
	fmull	8(%eax)
	fldl	16(%edx)
	fmull	16(%eax)
	faddp	%st, %st(1)
	faddp	%st, %st(1)
	fadd	%st(0), %st
	fstl	-72(%ebp)
	leal	-88(%ebp), %ebx
	fstl	-40(%ebp)
	movl	%eax, -32(%ebp)
	leal	-24(%ebp), %ecx
	movl	%edx, -72(%ebp)
	fstl	-56(%ebp)
	movl	%eax, -48(%ebp)
	movl	%edx, -24(%ebp)
	fstl	4(%ecx)
	movl	%eax, 12(%ecx)
	movl	%edx, -88(%ebp)
	fstpl	4(%ebx)
	movl	%eax, 12(%ebx)
	movb	$0, -89(%ebp)
	movb	$1, %al
	testb	%al, %al
	je	.L55
	leal	-72(%ebp), %ecx
	movl	%ebx, %eax
	movl	-88(%ebp), %edx
	movl	%edx, -72(%ebp)
	fldl	4(%eax)
	fstl	4(%ecx)
	movl	12(%eax), %eax
	movl	%eax, 12(%ecx)
	fld	%st(0)
	fmull	(%eax)
	fsubrl	(%edx)
	fstpl	(%esi)
	leal	-56(%ebp), %ecx
	movl	%edx, -56(%ebp)
	fstl	4(%ecx)
	movl	%eax, 12(%ecx)
	fld	%st(0)
	fmull	8(%eax)
	fsubrl	8(%edx)
	fstpl	8(%esi)
	leal	-40(%ebp), %ecx
	movl	%edx, -40(%ebp)
	fstl	4(%ecx)
	movl	%eax, 12(%ecx)
	fld	%st(0)
	fmull	16(%eax)
	fsubrl	16(%edx)
	fstpl	16(%esi)
	leal	-120(%ebp), %ecx
	movl	%edx, -120(%ebp)
	jmp	.L199
	.p2align 4,,7
.L55:
	leal	-120(%ebp), %ecx
	leal	-88(%ebp), %eax
	movl	-88(%ebp), %edx
	movl	%edx, -120(%ebp)
	fldl	4(%eax)
	fstl	4(%ecx)
	movl	12(%eax), %eax
	movl	%eax, 12(%ecx)
	fld	%st(0)
	fmull	(%eax)
	fsubrl	(%edx)
	fstpl	(%esi)
	leal	-72(%ebp), %ecx
	movl	%edx, -72(%ebp)
	fstl	4(%ecx)
	movl	%eax, 12(%ecx)
	fld	%st(0)
	fmull	8(%eax)
	fsubrl	8(%edx)
	fstpl	8(%esi)
	leal	-56(%ebp), %ecx
	movl	%edx, -56(%ebp)
	fstl	4(%ecx)
	movl	%eax, 12(%ecx)
	fld	%st(0)
	fmull	16(%eax)
	fsubrl	16(%edx)
	fstpl	16(%esi)
	leal	-40(%ebp), %ecx
	movl	%edx, -40(%ebp)
.L199:
	fstpl	4(%ecx)
	movl	%eax, 12(%ecx)
	addl	$112, %esp
	popl	%ebx
	popl	%esi
	popl	%ebp
	ret
---->8----

Well, ET (expression templates) seems to depend on the inliner and on some
other optimizations that are implicitly enabled by -O2.

# /opt/gcc3/bin/g++ -I. -O3 -finline-limit=2000 -S ray.cc

----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB2:
	pushl	%ebp
.LCFI4:
	movl	%esp, %ebp
.LCFI5:
	subl	$120, %esp
.LCFI6:
	movl	12(%ebp), %edx
	movl	16(%ebp), %eax
	movb	$0, -89(%ebp)
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	movl	8(%ebp), %ecx
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	movl	%eax, -32(%ebp)
	movl	%eax, -48(%ebp)
	faddp	%st, %st(1)
	fadd	%st(0), %st
	fld	%st(0)
	fstl	-72(%ebp)
	fstl	-40(%ebp)
	fstl	-56(%ebp)
	fxch	%st(1)
	fmull	(%eax)
	fld	%st(1)
	fxch	%st(1)
	fsubrl	(%edx)
	fstpl	(%ecx)
	fmull	8(%eax)
	fsubrl	8(%edx)
	fstpl	8(%ecx)
	fmull	16(%eax)
	fsubrl	16(%edx)
	fstpl	16(%ecx)
	movl	%ebp, %esp
	popl	%ebp
	ret
---->8----

-fsave-memoized
-fforce-mem
-fforce-addr 
don't change the relevant code. So let's try

# /opt/gcc3/bin/g++ -I. -O3 -finline-limit=2000 -fomit-frame-pointer -S ray.cc

----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB2:
	subl	$124, %esp
.LCFI12:
	movl	132(%esp), %edx
	movl	136(%esp), %eax
	movb	$0, 31(%esp)
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	movl	128(%esp), %ecx
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	movl	%eax, 88(%esp)
	movl	%eax, 72(%esp)
	faddp	%st, %st(1)
	fadd	%st(0), %st
	fld	%st(0)
	fstl	48(%esp)
	fstl	80(%esp)
	fstl	64(%esp)
	fxch	%st(1)
	fmull	(%eax)
	fld	%st(1)
	fxch	%st(1)
	fsubrl	(%edx)
	fstpl	(%ecx)
	fmull	8(%eax)
	fsubrl	8(%edx)
	fstpl	8(%ecx)
	fmull	16(%eax)
	fsubrl	16(%edx)
	fstpl	16(%ecx)
	addl	$124, %esp
.LCFI13:
	ret
---->8----

Looks good! The interesting thing is that I have to omit the
frame pointer to get rid of the indirect addressing. Anyway, I was
interested in trying these options with g++-2.96:

----8<----
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
	movl	8(%esp), %edx
	movl	12(%esp), %eax
	movl	4(%esp), %ecx
	fldl	8(%edx)
	fldl	16(%edx)
	fxch	%st(1)
	fmull	8(%eax)
	fldl	(%edx)
	fxch	%st(2)
	fmull	16(%eax)
	fxch	%st(2)
	fmull	(%eax)
	fxch	%st(1)
	faddp	%st, %st(2)
	faddp	%st, %st(1)
	fadd	%st(0), %st
	fld	%st(0)
	fmull	(%eax)
	fsubrl	(%edx)
	fstpl	(%ecx)
	fld	%st(0)
	fmull	8(%eax)
	fsubrl	8(%edx)
	fstpl	8(%ecx)
	fmull	16(%eax)
	fsubrl	16(%edx)
	fstpl	16(%ecx)
	ret
---->8----

Looks more compact than the g++-3.0.1 output.

Regards
Olaf


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]