This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Re: -finline-functions tuning (Using on ET)
- To: Kurt Garloff <kurt at garloff dot de>, GCC Mailing List <gcc at gcc dot gnu dot org>
- Subject: Re: -finline-functions tuning (Using on ET)
- From: Olaf Petzold <opetzold at wit dot regiocom dot net>
- Date: Sat, 1 Sep 2001 13:14:41 +0200
- References: <20010830131025.A28458@gum01m.etpnet.phys.tue.nl> <01083108260700.00419@gatekeeper> <20010831120254.A19154@gum01m.etpnet.phys.tue.nl>
Hello Kurt,
once more 8)
Let's recall the code produced by Mandrake's g++:
# g++ -I. -O2 -S ray.cc
----8<----
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp, %ebp
.LCFI1:
subl $280, %esp
.LCFI2:
movl 8(%ebp), %ecx
movl 12(%ebp), %edx
movl 16(%ebp), %eax
movl %ebp, %esp
fldl 8(%edx)
fldl 16(%edx)
fxch %st(1)
fmull 8(%eax)
fldl (%edx)
fxch %st(2)
fmull 16(%eax)
fxch %st(2)
fmull (%eax)
fxch %st(1)
faddp %st, %st(2)
faddp %st, %st(1)
fadd %st(0), %st
fld %st(0)
fmull (%eax)
fsubrl (%edx)
fstpl (%ecx)
fld %st(0)
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%ecx)
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%ecx)
popl %ebp
ret
---->8----
And now check gcc-3.0.1 with the patch:
# /opt/gcc3/bin/g++ -I. -O2 -S ray.cc
----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp, %ebp
.LCFI1:
pushl %edi
.LCFI2:
pushl %esi
.LCFI3:
movl 12(%ebp), %edx
pushl %ebx
.LCFI4:
subl $132, %esp
.LCFI5:
movl 16(%ebp), %eax
fldl 8(%edx)
fldl 16(%edx)
fxch %st(1)
fmull 8(%eax)
fldl (%edx)
fxch %st(2)
fmull 16(%eax)
fxch %st(2)
movb $0, -105(%ebp)
fmull (%eax)
fxch %st(1)
faddp %st, %st(2)
movl %edx, -40(%ebp)
movl %edx, -104(%ebp)
movl %eax, -48(%ebp)
movl %eax, -64(%ebp)
faddp %st, %st(1)
movl %eax, -28(%ebp)
movl %eax, -92(%ebp)
movl %eax, -76(%ebp)
movl 8(%ebp), %ebx
fadd %st(0), %st
leal -72(%ebp), %esi
leal -84(%ebp), %ecx
leal -56(%ebp), %edi
fstl -88(%ebp)
fstl -72(%ebp)
fstl -56(%ebp)
fstl -36(%ebp)
fstl -100(%ebp)
fstl -84(%ebp)
movl %edx, -88(%ebp)
fmull (%eax)
leal 4(%esi), %eax
fsubrl (%edx)
fstpl (%ebx)
movl %edx, -72(%ebp)
pushl %ecx
pushl %eax
.LCFI6:
call _ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ERKS7_
popl %ecx
popl %eax
movl -60(%ebp), %eax
fldl -68(%ebp)
fmull 8(%eax)
movl -72(%ebp), %eax
fsubrl 8(%eax)
fstpl 8(%ebx)
pushl %esi
pushl %edi
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
popl %eax
movl -44(%ebp), %eax
fldl -52(%ebp)
popl %edx
fmull 16(%eax)
movl -56(%ebp), %eax
fsubrl 16(%eax)
leal -136(%ebp), %eax
fstpl 16(%ebx)
pushl %edi
pushl %eax
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
addl $16, %esp
leal -12(%ebp), %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
---->8----
# /opt/gcc3/bin/g++ -I. -O3 -S ray.cc
----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB2:
pushl %ebp
.LCFI4:
movl %esp, %ebp
.LCFI5:
pushl %edi
.LCFI6:
pushl %esi
.LCFI7:
pushl %ebx
.LCFI8:
movl 16(%ebp), %eax
movl 12(%ebp), %ebx
addl $-128, %esp
.LCFI9:
leal -104(%ebp), %esi
leal -88(%ebp), %edi
fldl 8(%ebx)
fldl 16(%ebx)
fxch %st(1)
fmull 8(%eax)
fldl (%ebx)
fxch %st(2)
fmull 16(%eax)
fxch %st(2)
fmull (%eax)
fxch %st(1)
faddp %st, %st(2)
pushl %esi
movl %eax, -104(%ebp)
pushl %edi
leal -72(%ebp), %edi
faddp %st, %st(1)
pushl %edi
fadd %st(0), %st
fstpl -88(%ebp)
.LCFI10:
call _ZN5blitz13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEC1ES2_S4_
leal -56(%ebp), %edx
popl %ecx
popl %eax
pushl %edi
pushl %edx
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEEEC1ES8_
popl %eax
leal -56(%ebp), %ecx
popl %edx
movl %ebx, -88(%ebp)
pushl %ecx
pushl %edi
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19_bz_VecExprConstantIdEENS_19TinyVectorIterConstIdLi3ELi1EEENS_12_bz_MultiplyIddEEEEEC1ERKS9_
addl $12, %esp
leal -88(%ebp), %edx
pushl %edi
pushl %edx
pushl %esi
call _ZN5blitz13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS_11_bz_VecExprINS0_INS_19_bz_VecExprConstantIdEES2_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEC1ES2_S9_
popl %ecx
popl %ebx
leal -40(%ebp), %ebx
pushl %esi
pushl %ebx
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ESC_
popl %eax
popl %edx
pushl %ebx
pushl %esi
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
popl %ecx
popl %ebx
leal -88(%ebp), %ebx
movb $0, -105(%ebp)
pushl %esi
leal -88(%ebp), %esi
pushl %esi
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
popl %eax
movl -76(%ebp), %eax
fldl -84(%ebp)
popl %edx
fmull (%eax)
movl -88(%ebp), %eax
fsubrl (%eax)
movl 8(%ebp), %eax
fstpl (%eax)
pushl %ebx
pushl %edi
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
fldl -68(%ebp)
movl -60(%ebp), %eax
popl %ebx
popl %esi
leal -56(%ebp), %ecx
fmull 8(%eax)
movl -72(%ebp), %eax
fsubrl 8(%eax)
movl 8(%ebp), %eax
fstpl 8(%eax)
pushl %edi
pushl %ecx
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
fldl -52(%ebp)
movl -44(%ebp), %eax
popl %edx
popl %ecx
leal -56(%ebp), %edx
fmull 16(%eax)
movl -56(%ebp), %eax
fsubrl 16(%eax)
movl 8(%ebp), %eax
fstpl 16(%eax)
pushl %edx
leal -136(%ebp), %edx
pushl %edx
call _ZN5blitz11_bz_VecExprINS_13_bz_VecExprOpINS_19TinyVectorIterConstIdLi3ELi1EEENS0_INS1_INS_19_bz_VecExprConstantIdEES3_NS_12_bz_MultiplyIddEEEEEENS_12_bz_SubtractIddEEEEEC1ERKSD_
addl $16, %esp
leal -12(%ebp), %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
---->8----
Hmm, something seems to be wrong with -O3: there are more calls than with -O2!
Never mind optimizing for code size.
# /opt/gcc3/bin/g++ -I. -O2 -finline-limit=2000 -S ray.cc
----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp, %ebp
.LCFI1:
subl $120, %esp
.LCFI2:
movl 12(%ebp), %edx
movl 16(%ebp), %eax
movb $0, -89(%ebp)
fldl 8(%edx)
fldl 16(%edx)
fxch %st(1)
fmull 8(%eax)
fldl (%edx)
fxch %st(2)
fmull 16(%eax)
fxch %st(2)
movl 8(%ebp), %ecx
fmull (%eax)
fxch %st(1)
faddp %st, %st(2)
movl %eax, -32(%ebp)
movl %eax, -48(%ebp)
faddp %st, %st(1)
fadd %st(0), %st
fld %st(0)
fstl -72(%ebp)
fstl -40(%ebp)
fstl -56(%ebp)
fxch %st(1)
fmull (%eax)
fsubrl (%edx)
fstpl (%ecx)
fld %st(0)
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%ecx)
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%ecx)
movl %ebp, %esp
popl %ebp
ret
---->8----
Looks even better. Let's try
# /opt/gcc3/bin/g++ -I. -O -finline-limit=2000 -S ray.cc
----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB1:
pushl %ebp
.LCFI0:
movl %esp, %ebp
.LCFI1:
pushl %esi
.LCFI2:
pushl %ebx
.LCFI3:
subl $112, %esp
.LCFI4:
movl 12(%ebp), %edx
movl 16(%ebp), %eax
movl 8(%ebp), %esi
fldl (%edx)
fmull (%eax)
fldl 8(%edx)
fmull 8(%eax)
fldl 16(%edx)
fmull 16(%eax)
faddp %st, %st(1)
faddp %st, %st(1)
fadd %st(0), %st
fstl -72(%ebp)
leal -88(%ebp), %ebx
fstl -40(%ebp)
movl %eax, -32(%ebp)
leal -24(%ebp), %ecx
movl %edx, -72(%ebp)
fstl -56(%ebp)
movl %eax, -48(%ebp)
movl %edx, -24(%ebp)
fstl 4(%ecx)
movl %eax, 12(%ecx)
movl %edx, -88(%ebp)
fstpl 4(%ebx)
movl %eax, 12(%ebx)
movb $0, -89(%ebp)
movb $1, %al
testb %al, %al
je .L55
leal -72(%ebp), %ecx
movl %ebx, %eax
movl -88(%ebp), %edx
movl %edx, -72(%ebp)
fldl 4(%eax)
fstl 4(%ecx)
movl 12(%eax), %eax
movl %eax, 12(%ecx)
fld %st(0)
fmull (%eax)
fsubrl (%edx)
fstpl (%esi)
leal -56(%ebp), %ecx
movl %edx, -56(%ebp)
fstl 4(%ecx)
movl %eax, 12(%ecx)
fld %st(0)
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%esi)
leal -40(%ebp), %ecx
movl %edx, -40(%ebp)
fstl 4(%ecx)
movl %eax, 12(%ecx)
fld %st(0)
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%esi)
leal -120(%ebp), %ecx
movl %edx, -120(%ebp)
jmp .L199
.p2align 4,,7
.L55:
leal -120(%ebp), %ecx
leal -88(%ebp), %eax
movl -88(%ebp), %edx
movl %edx, -120(%ebp)
fldl 4(%eax)
fstl 4(%ecx)
movl 12(%eax), %eax
movl %eax, 12(%ecx)
fld %st(0)
fmull (%eax)
fsubrl (%edx)
fstpl (%esi)
leal -72(%ebp), %ecx
movl %edx, -72(%ebp)
fstl 4(%ecx)
movl %eax, 12(%ecx)
fld %st(0)
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%esi)
leal -56(%ebp), %ecx
movl %edx, -56(%ebp)
fstl 4(%ecx)
movl %eax, 12(%ecx)
fld %st(0)
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%esi)
leal -40(%ebp), %ecx
movl %edx, -40(%ebp)
.L199:
fstpl 4(%ecx)
movl %eax, 12(%ecx)
addl $112, %esp
popl %ebx
popl %esi
popl %ebp
ret
---->8----
Well, ET seems to live off the inliner and some other options that are
implicitly enabled by -O2.
# /opt/gcc3/bin/g++ -I. -O3 -finline-limit=2000 -S ray.cc
----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB2:
pushl %ebp
.LCFI4:
movl %esp, %ebp
.LCFI5:
subl $120, %esp
.LCFI6:
movl 12(%ebp), %edx
movl 16(%ebp), %eax
movb $0, -89(%ebp)
fldl 8(%edx)
fldl 16(%edx)
fxch %st(1)
fmull 8(%eax)
fldl (%edx)
fxch %st(2)
fmull 16(%eax)
fxch %st(2)
movl 8(%ebp), %ecx
fmull (%eax)
fxch %st(1)
faddp %st, %st(2)
movl %eax, -32(%ebp)
movl %eax, -48(%ebp)
faddp %st, %st(1)
fadd %st(0), %st
fld %st(0)
fstl -72(%ebp)
fstl -40(%ebp)
fstl -56(%ebp)
fxch %st(1)
fmull (%eax)
fld %st(1)
fxch %st(1)
fsubrl (%edx)
fstpl (%ecx)
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%ecx)
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%ecx)
movl %ebp, %esp
popl %ebp
ret
---->8----
-fsave-memoized
-fforce-mem
-fforce-addr
do not change the relevant code. So let's try
# /opt/gcc3/bin/g++ -I. -O3 -finline-limit=2000 -fomit-frame-pointer -S ray.cc
----8<----
_Z7reflectRN5blitz10TinyVectorIdLi3EEERKS1_S4_:
.LFB2:
subl $124, %esp
.LCFI12:
movl 132(%esp), %edx
movl 136(%esp), %eax
movb $0, 31(%esp)
fldl 8(%edx)
fldl 16(%edx)
fxch %st(1)
fmull 8(%eax)
fldl (%edx)
fxch %st(2)
fmull 16(%eax)
fxch %st(2)
movl 128(%esp), %ecx
fmull (%eax)
fxch %st(1)
faddp %st, %st(2)
movl %eax, 88(%esp)
movl %eax, 72(%esp)
faddp %st, %st(1)
fadd %st(0), %st
fld %st(0)
fstl 48(%esp)
fstl 80(%esp)
fstl 64(%esp)
fxch %st(1)
fmull (%eax)
fld %st(1)
fxch %st(1)
fsubrl (%edx)
fstpl (%ecx)
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%ecx)
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%ecx)
addl $124, %esp
.LCFI13:
ret
---->8----
Looks good! The interesting thing is that I have to omit the
frame pointer to get rid of the indirect addressing. Anyway, I'm
interested in what these options produce with g++-2.96:
----8<----
reflect__FRQ25blitzt10TinyVector2Zdi3RCQ25blitzt10TinyVector2Zdi3T1:
.LFB1:
movl 8(%esp), %edx
movl 12(%esp), %eax
movl 4(%esp), %ecx
fldl 8(%edx)
fldl 16(%edx)
fxch %st(1)
fmull 8(%eax)
fldl (%edx)
fxch %st(2)
fmull 16(%eax)
fxch %st(2)
fmull (%eax)
fxch %st(1)
faddp %st, %st(2)
faddp %st, %st(1)
fadd %st(0), %st
fld %st(0)
fmull (%eax)
fsubrl (%edx)
fstpl (%ecx)
fld %st(0)
fmull 8(%eax)
fsubrl 8(%edx)
fstpl 8(%ecx)
fmull 16(%eax)
fsubrl 16(%edx)
fstpl 16(%ecx)
ret
---->8----
Looks more compact than the g++-3.0.1 output.
Regards
Olaf