This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Slowdown in latest snapshot



Hi
I've noticed some slowdown in the Mandelbrot loop code, which is the basis of
my XaoS fractal browser. It seems to be caused by the new loop code in the
first post-1.1 snapshot.

I tested XaoS with snapshots from about two months ago and didn't notice
any slowdowns. Currently I have only 1.0 and the latest snapshot installed,
so here are the results:

A simplified version of the loop is:

#include <stdio.h>
#include <time.h>
/* Iteration limit for mand_calc(); main() also uses it as the loop count
   when computing the loops-per-second figure. */
static long maxiter=10000000;
/* Floating-point type used for all of the fractal arithmetic. */
typedef long double number_t;
/*
 * Iterate z = z*z + p, with z starting at (cre, cim) and p = (pre, pim),
 * until either maxiter iterations have run or rp + ip reaches 4.
 *
 * rp and ip hold the squares of the real and imaginary parts of z as they
 * were before the most recent update (both start at 0), so the loop test
 * always looks at the previous value of z.
 *
 * Returns the number of iterations performed, converted from unsigned long
 * to int.
 */
static int mand_calc(register number_t cre,
                     register number_t cim,
		     register number_t pre,
		     register number_t pim)
{
    register number_t rp = 0, ip = 0;
    register unsigned long iter = maxiter;   /* counts down to 0 */
    register number_t zre, zim;
    zre = cre;
    zim = cim;
    while ((iter) && (rp + ip < 4)) {
            ip = (zim * zim);
            zim = (zim * zre) * 2 + pim;   /* uses zre before it is updated below */
            rp = (zre * zre);
            zre = rp - ip + pre;
            iter--;

        }
    iter = maxiter - iter;   /* remaining count -> iterations performed */
    return(iter);
}
/*
 * Benchmark driver: times one call of mand_calc(0,0,0,0) with clock() and
 * prints maxiter divided by the elapsed time in seconds.  With all-zero
 * arguments z stays at 0, so rp + ip < 4 never fails and the loop runs the
 * full maxiter iterations.
 */
main()
{clock_t t1,t2;
  int i;
   t1=clock();
   i=mand_calc(0,0,0,0);
   t2=clock();
   /* NOTE(review): the divisor t2-t1 is zero if clock() returns the same
      value twice; no newline is printed after the number. */
   printf("Mloops:%f",((double)maxiter)/(t2-t1)*CLOCKS_PER_SEC);
   /* presumably returned so that the result of mand_calc counts as used */
   return(i);
}

Result of egcs 1.0.3 is:
egcc -O3 -mpentium -ffast-math -fomit-frame-pointer -ffast-math -funroll-loops
aa.c

(The loop is unrolled and the speed is 5405405.405405 loops per second.)

	.file	"aa.c"
	.version	"01.01"
/ GNU C version egcs-2.90.29 980515 (egcs-1.0.3 release) (i486-linux) compiled by GNU C version egcs-2.90.29 980515 (egcs-1.0.3 release).
/ options passed:  -mcpu=pentium -mpentium -O3 -ffast-math
/ -fomit-frame-pointer -ffast-math -funroll-loops
/ options enabled:  -fdefer-pop -fomit-frame-pointer -fcse-follow-jumps
/ -fcse-skip-blocks -fexpensive-optimizations -fthread-jumps
/ -fstrength-reduce -funroll-loops -fpeephole -fforce-mem -ffunction-cse
/ -finline-functions -finline -fkeep-static-consts -fcaller-saves
/ -fpcc-struct-return -frerun-cse-after-loop -frerun-loop-opt
/ -fschedule-insns2 -ffast-math -fcommon -fverbose-asm -fgnu-linker
/ -fregmove -falias-check -fargument-alias -m80387 -mhard-float
/ -mno-soft-float -mieee-fp -mfp-ret-in-387 -mschedule-prologue
/ -mcpu=pentium -march=pentium

# Compiler output for aa.c in AT&T syntax (i386 integer code plus x87
# floating point).  The listing has no mand_calc symbol: the loop of the
# static function appears inline inside main.
gcc2_compiled.:
.data
	.align 4
	.type	 maxiter,@object
	.size	 maxiter,4
# Storage for the C variable maxiter (a 4-byte object holding 10000000).
maxiter:
	.long 10000000
.section	.rodata
# Format string passed to printf.
.LC3:
	.string	"Mloops:%f"
	.align 4
# An 80-bit extended-precision constant: exponent word 0x4001, mantissa
# 0x8000000000000000, i.e. the value 4.0 used by the "rp + ip < 4" test.
.LC4:
	.long 0x0,0x80000000,0x4001
	.align 4
# A double with bit pattern 0x412e848000000000, i.e. 1000000.0.  It is the
# factor multiplied in where the C source uses CLOCKS_PER_SEC.
.LC5:
	.long 0x0,0x412e8480
.text
	.align 4
.globl main
	.type	 main,@function
main:
	# %esi and %ebx are saved here and restored just before ret.
	pushl %esi
	pushl %ebx
	call clock
	# %esi = t1, the first clock() result.
	movl %eax,%esi
	# fldz plus the three fld copies below push four 0.0 values onto the
	# x87 stack; %ebx is loaded with maxiter and serves as the countdown
	# counter (iter in the C source).
	fldz
	movl maxiter,%ebx
	fld %st(0)
	fld %st(1)
	fld %st(2)
	# maxiter == 0: skip the loop entirely.
	testl %ebx,%ebx
	je .L42
	# %edx = (-maxiter) & 3 selects the entry point: 0 goes straight to the
	# unrolled loop at .L9, 3 enters at .L16, 2 enters at .L17, and 1
	# decrements the counter once and falls into .L17.  fldt pushes 4.0.
	movl %ebx,%edx
	negl %edx
	fldt .LC4
	andl $3,%edx
	je .L9
	cmpl $3,%edx
	jge .L16
	cmpl $2,%edx
	jge .L17
	decl %ebx
# Single copy of the loop test and body, executed before the unrolled loop.
.L17:
	fxch %st(4)
	faddp %st,%st(3)
	fxch %st(2)
	# Loop test: fcomp compares, fnstsw copies the x87 status word to %ax,
	# and mask 5 in %ah selects condition bits C0 and C2.  The je leaves
	# the loop when both are clear, i.e. when the compare did not report
	# "less than" (or unordered).
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L40
	fld %st(1)
	fld %st(1)
	fxch %st(1)
	fmul %st(3),%st
	fxch %st(1)
	fmul %st(2),%st
	fxch %st(3)
	fmulp %st,%st(2)
	decl %ebx
	fld %st(2)
	fxch %st(2)
	fadd %st(0),%st
	fxch %st(2)
	fsub %st(1),%st
	fxch %st(2)
	fxch %st(1)
	fxch %st(3)
	fxch %st(4)
# Second single copy of test and body.  This one checks the counter after
# its decrement (jz .L43) and otherwise jumps into the unrolled loop.
.L16:
	fxch %st(4)
	faddp %st,%st(3)
	fxch %st(2)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L40
	fld %st(1)
	fld %st(1)
	fxch %st(1)
	fmul %st(3),%st
	fxch %st(1)
	fmul %st(2),%st
	fxch %st(3)
	fmulp %st,%st(2)
	fld %st(2)
	fxch %st(2)
	fadd %st(0),%st
	fxch %st(2)
	fsub %st(1),%st
	decl %ebx
	jz .L43
	fxch %st(2)
	fxch %st(1)
	fxch %st(3)
	fxch %st(4)
	jmp .L9
	.align 4
# Back-edge target of the unrolled loop: a run of fxch exchanges executed
# before re-entering .L9 (presumably restoring the register-stack layout
# that .L9 expects -- not traced here).
.L41:
	fxch %st(2)
	fxch %st(4)
	fxch %st(1)
	fxch %st(3)
	fxch %st(1)
# Unrolled loop: four copies of test + body.  Each copy decrements %ebx,
# but only the last decrement is followed by a counter test (jnz .L41).
.L9:
	fxch %st(4)
	faddp %st,%st(3)
	fxch %st(2)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L40
	fld %st(1)
	fld %st(1)
	fmul %st(2),%st
	fxch %st(1)
	fmul %st(3),%st
	fxch %st(3)
	fmulp %st,%st(2)
	decl %ebx
	fld %st(0)
	fadd %st(3),%st
	fxch %st(2)
	fadd %st(0),%st
	fxch %st(1)
	fsubp %st,%st(3)
	fxch %st(1)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L40
	fld %st(0)
	fld %st(2)
	fmul %st(3),%st
	fxch %st(1)
	fmul %st(2),%st
	fxch %st(2)
	fmulp %st,%st(3)
	decl %ebx
	fld %st(0)
	fadd %st(2),%st
	fxch %st(3)
	fadd %st(0),%st
	fxch %st(1)
	fsubp %st,%st(2)
	fxch %st(2)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L39
	fld %st(1)
	fld %st(1)
	fmul %st(2),%st
	fxch %st(1)
	fmul %st(3),%st
	fxch %st(3)
	fmulp %st,%st(2)
	decl %ebx
	fld %st(0)
	fadd %st(3),%st
	fxch %st(2)
	fadd %st(0),%st
	fxch %st(1)
	fsubp %st,%st(3)
	fxch %st(1)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L40
	fld %st(0)
	fld %st(2)
	fxch %st(1)
	fmul %st(2),%st
	fxch %st(1)
	fmul %st(3),%st
	fxch %st(2)
	fmulp %st,%st(3)
	fld %st(1)
	fxch %st(3)
	fadd %st(0),%st
	fxch %st(3)
	fsub %st(1),%st
	decl %ebx
	jnz .L41
# Loop exits.  The entry points differ only in how many x87 stack entries
# are still live and therefore how many fstp %st(0) pops they execute.
.L43:
	fstp %st(0)
.L42:
	fstp %st(0)
.L39:
.L40:
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)
	# %ebx = maxiter - remaining count, the value main returns.
	movl maxiter,%eax
	subl %ebx,%eax
	movl %eax,%ebx
	call clock
	# %edx = t2 - t1.
	movl %eax,%edx
	subl %esi,%edx
	# Load t2 - t1 onto the x87 stack through a temporary stack slot.
	pushl %edx
	fildl (%esp)
	addl $4,%esp
	# Reverse divide: st(0) = maxiter / (t2 - t1), then scale by .LC5.
	fidivrl maxiter
	fldl .LC5
	fmulp %st,%st(1)
	# printf arguments: the 8-byte double, then the format string address.
	subl $8,%esp
	fstpl (%esp)
	pushl $.LC3
	call printf
	# Discard the 12 bytes of arguments; the return value comes from %ebx.
	addl $12,%esp
	movl %ebx,%eax
	popl %ebx
	popl %esi
	ret
.Lfe1:
	.size	 main,.Lfe1-main
	.ident	"GCC: (GNU) egcs-2.90.29 980515 (egcs-1.0.3 release)"

Output of egcs-19980906 is

(The loop is not unrolled; the speed is 4291845.493562 loops per second.)

	.file	"aa.c"
	.version	"01.01"
# Compiler output for aa.c in AT&T syntax (i386 integer code plus x87
# floating point).  The listing has no mand_calc symbol: the loop of the
# static function appears inline inside main, and it is not unrolled.
gcc2_compiled.:
.data
	.align 4
	.type	 maxiter,@object
	.size	 maxiter,4
# Storage for the C variable maxiter (a 4-byte object holding 10000000).
maxiter:
	.long 10000000
.section	.rodata
# Format string passed to printf.
.LC3:
	.string	"Mloops:%f"
	.align 16
# An 80-bit extended-precision constant: exponent word 0x4001, mantissa
# 0x8000000000000000, i.e. the value 4.0 used by the "rp + ip < 4" test.
.LC4:
	.long 0x0,0x80000000,0x4001
	.align 8
# A double with bit pattern 0x412e848000000000, i.e. 1000000.0.  It is the
# factor multiplied in where the C source uses CLOCKS_PER_SEC.
.LC5:
	.long 0x0,0x412e8480
.text
	.align 4
.globl main
	.type	 main,@function
main:
	# %esi and %ebx are saved here and restored just before ret.
	pushl %esi
	pushl %ebx
	call clock
	# %ebx = maxiter (countdown counter), %esi = t1, and %edx keeps a copy
	# of maxiter that is used after the loop to form maxiter - %ebx.
	movl maxiter,%ebx
	movl %eax,%esi
	movl %ebx,%edx
	# Push two 0.0 values onto the x87 stack.
	fldz
	fld %st(0)
	# maxiter == 0: skip the loop entirely.
	testl %ebx,%ebx
	je .L19
	# Push 4.0 and enter the loop at .L10, skipping the exchanges at .L17.
	fldt .LC4
	jmp .L10
# Back-edge target: three fxch exchanges executed before each repeat of
# the body at .L10.
.L17:
	fxch %st(1)
	fxch %st(2)
	fxch %st(1)
	.align 4
# Single loop body: three multiplies, the doubling add and the subtract,
# followed by the counter decrement and then the floating-point test.
.L10:
	fld %st(1)
	fld %st(3)
	fmul %st(4),%st
	fxch %st(3)
	fmulp %st,%st(4)
	fmul %st(0),%st
	fld %st(2)
	fxch %st(4)
	fadd %st(0),%st
	fxch %st(4)
	fsub %st(1),%st
	# Counter exhausted: leave through .L16.
	decl %ebx
	jz .L16
	fxch %st(3)
	faddp %st,%st(1)
	# Loop test: fnstsw copies the x87 status word to %ax; mask 69 (0x45)
	# in %ah selects condition bits C0, C2 and C3, and the cmpb/je pair
	# repeats the loop only when C0 alone is set.  Otherwise exit via .L18.
	fcomp %st(1)
	fnstsw %ax
	andb $69,%ah
	cmpb $1,%ah
	je .L17
	jmp .L18
# Loop exits.  The entry points differ only in how many x87 stack entries
# are still live and therefore how many fstp %st(0) pops they execute.
.L16:
	fstp %st(0)
	fstp %st(0)
.L18:
	fstp %st(0)
.L19:
	fstp %st(0)
	fstp %st(0)
	# %ebx = maxiter - remaining count, the value main returns.
	subl %ebx,%edx
	movl %edx,%ebx
	call clock
	# %eax = t2 - t1, loaded onto the x87 stack through a temporary slot.
	subl %esi,%eax
	pushl %eax
	fildl (%esp)
	addl $4,%esp
	# Reverse divide: st(0) = maxiter / (t2 - t1), then scale by .LC5.
	fidivrl maxiter
	fldl .LC5
	fmulp %st,%st(1)
	# printf arguments: the 8-byte double, then the format string address.
	subl $8,%esp
	fstpl (%esp)
	pushl $.LC3
	call printf
	# Discard the 12 bytes of arguments; the return value comes from %ebx.
	addl $12,%esp
	movl %ebx,%eax
	popl %ebx
	popl %esi
	ret
.Lfe1:
	.size	 main,.Lfe1-main
	.ident	"GCC: (GNU) egcs-2.92.04 19980906 (gcc2 ss-980609 experimental)"

-funroll-all-loops helps a bit. The loop is unrolled again, in a different
way, but it is still slower (5154639.175258 loops per second):


	.file	"aa.c"
	.version	"01.01"
# Compiler output for aa.c in AT&T syntax (i386 integer code plus x87
# floating point).  The listing has no mand_calc symbol: the loop of the
# static function appears inline inside main, unrolled four times.
gcc2_compiled.:
.data
	.align 4
	.type	 maxiter,@object
	.size	 maxiter,4
# Storage for the C variable maxiter (a 4-byte object holding 10000000).
maxiter:
	.long 10000000
.section	.rodata
# Format string passed to printf.
.LC3:
	.string	"Mloops:%f"
	.align 16
# An 80-bit extended-precision constant: exponent word 0x4001, mantissa
# 0x8000000000000000, i.e. the value 4.0 used by the "rp + ip < 4" test.
.LC4:
	.long 0x0,0x80000000,0x4001
	.align 8
# A double with bit pattern 0x412e848000000000, i.e. 1000000.0.  It is the
# factor multiplied in where the C source uses CLOCKS_PER_SEC.
.LC5:
	.long 0x0,0x412e8480
.text
	.align 4
.globl main
	.type	 main,@function
main:
	# %esi and %ebx are saved here and restored just before ret.
	pushl %esi
	pushl %ebx
	call clock
	# %ebx = maxiter (countdown counter), %esi = t1, and %edx keeps a copy
	# of maxiter that is used after the loop to form maxiter - %ebx.
	movl maxiter,%ebx
	movl %eax,%esi
	movl %ebx,%edx
	# Push two 0.0 values onto the x87 stack.
	fldz
	fld %st(0)
	# maxiter == 0: skip the loop entirely.
	testl %ebx,%ebx
	je .L32
	# Push 4.0 and fall into the loop.
	fldt .LC4
	.align 4
# Unrolled loop: four copies of body + test.  Every copy decrements %ebx
# and checks it (jz .L30) before its floating-point test.
.L10:
	fld %st(1)
	fld %st(3)
	fmul %st(4),%st
	fxch %st(3)
	fmulp %st,%st(4)
	fmul %st(0),%st
	fld %st(2)
	fxch %st(4)
	fadd %st(0),%st
	fxch %st(4)
	fsub %st(1),%st
	decl %ebx
	jz .L30
	fxch %st(3)
	faddp %st,%st(1)
	# Test used by the first three copies: fnstsw copies the x87 status
	# word to %ax, mask 5 in %ah selects condition bits C0 and C2, and the
	# je leaves the loop (to .L31) when both are clear.
	fcomp %st(1)
	fnstsw %ax
	andb $5,%ah
	je .L31
	fld %st(2)
	fld %st(2)
	fmul %st(3),%st
	fxch %st(4)
	fmulp %st,%st(3)
	fmul %st(0),%st
	fld %st(3)
	fxch %st(3)
	fadd %st(0),%st
	fxch %st(3)
	fsub %st(1),%st
	decl %ebx
	jz .L30
	fxch %st(4)
	faddp %st,%st(1)
	fcomp %st(1)
	fnstsw %ax
	andb $5,%ah
	je .L31
	fld %st(1)
	fld %st(3)
	fmul %st(4),%st
	fxch %st(3)
	fmulp %st,%st(4)
	fmul %st(0),%st
	fld %st(2)
	fxch %st(4)
	fadd %st(0),%st
	fxch %st(4)
	fsub %st(1),%st
	decl %ebx
	jz .L30
	fxch %st(3)
	faddp %st,%st(1)
	fcomp %st(1)
	fnstsw %ax
	andb $5,%ah
	je .L31
	fld %st(2)
	fld %st(2)
	fmul %st(3),%st
	fxch %st(4)
	fmulp %st,%st(3)
	fmul %st(0),%st
	fld %st(3)
	fxch %st(3)
	fadd %st(0),%st
	fxch %st(3)
	fsub %st(1),%st
	decl %ebx
	jz .L30
	fxch %st(4)
	faddp %st,%st(1)
	# Test used by the last copy: mask 69 (0x45) selects C0, C2 and C3, and
	# the cmpb/je pair jumps back to .L10 only when C0 alone is set.
	fcomp %st(1)
	fnstsw %ax
	andb $69,%ah
	cmpb $1,%ah
	je .L10
	jmp .L31
# Loop exits.  The entry points differ only in how many x87 stack entries
# are still live and therefore how many fstp %st(0) pops they execute.
.L30:
	fstp %st(0)
	fstp %st(0)
.L31:
	fstp %st(0)
.L32:
	fstp %st(0)
	fstp %st(0)
	# %ebx = maxiter - remaining count, the value main returns.
	subl %ebx,%edx
	movl %edx,%ebx
	call clock
	# %eax = t2 - t1, loaded onto the x87 stack through a temporary slot.
	subl %esi,%eax
	pushl %eax
	fildl (%esp)
	addl $4,%esp
	# Reverse divide: st(0) = maxiter / (t2 - t1), then scale by .LC5.
	fidivrl maxiter
	fldl .LC5
	fmulp %st,%st(1)
	# printf arguments: the 8-byte double, then the format string address.
	subl $8,%esp
	fstpl (%esp)
	pushl $.LC3
	call printf
	# Discard the 12 bytes of arguments; the return value comes from %ebx.
	addl $12,%esp
	movl %ebx,%eax
	popl %ebx
	popl %esi
	ret
.Lfe1:
	.size	 main,.Lfe1-main
	.ident	"GCC: (GNU) egcs-2.92.04 19980906 (gcc2 ss-980609 experimental)"

The loop in XaoS is probably a slightly different case, because it is unrolled
by hand. I expect it is big enough to prevent egcs from unrolling it (it
contains 8 copies of the main loop). Also, the number of iterations is unknown.
If you are interested, I might try to look at it too.


Honza
-- 
                       OK. Lets make a signature file.
+-------------------------------------------------------------------------+
|        Jan Hubicka (Jan Hubi\v{c}ka in TeX) hubicka@freesoft.cz         |
|         Czech free software foundation: http://www.freesoft.cz          |
|AA project - the new way for computer graphics - http://www.ta.jcu.cz/aa |
|  homepage: http://www.paru.cas.cz/~hubicka/, games koules, Xonix, fast  |
|  fractal zoomer XaoS, index of Czech GNU/Linux/UN*X documentation etc.  | 
+-------------------------------------------------------------------------+


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]