This is the mail archive of the mailing list for the GCC project.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Slowdown in latest snapshot

I've noticed a slowdown in the Mandelbrot loop code, which is the basis of
my XaoS fractal browser. It seems to be caused by the new loop code in the
first post-1.1 snapshot.

I tested XaoS with snapshots about two months old and didn't notice
any slowdown. Currently I have only 1.0 and the latest snapshot installed,
so here are the results:

A simplified version of the loop is:

#include <stdio.h>
#include <time.h>
static long maxiter=10000000;
typedef long double number_t;
static int mand_calc(register number_t cre,
                     register number_t cim,
		     register number_t pre,
		     register number_t pim)
    register number_t rp = 0, ip = 0;
    register unsigned long iter = maxiter;
    register number_t zre, zim;
    zre = cre;
    zim = cim;
    while ((iter) && (rp + ip < 4)) {
            ip = (zim * zim);
            zim = (zim * zre) * 2 + pim;
            rp = (zre * zre);
            zre = rp - ip + pre;

    iter = maxiter - iter;
{clock_t t1,t2;
  int i;

The result with egcs 1.0 (the egcs-1.0.3 release), compiled with the following command, is:
egcc -O3 -mpentium -ffast-math -fomit-frame-pointer -ffast-math -funroll-loops

(The loop is unrolled and the speed is 5405405.405405 loops per second.)

	.file	"aa.c"
	.version	"01.01"
/ GNU C version egcs-2.90.29 980515 (egcs-1.0.3 release) (i486-linux) compiled by GNU C version egcs-2.90.29 980515 (egcs-1.0.3 release).
/ options passed:  -mcpu=pentium -mpentium -O3 -ffast-math
/ -fomit-frame-pointer -ffast-math -funroll-loops
/ options enabled:  -fdefer-pop -fomit-frame-pointer -fcse-follow-jumps
/ -fcse-skip-blocks -fexpensive-optimizations -fthread-jumps
/ -fstrength-reduce -funroll-loops -fpeephole -fforce-mem -ffunction-cse
/ -finline-functions -finline -fkeep-static-consts -fcaller-saves
/ -fpcc-struct-return -frerun-cse-after-loop -frerun-loop-opt
/ -fschedule-insns2 -ffast-math -fcommon -fverbose-asm -fgnu-linker
/ -fregmove -falias-check -fargument-alias -m80387 -mhard-float
/ -mno-soft-float -mieee-fp -mfp-ret-in-387 -mschedule-prologue
/ -mcpu=pentium -march=pentium

	.align 4
	.type	 maxiter,@object
	.size	 maxiter,4
	.long 10000000
.section	.rodata
	.string	"Mloops:%f"
	.align 4
	.long 0x0,0x80000000,0x4001
	.align 4
	.long 0x0,0x412e8480
	.align 4
.globl main
	.type	 main,@function
	pushl %esi
	pushl %ebx
	call clock
	movl %eax,%esi
	movl maxiter,%ebx
	fld %st(0)
	fld %st(1)
	fld %st(2)
	testl %ebx,%ebx
	je .L42
	movl %ebx,%edx
	negl %edx
	fldt .LC4
	andl $3,%edx
	je .L9
	cmpl $3,%edx
	jge .L16
	cmpl $2,%edx
	jge .L17
	decl %ebx
	fxch %st(4)
	faddp %st,%st(3)
	fxch %st(2)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L40
	fld %st(1)
	fld %st(1)
	fxch %st(1)
	fmul %st(3),%st
	fxch %st(1)
	fmul %st(2),%st
	fxch %st(3)
	fmulp %st,%st(2)
	decl %ebx
	fld %st(2)
	fxch %st(2)
	fadd %st(0),%st
	fxch %st(2)
	fsub %st(1),%st
	fxch %st(2)
	fxch %st(1)
	fxch %st(3)
	fxch %st(4)
	fxch %st(4)
	faddp %st,%st(3)
	fxch %st(2)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L40
	fld %st(1)
	fld %st(1)
	fxch %st(1)
	fmul %st(3),%st
	fxch %st(1)
	fmul %st(2),%st
	fxch %st(3)
	fmulp %st,%st(2)
	fld %st(2)
	fxch %st(2)
	fadd %st(0),%st
	fxch %st(2)
	fsub %st(1),%st
	decl %ebx
	jz .L43
	fxch %st(2)
	fxch %st(1)
	fxch %st(3)
	fxch %st(4)
	jmp .L9
	.align 4
	fxch %st(2)
	fxch %st(4)
	fxch %st(1)
	fxch %st(3)
	fxch %st(1)
	fxch %st(4)
	faddp %st,%st(3)
	fxch %st(2)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L40
	fld %st(1)
	fld %st(1)
	fmul %st(2),%st
	fxch %st(1)
	fmul %st(3),%st
	fxch %st(3)
	fmulp %st,%st(2)
	decl %ebx
	fld %st(0)
	fadd %st(3),%st
	fxch %st(2)
	fadd %st(0),%st
	fxch %st(1)
	fsubp %st,%st(3)
	fxch %st(1)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L40
	fld %st(0)
	fld %st(2)
	fmul %st(3),%st
	fxch %st(1)
	fmul %st(2),%st
	fxch %st(2)
	fmulp %st,%st(3)
	decl %ebx
	fld %st(0)
	fadd %st(2),%st
	fxch %st(3)
	fadd %st(0),%st
	fxch %st(1)
	fsubp %st,%st(2)
	fxch %st(2)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L39
	fld %st(1)
	fld %st(1)
	fmul %st(2),%st
	fxch %st(1)
	fmul %st(3),%st
	fxch %st(3)
	fmulp %st,%st(2)
	decl %ebx
	fld %st(0)
	fadd %st(3),%st
	fxch %st(2)
	fadd %st(0),%st
	fxch %st(1)
	fsubp %st,%st(3)
	fxch %st(1)
	fcomp %st(3)
	fnstsw %ax
	andb $5,%ah
	je .L40
	fld %st(0)
	fld %st(2)
	fxch %st(1)
	fmul %st(2),%st
	fxch %st(1)
	fmul %st(3),%st
	fxch %st(2)
	fmulp %st,%st(3)
	fld %st(1)
	fxch %st(3)
	fadd %st(0),%st
	fxch %st(3)
	fsub %st(1),%st
	decl %ebx
	jnz .L41
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)
	movl maxiter,%eax
	subl %ebx,%eax
	movl %eax,%ebx
	call clock
	movl %eax,%edx
	subl %esi,%edx
	pushl %edx
	fildl (%esp)
	addl $4,%esp
	fidivrl maxiter
	fldl .LC5
	fmulp %st,%st(1)
	subl $8,%esp
	fstpl (%esp)
	pushl $.LC3
	call printf
	addl $12,%esp
	movl %ebx,%eax
	popl %ebx
	popl %esi
	.size	 main,.Lfe1-main
	.ident	"GCC: (GNU) egcs-2.90.29 980515 (egcs-1.0.3 release)"

Output of egcs-19980906 is

(The loop is not unrolled; the speed is 4291845.493562 loops per second.)

	.file	"aa.c"
	.version	"01.01"
	.align 4
	.type	 maxiter,@object
	.size	 maxiter,4
	.long 10000000
.section	.rodata
	.string	"Mloops:%f"
	.align 16
	.long 0x0,0x80000000,0x4001
	.align 8
	.long 0x0,0x412e8480
	.align 4
.globl main
	.type	 main,@function
	pushl %esi
	pushl %ebx
	call clock
	movl maxiter,%ebx
	movl %eax,%esi
	movl %ebx,%edx
	fld %st(0)
	testl %ebx,%ebx
	je .L19
	fldt .LC4
	jmp .L10
	fxch %st(1)
	fxch %st(2)
	fxch %st(1)
	.align 4
	fld %st(1)
	fld %st(3)
	fmul %st(4),%st
	fxch %st(3)
	fmulp %st,%st(4)
	fmul %st(0),%st
	fld %st(2)
	fxch %st(4)
	fadd %st(0),%st
	fxch %st(4)
	fsub %st(1),%st
	decl %ebx
	jz .L16
	fxch %st(3)
	faddp %st,%st(1)
	fcomp %st(1)
	fnstsw %ax
	andb $69,%ah
	cmpb $1,%ah
	je .L17
	jmp .L18
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)
	subl %ebx,%edx
	movl %edx,%ebx
	call clock
	subl %esi,%eax
	pushl %eax
	fildl (%esp)
	addl $4,%esp
	fidivrl maxiter
	fldl .LC5
	fmulp %st,%st(1)
	subl $8,%esp
	fstpl (%esp)
	pushl $.LC3
	call printf
	addl $12,%esp
	movl %ebx,%eax
	popl %ebx
	popl %esi
	.size	 main,.Lfe1-main
	.ident	"GCC: (GNU) egcs-2.92.04 19980906 (gcc2 ss-980609 experimental)"

-funroll-all-loops helps a bit. The loop is unrolled again, in a different
way, but it is still slower (5154639.175258 loops per second):

	.file	"aa.c"
	.version	"01.01"
	.align 4
	.type	 maxiter,@object
	.size	 maxiter,4
	.long 10000000
.section	.rodata
	.string	"Mloops:%f"
	.align 16
	.long 0x0,0x80000000,0x4001
	.align 8
	.long 0x0,0x412e8480
	.align 4
.globl main
	.type	 main,@function
	pushl %esi
	pushl %ebx
	call clock
	movl maxiter,%ebx
	movl %eax,%esi
	movl %ebx,%edx
	fld %st(0)
	testl %ebx,%ebx
	je .L32
	fldt .LC4
	.align 4
	fld %st(1)
	fld %st(3)
	fmul %st(4),%st
	fxch %st(3)
	fmulp %st,%st(4)
	fmul %st(0),%st
	fld %st(2)
	fxch %st(4)
	fadd %st(0),%st
	fxch %st(4)
	fsub %st(1),%st
	decl %ebx
	jz .L30
	fxch %st(3)
	faddp %st,%st(1)
	fcomp %st(1)
	fnstsw %ax
	andb $5,%ah
	je .L31
	fld %st(2)
	fld %st(2)
	fmul %st(3),%st
	fxch %st(4)
	fmulp %st,%st(3)
	fmul %st(0),%st
	fld %st(3)
	fxch %st(3)
	fadd %st(0),%st
	fxch %st(3)
	fsub %st(1),%st
	decl %ebx
	jz .L30
	fxch %st(4)
	faddp %st,%st(1)
	fcomp %st(1)
	fnstsw %ax
	andb $5,%ah
	je .L31
	fld %st(1)
	fld %st(3)
	fmul %st(4),%st
	fxch %st(3)
	fmulp %st,%st(4)
	fmul %st(0),%st
	fld %st(2)
	fxch %st(4)
	fadd %st(0),%st
	fxch %st(4)
	fsub %st(1),%st
	decl %ebx
	jz .L30
	fxch %st(3)
	faddp %st,%st(1)
	fcomp %st(1)
	fnstsw %ax
	andb $5,%ah
	je .L31
	fld %st(2)
	fld %st(2)
	fmul %st(3),%st
	fxch %st(4)
	fmulp %st,%st(3)
	fmul %st(0),%st
	fld %st(3)
	fxch %st(3)
	fadd %st(0),%st
	fxch %st(3)
	fsub %st(1),%st
	decl %ebx
	jz .L30
	fxch %st(4)
	faddp %st,%st(1)
	fcomp %st(1)
	fnstsw %ax
	andb $69,%ah
	cmpb $1,%ah
	je .L10
	jmp .L31
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)
	subl %ebx,%edx
	movl %edx,%ebx
	call clock
	subl %esi,%eax
	pushl %eax
	fildl (%esp)
	addl $4,%esp
	fidivrl maxiter
	fldl .LC5
	fmulp %st,%st(1)
	subl $8,%esp
	fstpl (%esp)
	pushl $.LC3
	call printf
	addl $12,%esp
	movl %ebx,%eax
	popl %ebx
	popl %esi
	.size	 main,.Lfe1-main
	.ident	"GCC: (GNU) egcs-2.92.04 19980906 (gcc2 ss-980609 experimental)"

The loop in XaoS is probably a slightly different case, because it is
unrolled by hand. I expect it is big enough to prevent egcs from unrolling it
(it contains 8 copies of the main loop). Also, the number of iterations is
unknown. If you are interested, I might try to look at it too.

                       OK. Lets make a signature file.
|        Jan Hubicka (Jan Hubi\v{c}ka in TeX)         |
|         Czech free software foundation:          |
|AA project - the new way for computer graphics - |
|  homepage:, games koules, Xonix, fast  |
|  fractal zoomer XaoS, index of Czech GNU/Linux/UN*X documentation etc.  | 

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]