This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

g77 performance on ALPHA


Hi,
I am participating in the beta test of Compaq's Fortran compiler
for Linux-Alpha. I have an EV56 system running at 530MHz.
The result is not too good for g77 (2.95.1 release).
I tested a big (>1MB source) code (electrical analog circuit simulator)
and Compaq's compiler produces more than twice the performance of g77.

So I tested with a small daxpy operation:
      SUBROUTINE DAXPY(N,ALPHA,X,I1,Y,I2)
C     Adds ALPHA*X(I) to Y(I) for I = 1..N; Y is updated in place.
C       N      number of elements processed
C       ALPHA  scalar multiplier
C       X      vector of length N, only read
C       Y      vector of length N, overwritten with the result
C       I1,I2  declared but never referenced in this routine; every
C              element is indexed directly by I
      IMPLICIT NONE
      INTEGER*4 N,I,I1,I2
      REAL*8 ALPHA,X(N),Y(N)

C     One multiply and one add per element.
      DO I=1, N
         Y(I)=Y(I)+ALPHA*X(I)
      ENDDO
      RETURN
      END

My main program looks like this:
      IMPLICIT NONE
C     Timing driver: calls DAXPY NLOOP times on vectors of length N
C     and prints the resulting rate, labelled MFlops.
      INTEGER*4 N,NLOOP,I
      PARAMETER(N=1000,NLOOP=100000)
      REAL*4 TS,TE,TARR(2),ETIME
      REAL*8 X(N),Y(N),OPS

C     Fill both vectors with the same constant before timing starts.
      DO I=1, N
         X(I) = 1.1D0
         Y(I) = 1.1D0
      ENDDO

C     ETIME is not defined in this listing; presumably the runtime
C     library's timing function returning seconds - TODO confirm.
      TS = ETIME(TARR)
C     Timed region: NLOOP calls, each processing all N elements.
      DO I=1, NLOOP
         CALL DAXPY(N,1.01D0,X,1,Y,1)
      ENDDO
      TE =  ETIME(TARR)
C     Operation count in millions: N elements * NLOOP calls * 2
C     (the multiply and the add in DAXPY's loop body).
      OPS = (1D-6*N)*(NLOOP*2D0)
C     Rate = operations (in millions) divided by the measured interval.
      WRITE(*,*) OPS/(TE-TS),' MFlops'
      END

Result:
g77 -O3 -o main main.f daxpy.f
./main
72.75 MFlops

fort -O -fast -o main main.f daxpy.f
./main
191.94 MFlops

(The handcoded asm-daxpy of Mr. Goto gets 378.55 MFlops)

g77 -O3 -S daxpy.f results in this:
	.file	1 "dxpy.f"
	.set noat
	.set noreorder
	.arch ev56
.text
	.align 5
	.globl daxpy_
	.ent daxpy_
	# g77 -O3 output for DAXPY(N,ALPHA,X,I1,Y,I2): Y(I) = Y(I) + ALPHA*X(I).
	# Registers as used below: $16 -> N, $17 -> ALPHA, $18 -> X, $20 -> Y
	# (all addresses); $f12 = ALPHA; $1/$2 = down-counting loop counter.
	# The loop handles one element per iteration (not unrolled).
daxpy_:
	.frame $30,0,$26,0
$daxpy_..ng:
	.prologue 0
	ldl $1,0($16)	# $1 = N
	subl $1,1,$1	# $1 = N - 1
	fnop
	blt $1,$L2	# N - 1 < 0: nothing to do, skip the loop
	ldt $f12,0($17)	# $f12 = ALPHA, loaded once outside the loop
	subq $1,1,$2	# $2 = N - 2, counter value for the next iteration
	.align 4
$L6:
	ldt $f10,0($18)	# $f10 = current X element
	ldt $f11,0($20)	# $f11 = current Y element
	mov $2,$1	# $1 = counter tested at the bottom of the loop
	addq $18,8,$18	# advance X pointer by one 8-byte element
	mult $f12,$f10,$f10	# $f10 = ALPHA * X
	addl $1,$31,$1	# 32-bit add of $31 (zero register) to $1
	subq $2,1,$2	# decrement the running counter
	addt $f11,$f10,$f11	# $f11 = Y + ALPHA * X
	stt $f11,0($20)	# store the result back to Y
	addq $20,8,$20	# advance Y pointer by one 8-byte element
	bge $1,$L6	# loop while the counter is still >= 0
$L2:
	ret $31,($26),1	# return through the address in $26
	.end daxpy_
	.ident	"GCC: (GNU) 2.95.1 19990816 (release)"

The loop isn't unrolled with -funroll-loops, either.
Using -funroll-all-loops, we get
	.file	1 "dxpy.f"
	.set noat
	.set noreorder
	.arch ev56
.text
	.align 5
	.globl daxpy_
	.ent daxpy_
	# g77 -O3 -funroll-all-loops output for DAXPY: the loop body is
	# replicated four times, but every copy reuses $f10/$f11, performs its
	# own load/multiply/add/store and ends in its own exit test, so a
	# branch still separates each element from the next.
	# Registers as used below: $16 -> N, $17 -> ALPHA, $18 -> X, $20 -> Y
	# (all addresses); $f12 = ALPHA; $1/$2/$3 = loop counters.
daxpy_:
	.frame $30,0,$26,0
$daxpy_..ng:
	.prologue 0
	ldl $1,0($16)	# $1 = N
	subl $1,1,$1	# $1 = N - 1
	fnop
	blt $1,$L2	# N - 1 < 0: nothing to do, skip the loop
	ldt $f12,0($17)	# $f12 = ALPHA, loaded once outside the loop
	subq $1,1,$2	# $2 = N - 2
	.align 4
$L6:
	# --- copy 1: element at offset 0 ---
	ldt $f10,0($18)
	ldt $f11,0($20)
	subq $2,1,$3	# $3 = counter for the next copy
	addl $2,$31,$1	# $1 = counter tested by this copy ($31 is zero)
	mult $f12,$f10,$f10	# ALPHA * X
	addt $f11,$f10,$f11	# Y + ALPHA * X
	stt $f11,0($20)
	blt $1,$L2	# leave the loop if the counter went negative
	# --- copy 2: element at offset 8 ---
	ldt $f10,8($18)
	ldt $f11,8($20)
	addl $3,$31,$1
	nop
	mult $f12,$f10,$f10
	subq $2,2,$3
	addt $f11,$f10,$f11
	nop
	stt $f11,8($20)
	blt $1,$L2
	# --- copy 3: element at offset 16 ---
	ldt $f10,16($18)
	ldt $f11,16($20)
	addl $3,$31,$1
	mult $f12,$f10,$f10
	subq $2,3,$3
	addt $f11,$f10,$f11
	stt $f11,16($20)
	blt $1,$L2
	# --- copy 4: element at offset 24, then advance pointers and counter ---
	ldt $f10,24($18)
	ldt $f11,24($20)
	addl $3,$31,$1
	addq $18,32,$18	# X pointer += 4 elements (32 bytes)
	mult $f12,$f10,$f10
	subq $2,4,$2	# running counter -= 4
	addt $f11,$f10,$f11
	stt $f11,24($20)
	addq $20,32,$20	# Y pointer += 4 elements (32 bytes)
	bge $1,$L6	# loop while the counter is still >= 0
$L2:
	ret $31,($26),1	# return through the address in $26
	.end daxpy_
	.ident	"GCC: (GNU) 2.95.1 19990816 (release)"

This results in an even worse 65.9368963 MFlops.

The code of Compaq's compiler looks quite different:
	.file 1 "dxpy.f"
	.loc 1 1
 #      1       SUBROUTINE DAXPY(N,ALPHA,X,S1,Y,S2)
 #
 # Compaq compiler output for the same routine.  Structure, as visible below:
 #   - lab$0004: main loop handling four elements per iteration, using
 #     separate registers ($f1,$f10-$f16) so that all eight loads, the four
 #     multiplies, the four adds and the four stores are grouped together
 #     with a single branch per four elements;
 #   - L$9: remainder loop handling one element per iteration.
 # Registers as used below: $16 -> N, $17 -> ALPHA, $18 -> X, $20 -> Y
 # (all addresses); $f0 = ALPHA; $1 = number of elements still to process.
	.globl  daxpy_
	.ent 	daxpy_
	.loc 1 1
daxpy_:															   # 000001
	.frame  $sp, 0, $26
	.prologue 0
	.loc 1 6
 #      2       IMPLICIT NONE
 #      3       INTEGER*4 N,I,S1,S2
 #      4       REAL*8 ALPHA,X(N),Y(N)
 #      5 
 #      6       DO I=1, N
 # $1 = N; return immediately when N <= 0.
	ldl	$1, ($16)												   # 000006
	ble	$1, lab$0001
	.loc 1 7
 #      7          Y(I)=Y(I)+ALPHA*X(I)
	ldt	$f0, ($17)												   # 000007
	.loc 1 6
 # With 3 or fewer elements, go straight to the one-at-a-time loop.
	cmple	$1, 3, $16												   # 000006
	bne	$16, L$9
lab$0004:
	.loc 1 7
 # Load four X elements and four Y elements.
	ldt	$f1, ($18)												   # 000007
	ldt	$f10, 8($18)
	ldt	$f11, 16($18)
	ldt	$f12, 24($18)
	ldt	$f13, ($20)
	ldt	$f14, 8($20)
	ldt	$f15, 16($20)
	ldt	$f16, 24($20)
	mult	$f0, $f1, $f1
	.loc 1 6
 # Remaining count -= 4.
	lda	$1, -4($1)												   # 000006
	.loc 1 7
	mult	$f0, $f10, $f10												   # 000007
	mult	$f0, $f11, $f11
	.loc 1 6
 # $4 = (remaining <= 3); X pointer += 32 bytes.
	cmple	$1, 3, $4												   # 000006
	lda	$18, 32($18)
	.loc 1 7
	mult	$f0, $f12, $f12												   # 000007
	addt	$f13, $f1, $f1
	.loc 1 6
 # Y pointer += 32 bytes; the stores below therefore use negative offsets.
	lda	$20, 32($20)												   # 000006
	.loc 1 7
	addt	$f14, $f10, $f10											   # 000007
	addt	$f15, $f11, $f11
	addt	$f16, $f12, $f12
	stt	$f1, -32($20)
	stt	$f10, -24($20)
	stt	$f11, -16($20)
	stt	$f12, -8($20)
	.loc 1 6
 # Repeat while more than 3 elements remain; return if none remain.
	beq	$4, lab$0004												   # 000006
	ble	$1, lab$0001
	unop
L$9:
	.loc 1 7
 # Remainder loop: one element per iteration.
	ldt	$f17, ($18)												   # 000007
	ldt	$f18, ($20)
	.loc 1 6
	lda	$1, -1($1)												   # 000006
	lda	$18, 8($18)
	lda	$20, 8($20)
	.loc 1 7
	mult	$f0, $f17, $f17												   # 000007
	addt	$f18, $f17, $f17
	unop
 # Y pointer was already advanced, hence the -8 offset.
	stt	$f17, -8($20)
	.loc 1 6
	bgt	$1, L$9													   # 000006
	.loc 1 10
 #      8       ENDDO
 #      9       RETURN
 #     10       END
lab$0001:														   # 000010
	ret	($26)
	.end 	daxpy_

So, how could g77 be improved most easily?
Is it a problem with the Haifa-scheduler? It seems to me that gcc
can't take as much advantage of unrolling as Compaq's compiler does.

Any thoughts?
Martin.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]