This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
g77 performance on ALPHA
- To: egcs@egcs.cygnus.com
- Subject: g77 performance on ALPHA
- From: martin.kahlert@provi.de
- Date: Sat, 28 Aug 1999 13:30:48 +0200
Hi,
I am participating in the beta test of Compaq's Fortran compiler
for Linux-Alpha. I have an EV56 system running at 530 MHz.
The result is not too good for g77 (2.95.1 release).
I tested a big (>1MB source) code (electrical analog circuit simulator)
and Compaq's compiler produces more than twice the performance of g77.
So I tested with a small daxpy operation:
SUBROUTINE DAXPY(N,ALPHA,X,I1,Y,I2)
IMPLICIT NONE
INTEGER*4 N,I,I1,I2
REAL*8 ALPHA,X(N),Y(N)
DO I=1, N
Y(I)=Y(I)+ALPHA*X(I)
ENDDO
RETURN
END
My main program looks like this:
IMPLICIT NONE
INTEGER*4 N,NLOOP,I
PARAMETER(N=1000,NLOOP=100000)
REAL*4 TS,TE,TARR(2),ETIME
REAL*8 X(N),Y(N),OPS
DO I=1, N
X(I) = 1.1D0
Y(I) = 1.1D0
ENDDO
TS = ETIME(TARR)
DO I=1, NLOOP
CALL DAXPY(N,1.01D0,X,1,Y,1)
ENDDO
TE = ETIME(TARR)
OPS = (1D-6*N)*(NLOOP*2D0)
WRITE(*,*) OPS/(TE-TS),' MFlops'
END
Result:
g77 -O3 -o main main.f daxpy.f
./main
72.75 MFlops
fort -O -fast -o main main.f daxpy.f
./main
191.94 MFlops
(The handcoded asm-daxpy of Mr. Goto gets 378.55 MFlops)
g77 -O3 -S daxpy.f results in this:
.file 1 "dxpy.f"
.set noat
.set noreorder
.arch ev56
.text
.align 5
.globl daxpy_
.ent daxpy_
daxpy_:
.frame $30,0,$26,0
$daxpy_..ng:
.prologue 0
ldl $1,0($16)
subl $1,1,$1
fnop
blt $1,$L2
ldt $f12,0($17)
subq $1,1,$2
.align 4
$L6:
ldt $f10,0($18)
ldt $f11,0($20)
mov $2,$1
addq $18,8,$18
mult $f12,$f10,$f10
addl $1,$31,$1
subq $2,1,$2
addt $f11,$f10,$f11
stt $f11,0($20)
addq $20,8,$20
bge $1,$L6
$L2:
ret $31,($26),1
.end daxpy_
.ident "GCC: (GNU) 2.95.1 19990816 (release)"
The loop isn't unrolled with -funroll-loops, either.
Using -funroll-all-loops, we get
.file 1 "dxpy.f"
.set noat
.set noreorder
.arch ev56
.text
.align 5
.globl daxpy_
.ent daxpy_
daxpy_:
.frame $30,0,$26,0
$daxpy_..ng:
.prologue 0
ldl $1,0($16)
subl $1,1,$1
fnop
blt $1,$L2
ldt $f12,0($17)
subq $1,1,$2
.align 4
$L6:
ldt $f10,0($18)
ldt $f11,0($20)
subq $2,1,$3
addl $2,$31,$1
mult $f12,$f10,$f10
addt $f11,$f10,$f11
stt $f11,0($20)
blt $1,$L2
ldt $f10,8($18)
ldt $f11,8($20)
addl $3,$31,$1
nop
mult $f12,$f10,$f10
subq $2,2,$3
addt $f11,$f10,$f11
nop
stt $f11,8($20)
blt $1,$L2
ldt $f10,16($18)
ldt $f11,16($20)
addl $3,$31,$1
mult $f12,$f10,$f10
subq $2,3,$3
addt $f11,$f10,$f11
stt $f11,16($20)
blt $1,$L2
ldt $f10,24($18)
ldt $f11,24($20)
addl $3,$31,$1
addq $18,32,$18
mult $f12,$f10,$f10
subq $2,4,$2
addt $f11,$f10,$f11
stt $f11,24($20)
addq $20,32,$20
bge $1,$L6
$L2:
ret $31,($26),1
.end daxpy_
.ident "GCC: (GNU) 2.95.1 19990816 (release)"
This results in an even worse 65.9368963 MFlops.
The code of Compaq's compiler looks quite different:
.file 1 "dxpy.f"
.loc 1 1
# 1 SUBROUTINE DAXPY(N,ALPHA,X,S1,Y,S2)
.globl daxpy_
.ent daxpy_
.loc 1 1
daxpy_: # 000001
.frame $sp, 0, $26
.prologue 0
.loc 1 6
# 2 IMPLICIT NONE
# 3 INTEGER*4 N,I,S1,S2
# 4 REAL*8 ALPHA,X(N),Y(N)
# 5
# 6 DO I=1, N
ldl $1, ($16) # 000006
ble $1, lab$0001
.loc 1 7
# 7 Y(I)=Y(I)+ALPHA*X(I)
ldt $f0, ($17) # 000007
.loc 1 6
cmple $1, 3, $16 # 000006
bne $16, L$9
lab$0004:
.loc 1 7
ldt $f1, ($18) # 000007
ldt $f10, 8($18)
ldt $f11, 16($18)
ldt $f12, 24($18)
ldt $f13, ($20)
ldt $f14, 8($20)
ldt $f15, 16($20)
ldt $f16, 24($20)
mult $f0, $f1, $f1
.loc 1 6
lda $1, -4($1) # 000006
.loc 1 7
mult $f0, $f10, $f10 # 000007
mult $f0, $f11, $f11
.loc 1 6
cmple $1, 3, $4 # 000006
lda $18, 32($18)
.loc 1 7
mult $f0, $f12, $f12 # 000007
addt $f13, $f1, $f1
.loc 1 6
lda $20, 32($20) # 000006
.loc 1 7
addt $f14, $f10, $f10 # 000007
addt $f15, $f11, $f11
addt $f16, $f12, $f12
stt $f1, -32($20)
stt $f10, -24($20)
stt $f11, -16($20)
stt $f12, -8($20)
.loc 1 6
beq $4, lab$0004 # 000006
ble $1, lab$0001
unop
L$9:
.loc 1 7
ldt $f17, ($18) # 000007
ldt $f18, ($20)
.loc 1 6
lda $1, -1($1) # 000006
lda $18, 8($18)
lda $20, 8($20)
.loc 1 7
mult $f0, $f17, $f17 # 000007
addt $f18, $f17, $f17
unop
stt $f17, -8($20)
.loc 1 6
bgt $1, L$9 # 000006
.loc 1 10
# 8 ENDDO
# 9 RETURN
# 10 END
lab$0001: # 000010
ret ($26)
.end daxpy_
So, how could g77 be improved most easily?
Is it a problem with the Haifa scheduler? It seems to me that gcc
cannot take as much advantage of unrolling as Compaq's compiler does.
Any thoughts?
Martin.