This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Re: (a+b)+c should be replaced by a+(b+c)
Joost VandeVondele wrote:
> BTW, timing of the code below on IBM SP4 with xlf90, would be useful to
> see how gfortran performs.
Being in a benchmarking mood, I took your code and compiled it on a
2.8GHz Pentium 4 (Northwood core). The results did not show gfortran in
a very good light:
- - - - - - - - - - - - - - - - - - - - -
Tycho$ ifort -O3 -tpp7 -xN -ipo -o matmuli matmul.for
IPO: using IR for /tmp/ifortyRX1Wg.o
IPO: performing single-file optimizations
matmul.for(6) : (col. 6) remark: LOOP WAS VECTORIZED.
matmul.for(7) : (col. 6) remark: LOOP WAS VECTORIZED.
matmul.for(8) : (col. 6) remark: LOOP WAS VECTORIZED.
Tycho:$ ./matmuli
5.90410300000000 10.2399999999998
Tycho$ gfortran -o matmulg -O3 -ffast-math -march=pentium4 matmul.for
Tycho$ ./matmulg
71.4641360000000 10.2400000000000
Tycho$ icc -V
Intel(R) C++ Compiler for 32-bit applications, Version 8.0 Build
20031211Z Package ID: l_cc_p_8.0.055_pe057
Copyright (C) 1985-2003 Intel Corporation. All rights reserved.
Tycho$ gfortran -v
Reading specs from
/opt/gcc-tree-ssa/lib/gcc/i686-pc-linux-gnu/3.5-tree-ssa/specs
Configured with: ../gcc/configure --prefix=/opt/gcc-tree-ssa
--disable-checking --enable-shared --enable-threads=posix
--enable-__cxa_atexit --enable-languages=c,c++,f95
Thread model: posix
gcc version 3.5-tree-ssa 20040316 (merged 20040307)
- - - - - - - - - - - - - - - - - - - - -
The generated assembler from GCC looks like:
# mult_ as emitted by gfortran (32-bit x86, AT&T operand order: op src, dst).
# The four arguments are read from the stack at 8/12/16/20(%ebp). The 4th is
# dereferenced once to get the loop bound, called n in the comments below.
#
# Frame slots as used by the code:
#   -16(%ebp)  n
#   -20(%ebp)  3rd argument + bias  (element loaded before / stored after .L6)
#   -24(%ebp)  1st argument + bias  (walked with a stride of n*8 bytes in .L6)
#   -28(%ebp)  2nd argument + bias  (walked with a stride of 8 bytes in .L6)
#   -32(%ebp)  middle-loop index, counts up from 1
#   -36(%ebp)  outer-loop countdown, starts at n
#   -40(%ebp)  per-outer-iteration base pointer derived from -28(%ebp)
#   -44(%ebp)  outer element offset, starts at n and grows by n per outer pass
#   -48(%ebp)  element index of the value being accumulated (-44 slot + -32 slot)
# bias = (~n) << 3 = -(n+1)*8 bytes, which is consistent with 1-based indexing
# of n-by-n arrays of 8-byte elements -- NOTE(review): array shape is inferred,
# the Fortran source is not part of this listing.
.globl mult_
.type mult_, @function
mult_:
# Prologue: frame pointer, three callee-saved registers, 36 bytes of locals.
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $36, %esp
# Fetch the four stack arguments; the 4th is a pointer to n.
movl 20(%ebp), %eax
movl 16(%ebp), %esi
movl 8(%ebp), %edi
movl 12(%ebp), %ecx
movl (%eax), %ebx
movl %ebx, -16(%ebp)
# ebx = (~n) * 8 = -(n+1)*8: the byte bias added to all three base pointers.
xorl $-1, %ebx
sall $3, %ebx
movl -16(%ebp), %eax
# The bias is stored to -28(%ebp) and immediately reloaded into %edx.
movl %ebx, -28(%ebp)
addl %ebx, %esi
movl -28(%ebp), %edx
addl %ebx, %edi
addl %ecx, %edx
# Spill the three biased base pointers.
movl %esi, -20(%ebp)
movl %edi, -24(%ebp)
movl %edx, -28(%ebp)
# Nothing to do when n <= 0.
testl %eax, %eax
jle .L1
# Loop setup: outer countdown and outer offset both start at n;
# %ebx becomes the n*8 byte stride used inside .L6.
movl -16(%ebp), %edx
movl %edx, %ebx
movl %edx, -36(%ebp)
movl %edx, -44(%ebp)
movl %edx, %esi
sall $3, %ebx
movl %edx, %edi
# .L4: top of the outer loop. On entry %edi holds the current outer offset
# and %esi holds n (used as the middle-loop countdown).
.L4:
movl -28(%ebp), %eax
# Restart the middle index at 1.
movl $1, -32(%ebp)
movl -20(%ebp), %edx
# Base pointer for this outer pass = biased 2nd argument + offset*8.
leal (%eax,%edi,8), %ecx
movl %ecx, -40(%ebp)
# Inner trip count for the first middle iteration (equals n here).
movl %esi, %ecx
.p2align 4,,7
# .L5: top of the middle loop. %ecx holds n, %edx the biased 3rd argument.
.L5:
movl -32(%ebp), %edi
movl -44(%ebp), %eax
# Element index = outer offset + middle index; kept in -48(%ebp) for the store.
addl %edi, %eax
movl -24(%ebp), %edi
movl %eax, -48(%ebp)
# Push the current value of the target element as the x87 accumulator.
fldl (%edx,%eax,8)
movl -32(%ebp), %eax
movl -40(%ebp), %edx
addl %ecx, %eax
# %edx -> first 8-byte-stride operand, %eax -> first n*8-stride operand.
addl $8, %edx
leal (%edi,%eax,8), %eax
.p2align 4,,7
# .L6: inner loop, one multiply-accumulate per iteration on the x87 stack.
.L6:
fldl (%edx)
fmull (%eax)
decl %ecx
addl %ebx, %eax
addl $8, %edx
testl %ecx, %ecx
# faddp adds the product into the accumulator and pops; it leaves EFLAGS
# alone, so the jg below still tests the result of the testl above.
faddp %st, %st(1)
jg .L6
# .L7: inner loop done. Bump the middle index, decrement the middle countdown
# and store the accumulator back to the element it was loaded from.
.L7:
movl -32(%ebp), %ecx
movl -48(%ebp), %eax
movl -20(%ebp), %edx
incl %ecx
decl %esi
movl %ecx, -32(%ebp)
fstpl (%edx,%eax,8)
testl %esi, %esi
jle .L18
# More middle iterations remain: reload n as the inner trip count.
movl -16(%ebp), %ecx
jmp .L5
# .L2 is not referenced anywhere in this listing.
.L2:
# .L1: epilogue, also reached directly when n <= 0.
.L1:
addl $36, %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
# .L8 is not referenced anywhere in this listing.
.L8:
# .L18: outer-loop latch, placed after the epilogue. Decrements the outer
# countdown, advances the outer offset by n, then either exits or sets up
# %esi = n and %edi = new offset as .L4 expects.
.L18:
movl -36(%ebp), %edx
movl -44(%ebp), %ecx
decl %edx
movl -16(%ebp), %edi
movl %edx, -36(%ebp)
addl %edi, %ecx
movl -36(%ebp), %esi
movl %ecx, -44(%ebp)
testl %esi, %esi
jle .L1
movl %edi, %esi
movl -44(%ebp), %edi
jmp .L4
.size mult_, .-mult_
# Three file-local common symbols, each reserving 8388608 bytes with a
# requested alignment of 32.
# NOTE(review): 8388608 = 1024*1024*8, presumably 1024x1024 arrays of 8-byte
# reals declared in the caller -- confirm against matmul.for.
.local c.2
.comm c.2,8388608,32
.local a.0
.comm a.0,8388608,32
.local b.1
.comm b.1,8388608,32
# Start of a mergeable string section; the quoted listing is cut off here.
.section .rodata.str1.1,"aMS",@progbits,1
- - - - - - - - - - - - - - - - - - - - -
The generated assembler for Intel Fortran:
# mult_ as emitted by Intel Fortran (32-bit x86, AT&T operand order).
# No frame pointer is kept: after four pushes and an 8-byte local area the
# arguments sit at 28/32/36/40(%esp), as the compiler's own notes below say.
# The 4th argument is dereferenced once to get the loop bound, called n in
# the comments below.
#
# Register / slot roles as used by the code:
#   %ebp     n
#   %eax     8*n, the byte stride added per middle / outer iteration
#   %ebx     inner index (1..n); doubles as the outer index, spilled to 4(%esp)
#   %edi     middle index (1..n)
#   %esi     pointer derived from the 1st argument, advanced by 8*n per middle pass
#   %ecx     pointer derived from the 2nd argument, advanced by 8*n per outer pass
#   %edx     pointer derived from the 3rd argument, advanced by 8*n per outer pass
#   (%esp)   1st argument minus 8*n
#   4(%esp)  saved outer index
# The trailing #line.col annotations are the compiler's own source positions.
.globl mult_
mult_:
# parameter 1: 28 + %esp
# parameter 2: 32 + %esp
# parameter 3: 36 + %esp
# parameter 4: 40 + %esp
# Prologue: save four registers, reserve two 4-byte slots, load n, and skip
# everything when n <= 0. %ebx = 1 is the initial outer index.
..B2.1: # Preds ..B2.0
pushl %edi #15.17
pushl %esi #15.17
pushl %ebp #15.17
pushl %ebx #15.17
subl $8, %esp #15.17
movl 40(%esp), %eax #1.0
movl (%eax), %ebp #15.17
movl $1, %ebx #18.6
testl %ebp, %ebp #18.6
jle ..B2.9 # Prob 1% #18.6
# LOE ebx ebp
# Loop preheader: %eax = 8*n is built as 2n, then doubled twice. Each base
# pointer is lowered by 8*n; %ecx and %edx then get (outer index)*8*n added
# back, which with %ebx = 1 here yields the original 2nd and 3rd arguments.
..B2.2: # Preds ..B2.1
movl 28(%esp), %esi #
movl 32(%esp), %edx #
movl 36(%esp), %edi #
lea (%ebp,%ebp), %eax #
addl %eax, %eax #
addl %eax, %eax #
subl %eax, %esi #
movl %esi, (%esp) #
subl %eax, %edx #
subl %eax, %edi #
movl %ebx, %ecx #
imull %eax, %ecx #
addl %edx, %ecx #
movl %ebx, %edx #
imull %eax, %edx #
addl %edi, %edx #
# LOE eax edx ecx ebx ebp
# Outer loop head: spill the outer index so %ebx is free for the inner loop,
# restart the middle index at 1 and reset %esi to the original 1st argument.
..B2.3: # Preds ..B2.7 ..B2.2
movl (%esp), %esi #19.6
movl %ebx, 4(%esp) #19.6
movl $1, %edi #19.6
lea (%eax,%esi), %esi #19.6
# LOE eax edx ecx ebp esi edi
# Middle loop head: load one scalar double into %xmm0; it stays fixed for the
# whole inner loop. The -8 displacement turns the 1-based index into an offset.
..B2.4: # Preds ..B2.6 ..B2.3
movsd -8(%ecx,%edi,8), %xmm0 #21.29
movl $1, %ebx #20.6
.align 4,0x90
# LOE eax edx ecx ebx ebp esi edi xmm0
# Inner loop: edx[ebx-1] = esi[ebx-1] * xmm0 + edx[ebx-1], using scalar
# double-precision SSE2 instructions, with a stride of 8 bytes on both
# memory operands.
..B2.5: # Preds ..B2.5 ..B2.4
movsd -8(%esi,%ebx,8), %xmm1 #21.22
mulsd %xmm0, %xmm1 #21.28
addsd -8(%edx,%ebx,8), %xmm1 #21.21
movsd %xmm1, -8(%edx,%ebx,8) #21.8
addl $1, %ebx #20.6
cmpl %ebp, %ebx #20.6
jle ..B2.5 # Prob 99% #20.6
# LOE eax edx ecx ebx ebp esi edi xmm0
# Middle loop latch: step %esi by 8*n and repeat while the middle index <= n.
..B2.6: # Preds ..B2.5
addl %eax, %esi #19.6
addl $1, %edi #19.6
cmpl %ebp, %edi #19.6
jle ..B2.4 # Prob 99% #19.6
# LOE eax edx ecx ebp esi edi
# Outer loop latch: restore the outer index, step %ecx and %edx by 8*n and
# repeat while the outer index <= n.
..B2.7: # Preds ..B2.6
movl 4(%esp), %ebx #
addl %eax, %ecx #18.6
addl %eax, %edx #18.6
addl $1, %ebx #18.6
cmpl %ebp, %ebx #18.6
jle ..B2.3 # Prob 99% #18.6
# LOE eax edx ecx ebx ebp
# Epilogue: release the local area and restore registers in reverse push order.
..B2.9: # Preds ..B2.7 ..B2.1
addl $8, %esp #26.6
popl %ebx #26.6
popl %ebp #26.6
popl %esi #26.6
popl %edi #26.6
ret #26.6
- - - - - - - - - - - - - - - - - - - - -
I think gfortran gets its tail stomped by Intel's effort in this comparison.
Side note: I assume you are aware that your code is a brute force
technique for matrix multiplies, and that other algorithms are much more
efficient.
If anyone is interested, I can perform the same experiment with the
Intel and GNU C compilers.
--
Scott Robert Ladd
Coyote Gulch Productions (http://www.coyotegulch.com)
Software Invention for High-Performance Computing