This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: (a+b)+c should be replaced by a+(b+c)

From: Scott Robert Ladd <coyote at coyotegulch dot com>
To: gcc mailing list <gcc at gcc dot gnu dot org>
Date: Thu, 25 Mar 2004 09:21:48 -0500
Subject: Re: (a+b)+c should be replaced by a+(b+c)
References: <Pine.SOL.4.58.0403250711490.18567@yellow.csi.cam.ac.uk> <4062D287.7050100@gnat.com> <Pine.SOL.4.58.0403251240250.18567@yellow.csi.cam.ac.uk>

Joost VandeVondele wrote:

> BTW, timing of the code below on IBM SP4 with xlf90, would be useful to
> see how gfortran performs.


Being in a benchmarking mood, I took your code and compiled it on a
2.8GHz Pentium 4 (Northwood core). The results did not show gfortran in
a very good light:

- - - - - - - - - - - - - - - - - - - - -

Tycho$ ifort -O3 -tpp7 -xN -ipo -o matmuli matmul.for
IPO: using IR for /tmp/ifortyRX1Wg.o
IPO: performing single-file optimizations
matmul.for(6) : (col. 6) remark: LOOP WAS VECTORIZED.
matmul.for(7) : (col. 6) remark: LOOP WAS VECTORIZED.
matmul.for(8) : (col. 6) remark: LOOP WAS VECTORIZED.
Tycho$ ./matmuli
   5.90410300000000        10.2399999999998
Tycho$ gfortran -o matmulg -O3 -ffast-math -march=pentium4 matmul.for
Tycho$ ./matmulg
   71.4641360000000         10.2400000000000

Tycho$ icc -V
Intel(R) C++ Compiler for 32-bit applications, Version 8.0   Build
20031211Z Package ID: l_cc_p_8.0.055_pe057
Copyright (C) 1985-2003 Intel Corporation.  All rights reserved.

Tycho$ gfortran -v
Reading specs from
/opt/gcc-tree-ssa/lib/gcc/i686-pc-linux-gnu/3.5-tree-ssa/specs
Configured with: ../gcc/configure --prefix=/opt/gcc-tree-ssa
--disable-checking --enable-shared --enable-threads=posix
--enable-__cxa_atexit --enable-languages=c,c++,f95
Thread model: posix
gcc version 3.5-tree-ssa 20040316 (merged 20040307)

- - - - - - - - - - - - - - - - - - - - -

The generated assembler from GCC looks like:

# mult_ as emitted by gfortran (listing quoted verbatim; comments added).
# AT&T syntax, 32-bit x86, %ebp frame pointer, x87 floating point.
#
# Stack arguments as read by this code:
#   arg at  8(%ebp), 12(%ebp), 16(%ebp): three base addresses whose
#           elements are accessed as 8-byte values (fldl/fmull/fstpl)
#   arg at 20(%ebp): pointer to a 32-bit count, called n below
# NOTE(review): presumably three n-by-n double-precision Fortran arrays
# passed by reference -- confirm against matmul.for.
#
# Frame slots (the three pushes occupy -4..-12, subl $36 covers -16..-48):
#   slot -16(%ebp): n
#   slot -20(%ebp): 16(%ebp) argument - 8*(n+1)
#   slot -24(%ebp):  8(%ebp) argument - 8*(n+1)
#   slot -28(%ebp): 12(%ebp) argument - 8*(n+1)
#   slot -32(%ebp): middle-loop index, reset to 1 at .L4
#   slot -36(%ebp): outer-loop down-counter, starts at n
#   slot -40(%ebp): slot -28 value + 8 * (slot -44 value)
#   slot -44(%ebp): outer element offset, starts at n, grows by n
#   slot -48(%ebp): element index of the value being accumulated
# NOTE(review): the -8*(n+1) bias looks like the usual adjustment for
# 1-based, column-major indexing -- confirm.
.globl mult_
	.type	mult_, @function
mult_:
	# Prologue: set up the frame, save %edi/%esi/%ebx, reserve 36 bytes.
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	subl	$36, %esp
	# Fetch the four arguments and dereference the count.
	movl	20(%ebp), %eax
	movl	16(%ebp), %esi
	movl	8(%ebp), %edi
	movl	12(%ebp), %ecx
	movl	(%eax), %ebx
	movl	%ebx, -16(%ebp)
	# ebx = ~n << 3 = -8*(n+1); this bias is added to all three bases.
	xorl	$-1, %ebx
	sall	$3, %ebx
	movl	-16(%ebp), %eax
	movl	%ebx, -28(%ebp)
	addl	%ebx, %esi
	movl	-28(%ebp), %edx
	addl	%ebx, %edi
	addl	%ecx, %edx
	movl	%esi, -20(%ebp)
	movl	%edi, -24(%ebp)
	movl	%edx, -28(%ebp)
	# Nothing to do when n <= 0.
	testl	%eax, %eax
	jle	.L1
	# Initialise loop state: outer counter, outer offset, middle counter
	# (%esi) and %edi all start at n; %ebx becomes the 8*n byte stride.
	movl	-16(%ebp), %edx
	movl	%edx, %ebx
	movl	%edx, -36(%ebp)
	movl	%edx, -44(%ebp)
	movl	%edx, %esi
	sall	$3, %ebx
	movl	%edx, %edi
.L4:
	# Outer-loop body. On entry %edi holds the slot -44 offset and %esi = n.
	movl	-28(%ebp), %eax
	movl	$1, -32(%ebp)
	movl	-20(%ebp), %edx
	leal	(%eax,%edi,8), %ecx
	movl	%ecx, -40(%ebp)
	movl	%esi, %ecx
	.p2align 4,,7
.L5:
	# Middle-loop body: element index = middle index + outer offset.
	# The current value at that index (base from slot -20) is pushed on
	# the x87 stack as the running sum. Every index, offset and base is
	# reloaded from the frame on each pass rather than kept in a register.
	movl	-32(%ebp), %edi
	movl	-44(%ebp), %eax
	addl	%edi, %eax
	movl	-24(%ebp), %edi
	movl	%eax, -48(%ebp)
	fldl	(%edx,%eax,8)
	# Set up the two inner-loop pointers: %edx steps through memory by
	# 8 bytes, %eax steps by 8*n bytes (%ebx). %ecx = n here.
	movl	-32(%ebp), %eax
	movl	-40(%ebp), %edx
	addl	%ecx, %eax
	addl	$8, %edx
	leal	(%edi,%eax,8), %eax
	.p2align 4,,7
.L6:
	# Inner loop: sum += (%edx) * (%eax), repeated while --%ecx > 0.
	# The sum stays on the x87 stack; it is not stored inside this loop.
	fldl	(%edx)
	fmull	(%eax)
	decl	%ecx
	addl	%ebx, %eax
	addl	$8, %edx
	testl	%ecx, %ecx
	faddp	%st, %st(1)
	jg	.L6
.L7:
	# (.L7 is not the target of any jump in this listing.)
	# Middle-loop latch: bump the index, store the finished sum back to
	# the element it was loaded from, and repeat while --%esi > 0.
	movl	-32(%ebp), %ecx
	movl	-48(%ebp), %eax
	movl	-20(%ebp), %edx
	incl	%ecx
	decl	%esi
	movl	%ecx, -32(%ebp)
	fstpl	(%edx,%eax,8)
	testl	%esi, %esi
	jle	.L18
	movl	-16(%ebp), %ecx	# inner trip count = n again
	jmp	.L5
.L2:
.L1:
	# Epilogue: release locals, restore callee-saved registers, return.
	# (.L2 is not the target of any jump in this listing.)
	addl	$36, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
.L8:
.L18:
	# Outer-loop latch (placed after the return): decrement the counter
	# in slot -36, advance the slot -44 offset by n, and leave via .L1
	# once the counter reaches zero. Otherwise re-establish the .L4
	# entry state (%esi = n, %edi = new offset).
	# (.L8 is not the target of any jump in this listing.)
	movl	-36(%ebp), %edx
	movl	-44(%ebp), %ecx
	decl	%edx
	movl	-16(%ebp), %edi
	movl	%edx, -36(%ebp)
	addl	%edi, %ecx
	movl	-36(%ebp), %esi
	movl	%ecx, -44(%ebp)
	testl	%esi, %esi
	jle	.L1
	movl	%edi, %esi
	movl	-44(%ebp), %edi
	jmp	.L4
	.size	mult_, .-mult_
	# Three file-local common blocks of 8388608 bytes (= 1024*1024*8)
	# each, with 32-byte alignment. mult_ itself never names them; it
	# only works through its pointer arguments.
	.local	c.2
	.comm	c.2,8388608,32
	.local	a.0
	.comm	a.0,8388608,32
	.local	b.1
	.comm	b.1,8388608,32
	# Start of a mergeable string section; its contents are not part of
	# the quoted listing.
	.section	.rodata.str1.1,"aMS",@progbits,1

- - - - - - - - - - - - - - - - - - - - -

The generated assembler for Intel Fortran:

	.globl mult_
# mult_ as emitted by Intel Fortran (listing quoted verbatim; the ifort
# annotations are original, the explanatory comment lines were added).
# AT&T syntax, 32-bit x86, no frame pointer, scalar SSE2 arithmetic
# (movsd/mulsd/addsd).
#
# Register and stack roles established by the code below:
#   ebp      n, loaded through parameter 4
#   eax      8*n, the byte stride added between passes
#   ecx      parameter 2 address, advanced by 8*n per outer pass
#   edx      parameter 3 address, advanced by 8*n per outer pass
#   esi      parameter 1 address, advanced by 8*n per middle pass
#   edi      middle-loop index, 1..n
#   ebx      outer index until ..B2.3 spills it, then inner index 1..n
#   (%esp)   parameter 1 address minus 8*n
#   4(%esp)  spilled outer-loop index
# NOTE(review): presumably the three addresses are n-by-n double-precision
# Fortran arrays passed by reference -- confirm against matmul.for.
mult_:
# parameter 1: 28 + %esp
# parameter 2: 32 + %esp
# parameter 3: 36 + %esp
# parameter 4: 40 + %esp
# Prologue: save four registers and reserve 8 bytes of spill space; that
# is 24 bytes plus the return address, hence parameter 1 at 28(%esp).
# Return immediately when n <= 0.
..B2.1:                         # Preds ..B2.0
        pushl     %edi                                          #15.17
        pushl     %esi                                          #15.17
        pushl     %ebp                                          #15.17
        pushl     %ebx                                          #15.17
        subl      $8, %esp                                      #15.17
        movl      40(%esp), %eax                                #1.0
        movl      (%eax), %ebp                                  #15.17
        movl      $1, %ebx                                      #18.6
        testl     %ebp, %ebp                                    #18.6
        jle       ..B2.9        # Prob 1%                       #18.6
                                # LOE ebx ebp
# Loop setup: eax = 8*n (2n, doubled twice). Each base is biased by -8*n
# and then has (outer index = 1) * 8*n added back, so ecx and edx start
# at the parameter 2 and parameter 3 addresses themselves.
..B2.2:                         # Preds ..B2.1
        movl      28(%esp), %esi                                #
        movl      32(%esp), %edx                                #
        movl      36(%esp), %edi                                #
        lea       (%ebp,%ebp), %eax                             #
        addl      %eax, %eax                                    #
        addl      %eax, %eax                                    #
        subl      %eax, %esi                                    #
        movl      %esi, (%esp)                                  #
        subl      %eax, %edx                                    #
        subl      %eax, %edi                                    #
        movl      %ebx, %ecx                                    #
        imull     %eax, %ecx                                    #
        addl      %edx, %ecx                                    #
        movl      %ebx, %edx                                    #
        imull     %eax, %edx                                    #
        addl      %edi, %edx                                    #
                                # LOE eax edx ecx ebx ebp
# Outer-loop head: spill the outer index (ebx is reused by the inner
# loop), reset the middle index to 1 and point esi back at parameter 1.
..B2.3:                         # Preds ..B2.7 ..B2.2
        movl      (%esp), %esi                                  #19.6
        movl      %ebx, 4(%esp)                                 #19.6
        movl      $1, %edi                                      #19.6
        lea       (%eax,%esi), %esi                             #19.6
                                # LOE eax edx ecx ebp esi edi
# Middle-loop head: load one 8-byte element (index edi, 1-based via the
# -8 displacement) into xmm0, where it stays for the whole inner loop.
..B2.4:                         # Preds ..B2.6 ..B2.3
        movsd     -8(%ecx,%edi,8), %xmm0                        #21.29
        movl      $1, %ebx                                      #20.6
        .align    4,0x90
                                # LOE eax edx ecx ebx ebp esi edi xmm0
# Inner loop, ebx = 1..n: [edx + 8*ebx - 8] += [esi + 8*ebx - 8] * xmm0.
# Both memory operands advance by 8 bytes per iteration, and the result
# is stored back to memory on every iteration.
..B2.5:                         # Preds ..B2.5 ..B2.4
        movsd     -8(%esi,%ebx,8), %xmm1                        #21.22
        mulsd     %xmm0, %xmm1                                  #21.28
        addsd     -8(%edx,%ebx,8), %xmm1                        #21.21
        movsd     %xmm1, -8(%edx,%ebx,8)                        #21.8
        addl      $1, %ebx                                      #20.6
        cmpl      %ebp, %ebx                                    #20.6
        jle       ..B2.5        # Prob 99%                      #20.6
                                # LOE eax edx ecx ebx ebp esi edi xmm0
# Middle-loop latch: advance esi by 8*n and repeat while edi <= n.
..B2.6:                         # Preds ..B2.5
        addl      %eax, %esi                                    #19.6
        addl      $1, %edi                                      #19.6
        cmpl      %ebp, %edi                                    #19.6
        jle       ..B2.4        # Prob 99%                      #19.6
                                # LOE eax edx ecx ebp esi edi
# Outer-loop latch: reload the outer index, advance ecx and edx by 8*n
# and repeat while the index <= n.
..B2.7:                         # Preds ..B2.6
        movl      4(%esp), %ebx                                 #
        addl      %eax, %ecx                                    #18.6
        addl      %eax, %edx                                    #18.6
        addl      $1, %ebx                                      #18.6
        cmpl      %ebp, %ebx                                    #18.6
        jle       ..B2.3        # Prob 99%                      #18.6
                                # LOE eax edx ecx ebx ebp
# Epilogue: release the spill area and restore the saved registers in
# reverse push order.
..B2.9:                         # Preds ..B2.7 ..B2.1
        addl      $8, %esp                                      #26.6
        popl      %ebx                                          #26.6
        popl      %ebp                                          #26.6
        popl      %esi                                          #26.6
        popl      %edi                                          #26.6
        ret                                                     #26.6

- - - - - - - - - - - - - - - - - - - - -

I think gfortran gets its tail stomped by Intel's effort in this comparison.

Side note: I assume you are aware that your code is a brute force
technique for matrix multiplies, and that other algorithms are much more
efficient.

If anyone is interested, I can perform the same experiment with the
Intel and GNU C compilers.

--
Scott Robert Ladd
Coyote Gulch Productions (http://www.coyotegulch.com)
Software Invention for High-Performance Computing

Follow-Ups:
- Re: (a+b)+c should be replaced by a+(b+c)
  - From: Jakub Jelinek

References:
- (a+b)+c should be replaced by a+(b+c)
  - From: Joost VandeVondele
- Re: (a+b)+c should be replaced by a+(b+c)
  - From: Robert Dewar
- Re: (a+b)+c should be replaced by a+(b+c)
  - From: Joost VandeVondele

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]