This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]

Performance of Integer Multiplication on PIII

To: <gcc at gcc dot gnu dot org>
Subject: Performance of Integer Multiplication on PIII
From: Kevin Atkinson <kevin at atkinson dot dhs dot org>
Date: Fri, 2 Nov 2001 13:45:59 -0500 (EST)

While trying to teach myself some x86 assembly language, I was playing
around with hand-coding a function that encodes an array of integers (with
limited range) into a number for direct referencing, and I discovered that
my hand-coded code, which does not do any fancy tricks, is roughly 50%
faster than GCC's code, which attempts to avoid imul by using shifts and
adds.  Here are the results I get.

[kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read-empty.c t.c &&
./a.out
1
1.260000

[kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read.c t.c && ./a.out
2366519
6.140000

[kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read.S t.c && ./a.out
2366519
3.690000

Thus, after subtracting the cost of the loop itself and the function calls,
GCC's code takes 4.88 seconds of CPU time while my code takes 2.43 seconds
of CPU time. Thus my code is roughly 50% faster (it takes about half the time).

Here is my test code:

#include <stdio.h>
#include <time.h>

unsigned read(const unsigned *spl);

/*
 * Benchmark driver for read().
 *
 * Prints the value read() returns for a fixed 8-element input (so the
 * different implementations can be compared for equality), then times
 * 0x4000000 back-to-back calls and prints the elapsed CPU time in seconds.
 */
int main(void)
{
  clock_t start, stop;  /* clock() returns clock_t, not time_t */
  unsigned a[8] = {1, 2, 3, 4, 1, 2, 3, 4};
  unsigned i;

  /* read() returns unsigned, so print it with %u rather than %d. */
  printf("%u\n", read(a));

  start = clock();
  for (i = 0; i != 0X4000000; ++i)
    read(a);
  stop = clock();

  /* Convert clock ticks to seconds. */
  printf("%f\n", (stop-start)/(double)CLOCKS_PER_SEC);
  return 0;
}

read-empty.c is simply an empty function:

/*
 * Baseline stub with the same signature as the real read(). It does no
 * work, so timing it measures only the benchmark loop and the call itself.
 *
 * NOTE(review): there is no return statement, so the value the caller
 * receives is indeterminate; the number printed for this build is not
 * meaningful.
 */
unsigned read(const unsigned * spl)
{
}

here is my C code for read():

/*
 * Encode the eight values spl[0..7] into a single unsigned number by
 * summing each element multiplied by a fixed positional weight:
 *
 *   weight[0] = 1
 *   weight[1] = 7
 *   weight[2] = 7*7
 *   weight[3] = 7*7*7
 *   weight[4] = 7*7*7*7
 *   weight[5] = 7*7*7*7*6
 *   weight[6] = 7*7*7*7*6*6
 *   weight[7] = 7*7*7*7*6*6*6
 *
 * All arithmetic is unsigned, so any overflow wraps modulo 2^32 (for a
 * 32-bit unsigned).  No range checking is done on the elements.
 */
unsigned read(const unsigned * spl)
{
  /* Each weight is the previous one times 7 (first four) or 6 (last three). */
  enum {
    W1 = 7,
    W2 = W1 * 7,
    W3 = W2 * 7,
    W4 = W3 * 7,
    W5 = W4 * 6,
    W6 = W5 * 6,
    W7 = W6 * 6
  };
  unsigned code = spl[0];

  code += spl[1] * W1;
  code += spl[2] * W2;
  code += spl[3] * W3;
  code += spl[4] * W4;
  code += spl[5] * W5;
  code += spl[6] * W6;
  code += spl[7] * W7;

  return code;
}

Here is my hand coded assembly:

/*
 * unsigned read(const unsigned *spl)
 *
 * Hand-written i386 version of the C read() (GAS, Intel syntax, no
 * register prefix).  Returns
 *
 *   spl[0] + 7*spl[1] + 7*7*spl[2] + 7*7*7*spl[3] + 7*7*7*7*spl[4]
 *     + 7*7*7*7*6*spl[5] + 7*7*7*7*6*6*spl[6] + 7*7*7*7*6*6*6*spl[7]
 *
 * using one three-operand imul (reg = mem * immediate) per weighted
 * element instead of shift/add sequences.
 *
 * In:    spl is the stack argument, at [esp+4] on entry
 * Out:   eax = encoded value
 * Regs:  edx = spl, eax = running sum, ebx/ecx = partial products
 *        ebx is pushed on entry and popped before ret; ecx and edx are
 *        overwritten.
 */
.intel_syntax noprefix
.globl read
	.type	 read,@function
read:
	push ebx				/* save ebx; restored before ret */
	mov edx,[esp+8]				/* edx = spl (arg is at esp+8 after the push) */
	imul eax,[edx+7*4],7*7*7*7*6*6*6	/* eax = spl[7] * weight 7 */
	imul ebx,[edx+6*4],7*7*7*7*6*6		/* ebx = spl[6] * weight 6 */
	add eax,ebx				/* eax = terms 7 and 6 */
	imul ebx,[edx+5*4],7*7*7*7*6		/* ebx = spl[5] * weight 5 */
	imul ecx,[edx+4*4],7*7*7*7		/* ecx = spl[4] * weight 4 */
	add  ebx,ecx				/* ebx = terms 5 and 4 */
	add eax,ebx				/* eax = terms 7..4 */
	imul ebx,[edx+3*4],7*7*7		/* ebx = spl[3] * weight 3 */
	imul ecx,[edx+2*4],7*7			/* ecx = spl[2] * weight 2 */
	add ebx,ecx				/* ebx = terms 3 and 2 */
	add eax,ebx				/* eax = terms 7..2 */
	imul ebx,[edx+1*4],7			/* ebx = spl[1] * 7 */
	add ebx,[edx+0*4]			/* ebx += spl[0] (weight 1) */
	add eax,ebx				/* eax = full sum, the return value */
	pop ebx					/* restore the saved ebx */
	ret
.end_read:
	.size	 read,.end_read-read		/* symbol size = bytes from read to .end_read */

And here is GCC's output for the C code (in Intel syntax):
[kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read.c -S -mintel-syntax

	# Compiler output for the C read() (GCC 3.0.2, -O2 -march=i686),
	# GAS Intel syntax with %-prefixed register names.
	#
	# In the comments below sN stands for spl[N].  The listing computes
	#   s0 + 7*s1 + 49*s2 + 343*s3 + 2401*s4 + 14406*s5 + 86436*s6 + 518616*s7
	# Only the s6 term uses imul; every other constant multiply is built
	# from shifts, adds, subtracts and lea.
	.file	"read.c"
	.intel_syntax
	.text
	.align 16
.globl read
	.type	read,@function
read:
	push	%ebp			# set up frame pointer
	mov	%ebp, %esp
	push	%edi			# save registers that are restored before ret
	push	%esi
	mov	%esi, DWORD PTR [%ebp+8]	# esi = spl (first argument)
	push	%ebx
	mov	%edx, DWORD PTR [%esi+20]	# edx = s5
	mov	%eax, %edx
	sal	%eax, 8			# eax = 256*s5
	add	%eax, %edx		# eax = 257*s5
	lea	%eax, [%edx+%eax*4]	# eax = 1029*s5
	mov	%edx, DWORD PTR [%esi+16]	# edx = s4
	lea	%edi, [0+%eax*8]	# edi = 8232*s5
	sub	%edi, %eax		# edi = 7203*s5
	mov	%eax, DWORD PTR [%esi+24]	# eax = s6
	lea	%ecx, [%edx+%edx*4]	# ecx = 5*s4
	imul	%eax, %eax, 86436	# eax = 86436*s6 (7*7*7*7*6*6)
	lea	%edi, [%eax+%edi*2]	# edi = 86436*s6 + 14406*s5
	mov	%eax, DWORD PTR [%esi+12]	# eax = s3
	lea	%ebx, [%eax+%eax*8]	# ebx = 9*s3
	lea	%ebx, [%eax+%ebx*2]	# ebx = 19*s3
	lea	%ebx, [%ebx+%ebx*8]	# ebx = 171*s3
	lea	%ebx, [%eax+%ebx*2]	# ebx = 343*s3
	mov	%eax, %ecx		# eax = 5*s4
	sal	%eax, 4			# eax = 80*s4
	sub	%eax, %ecx		# eax = 75*s4
	mov	%ecx, DWORD PTR [%esi+8]	# ecx = s2
	sal	%eax, 5			# eax = 2400*s4
	add	%eax, %edx		# eax = 2401*s4
	add	%ebx, %eax		# ebx = 343*s3 + 2401*s4
	mov	%eax, DWORD PTR [%esi+4]	# eax = s1
	lea	%edx, [0+%eax*8]	# edx = 8*s1
	sub	%edx, %eax		# edx = 7*s1
	lea	%eax, [%ecx+%ecx*2]	# eax = 3*s2
	sal	%eax, 4			# eax = 48*s2
	add	%eax, %ecx		# eax = 49*s2
	add	%edx, %eax		# edx = 7*s1 + 49*s2
	mov	%eax, DWORD PTR [%esi]	# eax = s0
	add	%edx, %eax		# edx = s0 + 7*s1 + 49*s2
	mov	%eax, DWORD PTR [%esi+28]	# eax = s7
	add	%ebx, %edx		# ebx = terms for s0..s4
	add	%edi, %ebx		# edi = terms for s0..s6
	pop	%ebx			# restores are interleaved with the s7 multiply
	mov	%edx, %eax
	sal	%edx, 8			# edx = 256*s7
	add	%edx, %eax		# edx = 257*s7
	lea	%edx, [%eax+%edx*4]	# edx = 1029*s7
	mov	%eax, %edx
	pop	%esi
	sal	%eax, 6			# eax = 65856*s7
	sub	%eax, %edx		# eax = 64827*s7
	lea	%eax, [%edi+%eax*8]	# eax = edi + 518616*s7 = return value
	pop	%edi
	pop	%ebp
	ret
.Lfe1:
	.size	read,.Lfe1-read
	.ident	"GCC: (GNU) 3.0.2"

When running these same tests on a Mobile Pentium MMX (using -march=i586),
GCC's code does outperform mine.  I do not have anything in between to run
these tests on, so I would appreciate it if someone with a Pentium Pro and
PII (or is that the same thing as a Pentium Pro?) could run them and post
the results.

So I guess the lesson here is that on the PIII integer multiplication is
fast enough that doing special tricks to avoid integer multiplication will
hurt performance instead of helping it.

Is this a known issue?  And if so, does anyone plan on addressing it?  It
seems to me that the only fix would be to introduce more options for
-march and -mcpu for the faster Pentiums.

Thanks in advance.

Follow-Ups:
- Re: Performance of Integer Multiplication on PIII
  - From: Tim Prince
- Re: Performance of Integer Multiplication on PIII
  - From: Kevin Atkinson
- Re: Performance of Integer Multiplication on PIII
  - From: Jan Hubicka
- Re: Performance of Integer Multiplication on PIII
  - From: Kevin Atkinson

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]