This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Performance of Integer Multiplication on PIII
- To: <gcc at gcc dot gnu dot org>
- Subject: Performance of Integer Multiplication on PIII
- From: Kevin Atkinson <kevin at atkinson dot dhs dot org>
- Date: Fri, 2 Nov 2001 13:45:59 -0500 (EST)
While trying to teach myself some x86 assembly language, I was playing around
with hand-coding a function which encodes an array of integers (with
limited range) into a number for direct referencing, and I discovered that
my hand-coded code, which does not do any fancy tricks, takes roughly 50%
less time than GCC's code, which attempts to avoid imul by using shifts and
adds. Here are the results I get.
[kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read-empty.c t.c &&
./a.out
1
1.260000
[kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read.c t.c && ./a.out
2366519
6.140000
[kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read.S t.c && ./a.out
2366519
3.690000
Thus, after subtracting the cost of the loop itself and the function calls,
GCC's code takes 4.88 seconds of CPU time while my code takes 2.43 seconds of
CPU time. Thus my code takes roughly 50% less time (it is about twice as fast).
Here is my test code:
#include <stdio.h>
#include <time.h>
unsigned read(const unsigned *spl);
/*
 * Benchmark driver.
 *
 * Prints read(a) once as a sanity check, then times 0x4000000 calls of
 * read(a) with clock() and prints the elapsed CPU time in seconds.
 * read() is supplied by whichever file is linked in next to this one.
 */
int main()
{
    clock_t start, stop;    /* clock() returns clock_t, not time_t */
    unsigned a[8] = {1, 2, 3, 4, 1, 2, 3, 4};
    unsigned i;

    /* read() returns unsigned, so the matching conversion is %u */
    printf("%u\n", read(a));

    start = clock();
    for (i = 0; i != 0X4000000; ++i)
        read(a);
    stop = clock();

    printf("%f\n", (stop - start) / (double)CLOCKS_PER_SEC);
    return 0;
}
read-empty.c is simply an empty function:
/*
 * Baseline stub for the benchmark: the body is intentionally empty so that
 * timing it measures only the loop and the function-call overhead.
 * NOTE: no value is returned, so the number the test program prints for
 * this variant is not meaningful.
 */
unsigned read(const unsigned * spl)
{
}
here is my C code for read():
/*
 * Encode the eight entries of spl into a single number:
 * entries 0..3 are weighted by 1, 7, 7^2, 7^3, entry 4 by 7^4, and
 * entries 5..7 by 7^4 * 6, 7^4 * 6^2, 7^4 * 6^3.
 * All arithmetic is unsigned, so it wraps modulo UINT_MAX + 1.
 */
unsigned read(const unsigned * spl)
{
    /* Positional weights, each built from the previous one. */
    enum {
        W1 = 7,
        W2 = W1 * 7,
        W3 = W2 * 7,
        W4 = W3 * 7,
        W5 = W4 * 6,
        W6 = W5 * 6,
        W7 = W6 * 6
    };
    unsigned code = spl[0];

    code += spl[1] * W1;
    code += spl[2] * W2;
    code += spl[3] * W3;
    code += spl[4] * W4;
    code += spl[5] * W5;
    code += spl[6] * W6;
    code += spl[7] * W7;
    return code;
}
Here is my hand coded assembly:
/*
 * unsigned read(const unsigned *spl) -- hand-written version.
 *
 * Computes spl[0] + spl[1]*7 + spl[2]*7^2 + spl[3]*7^3 + spl[4]*7^4
 *        + spl[5]*7^4*6 + spl[6]*7^4*6^2 + spl[7]*7^4*6^3
 * using one three-operand imul per weighted term.
 *
 * Syntax: GAS, Intel operand order, no register prefix.
 * In:     pointer argument on the stack (read from [esp+8] after one push)
 * Out:    eax = encoded value
 * Uses:   ebx (saved and restored), ecx, edx, flags
 */
.intel_syntax noprefix
.globl read
.type read,@function
read:
push ebx /* ebx is used as a scratch register below, so preserve it */
mov edx,[esp+8] /* edx = spl; the push moved the argument from esp+4 to esp+8 */
imul eax,[edx+7*4],7*7*7*7*6*6*6 /* eax = spl[7] * weight */
imul ebx,[edx+6*4],7*7*7*7*6*6 /* ebx = spl[6] * weight */
add eax,ebx /* eax = terms 7 and 6 */
imul ebx,[edx+5*4],7*7*7*7*6 /* ebx = spl[5] * weight */
imul ecx,[edx+4*4],7*7*7*7 /* ecx = spl[4] * weight */
add ebx,ecx /* pair the two products before folding into eax */
add eax,ebx /* eax = terms 7..4 */
imul ebx,[edx+3*4],7*7*7 /* ebx = spl[3] * weight */
imul ecx,[edx+2*4],7*7 /* ecx = spl[2] * weight */
add ebx,ecx
add eax,ebx /* eax = terms 7..2 */
imul ebx,[edx+1*4],7 /* ebx = spl[1] * 7 */
add ebx,[edx+0*4] /* spl[0] has weight 1, so it is added directly */
add eax,ebx /* eax = complete sum */
pop ebx /* restore the saved ebx */
ret
.end_read: /* end marker used only to compute the symbol size */
.size read,.end_read-read
And here is Gcc output to the C code (in intel syntax)
[kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read.c -S -mintel-syntax
/*
 * GCC 3.0.2 output for read() (as shown: -O2 -march=i686, Intel syntax with
 * %-prefixed registers). In the comments below sN stands for spl[N].
 * Every constant multiply except the one by 86436 is expanded into
 * shift/lea/add/sub sequences; the comments track the multiple built so far.
 */
.file "read.c"
.intel_syntax
.text
.align 16
.globl read
.type read,@function
read:
push %ebp /* frame-pointer prologue */
mov %ebp, %esp
push %edi /* edi, esi and ebx are saved here and popped before ret */
push %esi
mov %esi, DWORD PTR [%ebp+8] /* esi = spl */
push %ebx
mov %edx, DWORD PTR [%esi+20] /* edx = s5 */
mov %eax, %edx
sal %eax, 8 /* eax = 256*s5 */
add %eax, %edx /* eax = 257*s5 */
lea %eax, [%edx+%eax*4] /* eax = 1029*s5 */
mov %edx, DWORD PTR [%esi+16] /* edx = s4 */
lea %edi, [0+%eax*8] /* edi = 8232*s5 */
sub %edi, %eax /* edi = 7203*s5 */
mov %eax, DWORD PTR [%esi+24] /* eax = s6 */
lea %ecx, [%edx+%edx*4] /* ecx = 5*s4 */
imul %eax, %eax, 86436 /* eax = 86436*s6 (the only real multiply) */
lea %edi, [%eax+%edi*2] /* edi = 86436*s6 + 14406*s5 */
mov %eax, DWORD PTR [%esi+12] /* eax = s3 */
lea %ebx, [%eax+%eax*8] /* ebx = 9*s3 */
lea %ebx, [%eax+%ebx*2] /* ebx = 19*s3 */
lea %ebx, [%ebx+%ebx*8] /* ebx = 171*s3 */
lea %ebx, [%eax+%ebx*2] /* ebx = 343*s3 */
mov %eax, %ecx /* eax = 5*s4 */
sal %eax, 4 /* eax = 80*s4 */
sub %eax, %ecx /* eax = 75*s4 */
mov %ecx, DWORD PTR [%esi+8] /* ecx = s2 */
sal %eax, 5 /* eax = 2400*s4 */
add %eax, %edx /* eax = 2401*s4 */
add %ebx, %eax /* ebx = 343*s3 + 2401*s4 */
mov %eax, DWORD PTR [%esi+4] /* eax = s1 */
lea %edx, [0+%eax*8] /* edx = 8*s1 */
sub %edx, %eax /* edx = 7*s1 */
lea %eax, [%ecx+%ecx*2] /* eax = 3*s2 */
sal %eax, 4 /* eax = 48*s2 */
add %eax, %ecx /* eax = 49*s2 */
add %edx, %eax /* edx = 7*s1 + 49*s2 */
mov %eax, DWORD PTR [%esi] /* eax = s0 */
add %edx, %eax /* edx = s0 + 7*s1 + 49*s2 */
mov %eax, DWORD PTR [%esi+28] /* eax = s7 */
add %ebx, %edx /* ebx = terms 0..4 */
add %edi, %ebx /* edi = terms 0..6 */
pop %ebx
mov %edx, %eax /* edx = s7 */
sal %edx, 8 /* edx = 256*s7 */
add %edx, %eax /* edx = 257*s7 */
lea %edx, [%eax+%edx*4] /* edx = 1029*s7 */
mov %eax, %edx
pop %esi
sal %eax, 6 /* eax = 65856*s7 */
sub %eax, %edx /* eax = 64827*s7 */
lea %eax, [%edi+%eax*8] /* eax = terms 0..6 + 518616*s7 = result */
pop %edi
pop %ebp
ret
.Lfe1:
.size read,.Lfe1-read
.ident "GCC: (GNU) 3.0.2"
When running these same tests on a Mobile Pentium MMX (using -march=i586),
GCC's code does outperform mine. I do not have anything in between to run
these tests on, so I would appreciate it if someone with a Pentium Pro and
PII (or is that the same thing as a Pentium Pro?) could run them and post
the results.
So I guess the lesson here is that on the PIII integer multiplication is fast
enough that doing special tricks to avoid integer multiplication will hurt
performance instead of helping it.
Is this a known issue? And if so, does anyone plan on addressing it? It
seems to me that the only fix would be to introduce more options for
-march and -mcpu for the faster Pentiums.
Thanks in advance.