This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Re: Performance of Integer Multiplication on PIII


On Fri, 2 Nov 2001, Kevin Atkinson wrote:

> Trying to teach myself some x86 assembly language, I was playing around
> with hand-coding a function which encodes an array of integers (with
> limited range) into a number for direct referencing, and I discovered that
> my hand-coded code, which does not do any fancy tricks, is roughly 50%
> faster than GCC's code, which attempts to avoid imul by using shifts and
> adds.  Here are the results I get.

I'm sorry.  I neglected to mention that this was on a Pentium III running
at 500 MHz.  To the best of my knowledge, -march=i686 optimizes for the
Pentium Pro.

>
> [kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read-empty.c t.c &&
> ./a.out
> 1
> 1.260000
>
> [kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read.c t.c && ./a.out
> 2366519
> 6.140000
>
> [kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read.S t.c && ./a.out
> 2366519
> 3.690000
>
> Thus, after subtracting the cost of the bare loop and function calls, GCC's
> code takes 4.88 sec of CPU time while my code takes 2.43 seconds of CPU
> time.  Thus my code takes roughly 50% less time.
>
> Here is my test code:
>
> #include <stdio.h>
> #include <time.h>
>
> unsigned read(const unsigned *spl);
>
> /*
>  * Benchmark driver: prints read(a) once as a sanity check, then times
>  * 0x4000000 (2^26) back-to-back calls to read() and prints the elapsed CPU
>  * time in seconds.
>  */
> int main()
> {
>   clock_t start,stop;   /* clock() returns clock_t, not time_t */
>   unsigned a[8] = {1, 2, 3, 4, 1, 2, 3, 4};
>   unsigned i;
>   printf("%u\n", read(a));   /* read() returns unsigned, so %u rather than %d */
>   start = clock();
>   for (i = 0; i != 0X4000000; ++i)
>     read(a);   /* result deliberately discarded; only the call is timed */
>   stop = clock();
>   printf("%f\n", (stop-start)/(double)CLOCKS_PER_SEC);
>   return 0;
> }
>
> read-empty.c is simply an empty function:
>
> /* Empty stand-in for read(): the body does nothing with spl.
>    NOTE(review): there is no return statement, so the value a caller
>    receives (and prints) is not defined by this code. */
> unsigned read(const unsigned * spl)
> {
> }
>
> Here is my C code for read():
>
> /*
>  * Packs the eight values spl[0..7] into one unsigned number by summing
>  * each element times a constant weight.  The weights grow by a factor of 7
>  * for the first five positions and by a factor of 6 for the last three.
>  * All arithmetic is on unsigned operands.
>  */
> unsigned read(const unsigned * spl)
> {
>   return
>     spl[0] +                     /* weight 1      */
>     spl[1] * 7 +                 /* weight 7      */
>     spl[2] * 7*7 +               /* weight 49     */
>     spl[3] * 7*7*7 +             /* weight 343    */
>     spl[4] * 7*7*7*7 +           /* weight 2401   */
>     spl[5] * 7*7*7*7*6 +         /* weight 14406  */
>     spl[6] * 7*7*7*7*6*6 +       /* weight 86436  */
>     spl[7] * 7*7*7*7*6*6*6;      /* weight 518616 */
> }
>
> Here is my hand coded assembly:
>
> # unsigned read(const unsigned *spl)
> # Hand-written 32-bit x86 version (GAS, Intel syntax without register
> # prefixes).  Returns in eax the sum of spl[0..7], each multiplied by its
> # constant weight, using one three-operand imul per weighted element.
> # Uses eax, ebx, ecx, edx; ebx is saved on entry and restored before ret.
> .intel_syntax noprefix
> .globl read
> 	.type	 read,@function
> read:
> 	push ebx			# preserve ebx for the caller
> 	mov edx,[esp+8]			# edx = spl (one push past the return address)
> 	imul eax,[edx+7*4],7*7*7*7*6*6*6	# eax = spl[7] * 518616
> 	imul ebx,[edx+6*4],7*7*7*7*6*6	# ebx = spl[6] * 86436
> 	add eax,ebx			# eax = terms 7 and 6
> 	imul ebx,[edx+5*4],7*7*7*7*6	# ebx = spl[5] * 14406
> 	imul ecx,[edx+4*4],7*7*7*7	# ecx = spl[4] * 2401
> 	add  ebx,ecx			# pair up terms 5 and 4 before folding in
> 	add eax,ebx			# eax = terms 7..4
> 	imul ebx,[edx+3*4],7*7*7	# ebx = spl[3] * 343
> 	imul ecx,[edx+2*4],7*7		# ecx = spl[2] * 49
> 	add ebx,ecx			# pair up terms 3 and 2
> 	add eax,ebx			# eax = terms 7..2
> 	imul ebx,[edx+1*4],7		# ebx = spl[1] * 7
> 	add ebx,[edx+0*4]		# ebx += spl[0] (weight 1, no multiply)
> 	add eax,ebx			# eax = full sum, the return value
> 	pop ebx				# restore the caller's ebx
> 	ret
> # End-of-function label, used only to compute the symbol size below.
> .end_read:
> 	.size	 read,.end_read-read
>
> And here is GCC's output for the C code (in Intel syntax):
> [kevina@kevin kevina]$ gcc-3.0.2 -O2 -march=i686 read.c -S -mintel-syntax
>
> 	# GCC 3.0.2 output for read() (Intel syntax with % register prefixes).
> 	# Every constant multiply except the one by 86436 is expanded into
> 	# shift/add/sub/lea sequences, and the chains for different elements are
> 	# interleaved.  In the comments below sN stands for spl[N].
> 	.file	"read.c"
> 	.intel_syntax
> 	.text
> 	.align 16
> .globl read
> 	.type	read,@function
> read:
> 	push	%ebp
> 	mov	%ebp, %esp
> 	push	%edi
> 	push	%esi
> 	mov	%esi, DWORD PTR [%ebp+8]	# esi = spl
> 	push	%ebx
> 	mov	%edx, DWORD PTR [%esi+20]	# edx = s5
> 	mov	%eax, %edx
> 	sal	%eax, 8			# eax = 256*s5
> 	add	%eax, %edx		# eax = 257*s5
> 	lea	%eax, [%edx+%eax*4]	# eax = 1029*s5
> 	mov	%edx, DWORD PTR [%esi+16]	# edx = s4
> 	lea	%edi, [0+%eax*8]	# edi = 8232*s5
> 	sub	%edi, %eax		# edi = 7203*s5 (half of 14406)
> 	mov	%eax, DWORD PTR [%esi+24]	# eax = s6
> 	lea	%ecx, [%edx+%edx*4]	# ecx = 5*s4
> 	imul	%eax, %eax, 86436	# eax = 86436*s6 (the only imul kept)
> 	lea	%edi, [%eax+%edi*2]	# edi = 86436*s6 + 14406*s5
> 	mov	%eax, DWORD PTR [%esi+12]	# eax = s3
> 	lea	%ebx, [%eax+%eax*8]	# ebx = 9*s3
> 	lea	%ebx, [%eax+%ebx*2]	# ebx = 19*s3
> 	lea	%ebx, [%ebx+%ebx*8]	# ebx = 171*s3
> 	lea	%ebx, [%eax+%ebx*2]	# ebx = 343*s3
> 	mov	%eax, %ecx		# eax = 5*s4
> 	sal	%eax, 4			# eax = 80*s4
> 	sub	%eax, %ecx		# eax = 75*s4
> 	mov	%ecx, DWORD PTR [%esi+8]	# ecx = s2
> 	sal	%eax, 5			# eax = 2400*s4
> 	add	%eax, %edx		# eax = 2401*s4
> 	add	%ebx, %eax		# ebx = 343*s3 + 2401*s4
> 	mov	%eax, DWORD PTR [%esi+4]	# eax = s1
> 	lea	%edx, [0+%eax*8]	# edx = 8*s1
> 	sub	%edx, %eax		# edx = 7*s1
> 	lea	%eax, [%ecx+%ecx*2]	# eax = 3*s2
> 	sal	%eax, 4			# eax = 48*s2
> 	add	%eax, %ecx		# eax = 49*s2
> 	add	%edx, %eax		# edx = 7*s1 + 49*s2
> 	mov	%eax, DWORD PTR [%esi]	# eax = s0
> 	add	%edx, %eax		# edx = s0 + 7*s1 + 49*s2
> 	mov	%eax, DWORD PTR [%esi+28]	# eax = s7
> 	add	%ebx, %edx		# ebx = terms 0..4
> 	add	%edi, %ebx		# edi = terms 0..6
> 	pop	%ebx
> 	mov	%edx, %eax		# edx = s7
> 	sal	%edx, 8			# edx = 256*s7
> 	add	%edx, %eax		# edx = 257*s7
> 	lea	%edx, [%eax+%edx*4]	# edx = 1029*s7
> 	mov	%eax, %edx
> 	pop	%esi
> 	sal	%eax, 6			# eax = 65856*s7
> 	sub	%eax, %edx		# eax = 64827*s7
> 	lea	%eax, [%edi+%eax*8]	# eax = terms 0..6 + 518616*s7 (return value)
> 	pop	%edi
> 	pop	%ebp
> 	ret
> .Lfe1:
> 	.size	read,.Lfe1-read
> 	.ident	"GCC: (GNU) 3.0.2"
>
> When running these same tests on a Mobile Pentium MMX (using -march=i586),
> GCC's code does outperform mine.  I do not have anything in between to run
> these tests on, so I would appreciate it if someone with a Pentium Pro and
> PII (or is that the same thing as a Pentium Pro?) could run them and post
> the results.
>
> So I guess the lesson here is that on the PIII integer multiplication is
> fast enough that doing special tricks to avoid integer multiplication will
> hurt performance instead of helping it.
>
> Is this a known issue?  And if so, does anyone plan on addressing it?  It
> seems to me that the only fix would be to introduce more options for
> -march and -mcpu for the faster Pentiums.
>
> Thanks in advance.
>


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]