This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Re: Performance of Integer Multiplication on PIII
- To: <gcc at gcc dot gnu dot org>
- Subject: Re: Performance of Integer Multiplication on PIII
- From: Kevin Atkinson <kevin at atkinson dot dhs dot org>
- Date: Sun, 4 Nov 2001 02:56:48 -0500 (EST)
For those of you who are interested, I refined my code a bit and here are
the results.
$ gcc -O2 -march=i686 read-empty.c read.hand.s t.c && ./a.out
Loop: 1.33, Code: 1.94
Clocks: 14.45
$ icc read-empty.c read.c t.c && ./a.out
Loop: 1.10, Code: 1.97
Clocks: 14.68
$ gcc -O2 -march=i686 read-empty.c read.c t.c && ./a.out
Loop: 1.33, Code: 3.57
Clocks: 26.60
The file read.intel.s is the assembly output from the Intel compiler after
being converted into Intel syntax. Similarly, the file read.gcc.s is the
output from gcc. The file read.hand.s is my assembler code, which is
modified a bit from what I posted last time.
Looking at the code from the Intel compiler, it is doing almost the
same thing I am, except that it first moves the values from the array
into a register before multiplying.
For those of you who forgot, this is on a Pentium III running at 500 MHz.
Finally, just for comparison's sake:
$ gcc -O2 read-empty.c read.c t.c && ./a.out
Loop: 1.30, Code: 5.48
Clocks: 40.82
#include <stdio.h>
#include <assert.h>
#include <time.h>
/* Routine being timed; its implementation is linked in from another file. */
unsigned read(const unsigned *spl);
/* Do-nothing routine called exactly like read(); main() times it first so the
   call/loop overhead can be subtracted.  It needs a prototype here because
   main() calls it and C99 and later no longer allow implicit declarations. */
unsigned read_empty(const unsigned *spl);
/* Number of calls made in each timed loop (0x4000000 == 2^26). */
static const unsigned iter = 0X4000000;
/* Clock rate in Hz used to convert seconds per call into clock cycles. */
static const double cpu_speed = 499.947*1e6;
/* Seconds spent in the read_empty() loop; written and read by main(). */
static double loop_time;
/*
 * Benchmark driver.
 *
 * 1. Calls read() once on a fixed 8-element array and warns if the result is
 *    not the expected weighted sum.
 * 2. Times `iter` calls of read_empty() to measure the loop/call overhead
 *    (stored in loop_time).
 * 3. Times `iter` calls of read(), subtracts the overhead, and prints the
 *    elapsed seconds plus the estimated clock cycles per call, using
 *    cpu_speed as the clock rate.
 *
 * Always returns 0.
 */
int main(void)
{
    /* clock() returns clock_t, not time_t. */
    clock_t start, stop;
    /* Net seconds attributed to read() itself (named so it does not shadow
       the time() function from <time.h>). */
    double elapsed;
    unsigned a[8] = {1, 2, 3, 4,
                     1, 2, 3, 4};
    unsigned i;

    /* Sanity check: 1 + 2*7 + 3*49 + 4*343 + 1*2401 + 2*14406
                     + 3*86436 + 4*518616 == 2366519. */
    i = read(a);
    if (i != 2366519)
        printf("Warning Wrong Value: %u\n", i);   /* i is unsigned: %u, not %f */

    /* Baseline: same loop, empty callee. */
    start = clock();
    for (i = 0; i != iter; ++i)
        read_empty(a);
    stop = clock();
    loop_time = (stop-start)/(double)CLOCKS_PER_SEC;

    /* Measured run, with the baseline subtracted. */
    start = clock();
    for (i = 0; i != iter; ++i)
        read(a);
    stop = clock();
    elapsed = (stop-start)/(double)CLOCKS_PER_SEC - loop_time;

    printf("Loop: %.2f, Code: %.2f\n", loop_time, elapsed);
    /* seconds * (cycles/second) / calls = cycles per call */
    printf("Clocks: %.2f\n", elapsed*cpu_speed/iter);
    return 0;
}
/*
 * Weighted sum of the eight unsigned values spl[0] .. spl[7].
 *
 * The weights are 1, 7, 7^2, 7^3, 7^4, 7^4*6, 7^4*6^2 and 7^4*6^3.  All
 * arithmetic is unsigned, so the result wraps modulo UINT_MAX + 1.
 * The multiplies are kept as straight-line constant multiplies on purpose:
 * how the compiler expands them is what the benchmark measures.
 */
unsigned read(const unsigned * spl)
{
    /* Per-element weights, each built from the previous one. */
    enum {
        W1 = 7,
        W2 = W1 * 7,    /*     49 */
        W3 = W2 * 7,    /*    343 */
        W4 = W3 * 7,    /*   2401 */
        W5 = W4 * 6,    /*  14406 */
        W6 = W5 * 6,    /*  86436 */
        W7 = W6 * 6     /* 518616 */
    };
    unsigned total = spl[0];

    total += spl[1] * W1;
    total += spl[2] * W2;
    total += spl[3] * W3;
    total += spl[4] * W4;
    total += spl[5] * W5;
    total += spl[6] * W6;
    total += spl[7] * W7;
    return total;
}
# Hand-written version of
#     unsigned read(const unsigned *spl)
# for the GNU assembler in Intel syntax, 32-bit x86.
#
# In:    [esp+4] = spl, read from the first stack slot above the return address
# Out:   eax     = spl[0] + spl[1]*7 + spl[2]*7^2 + spl[3]*7^3 + spl[4]*7^4
#                  + spl[5]*7^4*6 + spl[6]*7^4*6^2 + spl[7]*7^4*6^3
# Uses:  eax (running sum), ecx (current product), edx (spl); flags are
#        modified.  Nothing is pushed and no other register is written.
#
# The instruction order is deliberate (this is the routine being timed), so
# only comments have been added.
.intel_syntax noprefix
.globl read
.type read,@function
read:
# edx = spl
mov edx,[esp+4]
# eax = spl[0]; every following product is added into eax
mov eax,[edx+0*4]
# Each pair below is: ecx = spl[k] * weight, with imul reading the element
# straight from memory, then eax += ecx.  Offsets are k*4 bytes; the weight
# expressions are constant-folded by the assembler.
# k = 1, weight 7
imul ecx,[edx+1*4],7
add eax,ecx
# k = 2, weight 49
imul ecx,[edx+2*4],7*7
add eax,ecx
# k = 3, weight 343
imul ecx,[edx+3*4],7*7*7
add eax,ecx
# k = 4, weight 2401
imul ecx,[edx+4*4],7*7*7*7
add eax,ecx
# k = 5, weight 14406
imul ecx,[edx+5*4],7*7*7*7*6
add eax,ecx
# k = 6, weight 86436
imul ecx,[edx+6*4],7*7*7*7*6*6
add eax,ecx
# k = 7, weight 518616
imul ecx,[edx+7*4],7*7*7*7*6*6*6
add eax,ecx
# result is in eax
ret
# End marker used only to compute the symbol size below.
.end_read:
.size read,.end_read-read
; GCC 2.96 output for
;     unsigned read(const unsigned *spl)
; after conversion to Intel syntax (see the IDENT line at the end).
;
; In:   [ebp+8] after the prologue = spl
; Out:  eax = spl[0] + 7*spl[1] + 49*spl[2] + 343*spl[3] + 2401*spl[4]
;             + 14406*spl[5] + 86436*spl[6] + 518616*spl[7]
; ebp, edi, esi and ebx are pushed on entry and popped before ret;
; eax, ecx and edx are overwritten.
;
; The three largest weights use imul; the smaller ones are expanded into
; lea/shift/add/sub sequences, interleaved with each other.  Comments below
; write sN for spl[N].  Only comments have been added to the listing.
;FILE "read.c"
gcc2_compiled.:
SECTION .text
ALIGN 16
GLOBAL read
GLOBAL read:function
read:
; Prologue: set up the frame pointer and save the registers used below.
push ebp
mov ebp,esp
push edi
; edi = spl
mov edi, [ebp+8]
push esi
push ebx
; eax = s5, edx = s6, ecx = s4
mov eax, [edi+20]
mov edx, [edi+24]
mov ecx, [edi+16]
; eax = 14406*s5
imul eax,eax,14406
; edx = 86436*s6
imul edx,edx,86436
; ebx = 5*s4 (first step of 2401*s4)
lea ebx, [ecx+ecx*4]
; eax = 14406*s5 + 86436*s6
add eax,edx
; edx = s3; the next four lea build esi = 343*s3:
;   9*s3 -> s3 + 2*9*s3 = 19*s3 -> 9*19*s3 = 171*s3 -> s3 + 2*171*s3 = 343*s3
mov edx, [edi+12]
lea esi, [edx+edx*8]
lea esi, [edx+esi*2]
lea esi, [esi+esi*8]
lea esi, [edx+esi*2]
; Continue 2401*s4 in edx: 5*s4 -> 80*s4 -> 75*s4
mov edx,ebx
sal edx,4
sub edx,ebx
; ebx = s2 (the 5*s4 copy in ebx is no longer needed)
mov ebx, [edi+8]
; 75*s4 -> 2400*s4 -> 2401*s4
sal edx,5
add edx,ecx
; esi = 343*s3 + 2401*s4
add esi,edx
; edx = s1; ecx = 8*s1 - s1 = 7*s1
mov edx, [edi+4]
lea ecx, [edx*8+0]
sub ecx,edx
; edx = 3*s2 -> 48*s2 -> 49*s2
lea edx, [ebx+ebx*2]
sal edx,4
add edx,ebx
; s2 has been consumed; restore the caller's ebx
pop ebx
; ecx = 7*s1 + 49*s2
add ecx,edx
; ecx += s0
mov edx, [edi]
add ecx,edx
; edx = s7 (multiplied a few instructions later)
mov edx, [edi+28]
; esi = s0 + 7*s1 + 49*s2 + 343*s3 + 2401*s4
add esi,ecx
; eax = everything except the s7 term
add eax,esi
pop esi
; edx = 518616*s7
imul edx,edx,518616
pop edi
; eax = final sum
add eax,edx
pop ebp
ret
; End-of-function label used in the size expression on the next line.
.Lfe1:
GLOBAL read:function (.Lfe1-read)
;IDENT "GCC: (GNU) 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)"
; Intel C++ Compiler 5.0.1 output for
;     unsigned read(const unsigned *spl)
; after conversion to Intel syntax (see the mark_description/IDENT lines).
;
; In:   [esp+4] = spl
; Out:  eax = spl[0] + 7*spl[1] + 49*spl[2] + 343*spl[3] + 2401*spl[4]
;             + 14406*spl[5] + 86436*spl[6] + 518616*spl[7]
; Only eax, ecx and edx are written; nothing is pushed.
;
; Apart from the 7*spl[1] term (built with lea/add/sub), every term is
; "load element into a register, imul by the constant, add to eax".
; The trailing ;L.C tags on the instructions were emitted with the listing;
; presumably they are source line.column references -- not confirmed here.
; Comments below write sN for spl[N].  Only comments have been added.
; -- Machine type PX
; mark_description "Intel(R) C++ Compiler for 32-bit applications, Version 5.0.1 Build 010730D0";
; mark_description "-tp p6 -long_double -D__int64=long long -S";
;IDENT "Intel(R) C++ Compiler for 32-bit applications, Version 5.0.1 Build 010730D0"
;IDENT "-tp p6 -long_double -D__int64=long long -S"
;FILE "read.c"
SECTION .text
SECTION .data
ALIGN 4
SECTION .bss
ALIGN 4
;IDENT "-?comment:Intel(R) C++ Compiler for 32-bit applications, Version 5.0.1 Build 010730D0 : read.c : -tp p6 -long_double -D__int64=long long -S"
SECTION .data
SECTION .text
; -- Begin read
; mark_begin;
ALIGN 4, db 090h
; parameter 1: 4 + %esp
GLOBAL read
read:
.B1.1: ; Preds .B1.0
; ecx = spl
mov ecx, [esp+4] ;1.10
; edx = s1; eax = 2*s1 -> 4*s1 -> 8*s1 -> 8*s1 - s1 = 7*s1
mov edx, [ecx+4] ;5.5
lea eax, [edx+edx] ;5.5
add eax,eax ;5.5
add eax,eax ;5.5
sub eax,edx ;5.5
; start loading s2 while s0 is added straight from memory
mov edx, [ecx+8] ;6.5
add eax, [ecx] ;5.5
; eax += 49*s2
imul edx,edx,49 ;6.5
add eax,edx ;6.5
; eax += 343*s3
mov edx, [ecx+12] ;7.5
imul edx,edx,343 ;7.5
add eax,edx ;7.5
; eax += 2401*s4
mov edx, [ecx+16] ;8.5
imul edx,edx,2401 ;8.5
add eax,edx ;8.5
; eax += 14406*s5
mov edx, [ecx+20] ;9.5
imul edx,edx,14406 ;9.5
add eax,edx ;9.5
; edx = 86436*s6
mov edx, [ecx+24] ;10.5
imul edx,edx,86436 ;10.5
; ecx = s7, overwriting the pointer (this is the last load), then 518616*s7
mov ecx, [ecx+28] ;11.5
imul ecx,ecx,518616 ;11.5
; the last two products are summed first, then added to the total
add edx,ecx ;10.5
add eax,edx ;11.5
ret ;11.5
ALIGN 4, db 090h
; LOE
; mark_end;
GLOBAL read:function
GLOBAL read:function (.-read)
SECTION .data
; -- End read
SECTION .data
; End
/*
 * Do-nothing counterpart of read(): same signature, no work.  The benchmark
 * driver times a loop of calls to this function to measure call/loop overhead.
 *
 * spl is ignored.  Returns 0 so that the function does not fall off the end
 * of a non-void function without a value.
 */
unsigned read_empty(const unsigned * spl)
{
    (void)spl;  /* intentionally unused */
    return 0;
}