This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
Multiplication of a block of integers
- To: Jan Hubicka <jh at suse dot cz>
- Subject: Multiplication of a block of integers
- From: Frank Klemm <pfk at fuchs dot offl dot uni-jena dot de>
- Date: Sun, 2 Sep 2001 17:34:40 +0200
- >Received: (from pfk@localhost)by fuchs.offl.uni-jena.de (8.9.3/8.9.3/SuSE Linux 8.9.3-0.1) id RAA25421;Sun, 2 Sep 2001 17:34:40 +0200
- Cc: gcc at gcc dot gnu dot org
- References: <20010826202953.E2544@fuchs.offl.uni-jena.de> <20010826233634.A6693@atrey.karlin.mff.cuni.cz> <20010827004731.G2544@fuchs.offl.uni-jena.de> <20010827121624.D8568@atrey.karlin.mff.cuni.cz> <20010827143032.C636@fuchs.offl.uni-jena.de> <20010827173025.F11402@atrey.karlin.mff.cuni.cz> <20010901202854.A7713@fuchs.offl.uni-jena.de> <20010902000000.C27182@atrey.karlin.mff.cuni.cz> <20010902024104.F7713@fuchs.offl.uni-jena.de> <20010902110741.D13434@atrey.karlin.mff.cuni.cz>
This is a simple function multiplying a block of integers by 5000.
C code needs 6.35 clocks per item
unrolled C code needs 5.4 clocks per item
stupid assembler code needs 4 clocks
unrolled assembler code 2.26 clocks
Because of a factor of 1:2.4 between Assembler and C this may be interesting.
I know this is not a typical application, but it shows some weak points.
--
Frank Klemm
/*
* compile with: gcc -funroll-loops -march=athlon -O4 -o z z.c zz.s
* ^^^^^^^^^^^^^ or what CPU you are using
*
* Note for '*5000': There exists no shortcut that is faster than imul $5000,r.
*
* Don't run the program within the midnight commander!
* This corrupts the measurement (and I don't have the slightest idea why).
*/
/*
* Execution times on an Athlon in clocks per loop (version with align 16)
*
* t1 = 6.349269 C code (without -funroll-loops)
* t1 = 5.415100 C code (with -funroll-loops)
* t2i = 4.015972 Stupid unoptimized assembler code using the simple: imul $imm,reg (no unroll)
* t2r = 4.015593 loads the value 5000 into a register X and uses: imul X,reg (no unroll) (same speed as t2i) (#undef FIVE)
* 5.015591 loads the value 5000 into a register X and uses: imul X,reg (no unroll) (20% slower than t2i) (#define FIVE)
* t3i = 3.515236 Stupid unoptimized assembler code using the simple: imul $imm,reg (unroll 2)
* t3r = 2.639786 loads the value 5000 into a register X and uses: imul X,reg (unroll 2) (33% faster than t3i)
* t4i = 3.515237 Stupid unoptimized assembler code using the simple: imul $imm,reg (unroll 4)
* t4r = 2.514549 loads the value 5000 into a register X and uses: imul X,reg (unroll 4) (40% faster than t4i)
* t5r = 2.265299 loads the value 5000 into a register X and uses: imul X,reg (unroll 8) (55% faster than t4i)
*/
/************ z.c ***************************/
#include <stdio.h>
#include <time.h>
#include <asm/msr.h>
#include <sys/resource.h>
#include <sched.h>
/*
 * Best-effort attempt to reduce scheduling noise in the measurements.
 *
 * Requests the SCHED_RR policy for the calling process at that policy's
 * lowest priority, then asks for a nice value of -20.  A pid / who argument
 * of 0 addresses the calling process in both calls.
 *
 * Neither request is required for the benchmark to run: if one of them is
 * refused, a warning is written to stderr and execution simply continues.
 */
static void Set_Realtime ( void )
{
    struct sched_param sp = { 0 };
    int                lowest;

    /* Query the priority range of the same policy that is installed below. */
    lowest = sched_get_priority_min ( SCHED_RR );
    if ( lowest < 0 ) {
        perror ( "sched_get_priority_min" );
    }
    else {
        sp.sched_priority = lowest;
        if ( sched_setscheduler ( 0, SCHED_RR, &sp ) != 0 )
            perror ( "sched_setscheduler" );
    }

    if ( setpriority ( PRIO_PROCESS, 0, -20 ) != 0 )
        perror ( "setpriority" );
}
/* Work buffer handed to every multiply routine by main(). */
static int table [2000];

/*
 * Plain C reference version of the kernel: multiplies each of the first
 * sizeof(table)/sizeof(*table) integers starting at p by 5000, in place.
 * The element count is taken from `table`, not from the argument.
 */
void f1 ( int* p )
{
    const size_t count = sizeof table / sizeof table[0];
    size_t       idx;

    for ( idx = 0; idx < count; ++idx )
        p[idx] = p[idx] * 5000;
}
/*
 * Hand-written assembler versions of f1(); each one multiplies the 2000
 * integers starting at p by 5000, in place.
 *
 * Digit = unroll depth: 2 -> one element per loop pass, 3 -> two,
 *                       4 -> four, 5 -> eight.
 * Suffix i = multiplies with the immediate form  imull $5000, mem, reg
 * Suffix r = keeps 5000 in a register and uses   imull reg, reg
 */
extern void f2r ( int* p );
extern void f2i ( int* p );
extern void f3r ( int* p );
extern void f3i ( int* p );
extern void f4r ( int* p );
extern void f4i ( int* p );
/* Only called from main() when FIVE is defined. */
extern void f5r ( int* p );
/* Number of timed calls per routine; one untimed call is made before them. */
#define BENCH_CALLS 100000

/*
 * Times one multiply routine on `table` and prints the result as
 * "<label> = <clocks per array element>".
 *
 * The routine is called once untimed, then BENCH_CALLS times between two
 * time-stamp-counter reads.  The cycle difference is divided by the number
 * of elements in `table` and by BENCH_CALLS.
 *
 * This is a macro rather than a helper taking a function pointer so that
 * every routine is still invoked by a direct call, as in the original
 * hand-expanded sequence.
 *
 *   label  string literal printed in front of " = "
 *   fn     name of a function taking an int* argument
 */
#define BENCH(label, fn)                                              \
    do {                                                              \
        long long start_;                                             \
        long long stop_;                                              \
        int       call_;                                              \
                                                                      \
        fn (table);                                                   \
        rdtscll (start_);                                             \
        for ( call_ = 0; call_ < BENCH_CALLS; call_++ )               \
            fn (table);                                               \
        rdtscll (stop_);                                              \
        printf ( label " = %f\n",                                     \
                 (double)(stop_ - start_)                             \
                     / (double)(sizeof(table)/sizeof(*table))         \
                     / (double)BENCH_CALLS );                         \
    } while (0)

/*
 * Benchmark driver: requests real-time scheduling, then times the C version
 * and each assembler version in a fixed order, printing one line per routine.
 * Always returns 0.
 */
int main ( void )
{
    Set_Realtime ();

    BENCH ( "t1",  f1  );
    BENCH ( "t2i", f2i );
    BENCH ( "t2r", f2r );
    BENCH ( "t3i", f3i );
    BENCH ( "t3r", f3r );
    BENCH ( "t4i", f4i );
    BENCH ( "t4r", f4r );
#ifdef FIVE /* don't ask me why this slows down f2r */
    BENCH ( "t5r", f5r );
#endif
    return 0;
}
# Assembler multiply routines (AT&T syntax, 32-bit x86); presumably the zz.s
# named in the compile command.  Each routine takes one pointer argument on
# the stack and multiplies 2000 32-bit integers by 5000 in place.
#
# f2i: one element per loop pass, using the immediate form of imull with a
# memory source operand.  Scratch registers: %eax, %ecx.  %ebx is saved on
# entry and restored before returning.
.text
.global f2i
.type f2i,@function
.align 16
.long 0,0
.byte 0 # here alignment plays a big role
# The 9 padding bytes above place the entry point 9 bytes past a 16-byte boundary.
f2i:
pushl %ebx                      # preserve %ebx; it becomes the element pointer
movl 8(%esp), %ebx              # argument p (above the return address and the saved %ebx)
xorl %ecx, %ecx                 # loop counter = 0
.L2i: imull $5000, (%ebx), %eax # %eax = *p * 5000
movl %eax, (%ebx)               # store the product back
addl $4, %ebx                   # advance to the next int
incl %ecx
cmpl $1999, %ecx
jle .L2i                        # repeat while counter <= 1999, i.e. 2000 passes
popl %ebx
ret
# f2r: one element per loop pass, with the constant 5000 held in %edx and the
# register-register form of imull.  Scratch registers: %eax, %ecx, %edx.
# %ebx is saved on entry and restored before returning.
.global f2r
.type f2r,@function
.align 16
.long 0
# The 4 padding bytes above place the entry point 4 bytes past a 16-byte boundary.
f2r:
pushl %ebx                      # preserve %ebx; it becomes the element pointer
movl 8(%esp), %ebx              # argument p (above the return address and the saved %ebx)
xorl %ecx, %ecx                 # loop counter = 0
movl $5000, %edx                # multiplier stays in %edx for the whole loop
.L2r: movl (%ebx), %eax         # load the element
imull %edx, %eax                # %eax *= 5000
movl %eax, (%ebx)               # store the product back
addl $4, %ebx                   # advance to the next int
incl %ecx
cmpl $1999, %ecx
jle .L2r                        # repeat while counter <= 1999, i.e. 2000 passes
popl %ebx
ret
# f3r: two elements per loop pass (1000 passes), with the constant 5000 held
# in %edx and the register-register form of imull.  Both elements go through
# %eax one after the other.  Scratch registers: %eax, %ecx, %edx.
# %ebx is saved on entry and restored before returning.
.global f3r
.type f3r,@function
.align 16
.long 0
# The 4 padding bytes above place the entry point 4 bytes past a 16-byte boundary.
f3r:
pushl %ebx                      # preserve %ebx; it becomes the element pointer
movl 8(%esp), %ebx              # argument p (above the return address and the saved %ebx)
xorl %ecx, %ecx                 # loop counter = 0
movl $5000, %edx                # multiplier stays in %edx for the whole loop
.L3r: movl (%ebx), %eax         # element 0 of this pass
imull %edx, %eax
movl %eax, (%ebx)
movl 4(%ebx), %eax              # element 1 of this pass
imull %edx, %eax
movl %eax, 4(%ebx)
addl $8, %ebx                   # advance by two ints
incl %ecx
cmpl $999, %ecx
jle .L3r                        # repeat while counter <= 999, i.e. 1000 passes
popl %ebx
ret
# f3i: two elements per loop pass (1000 passes), using the immediate form of
# imull with a memory source operand.  Scratch registers: %eax, %ecx.
# %ebx is saved on entry and restored before returning.
.global f3i
.type f3i,@function
.align 16
.long 0,0
.byte 0 # here alignment plays a big role
# The 9 padding bytes above place the entry point 9 bytes past a 16-byte boundary.
f3i:
pushl %ebx                      # preserve %ebx; it becomes the element pointer
movl 8(%esp), %ebx              # argument p (above the return address and the saved %ebx)
xorl %ecx, %ecx                 # loop counter = 0
.L3i: imull $5000, (%ebx), %eax # element 0 of this pass
movl %eax, (%ebx)
imull $5000, 4(%ebx), %eax      # element 1 of this pass
movl %eax, 4(%ebx)
addl $8, %ebx                   # advance by two ints
incl %ecx
cmpl $999, %ecx
jle .L3i                        # repeat while counter <= 999, i.e. 1000 passes
popl %ebx
ret
# f4r: four elements per loop pass (500 passes), with the constant 5000 held
# in %edx and the register-register form of imull.  All four elements go
# through %eax one after the other.  Scratch registers: %eax, %ecx, %edx.
# %ebx is saved on entry and restored before returning.
.global f4r
.type f4r,@function
.align 16
.long 0
# The 4 padding bytes above place the entry point 4 bytes past a 16-byte boundary.
f4r:
pushl %ebx                      # preserve %ebx; it becomes the element pointer
movl 8(%esp), %ebx              # argument p (above the return address and the saved %ebx)
xorl %ecx, %ecx                 # loop counter = 0
movl $5000, %edx                # multiplier stays in %edx for the whole loop
.L4r: movl (%ebx), %eax         # element 0 of this pass
imull %edx, %eax
movl %eax, (%ebx)
movl 4(%ebx), %eax              # element 1
imull %edx, %eax
movl %eax, 4(%ebx)
movl 8(%ebx), %eax              # element 2
imull %edx, %eax
movl %eax, 8(%ebx)
movl 12(%ebx), %eax             # element 3
imull %edx, %eax
movl %eax, 12(%ebx)
addl $16, %ebx                  # advance by four ints
incl %ecx
cmpl $499, %ecx
jle .L4r                        # repeat while counter <= 499, i.e. 500 passes
popl %ebx
ret
# f4i: four elements per loop pass (500 passes), using the immediate form of
# imull with a memory source operand.  Scratch registers: %eax, %ecx.
# %ebx is saved on entry and restored before returning.
.global f4i
.type f4i,@function
.align 16
.long 0,0
.byte 0 # here alignment plays a big role
# The 9 padding bytes above place the entry point 9 bytes past a 16-byte boundary.
f4i:
pushl %ebx                      # preserve %ebx; it becomes the element pointer
movl 8(%esp), %ebx              # argument p (above the return address and the saved %ebx)
xorl %ecx, %ecx                 # loop counter = 0
.L4i: imull $5000, (%ebx), %eax # element 0 of this pass
movl %eax, (%ebx)
imull $5000, 4(%ebx), %eax      # element 1
movl %eax, 4(%ebx)
imull $5000, 8(%ebx), %eax      # element 2
movl %eax, 8(%ebx)
imull $5000, 12(%ebx), %eax     # element 3
movl %eax, 12(%ebx)
addl $16, %ebx                  # advance by four ints
incl %ecx
cmpl $499, %ecx
jle .L4i                        # repeat while counter <= 499, i.e. 500 passes
popl %ebx
ret
# f5r: eight elements per loop pass (250 passes), with the constant 5000 held
# in %edx and the register-register form of imull.  All eight elements go
# through %eax one after the other.  Scratch registers: %eax, %ecx, %edx.
# %ebx is saved on entry and restored before returning.
.global f5r
.type f5r,@function
.align 16
.long 0
# The 4 padding bytes above place the entry point 4 bytes past a 16-byte boundary.
f5r:
pushl %ebx                      # preserve %ebx; it becomes the element pointer
movl 8(%esp), %ebx              # argument p (above the return address and the saved %ebx)
xorl %ecx, %ecx                 # loop counter = 0
movl $5000, %edx                # multiplier stays in %edx for the whole loop
.L5r: movl (%ebx), %eax         # element 0 of this pass
imull %edx, %eax
movl %eax, (%ebx)
movl 4(%ebx), %eax              # element 1
imull %edx, %eax
movl %eax, 4(%ebx)
movl 8(%ebx), %eax              # element 2
imull %edx, %eax
movl %eax, 8(%ebx)
movl 12(%ebx), %eax             # element 3
imull %edx, %eax
movl %eax, 12(%ebx)
movl 16(%ebx), %eax             # element 4
imull %edx, %eax
movl %eax, 16(%ebx)
movl 20(%ebx), %eax             # element 5
imull %edx, %eax
movl %eax, 20(%ebx)
movl 24(%ebx), %eax             # element 6
imull %edx, %eax
movl %eax, 24(%ebx)
movl 28(%ebx), %eax             # element 7
imull %edx, %eax
movl %eax, 28(%ebx)
addl $32, %ebx                  # advance by eight ints
incl %ecx
cmpl $249, %ecx
jle .L5r                        # repeat while counter <= 249, i.e. 250 passes
popl %ebx
ret