This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]

Multiplication of a block of integers

To: Jan Hubicka <jh at suse dot cz>
Subject: Multiplication of a block of integers
From: Frank Klemm <pfk at fuchs dot offl dot uni-jena dot de>
Date: Sun, 2 Sep 2001 17:34:40 +0200
>Received: (from pfk@localhost)by fuchs.offl.uni-jena.de (8.9.3/8.9.3/SuSE Linux 8.9.3-0.1) id RAA25421;Sun, 2 Sep 2001 17:34:40 +0200
Cc: gcc at gcc dot gnu dot org
References: <20010826202953.E2544@fuchs.offl.uni-jena.de> <20010826233634.A6693@atrey.karlin.mff.cuni.cz> <20010827004731.G2544@fuchs.offl.uni-jena.de> <20010827121624.D8568@atrey.karlin.mff.cuni.cz> <20010827143032.C636@fuchs.offl.uni-jena.de> <20010827173025.F11402@atrey.karlin.mff.cuni.cz> <20010901202854.A7713@fuchs.offl.uni-jena.de> <20010902000000.C27182@atrey.karlin.mff.cuni.cz> <20010902024104.F7713@fuchs.offl.uni-jena.de> <20010902110741.D13434@atrey.karlin.mff.cuni.cz>

This is a simple function multiplying a block of integers by 5000.

C code needs 6.35 clocks per item
unrolled C code needs 5.4 clocks per item
stupid assembler code needs 4 clocks
unrolled assembler code 2.26 clocks

Because of a factor of 1:2.4 between Assembler and C this may be interesting.

I know this is not a typical application, but it shows some weak points.

-- 
Frank Klemm

/* 
 *  compile with: gcc -funroll-loops -march=athlon -O4 -o z z.c zz.s
 *                                   ^^^^^^^^^^^^^ or what CPU you are using
 *
 *  Note for '*5000': There exists no shortcut that is faster than imul $5000,r.
 *
 *  Don't run the program within the midnight commander! 
 *  This corrupts the measurement (and I don't have the slightest idea why).
 */

/*
 *  Execution times on an Athlon in clocks per loop (version with align 16)
 *
 *  t1  = 6.349269	C code (without -funroll-loops)
 *  t1  = 5.415100	C code (with -funroll-loops)
 *  t2i = 4.015972	Stupid unoptimized assembler code using the simple: imul $imm,reg  (no unroll)
 *  t2r = 4.015593	loads the value 5000 into a register X and uses: imul X,reg        (no unroll) (same speed as t2i)   (#undef FIVE)
 *        5.015591	loads the value 5000 into a register X and uses: imul X,reg        (no unroll) (20% slower than t2i) (#define FIVE)
 *  t3i = 3.515236	Stupid unoptimized assembler code using the simple: imul $imm,reg  (unroll 2)
 *  t3r = 2.639786	loads the value 5000 into a register X and uses: imul X,reg        (unroll 2)  (33% faster than t3i)
 *  t4i = 3.515237	Stupid unoptimized assembler code using the simple: imul $imm,reg  (unroll 4)
 *  t4r = 2.514549	loads the value 5000 into a register X and uses: imul X,reg        (unroll 4)  (40% faster than t4i)
 *  t5r = 2.265299	loads the value 5000 into a register X and uses: imul X,reg        (unroll 8)  (55% faster than t4i)
 */

/************ z.c ***************************/

#include <stdio.h>
#include <time.h>
#include <asm/msr.h>
#include <sys/resource.h>
#include <sched.h>


/*
 *  Best-effort attempt to reduce scheduler noise in the measurements:
 *  switch the calling process to the SCHED_RR policy at that policy's
 *  lowest priority, then raise its nice value to -20.
 *
 *  Both requests usually need elevated privileges.  A failure is reported
 *  on stderr but is not fatal; the benchmark then simply runs with whatever
 *  scheduling the process already has.
 */
static void  Set_Realtime ( void )
{
    struct sched_param  sp = { 0 };     /* zeroed without needing <string.h> */
    int                 prio;

    /* Ask for the minimum of the policy that is actually installed below. */
    prio = sched_get_priority_min ( SCHED_RR );
    if ( prio < 0 ) {
        perror ( "sched_get_priority_min" );
    }
    else {
        sp.sched_priority = prio;
        if ( sched_setscheduler ( 0, SCHED_RR, &sp ) != 0 )
            perror ( "sched_setscheduler" );
    }

    /* who == 0 selects the calling process, so getpid() is not required. */
    if ( setpriority ( PRIO_PROCESS, 0, -20 ) != 0 )
        perror ( "setpriority" );
}


/* Work buffer handed to every kernel; static storage, so it starts zeroed. */
static int  table [2000];

/*
 *  Reference kernel in plain C: multiplies, in place, as many ints starting
 *  at p as `table` has elements (2000) by 5000.
 *
 *  p  first element of a block holding at least that many ints
 */
void  f1 ( int* p )
{
    const size_t  count = sizeof(table) / sizeof(table[0]);
    size_t        idx   = 0;

    while ( idx < count ) {
        p[idx] = p[idx] * 5000;
        idx++;
    }
}

/*
 *  Hand-written assembler kernels (assembled from zz.s, see the compile
 *  line at the top).  Each one multiplies 2000 ints starting at p by 5000
 *  in place, like f1.
 *
 *  Suffix 'i': the multiplier is an immediate operand of imul.
 *  Suffix 'r': the multiplier is loaded into a register once, before the loop.
 *  Digit:      2 = not unrolled, 3 = unrolled 2x, 4 = unrolled 4x, 5 = unrolled 8x.
 */
extern void  f2r ( int* p );
extern void  f2i ( int* p );
extern void  f3r ( int* p );
extern void  f3i ( int* p );
extern void  f4r ( int* p );
extern void  f4i ( int* p );
extern void  f5r ( int* p );

/*
 *  Times one kernel and prints the average number of TSC ticks per element.
 *
 *  The kernel is called once untimed, then 100000 times between two TSC
 *  reads; the tick difference is divided by 2000 (elements per call) and by
 *  100000 (calls).  This is a macro rather than a helper taking a function
 *  pointer so that every kernel is still invoked through a direct call.
 *
 *  Relies on the locals `pass`, `start` and `stop` of the expanding function.
 *  `label` must be a string literal; it is pasted in front of " = %f\n".
 */
#define TIME_KERNEL(label, kernel)                                      \
    do {                                                                \
        kernel (table);                                                 \
        rdtscll (start);                                                \
        for ( pass = 0; pass < 100000; pass++ )                         \
            kernel (table);                                             \
        rdtscll (stop);                                                 \
        printf ( label " = %f\n", (stop-start)/2000./100000. );         \
    } while (0)

/*
 *  Benchmark driver: requests real-time scheduling, then times the C kernel
 *  and each assembler kernel in turn, printing one "tNx = <ticks>" line per
 *  kernel.  Always returns 0.
 */
int  main ( void )
{
    int        pass;
    long long  start;
    long long  stop;

    Set_Realtime ();

    TIME_KERNEL ( "t1 ", f1  );
    TIME_KERNEL ( "t2i", f2i );
    TIME_KERNEL ( "t2r", f2r );
    TIME_KERNEL ( "t3i", f3i );
    TIME_KERNEL ( "t3r", f3r );
    TIME_KERNEL ( "t4i", f4i );
    TIME_KERNEL ( "t4r", f4r );

    /*
     *  The 8x unrolled kernel is only timed when FIVE is defined: the author
     *  observed that compiling this section in slows down the f2r
     *  measurement, for reasons that were not understood.
     */
#ifdef FIVE
    TIME_KERNEL ( "t5r", f5r );
#endif

    return 0;
}

#undef TIME_KERNEL


# f2i: multiply 2000 ints by 5000 in place, one element per iteration,
# with the multiplier as an immediate operand of imul.
#   in:       4(%esp) on entry = pointer to the first int
#   writes:   %eax, %ecx (not restored); %ebx is saved and restored
.text
.global f2i
.type	f2i,@function
.align 16
# 9 zero bytes after the 16-byte boundary: the entry point is deliberately
# placed 9 bytes past it.
.long 0,0
.byte 0				# here alignment plays a big role

f2i:
	pushl	%ebx
	movl	8(%esp), %ebx		# argument; offset is 8 because of the push above
	xorl	%ecx, %ecx		# element counter = 0
.L2i:	imull	$5000, (%ebx), %eax	# eax = *ptr * 5000
	movl	%eax, (%ebx)		# store the product back
	addl	$4, %ebx		# advance to the next int
	incl	%ecx
	cmpl	$1999, %ecx
	jle	.L2i			# loop while counter <= 1999, i.e. 2000 passes
	popl	%ebx
	ret

# f2r: multiply 2000 ints by 5000 in place, one element per iteration,
# with the multiplier held in %edx.
#   in:       4(%esp) on entry = pointer to the first int
#   writes:   %eax, %ecx, %edx (not restored); %ebx is saved and restored
.global f2r
.type	f2r,@function
.align 16
.long 0				# 4 zero bytes: entry point sits 4 bytes past the 16-byte boundary
f2r:
	pushl	%ebx
	movl	8(%esp), %ebx		# argument; offset is 8 because of the push above
	xorl	%ecx, %ecx		# element counter = 0
	movl	$5000, %edx		# multiplier, loaded once outside the loop
.L2r:	movl	(%ebx), %eax		# load element
	imull	%edx, %eax		# eax *= 5000
	movl	%eax, (%ebx)		# store the product back
	addl	$4, %ebx		# advance to the next int
	incl	%ecx
	cmpl	$1999, %ecx
	jle	.L2r			# loop while counter <= 1999, i.e. 2000 passes
	popl	%ebx
	ret

# f3r: multiply 2000 ints by 5000 in place, two elements per iteration,
# with the multiplier held in %edx.
#   in:       4(%esp) on entry = pointer to the first int
#   writes:   %eax, %ecx, %edx (not restored); %ebx is saved and restored
.global f3r
.type	f3r,@function
.align 16
.long 0				# 4 zero bytes: entry point sits 4 bytes past the 16-byte boundary

f3r:
	pushl	%ebx
	movl	8(%esp), %ebx		# argument; offset is 8 because of the push above
	xorl	%ecx, %ecx		# iteration counter = 0
	movl	$5000, %edx		# multiplier, loaded once outside the loop
.L3r:	movl	(%ebx), %eax		# element 0 of this pair
	imull	%edx, %eax
	movl	%eax, (%ebx)
	movl	4(%ebx), %eax		# element 1 of this pair
	imull	%edx, %eax
	movl	%eax, 4(%ebx)
	addl	$8, %ebx		# advance by two ints
	incl	%ecx
	cmpl	$999, %ecx
	jle	.L3r			# 1000 passes x 2 elements = 2000 ints
	popl	%ebx
	ret

# f3i: multiply 2000 ints by 5000 in place, two elements per iteration,
# with the multiplier as an immediate operand of imul.
#   in:       4(%esp) on entry = pointer to the first int
#   writes:   %eax, %ecx (not restored); %ebx is saved and restored
.global f3i
.type	f3i,@function
.align 16
# 9 zero bytes after the 16-byte boundary: the entry point is deliberately
# placed 9 bytes past it.
.long 0,0
.byte 0				# here alignment plays a big role

f3i:
	pushl	%ebx
	movl	8(%esp), %ebx		# argument; offset is 8 because of the push above
	xorl	%ecx, %ecx		# iteration counter = 0
.L3i:	imull	$5000, (%ebx), %eax	# element 0 of this pair
	movl	%eax, (%ebx)
	imull	$5000, 4(%ebx), %eax	# element 1 of this pair
	movl	%eax, 4(%ebx)
	addl	$8, %ebx		# advance by two ints
	incl	%ecx
	cmpl	$999, %ecx
	jle	.L3i			# 1000 passes x 2 elements = 2000 ints
	popl	%ebx
	ret


# f4r: multiply 2000 ints by 5000 in place, four elements per iteration,
# with the multiplier held in %edx.
#   in:       4(%esp) on entry = pointer to the first int
#   writes:   %eax, %ecx, %edx (not restored); %ebx is saved and restored
.global f4r
.type	f4r,@function
.align 16
.long 0				# 4 zero bytes: entry point sits 4 bytes past the 16-byte boundary

f4r:
	pushl	%ebx
	movl	8(%esp), %ebx		# argument; offset is 8 because of the push above
	xorl	%ecx, %ecx		# iteration counter = 0
	movl	$5000, %edx		# multiplier, loaded once outside the loop
.L4r:	movl	(%ebx), %eax		# element 0 of this group
	imull	%edx, %eax
	movl	%eax, (%ebx)
	movl	4(%ebx), %eax		# element 1
	imull	%edx, %eax
	movl	%eax, 4(%ebx)
	movl	8(%ebx), %eax		# element 2
	imull	%edx, %eax
	movl	%eax, 8(%ebx)
	movl	12(%ebx), %eax		# element 3
	imull	%edx, %eax
	movl	%eax, 12(%ebx)
	addl	$16, %ebx		# advance by four ints
	incl	%ecx
	cmpl	$499, %ecx
	jle	.L4r			# 500 passes x 4 elements = 2000 ints
	popl	%ebx
	ret

# f4i: multiply 2000 ints by 5000 in place, four elements per iteration,
# with the multiplier as an immediate operand of imul.
#   in:       4(%esp) on entry = pointer to the first int
#   writes:   %eax, %ecx (not restored); %ebx is saved and restored
.global f4i
.type	f4i,@function
.align 16
# 9 zero bytes after the 16-byte boundary: the entry point is deliberately
# placed 9 bytes past it.
.long 0,0
.byte 0				# here alignment plays a big role

f4i:
	pushl	%ebx
	movl	8(%esp), %ebx		# argument; offset is 8 because of the push above
	xorl	%ecx, %ecx		# iteration counter = 0
.L4i:	imull	$5000, (%ebx), %eax	# element 0 of this group
	movl	%eax, (%ebx)
	imull	$5000, 4(%ebx), %eax	# element 1
	movl	%eax, 4(%ebx)
	imull	$5000, 8(%ebx), %eax	# element 2
	movl	%eax, 8(%ebx)
	imull	$5000, 12(%ebx), %eax	# element 3
	movl	%eax, 12(%ebx)
	addl	$16, %ebx		# advance by four ints
	incl	%ecx
	cmpl	$499, %ecx
	jle	.L4i			# 500 passes x 4 elements = 2000 ints
	popl	%ebx
	ret

# f5r: multiply 2000 ints by 5000 in place, eight elements per iteration,
# with the multiplier held in %edx.
#   in:       4(%esp) on entry = pointer to the first int
#   writes:   %eax, %ecx, %edx (not restored); %ebx is saved and restored
.global f5r
.type	f5r,@function
.align 16
.long 0				# 4 zero bytes: entry point sits 4 bytes past the 16-byte boundary

f5r:
	pushl	%ebx
	movl	8(%esp), %ebx		# argument; offset is 8 because of the push above
	xorl	%ecx, %ecx		# iteration counter = 0
	movl	$5000, %edx		# multiplier, loaded once outside the loop
.L5r:	movl	(%ebx), %eax		# element 0 of this group
	imull	%edx, %eax
	movl	%eax, (%ebx)
	movl	4(%ebx), %eax		# element 1
	imull	%edx, %eax
	movl	%eax, 4(%ebx)
	movl	8(%ebx), %eax		# element 2
	imull	%edx, %eax
	movl	%eax, 8(%ebx)
	movl	12(%ebx), %eax		# element 3
	imull	%edx, %eax
	movl	%eax, 12(%ebx)
	movl	16(%ebx), %eax		# element 4
	imull	%edx, %eax
	movl	%eax, 16(%ebx)
	movl	20(%ebx), %eax		# element 5
	imull	%edx, %eax
	movl	%eax, 20(%ebx)
	movl	24(%ebx), %eax		# element 6
	imull	%edx, %eax
	movl	%eax, 24(%ebx)
	movl	28(%ebx), %eax		# element 7
	imull	%edx, %eax
	movl	%eax, 28(%ebx)
	addl	$32, %ebx		# advance by eight ints
	incl	%ecx
	cmpl	$249, %ecx
	jle	.L5r			# 250 passes x 8 elements = 2000 ints
	popl	%ebx
	ret

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]