This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: optimization/8092: cross-jump triggers too often

From: Bernd Paysan <bernd dot paysan at gmx dot de>
To: Richard Henderson <rth at redhat dot com>
Cc: rth at gcc dot gnu dot org, gcc-bugs at gcc dot gnu dot org, nobody at gcc dot gnu dot org,gcc-gnats at gcc dot gnu dot org
Date: Fri, 4 Oct 2002 01:09:14 +0200
Subject: Re: optimization/8092: cross-jump triggers too often
References: <20020930212033.32125.qmail@sources.redhat.com> <200210011638.40807.bernd.paysan@gmx.de> <20021001185502.GC12410@redhat.com>

On Tuesday 01 October 2002 20:55, Richard Henderson wrote:
> On Tue, Oct 01, 2002 at 04:38:40PM +0200, Bernd Paysan wrote:
> > Ok, here's engine.i (compressed with bzip2). The relevant function is
> > engine.
>
> Well I'm horrified all right -- by the source.  For the record, I
> wouldn't count on this thing working indefinitely.
>
> I had to modify it a bit to get it to compile with mainline.  The
> use of asm register variables falls over the compiler's use of
> __builtin_memset, which requires edi.  I wish I could give a proper
> error message for this instead of ICE, but we don't save enough
> information long enough for me to remember that this is the user's
> fault.
>
> Anyway, I don't see anything that sticks out like a sore thumb wrt
> either cross-jumping or gcse.  Can you be more specific?

I patched GCC 3.2 and added a flag to disable cross-jumping (Sources from SuSE 
8.1, without SuSE patches applied):

-----------------------------fcross-jump.patch---------------------------------
--- gcc-3.2/gcc/toplev.c~       2002-05-27 07:48:15.000000000 +0200
+++ gcc-3.2/gcc/toplev.c        2002-10-03 21:28:10.000000000 +0200
@@ -610,6 +610,10 @@
 
 int flag_syntax_only = 0;
 
+/* Nonzero means perform crossjump optimization. */
+
+static int flag_crossjump = 0;
+
 /* Nonzero means perform global cse.  */
 
 static int flag_gcse;
@@ -1023,6 +1027,8 @@
    N_("Return 'short' aggregates in registers") },
   {"delayed-branch", &flag_delayed_branch, 1,
    N_("Attempt to fill delay slots of branch instructions") },
+  {"cross-jump", &flag_crossjump, 1,
+   N_("Perform crossjump optimization") },
   {"gcse", &flag_gcse, 1,
    N_("Perform the global common subexpression elimination") },
   {"gcse-lm", &flag_gcse_lm, 1,
@@ -3286,7 +3292,7 @@
   /* Cross-jumping is O(N^3) on the number of edges, thus trying to
      perform cross-jumping on flow graphs which have a high connectivity
      will take a long time.  This is similar to the test to disable GCSE.  */
-  cleanup_crossjump = CLEANUP_CROSSJUMP;
+  cleanup_crossjump = flag_crossjump ? CLEANUP_CROSSJUMP : 0;
   if (n_basic_blocks > 1000 && n_edges / n_basic_blocks >= 20)
     {
       if (optimize && warn_disabled_optimization)
@@ -4701,6 +4707,7 @@
       flag_optimize_sibling_calls = 1;
       flag_cse_follow_jumps = 1;
       flag_cse_skip_blocks = 1;
+      flag_crossjump = 1;
       flag_gcse = 1;
       flag_expensive_optimizations = 1;
       flag_strength_reduce = 1;
-----------------------------------------------------------------------------------

I changed the register allocation a bit (this is the current development 
branch of Gforth, not the release - new .i file in attachment). That's how it 
looks with

gcc -O2 -Wall -fomit-frame-pointer -fforce-addr -fforce-mem -march=pentium 
-fno-defer-pop -fcaller-saves -fno-gcse -fno-cross-jump -S engine.b.i

and searching for negl	%ecx.

.L96:
	addl	$4, %ebx
	negl	%ecx
	jmp	*-4(%ebx)
.L97:
	addl	$4, %ebx
	incl	%ecx
	jmp	*-4(%ebx)
.L98:
	addl	$4, %ebx
	decl	%ecx
	jmp	*-4(%ebx)

This is exactly how I want it to be.

Now with cross jumps:

gcc -O2 -Wall -fomit-frame-pointer -fforce-addr -fforce-mem -march=pentium 
-fno-defer-pop -fcaller-saves -fno-gcse -S engine.b.i

.L99:
	negl	%ecx
	jmp	.L1143
.L100:
.L205:
	incl	%ecx
.L206:
	jmp	.L1143
.L101:
.L1182:
	decl	%ecx
	jmp	.L1143

...

.L1143:
	addl	$4, %ebx
.L922:
	jmp	*-4(%ebx)

Apart from the superfluous jump, there's not much damage done here. Note that 
this cross-jump pessimization not only introduces an unnecessary jump, but 
also kills the branch prediction logic of modern x86 implementations, which 
usually predicts following primitives quite well. When there's only one 
central computed goto branch, the branch prediction logic fails. And there's 
really no point in saving one byte by using a 5-byte jump to reach a total of 
6 bytes of instructions.

But look at what happens when allowing global CSE:

gcc -O2 -Wall -fomit-frame-pointer -fforce-addr -fforce-mem -march=pentium 
-fno-defer-pop -fcaller-saves -S engine.b.i

.L99:
	negl	%ecx
	jmp	.L1266
.L100:
.L205:
	incl	%ecx
	jmp	.L1266
.L101:
.L1496:
	decl	%ecx
	jmp	.L1266

Seems to look the same, but look what happens at .L1266:

.L1266:
	addl	$4, %ebx
.L1242:
	movl	996(%esp), %edx
	addl	$8, %edx
	movl	%edx, 36(%esp)
.L1274:
	leal	16(%edi), %ebp
	leal	12(%edi), %eax
	movl	__ctype_toupper, %edx
	movl	%ebp, 48(%esp)
	movl	%eax, 52(%esp)
	movl	%edx, 80(%esp)
.L1267:
	leal	8(%edi), %ebp
	movl	symbols.3+24, %eax
	movl	%ebp, 60(%esp)
	movl	%eax, 40(%esp)
.L1335:
	movl	stdin, %edx
	movl	%edx, 84(%esp)
	jmp	.L1244

...

.L1244:
	movl	stderr, %ebp
	leal	12(%esi), %eax
	movl	%ebp, 16(%esp)
	movl	stdout, %edx
	leal	20(%esi), %ebp
	movl	%eax, 56(%esp)
	movl	%ebp, 44(%esp)
	jmp	.L971

...

.L971:
	leal	4(%edi), %eax
	leal	16(%esi), %ebp
	movl	%eax, 68(%esp)
.L922:
	leal	4(%esi), %eax
	movl	%eax, 72(%esp)
.L974:
	leal	8(%esi), %eax
	movl	%eax, 64(%esp)
.L923:
	jmp	*-4(%ebx)

Or without cross-jumping, but with GCSE:

gcc -O2 -Wall -fomit-frame-pointer -fforce-addr -fforce-mem -march=pentium 
-fno-defer-pop -fcaller-saves -fno-cross-jump -S engine.b.i

.L96:
	leal	12(%edi), %eax
	leal	16(%edi), %ebp
	movl	996(%esp), %edx
	movl	%eax, 52(%esp)
	movl	symbols.3+24, %eax
	addl	$8, %edx
	movl	%ebp, 48(%esp)
	movl	%eax, 40(%esp)
	leal	8(%edi), %ebp
	leal	12(%esi), %eax
	movl	%edx, 36(%esp)
	movl	%ebp, 60(%esp)
	movl	__ctype_toupper, %edx
	movl	%eax, 56(%esp)
	movl	stderr, %ebp
	leal	4(%edi), %eax
	movl	%edx, 80(%esp)
	movl	%ebp, 16(%esp)
	movl	stdin, %edx
	movl	%eax, 68(%esp)
	leal	20(%esi), %ebp
	addl	$4, %ebx
	leal	4(%esi), %eax
	movl	%edx, 84(%esp)
	movl	%ebp, 44(%esp)
	movl	%eax, 72(%esp)
	negl	%ecx
	leal	8(%esi), %eax
	movl	stdout, %edx
	leal	16(%esi), %ebp
	movl	%eax, 64(%esp)
	jmp	*-4(%ebx)
.L97:
	leal	12(%edi), %eax
	leal	16(%edi), %ebp
	movl	996(%esp), %edx
	movl	%eax, 52(%esp)
	movl	symbols.3+24, %eax
	addl	$8, %edx
	movl	%ebp, 48(%esp)
	movl	%eax, 40(%esp)
	leal	8(%edi), %ebp
	leal	12(%esi), %eax
	movl	%edx, 36(%esp)
	movl	%ebp, 60(%esp)
	movl	__ctype_toupper, %edx
	movl	%eax, 56(%esp)
	movl	stderr, %ebp
	leal	4(%edi), %eax
	movl	%edx, 80(%esp)
	movl	%ebp, 16(%esp)
	movl	stdin, %edx
	movl	%eax, 68(%esp)
	leal	20(%esi), %ebp
	addl	$4, %ebx
	leal	4(%esi), %eax
	movl	%edx, 84(%esp)
	movl	%ebp, 44(%esp)
	movl	%eax, 72(%esp)
	incl	%ecx
	movl	stdout, %edx
	leal	8(%esi), %eax
	leal	16(%esi), %ebp
	movl	%eax, 64(%esp)
	jmp	*-4(%ebx)
.L98:
	leal	12(%edi), %eax
	leal	16(%edi), %ebp
	movl	996(%esp), %edx
	movl	%eax, 52(%esp)
	movl	symbols.3+24, %eax
	addl	$8, %edx
	movl	%ebp, 48(%esp)
	movl	%eax, 40(%esp)
	leal	8(%edi), %ebp
	leal	12(%esi), %eax
	movl	%edx, 36(%esp)
	movl	%ebp, 60(%esp)
	movl	__ctype_toupper, %edx
	movl	%eax, 56(%esp)
	movl	stderr, %ebp
	leal	4(%edi), %eax
	movl	%edx, 80(%esp)
	movl	%ebp, 16(%esp)
	movl	stdin, %edx
	movl	%eax, 68(%esp)
	leal	20(%esi), %ebp
	addl	$4, %ebx
	leal	4(%esi), %eax
	movl	%edx, 84(%esp)
	movl	%ebp, 44(%esp)
	movl	%eax, 72(%esp)
	decl	%ecx
	movl	stdout, %edx
	leal	8(%esi), %eax
	leal	16(%esi), %ebp
	movl	%eax, 64(%esp)
	jmp	*-4(%ebx)

I noticed a further problem. There's one primitive that converts a double 
float to a long long. I moved that conversion out into a function 
(double2ll). Originally, this conversion was just an inline operation. If you 
automatically inline the function (with either -finline-functions or -O3), you 
get parts of it moved all over the place, even with global CSE disabled:

gcc -O3 -Wall -fomit-frame-pointer -fforce-addr -fforce-mem -march=pentium 
-fno-defer-pop -fcaller-saves -fno-gcse -fno-cross-jump -S engine.b.i

.L109:
	fnstcw	1230(%esp)
	movw	1230(%esp), %ax
	addl	$4, %ebx
	movb	$12, %ah
	negl	%ecx
	movw	%ax, 1228(%esp)
	jmp	*-4(%ebx)
.L110:
	fnstcw	1230(%esp)
	movw	1230(%esp), %ax
	addl	$4, %ebx
	movb	$12, %ah
	incl	%ecx
	movw	%ax, 1228(%esp)
	jmp	*-4(%ebx)
.L111:
	fnstcw	1230(%esp)
	movw	1230(%esp), %ax
	addl	$4, %ebx
	movb	$12, %ah
	decl	%ecx
	movw	%ax, 1228(%esp)
	jmp	*-4(%ebx)

Is this specific enough to show why I'm horrified by the code GCC 3.2 
generates? The C code vmgen produces just looks ugly, but this code *is* ugly. 
I hope you can now see the sore thumb sticking out. I don't want code to be 
moved where it doesn't belong, nor do I want unnecessary jumps inserted for no 
particular purpose.

-- 
Bernd Paysan
"If you want it done right, you have to do it yourself"
http://www.jwdt.com/~paysan/

Attachment: engine.b.i.bz2
Description: BZip2 compressed data

References:
- Re: optimization/8092: cross-jump triggers too often
  - From: Bernd Paysan
- Re: optimization/8092: cross-jump triggers too often
  - From: Richard Henderson

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]