This is the mail archive of the gcc@gcc.gnu.org mailing list for the GCC project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
RE: A case that PRE optimization hurts performance

From: "Jiangning Liu" <jiangning dot liu at arm dot com>
To: "'Richard Guenther'" <richard dot guenther at gmail dot com>
Cc: <gcc at gcc dot gnu dot org>
Date: Fri, 16 Sep 2011 10:00:42 +0800
Subject: RE: A case that PRE optimization hurts performance
References: <4e3762ed.030d440a.1143.2af2SMTPIN_ADDED@mx.google.com> <CAFiYyc0KyF4unL8hurQ77=rArSrxonFzz_MGKZDLZ=rSYZ__DQ@mail.gmail.com>
Hi Richard,

I slightly changed the case to be like below,

int f(char *t) {
    int s=0;

    while (*t && s != 1) {
        switch (s) {
        case 0:   /* path 1 */
            s = 2;
            break;
        case 2:   /* path 2 */
            s = 3; /* changed */
            break;
        default:  /* path 3 */
            if (*t == '-') 
                s = 2;
            break;
        }
        t++;
    }

    return s;
}

"-O2" is still worse than "-O2 -fno-tree-pre". 

"-O2 -fno-tree-pre" result is 

f:
	pushl	%ebp
	xorl	%eax, %eax
	movl	%esp, %ebp
	movl	8(%ebp), %edx
	movzbl	(%edx), %ecx
	jmp	.L14
	.p2align 4,,7
	.p2align 3
.L5:
	movl	$2, %eax
.L7:
	addl	$1, %edx
	cmpl	$1, %eax
	movzbl	(%edx), %ecx
	je	.L3
.L14:
	testb	%cl, %cl
	je	.L3
	testl	%eax, %eax
	je	.L5
	cmpl	$2, %eax
	.p2align 4,,5
	je	.L17
	cmpb	$45, %cl
	.p2align 4,,5
	je	.L5
	addl	$1, %edx
	cmpl	$1, %eax
	movzbl	(%edx), %ecx
	jne	.L14
	.p2align 4,,7
	.p2align 3
.L3:
	popl	%ebp
	.p2align 4,,2
	ret
	.p2align 4,,7
	.p2align 3
.L17:
	movb	$3, %al
	.p2align 4,,3
	jmp	.L7

While "-O2" result is 

f:
	pushl	%ebp
	xorl	%eax, %eax
	movl	%esp, %ebp
	movl	8(%ebp), %edx
	pushl	%ebx
	movzbl	(%edx), %ecx
	jmp	.L14
	.p2align 4,,7
	.p2align 3
.L5:
	movl	$1, %ebx
	movl	$2, %eax
.L7:
	addl	$1, %edx
	testb	%bl, %bl
	movzbl	(%edx), %ecx
	je	.L3
.L14:
	testb	%cl, %cl
	je	.L3
	testl	%eax, %eax
	je	.L5
	cmpl	$2, %eax
	.p2align 4,,5
	je	.L16
	cmpb	$45, %cl
	.p2align 4,,5
	je	.L5
	cmpl	$1, %eax
	setne	%bl
	addl	$1, %edx
	testb	%bl, %bl
	movzbl	(%edx), %ecx
	jne	.L14
	.p2align 4,,7
	.p2align 3
.L3:
	popl	%ebx
	popl	%ebp
	ret
	.p2align 4,,7
	.p2align 3
.L16:
	movl	$1, %ebx
	movb	$3, %al
	jmp	.L7

You may notice that register ebx is introduced, and some more instructions
around ebx are generated as well. i.e.

	setne	%bl
	testb	%bl, %bl

I agree with you that in theory PRE does the right thing to minimize the
computation cost on gimple level. However, the problem is the cost of
converting comparison result to a bool value is not considered, so it
actually makes binary code worse. For this case, as I summarized below, to
complete the same functionality "With PRE" is worse than "Without PRE" for
all three paths,

* Without PRE,

Path1:
	movl	$2, %eax
	cmpl	$1, %eax
	je	.L3

Path2:
	movb	$3, %al
	cmpl	$1, %eax
	je	.L3

Path3:
	cmpl	$1, %eax
	jne	.L14

* With PRE,

Path1:
	movl	$1, %ebx
	movl	$2, %eax
	testb	%bl, %bl
	je	.L3

Path2:
	movl	$1, %ebx
	movb	$3, %al
	testb	%bl, %bl
	je	.L3

Path3:
	cmpl	$1, %eax
	setne	%bl
	testb	%bl, %bl
	jne	.L14

Do you have any more thoughts?

Thanks,
-Jiangning

> -----Original Message-----
> From: Richard Guenther [mailto:richard.guenther@gmail.com]
> Sent: Tuesday, August 02, 2011 5:23 PM
> To: Jiangning Liu
> Cc: gcc@gcc.gnu.org
> Subject: Re: A case that PRE optimization hurts performance
> 
> On Tue, Aug 2, 2011 at 4:37 AM, Jiangning Liu <jiangning.liu@arm.com>
> wrote:
> > Hi,
> >
> > For the following simple test case, PRE optimization hoists
> computation
> > (s!=1) into the default branch of the switch statement, and finally
> causes
> > very poor code generation. This problem occurs in both X86 and ARM,
> and I
> > believe it is also a problem for other targets.
> >
> > int f(char *t) {
> > ? ?int s=0;
> >
> > ? ?while (*t && s != 1) {
> > ? ? ? ?switch (s) {
> > ? ? ? ?case 0:
> > ? ? ? ? ? ?s = 2;
> > ? ? ? ? ? ?break;
> > ? ? ? ?case 2:
> > ? ? ? ? ? ?s = 1;
> > ? ? ? ? ? ?break;
> > ? ? ? ?default:
> > ? ? ? ? ? ?if (*t == '-')
> > ? ? ? ? ? ? ? ?s = 1;
> > ? ? ? ? ? ?break;
> > ? ? ? ?}
> > ? ? ? ?t++;
> > ? ?}
> >
> > ? ?return s;
> > }
> >
> > Taking X86 as an example, with option "-O2" you may find 52
> instructions
> > generated like below,
> >
> > 00000000 <f>:
> > ? 0: ? 55 ? ? ? ? ? ? ? ? ? ? ?push ? %ebp
> > ? 1: ? 31 c0 ? ? ? ? ? ? ? ? ? xor ? ?%eax,%eax
> > ? 3: ? 89 e5 ? ? ? ? ? ? ? ? ? mov ? ?%esp,%ebp
> > ? 5: ? 57 ? ? ? ? ? ? ? ? ? ? ?push ? %edi
> > ? 6: ? 56 ? ? ? ? ? ? ? ? ? ? ?push ? %esi
> > ? 7: ? 53 ? ? ? ? ? ? ? ? ? ? ?push ? %ebx
> > ? 8: ? 8b 55 08 ? ? ? ? ? ? ? ?mov ? ?0x8(%ebp),%edx
> > ? b: ? 0f b6 0a ? ? ? ? ? ? ? ?movzbl (%edx),%ecx
> > ? e: ? 84 c9 ? ? ? ? ? ? ? ? ? test ? %cl,%cl
> > ?10: ? 74 50 ? ? ? ? ? ? ? ? ? je ? ? 62 <f+0x62>
> > ?12: ? 83 c2 01 ? ? ? ? ? ? ? ?add ? ?$0x1,%edx
> > ?15: ? 85 c0 ? ? ? ? ? ? ? ? ? test ? %eax,%eax
> > ?17: ? 75 23 ? ? ? ? ? ? ? ? ? jne ? ?3c <f+0x3c>
> > ?19: ? 8d b4 26 00 00 00 00 ? ?lea ? ?0x0(%esi,%eiz,1),%esi
> > ?20: ? 0f b6 0a ? ? ? ? ? ? ? ?movzbl (%edx),%ecx
> > ?23: ? 84 c9 ? ? ? ? ? ? ? ? ? test ? %cl,%cl
> > ?25: ? 0f 95 c0 ? ? ? ? ? ? ? ?setne ?%al
> > ?28: ? 89 c7 ? ? ? ? ? ? ? ? ? mov ? ?%eax,%edi
> > ?2a: ? b8 02 00 00 00 ? ? ? ? ?mov ? ?$0x2,%eax
> > ?2f: ? 89 fb ? ? ? ? ? ? ? ? ? mov ? ?%edi,%ebx
> > ?31: ? 83 c2 01 ? ? ? ? ? ? ? ?add ? ?$0x1,%edx
> > ?34: ? 84 db ? ? ? ? ? ? ? ? ? test ? %bl,%bl
> > ?36: ? 74 2a ? ? ? ? ? ? ? ? ? je ? ? 62 <f+0x62>
> > ?38: ? 85 c0 ? ? ? ? ? ? ? ? ? test ? %eax,%eax
> > ?3a: ? 74 e4 ? ? ? ? ? ? ? ? ? je ? ? 20 <f+0x20>
> > ?3c: ? 83 f8 02 ? ? ? ? ? ? ? ?cmp ? ?$0x2,%eax
> > ?3f: ? 74 1f ? ? ? ? ? ? ? ? ? je ? ? 60 <f+0x60>
> > ?41: ? 80 f9 2d ? ? ? ? ? ? ? ?cmp ? ?$0x2d,%cl
> > ?44: ? 74 22 ? ? ? ? ? ? ? ? ? je ? ? 68 <f+0x68>
> > ?46: ? 0f b6 0a ? ? ? ? ? ? ? ?movzbl (%edx),%ecx
> > ?49: ? 83 f8 01 ? ? ? ? ? ? ? ?cmp ? ?$0x1,%eax
> > ?4c: ? 0f 95 c3 ? ? ? ? ? ? ? ?setne ?%bl
> > ?4f: ? 89 df ? ? ? ? ? ? ? ? ? mov ? ?%ebx,%edi
> > ?51: ? 84 c9 ? ? ? ? ? ? ? ? ? test ? %cl,%cl
> > ?53: ? 0f 95 c3 ? ? ? ? ? ? ? ?setne ?%bl
> > ?56: ? 89 de ? ? ? ? ? ? ? ? ? mov ? ?%ebx,%esi
> > ?58: ? 21 f7 ? ? ? ? ? ? ? ? ? and ? ?%esi,%edi
> > ?5a: ? eb d3 ? ? ? ? ? ? ? ? ? jmp ? ?2f <f+0x2f>
> > ?5c: ? 8d 74 26 00 ? ? ? ? ? ? lea ? ?0x0(%esi,%eiz,1),%esi
> > ?60: ? b0 01 ? ? ? ? ? ? ? ? ? mov ? ?$0x1,%al
> > ?62: ? 5b ? ? ? ? ? ? ? ? ? ? ?pop ? ?%ebx
> > ?63: ? 5e ? ? ? ? ? ? ? ? ? ? ?pop ? ?%esi
> > ?64: ? 5f ? ? ? ? ? ? ? ? ? ? ?pop ? ?%edi
> > ?65: ? 5d ? ? ? ? ? ? ? ? ? ? ?pop ? ?%ebp
> > ?66: ? c3 ? ? ? ? ? ? ? ? ? ? ?ret
> > ?67: ? 90 ? ? ? ? ? ? ? ? ? ? ?nop
> > ?68: ? b8 01 00 00 00 ? ? ? ? ?mov ? ?$0x1,%eax
> > ?6d: ? 5b ? ? ? ? ? ? ? ? ? ? ?pop ? ?%ebx
> > ?6e: ? 5e ? ? ? ? ? ? ? ? ? ? ?pop ? ?%esi
> > ?6f: ? 5f ? ? ? ? ? ? ? ? ? ? ?pop ? ?%edi
> > ?70: ? 5d ? ? ? ? ? ? ? ? ? ? ?pop ? ?%ebp
> > ?71: ? c3 ? ? ? ? ? ? ? ? ? ? ?ret
> >
> > But with command line option "-O2 -fno-tree-pre", there are only 12
> > instructions generated, and the code would be very clean like below,
> >
> > 00000000 <f>:
> > ? 0: ? 55 ? ? ? ? ? ? ? ? ? ? ?push ? %ebp
> > ? 1: ? 31 c0 ? ? ? ? ? ? ? ? ? xor ? ?%eax,%eax
> > ? 3: ? 89 e5 ? ? ? ? ? ? ? ? ? mov ? ?%esp,%ebp
> > ? 5: ? 8b 55 08 ? ? ? ? ? ? ? ?mov ? ?0x8(%ebp),%edx
> > ? 8: ? 80 3a 00 ? ? ? ? ? ? ? ?cmpb ? $0x0,(%edx)
> > ? b: ? 74 0e ? ? ? ? ? ? ? ? ? je ? ? 1b <f+0x1b>
> > ? d: ? 80 7a 01 00 ? ? ? ? ? ? cmpb ? $0x0,0x1(%edx)
> > ?11: ? b0 02 ? ? ? ? ? ? ? ? ? mov ? ?$0x2,%al
> > ?13: ? ba 01 00 00 00 ? ? ? ? ?mov ? ?$0x1,%edx
> > ?18: ? 0f 45 c2 ? ? ? ? ? ? ? ?cmovne %edx,%eax
> > ?1b: ? 5d ? ? ? ? ? ? ? ? ? ? ?pop ? ?%ebp
> > ?1c: ? c3 ? ? ? ? ? ? ? ? ? ? ?ret
> >
> > Do you have any idea about this?
> 
> If you look at what PRE does it is clear that it's a profitable
> transformation
> as it reduces the number of instructions computed on the paths where
> s is set to a constant.  If you ask for size optimization you won't get
> this transformation.
> 
> Now - on the tree level we don't see the if-conversion opportunity
> (we don't have a conditional move instruction), but still we could
> improve phiopt to handle switch statements - not sure what we
> could do with the case in question though which is
> 
> <bb 3>:
>   switch (s_3) <default: <L3>, case 0: <L11>, case 2: <L10>>
> 
> <L11>:
>   goto <bb 7> (<L10>);
> 
> <L3>:
>   if (D.2703_6 == 45)
>     goto <bb 6>;
>   else
>     goto <bb 7> (<L10>);
> 
> <bb 6>:
> 
>   # s_2 = PHI <2(4), 1(3), 1(6), s_3(5)>
> <L10>:
> 
> as the condition in at L3 complicates things.
> 
> The code is also really really special (and written in an awful
> way ...).
> 
> Richard.
> 
> > Thanks,
> > -Jiangning
> >
> >
> >
> >
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]