This is the mail archive of the
gcc@gcc.gnu.org
mailing list for the GCC project.
RE: A case that PRE optimization hurts performance
- From: "Jiangning Liu" <jiangning dot liu at arm dot com>
- To: "'Richard Guenther'" <richard dot guenther at gmail dot com>
- Cc: <gcc at gcc dot gnu dot org>
- Date: Fri, 16 Sep 2011 10:00:42 +0800
- Subject: RE: A case that PRE optimization hurts performance
- References: <4e3762ed.030d440a.1143.2af2SMTPIN_ADDED@mx.google.com> <CAFiYyc0KyF4unL8hurQ77=rArSrxonFzz_MGKZDLZ=rSYZ__DQ@mail.gmail.com>
Hi Richard,
I slightly changed the case to be like below,
int f(char *t) {
int s=0;
while (*t && s != 1) {
switch (s) {
case 0: /* path 1 */
s = 2;
break;
case 2: /* path 2 */
s = 3; /* changed */
break;
default: /* path 3 */
if (*t == '-')
s = 2;
break;
}
t++;
}
return s;
}
"-O2" is still worse than "-O2 -fno-tree-pre".
"-O2 -fno-tree-pre" result is
f:
pushl %ebp
xorl %eax, %eax
movl %esp, %ebp
movl 8(%ebp), %edx
movzbl (%edx), %ecx
jmp .L14
.p2align 4,,7
.p2align 3
.L5:
movl $2, %eax
.L7:
addl $1, %edx
cmpl $1, %eax
movzbl (%edx), %ecx
je .L3
.L14:
testb %cl, %cl
je .L3
testl %eax, %eax
je .L5
cmpl $2, %eax
.p2align 4,,5
je .L17
cmpb $45, %cl
.p2align 4,,5
je .L5
addl $1, %edx
cmpl $1, %eax
movzbl (%edx), %ecx
jne .L14
.p2align 4,,7
.p2align 3
.L3:
popl %ebp
.p2align 4,,2
ret
.p2align 4,,7
.p2align 3
.L17:
movb $3, %al
.p2align 4,,3
jmp .L7
While "-O2" result is
f:
pushl %ebp
xorl %eax, %eax
movl %esp, %ebp
movl 8(%ebp), %edx
pushl %ebx
movzbl (%edx), %ecx
jmp .L14
.p2align 4,,7
.p2align 3
.L5:
movl $1, %ebx
movl $2, %eax
.L7:
addl $1, %edx
testb %bl, %bl
movzbl (%edx), %ecx
je .L3
.L14:
testb %cl, %cl
je .L3
testl %eax, %eax
je .L5
cmpl $2, %eax
.p2align 4,,5
je .L16
cmpb $45, %cl
.p2align 4,,5
je .L5
cmpl $1, %eax
setne %bl
addl $1, %edx
testb %bl, %bl
movzbl (%edx), %ecx
jne .L14
.p2align 4,,7
.p2align 3
.L3:
popl %ebx
popl %ebp
ret
.p2align 4,,7
.p2align 3
.L16:
movl $1, %ebx
movb $3, %al
jmp .L7
You may notice that register ebx is introduced, and some more instructions
around ebx are generated as well. i.e.
setne %bl
testb %bl, %bl
I agree with you that in theory PRE does the right thing to minimize the
computation cost on gimple level. However, the problem is the cost of
converting comparison result to a bool value is not considered, so it
actually makes binary code worse. For this case, as I summarized below, to
complete the same functionality "With PRE" is worse than "Without PRE" for
all three paths,
* Without PRE,
Path1:
movl $2, %eax
cmpl $1, %eax
je .L3
Path2:
movb $3, %al
cmpl $1, %eax
je .L3
Path3:
cmpl $1, %eax
jne .L14
* With PRE,
Path1:
movl $1, %ebx
movl $2, %eax
testb %bl, %bl
je .L3
Path2:
movl $1, %ebx
movb $3, %al
testb %bl, %bl
je .L3
Path3:
cmpl $1, %eax
setne %bl
testb %bl, %bl
jne .L14
Do you have any more thoughts?
Thanks,
-Jiangning
> -----Original Message-----
> From: Richard Guenther [mailto:richard.guenther@gmail.com]
> Sent: Tuesday, August 02, 2011 5:23 PM
> To: Jiangning Liu
> Cc: gcc@gcc.gnu.org
> Subject: Re: A case that PRE optimization hurts performance
>
> On Tue, Aug 2, 2011 at 4:37 AM, Jiangning Liu <jiangning.liu@arm.com>
> wrote:
> > Hi,
> >
> > For the following simple test case, PRE optimization hoists
> computation
> > (s!=1) into the default branch of the switch statement, and finally
> causes
> > very poor code generation. This problem occurs in both X86 and ARM,
> and I
> > believe it is also a problem for other targets.
> >
> > int f(char *t) {
> > ? ?int s=0;
> >
> > ? ?while (*t && s != 1) {
> > ? ? ? ?switch (s) {
> > ? ? ? ?case 0:
> > ? ? ? ? ? ?s = 2;
> > ? ? ? ? ? ?break;
> > ? ? ? ?case 2:
> > ? ? ? ? ? ?s = 1;
> > ? ? ? ? ? ?break;
> > ? ? ? ?default:
> > ? ? ? ? ? ?if (*t == '-')
> > ? ? ? ? ? ? ? ?s = 1;
> > ? ? ? ? ? ?break;
> > ? ? ? ?}
> > ? ? ? ?t++;
> > ? ?}
> >
> > ? ?return s;
> > }
> >
> > Taking X86 as an example, with option "-O2" you may find 52
> instructions
> > generated like below,
> >
> > 00000000 <f>:
> > ? 0: ? 55 ? ? ? ? ? ? ? ? ? ? ?push ? %ebp
> > ? 1: ? 31 c0 ? ? ? ? ? ? ? ? ? xor ? ?%eax,%eax
> > ? 3: ? 89 e5 ? ? ? ? ? ? ? ? ? mov ? ?%esp,%ebp
> > ? 5: ? 57 ? ? ? ? ? ? ? ? ? ? ?push ? %edi
> > ? 6: ? 56 ? ? ? ? ? ? ? ? ? ? ?push ? %esi
> > ? 7: ? 53 ? ? ? ? ? ? ? ? ? ? ?push ? %ebx
> > ? 8: ? 8b 55 08 ? ? ? ? ? ? ? ?mov ? ?0x8(%ebp),%edx
> > ? b: ? 0f b6 0a ? ? ? ? ? ? ? ?movzbl (%edx),%ecx
> > ? e: ? 84 c9 ? ? ? ? ? ? ? ? ? test ? %cl,%cl
> > ?10: ? 74 50 ? ? ? ? ? ? ? ? ? je ? ? 62 <f+0x62>
> > ?12: ? 83 c2 01 ? ? ? ? ? ? ? ?add ? ?$0x1,%edx
> > ?15: ? 85 c0 ? ? ? ? ? ? ? ? ? test ? %eax,%eax
> > ?17: ? 75 23 ? ? ? ? ? ? ? ? ? jne ? ?3c <f+0x3c>
> > ?19: ? 8d b4 26 00 00 00 00 ? ?lea ? ?0x0(%esi,%eiz,1),%esi
> > ?20: ? 0f b6 0a ? ? ? ? ? ? ? ?movzbl (%edx),%ecx
> > ?23: ? 84 c9 ? ? ? ? ? ? ? ? ? test ? %cl,%cl
> > ?25: ? 0f 95 c0 ? ? ? ? ? ? ? ?setne ?%al
> > ?28: ? 89 c7 ? ? ? ? ? ? ? ? ? mov ? ?%eax,%edi
> > ?2a: ? b8 02 00 00 00 ? ? ? ? ?mov ? ?$0x2,%eax
> > ?2f: ? 89 fb ? ? ? ? ? ? ? ? ? mov ? ?%edi,%ebx
> > ?31: ? 83 c2 01 ? ? ? ? ? ? ? ?add ? ?$0x1,%edx
> > ?34: ? 84 db ? ? ? ? ? ? ? ? ? test ? %bl,%bl
> > ?36: ? 74 2a ? ? ? ? ? ? ? ? ? je ? ? 62 <f+0x62>
> > ?38: ? 85 c0 ? ? ? ? ? ? ? ? ? test ? %eax,%eax
> > ?3a: ? 74 e4 ? ? ? ? ? ? ? ? ? je ? ? 20 <f+0x20>
> > ?3c: ? 83 f8 02 ? ? ? ? ? ? ? ?cmp ? ?$0x2,%eax
> > ?3f: ? 74 1f ? ? ? ? ? ? ? ? ? je ? ? 60 <f+0x60>
> > ?41: ? 80 f9 2d ? ? ? ? ? ? ? ?cmp ? ?$0x2d,%cl
> > ?44: ? 74 22 ? ? ? ? ? ? ? ? ? je ? ? 68 <f+0x68>
> > ?46: ? 0f b6 0a ? ? ? ? ? ? ? ?movzbl (%edx),%ecx
> > ?49: ? 83 f8 01 ? ? ? ? ? ? ? ?cmp ? ?$0x1,%eax
> > ?4c: ? 0f 95 c3 ? ? ? ? ? ? ? ?setne ?%bl
> > ?4f: ? 89 df ? ? ? ? ? ? ? ? ? mov ? ?%ebx,%edi
> > ?51: ? 84 c9 ? ? ? ? ? ? ? ? ? test ? %cl,%cl
> > ?53: ? 0f 95 c3 ? ? ? ? ? ? ? ?setne ?%bl
> > ?56: ? 89 de ? ? ? ? ? ? ? ? ? mov ? ?%ebx,%esi
> > ?58: ? 21 f7 ? ? ? ? ? ? ? ? ? and ? ?%esi,%edi
> > ?5a: ? eb d3 ? ? ? ? ? ? ? ? ? jmp ? ?2f <f+0x2f>
> > ?5c: ? 8d 74 26 00 ? ? ? ? ? ? lea ? ?0x0(%esi,%eiz,1),%esi
> > ?60: ? b0 01 ? ? ? ? ? ? ? ? ? mov ? ?$0x1,%al
> > ?62: ? 5b ? ? ? ? ? ? ? ? ? ? ?pop ? ?%ebx
> > ?63: ? 5e ? ? ? ? ? ? ? ? ? ? ?pop ? ?%esi
> > ?64: ? 5f ? ? ? ? ? ? ? ? ? ? ?pop ? ?%edi
> > ?65: ? 5d ? ? ? ? ? ? ? ? ? ? ?pop ? ?%ebp
> > ?66: ? c3 ? ? ? ? ? ? ? ? ? ? ?ret
> > ?67: ? 90 ? ? ? ? ? ? ? ? ? ? ?nop
> > ?68: ? b8 01 00 00 00 ? ? ? ? ?mov ? ?$0x1,%eax
> > ?6d: ? 5b ? ? ? ? ? ? ? ? ? ? ?pop ? ?%ebx
> > ?6e: ? 5e ? ? ? ? ? ? ? ? ? ? ?pop ? ?%esi
> > ?6f: ? 5f ? ? ? ? ? ? ? ? ? ? ?pop ? ?%edi
> > ?70: ? 5d ? ? ? ? ? ? ? ? ? ? ?pop ? ?%ebp
> > ?71: ? c3 ? ? ? ? ? ? ? ? ? ? ?ret
> >
> > But with command line option "-O2 -fno-tree-pre", there are only 12
> > instructions generated, and the code would be very clean like below,
> >
> > 00000000 <f>:
> > ? 0: ? 55 ? ? ? ? ? ? ? ? ? ? ?push ? %ebp
> > ? 1: ? 31 c0 ? ? ? ? ? ? ? ? ? xor ? ?%eax,%eax
> > ? 3: ? 89 e5 ? ? ? ? ? ? ? ? ? mov ? ?%esp,%ebp
> > ? 5: ? 8b 55 08 ? ? ? ? ? ? ? ?mov ? ?0x8(%ebp),%edx
> > ? 8: ? 80 3a 00 ? ? ? ? ? ? ? ?cmpb ? $0x0,(%edx)
> > ? b: ? 74 0e ? ? ? ? ? ? ? ? ? je ? ? 1b <f+0x1b>
> > ? d: ? 80 7a 01 00 ? ? ? ? ? ? cmpb ? $0x0,0x1(%edx)
> > ?11: ? b0 02 ? ? ? ? ? ? ? ? ? mov ? ?$0x2,%al
> > ?13: ? ba 01 00 00 00 ? ? ? ? ?mov ? ?$0x1,%edx
> > ?18: ? 0f 45 c2 ? ? ? ? ? ? ? ?cmovne %edx,%eax
> > ?1b: ? 5d ? ? ? ? ? ? ? ? ? ? ?pop ? ?%ebp
> > ?1c: ? c3 ? ? ? ? ? ? ? ? ? ? ?ret
> >
> > Do you have any idea about this?
>
> If you look at what PRE does it is clear that it's a profitable
> transformation
> as it reduces the number of instructions computed on the paths where
> s is set to a constant. If you ask for size optimization you won't get
> this transformation.
>
> Now - on the tree level we don't see the if-conversion opportunity
> (we don't have a conditional move instruction), but still we could
> improve phiopt to handle switch statements - not sure what we
> could do with the case in question though which is
>
> <bb 3>:
> switch (s_3) <default: <L3>, case 0: <L11>, case 2: <L10>>
>
> <L11>:
> goto <bb 7> (<L10>);
>
> <L3>:
> if (D.2703_6 == 45)
> goto <bb 6>;
> else
> goto <bb 7> (<L10>);
>
> <bb 6>:
>
> # s_2 = PHI <2(4), 1(3), 1(6), s_3(5)>
> <L10>:
>
> as the condition in at L3 complicates things.
>
> The code is also really really special (and written in an awful
> way ...).
>
> Richard.
>
> > Thanks,
> > -Jiangning
> >
> >
> >
> >