Bug 17935 - Two consecutive movzbl are generated
Summary: Two consecutive movzbl are generated
Status: NEW
Alias: None
Product: gcc
Classification: Unclassified
Component: rtl-optimization (show other bugs)
Version: 4.0.0
: P2 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on: 21520
Blocks: 65964
  Show dependency treegraph
 
Reported: 2004-10-11 16:38 UTC by Kazu Hirata
Modified: 2020-03-10 16:59 UTC (History)
2 users (show)

See Also:
Host:
Target: i686-*-*
Build:
Known to work:
Known to fail: 4.0.0
Last reconfirmed: 2006-10-22 21:11:55


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Kazu Hirata 2004-10-11 16:38:43 UTC
Consider:

struct flags {
  unsigned f0 : 1;
};

_Bool
bar (struct flags *p, struct flags *q)
{
  return (!p->f0 && !q->f0);
}

With "cc1 -O2 -fomit-frame-pointer -march=i386", I get

bar:
	movl	4(%esp), %eax
	testb	$1, (%eax)
	jne	.L9
	movl	8(%esp), %eax
	testb	$1, (%eax)
	sete	%al
	movzbl	%al, %eax
	movzbl	%al, %eax
	ret
	.p2align 2,,3
.L9:
	xorl	%eax, %eax
	movzbl	%al, %eax
	ret

Note the two consecutive movzbl.  We don't need the second one.

Also note the xorl followed by movzbl.  We don't need the movzbl.
Comment 1 Andrew Pinski 2004-10-11 17:24:44 UTC
Confirmed, this is 4.0 Regression again.
3.4.0 gives:
bar:
        movl    4(%esp), %eax
        xorl    %edx, %edx
        testb   $1, (%eax)
        jne     .L2
        movl    8(%esp), %ecx
        testb   $1, (%ecx)
        jne     .L2
        movb    $1, %dl
        .p2align 2,,3
.L2:
        movzbl  %dl, %eax
        ret
Comment 2 Andrew Pinski 2004-10-12 19:20:22 UTC
Note this is not a really a regression, the main issue I see here is that we are ifcvt if(a) 1 else 0; to a and 
then we copy the return part (the movzbl and ret) on the other branch but we don't combine them.  
Maybe we should rerun combine after reload.
Comment 3 Kazu Hirata 2004-12-12 17:57:44 UTC
With today's mainline, I get

bar:
	movl	4(%esp), %eax
	testb	$1, (%eax)
	jne	.L2
	movl	8(%esp), %eax
	testb	$1, (%eax)
	jne	.L2
	movl	$1, %eax
	movzbl	%al, %eax
	ret
	.p2align 2,,3
.L2:
	xorl	%eax, %eax
	movzbl	%al, %eax
	ret

Still we have a similar problem.  movl (or xorl) followed by movzbl is
meaningless.

It turns out that the fix for PR 14843 can fix this at tree level.
Here is the resulting assembly with my patch for PR 14843.

bar:
	movl	4(%esp), %eax
	testb	$1, (%eax)
	jne	.L2
	movl	8(%esp), %eax
	testb	$1, (%eax)
	jne	.L2
	movl	$1, %eax
	ret
	.p2align 2,,3
.L2:
	xorl	%eax, %eax
	ret

The reason my patch fixes this is because it removes casts before expansion.

Without my patch:

bar (p, q)
{
  int iftmp.0;

<bb 0>:
  if (p->f0 != 0) goto <L3>; else goto <L0>;

<L0>:;
  if (q->f0 != 0) goto <L3>; else goto <L7>;

<L7>:;
  iftmp.0 = 1;
  goto <bb 4> (<L8>);

<L3>:;
  iftmp.0 = 0;

<L8>:;
  return (int) (_Bool) iftmp.0;  <-- Notice these casts

}

With my patch:

bar (p, q)
{
  int D.1122;

<bb 0>:
  if (p->f0 != 0) goto <L3>; else goto <L0>;

<L0>:;
  if (q->f0 != 0) goto <L3>; else goto <L7>;

<L7>:;
  D.1122 = 1;
  goto <bb 4> (<L8>);

<L3>:;
  D.1122 = 0;

<L8>:;
  return D.1122;  <-- Casts are gone!

}
Comment 4 Andrew Pinski 2004-12-12 18:06:23 UTC
This seems like someone should be calling fold (and fold should be changing the types back of the 
orginal tree):
(int) (_Bool) ((int) p->f0 == 0 && (int) q->f0 == 0)
Comment 5 Andrew Pinski 2005-05-12 17:52:45 UTC
With the patch for PR 21520, we will get better code.
Right now on the mainline we get:
bar:
        movl    4(%esp), %eax
        testb   $1, (%eax)
        jne     .L7
        movl    8(%esp), %eax
        movb    (%eax), %al
        andl    $1, %eax
        xorl    $1, %eax
        movzbl  %al, %eax
        ret
        .p2align 2,,3
.L7:
        xorl    %eax, %eax
        movzbl  %al, %eax
        ret

but after that patch we should be able to remove the extra movzbl in the second branch.
Comment 6 Uroš Bizjak 2005-05-13 10:07:19 UTC
I think there is another optimization opportunity regarding movzbl following andl.

Consider this part:

        movb    (%eax), %al        EAX = x...xbbbbbbbb
        andl    $1, %eax           EAX = 0...00000000b
        movzbl  %al, %eax          (not needed)
        ret

If the value of the constant to andl operator is less than 2 ^ bit width of the
register we would like to extend, then andl instruction inherently performs
zero-extension.

So if the xorl in comment #5 is moved before andl, we could apply above
simplification to get:

        movb    (%eax), %al
        xorl    $1, %eax
        andl    $1, %eax
        ret
Comment 7 Andrew Pinski 2005-09-29 03:47:38 UTC
(In reply to comment #6)
> I think there is another optimization opportunity regarding movzbl following andl.

I think that is that the tracer pass runs late which causes the movzbl to be there late.
Comment 8 Andrew Pinski 2006-01-18 05:07:12 UTC
We get now:
        movb    (%eax), %al
        andl    $1, %eax
        xorl    $1, %eax
        andl    $1, %eax
        ret

(insn 23 22 24 4 (parallel [
            (set (reg:QI 63)
                (and:QI (mem/s:QI (reg/v/f:SI 61 [ q ]) [0 S1 A32])
                    (const_int 1 [0x1])))
            (clobber (reg:CC 17 flags))
        ]) 212 {*andqi_1} (nil)
    (expr_list:REG_DEAD (reg/v/f:SI 61 [ q ])
        (expr_list:REG_UNUSED (reg:CC 17 flags)
            (nil))))

(insn 24 23 25 4 (parallel [
            (set (reg:QI 64)
                (xor:QI (reg:QI 63)
                    (const_int 1 [0x1])))
            (clobber (reg:CC 17 flags))
        ]) 241 {*xorqi_1} (insn_list:REG_DEP_TRUE 23 (nil))
    (expr_list:REG_DEAD (reg:QI 63)
        (expr_list:REG_UNUSED (reg:CC 17 flags)
            (nil))))

(note 25 24 26 4 NOTE_INSN_DELETED)

(insn 26 25 27 4 (parallel [
            (set (reg:SI 58 [ prephitmp.25 ])
                (and:SI (subreg:SI (reg:QI 64) 0)
                    (const_int 1 [0x1])))
            (clobber (reg:CC 17 flags))
        ]) 208 {*andsi_1} (insn_list:REG_DEP_TRUE 24 (nil))
    (expr_list:REG_DEAD (reg:QI 64)
        (expr_list:REG_UNUSED (reg:CC 17 flags)
            (nil))))

We must lose that (a&1)^1 has only the the one bit set.