Bug 15826 - don't use "if" to extract a single bit bit-field.
Summary: don't use "if" to extract a single bit bit-field.
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 4.0.0
: P2 enhancement
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on: 15268 15459 15618
Blocks: 19466 19987
  Show dependency treegraph
 
Reported: 2004-06-04 18:55 UTC by Kazu Hirata
Modified: 2020-04-19 18:28 UTC (History)
3 users (show)

See Also:
Host:
Target:
Build:
Known to work: 5.1.0, 6.0
Known to fail:
Last reconfirmed: 2007-04-05 22:01:25


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Kazu Hirata 2004-06-04 18:55:53 UTC
Transform foo() into bar().

struct s {
  unsigned int bit : 1;
};

unsigned int
foo (struct s *p)
{
  if (p->bit)
    return 1;
  else
    return 0;
}

unsigned int
bar (struct s *p)
{
  return (unsigned int) (p->bit);
}

Currently, the last tree-ssa form I get looks like so:

;; Function foo (foo)

foo (p)
{
  int T.1;
  unsigned int T.0;

<bb 0>:
  T.0_2 = p_1->bit;
  if (T.0_2 != 0) goto <L0>; else goto <L1>;

<L0>:;
  return 1;

<L1>:;
  return 0;

}



;; Function bar (bar)

bar (p)
{
<bb 0>:
  return p_1->bit;

}

The assembly code:

        .p2align 2,,3
.globl foo
        .type   foo, @function
foo:
        testb   $1, (%eax)      # 43    *testqi_1/3     [length = 3]
        setne   %al     # 40    *setcc_1        [length = 3]
        movzbl  %al, %eax       # 45    *zero_extendqisi2_movzbw        [length
= 3]
        ret     # 48    return_internal [length = 1]
        .size   foo, .-foo
        .p2align 2,,3
.globl bar
        .type   bar, @function
bar:
        movzbl  (%eax), %eax    # 28    *zero_extendqisi2_movzbw        [length
= 3]
        andl    $1, %eax        # 12    *andsi_1/1      [length = 3]
        ret     # 31    return_internal [length = 1]
        .size   bar, .-bar

Note that alias.i.t50.tailc contains:

<L22>:;
  T.1776_58 = x_2->unchanging;
  if (T.1776_58 != 0) goto <L25>; else goto <L26>;

<L25>:;
  return 0;

<L26>:;
  return 1;
Comment 1 Andrew Pinski 2004-06-04 19:07:40 UTC
Confirmed, this is related to bug 15618 which is for bools.
Comment 2 Andrew Pinski 2004-06-04 19:26:04 UTC
Here is another example:
struct s {
  unsigned int bit : 1;
};

unsigned int
foo (struct s *p)
{
  int i;
  if (p->bit)
    i = 1;
  else
    i = 0;
  return i;
}

I will note that the example in comment #1 needs to have the return values merged together before the 
real optimization happens (see PR 15268 and a patch which I posted but have not gotten around to 
answering/trying RTH's question/suggestion yet).

But basically it can be optimized by putting a cast to bool first, so a != 0 becomes (bool)a, and PHI-opt already optimizes out the branches.
Comment 3 Andrew Pinski 2004-06-07 18:05:18 UTC
I should note that, now that PR 15268 is fixed, the only thing needed is for one-wide-bitfield != 0
to be converted to (_Bool)one-wide-bitfield.
Comment 4 Kazu Hirata 2004-06-16 06:20:54 UTC
Now that we have a single return statement, let me regenerate t50.tailc.

struct s {
  unsigned int bit : 1;
};

unsigned int
foo (struct s *p)
{
  if (p->bit)
    return 1;
  else
    return 0;
}

unsigned int
bar (struct s *p)
{
  return (unsigned int) (p->bit);
}

unsigned int
andrew (struct s *p)
{
  int i;
  if (p->bit)
    i = 1;
  else
    i = 0;
  return i;
}

I get:

;; Function foo (foo)

foo (p)
{
  _Bool T.7;
  unsigned int T.2;
  int T.1;
  unsigned int T.0;

<bb 0>:
  T.0_3 = p_2->bit;
  T.7_8 = T.0_3 != 0;
  T.2_1 = (unsigned int)T.7_8;
  return T.2_1;

}



;; Function bar (bar)

bar (p)
{
  unsigned int T.3;

<bb 0>:
  T.3_2 = p_1->bit;
  return T.3_2;

}



;; Function andrew (andrew)

andrew (p)
{
  _Bool T.16;
  int i;
  unsigned int T.6;
  int T.5;
  unsigned int T.4;

<bb 0>:
  T.4_3 = p_2->bit;
  T.16_9 = T.4_3 != 0;
  i_1 = (int)T.16_9;
  T.6_5 = (unsigned int)i_1;
  return T.6_5;

}

So basically, the problem boils down to the tree combiner.
Comment 5 Andrew Pinski 2006-03-02 14:24:16 UTC
This is interesting, we now get BIT_FIELD_REF.
Comment 6 Steven Bosscher 2007-04-05 22:01:25 UTC
The tree dump for the original test case now looks like this for me:

;; Function foo (foo)

foo (p)
{
<bb 2>:
  return (unsigned int) ((BIT_FIELD_REF <*p, 8, 0> & 1) != 0);

}



;; Function bar (bar)

bar (p)
{
<bb 2>:
  return (unsigned int) p->bit;

}



The resulting assembler output is the same, but I imagine VRP should be able to fold away the "& 1" test.  I don't know if the BIT_FIELD_REF itself should be optimized away, but I guess so.  Consider the following test case:

struct s
{
  unsigned int bit:1;
};

unsigned int
foo (struct s *p)
{
  if (p->bit)
    return (unsigned int) (p->bit);
  else
    return 0;
}


This gets "optimized" at the tree level to this ugly code:
;; Function foo (foo)

foo (p)
{
  unsigned int D.1979;

<bb 2>:
  if ((BIT_FIELD_REF <*p, 8, 0> & 1) != 0) goto <L0>; else goto <L4>;

<L4>:;
  D.1979 = 0;
  goto <bb 5> (<L2>);

<L0>:;
  D.1979 = (unsigned int) p->bit;

<L2>:;
  return D.1979;

}

In summary, I don't think we can close this bug just yet.
Comment 7 Richard Biener 2008-04-30 19:21:43 UTC
On the trunk I have after phiopt1:

;; Function andrew (andrew)

Removing basic block 3
Merging blocks 2 and 4
andrew (struct s * p)
{
  _Bool D.1212;
  int i;
  unsigned int D.1183;
  <unnamed-unsigned:1> D.1182;

<bb 2>:
  D.1182_3 = p_2(D)->bit;
  (void) 0;
  D.1212_9 = D.1182_3 != 0;
  i_10 = (int) D.1212_9;
  D.1183_6 = (unsigned int) i_10;
  return D.1183_6;

}

;; Function foo (foo)

Removing basic block 3
Merging blocks 2 and 4
foo (struct s * p)
{
  _Bool D.1199;
  unsigned int D.1177;
  <unnamed-unsigned:1> D.1176;

<bb 2>:
  D.1176_3 = p_2(D)->bit;
  (void) 0;
  D.1199_8 = D.1176_3 != 0;
  D.1177_9 = (unsigned int) D.1199_8;
  return D.1177_9;

}

which the next forwprop pass optimizes to

foo (struct s * p)
{
  unsigned int D.1177;
  <unnamed-unsigned:1> D.1176;

<bb 2>:
  D.1176_3 = p_2(D)->bit;
  D.1177_9 = D.1176_3 != 0;
  return D.1177_9;

}

andrew (struct s * p)
{
  unsigned int D.1183;
  <unnamed-unsigned:1> D.1182;

<bb 2>:
  D.1182_3 = p_2(D)->bit;
  D.1183_6 = D.1182_3 != 0;
  return D.1183_6;

}
Comment 8 Andrew Pinski 2012-01-24 00:03:45 UTC
(In reply to comment #6)
Only comment #6 remains to be fixed.  
FRE does not recognize p->bit as "BIT_FIELD_REF <*p, 8, 0> & 1".
Comment 9 Richard Biener 2014-12-01 14:07:26 UTC
Which boils down to the premature fold-const.c:optimize_bit_field_compare
which creates this BIT_FIELD_REF (fold_truth_andor_1 does similar stupid
stuff).
Comment 10 Martin Sebor 2016-02-26 23:25:01 UTC
Author: msebor
Date: Fri Feb 26 23:24:29 2016
New Revision: 233771

URL: https://gcc.gnu.org/viewcvs?rev=233771&root=gcc&view=rev
Log:
PR tree-optimization/15826 - don't use "if" to extract a single bit
	bit-field
2016-02-26  Martin Sebor  <msebor@redhat.com>

        PR tree-optimization/15826
        * gcc.dg/tree-ssa/pr15826.c: New test.

Added:
    trunk/gcc/testsuite/gcc.dg/tree-ssa/pr15826.c
Modified:
    trunk/gcc/testsuite/ChangeLog
Comment 11 Martin Sebor 2016-02-26 23:25:29 UTC
It looks to me like the missing optimization now takes place, and has since 5.1. In both foo() and andrew() the bit test is optimized away during cddce2 resulting in the following.  The BIT_FIELD_REF is still there, but I assume that's fine.  I'll go ahead, add the test case to the test suite and resolve this as fixed.  Please reopen if I missed something.

foo (struct s * p)
{
  unsigned char _4;
  _Bool _6;
  unsigned int _7;

  <bb 2>:
  _4 = BIT_FIELD_REF <*p_3(D), 8, 0>;
  _6 = (_Bool) _4;
  _7 = (unsigned int) _6;
  return _7;

}
Comment 12 Andrew Pinski 2016-02-26 23:32:08 UTC
You did not read my comment #8.  Please read it again and try it again.  Your testcase does not cover the testcase mentioned in comment #6.
Comment 13 Bill Seurer 2016-03-01 01:58:54 UTC
The test case also fails on powerpc64 LE (but seems to work on BE)

Executing on host: /home/seurer/gcc/build/gcc-test/gcc/xgcc -B/home/seurer/gcc/build/gcc-test/gcc/ /home/seurer/gcc/gcc-test/gcc/testsuite/gcc.dg/tree-ssa/pr15826.c   -fno-diagnostics-show-caret -fdiagnostics-color=never   -O2 -fdump-tree-optimized -S   -o pr15826.s    (timeout = 300)
spawn /home/seurer/gcc/build/gcc-test/gcc/xgcc -B/home/seurer/gcc/build/gcc-test/gcc/ /home/seurer/gcc/gcc-test/gcc/testsuite/gcc.dg/tree-ssa/pr15826.c -fno-diagnostics-show-caret -fdiagnostics-color=never -O2 -fdump-tree-optimized -S -o pr15826.s
PASS: gcc.dg/tree-ssa/pr15826.c (test for excess errors)
FAIL: gcc.dg/tree-ssa/pr15826.c scan-tree-dump-times optimized " & | goto " 0
Comment 14 Martin Sebor 2016-03-01 03:45:34 UTC
(In reply to Andrew Pinski from comment #12)

Thank you for catching it.  I did actually read all the comments.  The trouble is that there are several test cases here and I missed the one in the second half of comment #6 (the first half of the comment doesn't seem to apply anymore, at least not on x86_64).

(In reply to Bill Seurer from comment #13)

You're right, the test fails on powerpc64le (though as far as I can see not on any other target with reported test results).  It seems like a useful test despite the above so rather than backing it out entirely I could disable it for powerpc64le until it's analyzed.

Bill (Seurer or Schmidt), does either of you have a preference for how to handle the failure?
Comment 15 Bill Schmidt 2016-03-01 14:29:20 UTC
My preference is to see the test properly resolved. :)  I don't think you should just XFAIL the powerpc64le case without understanding why it fails, as that tends to leave the XFAIL in place forever.

Here is pr15826.c.211t.optimized on powerpc64le with -O2.  Hopefully that will help you see what's going on.  Please let me know if there is other data I can gather that will be useful.  Thanks!

Bill

;; Function foo (foo, funcdef_no=0, decl_uid=2355, cgraph_uid=0, symbol_order=0)

foo (struct s * p)
{
  unsigned int _4;
  unsigned int _5;
  unsigned int _7;

  <bb 2>:
  _4 = BIT_FIELD_REF <*p_3(D), 32, 0>;
  _5 = _4 & 1;
  _7 = _5;
  return _7;

}



;; Function bar (bar, funcdef_no=1, decl_uid=2358, cgraph_uid=1, symbol_order=1)

bar (struct s * p)
{
  <unnamed-unsigned:1> _3;
  unsigned int _4;

  <bb 2>:
  _3 = p_2(D)->bit;
  _4 = (unsigned int) _3;
  return _4;

}



;; Function andrew (andrew, funcdef_no=2, decl_uid=2361, cgraph_uid=2, symbol_order=2)

andrew (struct s * p)
{
  unsigned int _4;
  unsigned int _5;
  unsigned int _6;

  <bb 2>:
  _4 = BIT_FIELD_REF <*p_3(D), 32, 0>;
  _5 = _4 & 1;
  _6 = _5;
  return _6;

}
Comment 16 Richard Biener 2016-06-29 13:53:52 UTC
(In reply to Steven Bosscher from comment #6)
> The tree dump for the original test case now looks like this for me:
> 
> ;; Function foo (foo)
> 
> foo (p)
> {
> <bb 2>:
>   return (unsigned int) ((BIT_FIELD_REF <*p, 8, 0> & 1) != 0);
> 
> }
> 
> 
> 
> ;; Function bar (bar)
> 
> bar (p)
> {
> <bb 2>:
>   return (unsigned int) p->bit;
> 
> }
> 
> 
> 
> The resulting assembler output is the same, but I imagine VRP should be able
> to fold away the "& 1" test.  I don't know if the BIT_FIELD_REF itself
> should be optimized away, but I guess so.  Consider the following test case:
> 
> struct s
> {
>   unsigned int bit:1;
> };
> 
> unsigned int
> foo (struct s *p)
> {
>   if (p->bit)
>     return (unsigned int) (p->bit);
>   else
>     return 0;
> }
> 
> 
> This gets "optimized" at the tree level to this ugly code:
> ;; Function foo (foo)
> 
> foo (p)
> {
>   unsigned int D.1979;
> 
> <bb 2>:
>   if ((BIT_FIELD_REF <*p, 8, 0> & 1) != 0) goto <L0>; else goto <L4>;
> 
> <L4>:;
>   D.1979 = 0;
>   goto <bb 5> (<L2>);
> 
> <L0>:;
>   D.1979 = (unsigned int) p->bit;
> 
> <L2>:;
>   return D.1979;
> 
> }
> 
> In summary, I don't think we can close this bug just yet.

I don't think VRP can optimize anything here as the BIT_FIELD_REF created
by optimize_bitfield_compare accesses struct s tail-padding.

IMHO this is still very premature optimization done by fold.
Comment 17 Matthew Fortune 2016-12-20 11:36:46 UTC
(In reply to Bill Schmidt from comment #15)
> ;; Function andrew (andrew, funcdef_no=2, decl_uid=2361, cgraph_uid=2,
> symbol_order=2)
> 
> andrew (struct s * p)
> {
>   unsigned int _4;
>   unsigned int _5;
>   unsigned int _6;
> 
>   <bb 2>:
>   _4 = BIT_FIELD_REF <*p_3(D), 32, 0>;
>   _5 = _4 & 1;
>   _6 = _5;
>   return _6;
> 
> }

MIPS sees this same spurious failure. The issue (on a GCC 6 branch) is that the optimization has successfully happened; it just looks like the check in the testcase is wrong. The presence of a logical and is not harmful, as it is a legitimate way to truncate: (from match.pd)

    /* A truncation to an unsigned type (a zero-extension) should be
       canonicalized as bitwise and of a mask.  */

This simplification kicks in for MIPS and presumably powerpc but does not (I don't know why) for x86_64. There is no 'goto' demonstrating that a single bit extract does not use an IF.

I therefore propose we just update the check to verify there is no 'goto'.

Matthew
Comment 18 Bill Schmidt 2017-01-25 18:17:48 UTC
I agree with Matthew.
Comment 19 Jeffrey A. Law 2020-04-19 18:28:19 UTC
So the core issue here, using an if, conditional moves and the like to do single bit field extraction/testing is resolved.

There is still an issue of canonicalizing the two representations which in turn opens up the possibility of finding more common subexpressions when both forms might be used.  That is actually being tracked via pr81601.