Bug 35363 - Missing bit field coalescing optimization
Summary: Missing bit field coalescing optimization
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: target
Version: 4.6.0
Importance: P3 enhancement
Target Milestone: 8.0
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2008-02-25 05:38 UTC by davidxl
Modified: 2021-11-28 07:20 UTC
CC: 5 users

See Also:
Host:
Target: x86_64-*-*
Build:
Known to work:
Known to fail:
Last reconfirmed: 2008-12-29 06:08:22


Description davidxl 2008-02-25 05:38:52 UTC
// David Li

struct A{
  int b1: 3;
  int b2: 3;
  int b3: 2;
  int b4: 17;
}a;
void foo()
{
    a.b1 = 2;
    a.b2 = 3;
    a.b4 = 8;
}

This requires one LOAD + one OR + one STORE, but the generated code looks like:

foo:
.LFB2:
        movzbl  a(%rip), %eax
        andl    $-64, %eax
        orl     $26, %eax
        movb    %al, a(%rip)
        movl    a(%rip), %eax
        andl    $-33554177, %eax
        orb     $8, %ah
        movl    %eax, a(%rip)
        ret
Comment 1 Richard Biener 2008-02-25 11:36:38 UTC
(Please make our work easier: file initial reports with Severity enhancement,
add missed-optimization as a Keyword, and report against a current gcc
version, preferably 4.3.0 or 4.4.0.  Testcases ready for inclusion into the
gcc testsuite would be nice as well, as would checking for duplicate reports
yourself.)

This testcase actually requires one load, one and (to mask out the bits of the
affected bitfield parts), one or, and one store.  The MEM_REF work might lay
the groundwork for this: it makes the read-modify-write cycles explicit at the
tree level and thus allows the loads and stores to be CSEd.
Comment 2 Andrew Pinski 2008-02-25 18:38:18 UTC
This works correctly for PowerPC*.
PPC64 (with GCC 4.0.1):
        lwz r0,0(r11)
        rlwinm r0,r0,0,3,31
        oris r0,r0,0x4000
        and r0,r0,r9
        oris r0,r0,0xc00
        and r0,r0,r2
        ori r0,r0,1024
        stw r0,0(r11)


PPC32 (GCC 4.0.2):
        lwz r0,0(r9)
        rlwimi r0,r2,29,0,2
        li r2,3
        rlwimi r0,r2,26,3,5
        li r2,8
        rlwimi r0,r2,7,8,24
        stw r0,0(r9)


Comment 3 Richard Biener 2008-02-25 18:44:59 UTC
I would expect the equivalent of

  int tmp = a;
  tmp &= 0x0030;  // fix the mask to be correct, all bits of b3
  tmp |= 2 | 3 | 8; // constant folded and properly shifted
  a = tmp;

I still see three ORs for ppc64 and non-combined rlwimi ops for ppc32.
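
Concretely, for this layout (b1 in bits 0-2, b2 in bits 3-5, b3 in bits 6-7, b4 in bits 8-24, padding in bits 25-31) the coalesced read-modify-write would be roughly the following sketch, treating the struct as one 32-bit word (illustrative only, not strict-aliasing-clean C):

unsigned tmp = *(unsigned *) &a;    /* one load                            */
tmp &= 0xFE0000C0u;                 /* keep b3 and the padding bits 25-31  */
tmp |= 2u | (3u << 3) | (8u << 8);  /* the three constants folded: 2074    */
*(unsigned *) &a = tmp;             /* one store                           */

0xFE0000C0 is -33554240 as a signed 32-bit constant, i.e. the same and-mask that appears in the generated code.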
Comment 4 Richard Biener 2008-02-25 18:49:25 UTC
So, the same code should be generated for

union {
struct {
  int b1: 3;
  int b2: 3;
  int b3: 2;
  int b4: 17;
}a;
int b;
} a;
void foo()
{
    a.a.b1 = 2;
    a.a.b2 = 3;
    a.a.b4 = 8;
}
void bar()
{
 a.b = (a.b & (-1u >> (sizeof(a.b)*8 - 2) << 6)) | 2 | (3 << 3) | (8 << 17);
}

modulo errors I made in the bar() case ;)
Comment 5 Richard Biener 2008-02-25 18:51:48 UTC
With MEM_REF I get on i686

foo:
        movl    a, %eax
        pushl   %ebp
        movl    %esp, %ebp
        popl    %ebp
        andl    $-33554240, %eax
        orl     $2074, %eax
        movl    %eax, a
        ret
        .size   foo, .-foo
        .p2align 4,,15
.globl bar
        .type   bar, @function
bar:
        movl    a, %eax
        pushl   %ebp
        movl    %esp, %ebp
        popl    %ebp
        andl    $192, %eax
        orl     $1048602, %eax
        movl    %eax, a
        ret

which shows that I made an error in the source ;)
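
For reference, the error appears to be in the shift for b4 (it starts at bit 8, not 17) and in the and-mask (which should also preserve the padding bits); a bar() that matches foo's coalesced store would be something like this sketch, assuming the same union as above:

void bar (void)
{
  /* keep b3 (bits 6-7) plus the padding bits; -33554240 == 0xFE0000C0 */
  a.b = (a.b & -33554240) | 2 | (3 << 3) | (8 << 8);
}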
Comment 6 Richard Biener 2008-02-25 18:54:13 UTC
The IL after MEM_REF lowering looks like

bar ()
{
  int D.1193;
  unsigned int D.1192;
  unsigned int D.1191;
  unsigned int D.1190;
  int D.1189;

  D.1189 = MEM <int {0}, &a>;
  D.1190 = (unsigned int) D.1189;
  D.1191 = D.1190 & 192;
  D.1192 = D.1191 | 1048602;
  D.1193 = (int) D.1192;
  MEM <int {0}, &a> = D.1193;
  return;
}

foo ()
{
  int MEML.2;
  int MEML.1;
  int MEML.0;

  MEML.0 = MEM <int {0}, &a>;
  MEML.0 = BIT_FIELD_EXPR <MEML.0, 2, 3, 0>;
  MEM <int {0}, &a> = MEML.0;
  MEML.1 = MEM <int {0}, &a>;
  MEML.1 = BIT_FIELD_EXPR <MEML.1, 3, 3, 3>;
  MEM <int {0}, &a> = MEML.1;
  MEML.2 = MEM <int {0}, &a>;
  MEML.2 = BIT_FIELD_EXPR <MEML.2, 8, 17, 8>;
  MEM <int {0}, &a> = MEML.2;
  return;
}
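
Each BIT_FIELD_EXPR <word, value, width, position> above apparently inserts the low `width` bits of `value` into `word` starting at bit `position`; a small C model of that operation (a hypothetical helper for illustration, not GCC source):

/* Model of BIT_FIELD_EXPR <word, val, width, pos>: insert the low
   WIDTH bits of VAL into WORD starting at bit POS.  */
static unsigned
bit_field_insert (unsigned word, unsigned val, unsigned width, unsigned pos)
{
  unsigned mask = (width < 32 ? (1u << width) - 1u : ~0u) << pos;
  return (word & ~mask) | ((val << pos) & mask);
}

With that reading, foo's three inserts put 2 at bit 0 (width 3), 3 at bit 3 (width 3) and 8 at bit 8 (width 17), which combine to the single and/or sequence shown for bar.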
Comment 7 davidxl 2010-11-03 00:20:46 UTC
(In reply to comment #6)

LLVM generates the expected code:

.Ltmp1:
	movl	$-33554240, %eax
	andl	a(%rip), %eax
	orl	$2074, %eax
	movl	%eax, a(%rip)

open64 is also bad.
Comment 8 Andrew Pinski 2018-04-18 03:47:47 UTC
There are two issues here: one is SLOW_BYTE_ACCESS and the second is the lowering of bit-fields.  I am going to fix the lowering part for GCC 9.  SLOW_BYTE_ACCESS was being looked at by someone else in the recent past, but I can't find the email about it.
Comment 9 Andrew Pinski 2021-11-28 07:20:25 UTC
Store merging fixes this for GCC 8, so closing as fixed:
  _5 = MEM[(struct A *)&a];
  _6 = _5 & 4261413056;
  _7 = _6 | 2074;
  MEM[(struct A *)&a] = _7;

I will have to make sure we don't regress when adding the bit-field lowering.