Bug 47059 - compiler fails to coalesce loads/stores
compiler fails to coalesce loads/stores
Status: UNCONFIRMED
Product: gcc
Classification: Unclassified
Component: tree-optimization
Version: 4.5.1
Importance: P3 normal
Target Milestone: ---
Assigned To: Not yet assigned to anyone
:
Depends on: 23684
Blocks:
  Show dependency treegraph
 
Reported: 2010-12-24 11:01 UTC by Rahul Kharche
Modified: 2013-09-18 10:06 UTC (History)
4 users (show)

See Also:
Host: i686-pc-linux-gnu
Target: i686-pc-linux-gnu
Build: i686-pc-linux-gnu
Known to work:
Known to fail:
Last reconfirmed:


Attachments
Patch Vs 4.5.2 Rev 167088 (3.62 KB, patch)
2011-01-15 12:32 UTC, Rahul Kharche
Details | Diff

Note You need to log in before you can comment on or make changes to this bug.
Description Rahul Kharche 2010-12-24 11:01:32 UTC
Consider the following test case compiled with GCC4.5.1 (x86) and the following command:

gcc -S -Os test.c

/* Aggregate holding a pointer followed by two adjacent unsigned short
   members.  The report attributes the extra loads/stores to SRA
   scalarizing f1 and f2 individually.  */
struct struct1
{
  void *data;
  unsigned short f1;
  unsigned short f2;
};
/* Shorthand used by the rest of the test case.  */
typedef struct struct1 S1;

/* Outer aggregate: an int member followed by an embedded S1.  The
   embedded member f4 is the destination of the struct copy in main.  */
struct struct2
{
  int f3;
  S1 f4;
};
/* Shorthand used by the rest of the test case.  */
typedef struct struct2 S2;


/* Declared here but defined elsewhere.  foo receives a non-const
   pointer, so the object it is given may be modified by the call.  */
extern void foo (S1 *ptr);
/* Destination object: main stores into its f4 member.  */
extern S2 gstruct2_var;
/* Source object: main passes its address to bar.  */
extern S1 gstruct1_var;

/* Prototype carrying the always_inline attribute for bar.  */
static S1 bar (const S1 *ptr) __attribute__ ((always_inline));

/* Copy *PTR into a local S1, pass the address of that local to foo,
   and return the local by value.  */
static S1
bar (const S1 *ptr)
{
  /* Whole-struct copy of the pointed-to object into a local.  */
  S1 ls_var = *ptr;
  /* The local's address is handed to the external function foo.  */
  foo (&ls_var);
  /* Returned by value; the caller stores it into another aggregate.  */
  return ls_var;
}

/* Entry point of the test case: store the S1 returned by
   bar (&gstruct1_var) into gstruct2_var.f4 and return 0.  */
int
main ()
{
  S2 *ps_var;

  /* The store below goes through a pointer to the global S2.  */
  ps_var = &gstruct2_var;
  /* Struct assignment whose generated loads/stores are compared in
     the assembly listings of this report.  */
  ps_var->f4 = bar (&gstruct1_var);

  return 0;
}

We get:

main:
        leal    4(%esp), %ecx
        andl    $-16, %esp
        pushl   -4(%ecx)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ecx
        subl    $32, %esp
        movl    gstruct1_var, %eax
        movl    gstruct1_var+4, %edx
        movl    %eax, -16(%ebp)
        leal    -16(%ebp), %eax
        pushl   %eax
        movl    %edx, -12(%ebp)
        call    foo
        movl    -16(%ebp), %eax
        movl    -4(%ebp), %ecx
        movl    %eax, gstruct2_var+4
        movl    -12(%ebp), %eax        <-- load1   [ebp - 12] @ 4 bytes
        movw    %ax, gstruct2_var+8    <-- store1  [gstruct2_var + 8] @ 2 bytes
        movw    -10(%ebp), %ax         <-- load2   [ebp - 10] @ 2 bytes
        movw    %ax, gstruct2_var+10   <-- store2  [gstruct2_var + 10] @ 2 bytes
        xorl    %eax, %eax
        leave
        leal    -4(%ecx), %esp
        ret
        .size   main, .-main
        .ident  "GCC: (GNU) 4.5.1"
        .section        .note.GNU-stack,"",@progbits


With GCC4.4.1 we get:

main:
        leal    4(%esp), %ecx
        andl    $-16, %esp
        pushl   -4(%ecx)
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ecx
        subl    $32, %esp
        movl    gstruct1_var, %eax
        movl    gstruct1_var+4, %edx
        movl    %eax, -16(%ebp)
        leal    -16(%ebp), %eax
        movl    %edx, -12(%ebp)
        pushl   %eax
        call    foo
        movl    -12(%ebp), %eax       <-- Load1 [ebp - 12] @ 4 bytes
        movl    -4(%ebp), %ecx
        movl    %eax, gstruct2_var+8  <-- Store1 [gstruct2_var + 8] @ 4 bytes
        movl    -16(%ebp), %eax
        movl    %eax, gstruct2_var+4
        xorl    %eax, %eax
        leave
        leal    -4(%ecx), %esp
        ret
        .size   main, .-main
        .ident  "GCC: (GNU) 4.4.1"
        .section        .note.GNU-stack,"",@progbits


The extra loads and stores appear to be the result of a change to SRA, which now fully scalarizes structure members f1 and f2. With GCC 4.4.1 the access to these fields is done using a BIT_FIELD_REF, which combines the two loads and stores.

Talking to MartinJ on IRC, I was told that the changes to SRA make it scalarize aggregates aggressively. In the past there was some functionality that tried to combine appropriate components into BIT_FIELD_REFs so as to reduce the number of loads/stores. This was removed in 4.5 in favour of simplicity of the GIMPLE IR and to work towards generic MEM_REFs. The plan is to introduce new IR constructs to load/store individual bits and to decide in a separate GIMPLE pass how to combine them. However, this will only be available in 4.7+.

We also see the exact same issue on our port, where it causes a significant performance regression in our software.
Comment 1 Rahul Kharche 2011-01-15 12:32:01 UTC
Created attachment 22974 [details]
Patch Vs 4.5.2 Rev 167088
Comment 2 Rahul Kharche 2011-01-15 12:43:27 UTC
This issue also exists on the trunk. I am in the process of bootstrap testing this patch on i686-pc-linux-gnu and will send it out once it checks out.
The attached patch is against 4.5.2 Rev 167088.
Comment 3 Denis Vlasenko 2013-09-18 10:06:53 UTC
I encountered this behavior with 4.8.0:

                struct pollfd pfd[3];
                ...
                pfd[2].events = POLLOUT;
                pfd[2].revents = 0;

This compiled to two separate 16-bit stores to adjacent locations, rather than a single combined 32-bit store:

        movw    $4, 44(%rsp)    #, pfd[2].events
        movw    $0, 46(%rsp)    #, pfd[2].revents