Bug 45090 - x86_64 passing structure by value to a non-inlined function causes register-resident structures to flush to stack
x86_64 passing structure by value to a non-inlined function causes register-r...
Status: UNCONFIRMED
Product: gcc
Classification: Unclassified
Component: middle-end
4.5.0
: P3 normal
: ---
Assigned To: Not yet assigned to anyone
:
Depends on:
Blocks:
  Show dependency treegraph
 
Reported: 2010-07-26 21:15 UTC by Garth A. Dickie
Modified: 2010-07-26 21:24 UTC (History)
1 user (show)

See Also:
Host: x86_64-unknown-linux-gnu
Target: x86_64-unknown-linux-gnu
Build: x86_64-unknown-linux-gnu
Known to work:
Known to fail:
Last reconfirmed:


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Garth A. Dickie 2010-07-26 21:15:46 UTC
I am running 4.5.0, built locally from a gcc.gnu.org distribution, on Ubuntu 10.04. 

When switching from -m32 to -m64 a dispatch loop in my code gets noticeably slower (20% slower in 4.3.4 and 4.4.3, 10% slower in 4.5.0).

Investigation of the generated assembly shows that register-resident structures are being flushed to locations on the stack around a call through a function pointer. If I change the function call to take scalar arguments rather than a structure passed by value, then the stack writes go away and performance improves to be about 10% faster than the 32-bit code. 

The small testcase below includes three examples. The first passes a structure by value through a function pointer and exhibits the problem. The second passes scalars instead of a structure, which shows the workaround. The third is a trivial example that passes an empty structure, and it also exhibits a version of the problem.

FWIW, my production code exhibits a particularly egregious version of the problem, but I cannot seem to reproduce it in a small example: the non-inlined function call is at the bottom of several layers of inlined functions, and a single register-resident structure is being flushed to multiple stack locations (one per inlined stack frame?) around each call to the function.



Output of "g++ -v -save-temps -O3 -S test.cpp":

Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with: ./configure
Thread model: posix
gcc version 4.5.0 (GCC) 
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-S' '-mtune=generic' '-march=x86-64'
 /usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/cc1plus -E -quiet -v -D_GNU_SOURCE test.cpp -mtune=generic -march=x86-64 -O3 -fpch-preprocess -o test.ii
ignoring nonexistent directory "/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../x86_64-unknown-linux-gnu/include"
#include "..." search starts here:
#include <...> search starts here:
 /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../include/c++/4.5.0
 /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../include/c++/4.5.0/x86_64-unknown-linux-gnu
 /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../include/c++/4.5.0/backward
 /usr/local/include
 /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/include
 /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/include-fixed
 /usr/include
End of search list.
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-S' '-mtune=generic' '-march=x86-64'
 /usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/cc1plus -fpreprocessed test.ii -quiet -dumpbase test.cpp -mtune=generic -march=x86-64 -auxbase test -O3 -version -o test.s
GNU C++ (GCC) version 4.5.0 (x86_64-unknown-linux-gnu)
	compiled by GNU C version 4.5.0, GMP version 4.3.2, MPFR version 2.4.2-p1, MPC version 0.8.1
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
GNU C++ (GCC) version 4.5.0 (x86_64-unknown-linux-gnu)
	compiled by GNU C version 4.5.0, GMP version 4.3.2, MPFR version 2.4.2-p1, MPC version 0.8.1
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
Compiler executable checksum: 469157b70a6e6ab9e09e15344033d953
COMPILER_PATH=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/
LIBRARY_PATH=/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../lib64/:/lib/../lib64/:/usr/lib/../lib64/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../:/lib/:/usr/lib/
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-S' '-mtune=generic' '-march=x86-64'



And the contents of test.ii afterwards:

# 1 "test.cpp"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "test.cpp"
// Pointer/counter pair that the test functions keep in a local and update
// on every loop iteration.
struct bitPointer {
    unsigned int * a;  // element pointer; the loops advance it when b wraps
    unsigned int b;    // counter; the loops keep it in 0..7 via "& 0x7"
};

// Function pointer (defined elsewhere) whose target takes a bitPointer by value.
extern void (*gCausesFlushToStack)(bitPointer p);

// Case 1: calls through the function pointer x times, passing the local
// structure by value each time. According to the report, this is the form
// for which the structure is written to the stack around the call.
void test1(unsigned int* a, int x) {
    bitPointer p = { a, 0 };

    for (int i = 0; i < x; ++i) {
        gCausesFlushToStack(p);

        // Step the pair: b counts 0..7, and a advances by one element
        // on the iteration where b wraps from 7 back to 0.
        p.a += (p.b + 1) >> 3;
        p.b = (p.b + 1) & 0x7;
    }
}

// Function pointer (defined elsewhere) whose target takes the same two
// values as separate scalar arguments instead of as a bitPointer.
extern void (*gSameValuesAsScalarsDoesntCauseFlush)(unsigned int* a, unsigned int b);

// Case 2: same loop as test1, but the two members are passed as scalars.
// The report describes this as the workaround that avoids the stack writes.
void test2(unsigned int* a, int x) {
    bitPointer p = { a, 0 };

    for (int i = 0; i < x; ++i) {
        gSameValuesAsScalarsDoesntCauseFlush(p.a, p.b);

        // Step the pair: b counts 0..7, and a advances by one element
        // on the iteration where b wraps from 7 back to 0.
        p.a += (p.b + 1) >> 3;
        p.b = (p.b + 1) & 0x7;
    }
}

// Structure with no members, passed by value in test3.
struct emptyObject { };
// Function pointer (defined elsewhere) whose target takes an emptyObject by value.
extern void (*gEvenEmptyStructureCanCauseFlush)(emptyObject object);

// Case 3: same loop again, but the call receives only a temporary empty
// structure; p itself is never passed to the callee. The report states that
// this form also exhibits a version of the problem.
void test3(unsigned int* a, int x) {
    bitPointer p = { a, 0 };

    for (int i = 0; i < x; ++i) {
        gEvenEmptyStructureCanCauseFlush(emptyObject());

        // Step the pair: b counts 0..7, and a advances by one element
        // on the iteration where b wraps from 7 back to 0.
        p.a += (p.b + 1) >> 3;
        p.b = (p.b + 1) & 0x7;
    }
}
Comment 1 Andrew Pinski 2010-07-26 21:24:42 UTC
I think this is correct behavior, given how the x86_64 ABI describes struct passing and the fact that almost all registers are volatile.

Oh, on the trunk test3 gets compiled into a simple loop without any struct being saved; that is a different issue with the movl, see PR 20408.