This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug c++/32412] New: Passing struct as parameter breaks SRA for stack-allocated struct inside called function
- From: "scovich at gmail dot com" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 20 Jun 2007 08:30:19 -0000
- Subject: [Bug c++/32412] New: Passing struct as parameter breaks SRA for stack-allocated struct inside called function
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
sra-bug.C (below) contains a function which stack-allocates a local struct
containing two small arrays. The function depends on SRA to eliminate repeated
memory accesses to the two arrays as it streams over a large, third array.
The performance of the executables resulting from
g++ -Wall -O3 -msse3 -fpeel-loops sra-bug.C
and
g++ -Wall -O3 -msse3 -fpeel-loops sra-bug.C -DTRIGGER_BUG
differs by exactly 2x on my machine (a 2.66GHz Core2 quad Xeon), with the
runtime increasing from .395 ns/value/entry to .790 ns/value/entry.
The only difference between the two versions is whether the array pointer and
count are passed as separate arguments (fast) or wrapped in a struct (slow),
even though the latter gets copied into local variables before use. Use of the
__restrict keyword didn't seem to make a difference. The assembler output shows
that excessive loads and stores nearly double the instruction count of the
unrolled inner loop for the slower case.
FYI gcc-4.2.0 shows similar behavior, though its output is slower than 4.1 for
both cases (.420 ns/value/entry for the fast version vs. 1.10 ns/value/entry for
the slow one). gcc-4.3-20070617 performs equally badly on both versions of the
code (.690 ns/value/entry).
sra-bug.C:
===========================================================
#include <emmintrin.h>
#include <stdint.h>
#include <cassert>
#include <cstdio>
#include <sys/time.h>
/*
 * Wall-clock stopwatch built on gettimeofday().
 *
 * The most recent reading is kept both in raw form (tv) and folded into a
 * single microsecond count (mark).
 */
struct stopwatch_t {
    struct timeval tv;   /* raw result of the latest gettimeofday() call */
    long long mark;      /* the same instant expressed in microseconds */

    /* Take the first reading as soon as the stopwatch exists. */
    stopwatch_t() { reset(); }

    /*
     * Return the time elapsed since the previous reading, in nanoseconds,
     * and restart the stopwatch from "now".
     */
    double time_ns() {
        long long const previous = mark;
        reset();
        long long const elapsed_us = mark - previous;
        return 1e3 * elapsed_us;   /* microseconds -> nanoseconds */
    }

    /* Sample the clock and refresh both tv and mark. */
    void reset() {
        gettimeofday(&tv, NULL);
        mark = tv.tv_sec * 1000000ll + tv.tv_usec;
    }
};
/*
 * Apply `action` to entries[0] .. entries[N-1], in order.
 *
 * N is a template parameter, so the trip count is known at compile time.
 * `action` is taken by reference, so any state it accumulates is visible to
 * the caller afterwards.
 */
template<int N, class T, class Action>
inline void unrolled_loop(T* entries, Action &action) {
    int idx = 0;
    while (idx < N) {
        action(entries[idx]);
        ++idx;
    }
}
/* 128-bit vector constant whose two 64-bit halves are both zero. */
static __m128i const ALL_ZEROS = {0ull, 0ull};
/* 128-bit vector constant whose two 64-bit halves are both ~0 (every bit set). */
static __m128i const ALL_ONES = {~0ull, ~0ull};
/*
 * Number of search values one Action16 handles: it sizes Action16's
 * _results/_values arrays and is the stride of the value loop in main().
 */
static int const COUNT=4;
/*
 * Functor that checks COUNT 16-bit search values against a stream of
 * 128-bit entries.
 *
 * Construction broadcasts each search value across a vector and clears the
 * accumulators; every call ORs in the lane-wise equality mask for that
 * entry; destruction writes one summary vector per search value to the
 * destination array (ALL_ONES if any byte lane ever matched, ALL_ZEROS
 * otherwise).
 */
struct Action16 {
    __m128i _results[COUNT];   /* OR of every comparison mask seen so far */
    __m128i _values[COUNT];    /* each search value replicated in all 16-bit lanes */
    __m128i* _dest;            /* receives COUNT summary vectors in the destructor */

    /* Only the low 16 bits of each values[k] are used (cast to short). */
    Action16(__m128i* dest, uint64_t const* values) : _dest(dest) {
        for (int k = 0; k < COUNT; ++k) {
            _values[k] = _mm_set1_epi16((short) values[k]);
            _results[k] = ALL_ZEROS;
        }
    }

    /* Fold one entry into the running match masks. */
    void operator()(__m128i const &entry) {
        for (int k = 0; k < COUNT; ++k) {
            __m128i const hits = _mm_cmpeq_epi16(_values[k], entry);
            _results[k] = _results[k] | hits;
        }
    }

    /* Publish the per-value verdicts to _dest[0 .. COUNT-1]. */
    ~Action16() {
        for (int k = 0; k < COUNT; ++k) {
            /* movemask is non-zero iff at least one byte lane has its top bit set */
            if (_mm_movemask_epi8(_results[k]))
                _dest[k] = ALL_ONES;
            else
                _dest[k] = ALL_ZEROS;
        }
    }
};
/*
 * Bundles the entry array and its length into one object. When TRIGGER_BUG
 * is defined, foo() receives these two values through a `wrapper const &`
 * instead of as separate arguments.
 */
struct wrapper {
/* first element of the entry array (read-only) */
__m128i const* entries;
/* number of elements reachable through `entries` */
int count;
};
/*
 * Compare COUNT search values (values[0 .. COUNT-1]) against every entry and
 * store one summary vector per value in dest[0 .. COUNT-1].
 *
 * The two signatures differ only in how the entry array and its length
 * arrive: wrapped in a `wrapper` (TRIGGER_BUG) or as two plain arguments.
 * Either way they are copied into the locals `entries` and `count` before
 * use, so the body below is shared.
 *
 * `count` must be a multiple of unroll_count (asserted). The results are
 * written when `action` is destroyed at the end of the function.
 */
#ifdef TRIGGER_BUG
void foo(__m128i* dest, uint64_t const* values, wrapper const &w) {
    __m128i const* entries = w.entries;
    int count = w.count;
#else
void foo(__m128i* dest, uint64_t const* values, __m128i const* entries, int count) {
#endif
    static int const unroll_count = 16;
    Action16 action(dest, values);
    assert((count % unroll_count) == 0);
    /* `<=` so the final full chunk [count-unroll_count, count) is processed too. */
    for (int i = 0; i + unroll_count <= count; i += unroll_count)
        unrolled_loop<unroll_count>(&entries[i], action);
}
int main() {
int VALUE_COUNT = 1000000;
int LIST_SIZE = 2048;
uint64_t* values = new uint64_t[VALUE_COUNT];
__m128i* dest = (__m128i*) _mm_malloc(16*VALUE_COUNT, 16);
__m128i entries[LIST_SIZE];
wrapper w = {entries, LIST_SIZE};
stopwatch_t timer;
for(int j=0; j < 5; j++) {
for(int i=0; i < VALUE_COUNT; i+= COUNT) {
#ifdef TRIGGER_BUG
foo(dest+i, values+i, w);
#else
foo(dest+i, values+i, entries, LIST_SIZE);
#endif
}
printf("%.3lf ns/value/entry\n", timer.time_ns()/LIST_SIZE/VALUE_COUNT);
}
}
--
Summary: Passing struct as parameter breaks SRA for stack-allocated
struct inside called function
Product: gcc
Version: 4.1.2
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: c++
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: scovich at gmail dot com
GCC target triplet: x86_64-unknown-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=32412