This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug c++/32412] New: Passing struct as parameter breaks SRA for stack-allocated struct inside called function


sra-bug.C (below) contains a function which stack-allocates a local struct
containing two small arrays. The function depends on SRA to eliminate repeated
memory accesses to the two arrays as it streams over a large, third array.

The performance of the executables resulting from
    g++ -Wall -O3 -msse3 -fpeel-loops sra-bug.C
and
    g++ -Wall -O3 -msse3 -fpeel-loops sra-bug.C -DTRIGGER_BUG
differs by exactly 2x on my machine (a 2.66GHz Core2 quad Xeon), with the
runtime increasing from .395 ns/value/entry to .790 ns/value/entry. 

The only difference between the two versions is whether the array pointer and
count are passed as separate arguments (fast) or wrapped in a struct (slow),
even though the latter gets copied into local variables before use. Use of the
__restrict keyword didn't seem to make a difference. The assembler output shows
that excessive loads and stores nearly double the instruction count of the
unrolled inner loop for the slower case.

FYI gcc-4.2.0 shows similar behavior, though its output is slower than 4.1 for
both cases (.420ns vs 1.10ns). gcc-4.3-20070617 performs equally badly on both
versions of the code (.690 ns/value/entry). 

sra-bug.C:
===========================================================
#include <emmintrin.h>
#include <stdint.h>
#include <cassert>
#include <cstdio>
#include <sys/time.h>

// Minimal wall-clock stopwatch built on gettimeofday().
struct stopwatch_t {
    struct timeval tv;  // scratch storage for the latest gettimeofday() reading
    long long mark;     // timestamp of the last reset(), in microseconds

    // Starts timing immediately on construction.
    stopwatch_t() { reset(); }

    // Returns the time elapsed since the previous reset()/time_ns() call,
    // in nanoseconds, and restarts the stopwatch from "now".
    double time_ns() {
        long long const previous = mark;
        reset();
        long long const elapsed_us = mark - previous;
        return 1e3 * elapsed_us;  // microseconds -> nanoseconds
    }

    // Records the current time of day as the new reference point.
    void reset() {
        gettimeofday(&tv, NULL);
        mark = tv.tv_sec*1000000ll + tv.tv_usec;
    }
};

// Applies `action` to entries[0] .. entries[N-1], in order.
// N is a compile-time constant, so the trip count is known to the optimizer;
// whether the loop is actually unrolled is left to the compiler.
//   entries - pointer to at least N consecutive elements of type T
//   action  - callable invoked as action(entries[i]); taken by reference so
//             any state it accumulates is visible to the caller
template<int N, class T, class Action>
inline void unrolled_loop(T* entries, Action &action) {
  for(int i=0; i < N; i++) action(entries[i]);
}

// 128-bit vector with every bit clear.
static __m128i const ALL_ZEROS = {0ull, 0ull};
// 128-bit vector with every bit set.
static __m128i const ALL_ONES = {~0ull, ~0ull};
// Number of search values (and result slots) handled by one Action16.
static int const COUNT=4;

// Function object that checks a stream of 128-bit entries for COUNT 16-bit
// search values at once. Matches are accumulated in the two small member
// arrays while entries are fed through operator(); the final per-value
// answers are written to `_dest` when the object is destroyed.
struct Action16 {
  // Running OR of the lane-wise equality masks, one accumulator per value.
  __m128i _results[COUNT];
  // Each search value replicated into every 16-bit lane of a vector.
  __m128i _values[COUNT];
  // Output array; receives COUNT vectors in the destructor.
  __m128i* _dest;
  // dest   - where the COUNT result vectors are stored on destruction
  // values - COUNT search values; each is truncated to `short` before being
  //          broadcast
  Action16(__m128i* dest, uint64_t const* values) : _dest(dest) {
    for(int i=0; i < COUNT; i++) {
      _results[i] = ALL_ZEROS;
      _values[i] = _mm_set1_epi16((short) values[i]);
    }
  }
  // Compares `entry` against every search value and ORs the resulting
  // 16-bit-lane equality masks into the matching accumulator.
  void operator()(__m128i const &entry) {
    for(int i=0; i < COUNT; i++)
      _results[i] |= _mm_cmpeq_epi16(_values[i], entry);
  }
  // Publishes the results: _dest[i] becomes ALL_ONES if any bit of the
  // accumulated mask for value i is set (non-zero movemask), else ALL_ZEROS.
  ~Action16() {
    for(int i=0; i < COUNT; i++)
      _dest[i] = _mm_movemask_epi8(_results[i])? ALL_ONES : ALL_ZEROS;
  }
};

// Bundles the entry array pointer and its length into one aggregate.
// Passing this to foo() (TRIGGER_BUG build) instead of the two separate
// arguments is the only difference between the fast and slow variants.
struct wrapper {
  __m128i const* entries;  // start of the entry array
  int count;               // number of elements in `entries`
};

// Scans `entries` for the COUNT search values in `values` and stores one
// result vector per value in `dest` (written by Action16's destructor when
// `action` goes out of scope at the end of the function).
//
// Two interchangeable signatures are provided:
//   TRIGGER_BUG defined   - the array pointer and length arrive wrapped in a
//                           `wrapper` and are copied to locals before use
//   TRIGGER_BUG undefined - the array pointer and length are separate arguments
// Everything after the #endif is shared by both variants.
//
//   dest   - receives COUNT result vectors
//   values - COUNT search values
//   count  - number of entries; must be a multiple of unroll_count (asserted)
#ifdef TRIGGER_BUG
void foo(__m128i* dest, uint64_t const* values, wrapper const &w) {
  __m128i const* entries = w.entries;  int count = w.count;
#else
void foo(__m128i* dest, uint64_t const* values, __m128i const* entries, int count) {
#endif
  static int const unroll_count=16;
  Action16 action(dest, values);
  assert((count % unroll_count) == 0);
  // NOTE(review): because the bound is `<` rather than `<=`, the last
  // unroll_count entries are never visited when count is a multiple of
  // unroll_count. Left as-is so the timings quoted in the report remain
  // reproducible with this exact loop.
  for(int i=0; i+unroll_count < count; i+=unroll_count)
    unrolled_loop<unroll_count>(&entries[i], action);
}

// Benchmark driver: repeatedly runs foo() over a fixed entry list for a
// large batch of search values and prints the average cost per
// (value, entry) pair in nanoseconds for each of five passes.
int main() {
  int VALUE_COUNT = 1000000;  // total number of search values
  int LIST_SIZE = 2048;       // number of 128-bit entries scanned per foo() call
  // NOTE(review): `values` and `entries` are never written before foo()
  // reads them, so the benchmark runs on indeterminate data.
  uint64_t* values = new uint64_t[VALUE_COUNT];
  // 16 bytes per result vector, 16-byte aligned for __m128i stores.
  __m128i* dest = (__m128i*) _mm_malloc(16*VALUE_COUNT, 16);
  // LIST_SIZE is a non-const int, so this is a runtime-sized array
  // (a compiler extension in C++).
  __m128i entries[LIST_SIZE];
  wrapper w = {entries, LIST_SIZE};
  stopwatch_t timer;
  for(int j=0; j < 5; j++) {
    // Each foo() call consumes COUNT values and produces COUNT results.
    for(int i=0; i < VALUE_COUNT; i+= COUNT) {
#ifdef TRIGGER_BUG
      foo(dest+i, values+i, w);
#else
      foo(dest+i, values+i, entries, LIST_SIZE);
#endif
    }
    // time_ns() also restarts the stopwatch, so each line covers one pass.
    printf("%.3lf ns/value/entry\n", timer.time_ns()/LIST_SIZE/VALUE_COUNT);
  }
  // NOTE(review): `values` and `dest` are not released before returning.
}


-- 
           Summary: Passing struct as parameter breaks SRA for stack-
                    allocated struct inside called function
           Product: gcc
           Version: 4.1.2
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: c++
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: scovich at gmail dot com
GCC target triplet: x86_64-unknown-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=32412


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]