Bug 31802 - SSE2 performance is deteriorating when __m128 is placed in union
Summary: SSE2 performance is deteriorating when __m128 is placed in union
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: middle-end (show other bugs)
Version: 4.1.2
Importance: P3 enhancement
Target Milestone: 4.5.0
Assignee: Not yet assigned to anyone
URL:
Keywords: missed-optimization
Depends on:
Blocks:
 
Reported: 2007-05-03 19:27 UTC by Yuri
Modified: 2021-08-05 21:00 UTC (History)
2 users (show)

See Also:
Host:
Target:
Build:
Known to work: 4.5.3, 4.7.1
Known to fail: 4.4.7
Last reconfirmed:


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Yuri 2007-05-03 19:27:37 UTC
When I compile the following testcase with the '-O3 -msse3' options, it runs in
22.562 sec; without the 'union' clause it runs in 0.280 sec. The two versions should take the same time.

-- begin testcase --
// 16-byte vector whose element type is float, despite the "__v2df" name.
typedef float __v2df __attribute__ ((__vector_size__ (16)));
// __m128 is an alias for the same vector type.
typedef __v2df __m128;

// Returns the result of __builtin_ia32_subps(__A, __B).
// Note: the wrapper is named "_pd" but calls the "ps" builtin; this is part of
// the testcase as reported and is left as is.
static __inline __m128 _mm_sub_pd (__m128 __A, __m128 __B) { return
(__m128)__builtin_ia32_subps ((__v2df)__A, (__v2df)__B); }
// Returns the result of __builtin_ia32_addps(__A, __B).
// Note: the wrapper is named "_pd" but calls the "ps" builtin; this is part of
// the testcase as reported and is left as is.
static __inline __m128 _mm_add_pd (__m128 __A, __m128 __B) { return
(__m128)__builtin_ia32_addps ((__v2df)__A, (__v2df)__B); }
// Builds a vector from four floats using a vector initializer, with the
// arguments placed in the order given: { __Z, __Y, __X, __W }.
static __inline __m128 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
{ return __extension__ (__m128)(__v2df){ __Z, __Y, __X, __W }; }

// Thin wrapper around one __m128 value with + and - operators implemented
// through the _mm_add_pd / _mm_sub_pd helpers.
struct FF {
  // Anonymous union overlaying the vector with a float[4] view. Per the
  // report, this union form is the slow case; the plain member below is not.
  union {__m128 d; float f[4];}; // problem
  // __m128 d; // no problem

  // Default constructor: does not initialize the members.
  __inline FF() { }
  // Stores the given vector in d.
  __inline FF(__m128 new_d) : d(new_d) { }
  // Sets d to a vector built from four copies of f.
  __inline FF(float f) : d(_mm_setr_ps(f, f, f, f)) { }

  // Both operators take the right-hand operand by value and return a new FF.
  __inline FF operator+(FF other) { return (FF(_mm_add_pd(d,other.d))); }
  __inline FF operator-(FF other) { return (FF(_mm_sub_pd(d,other.d))); }
};

// Input buffer of 1024*1024 floats; main fills it and then reads it back
// through an __m128 pointer.
float f[1024*1024];

// Benchmark driver: fills f, then makes 1000 passes over it as __m128 vectors,
// accumulating into total. Nothing is printed; the report compares run time.
int main() {
  int i;

  // The divisor 1024*1024 + 10 - i is at least 11 over this loop's range.
  for (i = 0; i < 1024*1024; i++) { f[i] = 1.f/(1024*1024 + 10 - i); }

  FF total(0.f);

  for (int rpt = 0; rpt < 1000; rpt++) {
  // The `c` declared here is shadowed by the loop-local `c` below and is
  // never read.
  FF p1(0.f), p2(0.), c;

  // Walk the float array one __m128 (four floats) at a time.
  __m128 *pf = (__m128*)f;
  for (i = 0; i < 1024*1024/4; i++) {
    FF c(*pf++);

    total = total + c - p2 + p1;

    // Shift the history: p1 takes the previous p2, p2 takes the current vector.
    p1 = p2;
    p2 = c;
  }
  }
}
-- end testcase

This bug has a testcase similar to that of bug 25500 (which is now fixed); only the 'union' clause was added.

Yuri
Comment 1 Andrew Pinski 2007-05-03 21:44:03 UTC
This is because the union does not get the correct mode (it gets BLK or TImode) so GCC stores the union to the stack all the time.
Comment 2 Andrew Pinski 2021-08-05 21:00:32 UTC
Fixed by the new SRA implementation way back in 2009 (r0-93753).