Bug 23570 - [4.0 Regression] internal compiler error: in merge_assigned_reloads, at reload1.c:6091
Summary: [4.0 Regression] internal compiler error: in merge_assigned_reloads, at reloa...
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: target (show other bugs)
Version: 4.0.2
: P2 normal
Target Milestone: 4.0.3
Assignee: Uroš Bizjak
URL: http://gcc.gnu.org/ml/gcc-patches/200...
Keywords: ice-on-valid-code, patch, ssemmx
Depends on:
Blocks:
 
Reported: 2005-08-26 02:26 UTC by Qian Chen
Modified: 2005-10-07 05:42 UTC (History)
2 users (show)

See Also:
Host:
Target: i686-pc-linux-gnu
Build:
Known to work: 3.4.0 4.1.0
Known to fail: 4.0.0
Last reconfirmed: 2005-08-31 08:48:43


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Qian Chen 2005-08-26 02:26:04 UTC
The compiler gives internal compiler error when I try to compile my program with
-O2.
If I compile with -O1, it's OK.

% gcc -O2 -msse2 a.c
a.c: In function 'ludcompf':
a.c:505: internal compiler error: in merge_assigned_reloads, at reload1.c:6091
Please submit a full bug report,
with preprocessed source if appropriate.
See <URL:http://gcc.gnu.org/bugs.html> for instructions.


gcc -v
Using built-in specs.
Target: i686-pc-linux-gnu
Configured with: ../gcc-4.0.2/configure --prefix=/usr --libexecdir=/usr/lib
--enable-shared --enable-threads=posix --enable-__cxa_atexit
--enable-clocale=gnu --enable-libada
--enable-languages=c,ada,c++,f95,java,objc,treelang
Thread model: posix
gcc version 4.0.2 20050825 (prerelease)


/* a.c */
extern int printf (__const char *__restrict __format, ...);
extern double fabs (double __x) __attribute__ ((__nothrow__)) __attribute__
((__const__)); extern double __fabs (double __x) __attribute__ ((__nothrow__))
__attribute__ ((__const__));

typedef float __v4sf __attribute__ ((__vector_size__ (16)));
typedef float __m128 __attribute__ ((__vector_size__ (16)));

/* Return a __m128 whose four float lanes are all 0.0f. */
static __inline __m128
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Thin wrapper: pass __A and __B (cast to __v4sf) to the maxps builtin
   (lane-wise maximum of packed floats) and return the result as __m128. */
static __inline __m128
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}


/* Thin wrapper over the cmpeqps builtin (lane-wise equality compare of
   packed floats).  Callers in this file use the result as a per-lane
   select mask. */
static __inline __m128
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

/* Return a __m128 with __F copied into all four float lanes. */
static __inline __m128
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

/* Thin wrapper over the andps builtin (bitwise AND of two packed-float
   vectors).  Arguments are passed through without casts. */
static __inline __m128
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

/* Thin wrapper over the loadups builtin: load four floats starting at __P
   (the "u" form is the unaligned load) and return them as __m128. */
static __inline __m128
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Build a __m128 from four scalars in argument order: lane 0 = __Z,
   lane 1 = __Y, lane 2 = __X, lane 3 = __W. */
static __inline __m128
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Thin wrapper over the storeups builtin: store the four floats of __A
   starting at __P (the "u" form is the unaligned store). */
static __inline void
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Thin wrapper over the addps builtin (lane-wise add of packed floats). */
static __inline __m128
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}


/* Thin wrapper over the subps builtin (lane-wise __A - __B on packed
   floats). */
static __inline __m128
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}


/* Thin wrapper over the mulps builtin (lane-wise multiply of packed
   floats). */
static __inline __m128
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128)__builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}


typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));

typedef __v2di __m128i;
typedef __v2df __m128d;

/* Return a __m128d with __F copied into both double lanes. */
static __inline __m128d
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

/* Build a __m128d in argument order: lane 0 = __W, lane 1 = __X. */
static __inline __m128d
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}


/* Thin wrapper over the loadupd builtin: load two doubles starting at __P
   (the "u" form is the unaligned load). */
static __inline __m128d
_mm_loadu_pd (double const *__P)
{
  return __builtin_ia32_loadupd (__P);
}

/* Thin wrapper over the storeupd builtin: store the two doubles of __A
   starting at __P (the "u" form is the unaligned store). */
static __inline void
_mm_storeu_pd (double *__P, __m128d __A)
{
  __builtin_ia32_storeupd (__P, __A);
}

/* Return a __m128d with lane 0 = __F and lane 1 = 0. */
static __inline __m128d
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0 };
}

/* Read one double from __P and return it in lane 0 of a __m128d, with
   lane 1 zeroed (delegates to _mm_set_sd). */
static __inline __m128d
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

/* Thin wrapper over the andpd builtin (bitwise AND of two packed-double
   vectors).  Arguments are passed through without casts. */
static __inline __m128d
_mm_and_pd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_andpd (__A, __B);
}

/* AND a with a constant bit mask whose only cleared bit is the top bit of
   word i[1]; every other mask bit is set, so words i[0], i[2] and i[3] of
   a pass through unchanged.  On the little-endian i686 target of this
   test case that clears the sign bit of the low double only. */
static __inline __m128d se2_abssd(__m128d a)
{
  static const union {
    __m128d m;
    unsigned int i[4];
  } low_sign_mask = {
    .i = { 0xffffffffUL, 0x7fffffffUL, 0xffffffffUL, 0xffffffffUL }
  };

  return _mm_and_pd(a, low_sign_mask.m);
}

/* Thin wrapper over the addpd builtin (lane-wise add of packed doubles). */
static __inline __m128d
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
}

/* Thin wrapper over the subpd builtin (lane-wise __A - __B on packed
   doubles). */
static __inline __m128d
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
}

/* Thin wrapper over the mulpd builtin (lane-wise multiply of packed
   doubles). */
static __inline __m128d
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
}

/* Thin wrapper over the mulsd builtin (scalar-double multiply form of
   mulpd; operands are passed as __v2df vectors). */
static __inline __m128d
_mm_mul_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
}

/* Thin wrapper over the maxpd builtin (lane-wise maximum of packed
   doubles). */
static __inline __m128d
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
}
/* Thin wrapper over the unpckhpd builtin (interleave the high doubles of
   __A and __B).  This file calls it with __A == __B to move the high lane
   into lane 0. */
static __inline __m128d
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
}

/* Thin wrapper over the cmpeqpd builtin (lane-wise equality compare of
   packed doubles).  Callers in this file use the result as a per-lane
   select mask. */
static __inline __m128d
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
}

/* Thin wrapper over the comisdlt builtin: scalar compare of the low
   doubles of __A and __B, returning an int that is nonzero when
   __A < __B. */
static __inline int
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
}

/* Thin wrapper over the paddd128 builtin: add __A and __B as four 32-bit
   integer lanes (operands are viewed as __v4si). */
static __inline __m128i
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
}

/* Thin wrapper over the pand128 builtin (bitwise AND of two 128-bit
   integer vectors). */
static __inline __m128i
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
}

/* Thin wrapper over the pandn128 builtin (PANDN: bitwise (~__A) & __B on
   128-bit integer vectors). */
static __inline __m128i
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
}

/* Thin wrapper over the por128 builtin (bitwise OR of two 128-bit integer
   vectors). */
static __inline __m128i
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
}

/* 16-byte-aligned view of one 128-bit SSE value.  Every member overlays
   the same 16 bytes, so a vector can be written through one member and
   its lanes read back through another (e.g. write xmm, read sf[0..3]). */
typedef union {
  __m128 xmm;                 /* four floats, as a vector */
  __m128i xmmi;               /* integer vector */
  __m128d xmmd;               /* two doubles, as a vector */
  long long di[2];
  unsigned long long udi[2];  /* two 64-bit lanes: keeps the union at 16 bytes */
  int si[4];
  unsigned int usi[4];
  short hi[8];
  unsigned short uhi[8];
  signed char qi[16];
  unsigned char uqi[16];
  double df[2];
  float sf[4];
} __attribute__ ((aligned(16))) um128;

/* AND a with a constant mask of four 0x7fffffff words, i.e. clear the top
   (sign) bit of each 32-bit float lane and keep all other bits. */
static __inline __m128 se_absps(__m128 a)
{
  static const union {
    __m128 m;
    unsigned int i[4];
  } sign_mask = {
    .i = { 0x7fffffffUL, 0x7fffffffUL, 0x7fffffffUL, 0x7fffffffUL }
  };

  return _mm_and_ps(a, sign_mask.m);
}

/* AND a with a constant mask whose words i[1] and i[3] are 0x7fffffff and
   whose words i[0] and i[2] are all ones.  On the little-endian i686
   target of this test case i[1]/i[3] are the high halves of the two
   doubles, so this clears the sign bit of both lanes. */
static __inline __m128d se2_abspd(__m128d a)
{
  static const union {
    __m128d m;
    unsigned int i[4];
  } u = {
    .i[0] = 0xffffffffUL, .i[1] = 0x7fffffffUL,
    .i[2] = 0xffffffffUL, .i[3] = 0x7fffffffUL
  };
  /* The mask is a double vector end to end: it is read from u.m (__m128d)
     and handed to _mm_and_pd, which takes __m128d.  A float-vector
     (__m128) temporary here would be an incompatible vector type. */
  __m128d msk = u.m;

  return _mm_and_pd(a, msk);
}

/* Exchange the entries prow[n1] and prow[n2].  Both values are read before
   either is written, so n1 == n2 leaves the array unchanged. */
static void swap_index(int *prow, int n1, int n2)
{
  const int first = prow[n1];
  const int second = prow[n2];

  prow[n1] = second;
  prow[n2] = first;
}

/* Search the n strided values v[0], v[step], v[2*step], ... for the one
   with the largest absolute value and return its position counted in
   elements from v.  Returns 0 when n <= 1.  As a side effect the n values
   are printed on one line before the search.

   Two values are handled per step, one per 64-bit lane: `mm` holds the
   running per-lane maximum of |value|, `mi` the positions of the pair
   being examined, and `mim` the per-lane position of the current maximum. */
static int sse2_max_abs_index(double *v, int step, int n)
{
  __m128d m1, mm;
  __m128i mi1, mim, mi, msk;
  um128 u;
  double *v2end;
  int step2, n2;
  /* Positions of the first pair: 0 in the low lane (si[0]), 1 in the high
     lane (si[2]). */
  static const um128 i0i1 = {
    .si[0]=0, .si[1]=0, .si[2]=1, .si[3] = 0
  };
  /* Per-step position increment: +2 in each lane. */
  static const um128 i1i1 = {
    .si[0]=2, .si[1]=0, .si[2]=2, .si[3] = 0
  };
  /* Dump the strided input values, then a newline (runs even for n <= 1). */
  for (n2 = 0; n2 < n; ++n2) printf("%f ", v[step * n2]); printf("\n");
  if (n <= 1) return 0;
  step2 = step + step;
  /* End of the region covered by whole pairs, computed from the start of v. */
  v2end = v + (n / 2) * step2;
  /* Seed the running maximum with the first pair. */
  mm = se2_abspd(_mm_setr_pd(v[0], v[step]));
  v += step2;
  mi1 = i1i1.xmmi;
  mim = mi = i0i1.xmmi;
  while (v < v2end) {
    mi = _mm_add_epi32(mi, mi1);
    m1 = se2_abspd(_mm_setr_pd(v[0], v[step]));
    v += step2;
    mm = _mm_max_pd(mm, m1);
    /* Lanes where the new value equals the updated maximum (so it was >= the
       old one) take the new position; the other lanes keep theirs. */
    msk = (__m128i)_mm_cmpeq_pd(m1, mm);
    mim = _mm_or_si128(_mm_and_si128(msk, mi), _mm_andnot_si128(msk, mim));
  }
  if (n & 1) {
    /* Odd n: one value is left.  _mm_load_sd puts it in the low lane and
       zero in the high lane, then the same max/select step is applied. */
    mi = _mm_add_epi32(mi, mi1);
    m1 = se2_abssd(_mm_load_sd(v));
    mm = _mm_max_pd(mm, m1);
    msk = (__m128i)_mm_cmpeq_pd(m1, mm);
    mim = _mm_or_si128(_mm_and_si128(msk, mi), _mm_andnot_si128(msk, mim));
  }
  /* Reduce the two lanes: m1's low lane is mm's high lane. */
  m1 = _mm_unpackhi_pd(mm, mm);
  u.xmmi = mim;
  /* High lane wins only when strictly larger than the low lane. */
  if (_mm_comilt_sd(mm, m1))
    return u.si[2];
  return u.si[0];
}

/* dst[j] += k * src[j] for j in [0, n).  Elements are processed two at a
   time with unaligned SSE2 loads/stores; when n is odd the last element
   is updated with scalar arithmetic. */
static void sse2_add_row(double *dst, double *src, double k, int n)
{
  double *const pair_end = dst + (n / 2) * 2;
  const __m128d factor = _mm_set1_pd(k);

  for (; dst < pair_end; src += 2, dst += 2) {
    const __m128d scaled = _mm_mul_pd(_mm_loadu_pd(src), factor);

    _mm_storeu_pd(dst, _mm_add_pd(_mm_loadu_pd(dst), scaled));
  }
  /* Leftover element: dst and src now point just past the last full pair. */
  if (n % 2 != 0)
    *dst += k * *src;
}

/* Exchange the first n doubles of r1 with the first n doubles of r2, two
   at a time via unaligned SSE2 loads/stores, plus one scalar swap when n
   is odd. */
static void sse2_swap_row(double *r1, double *r2, int n)
{
  double *const pair_end = r1 + (n / 2) * 2;

  for (; r1 < pair_end; r1 += 2, r2 += 2) {
    const __m128d from_r1 = _mm_loadu_pd(r1);
    const __m128d from_r2 = _mm_loadu_pd(r2);

    _mm_storeu_pd(r1, from_r2);
    _mm_storeu_pd(r2, from_r1);
  }
  /* Leftover element after the last full pair. */
  if (n % 2 != 0) {
    const double held = *r1;

    *r1 = *r2;
    *r2 = held;
  }
}

/* Float counterpart of the strided max-|value| search: examine the n
   values v[0], v[step], v[2*step], ... and return the position (counted
   in elements from v) of the one with the largest absolute value.
   Returns 0 when n <= 1.

   Four values are handled per step, one per 32-bit lane: `mm` is the
   running per-lane maximum of |value|, `mi` the positions of the group
   being examined, `mim` the per-lane position of the current maximum. */
static int sse_max_abs_indexf(float *v, int step, int n)
{
  __m128 m1, mm;
  __m128i mi1, mim, mi, msk;
  um128 u, ui;
  float *v4end, t;
  int n4, step2, step3, step4;

  /* Positions of the first group of four. */
  static const um128 i0123 = {
    .si[0]=0, .si[1]=1, .si[2]=2, .si[3]=3
  };
  /* Per-step position increment: +4 in every lane. */
  static const um128 i1111 = {
    .si[0]=4, .si[1]=4, .si[2]=4, .si[3]=4
  };

  if (n <= 1) return 0;
  /* Number of elements covered by whole groups of four. */
  n4 = (n / 4) * 4;
  mi1 = i1111.xmmi;
  mim = mi = i0123.xmmi;
  mm = _mm_setzero_ps();
  if (n4 > 0) {
    step2 = step + step;
    step3 = step2 + step;
    step4 = step2 + step2;
    v4end = v + n4 * step;
    /* Seed the running maximum with the first group. */
    mm = se_absps(_mm_setr_ps(v[0], v[step], v[step2], v[step3]));
    v += step4;
    mi = _mm_add_epi32(mi, mi1);
    while (v < v4end) {
      m1 = se_absps(_mm_setr_ps(v[0], v[step], v[step2], v[step3]));
      mm = _mm_max_ps(mm, m1);
      /* Lanes where the new value equals the updated maximum take the new
         position; the other lanes keep their previous one. */
      msk = (__m128i)_mm_cmpeq_ps(m1, mm);
      mim = _mm_or_si128(_mm_and_si128(msk, mi), _mm_andnot_si128(msk, mim));
      v += step4;
      mi = _mm_add_epi32(mi, mi1);
    }
  }
  /* From here on n4 is the number of leftover elements (0..3). */
  n4 = n - n4;
  if (n4) {
    int i;
    /* Gather the leftovers into a zero-padded vector, then apply the same
       max/select step once more. */
    u.xmm = _mm_setzero_ps();
    for (i = 0; i < n4; ++i) {
      u.sf[i] = v[0];
      v += step;
    }
    m1 = se_absps(u.xmm);
    mm = _mm_max_ps(mm, m1);
    msk = (__m128i)_mm_cmpeq_ps(m1, mm);
    mim = _mm_or_si128(_mm_and_si128(msk, mi), _mm_andnot_si128(msk, mim));
  }
  /* Scalar reduction over the four lanes: pick the lane with the largest
     maximum (a later lane wins only when strictly greater) and return the
     position recorded for that lane.  n is reused as the lane number. */
  ui.xmmi = mim;
  u.xmm = mm;
  t = u.sf[0];
  n = 0;
  if (u.sf[1] > t) { t = u.sf[1]; n = 1; }
  if (u.sf[2] > t) { t = u.sf[2]; n = 2; }
  if (u.sf[3] > t) { t = u.sf[3]; n = 3; }
  return ui.si[n];
}

static void sse_add_rowf(float *dst, float *src, float k, int n)
{
  int n4 = (n / 4) * 4;
  int i;
  float *dst4end = dst + n4;
  __m128 mk = _mm_set1_ps(k);

  while (dst < dst4end) {
    __m128 s = _mm_loadu_ps(src);
    __m128 d = _mm_loadu_ps(dst);
    s = _mm_mul_ps(s, mk);
    d = _mm_add_ps(d, s);
    _mm_storeu_ps(dst, d);
    src += 4;
    dst += 4;
  }
  n4 = n - n4;
  for (i = 0; i < n4; ++i) {
    dst[i] += k * src[i];
  }
}

static void sse_swap_rowf(float *r1, float *r2, int n)
{
  int i;
  int n4 = (n / 4) * 4;
  float *r14end = r1 + n4;
  while (r1 < r14end) {
    __m128 v1 = _mm_loadu_ps(r1);
    __m128 v2 = _mm_loadu_ps(r2);
    _mm_storeu_ps(r1, v2);
    _mm_storeu_ps(r2, v1);
    r1 += 4;
    r2 += 4;
  }
  r14end = r1 + n - n4;
  while (r1 < r14end) {
    float t = *r1;
    *r1 = *r2;
    *r2 = t;
    r1++;
    r2++;
  }
}

/* In-place elimination with row pivoting on the n x n double matrix m,
   whose rows are nw elements apart.

   prow is first set to 0..n-1 and then permuted alongside every row swap.
   For each column the pivot row is chosen with sse2_max_abs_index; each
   row below has its multiplier stored in the pivot column and the scaled
   pivot row subtracted from the rest of it.

   Returns a flag that starts at 0 and flips between 0 and 1 on every row
   swap. */
int
ludcompd(double *m, int nw, int *prow, int n)
{
  int col, parity = 0;
  double *row;

  for (col = 0; col < n; ++col)
    prow[col] = col;
  printf("ludcompd(): SSE2 code is used.\n");

  row = m;
  for (col = 0; col < n - 1; ++col) {
    /* Pivot offset, in rows, relative to the current row. */
    const int pivot = sse2_max_abs_index(row + col, nw, n - col);
    double diag, *below;
    int r;

    if (pivot != 0) {
      sse2_swap_row(row, row + pivot * nw, nw);
      swap_index(prow, col, col + pivot);
      parity = 1 - parity;
    }
    diag = row[col];
    below = row + nw;
    for (r = col + 1; r < n; ++r) {
      double k = below[col] / diag;

      below[col] = k;
      sse2_add_row(below + col + 1, row + col + 1, -k, n - col - 1);
      below += nw;
    }
    row += nw;
  }
  return parity;
}


/* Single-precision counterpart of the pivoted in-place elimination: works
   on the n x n float matrix m, whose rows are nw elements apart.

   prow is first set to 0..n-1 and then permuted alongside every row swap.
   For each column the pivot row is chosen with sse_max_abs_indexf; each
   row below has its multiplier stored in the pivot column and the scaled
   pivot row subtracted from the rest of it.

   Returns a flag that starts at 0 and flips between 0 and 1 on every row
   swap. */
int
ludcompf(float *m, int nw, int *prow, int n)
{
  int col, parity = 0;
  float *row;

  for (col = 0; col < n; ++col)
    prow[col] = col;
  printf("ludcompf(): SSE2 code is used.\n");

  row = m;
  for (col = 0; col < n - 1; ++col) {
    /* Pivot offset, in rows, relative to the current row. */
    const int pivot = sse_max_abs_indexf(row + col, nw, n - col);
    float diag, *below;
    int r;

    if (pivot != 0) {
      sse_swap_rowf(row, row + pivot * nw, nw);
      swap_index(prow, col, col + pivot);
      parity = 1 - parity;
    }
    diag = row[col];
    below = row + nw;
    for (r = col + 1; r < n; ++r) {
      float k = below[col] / diag;

      below[col] = k;
      sse_add_rowf(below + col + 1, row + col + 1, -k, n - col - 1);
      below += nw;
    }
    row += nw;
  }
  return parity;
}


/* Run ludcompd on a fixed 4x4 matrix and print, in order: its return
   value, the resulting row permutation, and the four matrix rows. */
void test_ludcompd(void)
{
  static double m[4][4] = {
    { 1, 2, 3, 4 },
    { 4, 2, 1, 7 },
    { 5, 6, 10, 78 },
    { 3, 2, 1, 0 }
  };
  int p[4];
  int r;

  printf("%d\n", ludcompd(&m[0][0], 4, p, 4));
  printf("%d %d %d %d\n", p[0], p[1], p[2], p[3]);
  for (r = 0; r < 4; ++r)
    printf("%1.3f %1.3f %1.3f %1.3f\n", m[r][0], m[r][1], m[r][2], m[r][3]);
}

/* Run ludcompf on a fixed 4x4 float matrix and print, in order: its
   return value, the resulting row permutation, and the four matrix rows. */
void test_ludcompf(void)
{
  static float m[4][4] = {
    { 1, 2, 3, 4 },
    { 4, 2, 1, 7 },
    { 5, 6, 10, 78 },
    { 3, 2, 1, 0 }
  };
  int p[4];
  int r;

  printf("%d\n", ludcompf(&m[0][0], 4, p, 4));
  printf("%d %d %d %d\n", p[0], p[1], p[2], p[3]);
  for (r = 0; r < 4; ++r)
    printf("%1.3f %1.3f %1.3f %1.3f\n", m[r][0], m[r][1], m[r][2], m[r][3]);
}

/* Entry point: run the double-precision test, then the single-precision
   test, and exit with status 0. */
int main(void)
{
  test_ludcompd();
  test_ludcompf();
  return 0;
}
Comment 1 Andrew Pinski 2005-08-26 02:50:22 UTC
Reducing.
Comment 2 Andrew Pinski 2005-08-26 03:36:35 UTC
Reduced as far as I can get this:
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
typedef float __m128 __attribute__ ((__vector_size__ (16)));
/* Reduced reproducer: wrapper over the cmpeqps builtin, with both operands
   cast to __v4sf and the result cast back to __m128. */
static __inline __m128 _mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}
/* Reduced reproducer: build a __m128 with lanes {__Z, __Y, __X, __W} in
   argument order. */
static __inline __m128 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){__Z, __Y, __X, __W };
}
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
/* Reduced reproducer: wrapper over the pand128 builtin.  In this reduced
   form the parameters are __m128 and are cast to __v2di for the call. */
static __inline __m128 _mm_and_si128 (__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
}
/* Reduced reproducer: wrapper over the por128 builtin.  In this reduced
   form the parameters are __m128 and are cast to __v2di for the call. */
static __inline __m128 _mm_or_si128 (__m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
}
/* Reduced reproducer: 16-byte-aligned union overlaying one __m128 with four
   ints, plus a file-scope instance `u`. */
typedef union { __m128 xmmi; int si[4]; } __attribute__ ((aligned(16))) um128;
um128 u;
/* Reduced reproducer for the ICE -- not meaningful as an algorithm.
   Several locals (n4, step2, step3, m1, mi, mim and the local `u`, which
   shadows the file-scope `u`) are read without ever being assigned; that
   is how the reduction left the code, and the function is kept in exactly
   this shape so it still exercises the failing compiler path. */
static inline int sse_max_abs_indexf(float *v, int step, int n)
{
  __m128 m1, mm;
  __m128 mim, mi, msk;
  um128 u, ui;
  int n4, step2, step3;
  /* Vector built from a constant and three float loads from v, then ANDed
     with u.xmmi. */
  mm = __builtin_ia32_andps((__m128)(__v4sf){0.0, v[step], v[step2], v[step3]},
			    u.xmmi);
  if (n4) {
    int i;
    /* Empty loop left over from the reduction. */
    for (i = 0; i < n4;  ++i) ;
    msk = (__m128)_mm_cmpeq_ps(m1, mm);
    mim = _mm_or_si128(_mm_and_si128(msk, mi), mim);
  }
  ui.xmmi = (__m128)mim;
  return ui.si[n];
}
/* Reduced reproducer: what remains of the row swap.  Copies *r2 into each
   of the first (n / 4) * 4 floats of r1; r2 is never advanced. */
static void sse_swap_rowf(float *r1, float *r2, int n) {
  int n4 = (n / 4) * 4;
  float *r14end = r1 + n4;
  while (r1 < r14end) {
    *r1 = *r2;
    r1++;
  }
}
/* Reduced reproducer: skeleton of the elimination loop.  Walks rows of m
   (nw floats apart), calls sse_max_abs_indexf for each, and calls
   sse_swap_rowf / swap_index when the returned index is non-zero, then
   calls sse_add_rowf for every row below.  swap_index and sse_add_rowf are
   not defined in this reduced listing, and `s` is unused. */
void ludcompf(float *m, int nw, int *prow, int n) {
  int i, s = 0;
  float *pm;
  for (i = 0, pm = m; i < n - 1; ++i, pm += nw)
  {
    int vi = sse_max_abs_indexf(pm + i, nw, n - i);
    float *pt;
    int j;
    if (vi != 0)
    {
      sse_swap_rowf(pm, pm + vi * nw, nw);
      swap_index(prow, i, i + vi);
    }
    for (j = i + 1, pt = pm + nw; j < n; ++j, pt += nw)
      sse_add_rowf(pt + i + 1, pm + i + 1, -1.0, n - i - 1);
  }
}
Comment 3 Uroš Bizjak 2005-08-26 07:50:28 UTC
The problem here is in the sse_concatv2sf pattern:

;; ??? In theory we can match memory for the MMX alternative, but allowing
;; nonimmediate_operand for operand 2 and *not* allowing memory for the SSE
;; alternatives pretty much forces the MMX alternative to be chosen.
(define_insn "*sse_concatv2sf"
  [(set (match_operand:V2SF 0 "register_operand"     "=x,x,*y,*y")
	(vec_concat:V2SF
	  (match_operand:SF 1 "nonimmediate_operand" " 0,m, 0, m")
	  (match_operand:SF 2 "vector_move_operand"  " x,C,*y, C")))]

and the "vector_move_operand" operand predicate, defined as:

;; Return 1 when OP is operand acceptable for standard SSE move.
(define_predicate "vector_move_operand"
  (ior (match_operand 0 "nonimmediate_operand")
       (match_operand 0 "const0_operand")))

Please note that the "vector_move_operand" predicate allows memory operands, but 
the register constraints do not. So the following pattern confuses reload:

(insn:HI 63 62 64 3 (set (reg:V2SF 21 xmm0 [117])
        (vec_concat:V2SF (mem:SF (plus:SI (plus:SI (reg/f:SI 68 [ ivtmp.71 ])
                        (reg:SI 88 [ D.1795 ]))
                    (const_int -4 [0xfffffffc])) [2 S4 A32])
            (mem:SF (plus:SI (plus:SI (reg/f:SI 68 [ ivtmp.71 ])
                        (reg:SI 89 [ D.1800 ]))
                    (const_int -4 [0xfffffffc])) [2 S4 A32]))) 612 
{*sse_concatv2sf} (nil)

(BTW: "sse2_loadld" pattern could have the same problem, no "m" register 
constraint.)

The immediate fix would be to define another operand predicate, similar 
to "vector_move_operand":

;; Same as above, but excluding memory operands.
(define_predicate "vector_move_nomem_operand"
  (ior (match_operand 0 "register_operand")
       (match_operand 0 "const0_operand")))

When operand 2 of the sse_concatv2sf pattern uses this new predicate, 
gcc is able to compile both testcases, and the following result is 
produced (for both -O1 and -O2):

ludcompd(): SSE2 code is used.
1.000000 4.000000 5.000000 3.000000 
-2.800000 0.800000 -1.600000 
-1.000000 -1.000000 
0
2 1 3 0
5.000 6.000 10.000 78.000
0.800 -2.800 -7.000 -55.400
0.600 0.571 -1.000 -15.143
0.200 -0.286 1.000 -12.286
ludcompf(): SSE2 code is used.
1
2 1 0 3
5.000 6.000 10.000 78.000
0.800 -2.800 -7.000 -55.400
0.200 -0.286 -1.000 -27.429
0.600 0.571 1.000 12.286

Unfortunately, the ludcompf() result (the second one) is wrong when -O1 or -O2 is 
used. It is correct without optimizations.
Comment 4 Uroš Bizjak 2005-08-26 09:35:14 UTC
(In reply to comment #3)

> Unfortunatelly, ludcompf() result (the second one) is wrong when -O1 or -O2
> is used. It is correct without optimizations.

This is a problem of the infamous i387 precision handling. The error can be found 
in this part of the code:

  ...
  if (u.sf[1] > t) { t = u.sf[1]; n = 1; }
  if (u.sf[2] > t) { t = u.sf[2]; n = 2; }
  if (u.sf[3] > t) { t = u.sf[3]; n = 3; }
  ...

Without optimizations, the values of u.sf[1] and t that are at some moment 
loaded into x87 registers are:
u.sf[1] = 1.000000119...
      t = 0.999999880...

and branch is taken. However, with optimizations, the values are different:

u.sf[1] = 0.999999642...
      t = 0.999999821...

This is a problem of the i387 design and not the problem of gcc. In your case, 
you should use -ffloat-store or -mfpmath=sse.

BTW: At the moment, I have very limited time, so I won't be able to create a 
patch to fix the ICE for some time...
Comment 5 Uroš Bizjak 2005-08-31 08:48:42 UTC
Patch.
Comment 6 CVS Commits 2005-08-31 17:28:07 UTC
Subject: Bug 23570

CVSROOT:	/cvs/gcc
Module name:	gcc
Changes by:	rth@gcc.gnu.org	2005-08-31 17:27:54

Modified files:
	gcc            : ChangeLog 
	gcc/config/i386: sse.md 
Added files:
	gcc/testsuite/gcc.target/i386: pr23570.c 

Log message:
	PR target/23570
	* config/i386/sse.md (*sse_concatv2sf): Change operand 2 constraint
	to "reg_or_0_operand".
	(sse2_loadld): Change operand 1 constraint to "reg_or_0_operand".

Patches:
http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/gcc/ChangeLog.diff?cvsroot=gcc&r1=2.9863&r2=2.9864
http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/gcc/config/i386/sse.md.diff?cvsroot=gcc&r1=1.23&r2=1.24
http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/gcc/testsuite/gcc.target/i386/pr23570.c.diff?cvsroot=gcc&r1=NONE&r2=1.1

Comment 7 CVS Commits 2005-10-07 05:32:41 UTC
Subject: Bug 23570

CVSROOT:	/cvs/gcc
Module name:	gcc
Branch: 	gcc-4_0-branch
Changes by:	uros@gcc.gnu.org	2005-10-07 05:32:37

Modified files:
	gcc            : ChangeLog 
	gcc/config/i386: i386.c sse.md 
	gcc/testsuite  : ChangeLog 
Added files:
	gcc/testsuite/gcc.target/i386: pr22576.c pr22585.c pr23570.c 

Log message:
	PR target/23570
	* config/i386/sse.md (*sse_concatv2sf): Change operand 2 constraint
	to "reg_or_0_operand".
	(sse2_loadld): Change operand 1 constraint to "reg_or_0_operand".
	
	testsuite/
	
	PR target/22576
	* gcc.target/i386/pr22576.c: New test.
	
	PR target/22585
	* gcc.target/i386/pr22585.c: New test.
	
	PR target/23570
	* gcc.target/i386/pr23570.c: New test.

Patches:
http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/gcc/ChangeLog.diff?cvsroot=gcc&only_with_tag=gcc-4_0-branch&r1=2.7592.2.451&r2=2.7592.2.452
http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/gcc/config/i386/i386.c.diff?cvsroot=gcc&only_with_tag=gcc-4_0-branch&r1=1.795.6.11&r2=1.795.6.12
http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/gcc/config/i386/sse.md.diff?cvsroot=gcc&only_with_tag=gcc-4_0-branch&r1=1.7.14.2&r2=1.7.14.3
http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/gcc/testsuite/ChangeLog.diff?cvsroot=gcc&only_with_tag=gcc-4_0-branch&r1=1.5084.2.434&r2=1.5084.2.435
http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/gcc/testsuite/gcc.target/i386/pr22576.c.diff?cvsroot=gcc&only_with_tag=gcc-4_0-branch&r1=NONE&r2=1.1.10.1
http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/gcc/testsuite/gcc.target/i386/pr22585.c.diff?cvsroot=gcc&only_with_tag=gcc-4_0-branch&r1=NONE&r2=1.2.10.1
http://gcc.gnu.org/cgi-bin/cvsweb.cgi/gcc/gcc/testsuite/gcc.target/i386/pr23570.c.diff?cvsroot=gcc&only_with_tag=gcc-4_0-branch&r1=NONE&r2=1.1.20.1

Comment 8 Uroš Bizjak 2005-10-07 05:42:38 UTC
Fixed on 4.0 branch.