Bug 44900 - [4.5 Regression] The variable of SSE will be broken
Summary: [4.5 Regression] The variable of SSE will be broken
Status: RESOLVED FIXED
Alias: None
Product: gcc
Classification: Unclassified
Component: tree-optimization (show other bugs)
Version: 4.5.0
: P1 normal
Target Milestone: 4.5.1
Assignee: Martin Jambor
URL:
Keywords: wrong-code
Depends on:
Blocks:
 
Reported: 2010-07-10 08:28 UTC by Kenta Yoshimura
Modified: 2010-07-21 17:21 UTC (History)
8 users (show)

See Also:
Host:
Target:
Build:
Known to work: 4.6.0 4.4.4 4.5.0
Known to fail: 4.5.1
Last reconfirmed: 2010-07-19 13:45:37


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Kenta Yoshimura 2010-07-10 08:28:09 UTC
I'm using gcc-4.5.0-1 of MinGW with Intel x86.
When I execute the following test code with '-O -msse' compiler options, it outputs the wrong value.
I'm expecting that v2.e[3] will be 7.0f.

And, when I exclude LINE_A, LINE_B, LINE_C or LINE_D as a comment, the program outputs the expected value.

-- begin test code --
// g++ -O -msse test.cpp
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

struct vec
{
	union {
		__m128 v;
		float  e[4];
	};
	
	static const vec & zero()
	{
		static const vec v = _mm_set_ps(0, 0, 0, 0);
		return v;
	}
	
	vec() {}
	vec(const __m128 & a) : v(a) {}

	operator const __m128&() const { return v; }
};

struct vec2
{
	vec _v1;
	vec _v2;
	
	vec2() {}
	vec2(const vec & a, const vec & b) : _v1(a), _v2(b) {}

	static vec2 load(const float * a)
	{
		return vec2(
			__builtin_ia32_loadups(&a[0]),
			__builtin_ia32_loadups(&a[4]));
	}
	
	const vec & v1() const { return _v1; }
	const vec & v2() const { return _v2; }
};

extern "C" {
	int* _errno(void);
	int  printf (const char*, ...);
}

inline bool test_assert( bool is_succeed, const char * file_name, int line_num )
{
	if ( !is_succeed )
	{
		printf("error: %s(%d)\n", file_name, line_num);
		if ( *_errno() ) // LINE_A
		{
			printf("errno: %d\n", *_errno());
		}
	}
	
	return is_succeed;
}

inline bool operator==(const vec & a, const vec & b)
{ return 0xf == __builtin_ia32_movmskps(__builtin_ia32_cmpeqps(a, b)); }

#define test(x, y) test_assert( (x)==(y), __FILE__, __LINE__ )

int main( int argc, char * argv[] )
{
	__attribute__((aligned(16))) float data[] =
	{ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5 };

	float * p = &data[2];
	vec2 a;

	a = vec2::load(p);

	vec v1 = a.v1();
	vec v2 = a.v2();
	printf(
		"v1: %f, %f, %f, %f\n"
		"v2: %f, %f, %f, %f\n",
		v1.e[0], v1.e[1], v1.e[2], v1.e[3],
		v2.e[0], v2.e[1], v2.e[2], v2.e[3]);

	test(_mm_set_ps(11, 12, 13, 14), a.v1()); // LINE_B
	test(_mm_set_ps( 7,  8,  9, 10), a.v2()); // LINE_C

	a._v1 = vec::zero();

	test(_mm_set_ps(0, 0, 0, 0), a.v1()); // LINE_D

	return 0;
}
-- end test code --

-- begin output --
v1: 14.000000, 13.000000, 12.000000, 11.000000
v2: 10.000000, 9.000000, 8.000000, 0.000000
error: test.cpp(92)
-- end output --
Comment 1 Kenta Yoshimura 2010-07-10 15:12:51 UTC
Also, I reproduced it on gcc45 of MacPorts 1.9.1 with '-O -m32 -msse' compiler options.
At that time, I modified errno of the test code.

--- test.old.cpp	2010-07-10 23:27:22.000000000 +0900
+++ test.new.cpp	2010-07-10 23:32:57.000000000 +0900
@@ -48,7 +48,7 @@
 };
 
 extern "C" {
-        int* _errno(void);
+        extern int errno;
         int  printf (const char*, ...);
 }
 
@@ -58,9 +58,9 @@
         if ( !is_succeed )
         {
                 printf("error: %s(%d)\n", file_name, line_num);
-                if ( *_errno() ) // LINE_A
+                if ( errno ) // LINE_A
                 {
-                        printf("errno: %d\n", *_errno());
+                        printf("errno: %d\n", errno);
                 }
         }
Comment 2 Uroš Bizjak 2010-07-11 11:25:39 UTC
Confirmed on i686-pc-linux-gnu (with errno fully removed).

4.6.0 works OK.
Comment 3 Uroš Bizjak 2010-07-11 11:39:03 UTC
For some reason, we got:

	movl	$0x41800000, 112(%esp)
	movl	$0x41700000, 116(%esp)
	movl	$0x41600000, 120(%esp)
	movl	$0x41500000, 124(%esp)
	movl	$0x41400000, 128(%esp)
	movl	$0x41300000, 132(%esp)
	movl	$0x41200000, 136(%esp)
	movl	$0x41100000, 140(%esp)
	movl	$0x41000000, 144(%esp)
	movl	$0x40e00000, 148(%esp)
	movl	$0x40c00000, 152(%esp)
	movl	$0x40a00000, 156(%esp)
	movups	136(%esp), %xmm0
	movaps	%xmm0, 96(%esp)
	movups	120(%esp), %xmm0
	movaps	%xmm0, 80(%esp)
	movaps	%xmm0, 160(%esp)
	movaps	96(%esp), %xmm0
	movaps	%xmm0, 176(%esp)
>>>	fldz
	fstpl	60(%esp)
	flds	184(%esp)
	fstpl	52(%esp)
	flds	180(%esp)
	fstpl	44(%esp)
	flds	176(%esp)
	fstpl	36(%esp)
	flds	172(%esp)
	fstpl	28(%esp)
	flds	168(%esp)
	fstpl	20(%esp)
	flds	164(%esp)
	fstpl	12(%esp)
	flds	160(%esp)
	fstpl	4(%esp)
	movl	$.LC12, (%esp)
	call	printf

Changing fldz to "flds	188(%esp)" produces correct output.
Comment 4 Uroš Bizjak 2010-07-11 11:42:03 UTC
4.4 works OK, so this is 4.5 regression.
Comment 5 Uroš Bizjak 2010-07-11 11:59:13 UTC
Expand expands with uninitialized register r68:

   41 [r54:SI-0x20]=r73:V4SF
   42 [r54:SI-0x10]=r72:V4SF
>  43 [r56:SI+0x3c]=float_extend(r68:SF)
   44 [r56:SI+0x34]=float_extend([r54:SI-0x8])
   45 [r56:SI+0x2c]=float_extend([r54:SI-0xc])
   46 [r56:SI+0x24]=float_extend([r54:SI-0x10])
   47 [r56:SI+0x1c]=float_extend([r54:SI-0x14])
   48 [r56:SI+0x14]=float_extend([r54:SI-0x18])
   49 [r56:SI+0xc]=float_extend([r54:SI-0x1c])
   50 [r56:SI+0x4]=float_extend([r54:SI-0x20])
   51 [r56:SI]=`*.LC12'
   52 call <...>

.optimized tree dump looks suspicious, load from uninitialized var:

  SR.14_103 = __builtin_ia32_loadups (&data[6]);
  SR.13_104 = __builtin_ia32_loadups (&data[2]);
  a._v1.D.1895.v = SR.13_104;
  a._v2.D.1895.v = SR.14_103;
  a$_v2$D1895$e$0_98 = a._v2.D.1895.e[0];
  a$_v2$D1895$e$1_110 = a._v2.D.1895.e[1];
  a$_v2$D1895$e$2_105 = a._v2.D.1895.e[2];
  v1$D1895$e$0_99 = a._v1.D.1895.e[0];
  v1$D1895$e$1_40 = a._v1.D.1895.e[1];
  v1$D1895$e$2_95 = a._v1.D.1895.e[2];
  v1$D1895$e$3_79 = a._v1.D.1895.e[3];
  D.2043_5 = (double) a$_v2$D1895$e$3_82(D);     <--- here
  D.2045_7 = (double) a$_v2$D1895$e$2_105;
  D.2047_9 = (double) a$_v2$D1895$e$1_110;
  D.2049_11 = (double) a$_v2$D1895$e$0_98;
  D.2051_13 = (double) v1$D1895$e$3_79;
  D.2053_15 = (double) v1$D1895$e$2_95;
  D.2055_17 = (double) v1$D1895$e$1_40;
  D.2057_19 = (double) v1$D1895$e$0_99;

Needs tree expert. CC'd.
Comment 6 Richard Biener 2010-07-11 12:48:52 UTC
Confirmed, caused by SRA, maybe latent on the trunk.
Comment 7 H.J. Lu 2010-07-11 20:59:52 UTC
It works with gcc 4.5.0.  It is a 4.5.1 regression.
Comment 8 Richard Biener 2010-07-11 21:59:54 UTC
Hm?  I checked 4.5.0 and it was broken as well, so someone should double-check (I can't at the moment).
Comment 9 Andrew Pinski 2010-07-11 22:47:40 UTC
Works with:
GNU C++ (GCC) version 4.5.0 20100401 (experimental) [trunk revision 157933] (x86_64-unknown-linux-gnu)
	compiled by GNU C version 4.5.0 20100401 (experimental) [trunk revision 157933], GMP version 4.3.2, MPFR version 2.4.2, MPC version 0.8.1

Which is before the branch of 4.5.
Comment 10 Kenta Yoshimura 2010-07-11 23:11:15 UTC
Please use '-m32' if host is x64.
Comment 11 H.J. Lu 2010-07-11 23:35:00 UTC
It is very likely caused by revision 158826:

http://gcc.gnu.org/ml/gcc-cvs/2010-04/msg00933.html

I will verify it and find out which checkin on trunk
fixes/hides this bug.
Comment 12 H.J. Lu 2010-07-12 01:05:58 UTC
It is caused by revision 158826. On trunk, it
is fixed/hidden by revision 158732:

http://gcc.gnu.org/ml/gcc-cvs/2010-04/msg00839.html
Comment 13 H.J. Lu 2010-07-12 02:03:47 UTC
If you replace

---
       vec2 a;
        a = vec2::load(p);
---

with

---
        vec2 a = vec2::load(p);
---

the testcase will pass.
Comment 14 Richard Biener 2010-07-12 07:54:55 UTC
It indeed works with the 4.5.0 release which makes it a blocker for 4.5.1.
Comment 15 Kenta Yoshimura 2010-07-14 09:22:26 UTC
I found the similar case with gcc 4.4.4 of MacPorts and gcc 4.4.0 of MinGW.

-- begin testcase --
// g++ -O -msse2 test.cpp
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));

struct vec
{
        __m128i v;
        
        static vec load(const int * p)
        { return (__m128i) __builtin_ia32_loaddqu((char const *)p); }

        const int & operator [](int i) const
        {
                union u {
                        __m128i v;
                        int     e[4];
                };
                
                return ((const u &)v).e[i];
        }

        vec() {}
        vec(const __m128i & a) : v(a) {}
};

extern "C" {
        int  printf (const char*, ...);
}

int main( int argc, char * argv[] )
{
        __attribute__((aligned(16))) int data[] =
        { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5 };

        vec a = vec::load(data);

        printf("v: %d, %d, %d, %d\n", a[0], a[1], a[2], a[3]);

        return 0;
}
-- end testcase --

-- begin output --
v: 16, 16, 14, 14
-- end output --
Comment 16 Kenta Yoshimura 2010-07-14 12:24:50 UTC
This is also the wrong result with MinGW gcc 3.4.5.
I'm expecting that all component of v will be 2500.

4.4.4 of MacPorts, 4.5.0 of MacPorts, 4.4.0 of MinGW and 4.5.0-1 of MinGW were worked fine.

This time, I did not check 3.4.6 of MacPorts.
I tried to install gcc34 of MacPorts, but failed.

-- begin testcase --
// g++ -O -msse2 test.cpp
#include <xmmintrin.h>
#include <emmintrin.h>

extern "C" int  printf (const char*, ...);

// There is no _mm_castps_si128() in gcc 3.4
inline __m128i my_castps_si128(const __m128 & a)
{
	//return *(const __m128i *)&a; // same result

	union u {
		__m128  s;
		__m128i i;
	};
	const u & v = (const u &)a;
	
	return v.i;
}

inline __m128 my_castsi128_ps(const __m128i & a)
{
	//return *(const __m128 *)&a; // same result

	union u {
		__m128  s;
		__m128i i;
	};
	const u & v = (const u &)a;
	
	return v.s;
}

int main( int argc, char * argv[] )
{

	union u {
		__m128i v;
		int     e[4];
	};
	__m128i a  = _mm_set1_epi32(1250);
	__m128i b  = _mm_set1_epi32(2);
	__m128i v0 = _mm_setzero_si128();
	__m128i al = _mm_unpacklo_epi32(a, v0);
	__m128i ah = _mm_unpackhi_epi32(a, v0);
	__m128i bl = _mm_unpacklo_epi32(b, v0);
	__m128i bh = _mm_unpackhi_epi32(b, v0);
	__m128i lo = _mm_mul_epu32(al, bl);
	__m128i hi = _mm_mul_epu32(ah, bh);
	__m128  sl = my_castsi128_ps(lo);
	__m128  sh = my_castsi128_ps(hi);
	__m128  s  = _mm_shuffle_ps(sl, sh, 0x88); // 2, 0, 2, 0
	__m128i r  = my_castps_si128(s);

	u & v = (u &)r;
	printf("v: %d, %d, %d, %d\n", v.e[0], v.e[1], v.e[2], v.e[3]);

	return 0;
}
-- end testcase --

-- begin output --
v: 0, 0, 2500, 2500
-- end output --
Comment 17 Ozkan Sezer 2010-07-14 14:02:47 UTC
(In reply to comment #16)
> This is also the wrong result with MinGW gcc 3.4.5.
> I'm expecting that all component of v will be 2500.
> 
> 4.4.4 of MacPorts, 4.5.0 of MacPorts, 4.4.0 of MinGW and 4.5.0-1 of MinGW were
> worked fine.
> 
> This time, I did not check 3.4.6 of MacPorts.
> I tried to install gcc34 of MacPorts, but failed.
> 
> -- begin testcase --
> // g++ -O -msse2 test.cpp
> #include <xmmintrin.h>
> #include <emmintrin.h>
> 
> extern "C" int  printf (const char*, ...);
> 
> // There is no _mm_castps_si128() in gcc 3.4
> inline __m128i my_castps_si128(const __m128 & a)
> {
>         //return *(const __m128i *)&a; // same result
> 
>         union u {
>                 __m128  s;
>                 __m128i i;
>         };
>         const u & v = (const u &)a;
> 
>         return v.i;
> }
> 
> inline __m128 my_castsi128_ps(const __m128i & a)
> {
>         //return *(const __m128 *)&a; // same result
> 
>         union u {
>                 __m128  s;
>                 __m128i i;
>         };
>         const u & v = (const u &)a;
> 
>         return v.s;
> }
> 
> int main( int argc, char * argv[] )
> {
> 
>         union u {
>                 __m128i v;
>                 int     e[4];
>         };
>         __m128i a  = _mm_set1_epi32(1250);
>         __m128i b  = _mm_set1_epi32(2);
>         __m128i v0 = _mm_setzero_si128();
>         __m128i al = _mm_unpacklo_epi32(a, v0);
>         __m128i ah = _mm_unpackhi_epi32(a, v0);
>         __m128i bl = _mm_unpacklo_epi32(b, v0);
>         __m128i bh = _mm_unpackhi_epi32(b, v0);
>         __m128i lo = _mm_mul_epu32(al, bl);
>         __m128i hi = _mm_mul_epu32(ah, bh);
>         __m128  sl = my_castsi128_ps(lo);
>         __m128  sh = my_castsi128_ps(hi);
>         __m128  s  = _mm_shuffle_ps(sl, sh, 0x88); // 2, 0, 2, 0
>         __m128i r  = my_castps_si128(s);
> 
>         u & v = (u &)r;
>         printf("v: %d, %d, %d, %d\n", v.e[0], v.e[1], v.e[2], v.e[3]);
> 
>         return 0;
> }
> -- end testcase --
> 
> -- begin output --
> v: 0, 0, 2500, 2500
> -- end output --
> 

Comment 18 Ozkan Sezer 2010-07-14 14:05:27 UTC
(In reply to comment #15)
> I found the similar case with gcc 4.4.4 of MacPorts and gcc 4.4.0 of MinGW.
> 

This case fails with 4.4 on i686-linux too, printing 16, 16, 14, 14 instead of 16, 15, 14, 13 which 4.3 does.
Comment 19 H.J. Lu 2010-07-14 15:52:25 UTC
(In reply to comment #15)
> I found the similar case with gcc 4.4.4 of MacPorts and gcc 4.4.0 of MinGW.
> 
> -- begin testcase --
> // g++ -O -msse2 test.cpp
> typedef long long __m128i __attribute__ ((__vector_size__ (16),
> __may_alias__));
> 
> struct vec
> {
>         __m128i v;
> 
>         static vec load(const int * p)
>         { return (__m128i) __builtin_ia32_loaddqu((char const *)p); }
> 
>         const int & operator [](int i) const
>         {
>                 union u {
>                         __m128i v;
>                         int     e[4];
>                 };
> 
>                 return ((const u &)v).e[i];
>         }
> 
>         vec() {}
>         vec(const __m128i & a) : v(a) {}
> };
> 
> extern "C" {
>         int  printf (const char*, ...);
> }
> 
> int main( int argc, char * argv[] )
> {
>         __attribute__((aligned(16))) int data[] =
>         { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5 };
> 
>         vec a = vec::load(data);
> 
>         printf("v: %d, %d, %d, %d\n", a[0], a[1], a[2], a[3]);
> 
>         return 0;
> }
> -- end testcase --
> 
> -- begin output --
> v: 16, 16, 14, 14
> -- end output --
> 

This is caused by revision 134947:

http://gcc.gnu.org/ml/gcc-cvs/2008-05/msg00107.html
Comment 20 Andrew Pinski 2010-07-14 16:41:42 UTC
(In reply to comment #15)
> I found the similar case with gcc 4.4.4 of MacPorts and gcc 4.4.0 of MinGW.
I think the code in comment #15 is invalid and voilates C/C++ aliasing rules.
Comment 21 Andrew Pinski 2010-07-14 16:44:53 UTC
(In reply to comment #20)
> (In reply to comment #15)
> > I found the similar case with gcc 4.4.4 of MacPorts and gcc 4.4.0 of MinGW.
> I think the code in comment #15 is invalid and voilates C/C++ aliasing rules.

Even if it did not voilate aliasing rules, the IR looks good:

  D.4999_70 = VIEW_CONVERT_EXPR<const union u>(D.4995_68).i;
  D.4863_25 = VIEW_CONVERT_EXPR<union u>(D.4999_70).e[3];
  D.4864_26 = VIEW_CONVERT_EXPR<union u>(D.4999_70).e[2];
  D.4865_27 = VIEW_CONVERT_EXPR<union u>(D.4999_70).e[1];
  D.4866_28 = VIEW_CONVERT_EXPR<union u>(D.4999_70).e[0];
Comment 22 Andrew Pinski 2010-07-14 16:50:51 UTC
(In reply to comment #21)
> Even if it did not voilate aliasing rules, the IR looks good:

The expansion looks incorrect though:
(insn 39 38 40 t.cc:55 (set (reg:DI 107)
        (vec_select:DI (reg:V2DI 79 [ D.4999 ])
            (parallel [
                    (const_int 1 [0x1])
                ]))) -1 (nil))

(insn 40 39 41 t.cc:55 (set (reg:DI 108)
        (vec_select:DI (reg:V2DI 79 [ D.4999 ])
            (parallel [
                    (const_int 1 [0x1])
                ]))) -1 (nil))

(insn 41 40 42 t.cc:55 (set (reg:DI 109)
        (vec_select:DI (reg:V2DI 79 [ D.4999 ])
            (parallel [
                    (const_int 0 [0x0])
                ]))) -1 (nil))

(insn 42 41 43 t.cc:55 (set (reg:DI 110)
        (vec_select:DI (reg:V2DI 79 [ D.4999 ])
            (parallel [
                    (const_int 0 [0x0])
                ]))) -1 (nil))

(insn 43 42 44 t.cc:55 (set (reg:SI 37 r8)
        (subreg:SI (reg:DI 107) 0)) -1 (nil))

(insn 44 43 45 t.cc:55 (set (reg:SI 2 cx)
        (subreg:SI (reg:DI 108) 0)) -1 (nil))

(insn 45 44 46 t.cc:55 (set (reg:SI 1 dx)
        (subreg:SI (reg:DI 109) 0)) -1 (nil))

(insn 46 45 47 t.cc:55 (set (reg:SI 4 si)
        (subreg:SI (reg:DI 110) 0)) -1 (nil))

Please file that as a different bug.
Comment 23 Kenta Yoshimura 2010-07-15 03:45:25 UTC
(In reply to comment #22)
> Please file that as a different bug.
May I enter comment #15 as a new bug to Bugzilla?
Comment 24 Martin Jambor 2010-07-20 13:34:32 UTC
I posted a proposed fix to the mailing list:

http://gcc.gnu.org/ml/gcc-patches/2010-07/msg01570.html
Comment 25 Martin Jambor 2010-07-21 13:57:32 UTC
Subject: Bug 44900

Author: jamborm
Date: Wed Jul 21 13:57:12 2010
New Revision: 162374

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=162374
Log:
2010-07-21  Martin Jambor  <mjambor@suse.cz>

	PR tree-optimization/44900
	* tree-sra.c (load_assign_lhs_subreplacements): Updated comments.
	(sra_modify_assign): Move gsi to the next statmenent unconditionally.

	* testsuite/g++.dg/torture/pr44900.C: New test.


Added:
    branches/gcc-4_5-branch/gcc/testsuite/g++.dg/torture/pr44900.C
Modified:
    branches/gcc-4_5-branch/gcc/ChangeLog
    branches/gcc-4_5-branch/gcc/testsuite/ChangeLog
    branches/gcc-4_5-branch/gcc/tree-sra.c

Comment 26 Martin Jambor 2010-07-21 14:17:29 UTC
Subject: Bug 44900

Author: jamborm
Date: Wed Jul 21 14:17:11 2010
New Revision: 162375

URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=162375
Log:
2010-07-21  Martin Jambor  <mjambor@suse.cz>

	PR tree-optimization/44900
	* tree-sra.c (load_assign_lhs_subreplacements): Updated comments.
	(sra_modify_assign): Move gsi to the next statmenent unconditionally.

	* testsuite/g++.dg/torture/pr44900.C: New test.


Added:
    trunk/gcc/testsuite/g++.dg/torture/pr44900.C
Modified:
    trunk/gcc/ChangeLog
    trunk/gcc/testsuite/ChangeLog
    trunk/gcc/tree-sra.c

Comment 27 Martin Jambor 2010-07-21 17:21:23 UTC
Fixed.