This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug translation/16111] New: generates invalid SSE movdqa instruction (instead of movaps)


My project stopped working after switching from gcc 3.3.3 to 3.4.0. I've found
that the problem is related to SSE2: in some (register-intensive) code the
compiler generates movdqa (packed ints) instead of movaps (float4 vector),
causing an invalid result (NaN).

I've created a simple test case to demonstrate the problem:

--------------------------------------------------------------------
#include <math.h>
#include <xmmintrin.h>
#include <stdio.h>

/*
 * Horizontal sum of the lane-wise product of a and b (a 4-element dot
 * product, per the name). The final _mm_add_ss only adds element 0, so
 * element 0 of the return value is the lane that carries the sum.
 */
static inline __m128 xmm_dot4(__m128 a, __m128 b) {
	__m128 v0 = _mm_mul_ps(a, b);
	// NOTE(review): v1 appears in its own initializer, so its value is
	// indeterminate here. movehl puts the upper two floats of v0 into the
	// lower two result lanes and takes the upper two lanes from its first
	// argument (v1), so only lanes 2-3 of the sum below are affected and
	// they never reach element 0. _mm_movehl_ps(v0, v0) would avoid the
	// uninitialized read; left untouched because the assembly listing in
	// this report was generated from this exact source.
	__m128 v1 = _mm_movehl_ps(v1, v0);
	// lanes 0/1 now hold p0+p2 and p1+p3 (p = a*b)
	v0 = _mm_add_ps(v0, v1);
	// bring lane 1 of v0 down to lane 0 so it can be added with add_ss
	v1 = _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,0,0,1));
	return _mm_add_ss(v0, v1);
}

/*
 * gcc-3.4.0+ generates an invalid movdqa instruction for this function;
 * it works correctly if movdqa is replaced by movaps in the generated asm.
 *
 * Loads four floats from each of boxCenter and boxExtents (unaligned
 * loads), clears the sign bits of the center values, flips the sign bits
 * of the extents values, then stores the xmm_dot4() result of the two back
 * to boxCenter and the sign-flipped extents back to boxExtents.
 * Both pointers are read and written through; each must reference at
 * least four floats.
 */
void foo(float* boxCenter, float* boxExtents) 
{
	// 0x80000000: only the top bit set, used below as a per-lane sign mask
	unsigned int MASK = 0x80000000;
	// reinterpret the bits of MASK as a float (C++ reference cast, no
	// numeric conversion) and broadcast it to all four lanes
	__m128 mask = _mm_set1_ps((float&)MASK);
	__m128 center = _mm_loadu_ps(boxCenter);
	__m128 extents = _mm_loadu_ps(boxExtents);
	center = _mm_andnot_ps(mask, center); // common code for doing abs
	extents = _mm_xor_ps(mask, extents); // common code for doing neg
	center = xmm_dot4(center, extents);
	// all four lanes of each vector are written back
	_mm_storeu_ps(boxCenter, center);
	_mm_storeu_ps(boxExtents, extents);
}


// Test inputs handed to foo() from main(); foo() overwrites both arrays.
float center[] = { 1, 1, 1, 1 };
float extents[] = { 27.5f, 27.5f, 0, 0 };

// Runs foo() on the global test arrays and prints the resulting extents,
// which is where the reported NaN shows up.
int main() 
{
	foo(center, extents);
	printf("extents (%f %f %f %f)\n", extents[0], extents[1], extents[2],
extents[3]); // prints NaN

	return 0;
}

--------------------------------------------------------------------

I've tried both the 3.4.0 release and the latest CVS snapshot:

/opt/gcc-3.4.0/bin/g++-3.4 -v -save-temps -O3 -msse -mfpmath=sse
-fomit-frame-pointer -finline-limit=2000 "test.cxx" -o "test" -L/opt/gcc-3.4.0/lib
Reading specs from /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/specs
Configured with: ../configure --prefix=/opt/gcc-3.4.0 --program-suffix=-3.4
--enable-languages=c,c++,java --with-gcc --with-gnu-as --with-gnu-ld
--enable-shared --enable-threads=posix --enable-libgcj --disable-java-awt
--without-x --enable-java-gc=boehm --disable-debug --disable-libgcj-debug
--disable-interpreter --disable-x --enable-hash-synchronization
Thread model: posix
gcc version 3.4.1 20040618 (prerelease)
 /opt/gcc-3.4.0/libexec/gcc/i686-pc-linux-gnu/3.4.1/cc1plus -E -quiet -v
-D_GNU_SOURCE test.cxx -msse -mfpmath=sse -mtune=pentiumpro -fomit-frame-pointer
-finline-limit=2000 -O3 -o test.ii
ignoring nonexistent directory
"/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../i686-pc-linux-gnu/include"
#include "..." search starts here:
#include <...> search starts here:
 /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1
 /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1/i686-pc-linux-gnu
 /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1/backward
 /usr/local/include
 /opt/gcc-3.4.0/include
 /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/include
 /usr/include
End of search list.
 /opt/gcc-3.4.0/libexec/gcc/i686-pc-linux-gnu/3.4.1/cc1plus -fpreprocessed
test.ii -quiet -dumpbase test.cxx -msse -mfpmath=sse -mtune=pentiumpro -auxbase
test -O3 -version -fomit-frame-pointer -finline-limit=2000 -o test.s
GNU C++ version 3.4.1 20040618 (prerelease) (i686-pc-linux-gnu)
	compiled by GNU C version 3.3.3 (Debian 20040321).
GGC heuristics: --param ggc-min-expand=90 --param ggc-min-heapsize=113152
 as -V -Qy -o test.o test.s
GNU assembler version 2.14.90.0.7 (i386-linux) using BFD version 2.14.90.0.7
20031029 Debian GNU/Linux
 /opt/gcc-3.4.0/libexec/gcc/i686-pc-linux-gnu/3.4.1/collect2 --eh-frame-hdr -m
elf_i386 -dynamic-linker /lib/ld-linux.so.2 -o test /usr/lib/crt1.o
/usr/lib/crti.o /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/crtbegin.o
-L/opt/gcc-3.4.0/lib -L/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1
-L/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../.. test.o -lstdc++ -lm
-lgcc_s -lgcc -lc -lgcc_s -lgcc
/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/crtend.o /usr/lib/crtn.o
/opt/gcc-3.4.0/bin/g++-3.4 -v -save-temps -O3 -msse -mfpmath=sse
-fomit-frame-pointer -finline-limit=2000 "test.cxx" -S -o "test.S"
-L/opt/gcc-3.4.0/lib
Reading specs from /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/specs
Configured with: ../configure --prefix=/opt/gcc-3.4.0 --program-suffix=-3.4
--enable-languages=c,c++,java --with-gcc --with-gnu-as --with-gnu-ld
--enable-shared --enable-threads=posix --enable-libgcj --disable-java-awt
--without-x --enable-java-gc=boehm --disable-debug --disable-libgcj-debug
--disable-interpreter --disable-x --enable-hash-synchronization
Thread model: posix
gcc version 3.4.1 20040618 (prerelease)
 /opt/gcc-3.4.0/libexec/gcc/i686-pc-linux-gnu/3.4.1/cc1plus -E -quiet -v
-D_GNU_SOURCE test.cxx -msse -mfpmath=sse -mtune=pentiumpro -fomit-frame-pointer
-finline-limit=2000 -O3 -o test.ii
ignoring nonexistent directory
"/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../i686-pc-linux-gnu/include"
#include "..." search starts here:
#include <...> search starts here:
 /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1
 /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1/i686-pc-linux-gnu
 /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1/backward
 /usr/local/include
 /opt/gcc-3.4.0/include
 /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/include
 /usr/include
End of search list.
 /opt/gcc-3.4.0/libexec/gcc/i686-pc-linux-gnu/3.4.1/cc1plus -fpreprocessed
test.ii -quiet -dumpbase test.cxx -msse -mfpmath=sse -mtune=pentiumpro
-auxbase-strip test.S -O3 -version -fomit-frame-pointer -finline-limit=2000 -o
test.S
GNU C++ version 3.4.1 20040618 (prerelease) (i686-pc-linux-gnu)
	compiled by GNU C version 3.3.3 (Debian 20040321).
GGC heuristics: --param ggc-min-expand=90 --param ggc-min-heapsize=113152

--------------------------------------------------------------------
And the result:

	.file	"test.cxx"
.globl extents
	.data
	.align 4
	.type	extents, @object
	.size	extents, 16
extents:
	.long	1104936960
	.long	1104936960
	.long	0
	.long	0
.globl center
	.align 4
	.type	center, @object
	.size	center, 16
center:
	.long	1065353216
	.long	1065353216
	.long	1065353216
	.long	1065353216
	.text
	.align 2
	.p2align 4,,15
.globl _Z3fooPfS_
	.type	_Z3fooPfS_, @function
_Z3fooPfS_:	# foo(float*, float*) from the test case above
.LFB312:
	subl	$4, %esp	# 4-byte stack slot for MASK
.LCFI0:
	movl	8(%esp), %eax	# eax = boxCenter (stored through at the end)
	movl	$0x80000000, (%esp)	# MASK
	movl	12(%esp), %edx	# edx = boxExtents
	movss	(%esp), %xmm1	# load MASK bits as a float
	movups	(%eax), %xmm6	# center  = loadu(boxCenter)
	movups	(%edx), %xmm5	# extents = loadu(boxExtents)
	shufps	$0, %xmm1, %xmm1	# broadcast -> mask in all four lanes
	movdqa	%xmm1, %xmm4	# <-- the instruction this report is about: the copy of the float mask is emitted as movdqa, not movaps (build used -msse only)
	andnps	%xmm6, %xmm4	# center = andnot(mask, center)
	xorps	%xmm5, %xmm1	# extents = xor(mask, extents)
	movaps	%xmm4, %xmm0
	mulps	%xmm1, %xmm0	# inlined xmm_dot4: v0 = center * extents
	movhlps	%xmm0, %xmm3	# xmm3 is not written earlier in this function (v1 is uninitialized in the source)
	addps	%xmm3, %xmm0
	movaps	%xmm0, %xmm2
	shufps	$1, %xmm0, %xmm2
	addss	%xmm2, %xmm0
	movups	%xmm0, (%eax)	# storeu(boxCenter, center)
	movups	%xmm1, (%edx)	# storeu(boxExtents, extents)
	popl	%eax	# drop the MASK slot
	ret
.LFE312:
	.size	_Z3fooPfS_, .-_Z3fooPfS_
	.section	.rodata.str1.1,"aMS",@progbits,1
.LC0:
	.string	"extents (%f %f %f %f)\n"
	.text
	.align 2
	.p2align 4,,15
.globl main
	.type	main, @function
main:	# foo() is inlined here: the only call below is to printf
.LFB313:
	pushl	%ebp
.LCFI1:
	movl	%esp, %ebp
.LCFI2:
	subl	$40, %esp
.LCFI3:
	movl	$0x80000000, -4(%ebp)	# MASK
	movups	extents, %xmm5
	andl	$-16, %esp
	subl	$16, %esp
	movss	-4(%ebp), %xmm1	# load MASK bits as a float
	movups	center, %xmm6
	movl	$.LC0, (%esp)	# printf format string
	shufps	$0, %xmm1, %xmm1	# broadcast -> mask in all four lanes
	movdqa	%xmm1, %xmm4	# <-- same movdqa copy of the float mask as in _Z3fooPfS_
	xorps	%xmm5, %xmm1	# extents = xor(mask, extents)
	andnps	%xmm6, %xmm4	# center = andnot(mask, center)
	movaps	%xmm4, %xmm0
	movups	%xmm1, extents	# store extents back to the global
	mulps	%xmm1, %xmm0
	movhlps	%xmm0, %xmm3	# xmm3 is not written earlier in this function
	flds	extents+12	# extents[3] -> x87, stored as double for printf
	addps	%xmm3, %xmm0
	movaps	%xmm0, %xmm2
	shufps	$1, %xmm0, %xmm2
	addss	%xmm2, %xmm0
	fstpl	28(%esp)
	flds	extents+8	# extents[2]
	movups	%xmm0, center	# store the dot result back to the global
	fstpl	20(%esp)
	flds	extents+4	# extents[1]
	fstpl	12(%esp)
	flds	extents	# extents[0]
	fstpl	4(%esp)
	call	printf
	leave
	xorl	%eax, %eax	# return 0
	ret
.LFE313:
	.size	main, .-main
	.section	.eh_frame,"a",@progbits
.Lframe1:
	.long	.LECIE1-.LSCIE1
.LSCIE1:
	.long	0x0
	.byte	0x1
	.string	"zP"
	.uleb128 0x1
	.sleb128 -4
	.byte	0x8
	.uleb128 0x5
	.byte	0x0
	.long	__gxx_personality_v0
	.byte	0xc
	.uleb128 0x4
	.uleb128 0x4
	.byte	0x88
	.uleb128 0x1
	.align 4
.LECIE1:
.LSFDE3:
	.long	.LEFDE3-.LASFDE3
.LASFDE3:
	.long	.LASFDE3-.Lframe1
	.long	.LFB313
	.long	.LFE313-.LFB313
	.uleb128 0x0
	.byte	0x4
	.long	.LCFI1-.LFB313
	.byte	0xe
	.uleb128 0x8
	.byte	0x85
	.uleb128 0x2
	.byte	0x4
	.long	.LCFI2-.LCFI1
	.byte	0xd
	.uleb128 0x5
	.align 4
.LEFDE3:
	.section	.note.GNU-stack,"",@progbits
	.ident	"GCC: (GNU) 3.4.1 20040618 (prerelease)"

--------------------------------------------------------------------

Note that replacing movdqa with movaps (or using gcc 3.3.3 ;-)) fixes the problem.

Hope it helps.

-- 
           Summary: generates invalid SSE movdqa instruction (instead of
                    movaps)
           Product: gcc
           Version: 3.4.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P2
         Component: translation
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: djp at volny dot cz
                CC: gcc-bugs at gcc dot gnu dot org
 GCC build triplet: i686-pc-linux-gnu
  GCC host triplet: i686-pc-linux-gnu
GCC target triplet: i686-pc-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16111


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]