This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug translation/16111] New: generates invalid SSE movdqa instruction (instead of movaps)
- From: "djp at volny dot cz" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 21 Jun 2004 11:42:16 -0000
- Subject: [Bug translation/16111] New: generates invalid SSE movdqa instruction (instead of movaps)
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
My project stopped working after switching from gcc 3.3.3 to 3.4.0. I've found that
the problem is related to SSE2: in some (register-intensive) code the compiler generates
movdqa (packed integers) instead of movaps (vector of four floats), causing an invalid result (NaN).
I've created a simple test case to demonstrate the problem:
--------------------------------------------------------------------
#include <math.h>
#include <xmmintrin.h>
#include <stdio.h>
/*
 * Multiplies a and b lane by lane and folds the four products together with
 * a movehl/add followed by a shuffle/add_ss; the total ends up in the lowest
 * lane of the returned vector.
 */
static inline __m128 xmm_dot4(__m128 a, __m128 b) {
__m128 v0 = _mm_mul_ps(a, b);
/*
 * NOTE(review): v1 is passed to _mm_movehl_ps inside its own initializer,
 * i.e. before it has been given a value. Presumably _mm_movehl_ps(v0, v0)
 * was meant -- confirm before relying on the upper lanes of the result.
 */
__m128 v1 = _mm_movehl_ps(v1, v0);
v0 = _mm_add_ps(v0, v1); // lanes 0 and 1 now hold the pairwise sums of the products
v1 = _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,0,0,1)); // bring lane 1 down into lane 0
return _mm_add_ss(v0, v1); // scalar add: only lane 0 is summed
}
/*
* gcc-3.4.0+ generates invalid movdqa instruction;
* works well if you replace movdqa by movaps in asm.
*
* foo() loads four floats from each of boxCenter and boxExtents (unaligned
* loads), clears the sign bits of the center values, flips the sign bits of
* the extents values, then stores xmm_dot4(center, extents) back to boxCenter
* and the sign-flipped extents back to boxExtents. Both pointers must
* therefore reference at least four floats.
*/
void foo(float* boxCenter, float* boxExtents)
{
unsigned int MASK = 0x80000000;
// Reinterpret the integer's bits as a float (C++ reference cast) and
// replicate it into all four lanes; 0x80000000 is used as a sign-bit mask.
__m128 mask = _mm_set1_ps((float&)MASK);
__m128 center = _mm_loadu_ps(boxCenter);
__m128 extents = _mm_loadu_ps(boxExtents);
center = _mm_andnot_ps(mask, center); // common code for doing abs
extents = _mm_xor_ps(mask, extents); // common code for doing neg
center = xmm_dot4(center, extents);
_mm_storeu_ps(boxCenter, center);
_mm_storeu_ps(boxExtents, extents);
}
// Test inputs: four floats each, matching the four-wide loads/stores in foo().
float center[] = { 1, 1, 1, 1 };
float extents[] = { 27.5f, 27.5f, 0, 0 };
// Runs foo() on the two global arrays and prints the resulting extents;
// this is the output the report says comes out as NaN with gcc 3.4.x.
int main()
{
foo(center, extents);
printf("extents (%f %f %f %f)\n", extents[0], extents[1], extents[2],
extents[3]); // prints NaN
return 0;
}
--------------------------------------------------------------------
I've tried both the 3.4.0 release and the latest CVS snapshot:
/opt/gcc-3.4.0/bin/g++-3.4 -v -save-temps -O3 -msse -mfpmath=sse
-fomit-frame-pointer -finline-limit=2000 "test.cxx" -o "test" -L/opt/gcc-3.4.0/lib
Reading specs from /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/specs
Configured with: ../configure --prefix=/opt/gcc-3.4.0 --program-suffix=-3.4
--enable-languages=c,c++,java --with-gcc --with-gnu-as --with-gnu-ld
--enable-shared --enable-threads=posix --enable-libgcj --disable-java-awt
--without-x --enable-java-gc=boehm --disable-debug --disable-libgcj-debug
--disable-interpreter --disable-x --enable-hash-synchronization
Thread model: posix
gcc version 3.4.1 20040618 (prerelease)
/opt/gcc-3.4.0/libexec/gcc/i686-pc-linux-gnu/3.4.1/cc1plus -E -quiet -v
-D_GNU_SOURCE test.cxx -msse -mfpmath=sse -mtune=pentiumpro -fomit-frame-pointer
-finline-limit=2000 -O3 -o test.ii
ignoring nonexistent directory
"/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../i686-pc-linux-gnu/include"
#include "..." search starts here:
#include <...> search starts here:
/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1
/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1/i686-pc-linux-gnu
/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1/backward
/usr/local/include
/opt/gcc-3.4.0/include
/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/include
/usr/include
End of search list.
/opt/gcc-3.4.0/libexec/gcc/i686-pc-linux-gnu/3.4.1/cc1plus -fpreprocessed
test.ii -quiet -dumpbase test.cxx -msse -mfpmath=sse -mtune=pentiumpro -auxbase
test -O3 -version -fomit-frame-pointer -finline-limit=2000 -o test.s
GNU C++ version 3.4.1 20040618 (prerelease) (i686-pc-linux-gnu)
compiled by GNU C version 3.3.3 (Debian 20040321).
GGC heuristics: --param ggc-min-expand=90 --param ggc-min-heapsize=113152
as -V -Qy -o test.o test.s
GNU assembler version 2.14.90.0.7 (i386-linux) using BFD version 2.14.90.0.7
20031029 Debian GNU/Linux
/opt/gcc-3.4.0/libexec/gcc/i686-pc-linux-gnu/3.4.1/collect2 --eh-frame-hdr -m
elf_i386 -dynamic-linker /lib/ld-linux.so.2 -o test /usr/lib/crt1.o
/usr/lib/crti.o /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/crtbegin.o
-L/opt/gcc-3.4.0/lib -L/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1
-L/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../.. test.o -lstdc++ -lm
-lgcc_s -lgcc -lc -lgcc_s -lgcc
/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/crtend.o /usr/lib/crtn.o
/opt/gcc-3.4.0/bin/g++-3.4 -v -save-temps -O3 -msse -mfpmath=sse
-fomit-frame-pointer -finline-limit=2000 "test.cxx" -S -o "test.S"
-L/opt/gcc-3.4.0/lib
Reading specs from /opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/specs
Configured with: ../configure --prefix=/opt/gcc-3.4.0 --program-suffix=-3.4
--enable-languages=c,c++,java --with-gcc --with-gnu-as --with-gnu-ld
--enable-shared --enable-threads=posix --enable-libgcj --disable-java-awt
--without-x --enable-java-gc=boehm --disable-debug --disable-libgcj-debug
--disable-interpreter --disable-x --enable-hash-synchronization
Thread model: posix
gcc version 3.4.1 20040618 (prerelease)
/opt/gcc-3.4.0/libexec/gcc/i686-pc-linux-gnu/3.4.1/cc1plus -E -quiet -v
-D_GNU_SOURCE test.cxx -msse -mfpmath=sse -mtune=pentiumpro -fomit-frame-pointer
-finline-limit=2000 -O3 -o test.ii
ignoring nonexistent directory
"/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../i686-pc-linux-gnu/include"
#include "..." search starts here:
#include <...> search starts here:
/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1
/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1/i686-pc-linux-gnu
/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/../../../../include/c++/3.4.1/backward
/usr/local/include
/opt/gcc-3.4.0/include
/opt/gcc-3.4.0/lib/gcc/i686-pc-linux-gnu/3.4.1/include
/usr/include
End of search list.
/opt/gcc-3.4.0/libexec/gcc/i686-pc-linux-gnu/3.4.1/cc1plus -fpreprocessed
test.ii -quiet -dumpbase test.cxx -msse -mfpmath=sse -mtune=pentiumpro
-auxbase-strip test.S -O3 -version -fomit-frame-pointer -finline-limit=2000 -o
test.S
GNU C++ version 3.4.1 20040618 (prerelease) (i686-pc-linux-gnu)
compiled by GNU C version 3.3.3 (Debian 20040321).
GGC heuristics: --param ggc-min-expand=90 --param ggc-min-heapsize=113152
--------------------------------------------------------------------
And the result:
.file "test.cxx"
.globl extents
.data
.align 4
.type extents, @object
.size extents, 16
extents:
.long 1104936960
.long 1104936960
.long 0
.long 0
.globl center
.align 4
.type center, @object
.size center, 16
center:
.long 1065353216
.long 1065353216
.long 1065353216
.long 1065353216
.text
.align 2
.p2align 4,,15
.globl _Z3fooPfS_
.type _Z3fooPfS_, @function
_Z3fooPfS_:
.LFB312:
subl $4, %esp
.LCFI0:
movl 8(%esp), %eax
movl $0x80000000, (%esp)
movl 12(%esp), %edx
movss (%esp), %xmm1
movups (%eax), %xmm6
movups (%edx), %xmm5
shufps $0, %xmm1, %xmm1
movdqa %xmm1, %xmm4
andnps %xmm6, %xmm4
xorps %xmm5, %xmm1
movaps %xmm4, %xmm0
mulps %xmm1, %xmm0
movhlps %xmm0, %xmm3
addps %xmm3, %xmm0
movaps %xmm0, %xmm2
shufps $1, %xmm0, %xmm2
addss %xmm2, %xmm0
movups %xmm0, (%eax)
movups %xmm1, (%edx)
popl %eax
ret
.LFE312:
.size _Z3fooPfS_, .-_Z3fooPfS_
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "extents (%f %f %f %f)\n"
.text
.align 2
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB313:
pushl %ebp
.LCFI1:
movl %esp, %ebp
.LCFI2:
subl $40, %esp
.LCFI3:
movl $0x80000000, -4(%ebp)
movups extents, %xmm5
andl $-16, %esp
subl $16, %esp
movss -4(%ebp), %xmm1
movups center, %xmm6
movl $.LC0, (%esp)
shufps $0, %xmm1, %xmm1
movdqa %xmm1, %xmm4
xorps %xmm5, %xmm1
andnps %xmm6, %xmm4
movaps %xmm4, %xmm0
movups %xmm1, extents
mulps %xmm1, %xmm0
movhlps %xmm0, %xmm3
flds extents+12
addps %xmm3, %xmm0
movaps %xmm0, %xmm2
shufps $1, %xmm0, %xmm2
addss %xmm2, %xmm0
fstpl 28(%esp)
flds extents+8
movups %xmm0, center
fstpl 20(%esp)
flds extents+4
fstpl 12(%esp)
flds extents
fstpl 4(%esp)
call printf
leave
xorl %eax, %eax
ret
.LFE313:
.size main, .-main
.section .eh_frame,"a",@progbits
.Lframe1:
.long .LECIE1-.LSCIE1
.LSCIE1:
.long 0x0
.byte 0x1
.string "zP"
.uleb128 0x1
.sleb128 -4
.byte 0x8
.uleb128 0x5
.byte 0x0
.long __gxx_personality_v0
.byte 0xc
.uleb128 0x4
.uleb128 0x4
.byte 0x88
.uleb128 0x1
.align 4
.LECIE1:
.LSFDE3:
.long .LEFDE3-.LASFDE3
.LASFDE3:
.long .LASFDE3-.Lframe1
.long .LFB313
.long .LFE313-.LFB313
.uleb128 0x0
.byte 0x4
.long .LCFI1-.LFB313
.byte 0xe
.uleb128 0x8
.byte 0x85
.uleb128 0x2
.byte 0x4
.long .LCFI2-.LCFI1
.byte 0xd
.uleb128 0x5
.align 4
.LEFDE3:
.section .note.GNU-stack,"",@progbits
.ident "GCC: (GNU) 3.4.1 20040618 (prerelease)"
--------------------------------------------------------------------
Note that replacing movdqa with movaps (or using gcc 3.3.3 ;-) fixes the problem.
Hope it helps.
--
Summary: generates invalid SSE movdqa instruction (instead of
movaps)
Product: gcc
Version: 3.4.1
Status: UNCONFIRMED
Severity: normal
Priority: P2
Component: translation
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: djp at volny dot cz
CC: gcc-bugs at gcc dot gnu dot org
GCC build triplet: i686-pc-linux-gnu
GCC host triplet: i686-pc-linux-gnu
GCC target triplet: i686-pc-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16111