[root@localhost tmp]# cat a.cpp #include <xmmintrin.h> #include <stdio.h> __m128 mm; void test ( const __m128& vm, float r ) { mm = _mm_add_ps( mm, _mm_set_ps( 0.0f, r, r, r) ); float vm0, vm1, vm2; _mm_store_ss( &vm0, vm ); _mm_store_ss( &vm1, _mm_shuffle_ps( vm, vm, 1 ) ); _mm_store_ss( &vm2, _mm_shuffle_ps( vm, vm, 2 ) ); _mm_store_ss( &vm2, _mm_shuffle_ps( vm, vm, 2 ) ); printf("In TEST: %f %f %f\n", vm0, vm1, vm2 ); } int main() { __m128 vm = _mm_set_ps( 4.0f, 3.0f, 2.0f, 1.0f ); float vm0, vm1, vm2; _mm_store_ss( &vm0, vm ); _mm_store_ss( &vm1, _mm_shuffle_ps( vm, vm, 1 ) ); _mm_store_ss( &vm2, _mm_shuffle_ps( vm, vm, 2 ) ); printf("Before TEST: %f %f %f\n", vm0, vm1, vm2 ); test(vm, 0.5f); _mm_store_ss( &vm0, vm ); _mm_store_ss( &vm1, _mm_shuffle_ps( vm, vm, 1 ) ); _mm_store_ss( &vm2, _mm_shuffle_ps( vm, vm, 2 ) ); printf("After TEST: %f %f %f\n", vm0, vm1, vm2 ); return 0; } [root@localhost tmp]# g++ -Wall -O3 -msse a.cpp; ./a.out Before TEST: 1.000000 2.000000 3.000000 In TEST: nan nan nan After TEST: 1.000000 2.000000 3.000000 [root@localhost tmp]# g++ -Wall -O2 -msse a.cpp; ./a.out Before TEST: 1.000000 2.000000 3.000000 In TEST: nan nan nan After TEST: 1.000000 2.000000 3.000000 [root@localhost tmp]# g++ -Wall -O1 -msse a.cpp; ./a.out Before TEST: 1.000000 2.000000 3.000000 In TEST: nan nan nan After TEST: 1.000000 2.000000 3.000000 [root@localhost tmp]# g++ -Wall -O0 -msse a.cpp; ./a.out Before TEST: 1.000000 2.000000 3.000000 In TEST: 1.000000 2.000000 3.000000 After TEST: 1.000000 2.000000 3.000000 [root@localhost tmp]# [root@localhost tmp]# uname -a Linux localhost.localdomain 2.6.17-1.2139_FC5smp #1 SMP Fri Jun 23 13:12:06 EDT 2006 i686 i686 i386 GNU/Linux [root@localhost tmp]# gcc -v Using built-in specs. 
Target: i386-redhat-linux Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-libgcj-multifile --enable-languages=c,c++,objc,obj-c++,java,fortran,ada --enable-java-awt=gtk --disable-dssi --with-java-home=/usr/lib/jvm/java-1.4.2-gcj-1.4.2.0/jre --with-cpu=generic --host=i386-redhat-linux Thread model: posix gcc version 4.1.1 20060525 (Red Hat 4.1.1-1) [root@localhost tmp]# gcc3 is OK (it does not show this problem).
Confirmed. With -mfpmath=sse I get: Before TEST: 1.000000 2.000000 3.000000 In TEST: nan 2.000000 3.000000 After TEST: 1.000000 2.000000 3.000000 and with -mfpmath=sse,387 I get: Before TEST: 1.000000 2.000000 3.000000 In TEST: 1.000000 2.000000 nan After TEST: 1.000000 2.000000 3.000000
The differences in the generated assembly between mainline and 4.1.x are: @@ -8,14 +8,19 @@ test: pushl %ebp movl %esp, %ebp - subl $40, %esp - movss 12(%ebp), %xmm1 - movaps %xmm1, %xmm2 - unpcklps %xmm1, %xmm2 - movaps %xmm2, %xmm0 + subl $56, %esp + movd 12(%ebp), %mm1 + movq %mm1, %mm2 + punpckldq %mm2, %mm2 + movd %mm1, -4(%ebp) + movss -4(%ebp), %xmm1 xorps %xmm2, %xmm2 unpcklps %xmm2, %xmm1 - movlhps %xmm1, %xmm0 + movaps %xmm1, %xmm0 + movq %mm2, -16(%ebp) + movq2dq %mm2, %xmm1 + movlhps %xmm0, %xmm1 + movaps %xmm1, %xmm0 addps mm, %xmm0 movaps %xmm0, mm movl 8(%ebp), %eax i.e. 4.1.x uses MMX code here. This is actually PR28825; I'll do a backport.
Subject: Bug 28960 Author: rguenth Date: Mon Oct 16 11:34:44 2006 New Revision: 117784 URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=117784 Log: 2006-10-16 Richard Guenther <rguenther@suse.de> PR target/28960 Backport from mainline: 2006-08-23 Stuart Hastings <stuart@apple.com> PR target/28825 * gcc/config/i386/i386.c (ix86_expand_vector_init_duplicate, ix86_expand_vector_init_one_nonzero): Remove TARGET_SSE test. * gcc.target/i386/20060821-1.c: New. Added: branches/gcc-4_1-branch/gcc/testsuite/gcc.target/i386/20060821-1.c - copied unchanged from r116356, trunk/gcc/testsuite/gcc.target/i386/20060821-1.c Modified: branches/gcc-4_1-branch/gcc/ChangeLog branches/gcc-4_1-branch/gcc/config/i386/i386.c branches/gcc-4_1-branch/gcc/testsuite/ChangeLog
Fixed on the 4.1 branch.
Fixed in GCC-4.1.2.