This is the mail archive of the
gcc-bugs@gcc.gnu.org
mailing list for the GCC project.
[Bug inline-asm/42881] New: SSE2 intrinsics miscompiled at -O0 -march=k8
- From: "bugs at 59A2 dot org" <gcc-bugzilla at gcc dot gnu dot org>
- To: gcc-bugs at gcc dot gnu dot org
- Date: 27 Jan 2010 11:02:52 -0000
- Subject: [Bug inline-asm/42881] New: SSE2 intrinsics miscompiled at -O0 -march=k8
- Reply-to: gcc-bugzilla at gcc dot gnu dot org
A simple test program, a.c:
#include <stdio.h>
#include <emmintrin.h>
/* Minimal reproducer: broadcast 3.0 into both lanes of an SSE2 vector,
 * store the vector to memory, and print the two lanes.
 * A correct build prints "3.000000 3.000000". */
int main(void) {
double a[2];
/* Both lanes of x are expected to hold 3.0. */
__m128d x = _mm_set1_pd(3);
/* Unaligned 128-bit store: a[0] gets the low lane, a[1] the high lane. */
_mm_storeu_pd(a,x);
printf("%f %f\n",a[0],a[1]);
return 0;
}
$ gcc-4.5 -O0 -march=k8 a.c && ./a.out # broken
0.000000 0.000000
$ gcc-4.5 -O1 -march=k8 a.c && ./a.out # good
3.000000 3.000000
$ gcc-4.5 -O0 -march=core2 a.c && ./a.out # good
3.000000 3.000000
$ gcc-4.5 -O0 -march=k8 -c a.c && objdump -d -M intel --prefix-addresses a.o | grep main
0000000000000000 <main> push rbp
0000000000000001 <main+0x1> mov rbp,rsp
0000000000000004 <main+0x4> sub rsp,0x40
0000000000000008 <main+0x8> mov rax,0x4008000000000000
0000000000000012 <main+0x12> mov QWORD PTR [rbp-0x8],rax
0000000000000016 <main+0x16> movsd xmm2,xmm1
000000000000001a <main+0x1a> unpcklpd xmm2,xmm2
000000000000001e <main+0x1e> movapd xmm0,xmm2
0000000000000022 <main+0x22> movlpd xmm1,QWORD PTR [rbp-0x8]
0000000000000027 <main+0x27> movaps XMMWORD PTR [rbp-0x40],xmm0
000000000000002b <main+0x2b> movapd xmm0,XMMWORD PTR [rbp-0x40]
0000000000000030 <main+0x30> lea rax,[rbp-0x30]
0000000000000034 <main+0x34> mov QWORD PTR [rbp-0x10],rax
0000000000000038 <main+0x38> movaps XMMWORD PTR [rbp-0x20],xmm0
000000000000003c <main+0x3c> mov rax,QWORD PTR [rbp-0x10]
0000000000000040 <main+0x40> movapd xmm0,XMMWORD PTR [rbp-0x20]
0000000000000045 <main+0x45> movupd XMMWORD PTR [rax],xmm0
0000000000000049 <main+0x49> movlpd xmm1,QWORD PTR [rbp-0x28]
000000000000004e <main+0x4e> movlpd xmm0,QWORD PTR [rbp-0x30]
0000000000000053 <main+0x53> mov eax,0x0
0000000000000058 <main+0x58> mov rdi,rax
000000000000005b <main+0x5b> mov eax,0x2
0000000000000060 <main+0x60> call 0000000000000065 <main+0x65>
0000000000000065 <main+0x65> mov eax,0x0
000000000000006a <main+0x6a> leave
000000000000006b <main+0x6b> ret
$ gcc-4.5 -O0 -march=core2 -c a.c && objdump -d -M intel --prefix-addresses a.o | grep main
0000000000000000 <main> push rbp
0000000000000001 <main+0x1> mov rbp,rsp
0000000000000004 <main+0x4> sub rsp,0x40
0000000000000008 <main+0x8> mov rax,0x4008000000000000
0000000000000012 <main+0x12> mov QWORD PTR [rbp-0x8],rax
0000000000000016 <main+0x16> movddup xmm0,QWORD PTR [rbp-0x8]
000000000000001b <main+0x1b> movapd XMMWORD PTR [rbp-0x40],xmm0
0000000000000020 <main+0x20> movapd xmm0,XMMWORD PTR [rbp-0x40]
0000000000000025 <main+0x25> lea rax,[rbp-0x30]
0000000000000029 <main+0x29> mov QWORD PTR [rbp-0x10],rax
000000000000002d <main+0x2d> movapd XMMWORD PTR [rbp-0x20],xmm0
0000000000000032 <main+0x32> mov rax,QWORD PTR [rbp-0x10]
0000000000000036 <main+0x36> movapd xmm0,XMMWORD PTR [rbp-0x20]
000000000000003b <main+0x3b> movupd XMMWORD PTR [rax],xmm0
000000000000003f <main+0x3f> mov rdx,QWORD PTR [rbp-0x28]
0000000000000043 <main+0x43> movsd xmm0,QWORD PTR [rbp-0x30]
0000000000000048 <main+0x48> mov eax,0x0
000000000000004d <main+0x4d> movq xmm1,rdx
0000000000000052 <main+0x52> mov rdi,rax
0000000000000055 <main+0x55> mov eax,0x2
000000000000005a <main+0x5a> call 000000000000005f <main+0x5f>
000000000000005f <main+0x5f> mov eax,0x0
0000000000000064 <main+0x64> leave
0000000000000065 <main+0x65> ret
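The incorrect bit is the sequence below: movsd reads xmm1 at main+0x16, but
xmm1 is not loaded from [rbp-0x8] until main+0x22, so unpcklpd duplicates
whatever xmm1 previously held instead of the constant 3.0: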
0000000000000016 <main+0x16> movsd xmm2,xmm1
000000000000001a <main+0x1a> unpcklpd xmm2,xmm2
000000000000001e <main+0x1e> movapd xmm0,xmm2
0000000000000022 <main+0x22> movlpd xmm1,QWORD PTR [rbp-0x8]
0000000000000027 <main+0x27> movaps XMMWORD PTR [rbp-0x40],xmm0
which is corrected by -march=core2 to
0000000000000016 <main+0x16> movddup xmm0,QWORD PTR [rbp-0x8]
000000000000001b <main+0x1b> movapd XMMWORD PTR [rbp-0x40],xmm0
Of course all the redundant stores are collapsed at any positive optimization
level, and the result becomes correct regardless of -march. Unfortunately, the
bug is in the generic x86-64 target so it's highly visible. This bug is not
present in 4.4.2.
$ gcc-4.5 -v
Using built-in specs.
COLLECT_GCC=gcc-4.5
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with: ../configure --prefix=/usr --enable-languages=c,c++,fortran
--enable-gold --enable-plugin --enable-threads=posix --enable-__cxa_atexit
--enable-clocale=gnu --enable-lto --enable-gnu-unique-object --disable-multilib
--disable-libstdcxx-pch --with-tune=generic --with-system-zlib --with-ppl
--with-cloog --libdir=/usr/lib --libexecdir=/usr/lib --mandir=/usr/share/man
--infodir=/usr/share/info --disable-werror --enable-checking=release
--program-suffix=-4.5 --enable-version-specific-runtime-libs : (reconfigured)
../configure --prefix=/usr --enable-languages=c,c++,fortran --enable-gold
--enable-plugin --enable-threads=posix --enable-__cxa_atexit
--enable-clocale=gnu --enable-lto --enable-gnu-unique-object --disable-multilib
--disable-libstdcxx-pch --with-system-zlib --with-ppl --with-cloog
--libdir=/usr/lib --libexecdir=/usr/lib --mandir=/usr/share/man
--infodir=/usr/share/info --disable-werror --enable-checking=release
--program-suffix=-4.5 --enable-version-specific-runtime-libs
Thread model: posix
gcc version 4.5.0 20100121 (experimental) (GCC)
--
Summary: SSE2 intrinsics miscompiled at -O0 -march=k8
Product: gcc
Version: 4.5.0
Status: UNCONFIRMED
Severity: major
Priority: P3
Component: inline-asm
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: bugs at 59A2 dot org
GCC build triplet: x86_64-unknown-linux-gnu
GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: x86_64-unknown-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42881