c++/7582: Intel intrinsics cause segfault with gcc 3.1.1 and 3.2
dholm@telia.com
dholm@telia.com
Mon Aug 12 16:16:00 GMT 2002
>Number: 7582
>Category: c++
>Synopsis: Intel intrinsics cause segfault with gcc 3.1.1 and 3.2
>Confidential: no
>Severity: critical
>Priority: medium
>Responsible: unassigned
>State: open
>Class: sw-bug
>Submitter-Id: net
>Arrival-Date: Mon Aug 12 16:16:01 PDT 2002
>Closed-Date:
>Last-Modified:
>Originator: David Holm
>Release: gcc version 3.2 2002-07-26 (prerelease)
>Organization:
>Environment:
Gentoo Linux 1.4, Pentium 3 (Coppermine)
>Description:
The following code executes perfectly when compiled with the Intel C++ Compiler v6.0 but segfaults when compiled with gcc 3.1.1 or 3.2 (2002-07-26).
It is compiled with "g++ -Wall -msse intrin.cpp -o intrin" (optionally with -g3); the program itself produces no output.
g++ gives no warnings during compilation.
The resulting binary, intrin, segfaults on this line: "_mm_stream_ps((float*) dst, xmm0);"
"g++ -v" returns:
Reading specs from /usr/lib/gcc-lib/i686-pc-linux-gnu/3.2/specs
Configured with: /var/tmp/portage/gcc-3.2_pre/work/gcc-3.2/configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --enable-shared --host=i686-pc-linux-gnu --build=i686-pc-linux-gnu --target=i686-pc-linux-gnu --with-system-zlib --enable-languages=c,c++,ada,f77,objc,java --enable-threads=posix --enable-long-long --disable-checking --enable-cstdio=stdio --enable-clocale=generic --enable-version-specific-runtime-libs --with-gxx-include-dir=/usr/include/g++-v32 --with-local-prefix=/usr/local --enable-shared --enable-nls --without-included-gettext
Thread model: posix
gcc version 3.2 2002-07-26 (prerelease)
I haven't got 3.1.1 anymore, so I can't give you the -v output from it.
>How-To-Repeat:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <mmintrin.h>
#include <xmmintrin.h>
/**
 * small_memcpy(dst, src, n): copy n bytes from src to dst using an inline
 * x86 "rep; movsb".
 *
 * dst, src and n are passed in through the D, S and c register constraints,
 * and dst and src are also listed as outputs. The caller's dst and src
 * variables are therefore overwritten with whatever the instruction leaves
 * in those registers (presumably the pointers advanced by n bytes -- confirm
 * against the x86 REP MOVS description). Both arguments must be modifiable
 * lvalues.
 *
 * NOTE(review): the expansion is a declaration of "dummy" followed by an asm
 * statement, not a single statement. It can only be used once per scope and
 * not as the unbraced body of an if/else.
 */
#define small_memcpy(dst,src,n) \
register unsigned long int dummy; \
asm volatile ( \
"rep; movsb\n\t" \
:"=&D"(dst), "=&S"(src), "=&c"(dummy) \
:"0" (dst), "1" (src),"2" (n) \
: "memory");
/**
* SIMD Optimized memcpy's are graciously borrowed from DirectFB.
*/
/* Byte alignment that memcpy_sse() establishes for dst before its block
   loops; used there as the mask (SSE_MMREG_SIZE - 1). */
# define SSE_MMREG_SIZE 16
/* Smallest len for which memcpy_sse() takes its 64-byte block path; shorter
   copies go straight to the plain memcpy() tail. */
# define MIN_LEN 0x40 /* 64-byte blocks */
/**
 * Copy len bytes from src to dst using SSE intrinsics for the bulk of the
 * data.
 *
 * Steps, in order:
 *  1. Issue non-temporal prefetch hints for src .. src + 256 (done for every
 *     call, whatever len is).
 *  2. If len >= MIN_LEN: copy the few leading bytes needed to make dst a
 *     multiple of SSE_MMREG_SIZE, then move the data in 64-byte blocks with
 *     four 16-byte loads and four _mm_stream_ps stores per block. The load
 *     flavour (_mm_loadu_ps vs. _mm_load_ps) is chosen once, from the low
 *     four bits of src.
 *  3. Copy whatever is left (len & 63, or all of len when len < MIN_LEN)
 *     with memcpy().
 *
 * @param dst  destination buffer; advanced internally as data is copied.
 * @param src  source buffer; advanced internally as data is copied.
 * @param len  number of bytes to copy.
 * @return     the original value of dst.
 *
 * NOTE(review): _mm_load_ps and _mm_stream_ps are documented as requiring
 * 16-byte-aligned addresses -- confirm in the intrinsics reference. The
 * bug report says the crash occurs at a _mm_stream_ps((float*) dst, xmm0)
 * statement.
 * NOTE(review): no store fence is issued after the streaming stores; check
 * whether _mm_sfence() is needed before the data is consumed elsewhere.
 */
void *memcpy_sse( void *dst, const void *src, size_t len )
{
/* Remember the caller's dst: dst itself is modified below. */
void *retval = dst;
size_t i;
/* Prefetch hints for the first 320 bytes of the source, 64 bytes apart. */
_mm_prefetch((char*) src, _MM_HINT_NTA);
_mm_prefetch((char*) src + 64, _MM_HINT_NTA);
_mm_prefetch((char*) src + 128, _MM_HINT_NTA);
_mm_prefetch((char*) src + 192, _MM_HINT_NTA);
_mm_prefetch((char*) src + 256, _MM_HINT_NTA);
if (len >= MIN_LEN)
{
register unsigned long int delta;
/* Misalignment of dst relative to a 16-byte boundary. */
delta = ((unsigned long int) dst) & (SSE_MMREG_SIZE - 1);
if (delta)
{
/* Number of bytes up to the next 16-byte boundary of dst. */
delta = SSE_MMREG_SIZE - delta;
len -= delta;
/* Copies the head bytes; the macro also writes dst and src back
   (see small_memcpy), which the block loops below depend on. */
small_memcpy(dst, src, delta);
}
/* i = number of whole 64-byte blocks, len = leftover bytes (0..63). */
i = len >> 6;
len &= 63;
/* src not 16-byte aligned: use unaligned loads. */
if (((unsigned long) src) & 15)
for (; i > 0; i--)
{
__m128 xmm0, xmm1, xmm2, xmm3;
_mm_prefetch((char*) src + 320, _MM_HINT_NTA);
/* (float*) src + 4 is 16 bytes past src, so these four loads cover
   one 64-byte block. */
xmm0 = _mm_loadu_ps((float*) src);
xmm1 = _mm_loadu_ps((float*) src + 4);
xmm2 = _mm_loadu_ps((float*) src + 8);
xmm3 = _mm_loadu_ps((float*) src + 12);
_mm_stream_ps((float*) dst, xmm0);
_mm_stream_ps((float*) dst + 4, xmm1);
_mm_stream_ps((float*) dst + 8, xmm2);
_mm_stream_ps((float*) dst + 12, xmm3);
/* Advance both pointers by one block. The __GNUC__ form assigns through
   a cast; the other form does arithmetic directly on void pointers.
   NOTE(review): cast-as-lvalue is a GNU extension that later GCC
   releases dropped -- confirm against the GCC documentation. */
#ifdef __GNUC__
(char*) src += 64;
(char*) dst += 64;
#else
src += 64;
dst += 64;
#endif
}
/* src is 16-byte aligned: same loop with aligned loads. */
else
for (; i > 0; i--)
{
__m128 xmm0, xmm1, xmm2, xmm3;
_mm_prefetch((char*) src + 320, _MM_HINT_NTA);
xmm0 = _mm_load_ps((float*) src);
xmm1 = _mm_load_ps((float*) src + 4);
xmm2 = _mm_load_ps((float*) src + 8);
xmm3 = _mm_load_ps((float*) src + 12);
_mm_stream_ps((float*) dst, xmm0);
_mm_stream_ps((float*) dst + 4, xmm1);
_mm_stream_ps((float*) dst + 8, xmm2);
_mm_stream_ps((float*) dst + 12, xmm3);
/* Advance both pointers by one block (same two forms as above). */
#ifdef __GNUC__
(char*) src += 64;
(char*) dst += 64;
#else
src += 64;
dst += 64;
#endif
}
}
/* Tail: remaining bytes, or the whole copy when len < MIN_LEN. */
if (len)
memcpy(dst, src, len);
return retval;
}
/**
 * Test driver: allocates two buffers of 1024 * 1024 * 10 bytes, copies tmp2
 * into tmp1 with memcpy_sse(), frees both and returns 0. Nothing is printed.
 *
 * NOTE(review): the malloc() results are not checked, and tmp2 is never
 * written before memcpy_sse() reads from it.
 * NOTE(review): "(void*) tmp1 = ..." assigns through a cast, the same
 * cast-as-lvalue form used in memcpy_sse() -- confirm compiler support.
 */
int main(void)
{
char *tmp1, *tmp2;
(void*) tmp1 = malloc(1024 * 1024 * 10);
(void*) tmp2 = malloc(1024 * 1024 * 10);
/* Destination is tmp1, source is tmp2. */
memcpy_sse(tmp1, tmp2, 1024 * 1024 * 10);
free(tmp1);
free(tmp2);
return 0;
}
>Fix:
>Release-Note:
>Audit-Trail:
>Unformatted:
More information about the Gcc-bugs mailing list