c++/7582: Intel intrinsics cause segfault with gcc 3.1.1 and 3.2

dholm@telia.com dholm@telia.com
Mon Aug 12 16:16:00 GMT 2002


>Number:         7582
>Category:       c++
>Synopsis:       Intel intrinsics cause segfault with gcc 3.1.1 and 3.2
>Confidential:   no
>Severity:       critical
>Priority:       medium
>Responsible:    unassigned
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Mon Aug 12 16:16:01 PDT 2002
>Closed-Date:
>Last-Modified:
>Originator:     David Holm
>Release:        gcc version 3.2 2002-07-26 (prerelease)
>Organization:
>Environment:
Gentoo Linux 1.4, Pentium 3 (Coppermine)
>Description:
The following code executes perfectly when compiled with the Intel C++ Compiler v6.0 but segfaults when compiled with gcc 3.1.1 or 3.2 (2002-07-26).
It's compiled with "g++ (-g3) -Wall -msse intrin.cpp -o intrin" and runs without any output.
g++ gives no warnings during compilation.
intrin segfaults on this line "_mm_stream_ps((float*) dst, xmm0);"

"g++ -v" returns:
Reading specs from /usr/lib/gcc-lib/i686-pc-linux-gnu/3.2/specs
Configured with: /var/tmp/portage/gcc-3.2_pre/work/gcc-3.2/configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --enable-shared --host=i686-pc-linux-gnu --build=i686-pc-linux-gnu --target=i686-pc-linux-gnu --with-system-zlib --enable-languages=c,c++,ada,f77,objc,java --enable-threads=posix --enable-long-long --disable-checking --enable-cstdio=stdio --enable-clocale=generic --enable-version-specific-runtime-libs --with-gxx-include-dir=/usr/include/g++-v32 --with-local-prefix=/usr/local --enable-shared --enable-nls --without-included-gettext
Thread model: posix
gcc version 3.2 2002-07-26 (prerelease)

I haven't got 3.1.1 anymore, so I can't give you the -v output from it.
>How-To-Repeat:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <mmintrin.h>
#include <xmmintrin.h>

#define small_memcpy(dst,src,n) \
        register unsigned long int dummy; \
        asm volatile ( \
                "rep; movsb\n\t" \
                :"=&D"(dst), "=&S"(src), "=&c"(dummy) \
        :"0" (dst), "1" (src),"2" (n) \
                : "memory");


/**
 * SIMD Optimized memcpy's are graciously borrowed from DirectFB.
 */

#  define SSE_MMREG_SIZE 16
#  define MIN_LEN 0x40  /* 64-byte blocks */

void *memcpy_sse( void *dst, const void *src, size_t len )
{
        void *retval = dst;
        size_t i;

        _mm_prefetch((char*) src, _MM_HINT_NTA);
        _mm_prefetch((char*) src + 64, _MM_HINT_NTA);
        _mm_prefetch((char*) src + 128, _MM_HINT_NTA);
        _mm_prefetch((char*) src + 192, _MM_HINT_NTA);
        _mm_prefetch((char*) src + 256, _MM_HINT_NTA);

        if (len >= MIN_LEN)
        {
                register unsigned long int delta;
                delta = ((unsigned long int) dst) & (SSE_MMREG_SIZE - 1);
                if (delta)
                {
                        delta = SSE_MMREG_SIZE - delta;
                        len -= delta;
                        small_memcpy(dst, src, delta);
                }
                i = len >> 6;
                len &= 63;

                if (((unsigned long) src) & 15)
                        for (; i > 0; i--)
                        {
                                __m128 xmm0, xmm1, xmm2, xmm3;
                                _mm_prefetch((char*) src + 320, _MM_HINT_NTA);
                                xmm0 = _mm_loadu_ps((float*) src);
                                xmm1 = _mm_loadu_ps((float*) src + 4);
                                xmm2 = _mm_loadu_ps((float*) src + 8);
                                xmm3 = _mm_loadu_ps((float*) src + 12);
                                _mm_stream_ps((float*) dst, xmm0);
                                _mm_stream_ps((float*) dst + 4, xmm1);
                                _mm_stream_ps((float*) dst + 8, xmm2);
                                _mm_stream_ps((float*) dst + 12, xmm3);
#ifdef __GNUC__
                                (char*) src += 64;
                                (char*) dst += 64;
#else
                                src += 64;
                                dst += 64;
#endif
                        }
                else
                        for (; i > 0; i--)
                        {
                                __m128 xmm0, xmm1, xmm2, xmm3;
                                _mm_prefetch((char*) src + 320, _MM_HINT_NTA);
                                xmm0 = _mm_load_ps((float*) src);
                                xmm1 = _mm_load_ps((float*) src + 4);
                                xmm2 = _mm_load_ps((float*) src + 8);
                                xmm3 = _mm_load_ps((float*) src + 12);
                                _mm_stream_ps((float*) dst, xmm0);
                                _mm_stream_ps((float*) dst + 4, xmm1);
                                _mm_stream_ps((float*) dst + 8, xmm2);
                                _mm_stream_ps((float*) dst + 12, xmm3);
#ifdef __GNUC__
                                (char*) src += 64;
                                (char*) dst += 64;
#else
                                src += 64;
                                dst += 64;
#endif
                        }
        }

        if (len)
                memcpy(dst, src, len);

        return retval;
}

int main(void)
{
        char *tmp1, *tmp2;

        (void*) tmp1 = malloc(1024 * 1024 * 10);
        (void*) tmp2 = malloc(1024 * 1024 * 10);

        memcpy_sse(tmp1, tmp2, 1024 * 1024 * 10);

        free(tmp1);
        free(tmp2);

        return 0;
}
>Fix:

>Release-Note:
>Audit-Trail:
>Unformatted:



More information about the Gcc-bugs mailing list