[Bug target/46716] [4.3/4.4/4.5/4.6 Regression] bad code generated with -mno-sse2 -m64

mathog at caltech dot edu gcc-bugzilla@gcc.gnu.org
Tue Nov 30 17:50:00 GMT 2010


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46716

--- Comment #5 from David Mathog <mathog at caltech dot edu> 2010-11-30 17:25:01 UTC ---
A (long) side note on how I found this bug - in partial answer to the obvious
question - why would anybody run with -mno-sse2 on an X86_64 platform?  

We have a cluster of Athlon MP machines and one of the applications that run
there is Sean Eddy's HMMER, which is used to search a database called PFAMDIR. 
With version 3 of that software PFAMDIR changed format to only work with the
newer software.  HMMER 3 has a reference version (portable) and an SSE (really
SSE2) version.  I found that the reference version did not give exactly the
same answers as the SSE version, so it wasn't going to be possible to refine
the reference version for that platform and get the exact same results as
everybody else, but the SSE2 version could not run on the target since that
processor has no SSE2.  Since I need to make this work on those old machines I
wrote an SSE2 emulator which is a replacement emmintrin.h (latest version here)

  http://saf.bio.caltech.edu/pub/software/linux_or_unix_tools/soft_emmintrin.h

This would be used instead of the native SSE2 (in theory) by dropping it into a
directory as emmintrin.h and using "-I. -mno-sse2 -DSOFT_SSE2" on the gcc
compile line.  The idea being to get the SSE2 version running on the target and
then optimize from that version for the target, retaining the same numerical
results along the way.  

Since there was a lot of recompiling/retesting during development I used my
fastest machine, which happened to be an Opteron running an X86_64 OS.  To
exercise the software SSE2 code all of the SSE2 tests in the gcc testsuite were
run, and these triggered the present bug due to the default implicit -m64.

I also have done some preliminary work on a soft_xmmintrin.h, but have my
doubts that it is possible to use that successfully in combination with the gcc
vector extension, since many strange things happen when -mno-sse is added to
the command line.  It seems that the gcc vector extension is very much
intertwined with SSE on X86 platforms and perhaps cannot be fully separated
from it.  (A point that is not made at all clear in the documentation.) 

Additionally, with -msse -mno-sse2 -m32 and levels of optimization above -O0,
complex expressions like the following one, used in a real program (with
multiple _mm functions used; the problem does not show up in the testsuite
with single _mm function calls):

#define EMMMIN(a,b)      ((a)<(b)?(a):(b))
#define EMM_UINT1(a)   ((unsigned char *)&(a))
/* vector operation:  returns the minimum of each pair of the
16 8 bit unsigned integers from __A, __B */
static __inline __m128i __attribute__((__always_inline__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  __v16qi __tmp={  EMMMIN(EMM_UINT1(__A)[ 0],EMM_UINT1(__B)[ 0]), 
                   EMMMIN(EMM_UINT1(__A)[ 1],EMM_UINT1(__B)[ 1]),
                   EMMMIN(EMM_UINT1(__A)[ 2],EMM_UINT1(__B)[ 2]),
                   EMMMIN(EMM_UINT1(__A)[ 3],EMM_UINT1(__B)[ 3]),
                   EMMMIN(EMM_UINT1(__A)[ 4],EMM_UINT1(__B)[ 4]),
                   EMMMIN(EMM_UINT1(__A)[ 5],EMM_UINT1(__B)[ 5]),
                   EMMMIN(EMM_UINT1(__A)[ 6],EMM_UINT1(__B)[ 6]),
                   EMMMIN(EMM_UINT1(__A)[ 7],EMM_UINT1(__B)[ 7]), 
                   EMMMIN(EMM_UINT1(__A)[ 8],EMM_UINT1(__B)[ 8]),
                   EMMMIN(EMM_UINT1(__A)[ 9],EMM_UINT1(__B)[ 9]),
                   EMMMIN(EMM_UINT1(__A)[10],EMM_UINT1(__B)[10]),
                   EMMMIN(EMM_UINT1(__A)[11],EMM_UINT1(__B)[11]),
                   EMMMIN(EMM_UINT1(__A)[12],EMM_UINT1(__B)[12]),
                   EMMMIN(EMM_UINT1(__A)[13],EMM_UINT1(__B)[13]),
                   EMMMIN(EMM_UINT1(__A)[14],EMM_UINT1(__B)[14]),
                   EMMMIN(EMM_UINT1(__A)[15],EMM_UINT1(__B)[15])};
  return (__m128i)__tmp;
}

often result in this sort of compiler error:

./msvfilter.c:208: error: unable to find a register to spill in class
'GENERAL_REGS'
./msvfilter.c:208: error: this is the insn:
(insn 1944 1943 1945 46 ../../easel/emmintrin.h:2348 (set
(strict_low_part (subreg:HI (reg:TI 1239) 0))
        (mem:HI (reg/f:SI 96 [ pretmp.1031 ]) [13 S2 A16])) 47
{*movstricthi_1} (nil))
./msvfilter.c:208: confused by earlier errors, bailing out

Simpler (fewer vector elements, less logic) functions did not do this, although
it may be that they would have, had I been able to get past the first error. 
This is, I suspect, again related to an implicit use of SSE2 registers even
though -mno-sse2 had been specified.  This type of error shows up even when
-m32 is specified, so maybe it has a different origin.  In any case, rewriting
the expressions as follows seems to have eliminated this problem even for -O4,
and the primary change was the replacement of the vector {} notation to set the
(same) values.

typedef union {
 __m128i             vi;
 __m128d             vd;
 __m128              vf;
  double             f8[2];
  float              f4[4];
  long long          i8[2];
  int                i4[4];
  short              i2[8];
  char               i1[16];
  unsigned long long u8[2];
  unsigned int       u4[4];
  unsigned short     u2[8];
  unsigned char      u1[16];
} __uni16;
#define EMM_UINT1(a)   (((__uni16)(a)).u1)
#define EMMMIN(a,b)      ((a)<(b)?(a):(b))


/* vector operation:  returns the minimum of each pair of the
16 8 bit unsigned integers from __A, __B */
static __inline __m128i __attribute__((__always_inline__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  __uni16 __tmp;
    __tmp.u1[ 0] =  EMMMIN(EMM_UINT1(__A)[ 0],EMM_UINT1(__B)[ 0]);
    __tmp.u1[ 1] =  EMMMIN(EMM_UINT1(__A)[ 1],EMM_UINT1(__B)[ 1]);
    __tmp.u1[ 2] =  EMMMIN(EMM_UINT1(__A)[ 2],EMM_UINT1(__B)[ 2]);
    __tmp.u1[ 3] =  EMMMIN(EMM_UINT1(__A)[ 3],EMM_UINT1(__B)[ 3]);
    __tmp.u1[ 4] =  EMMMIN(EMM_UINT1(__A)[ 4],EMM_UINT1(__B)[ 4]);
    __tmp.u1[ 5] =  EMMMIN(EMM_UINT1(__A)[ 5],EMM_UINT1(__B)[ 5]);
    __tmp.u1[ 6] =  EMMMIN(EMM_UINT1(__A)[ 6],EMM_UINT1(__B)[ 6]);
    __tmp.u1[ 7] =  EMMMIN(EMM_UINT1(__A)[ 7],EMM_UINT1(__B)[ 7]);
    __tmp.u1[ 8] =  EMMMIN(EMM_UINT1(__A)[ 8],EMM_UINT1(__B)[ 8]);
    __tmp.u1[ 9] =  EMMMIN(EMM_UINT1(__A)[ 9],EMM_UINT1(__B)[ 9]);
    __tmp.u1[10] =  EMMMIN(EMM_UINT1(__A)[10],EMM_UINT1(__B)[10]);
    __tmp.u1[11] =  EMMMIN(EMM_UINT1(__A)[11],EMM_UINT1(__B)[11]);
    __tmp.u1[12] =  EMMMIN(EMM_UINT1(__A)[12],EMM_UINT1(__B)[12]);
    __tmp.u1[13] =  EMMMIN(EMM_UINT1(__A)[13],EMM_UINT1(__B)[13]);
    __tmp.u1[14] =  EMMMIN(EMM_UINT1(__A)[14],EMM_UINT1(__B)[14]);
    __tmp.u1[15] =  EMMMIN(EMM_UINT1(__A)[15],EMM_UINT1(__B)[15]);
  return __tmp.vi;
}



More information about the Gcc-bugs mailing list