PATCH: Add SSE4.1 testcases

H. J. Lu hjl@lucon.org
Tue May 22 16:11:00 GMT 2007


On Tue, May 22, 2007 at 04:23:01PM +0200, Uros Bizjak wrote:
> On 5/22/07, H. J. Lu <hjl@lucon.org> wrote:
> 
> >> This is all code that needs to be duplicated...
> >
> >I wrote those SSE3/SSSE3 tests. Looking back, I wish I had done the
> >way I did for SSE4.1 tests.
> 
> I have no strong preference for either approach, so it is your call. I
> hope that someone will convert other tests to new approach to maintain
> some consistency between tests.
> 

I am checking in this patch. I will submit another patch to update
other testcases.


H.J.
2007-05-22  H.J. Lu  <hongjiu.lu@intel.com>

	* gcc.dg/i386-cpuid.h (bit_SSE4_1): New.
	(bit_SSE4_2): Likewise.
	(bit_POPCNT): Likewise.

	* gcc.target/i386/i386.exp (check_effective_target_sse4): New.
	Check if assembler supports SSE4 instructions.

	* gcc.target/i386/sse4_1-blendpd.c: New file.
	* gcc.target/i386/sse4_1-blendps.c: Likewise.
	* gcc.target/i386/sse4_1-blendvpd.c: Likewise.
	* gcc.target/i386/sse4_1-blendvps.c: Likewise.
	* gcc.target/i386/sse4_1-check.h: Likewise.
	* gcc.target/i386/sse4_1-dppd-1.c: Likewise.
	* gcc.target/i386/sse4_1-dppd-2.c: Likewise.
	* gcc.target/i386/sse4_1-dpps-1.c: Likewise.
	* gcc.target/i386/sse4_1-dpps-2.c: Likewise.
	* gcc.target/i386/sse4_1-extractps.c: Likewise.
	* gcc.target/i386/sse4_1-insertps-1.c: Likewise.
	* gcc.target/i386/sse4_1-insertps-2.c: Likewise.
	* gcc.target/i386/sse4_1-movntdqa.c: Likewise.
	* gcc.target/i386/sse4_1-mpsadbw.c: Likewise.
	* gcc.target/i386/sse4_1-packusdw.c: Likewise.
	* gcc.target/i386/sse4_1-pblendvb.c: Likewise.
	* gcc.target/i386/sse4_1-pblendw.c: Likewise.
	* gcc.target/i386/sse4_1-pcmpeqq.c: Likewise.
	* gcc.target/i386/sse4_1-pextrb.c: Likewise.
	* gcc.target/i386/sse4_1-pextrd.c: Likewise.
	* gcc.target/i386/sse4_1-pextrq.c: Likewise.
	* gcc.target/i386/sse4_1-pextrw.c: Likewise.
	* gcc.target/i386/sse4_1-phminposuw.c: Likewise.
	* gcc.target/i386/sse4_1-pinsrb.c: Likewise.
	* gcc.target/i386/sse4_1-pinsrd.c: Likewise.
	* gcc.target/i386/sse4_1-pinsrq.c: Likewise.
	* gcc.target/i386/sse4_1-pmaxsb.c: Likewise.
	* gcc.target/i386/sse4_1-pmaxsd.c: Likewise.
	* gcc.target/i386/sse4_1-pmaxud.c: Likewise.
	* gcc.target/i386/sse4_1-pmaxuw.c: Likewise.
	* gcc.target/i386/sse4_1-pminsb.c: Likewise.
	* gcc.target/i386/sse4_1-pminsd.c: Likewise.
	* gcc.target/i386/sse4_1-pminud.c: Likewise.
	* gcc.target/i386/sse4_1-pminuw.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxbd.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxbq.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxbw.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxdq.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxwd.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxwq.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxbd.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxbq.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxbw.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxdq.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxwd.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxwq.c: Likewise.
	* gcc.target/i386/sse4_1-pmuldq.c: Likewise.
	* gcc.target/i386/sse4_1-pmulld.c: Likewise.
	* gcc.target/i386/sse4_1-ptest-1.c: Likewise.
	* gcc.target/i386/sse4_1-ptest-2.c: Likewise.
	* gcc.target/i386/sse4_1-ptest-3.c: Likewise.
	* gcc.target/i386/sse4_1-round.h: Likewise.
	* gcc.target/i386/sse4_1-roundpd-1.c: Likewise.
	* gcc.target/i386/sse4_1-roundpd-2.c: Likewise.
	* gcc.target/i386/sse4_1-roundpd-3.c: Likewise.
	* gcc.target/i386/sse4_1-roundps-1.c: Likewise.
	* gcc.target/i386/sse4_1-roundps-2.c: Likewise.
	* gcc.target/i386/sse4_1-roundps-3.c: Likewise.
	* gcc.target/i386/sse4_1-roundsd-1.c: Likewise.
	* gcc.target/i386/sse4_1-roundsd-2.c: Likewise.
	* gcc.target/i386/sse4_1-roundsd-3.c: Likewise.
	* gcc.target/i386/sse4_1-roundsd-4.c: Likewise.
	* gcc.target/i386/sse4_1-roundss-1.c: Likewise.
	* gcc.target/i386/sse4_1-roundss-2.c: Likewise.
	* gcc.target/i386/sse4_1-roundss-3.c: Likewise.
	* gcc.target/i386/sse4_1-roundss-4.c: Likewise.

--- gcc/testsuite/gcc.dg/i386-cpuid.h.sse41-test	2007-03-06 13:56:23.000000000 -0800
+++ gcc/testsuite/gcc.dg/i386-cpuid.h	2007-05-22 07:45:41.000000000 -0700
@@ -5,6 +5,9 @@
 /* %ecx */
 #define bit_SSE3 (1 << 0)
 #define bit_SSSE3 (1 << 9)
+#define bit_SSE4_1 (1 << 19)
+#define bit_SSE4_2 (1 << 20)
+#define bit_POPCNT (1 << 23)
 
 /* %edx */
 #define bit_CMOV (1 << 15)
--- gcc/testsuite/gcc.target/i386/i386.exp.sse41-test	2007-05-22 06:39:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/i386.exp	2007-05-22 08:56:40.000000000 -0700
@@ -37,6 +37,20 @@ proc check_effective_target_ssse3 { } {
     } "-O2 -mssse3" ]
 }
 
+# Return 1 if sse4 instructions can be compiled.
+proc check_effective_target_sse4 { } {
+    return [check_no_compiler_messages sse4.1 object {
+	typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+	typedef int __v4si __attribute__ ((__vector_size__ (16)));
+
+	__m128i _mm_mullo_epi32 (__m128i __X, __m128i __Y)
+	{
+	    return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X,
+						       (__v4si)__Y);
+	}
+    } "-O2 -msse4.1" ]
+}
+
 # Return 1 if sse4a instructions can be compiled.
 proc check_effective_target_sse4a { } {
     return [check_no_compiler_messages sse4a object {
--- gcc/testsuite/gcc.target/i386/sse4_1-blendpd.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-blendpd.c	2007-05-22 08:51:44.000000000 -0700
@@ -0,0 +1,81 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+#ifndef MASK
+#define MASK 0x03
+#endif
+
+static void
+init_blendpd (double *src1, double *src2)
+{
+  int i, sign = 1;
+
+  for (i = 0; i < NUM * 2; i++)
+    {
+      src1[i] = i * i * sign;
+      src2[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+}
+
+static int
+check_blendpd (__m128d *dst, double *src1, double *src2)
+{
+  double tmp[2];
+  int j;
+
+  memcpy (&tmp[0], src1, sizeof (tmp));
+
+  for(j = 0; j < 2; j++)
+    if ((MASK & (1 << j)))
+      tmp[j] = src2[j];
+
+  return memcmp (dst, &tmp[0], sizeof (tmp));
+}
+
+static void
+sse4_1_test (void)
+{
+  __m128d x, y;
+  union
+    {
+      __m128d x[NUM];
+      double d[NUM * 2];
+    } dst, src1, src2;
+  union
+    {
+      __m128d x;
+      double d[2];
+    } src3;
+  int i;
+
+  init_blendpd (src1.d, src2.d);
+
+  /* Check blendpd imm8, m128, xmm */
+  for (i = 0; i < NUM; i++)
+    {
+      dst.x[i] = _mm_blend_pd (src1.x[i], src2.x[i], MASK);
+      if (check_blendpd (&dst.x[i], &src1.d[i * 2], &src2.d[i * 2]))
+	abort ();
+    }
+    
+  /* Check blendpd imm8, xmm, xmm */
+  src3.x = _mm_setzero_pd ();
+
+  x = _mm_blend_pd (dst.x[2], src3.x, MASK);
+  y = _mm_blend_pd (src3.x, dst.x[2], MASK);
+
+  if (check_blendpd (&x, &dst.d[4], &src3.d[0]))
+    abort ();
+
+  if (check_blendpd (&y, &src3.d[0], &dst.d[4]))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-blendps.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-blendps.c	2007-05-22 08:51:51.000000000 -0700
@@ -0,0 +1,90 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+/* Immediate blend selector: bit J set takes element J from the second
+   operand.  May be predefined to test a different selector.  */
+#ifndef MASK
+#define MASK 0x0f
+#endif
+
+/* Fill SRC1 and SRC2, NUM * 4 floats each, with values whose sign
+   alternates from one element to the next.  */
+static void
+init_blendps (float *src1, float *src2)
+{
+  int i, sign = 1;
+
+  for (i = 0; i < NUM * 4; i++)
+    {
+      src1[i] = i * i * sign;
+      src2[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+}
+
+/* Build the expected blend of the four floats at SRC1 and SRC2: element
+   J is taken from SRC2 when bit J of MASK is set and from SRC1 otherwise.
+   Return 0 if *DST matches it byte for byte, nonzero if not.  */
+static int
+check_blendps (__m128 *dst, float *src1, float *src2)
+{
+  float tmp[4];
+  int j;
+
+  memcpy (&tmp[0], src1, sizeof (tmp));
+  for (j = 0; j < 4; j++)
+    if ((MASK & (1 << j)))
+      tmp[j] = src2[j];
+
+  return memcmp (dst, &tmp[0], sizeof (tmp));
+}
+
+/* Blend with _mm_blend_ps and abort if any result differs from what
+   check_blendps expects.  */
+static void
+sse4_1_test (void)
+{
+  __m128 x, y;
+  union
+    {
+      __m128 x[NUM];
+      float f[NUM * 4];
+    } dst, src1, src2;
+  union
+    {
+      __m128 x;
+      float f[4];
+    } src3;
+  int i;
+
+  init_blendps (src1.f, src2.f);
+
+  /* Check blendps imm8, m128, xmm */
+  for (i = 0; i < NUM; i++)
+    {
+      dst.x[i] = _mm_blend_ps (src1.x[i], src2.x[i], MASK);
+      if (check_blendps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4]))
+	abort ();
+    }
+
+  /* Check blendps imm8, xmm, xmm.  src3 needs a defined value before it
+     is blended and compared, so start from an all-zero vector.  */
+  src3.x = _mm_setzero_ps ();
+
+  x = _mm_blend_ps (dst.x[2], src3.x, MASK);
+  y = _mm_blend_ps (src3.x, dst.x[2], MASK);
+
+  if (check_blendps (&x, &dst.f[8], &src3.f[0]))
+    abort ();
+
+  if (check_blendps (&y, &src3.f[0], &dst.f[8]))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-blendvpd.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-blendvpd.c	2007-05-22 08:51:55.000000000 -0700
@@ -0,0 +1,65 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+static void
+init_blendvpd (double *src1, double *src2, double *mask)
+{
+  int i, msk, sign = 1; 
+
+  msk = -1;
+  for (i = 0; i < NUM * 2; i++)
+    {
+      if((i % 2) == 0)
+	msk++;
+      src1[i] = i* (i + 1) * sign;
+      src2[i] = (i + 20) * sign;
+      mask[i] = (i + 120) * i;
+      if( (msk & (1 << (i % 2))))
+	mask[i] = -mask[i];
+      sign = -sign;
+    }
+}
+
+static int
+check_blendvpd (__m128d *dst, double *src1, double *src2,
+		double *mask)
+{
+  double tmp[2];
+  int j;
+
+  memcpy (&tmp[0], src1, sizeof (tmp));
+  for (j = 0; j < 2; j++)
+    if (mask [j] < 0.0)
+      tmp[j] = src2[j];
+
+  return memcmp (dst, &tmp[0], sizeof (tmp));
+}
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128d x[NUM];
+      double d[NUM * 2];
+    } dst, src1, src2, mask;
+  int i;
+
+  init_blendvpd (src1.d, src2.d, mask.d);
+
+  for (i = 0; i < NUM; i++)
+    {
+      dst.x[i] = _mm_blendv_pd (src1.x[i], src2.x[i], mask.x[i]);
+      if (check_blendvpd (&dst.x[i], &src1.d[i * 2], &src2.d[i * 2],
+			  &mask.d[i * 2]))
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-blendvps.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-blendvps.c	2007-05-22 08:51:59.000000000 -0700
@@ -0,0 +1,65 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+static void
+init_blendvps (float *src1, float *src2, float *mask)
+{
+  int i, msk, sign = 1; 
+
+  msk = -1;
+  for (i = 0; i < NUM * 4; i++)
+    {
+      if((i % 4) == 0)
+	msk++;
+      src1[i] = i* (i + 1) * sign;
+      src2[i] = (i + 20) * sign;
+      mask[i] = (i + 120) * i;
+      if( (msk & (1 << (i % 4))))
+	mask[i] = -mask[i];
+      sign = -sign;
+    }
+}
+
+static int
+check_blendvps (__m128 *dst, float *src1, float *src2,
+		float *mask)
+{
+  float tmp[4];
+  int j;
+
+  memcpy (&tmp[0], src1, sizeof (tmp));
+  for (j = 0; j < 4; j++)
+    if (mask [j] < 0.0)
+      tmp[j] = src2[j];
+
+  return memcmp (dst, &tmp[0], sizeof (tmp));
+}
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x[NUM];
+      float f[NUM * 4];
+    } dst, src1, src2, mask;
+  int i;
+
+  init_blendvps (src1.f, src2.f, mask.f);
+
+  for (i = 0; i < NUM; i++)
+    {
+      dst.x[i] = _mm_blendv_ps (src1.x[i], src2.x[i], mask.x[i]);
+      if (check_blendvps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4],
+			  &mask.f[i * 4]))
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-check.h.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-check.h	2007-05-22 07:48:00.000000000 -0700
@@ -0,0 +1,21 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "../auto-host.h"
+
+#include "../../gcc.dg/i386-cpuid.h"
+
+/* Defined by each test that includes this header.  */
+static void sse4_1_test (void);
+
+/* Call sse4_1_test if the %ecx feature word has bit_SSE4_1 set, then
+   exit with status 0 either way.  */
+int
+main ()
+{
+  /* Run SSE4.1 test only if host has SSE4.1 support.  */
+  if ((i386_cpuid_ecx () & bit_SSE4_1) != 0)
+    sse4_1_test ();
+
+  exit (0);
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-dppd-1.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-dppd-1.c	2007-05-22 08:52:04.000000000 -0700
@@ -0,0 +1,63 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define lmskN  0x00
+#define lmsk0  0x01
+#define lmsk1  0x02
+#define lmsk01 0x03
+
+#define hmskA  0x30
+#define hmsk0  0x10
+#define hmsk1  0x20
+#define hmsk01 0x30
+#define hmskN  0x00
+
+#ifndef HIMASK
+#define HIMASK hmskA
+#endif
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128d x;
+      double d[2];
+    } val1, val2, res[4];
+  int masks[4];
+  int i, j;
+
+  val1.d[0] = 2.;
+  val1.d[1] = 3.;
+
+  val2.d[0] = 10.;
+  val2.d[1] = 100.;
+
+  res[0].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmskN);
+  res[1].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmsk0);
+  res[2].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmsk1);
+  res[3].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmsk01);
+
+  masks[0] = HIMASK | lmskN;
+  masks[1] = HIMASK | lmsk0;
+  masks[2] = HIMASK | lmsk1;
+  masks[3] = HIMASK | lmsk01; 
+
+  for (i = 0; i < 4; i++)
+    {
+      double tmp = 0.;
+
+      for (j = 0; j < 2; j++)
+	if (HIMASK & (0x10 << j))
+	  tmp = tmp + (val1.d[j] * val2.d[j]);
+
+      for (j = 0; j < 2; j++)
+	if ((masks[i] & (1 << j)) && res[i].d[j] != tmp)
+	  abort ();
+   }
+} 
--- gcc/testsuite/gcc.target/i386/sse4_1-dppd-2.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-dppd-2.c	2007-05-22 08:52:08.000000000 -0700
@@ -0,0 +1,69 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+/* Low immediate bits: result lanes that receive the dot product.  */
+#define lmskN  0x00
+#define lmsk0  0x01
+#define lmsk1  0x02
+#define lmsk01 0x03
+
+/* High immediate bits: input lanes that contribute to the dot product.  */
+#define hmskA  0x30
+#define hmsk0  0x10
+#define hmsk1  0x20
+#define hmsk01 0x30
+#define hmskN  0x00
+
+#ifndef HIMASK
+#define HIMASK hmskA
+#endif
+
+#ifndef LOMASK
+#define LOMASK lmsk01
+#endif
+
+/* Run _mm_dp_pd with immediate HIMASK | LOMASK on four identical input
+   pairs and abort unless every result equals the value computed here.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128d x;
+      double d[2];
+    } val1[4], val2[4], res[4], chk[4];
+  int i, j;
+  double tmp;
+
+  for (i = 0; i < 4; i++)
+    {
+      val1[i].d [0] = 2.;
+      val1[i].d [1] = 3.;
+
+      val2[i].d [0] = 10.;
+      val2[i].d [1] = 100.;
+
+      tmp = 0.;
+      for (j = 0; j < 2; j++)
+	if ((HIMASK & (0x10 << j)))
+	  tmp += val1[i].d [j] * val2[i].d [j];
+
+      /* Lanes not selected by LOMASK are zeroed by dppd, so every lane of
+	 chk gets a defined value before the memcmp below.  */
+      for (j = 0; j < 2; j++)
+	chk[i].d[j] = (LOMASK & (1 << j)) ? tmp : 0.;
+    }
+
+  for (i = 0; i < 4; i++)
+    {
+      res[i].x = _mm_dp_pd (val1[i].x, val2[i].x, HIMASK | LOMASK);
+      if (memcmp (&res[i], &chk[i], sizeof (chk[i])))
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-dpps-1.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-dpps-1.c	2007-05-22 08:52:11.000000000 -0700
@@ -0,0 +1,106 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define lmskN  0x00
+#define lmsk0  0x01
+#define lmsk1  0x02
+#define lmsk2  0x04
+#define lmsk3  0x08
+#define lmsk01 0x03
+#define lmsk02 0x05
+#define lmsk03 0x09
+#define lmsk12 0x06
+#define lmsk13 0x0A
+#define lmsk23 0x0C
+#define lmskA  0x0F
+
+#define hmskN  0x00
+#define hmskA  0xF0
+#define hmsk0  0x10
+#define hmsk1  0x20
+#define hmsk2  0x40
+#define hmsk3  0x80
+#define hmsk01 0x30
+#define hmsk02 0x50
+#define hmsk03 0x90
+#define hmsk12 0x60
+#define hmsk13 0xA0
+#define hmsk23 0xC0
+
+#ifndef HIMASK
+#define HIMASK hmskA
+#endif
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } val1, val2, res[16];
+  int masks[16];
+  int i, j;
+
+  val1.f[0] = 2.;
+  val1.f[1] = 3.;
+  val1.f[2] = 4.;
+  val1.f[3] = 5.;
+
+  val2.f[0] = 10.;
+  val2.f[1] = 100.;
+  val2.f[2] = 1000.;
+  val2.f[3] = 10000.;
+
+  res[0].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk0); 
+  res[1].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk1); 
+  res[2].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk2); 
+  res[3].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk3); 
+  res[4].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk01); 
+  res[5].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk02); 
+  res[6].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk03); 
+  res[7].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk12); 
+  res[8].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk13); 
+  res[9].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk23); 
+  res[10].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk0)); 
+  res[11].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk1)); 
+  res[12].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk2)); 
+  res[13].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk3)); 
+  res[14].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskN); 
+  res[15].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskA); 
+
+  masks[0] = HIMASK | lmsk0; 
+  masks[1] = HIMASK | lmsk1; 
+  masks[2] = HIMASK | lmsk2; 
+  masks[3] = HIMASK | lmsk3; 
+  masks[4] = HIMASK | lmsk01; 
+  masks[5] = HIMASK | lmsk02; 
+  masks[6] = HIMASK | lmsk03; 
+  masks[7] = HIMASK | lmsk12; 
+  masks[8] = HIMASK | lmsk13; 
+  masks[9] = HIMASK | lmsk23; 
+  masks[10] = HIMASK | (0x0F & ~lmsk0); 
+  masks[11] = HIMASK | (0x0F & ~lmsk1); 
+  masks[12] = HIMASK | (0x0F & ~lmsk2); 
+  masks[13] = HIMASK | (0x0F & ~lmsk3); 
+  masks[14] = HIMASK | lmskN; 
+  masks[15] = HIMASK | lmskA; 
+
+  for (i = 0; i <= 15; i++)
+    {
+      float tmp = 0.;
+
+      for (j = 0; j < 4; j++)
+	if ((HIMASK & (0x10 << j)))
+	  tmp += val1.f[j] * val2.f[j];
+
+      for (j = 0; j < 4; j++)
+	if ((masks[i] & (1 << j)) && res[i].f[j] != tmp)
+	  abort ();
+   }
+} 
--- gcc/testsuite/gcc.target/i386/sse4_1-dpps-2.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-dpps-2.c	2007-05-22 08:52:14.000000000 -0700
@@ -0,0 +1,88 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+/* Low immediate bits: result lanes that receive the dot product.  */
+#define lmskN  0x00
+#define lmsk0  0x01
+#define lmsk1  0x02
+#define lmsk2  0x04
+#define lmsk3  0x08
+#define lmsk01 0x03
+#define lmsk02 0x05
+#define lmsk03 0x09
+#define lmsk12 0x06
+#define lmsk13 0x0A
+#define lmsk23 0x0C
+#define lmskA  0x0F
+
+/* High immediate bits: input lanes that contribute to the dot product.  */
+#define hmskN  0x00
+#define hmskA  0xF0
+#define hmsk0  0x10
+#define hmsk1  0x20
+#define hmsk2  0x40
+#define hmsk3  0x80
+#define hmsk01 0x30
+#define hmsk02 0x50
+#define hmsk03 0x90
+#define hmsk12 0x60
+#define hmsk13 0xA0
+#define hmsk23 0xC0
+
+#ifndef HIMASK
+#define HIMASK hmskA
+#endif
+
+#ifndef LOMASK
+#define LOMASK lmskA
+#endif
+
+/* Run _mm_dp_ps with immediate HIMASK | LOMASK on sixteen identical input
+   pairs and abort unless every result equals the value computed here.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } val1[16], val2[16], res[16], chk[16];
+  int i, j;
+  float tmp;
+
+  for (i = 0; i < 16; i++)
+    {
+      val1[i].f[0] = 2.;
+      val1[i].f[1] = 3.;
+      val1[i].f[2] = 4.;
+      val1[i].f[3] = 5.;
+
+      val2[i].f[0] = 10.;
+      val2[i].f[1] = 100.;
+      val2[i].f[2] = 1000.;
+      val2[i].f[3] = 10000.;
+
+      tmp = 0.;
+      for (j = 0; j < 4; j++)
+	if ((HIMASK & (0x10 << j)))
+	  tmp += val1[i].f [j] * val2[i].f [j];
+
+      /* Lanes not selected by LOMASK are zeroed by dpps, so every lane of
+	 chk gets a defined value before the memcmp below.  */
+      for (j = 0; j < 4; j++)
+	chk[i].f[j] = (LOMASK & (1 << j)) ? tmp : 0.f;
+    }
+
+  for (i = 0; i < 16; i++)
+    {
+      res[i].x = _mm_dp_ps (val1[i].x, val2[i].x, HIMASK | LOMASK);
+      if (memcmp (&res[i], &chk[i], sizeof (chk[i])))
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-extractps.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-extractps.c	2007-05-22 08:52:19.000000000 -0700
@@ -0,0 +1,64 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+int masks[4];
+
+#define msk0 0x00
+#define msk1 0x01
+#define msk2 0x02
+#define msk3 0x03
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } val1, val2;
+  union
+    {
+      int i;
+      float f;
+    } res[4];
+  float resm[4];
+  int i;
+
+  val1.f[0] = 10.;
+  val1.f[1] = 2.;
+  val1.f[2] = 3.;
+  val1.f[3] = 40.;
+
+  val2.f[0] = 77.;
+  val2.f[1] = 21.;
+  val2.f[2] = 34.;
+  val2.f[3] = 49.;
+
+  res[0].i = _mm_extract_ps (val1.x, msk0);
+  res[1].i = _mm_extract_ps (val1.x, msk1);
+  res[2].i = _mm_extract_ps (val1.x, msk2);
+  res[3].i = _mm_extract_ps (val1.x, msk3);
+
+  _MM_EXTRACT_FLOAT (resm[0], val2.x, msk0);
+  _MM_EXTRACT_FLOAT (resm[1], val2.x, msk1);
+  _MM_EXTRACT_FLOAT (resm[2], val2.x, msk2);
+  _MM_EXTRACT_FLOAT (resm[3], val2.x, msk3);
+  
+  masks[0] = msk0;
+  masks[1] = msk1;
+  masks[2] = msk2;
+  masks[3] = msk3;
+
+  for( i=0; i < 4; i++ )
+    {
+      if (res[i].f != val1.f[masks[i]])
+	abort ();
+      if (resm[i] != val2.f[masks[i]])
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-insertps-1.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-insertps-1.c	2007-05-22 08:52:22.000000000 -0700
@@ -0,0 +1,71 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define msk0 0x01
+#define msk1 0x10
+#define msk2 0x29
+#define msk3 0x30
+
+#define msk4 0xFC
+#define msk5 0x05
+#define msk6 0x0A
+#define msk7 0x0F
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } res[8], val1, val2, tmp;
+  int masks[8];
+  int i, j;
+
+  val2.f[0] = 55.0;
+  val2.f[1] = 55.0;
+  val2.f[2] = 55.0;
+  val2.f[3] = 55.0;
+
+  val1.f[0] = 1.;
+  val1.f[1] = 2.;
+  val1.f[2] = 3.;
+  val1.f[3] = 4.;
+
+  res[0].x = _mm_insert_ps (val2.x, val1.x, msk0);
+  res[1].x = _mm_insert_ps (val2.x, val1.x, msk1);
+  res[2].x = _mm_insert_ps (val2.x, val1.x, msk2);
+  res[3].x = _mm_insert_ps (val2.x, val1.x, msk3);
+
+  masks[0] = msk0;
+  masks[1] = msk1;
+  masks[2] = msk2;
+  masks[3] = msk3;
+
+  for (i = 0; i < 4; i++)
+    res[i + 4].x = _mm_insert_ps (val2.x, val1.x, msk4);
+
+  masks[4] = msk4;
+  masks[5] = msk4;
+  masks[6] = msk4;
+  masks[7] = msk4;
+
+  for (i=0; i < 8; i++)
+    {
+      tmp = val2;
+      tmp.f[(masks[i] & 0x30) >> 4] = val1.f[(masks[i] & 0xC0) >> 6];
+
+      for (j = 0; j < 4; j++)
+	if (masks[i] & (0x1 << j))
+	  tmp.f[j] = 0.f;
+
+      if (memcmp (&res[i], &tmp, sizeof (tmp)))
+	abort ();
+    }
+} 
--- gcc/testsuite/gcc.target/i386/sse4_1-insertps-2.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-insertps-2.c	2007-05-22 08:52:25.000000000 -0700
@@ -0,0 +1,44 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } vals[4], val;
+  int i, j;
+
+  val.f[0]= 1.;
+  val.f[1]= 2.;
+  val.f[2]= 3.;
+  val.f[3]= 4.;
+
+  vals[0].x = _MM_PICK_OUT_PS (val.x, 0);
+  vals[1].x = _MM_PICK_OUT_PS (val.x, 1);
+  vals[2].x = _MM_PICK_OUT_PS (val.x, 2);
+  vals[3].x = _MM_PICK_OUT_PS (val.x, 3);
+
+  for (i = 0; i < 4; i++)
+    for (j = 0; j < 4; j++)
+      if ((j != 0 && vals[i].f[j] != 0)
+	  || (j == 0 && vals[i].f[j] != val.f[i]))
+	abort ();
+
+  if (_MM_MK_INSERTPS_NDX(0, 0, 0x1) != 0x01
+      || _MM_MK_INSERTPS_NDX(0, 1, 0x2) != 0x12
+      || _MM_MK_INSERTPS_NDX(0, 2, 0x3) != 0x23
+      || _MM_MK_INSERTPS_NDX(0, 3, 0x4) != 0x34
+      || _MM_MK_INSERTPS_NDX(1, 0, 0x5) != 0x45
+      || _MM_MK_INSERTPS_NDX(1, 1, 0x6) != 0x56
+      || _MM_MK_INSERTPS_NDX(2, 2, 0x7) != 0xA7
+      || _MM_MK_INSERTPS_NDX(3, 3, 0x8) != 0xF8)
+    abort ();
+} 
--- gcc/testsuite/gcc.target/i386/sse4_1-movntdqa.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-movntdqa.c	2007-05-22 08:52:28.000000000 -0700
@@ -0,0 +1,43 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+static void
+init_movntdqa (int *src)
+{
+  int i, j, sign = 1;
+
+  for (i = 0; i < NUM; i++)
+    for (j = 0; j < 4; j++)
+      {
+	src[i * 4 + j] = j * i * i * sign;
+	sign = -sign;
+      }
+}
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM];
+      int i[NUM * 4];
+    } dst, src;
+  int i;
+
+  init_movntdqa (src.i);
+
+  for (i = 0; i < NUM; i++)
+    dst.x[i] = _mm_stream_load_si128 (&src.x[i]);
+
+  for (i = 0; i < NUM; i++)
+    if (memcmp (&dst.x[i], &src.x[i], sizeof(src.x[i])))
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-mpsadbw.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-mpsadbw.c	2007-05-22 08:52:31.000000000 -0700
@@ -0,0 +1,122 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define msk0 0xC0
+#define msk1 0x01
+#define msk2 0xF2
+#define msk3 0x03
+#define msk4 0x84
+#define msk5 0x05
+#define msk6 0xE6
+#define msk7 0x67
+
+static __m128i
+compute_mpsadbw (unsigned char *v1, unsigned char *v2, int mask)
+{
+  union
+    {
+      __m128i x;
+      unsigned short s[8];
+    } ret;
+  unsigned char s[4];
+  int i, j;
+  int offs1, offs2;
+
+  offs2 = 4 * (mask & 3);
+  for (i = 0; i < 4; i++)
+    s[i] = v2[offs2 + i];
+
+  offs1 = 4 * ((mask & 4) >> 2);
+  for (j = 0; j < 8; j++)
+    {
+      ret.s[j] = 0;
+      for (i = 0; i < 4; i++)
+	ret.s[j] += abs (v1[offs1 + j + i] - s[i]);
+    }
+
+  return ret.x;
+}
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      unsigned int i[4];
+      unsigned char c[16];
+    } val1, val2, val3 [8];
+  __m128i res[8], tmp;
+  unsigned char masks[8];
+  int i;
+
+  val1.i[0] = 0x35251505;
+  val1.i[1] = 0x75655545;
+  val1.i[2] = 0xB5A59585;
+  val1.i[3] = 0xF5E5D5C5;
+
+  val2.i[0] = 0x31211101;
+  val2.i[1] = 0x71615141;
+  val2.i[2] = 0xB1A19181;
+  val2.i[3] = 0xF1E1D1C1;
+
+  for (i=0; i < 8; i++)
+    switch (i % 3)
+      {
+      case 1:
+	val3[i].i[0] = 0xF1E1D1C1;
+	val3[i].i[1] = 0xB1A19181;
+	val3[i].i[2] = 0x71615141;
+	val3[i].i[3] = 0x31211101;
+	break;
+      default:
+	val3[i].x = val2.x;
+	break;
+      }
+
+  /* Check mpsadbw imm8, xmm, xmm.  */
+  res[0] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk0);
+  res[1] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk1);
+  res[2] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk2);
+  res[3] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk3);
+  res[4] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk4);
+  res[5] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk5);
+  res[6] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk6);
+  res[7] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk7);
+
+  masks[0] = msk0;
+  masks[1] = msk1;
+  masks[2] = msk2;
+  masks[3] = msk3;
+  masks[4] = msk4;
+  masks[5] = msk5;
+  masks[6] = msk6;
+  masks[7] = msk7;
+
+  for (i=0; i < 8; i++)
+    {
+      tmp = compute_mpsadbw (val1.c, val2.c, masks[i]);
+      if (memcmp (&tmp, &res[i], sizeof (tmp)))
+	abort ();
+    }
+    
+  /* Check mpsadbw imm8, m128, xmm.  */
+  for (i=0; i < 8; i++)
+    {
+      res[i] = _mm_mpsadbw_epu8 (val1.x, val3[i].x, msk4);
+      masks[i] = msk4;
+    }
+
+  for (i=0; i < 8; i++)
+    {
+      tmp = compute_mpsadbw (val1.c, val3[i].c, masks[i]);
+      if (memcmp (&tmp, &res[i], sizeof (tmp)))
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-packusdw.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-packusdw.c	2007-05-22 08:52:34.000000000 -0700
@@ -0,0 +1,65 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static unsigned short
+int_to_ushort (int iVal)
+{
+  unsigned short sVal;
+
+  if (iVal < 0)
+    sVal = 0;
+  else if (iVal > 0xffff)
+    sVal = 0xffff;
+  else sVal = iVal;
+
+  return sVal;
+}
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } src1, src2;
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned short s[NUM * 2];
+    } dst;
+  int i, sign = 1;
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 4)
+    dst.x[i / 4] = _mm_packus_epi32 (src1.x [i / 4], src2.x [i / 4]);
+
+  for (i = 0; i < NUM; i ++)
+    {
+      int dstIndex;
+      unsigned short sVal;
+
+      sVal = int_to_ushort (src1.i[i]);
+      dstIndex = (i % 4) + (i / 4) * 8;
+      if (sVal != dst.s[dstIndex])
+	abort ();
+
+      sVal = int_to_ushort (src2.i[i]);
+      dstIndex += 4;
+      if (sVal != dst.s[dstIndex])
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pblendvb.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pblendvb.c	2007-05-22 08:52:37.000000000 -0700
@@ -0,0 +1,62 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+static void
+init_pblendvb (unsigned char *src1, unsigned char *src2,
+	       unsigned char *mask)
+{
+  int i, sign = 1; 
+
+  for (i = 0; i < NUM * 16; i++)
+    {
+      src1[i] = i* i * sign;
+      src2[i] = (i + 20) * sign;
+      mask[i] = (i % 3) + ((i * (14 + sign))
+			   ^ (src1[i] | src2[i] | (i*3)));
+      sign = -sign;
+    }
+}
+
+static int
+check_pblendvb (__m128i *dst, unsigned char *src1,
+		unsigned char *src2, unsigned char *mask)
+{
+  unsigned char tmp[16];
+  int j;
+
+  memcpy (&tmp[0], src1, sizeof (tmp));
+  for (j = 0; j < 16; j++)
+    if (mask [j] & 0x80)
+      tmp[j] = src2[j];
+
+  return memcmp (dst, &tmp[0], sizeof (tmp));
+}
+
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM];
+      unsigned char c[NUM * 16];
+    } dst, src1, src2, mask;
+  int i;
+
+  init_pblendvb (src1.c, src2.c, mask.c);
+
+  for (i = 0; i < NUM; i++)
+    {
+      dst.x[i] = _mm_blendv_epi8 (src1.x[i], src2.x[i], mask.x[i]);
+      if (check_pblendvb (&dst.x[i], &src1.c[i * 16], &src2.c[i * 16],
+			  &mask.c[i * 16]))
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pblendw.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pblendw.c	2007-05-22 08:52:40.000000000 -0700
@@ -0,0 +1,80 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+#ifndef MASK
+#define MASK 0x0f
+#endif
+
+/* Fill NUM * 8 shorts of SRC1 and SRC2, flipping the sign every element.  */
+static void
+init_pblendw (short *src1, short *src2)
+{
+  int n, s;
+
+  for (n = 0, s = 1; n < NUM * 8; n++, s = -s)
+    {
+      src1[n] = n * n * s;
+      src2[n] = (n + 20) * s;
+    }
+}
+
+/* Return 0 iff *DST is SRC1 with word K replaced by SRC2 where MASK bit K is set.  */
+static int
+check_pblendw (__m128i *dst, short *src1, short *src2)
+{
+  short expected[8];
+  int k;
+
+  /* Select each word from SRC2 or SRC1 according to the immediate MASK.  */
+  for (k = 0; k < 8; k++)
+    expected[k] = (MASK & (1 << k)) != 0 ? src2[k] : src1[k];
+
+  return memcmp (dst, expected, sizeof (expected));
+}
+
+/* Exercise _mm_blend_epi16 on an array of vectors and on two locals.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM];
+      short s[NUM * 8];
+    } dst, src1, src2;
+  union
+    {
+      __m128i x;
+      short s[8];
+    } zero;
+  __m128i fwd, rev;
+  int v;
+
+  init_pblendw (src1.s, src2.s);
+
+  /* Check pblendw imm8, m128, xmm.  */
+  for (v = 0; v < NUM; v++)
+    {
+      dst.x[v] = _mm_blend_epi16 (src1.x[v], src2.x[v], MASK);
+      if (check_pblendw (dst.x + v, src1.s + v * 8, src2.s + v * 8) != 0)
+	abort ();
+    }
+
+  /* Check pblendw imm8, xmm, xmm: blend vector 2 with zero, both ways.  */
+  zero.x = _mm_setzero_si128 ();
+  fwd = _mm_blend_epi16 (dst.x[2], zero.x, MASK);
+  rev = _mm_blend_epi16 (zero.x, dst.x[2], MASK);
+
+  if (check_pblendw (&fwd, dst.s + 16, zero.s) != 0)
+    abort ();
+
+  if (check_pblendw (&rev, zero.s, dst.s + 16) != 0)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pcmpeqq.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pcmpeqq.c	2007-05-22 08:52:44.000000000 -0700
@@ -0,0 +1,38 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Compare NUM quadword pairs with _mm_cmpeq_epi64 and check each mask.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst, src1, src2;
+  int n, v, s;
+
+  for (n = 0, s = 1; n < NUM; n++, s = -s)
+    {
+      src1.ll[n] = n * n * s;
+      src2.ll[n] = (n + 20) * s;
+    }
+
+  for (v = 0; v < NUM / 2; v++)
+    dst.x[v] = _mm_cmpeq_epi64 (src1.x[v], src2.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    {
+      /* An equal lane must be all ones (-1); any other lane must be zero.  */
+      long long want = (src1.ll[n] == src2.ll[n]) ? -1LL : 0LL;
+      if (dst.ll[n] != want)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pextrb.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pextrb.c	2007-05-22 08:52:47.000000000 -0700
@@ -0,0 +1,80 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define msk0   0
+#define msk1   1
+#define msk2   2
+#define msk3   3
+#define msk4   4
+#define msk5   5
+#define msk6   6
+#define msk7   7
+#define msk8   8
+#define msk9   9
+#define msk10 10
+#define msk11 11
+#define msk12 12
+#define msk13 13
+#define msk14 14
+#define msk15 15
+
+/* Extract every byte of a known vector with _mm_extract_epi8 and compare
+   each result with the same byte read through the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      int i[4];
+      char c[16];
+    } val1;
+  /* Byte selector used for each entry of RES, in the same order.  */
+  int masks[16] =
+    {
+      msk0, msk1, msk2, msk3,
+      msk4, msk5, msk6, msk7,
+      msk8, msk9, msk10, msk11,
+      msk12, msk13, msk14, msk15
+    };
+  int res[16];
+  int k;
+
+  /* On little-endian x86 this stores 0x01 .. 0x10 in bytes 0 .. 15.  */
+  val1.i[0] = 0x04030201;
+  val1.i[1] = 0x08070605;
+  val1.i[2] = 0x0C0B0A09;
+  val1.i[3] = 0x100F0E0D;
+
+  /* One extraction per byte position; every selector is passed as a
+     literal constant.  */
+  res[0] = _mm_extract_epi8 (val1.x, msk0);
+  res[1] = _mm_extract_epi8 (val1.x, msk1);
+  res[2] = _mm_extract_epi8 (val1.x, msk2);
+  res[3] = _mm_extract_epi8 (val1.x, msk3);
+  res[4] = _mm_extract_epi8 (val1.x, msk4);
+  res[5] = _mm_extract_epi8 (val1.x, msk5);
+  res[6] = _mm_extract_epi8 (val1.x, msk6);
+  res[7] = _mm_extract_epi8 (val1.x, msk7);
+  res[8] = _mm_extract_epi8 (val1.x, msk8);
+  res[9] = _mm_extract_epi8 (val1.x, msk9);
+  res[10] = _mm_extract_epi8 (val1.x, msk10);
+  res[11] = _mm_extract_epi8 (val1.x, msk11);
+  res[12] = _mm_extract_epi8 (val1.x, msk12);
+  res[13] = _mm_extract_epi8 (val1.x, msk13);
+  res[14] = _mm_extract_epi8 (val1.x, msk14);
+  res[15] = _mm_extract_epi8 (val1.x, msk15);
+
+  /* Every extracted value must match the byte stored at the selected
+     position.  */
+  for (k = 0; k < 16; k++)
+    {
+      if (res[k] != val1.c[masks[k]])
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pextrd.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pextrd.c	2007-05-22 08:52:52.000000000 -0700
@@ -0,0 +1,43 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define msk0   0
+#define msk1   1
+#define msk2   2
+#define msk3   3
+
+/* Extract each doubleword of a known vector with _mm_extract_epi32 and
+   compare it with the same element read through the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      int i[4];
+    } val1;
+  /* Element selector used for each entry of RES, in the same order.  */
+  int masks[4] = { msk0, msk1, msk2, msk3 };
+  int res[4];
+  int k;
+
+  val1.i[0] = 0x04030201;
+  val1.i[1] = 0x08070605;
+  val1.i[2] = 0x0C0B0A09;
+  val1.i[3] = 0x100F0E0D;
+
+  res[0] = _mm_extract_epi32 (val1.x, msk0);
+  res[1] = _mm_extract_epi32 (val1.x, msk1);
+  res[2] = _mm_extract_epi32 (val1.x, msk2);
+  res[3] = _mm_extract_epi32 (val1.x, msk3);
+
+  /* Every extracted value must match the in-memory element.  */
+  for (k = 0; k < 4; k++)
+    if (res[k] != val1.i[masks[k]])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pextrq.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pextrq.c	2007-05-22 08:52:57.000000000 -0700
@@ -0,0 +1,36 @@
+/* { dg-do run { target { { i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define msk0   0
+#define msk1   1
+
+/* Extract both quadwords of a known vector with _mm_extract_epi64 and
+   compare each with the same element read through the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      long long ll[2];
+    } val1;
+  int masks[2] = { msk0, msk1 };
+  long long res[2];
+  int k;
+
+  val1.ll[0] = 0x0807060504030201LL;
+  val1.ll[1] = 0x100F0E0D0C0B0A09LL;
+
+  res[0] = _mm_extract_epi64 (val1.x, msk0);
+  res[1] = _mm_extract_epi64 (val1.x, msk1);
+
+  /* Every extracted value must match the in-memory element.  */
+  for (k = 0; k < 2; k++)
+    if (res[k] != val1.ll[masks[k]])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pextrw.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pextrw.c	2007-05-22 08:53:00.000000000 -0700
@@ -0,0 +1,56 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define msk0   0
+#define msk1   1
+#define msk2   2
+#define msk3   3
+#define msk4   4
+#define msk5   5
+#define msk6   6
+#define msk7   7
+
+/* Extract every word of a known vector with _mm_extract_epi16 and compare
+   each result with the same word read through the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      int i[4];
+      short s[8];
+    } val1;
+  /* Word selector used for each entry of RES, in the same order.  */
+  int masks[8] =
+    { msk0, msk1, msk2, msk3, msk4, msk5, msk6, msk7 };
+  int res[8];
+  int k;
+
+  /* On little-endian x86 words 0 .. 7 hold 0x0201, 0x0403, ..., 0x100F.  */
+  val1.i[0] = 0x04030201;
+  val1.i[1] = 0x08070605;
+  val1.i[2] = 0x0C0B0A09;
+  val1.i[3] = 0x100F0E0D;
+
+  res[0] = _mm_extract_epi16 (val1.x, msk0);
+  res[1] = _mm_extract_epi16 (val1.x, msk1);
+  res[2] = _mm_extract_epi16 (val1.x, msk2);
+  res[3] = _mm_extract_epi16 (val1.x, msk3);
+  res[4] = _mm_extract_epi16 (val1.x, msk4);
+  res[5] = _mm_extract_epi16 (val1.x, msk5);
+  res[6] = _mm_extract_epi16 (val1.x, msk6);
+  res[7] = _mm_extract_epi16 (val1.x, msk7);
+
+  /* Every extracted value must match the in-memory word.  */
+  for (k = 0; k < 8; k++)
+    {
+      if (res[k] != val1.s[masks[k]])
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-phminposuw.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-phminposuw.c	2007-05-22 08:53:03.000000000 -0700
@@ -0,0 +1,49 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Check _mm_minpos_epu16 (value and index) against a scalar search.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM/8];
+      unsigned short s[NUM];
+    } src;
+  unsigned short minVal[NUM/8];
+  int minInd[NUM/8];
+  unsigned short minValScalar, minIndScalar;
+  int i, j, res;
+
+  for (i = 0; i < NUM; i++)
+    src.s[i] = i * i / (i + i / 3.14 + 1.0);
+
+  for (j = 0; j < NUM / 8; j++)
+    {
+      res = _mm_cvtsi128_si32 (_mm_minpos_epu16 (src.x[j]));
+      minVal[j] = res & 0xffff;
+      minInd[j] = (res >> 16) & 0x7;	/* Index field is 3 bits (0..7).  */
+    }
+
+  for (i = 0; i < NUM; i += 8)
+    {
+      minValScalar = src.s[i];
+      minIndScalar = 0;
+      for (j = i + 1; j < i + 8; j++)
+	if (minValScalar > src.s[j])
+	  {
+	    minValScalar = src.s[j];
+	    minIndScalar = j - i;
+	  }
+      /* Fail when either the value or the index is wrong, not only both.  */
+      if (minValScalar != minVal[i/8] || minIndScalar != minInd[i/8])
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pinsrb.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pinsrb.c	2007-05-22 08:53:06.000000000 -0700
@@ -0,0 +1,102 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define msk0 0x00
+#define msk1 0x01
+#define msk2 0x02
+#define msk3 0x03
+#define msk4 0x04
+#define msk5 0x05
+#define msk6 0x06
+#define msk7 0x07
+#define msk8 0x08
+#define msk9 0x09
+#define mskA 0x0A
+#define mskB 0x0B
+#define mskC 0x0C
+#define mskD 0x0D
+#define mskE 0x0E
+#define mskF 0x0F
+
+/* Insert a byte at every position of a known vector with
+   _mm_insert_epi8 and compare each result with the same store done
+   through the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      unsigned int i[4];
+      unsigned char c[16];
+    } res [16], val, tmp;
+  /* Byte position written by the insertion that produced each RES entry;
+     rewritten before the second check.  */
+  int masks[16] =
+    {
+      msk0, msk1, msk2, msk3,
+      msk4, msk5, msk6, msk7,
+      msk8, msk9, mskA, mskB,
+      mskC, mskD, mskE, mskF
+    };
+  /* Values to insert; only INS[0] is used by the first check.  */
+  unsigned char ins[4] = { 3, 4, 5, 6 };
+  int k;
+
+  /* On little-endian x86 bytes 0 .. 15 hold 0x05, 0x15, ..., 0xF5.  */
+  val.i[0] = 0x35251505;
+  val.i[1] = 0x75655545;
+  val.i[2] = 0xB5A59585;
+  val.i[3] = 0xF5E5D5C5;
+
+  /* Check pinsrb imm8, r32, xmm.  One insertion per byte position, each
+     position passed as a literal constant.  */
+  res[0].x = _mm_insert_epi8 (val.x, ins[0], msk0);
+  res[1].x = _mm_insert_epi8 (val.x, ins[0], msk1);
+  res[2].x = _mm_insert_epi8 (val.x, ins[0], msk2);
+  res[3].x = _mm_insert_epi8 (val.x, ins[0], msk3);
+  res[4].x = _mm_insert_epi8 (val.x, ins[0], msk4);
+  res[5].x = _mm_insert_epi8 (val.x, ins[0], msk5);
+  res[6].x = _mm_insert_epi8 (val.x, ins[0], msk6);
+  res[7].x = _mm_insert_epi8 (val.x, ins[0], msk7);
+  res[8].x = _mm_insert_epi8 (val.x, ins[0], msk8);
+  res[9].x = _mm_insert_epi8 (val.x, ins[0], msk9);
+  res[10].x = _mm_insert_epi8 (val.x, ins[0], mskA);
+  res[11].x = _mm_insert_epi8 (val.x, ins[0], mskB);
+  res[12].x = _mm_insert_epi8 (val.x, ins[0], mskC);
+  res[13].x = _mm_insert_epi8 (val.x, ins[0], mskD);
+  res[14].x = _mm_insert_epi8 (val.x, ins[0], mskE);
+  res[15].x = _mm_insert_epi8 (val.x, ins[0], mskF);
+
+  for (k = 0; k < 16; k++)
+    {
+      /* Expected result: VAL with the byte stored directly.  */
+      tmp.x = val.x;
+      tmp.c[masks[k]] = ins[0];
+      if (memcmp (&tmp, &res[k], sizeof (tmp)) != 0)
+	abort ();
+    }
+
+  /* Check pinsrb imm8, m8, xmm.  Every insertion targets byte 0; only
+     the inserted value varies.  */
+  for (k = 0; k < 16; k++)
+    {
+      res[k].x = _mm_insert_epi8 (val.x, ins[k % 4], msk0);
+      masks[k] = msk0;
+    }
+
+  for (k = 0; k < 16; k++)
+    {
+      /* Expected result: VAL with INS[k % 4] stored at MASKS[k].  */
+      tmp.x = val.x;
+      tmp.c[masks[k]] = ins[k % 4];
+      if (memcmp (&tmp, &res[k], sizeof (tmp)) != 0)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pinsrd.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pinsrd.c	2007-05-22 08:53:09.000000000 -0700
@@ -0,0 +1,65 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define msk0 0x00
+#define msk1 0x01
+#define msk2 0x02
+#define msk3 0x03
+
+/* Insert a doubleword at every position of a known vector with
+   _mm_insert_epi32 and compare with the same store done via the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      unsigned int i[4];
+    } res [4], val, tmp;
+  /* Values to insert; only INS[0] is used by the first check.  */
+  static unsigned int ins[4] = { 3, 4, 5, 6 };
+  /* Element written by the insertion that produced each RES entry.  */
+  int masks[4] = { msk0, msk1, msk2, msk3 };
+  int k;
+
+  val.i[0] = 55;
+  val.i[1] = 55;
+  val.i[2] = 55;
+  val.i[3] = 55;
+
+  /* Check pinsrd imm8, r32, xmm.  */
+  res[0].x = _mm_insert_epi32 (val.x, ins[0], msk0);
+  res[1].x = _mm_insert_epi32 (val.x, ins[0], msk1);
+  res[2].x = _mm_insert_epi32 (val.x, ins[0], msk2);
+  res[3].x = _mm_insert_epi32 (val.x, ins[0], msk3);
+
+  for (k = 0; k < 4; k++)
+    {
+      tmp.x = val.x;
+      tmp.i[masks[k]] = ins[0];
+      if (memcmp (&tmp, &res[k], sizeof (tmp)) != 0)
+	abort ();
+    }
+
+  /* Check pinsrd imm8, m32, xmm.  */
+  for (k = 0; k < 4; k++)
+    {
+      res[k].x = _mm_insert_epi32 (val.x, ins[k], msk0);
+      masks[k] = msk0;
+    }
+
+  for (k = 0; k < 4; k++)
+    {
+      /* Expected result: VAL with INS[k] stored at element MASKS[k].  */
+      tmp.x = val.x;
+      tmp.i[masks[k]] = ins[k];
+      if (memcmp (&tmp, &res[k], sizeof (tmp)) != 0)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pinsrq.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pinsrq.c	2007-05-22 08:53:12.000000000 -0700
@@ -0,0 +1,58 @@
+/* { dg-do run { target { { i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define msk0 0x00
+#define msk1 0x01
+
+/* Insert a quadword at both positions of a known vector with
+   _mm_insert_epi64 and compare with the same store done via the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      unsigned long long ll[2];
+    } res [4], val, tmp;
+  static unsigned long long ins[2] =
+    { 0xAABBAABBAABBAABBLL, 0xCCDDCCDDCCDDCCDDLL };
+  /* Only the first two entries of RES and MASKS are used.  */
+  int masks[4] = { msk0, msk1 };
+  int k;
+
+  val.ll[0] = 0x0807060504030201LL;
+  val.ll[1] = 0x100F0E0D0C0B0A09LL;
+
+  /* Check pinsrq imm8, r64, xmm.  */
+  res[0].x = _mm_insert_epi64 (val.x, ins[0], msk0);
+  res[1].x = _mm_insert_epi64 (val.x, ins[0], msk1);
+
+  for (k = 0; k < 2; k++)
+    {
+      tmp.x = val.x;
+      tmp.ll[masks[k]] = ins[0];
+      if (memcmp (&tmp, &res[k], sizeof (tmp)) != 0)
+	abort ();
+    }
+
+  /* Check pinsrq imm8, m64, xmm.  */
+  for (k = 0; k < 2; k++)
+    {
+      res[k].x = _mm_insert_epi64 (val.x, ins[k], msk0);
+      masks[k] = msk0;
+    }
+
+  for (k = 0; k < 2; k++)
+    {
+      tmp.x = val.x;
+      tmp.ll[masks[k]] = ins[k];
+      if (memcmp (&tmp, &res[k], sizeof (tmp)) != 0)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmaxsb.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmaxsb.c	2007-05-22 08:53:15.000000000 -0700
@@ -0,0 +1,38 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 1024
+
+/* Run _mm_max_epi8 over NUM chars and compare with a scalar maximum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 16];
+      char i[NUM];
+    } dst, src1, src2;
+  int n, v, s;
+  char want;
+
+  for (n = 0, s = 1; n < NUM; n++, s = -s)
+    {
+      src1.i[n] = n * n * s;
+      src2.i[n] = (n + 20) * s;
+    }
+
+  for (v = 0; v < NUM / 16; v++)
+    dst.x[v] = _mm_max_epi8 (src1.x[v], src2.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    {
+      want = src1.i[n] > src2.i[n] ? src1.i[n] : src2.i[n];
+      if (dst.i[n] != want)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmaxsd.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmaxsd.c	2007-05-22 08:53:17.000000000 -0700
@@ -0,0 +1,38 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Run _mm_max_epi32 over NUM ints and compare with a scalar maximum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } dst, src1, src2;
+  int n, v, s;
+  int want;
+
+  for (n = 0, s = 1; n < NUM; n++, s = -s)
+    {
+      src1.i[n] = n * n * s;
+      src2.i[n] = (n + 20) * s;
+    }
+
+  for (v = 0; v < NUM / 4; v++)
+    dst.x[v] = _mm_max_epi32 (src1.x[v], src2.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    {
+      want = src1.i[n] > src2.i[n] ? src1.i[n] : src2.i[n];
+      if (dst.i[n] != want)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmaxud.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmaxud.c	2007-05-22 08:53:20.000000000 -0700
@@ -0,0 +1,39 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Run _mm_max_epu32 over NUM unsigned ints and compare with a scalar max.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned int i[NUM];
+    } dst, src1, src2;
+  int n, v;
+  unsigned int want;
+
+  for (n = 0; n < NUM; n++)
+    {
+      src1.i[n] = n * n;
+      /* Set the top bit except in the first element of each vector.  */
+      src2.i[n] = (n % 4) ? ((n + 20) | 0x80000000) : (n + 20);
+    }
+
+  for (v = 0; v < NUM / 4; v++)
+    dst.x[v] = _mm_max_epu32 (src1.x[v], src2.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    {
+      want = src1.i[n] > src2.i[n] ? src1.i[n] : src2.i[n];
+      if (dst.i[n] != want)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmaxuw.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmaxuw.c	2007-05-22 08:53:23.000000000 -0700
@@ -0,0 +1,39 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Run _mm_max_epu16 over NUM unsigned shorts and compare with a scalar max.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 8];
+      unsigned short i[NUM];
+    } dst, src1, src2;
+  int n, v;
+  unsigned short want;
+
+  for (n = 0; n < NUM; n++)
+    {
+      src1.i[n] = n * n;
+      /* Set the top bit except in the first element of each vector.  */
+      src2.i[n] = (n % 8) ? ((n + 20) | 0x8000) : (n + 20);
+    }
+
+  for (v = 0; v < NUM / 8; v++)
+    dst.x[v] = _mm_max_epu16 (src1.x[v], src2.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    {
+      want = src1.i[n] > src2.i[n] ? src1.i[n] : src2.i[n];
+      if (dst.i[n] != want)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pminsb.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pminsb.c	2007-05-22 08:53:26.000000000 -0700
@@ -0,0 +1,38 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 1024
+
+/* Run _mm_min_epi8 over NUM chars and compare with a scalar minimum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 16];
+      char i[NUM];
+    } dst, src1, src2;
+  int n, v, s;
+  char want;
+
+  for (n = 0, s = 1; n < NUM; n++, s = -s)
+    {
+      src1.i[n] = n * n * s;
+      src2.i[n] = (n + 20) * s;
+    }
+
+  for (v = 0; v < NUM / 16; v++)
+    dst.x[v] = _mm_min_epi8 (src1.x[v], src2.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    {
+      want = src1.i[n] < src2.i[n] ? src1.i[n] : src2.i[n];
+      if (dst.i[n] != want)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pminsd.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pminsd.c	2007-05-22 08:53:29.000000000 -0700
@@ -0,0 +1,38 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Run _mm_min_epi32 over NUM ints and compare with a scalar minimum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } dst, src1, src2;
+  int n, v, s;
+  int want;
+
+  for (n = 0, s = 1; n < NUM; n++, s = -s)
+    {
+      src1.i[n] = n * n * s;
+      src2.i[n] = (n + 20) * s;
+    }
+
+  for (v = 0; v < NUM / 4; v++)
+    dst.x[v] = _mm_min_epi32 (src1.x[v], src2.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    {
+      want = src1.i[n] < src2.i[n] ? src1.i[n] : src2.i[n];
+      if (dst.i[n] != want)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pminud.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pminud.c	2007-05-22 08:53:32.000000000 -0700
@@ -0,0 +1,39 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Run _mm_min_epu32 over NUM unsigned ints and compare with a scalar min.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned int i[NUM];
+    } dst, src1, src2;
+  int n, v;
+  unsigned int want;
+
+  for (n = 0; n < NUM; n++)
+    {
+      src1.i[n] = n * n;
+      /* Set the top bit except in the first element of each vector.  */
+      src2.i[n] = (n % 4) ? ((n + 20) | 0x80000000) : (n + 20);
+    }
+
+  for (v = 0; v < NUM / 4; v++)
+    dst.x[v] = _mm_min_epu32 (src1.x[v], src2.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    {
+      want = src1.i[n] < src2.i[n] ? src1.i[n] : src2.i[n];
+      if (dst.i[n] != want)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pminuw.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pminuw.c	2007-05-22 08:53:35.000000000 -0700
@@ -0,0 +1,39 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Run _mm_min_epu16 over NUM unsigned shorts and compare with a scalar min.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 8];
+      unsigned short i[NUM];
+    } dst, src1, src2;
+  int n, v;
+  unsigned short want;
+
+  for (n = 0; n < NUM; n++)
+    {
+      src1.i[n] = n * n;
+      /* Set the top bit except in the first element of each vector.  */
+      src2.i[n] = (n % 8) ? ((n + 20) | 0x8000) : (n + 20);
+    }
+
+  for (v = 0; v < NUM / 8; v++)
+    dst.x[v] = _mm_min_epu16 (src1.x[v], src2.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    {
+      want = src1.i[n] < src2.i[n] ? src1.i[n] : src2.i[n];
+      if (dst.i[n] != want)
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovsxbd.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovsxbd.c	2007-05-22 08:53:39.000000000 -0700
@@ -0,0 +1,34 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Widen the low four bytes of each vector with _mm_cvtepi8_epi32 and
+   compare every widened element with its source byte.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+      char c[NUM * 4];
+    } dst, src;
+  int n, v, s;
+
+  /* Element N lives in byte N % 4 of vector N / 4 (16 bytes per vector).  */
+  for (n = 0, s = 1; n < NUM; n++, s = -s)
+    src.c[(n / 4) * 16 + (n % 4)] = n * n * s;
+
+  for (v = 0; v < NUM / 4; v++)
+    dst.x[v] = _mm_cvtepi8_epi32 (src.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    if (dst.i[n] != src.c[(n / 4) * 16 + (n % 4)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovsxbq.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovsxbq.c	2007-05-22 08:53:42.000000000 -0700
@@ -0,0 +1,34 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Widen the low two bytes of each vector with _mm_cvtepi8_epi64 and
+   compare every widened element with its source byte.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+      char c[NUM * 8];
+    } dst, src;
+  int n, v, s;
+
+  /* Element N lives in byte N % 2 of vector N / 2 (16 bytes per vector).  */
+  for (n = 0, s = 1; n < NUM; n++, s = -s)
+    src.c[(n / 2) * 16 + (n % 2)] = n * n * s;
+
+  for (v = 0; v < NUM / 2; v++)
+    dst.x[v] = _mm_cvtepi8_epi64 (src.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    if (dst.ll[n] != src.c[(n / 2) * 16 + (n % 2)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovsxbw.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovsxbw.c	2007-05-22 08:53:46.000000000 -0700
@@ -0,0 +1,34 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Widen the low eight bytes of each vector with _mm_cvtepi8_epi16 and
+   compare every widened element with its source byte.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 8];
+      short s[NUM];
+      char c[NUM * 2];
+    } dst, src;
+  int n, v, sgn;
+
+  /* Element N lives in byte N % 8 of vector N / 8 (16 bytes per vector).  */
+  for (n = 0, sgn = 1; n < NUM; n++, sgn = -sgn)
+    src.c[(n / 8) * 16 + (n % 8)] = n * n * sgn;
+
+  for (v = 0; v < NUM / 8; v++)
+    dst.x[v] = _mm_cvtepi8_epi16 (src.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    if (dst.s[n] != src.c[(n / 8) * 16 + (n % 8)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovsxdq.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovsxdq.c	2007-05-22 08:53:49.000000000 -0700
@@ -0,0 +1,34 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Widen the low two ints of each vector with _mm_cvtepi32_epi64 and
+   compare every widened element with its source int.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+      int i[NUM * 2];
+    } dst, src;
+  int n, v, s;
+
+  /* Element N lives in int N % 2 of vector N / 2 (4 ints per vector).  */
+  for (n = 0, s = 1; n < NUM; n++, s = -s)
+    src.i[(n / 2) * 4 + (n % 2)] = n * n * s;
+
+  for (v = 0; v < NUM / 2; v++)
+    dst.x[v] = _mm_cvtepi32_epi64 (src.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    if (dst.ll[n] != src.i[(n / 2) * 4 + (n % 2)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovsxwd.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovsxwd.c	2007-05-22 08:53:57.000000000 -0700
@@ -0,0 +1,34 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Widen the low four shorts of each vector with _mm_cvtepi16_epi32 and
+   compare every widened element with its source short.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+      short s[NUM * 2];
+    } dst, src;
+  int n, v, sgn;
+
+  /* Element N lives in short N % 4 of vector N / 4 (8 shorts per vector).  */
+  for (n = 0, sgn = 1; n < NUM; n++, sgn = -sgn)
+    src.s[(n / 4) * 8 + (n % 4)] = n * n * sgn;
+
+  for (v = 0; v < NUM / 4; v++)
+    dst.x[v] = _mm_cvtepi16_epi32 (src.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    if (dst.i[n] != src.s[(n / 4) * 8 + (n % 4)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovsxwq.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovsxwq.c	2007-05-22 08:54:00.000000000 -0700
@@ -0,0 +1,34 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Widen the low two shorts of each vector with _mm_cvtepi16_epi64 and
+   compare every widened element with its source short.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+      short s[NUM * 4];
+    } dst, src;
+  int n, v, sgn;
+
+  /* Element N lives in short N % 2 of vector N / 2 (8 shorts per vector).  */
+  for (n = 0, sgn = 1; n < NUM; n++, sgn = -sgn)
+    src.s[(n / 2) * 8 + (n % 2)] = n * n * sgn;
+
+  for (v = 0; v < NUM / 2; v++)
+    dst.x[v] = _mm_cvtepi16_epi64 (src.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    if (dst.ll[n] != src.s[(n / 2) * 8 + (n % 2)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovzxbd.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovzxbd.c	2007-05-22 08:54:02.000000000 -0700
@@ -0,0 +1,35 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Widen the low four bytes of each vector with _mm_cvtepu8_epi32 and
+   compare every widened element with its unsigned source byte.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned int i[NUM];
+      unsigned char c[NUM * 4];
+    } dst, src;
+  int n, v;
+
+  /* Elements with N % 4 != 0 also get their top bit set.  */
+  for (n = 0; n < NUM; n++)
+    src.c[(n / 4) * 16 + (n % 4)]
+      = (n % 4) ? ((n * n) | 0x80) : (n * n);
+
+  for (v = 0; v < NUM / 4; v++)
+    dst.x[v] = _mm_cvtepu8_epi32 (src.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    if (dst.i[n] != src.c[(n / 4) * 16 + (n % 4)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovzxbq.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovzxbq.c	2007-05-22 08:54:05.000000000 -0700
@@ -0,0 +1,35 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Widen the low two bytes of each vector with _mm_cvtepu8_epi64 and
+   compare every widened element with its unsigned source byte.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      unsigned long long ll[NUM];
+      unsigned char c[NUM * 8];
+    } dst, src;
+  int n, v;
+
+  /* Elements with N % 2 != 0 also get their top bit set.  */
+  for (n = 0; n < NUM; n++)
+    src.c[(n / 2) * 16 + (n % 2)]
+      = (n % 2) ? ((n * n) | 0x80) : (n * n);
+
+  for (v = 0; v < NUM / 2; v++)
+    dst.x[v] = _mm_cvtepu8_epi64 (src.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    if (dst.ll[n] != src.c[(n / 2) * 16 + (n % 2)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovzxbw.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovzxbw.c	2007-05-22 08:54:07.000000000 -0700
@@ -0,0 +1,35 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Widen the low eight bytes of each vector with _mm_cvtepu8_epi16 and
+   compare every widened element with its unsigned source byte.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 8];
+      unsigned short s[NUM];
+      unsigned char c[NUM * 2];
+    } dst, src;
+  int n, v;
+
+  /* Elements with N % 4 != 0 also get their top bit set.  */
+  for (n = 0; n < NUM; n++)
+    src.c[(n / 8) * 16 + (n % 8)]
+      = (n % 4) ? ((n * n) | 0x80) : (n * n);
+
+  for (v = 0; v < NUM / 8; v++)
+    dst.x[v] = _mm_cvtepu8_epi16 (src.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    if (dst.s[n] != src.c[(n / 8) * 16 + (n % 8)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovzxdq.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovzxdq.c	2007-05-22 08:54:10.000000000 -0700
@@ -0,0 +1,35 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Widen the low two ints of each vector with _mm_cvtepu32_epi64 and
+   compare every widened element with its unsigned source int.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      unsigned long long ll[NUM];
+      unsigned int i[NUM * 2];
+    } dst, src;
+  int n, v;
+
+  /* Elements with N % 2 != 0 also get their top bit set.  */
+  for (n = 0; n < NUM; n++)
+    src.i[(n / 2) * 4 + (n % 2)]
+      = (n % 2) ? ((n * n) | 0x80000000) : (n * n);
+
+  for (v = 0; v < NUM / 2; v++)
+    dst.x[v] = _mm_cvtepu32_epi64 (src.x[v]);
+
+  for (n = 0; n < NUM; n++)
+    if (dst.ll[n] != src.i[(n / 2) * 4 + (n % 2)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovzxwd.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovzxwd.c	2007-05-22 08:54:12.000000000 -0700
@@ -0,0 +1,35 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Verify _mm_cvtepu16_epi32: the four low 16-bit source elements of each
+   vector must reappear as 32-bit elements of equal unsigned value.  */
+static void sse4_1_test (void)
+{
+  union
+    {
+      unsigned short word[NUM * 2];
+      unsigned int dword[NUM];
+      __m128i vec[NUM / 4];
+    } dst, src;
+  int k, v;
+
+  for (k = 0; k < NUM; k++)
+    {
+      int pos = (k / 4) * 8 + (k % 4);
+      src.word[pos] = k * k;
+      if (k % 4 != 0)
+        src.word[pos] |= 0x8000;
+    }
+  for (v = 0; v < NUM / 4; v++)
+    dst.vec[v] = _mm_cvtepu16_epi32 (src.vec[v]);
+  for (k = 0; k < NUM; k++)
+    if (dst.dword[k] != src.word[(k / 4) * 8 + (k % 4)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmovzxwq.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmovzxwq.c	2007-05-22 08:54:14.000000000 -0700
@@ -0,0 +1,35 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 128
+
+/* Verify _mm_cvtepu16_epi64: the two low 16-bit source elements of each
+   vector must reappear as 64-bit elements of equal unsigned value.  */
+static void sse4_1_test (void)
+{
+  union
+    {
+      unsigned short word[NUM * 4];
+      unsigned long long qword[NUM];
+      __m128i vec[NUM / 2];
+    } dst, src;
+  int k, v;
+
+  for (k = 0; k < NUM; k++)
+    {
+      int pos = (k / 2) * 8 + (k % 2);
+      src.word[pos] = k * k;
+      if (k % 2 != 0)
+        src.word[pos] |= 0x8000;
+    }
+  for (v = 0; v < NUM / 2; v++)
+    dst.vec[v] = _mm_cvtepu16_epi64 (src.vec[v]);
+  for (k = 0; k < NUM; k++)
+    if (dst.qword[k] != src.word[(k / 2) * 8 + (k % 2)])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmuldq.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmuldq.c	2007-05-22 08:54:16.000000000 -0700
@@ -0,0 +1,43 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Verify _mm_mul_epi32: product k must equal the signed 64-bit product
+   of the 32-bit elements at even index 2 * k of the two sources.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst;
+  union
+    {
+      __m128i x[NUM / 2];
+      int i[NUM * 2];
+    } src1, src2;
+  int i, sign = 1;
+
+  /* Fill every even slot up to NUM * 2 - 2; the check reads them all.  */
+  for (i = 0; i < NUM * 2; i += 2)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+  for (i = 0; i < NUM; i += 2)
+    dst.x[i / 2] = _mm_mul_epi32 (src1.x[i / 2], src2.x[i / 2]);
+  for (i = 0; i < NUM; i++)
+    {
+      long long value = (long long) src1.i[i * 2] * (long long) src2.i[i * 2];
+      if (value != dst.ll[i])
+	abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-pmulld.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-pmulld.c	2007-05-22 08:54:19.000000000 -0700
@@ -0,0 +1,38 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Verify _mm_mullo_epi32: each 32-bit result element must equal the int
+   product of the matching elements of the two sources.  */
+static void sse4_1_test (void)
+{
+  union
+    {
+      int elem[NUM];
+      __m128i vec[NUM / 4];
+    } dst, src1, src2;
+  int k, v;
+
+  /* Both factors carry the same sign, which flips on every element.  */
+  for (k = 0; k < NUM; k++)
+    {
+      int sign = (k % 2 == 0) ? 1 : -1;
+      src1.elem[k] = k * k * sign;
+      src2.elem[k] = (k + 20) * sign;
+    }
+
+  for (v = 0; v < NUM / 4; v++)
+    dst.vec[v] = _mm_mullo_epi32 (src1.vec[v], src2.vec[v]);
+  for (k = 0; k < NUM; k++)
+    {
+      int expected = src1.elem[k] * src2.elem[k];
+      if (expected != dst.elem[k])
+        abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-ptest-1.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-ptest-1.c	2007-05-22 08:54:21.000000000 -0700
@@ -0,0 +1,109 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Reference for _mm_testz_si128: return 1 when M and V have no set bit
+   in common, 0 otherwise.  */
+static int
+make_ptestz (__m128i m, __m128i v)
+{
+  union
+    {
+      unsigned char byte[16];
+      __m128i vec;
+    } lhs, rhs;
+  int k;
+
+  lhs.vec = m;
+  rhs.vec = v;
+
+  /* A single overlapping bit in any byte decides the result.  */
+  for (k = 0; k < 16; k++)
+    if ((lhs.byte[k] & rhs.byte[k]) != 0)
+      return 0;
+
+  return 1;
+}
+
+/* Reference for _mm_testc_si128: return 1 when every bit set in V is
+   also set in M, 0 otherwise.  */
+static int
+make_ptestc (__m128i m, __m128i v)
+{
+  union
+    {
+      unsigned char byte[16];
+      __m128i vec;
+    } lhs, rhs;
+  int k;
+
+  lhs.vec = m;
+  rhs.vec = v;
+
+  /* A bit of V missing from M in any byte decides the result.  */
+  for (k = 0; k < 16; k++)
+    if ((rhs.byte[k] & ~lhs.byte[k]) != 0)
+      return 0;
+
+  return 1;
+}
+
+/* Run _mm_testz_si128 and _mm_testc_si128 on every ordered pair of four
+   bit patterns and compare each recorded result with the byte-wise
+   reference helpers make_ptestz and make_ptestc.  */
+static void
+sse4_1_test (void)
+{
+  /* Two disjoint sparse patterns, all zeros and all ones.  */
+  static const unsigned int pattern[4][4] =
+    {
+      { 0x11111111, 0x00000000, 0x00000000, 0x11111111 },
+      { 0x00000000, 0x11111111, 0x11111111, 0x00000000 },
+      { 0, 0, 0, 0 },
+      { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
+    };
+
+  union
+    {
+      unsigned int dword[4];
+      __m128i vec;
+    } val[4];
+  int res[32];
+  int a, b, n;
+
+  for (a = 0; a < 4; a++)
+    for (b = 0; b < 4; b++)
+      val[a].dword[b] = pattern[a][b];
+
+  /* First pass: record the intrinsic results, two per pair.  */
+  n = 0;
+  for (a = 0; a < 4; a++)
+    for (b = 0; b < 4; b++)
+      {
+        res[n++] = _mm_testz_si128 (val[b].vec, val[a].vec);
+        res[n++] = _mm_testc_si128 (val[b].vec, val[a].vec);
+      }
+
+  /* Second pass: walk the same order against the references.  */
+  n = 0;
+  for (a = 0; a < 4; a++)
+    for (b = 0; b < 4; b++)
+      {
+        if (res[n++] != make_ptestz (val[b].vec, val[a].vec))
+          abort ();
+        if (res[n++] != make_ptestc (val[b].vec, val[a].vec))
+          abort ();
+      }
+
+  /* Slots 2 and 3 belong to the (val[1], val[0]) pair; evaluate that
+     pair once more outside the loops.  */
+  if (res[2] != _mm_testz_si128 (val[1].vec, val[0].vec))
+    abort ();
+
+  if (res[3] != _mm_testc_si128 (val[1].vec, val[0].vec))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-ptest-2.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-ptest-2.c	2007-05-22 08:54:23.000000000 -0700
@@ -0,0 +1,88 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Reference for _mm_testnzc_si128: return 1 only when V has at least
+   one bit in common with M and at least one bit absent from M.  */
+static int
+make_ptestnzc (__m128i m, __m128i v)
+{
+  union
+    {
+      unsigned char byte[16];
+      __m128i vec;
+    } lhs, rhs;
+  int k;
+  int shared = 0, extra = 0;
+
+  lhs.vec = m;
+  rhs.vec = v;
+
+  for (k = 0; k < 16; k++)
+    {
+      shared |= lhs.byte[k] & rhs.byte[k];
+      extra |= ~lhs.byte[k] & rhs.byte[k];
+    }
+
+  return shared != 0 && extra != 0;
+}
+
+/* Run _mm_testnzc_si128 on every ordered pair of four bit patterns and
+   compare each recorded result with the byte-wise reference helper
+   make_ptestnzc.  Every pair is evaluated and recorded twice.  */
+static void
+sse4_1_test (void)
+{
+  /* Two disjoint sparse patterns, all zeros and all ones.  */
+  static const unsigned int pattern[4][4] =
+    {
+      { 0x11111111, 0x00000000, 0x00000000, 0x11111111 },
+      { 0x00000000, 0x11111111, 0x11111111, 0x00000000 },
+      { 0, 0, 0, 0 },
+      { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
+    };
+
+  union
+    {
+      unsigned int dword[4];
+      __m128i vec;
+    } val[4];
+  int res[32];
+  int a, b, n;
+
+  for (a = 0; a < 4; a++)
+    for (b = 0; b < 4; b++)
+      val[a].dword[b] = pattern[a][b];
+
+  /* First pass: record the intrinsic results, two per pair.  */
+  n = 0;
+  for (a = 0; a < 4; a++)
+    for (b = 0; b < 4; b++)
+      {
+        res[n++] = _mm_testnzc_si128 (val[b].vec, val[a].vec);
+        res[n++] = _mm_testnzc_si128 (val[b].vec, val[a].vec);
+      }
+
+  /* Second pass: walk the same order against the reference.  */
+  n = 0;
+  for (a = 0; a < 4; a++)
+    for (b = 0; b < 4; b++)
+      {
+        if (res[n++] != make_ptestnzc (val[b].vec, val[a].vec))
+          abort ();
+        if (res[n++] != make_ptestnzc (val[b].vec, val[a].vec))
+          abort ();
+      }
+
+  /* Slots 2 and 3 belong to the (val[1], val[0]) pair; evaluate that
+     pair once more outside the loops.  */
+  if (res[2] != _mm_testnzc_si128 (val[1].vec, val[0].vec))
+    abort ();
+
+  if (res[3] != _mm_testnzc_si128 (val[1].vec, val[0].vec))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-ptest-3.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-ptest-3.c	2007-05-22 08:54:25.000000000 -0700
@@ -0,0 +1,77 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Check _mm_test_all_zeros, _mm_test_all_ones and
+   _mm_test_mix_ones_zeros on four fixed bit patterns against
+   hand-written expected results.  */
+static void
+sse4_1_test (void)
+{
+  /* Expected results per pattern: all_zeros accepts only pattern 2,
+     all_ones only pattern 3, mix_ones_zeros only patterns 0 and 1.  */
+  static const int want_zeros[4] = { 0, 0, 1, 0 };
+  static const int want_ones[4] = { 0, 0, 0, 1 };
+  static const int want_mixed[4] = { 1, 1, 0, 0 };
+  union
+    {
+      unsigned int dword[4];
+      __m128i vec;
+    } val[4];
+  int got_zeros[4];
+  int got_ones[4];
+  int got_mixed[4];
+  __m128i all_set;
+  int k;
+
+  /* Pattern 0: only the outer dwords carry bits.  */
+  val[0].dword[0] = 0x11111111;
+  val[0].dword[1] = 0x00000000;
+  val[0].dword[2] = 0x00000000;
+  val[0].dword[3] = 0x11111111;
+
+  /* Pattern 1: only the inner dwords carry bits.  */
+  val[1].dword[0] = 0x00000000;
+  val[1].dword[1] = 0x11111111;
+  val[1].dword[2] = 0x11111111;
+  val[1].dword[3] = 0x00000000;
+
+  /* Pattern 2: every bit clear.  */
+  val[2].dword[0] = 0;
+  val[2].dword[1] = 0;
+  val[2].dword[2] = 0;
+  val[2].dword[3] = 0;
+
+  /* Pattern 3: every bit set.  */
+  val[3].dword[0] = 0xffffffff;
+  val[3].dword[1] = 0xffffffff;
+  val[3].dword[2] = 0xffffffff;
+  val[3].dword[3] = 0xffffffff;
+
+  for (k = 0; k < 4; k++)
+    got_zeros[k] = _mm_test_all_zeros (val[k].vec, val[k].vec);
+
+  for (k = 0; k < 4; k++)
+    got_ones[k] = _mm_test_all_ones (val[k].vec);
+
+  /* Comparing a vector with itself sets every bit, giving the all-ones
+     second operand for the mixed test.  */
+  all_set = _mm_cmpeq_epi32 (val[0].vec, val[0].vec);
+  for (k = 0; k < 4; k++)
+    got_mixed[k] = _mm_test_mix_ones_zeros (val[k].vec, all_set);
+
+  /* Any mismatch with the expected tables fails the test.  */
+  for (k = 0; k < 4; k++)
+    {
+      if (got_zeros[k] != want_zeros[k])
+        abort ();
+      if (got_ones[k] != want_ones[k])
+        abort ();
+      if (got_mixed[k] != want_mixed[k])
+        abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-round.h.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-round.h	2007-05-22 07:45:41.000000000 -0700
@@ -0,0 +1,95 @@
+#include <smmintrin.h>
+#include <math.h>
+
+#define NUM 64
+
+static void
+init_round (FP_T *src)	/* Fill src[0..NUM-1] with inputs of alternating sign.  */
+{
+  int i, sign = 1;
+  FP_T f = rand ();	/* running scale factor, updated inside the loop */
+  /* Element i is (i + 1) * f * M_PI, negated on every other index.  */
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1)* f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+          if ((i % 6) == 0)
+	    f = f * src[i];	/* first half: scale f by the element just stored */
+        }
+      else if (i == (NUM / 2))
+	f = rand ();	/* midpoint: draw a fresh scale factor */
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI *sign);	/* second half: reciprocal of a product with the element just stored */
+      sign = -sign;
+    }
+}
+
+static FP_T
+do_round (FP_T f, int type)	/* x87 reference: round F with frndint, mode chosen by TYPE.  */
+{
+  short saved_cw, new_cw, clr_mask;	/* x87 control-word images and the keep-mask */
+  FP_T ret;
+  /* Bit 2 of TYPE set: new_cw ends up equal to the saved control word.  */
+  if ((type & 4))
+    {
+      type = 0;
+      clr_mask = 0xFFFF;
+    }
+  else
+    {
+      type = 0x003F | ((type & 3) << 10);	/* set bits 0-5 (x87 exception masks), TYPE & 3 into bits 10-11 (rounding control) */
+      clr_mask = ~0x0C3F;	/* clear bits 0-5 and 10-11 before OR-ing TYPE in */
+    }
+  /* Push F; the value stays on the x87 stack across the asm statements below.  */
+  __asm__ ("fld" ASM_SUFFIX " %0" : : "m" (*&f));
+  /* Save the control word, then install the modified copy.  */
+  __asm__ ("fstcw %0" : "=m" (*&saved_cw));
+  new_cw = saved_cw & clr_mask;
+  new_cw |= type;
+  __asm__ ("fldcw %0" : : "m" (*&new_cw));
+  /* Round the stack top, pop it into RET, then restore the saved control word.  */
+  __asm__ ("frndint\n"
+	   "fstp" ASM_SUFFIX " %0\n" : "=m" (*&ret));
+  __asm__ ("fldcw %0" : : "m" (*&saved_cw));
+  return ret;
+}
+
+/* Round NUM inputs with ROUND_INTRIN and compare the checked elements
+   with the x87 reference do_round; also pin the _MM_FROUND_* values.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      FP_T elem[NUM];
+      VEC_T vec[NUM / LOOP_INCREMENT];
+    } dst, src;
+  int k, v;
+
+  init_round (src.elem);
+  for (v = 0; v < NUM / LOOP_INCREMENT; v++)
+    dst.vec[v] = ROUND_INTRIN (src.vec[v], ROUND_MODE);
+
+  for (k = 0; k < NUM; k += CHECK_LOOP_INCREMENT)
+    {
+      FP_T expected = do_round (src.elem[k], CHECK_ROUND_MODE);
+      if (expected != dst.elem[k])
+        abort ();
+    }
+
+  if (!(_MM_FROUND_TO_NEAREST_INT == 0x00
+        && _MM_FROUND_TO_NEG_INF == 0x01
+        && _MM_FROUND_TO_POS_INF == 0x02
+        && _MM_FROUND_TO_ZERO == 0x03
+        && _MM_FROUND_CUR_DIRECTION == 0x04
+        && _MM_FROUND_RAISE_EXC == 0x00
+        && _MM_FROUND_NO_EXC == 0x08
+        && _MM_FROUND_NINT == 0x00
+        && _MM_FROUND_FLOOR == 0x01
+        && _MM_FROUND_CEIL == 0x02
+        && _MM_FROUND_TRUNC == 0x03
+        && _MM_FROUND_RINT == 0x04
+        && _MM_FROUND_NEARBYINT == 0x0C))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-roundpd-1.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundpd-1.c	2007-05-22 08:54:28.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128d	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T double	/* scalar element type */
+#define ASM_SUFFIX "l"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN(x, mode) _mm_ceil_pd(x)	/* operation under test; mode is dropped */
+#define ROUND_MODE _MM_FROUND_CEIL	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x02	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 2	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 1	/* stride of the check loop: every element */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundpd-2.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundpd-2.c	2007-05-22 08:54:30.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128d	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T double	/* scalar element type */
+#define ASM_SUFFIX "l"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN _mm_round_pd	/* operation under test; called with (vector, ROUND_MODE) */
+#define ROUND_MODE _MM_FROUND_NINT	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x00	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 2	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 1	/* stride of the check loop: every element */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundpd-3.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundpd-3.c	2007-05-22 08:54:32.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128d	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T double	/* scalar element type */
+#define ASM_SUFFIX "l"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN(x, mode) _mm_floor_pd(x)	/* operation under test; mode is dropped */
+#define ROUND_MODE _MM_FROUND_FLOOR	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x01	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 2	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 1	/* stride of the check loop: every element */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundps-1.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundps-1.c	2007-05-22 08:54:34.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T float	/* scalar element type */
+#define ASM_SUFFIX "s"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN(x, mode) _mm_ceil_ps(x)	/* operation under test; mode is dropped */
+#define ROUND_MODE _MM_FROUND_CEIL	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x02	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 4	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 1	/* stride of the check loop: every element */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundps-2.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundps-2.c	2007-05-22 08:54:37.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T float	/* scalar element type */
+#define ASM_SUFFIX "s"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN _mm_round_ps	/* operation under test; called with (vector, ROUND_MODE) */
+#define ROUND_MODE _MM_FROUND_NINT	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x00	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 4	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 1	/* stride of the check loop: every element */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundps-3.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundps-3.c	2007-05-22 08:54:39.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T float	/* scalar element type */
+#define ASM_SUFFIX "s"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN(x, mode) _mm_floor_ps(x)	/* operation under test; mode is dropped */
+#define ROUND_MODE _MM_FROUND_FLOOR	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x01	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 4	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 1	/* stride of the check loop: every element */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundsd-1.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundsd-1.c	2007-05-22 08:54:41.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128d	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T double	/* scalar element type */
+#define ASM_SUFFIX "l"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN(x, mode) _mm_ceil_sd(x, x)	/* operation under test; same vector for both operands, mode is dropped */
+#define ROUND_MODE _MM_FROUND_CEIL	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x02	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 2	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 2	/* stride of the check loop: element 0 of each vector */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundsd-2.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundsd-2.c	2007-05-22 08:54:44.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128d	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T double	/* scalar element type */
+#define ASM_SUFFIX "l"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN(x, mode) _mm_round_sd(x, x, mode)	/* operation under test; same vector for both operands */
+#define ROUND_MODE _MM_FROUND_NINT	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x00	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 2	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 2	/* stride of the check loop: element 0 of each vector */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundsd-3.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundsd-3.c	2007-05-22 08:54:46.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128d	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T double	/* scalar element type */
+#define ASM_SUFFIX "l"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN(x, mode) _mm_floor_sd(x, x)	/* operation under test; same vector for both operands, mode is dropped */
+#define ROUND_MODE _MM_FROUND_FLOOR	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x01	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 2	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 2	/* stride of the check loop: element 0 of each vector */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundsd-4.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundsd-4.c	2007-05-22 08:54:48.000000000 -0700
@@ -0,0 +1,91 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <math.h>
+#include <string.h>
+
+#define NUM 64
+
+static void
+init_round (double *src)	/* Fill src[0..NUM-1] with inputs of alternating sign.  */
+{
+  int i, sign = 1;
+  double d = rand ();	/* running scale factor, updated inside the loop */
+  /* Element i is (i + 1) * d * M_PI, negated on every other index.  */
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1)* d * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+          if ((i % 6) == 0)
+	    d = d * src[i];	/* first half: scale d by the element just stored */
+        }
+      else if (i == (NUM / 2))
+	d = rand ();	/* midpoint: draw a fresh scale factor */
+      else if ((i % 6) == 0)
+	d = 1 / (d * (i + 1) * src[i] * M_PI *sign);	/* second half: reciprocal of a product with the element just stored */
+      sign = -sign;
+    }
+}
+
+static double
+do_round (double f, int type)	/* x87 reference: round F with frndint, mode chosen by TYPE.  */
+{
+  short saved_cw, new_cw, clr_mask;	/* x87 control-word images and the keep-mask */
+  double ret;
+  /* Bit 2 of TYPE set: new_cw ends up equal to the saved control word.  */
+  if ((type & 4))
+    {
+      type = 0;
+      clr_mask = 0xFFFF;
+    }
+  else
+    {
+      type = 0x003F | ((type & 3) << 10);	/* set bits 0-5 (x87 exception masks), TYPE & 3 into bits 10-11 (rounding control) */
+      clr_mask = ~0x0C3F;	/* clear bits 0-5 and 10-11 before OR-ing TYPE in */
+    }
+  /* Push F; the value stays on the x87 stack across the asm statements below.  */
+  __asm__ ("fldl %0" : : "m" (*&f));
+  /* Save the control word, then install the modified copy.  */
+  __asm__ ("fstcw %0" : "=m" (*&saved_cw));
+  new_cw = saved_cw & clr_mask;
+  new_cw |= type;
+  __asm__ ("fldcw %0" : : "m" (*&new_cw));
+  /* Round the stack top, pop it into RET, then restore the saved control word.  */
+  __asm__ ("frndint\n"
+	   "fstpl %0\n" : "=m" (*&ret));
+  __asm__ ("fldcw %0" : : "m" (*&saved_cw));
+  return ret;
+}
+
+/* Check _mm_round_sd with _MM_FROUND_TRUNC: the low element must match
+   the x87 reference and the high element must stay zero.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      double elem[NUM];
+      __m128d vec[NUM / 2];
+    } dst, src;
+  int k, v;
+
+  init_round (src.elem);
+  memset (&dst, 0, sizeof (dst));
+
+  for (v = 0; v < NUM / 2; v++)
+    dst.vec[v] = _mm_round_sd (dst.vec[v], src.vec[v], _MM_FROUND_TRUNC);
+
+  for (k = 0; k < NUM; k += 2)
+    {
+      /* The high half is passed through from the zeroed destination.  */
+      if (dst.elem[k + 1] != 0.0)
+        abort ();
+      if (do_round (src.elem[k], 0x03) != dst.elem[k])
+        abort ();
+    }
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-roundss-1.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundss-1.c	2007-05-22 08:54:50.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T float	/* scalar element type */
+#define ASM_SUFFIX "s"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN(x, mode) _mm_ceil_ss(x, x)	/* operation under test; same vector for both operands, mode is dropped */
+#define ROUND_MODE _MM_FROUND_CEIL	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x02	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 4	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 4	/* stride of the check loop: element 0 of each vector */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundss-2.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundss-2.c	2007-05-22 08:54:52.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T float	/* scalar element type */
+#define ASM_SUFFIX "s"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN(x, mode) _mm_round_ss(x, x, mode)	/* operation under test; same vector for both operands */
+#define ROUND_MODE _MM_FROUND_NINT	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x00	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 4	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 4	/* stride of the check loop: element 0 of each vector */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundss-3.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundss-3.c	2007-05-22 08:54:57.000000000 -0700
@@ -0,0 +1,18 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128	/* vector type of the arrays in sse4_1-round.h */
+#define FP_T float	/* scalar element type */
+#define ASM_SUFFIX "s"	/* pasted onto fld/fstp in do_round */
+
+#define ROUND_INTRIN(x, mode) _mm_floor_ss(x, x)	/* operation under test; same vector for both operands, mode is dropped */
+#define ROUND_MODE _MM_FROUND_FLOOR	/* mode argument given to ROUND_INTRIN */
+#define CHECK_ROUND_MODE 0x01	/* mode given to the do_round reference */
+
+#define LOOP_INCREMENT 4	/* NUM / LOOP_INCREMENT vectors are rounded */
+#define CHECK_LOOP_INCREMENT 4	/* stride of the check loop: element 0 of each vector */
+
+#include "sse4_1-round.h"
--- gcc/testsuite/gcc.target/i386/sse4_1-roundss-4.c.sse41-test	2007-05-22 07:45:41.000000000 -0700
+++ gcc/testsuite/gcc.target/i386/sse4_1-roundss-4.c	2007-05-22 08:55:00.000000000 -0700
@@ -0,0 +1,106 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <math.h>
+#include <string.h>
+
+#define NUM 64
+
+static void
+init_round (float *src)	/* Fill src[0..NUM-1] with inputs of alternating sign.  */
+{
+  int i, sign = 1;
+  float f = rand ();	/* running scale factor, updated inside the loop */
+  /* Element i is (i + 1) * f * M_PI, negated on every other index.  */
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1)* f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+          if ((i % 6) == 0)
+	    f = f * src[i];	/* first half: scale f by the element just stored */
+        }
+      else if (i == (NUM / 2))
+	f = rand ();	/* midpoint: draw a fresh scale factor */
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI *sign);	/* second half: reciprocal of a product with the element just stored */
+      sign = -sign;
+    }
+}
+
+static float
+do_round (float f, int type)	/* x87 reference: round F with frndint, mode chosen by TYPE.  */
+{
+  short saved_cw, new_cw, clr_mask;	/* x87 control-word images and the keep-mask */
+  float ret;
+  /* Bit 2 of TYPE set: new_cw ends up equal to the saved control word.  */
+  if ((type & 4))
+    {
+      type = 0;
+      clr_mask = 0xFFFF;
+    }
+  else
+    {
+      type = 0x003F | ((type & 3) << 10);	/* set bits 0-5 (x87 exception masks), TYPE & 3 into bits 10-11 (rounding control) */
+      clr_mask = ~0x0C3F;	/* clear bits 0-5 and 10-11 before OR-ing TYPE in */
+    }
+  /* Push F; the value stays on the x87 stack across the asm statements below.  */
+  __asm__ ("flds %0" : : "m" (*&f));
+  /* Save the control word, then install the modified copy.  */
+  __asm__ ("fstcw %0" : "=m" (*&saved_cw));
+  new_cw = saved_cw & clr_mask;
+  new_cw |= type;
+  __asm__ ("fldcw %0" : : "m" (*&new_cw));
+  /* Round the stack top, pop it into RET, then restore the saved control word.  */
+  __asm__ ("frndint\n"
+	   "fstps %0\n" : "=m" (*&ret));
+  __asm__ ("fldcw %0" : : "m" (*&saved_cw));
+  return ret;
+}
+
+/* Check _mm_round_ss in _MM_FROUND_RINT and _MM_FROUND_NEARBYINT mode:
+   element 0 must match the x87 reference and the three upper elements
+   of every destination vector must stay zero.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      float elem[NUM];
+      __m128 vec[NUM / 4];
+    } dst, src;
+  int base, lane, v;
+
+  init_round (src.elem);
+  memset (&dst, 0, sizeof (dst));
+
+  /* First pass: _MM_FROUND_RINT against reference mode 0x04.  */
+  for (v = 0; v < NUM / 4; v++)
+    dst.vec[v] = _mm_round_ss (dst.vec[v], src.vec[v], _MM_FROUND_RINT);
+  for (base = 0; base < NUM; base += 4)
+    {
+      for (lane = 1; lane < 4; lane++)
+        if (dst.elem[base + lane] != 0.0)
+          abort ();
+      if (do_round (src.elem[base], 0x04) != dst.elem[base])
+        abort ();
+    }
+
+  /* Second pass: _MM_FROUND_NEARBYINT against reference mode 0x0c,
+     reusing the destination left by the first pass.  */
+  for (v = 0; v < NUM / 4; v++)
+    dst.vec[v] = _mm_round_ss (dst.vec[v], src.vec[v],
+                               _MM_FROUND_NEARBYINT);
+  for (base = 0; base < NUM; base += 4)
+    {
+      for (lane = 1; lane < 4; lane++)
+        if (dst.elem[base + lane] != 0.0)
+          abort ();
+      if (do_round (src.elem[base], 0x0c) != dst.elem[base])
+        abort ();
+    }
+}



More information about the Gcc-patches mailing list