This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

PATCH: Optimize V8HImode/V16QImode initialization


Hi,

This patch optimizes V8HImode/V16QImode initialization. Before
the change, I got


[hjl@gnu-6 sse-1]$ cat v8hi-1.c
#include <emmintrin.h>

__m128i
foo1 (short x1, short x2, short x3, short x4,
      short x5, short x6, short x7, short x8)
{
  return _mm_set_epi16 (x1, x2, x3, x4, x5, x6, x7, x8);
}
[hjl@gnu-6 sse-1]$ /usr/gcc-4.4/bin/gcc -S -O2 v8hi-1.c
[hjl@gnu-6 sse-1]$ cat v8hi-1.s
        .file   "v8hi-1.c"
        .text
        .p2align 4,,15
.globl foo1
        .type   foo1, @function
foo1:
.LFB518:
        movzwl  8(%rsp), %eax
        movzwl  %r8w, %r8d
        movzwl  %di, %edi
        movzwl  %r9w, %r9d
        salq    $16, %r8
        movzwl  %si, %esi
        salq    $16, %rdi
        orq     %r9, %r8
        movzwl  %dx, %edx
        orq     %rsi, %rdi
        salq    $16, %r8
        movzwl  16(%rsp), %r9d
        salq    $16, %rdi
        orq     %rax, %r8
        movzwl  %cx, %ecx
        orq     %rdx, %rdi
        salq    $16, %r8
        salq    $16, %rdi
        movq    %r8, %rax
        movq    %rdi, %rdx
        orq     %r9, %rax
        orq     %rcx, %rdx
        movq    %rax, -24(%rsp)
        movq    %rdx, -16(%rsp)
        movdqa  -24(%rsp), %xmm0
        ret

After the change,

[hjl@gnu-6 sse-1]$ cat v8hi-1.s
        .file   "v8hi-1.c"
        .text
        .p2align 4,,15
.globl foo1
        .type   foo1, @function
foo1:
.LFB518:
        pxor    %xmm3, %xmm3
        movq    %r9, -8(%rsp)
        movq    -8(%rsp), %xmm1
        movq    %rcx, -8(%rsp)
        pxor    %xmm2, %xmm2
        movss   %xmm1, %xmm3
        pxor    %xmm1, %xmm1
        movq    -8(%rsp), %xmm4
        movq    %rsi, -8(%rsp)
        movd    16(%rsp), %xmm0
        movss   %xmm4, %xmm2
        movq    -8(%rsp), %xmm4
        pinsrw  $1, 8(%rsp), %xmm0
        movss   %xmm4, %xmm1
        pinsrw  $1, %r8d, %xmm3
        pinsrw  $1, %edx, %xmm2
        pinsrw  $1, %edi, %xmm1
        punpckldq       %xmm3, %xmm0
        punpckldq       %xmm1, %xmm2
        punpcklqdq      %xmm2, %xmm0
        ret

There is a similar improvement for V16QI. OK for trunk?

Thanks.


H.J.
---
gcc/

2008-05-15  H.J. Lu  <hongjiu.lu@intel.com>

        * config/i386/i386.c (ix86_expand_vector_init_general): Optimize
        V8HImode for SSE2 and V16QImode for SSE4.1.

gcc/testsuite/

2008-05-15  H.J. Lu  <hongjiu.lu@intel.com>

        * gcc.target/i386/m128-check.h: New.
        * gcc.target/i386/set-v16qi-1.h: Likewise.
        * gcc.target/i386/set-v16qi-2.h: Likewise.
        * gcc.target/i386/set-v8hi-1.h: Likewise.
        * gcc.target/i386/set-v8hi-2.h: Likewise.
        * gcc.target/i386/sse2-set-v16qi-1.c: Likewise.
        * gcc.target/i386/sse2-set-v16qi-2.c: Likewise.
        * gcc.target/i386/sse2-set-v8hi-1.c: Likewise.
        * gcc.target/i386/sse2-set-v8hi-2.c: Likewise.
        * gcc.target/i386/sse4_1-set-v16qi-1.c: Likewise.
        * gcc.target/i386/sse4_1-set-v16qi-2.c: Likewise.

        * gcc.target/i386/sse2-check.h: Include m128-check.h. Don't
        include <stdio.h>.
        * gcc.target/i386/sse4_1-check.h: Likewise.
gcc/

2008-05-15  H.J. Lu  <hongjiu.lu@intel.com>

	* config/i386/i386.c (ix86_expand_vector_init_general): Optimize
	V8HImode for SSE2 and V16QImode for SSE4.1.

gcc/testsuite/

2008-05-15  H.J. Lu  <hongjiu.lu@intel.com>

	* gcc.target/i386/m128-check.h: New.
	* gcc.target/i386/set-v16qi-1.h: Likewise.
	* gcc.target/i386/set-v16qi-2.h: Likewise.
	* gcc.target/i386/set-v8hi-1.h: Likewise.
	* gcc.target/i386/set-v8hi-2.h: Likewise.
	* gcc.target/i386/sse2-set-v16qi-1.c: Likewise.
	* gcc.target/i386/sse2-set-v16qi-2.c: Likewise.
	* gcc.target/i386/sse2-set-v8hi-1.c: Likewise.
	* gcc.target/i386/sse2-set-v8hi-2.c: Likewise.
	* gcc.target/i386/sse4_1-set-v16qi-1.c: Likewise.
	* gcc.target/i386/sse4_1-set-v16qi-2.c: Likewise.

	* gcc.target/i386/sse2-check.h: Include m128-check.h. Don't
	include <stdio.h>.
	* gcc.target/i386/sse4_1-check.h: Likewise.

Index: testsuite/gcc.target/i386/sse4_1-set-v16qi-1.c
===================================================================
--- testsuite/gcc.target/i386/sse4_1-set-v16qi-1.c	(revision 0)
+++ testsuite/gcc.target/i386/sse4_1-set-v16qi-1.c	(revision 0)
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#define CHECK_H "sse4_1-check.h"
+#define TEST sse4_1_test
+
+#include "set-v16qi-1.h"
Index: testsuite/gcc.target/i386/sse4_1-set-v16qi-2.c
===================================================================
--- testsuite/gcc.target/i386/sse4_1-set-v16qi-2.c	(revision 0)
+++ testsuite/gcc.target/i386/sse4_1-set-v16qi-2.c	(revision 0)
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#define CHECK_H "sse4_1-check.h"
+#define TEST sse4_1_test
+
+#include "set-v16qi-2.h"
Index: testsuite/gcc.target/i386/sse2-check.h
===================================================================
--- testsuite/gcc.target/i386/sse2-check.h	(revision 2603)
+++ testsuite/gcc.target/i386/sse2-check.h	(working copy)
@@ -1,7 +1,6 @@
-#include <stdio.h>
 #include <stdlib.h>
-
 #include "cpuid.h"
+#include "m128-check.h"
 
 static void sse2_test (void);
 
Index: testsuite/gcc.target/i386/sse2-set-v16qi-1.c
===================================================================
--- testsuite/gcc.target/i386/sse2-set-v16qi-1.c	(revision 0)
+++ testsuite/gcc.target/i386/sse2-set-v16qi-1.c	(revision 0)
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v16qi-1.h"
Index: testsuite/gcc.target/i386/sse2-set-v16qi-2.c
===================================================================
--- testsuite/gcc.target/i386/sse2-set-v16qi-2.c	(revision 0)
+++ testsuite/gcc.target/i386/sse2-set-v16qi-2.c	(revision 0)
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v16qi-2.h"
Index: testsuite/gcc.target/i386/set-v8hi-1.h
===================================================================
--- testsuite/gcc.target/i386/set-v8hi-1.h	(revision 0)
+++ testsuite/gcc.target/i386/set-v8hi-1.h	(revision 0)
@@ -0,0 +1,19 @@
+#include CHECK_H
+
+static __m128i
+__attribute__((noinline))
+foo (short *v)
+{
+  return _mm_set_epi16 (v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);
+}
+
+static void
+TEST (void)
+{
+  short v[8] = { -3, 6000, 48, 104, -90, 34567, -1248, 34678 };
+  union128i_w u;
+
+  u.x = foo (v);
+  if (check_union128i_w (u, v))
+    abort ();
+}
Index: testsuite/gcc.target/i386/set-v8hi-2.h
===================================================================
--- testsuite/gcc.target/i386/set-v8hi-2.h	(revision 0)
+++ testsuite/gcc.target/i386/set-v8hi-2.h	(revision 0)
@@ -0,0 +1,21 @@
+#include CHECK_H
+
+__m128i
+__attribute__((noinline))
+foo (short x1, short x2, short x3, short x4,
+     short x5, short x6, short x7, short x8)
+{
+  return _mm_set_epi16 (x1, x2, x3, x4, x5, x6, x7, x8);
+}
+
+static void
+TEST (void)
+{
+  short v[8] = { -3, 2, 1, 9, 23, -173, -13, 69 };
+  union128i_w u;
+
+  u.x = foo (v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);
+
+  if (check_union128i_w (u, v))
+     abort ();
+}
Index: testsuite/gcc.target/i386/m128-check.h
===================================================================
--- testsuite/gcc.target/i386/m128-check.h	(revision 0)
+++ testsuite/gcc.target/i386/m128-check.h	(revision 0)
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <emmintrin.h>
+
+typedef union
+{
+  __m128i x;
+  char a[16];
+} union128i_b;
+
+typedef union
+{
+  __m128i x;
+  short a[8];
+} union128i_w;
+
+typedef union
+{
+  __m128i x;
+  int a[4];
+} union128i_d;
+
+typedef union
+{
+  __m128i x;
+  long long a[2];
+} union128i_q;
+
+typedef union
+{
+  __m128  x;
+  float a[4];
+} union128;
+
+typedef union
+{
+  __m128d x;
+  double a[2];
+} union128d;
+
+#ifdef DEBUG
+#define PRINTF printf
+#else
+#define PRINTF(...)	
+#endif
+
+#define CHECK_EXP(UINON_TYPE, VALUE_TYPE, FMT)		\
+static int						\
+__attribute__((noinline, unused))			\
+check_##UINON_TYPE (UINON_TYPE u, const VALUE_TYPE *v)	\
+{							\
+  int i;						\
+  int err = 0;						\
+							\
+  for (i = 0; i < sizeof (u.a) / sizeof (u.a[0]); i++)	\
+    if (u.a[i] != v[i])					\
+      {							\
+	err++;						\
+	PRINTF ("%i: " FMT " != " FMT "\n",		\
+		i, v[i], u.a[i]);			\
+      }							\
+  return err;						\
+}
+
+CHECK_EXP (union128i_b, char, "%d")
+CHECK_EXP (union128i_w, short, "%d")
+CHECK_EXP (union128i_d, int, "0x%x")
+CHECK_EXP (union128i_q, long long, "0x%llx")
+CHECK_EXP (union128, float, "%f")
+CHECK_EXP (union128d, double, "%f")
Index: testsuite/gcc.target/i386/sse2-set-v8hi-1.c
===================================================================
--- testsuite/gcc.target/i386/sse2-set-v8hi-1.c	(revision 0)
+++ testsuite/gcc.target/i386/sse2-set-v8hi-1.c	(revision 0)
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v8hi-1.h"
Index: testsuite/gcc.target/i386/sse2-set-v8hi-2.c
===================================================================
--- testsuite/gcc.target/i386/sse2-set-v8hi-2.c	(revision 0)
+++ testsuite/gcc.target/i386/sse2-set-v8hi-2.c	(revision 0)
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v8hi-2.h"
Index: testsuite/gcc.target/i386/set-v16qi-1.h
===================================================================
--- testsuite/gcc.target/i386/set-v16qi-1.h	(revision 0)
+++ testsuite/gcc.target/i386/set-v16qi-1.h	(revision 0)
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include CHECK_H
+
+static __m128i
+__attribute__((noinline))
+foo (char *v)
+{
+  return _mm_set_epi8 (v[15], v[14], v[13], v[12],
+		       v[11], v[10], v[9], v[8],
+		       v[7], v[6], v[5], v[4],
+		       v[3], v[2], v[1], v[0]);
+}
+
+static void
+TEST (void)
+{
+  char v[16] =
+    { 
+      -3, 60, 48, 104, -90, 37, -48, 78,
+      4, 33, 81, 4, -89, 17, 8, 68
+    };
+  union128i_b u;
+
+  u.x = foo (v);
+  if (check_union128i_b (u, v))
+    abort ();
+}
Index: testsuite/gcc.target/i386/sse4_1-check.h
===================================================================
--- testsuite/gcc.target/i386/sse4_1-check.h	(revision 2603)
+++ testsuite/gcc.target/i386/sse4_1-check.h	(working copy)
@@ -1,7 +1,7 @@
-#include <stdio.h>
 #include <stdlib.h>
 
 #include "cpuid.h"
+#include "m128-check.h"
 
 static void sse4_1_test (void);
 
Index: testsuite/gcc.target/i386/set-v16qi-2.h
===================================================================
--- testsuite/gcc.target/i386/set-v16qi-2.h	(revision 0)
+++ testsuite/gcc.target/i386/set-v16qi-2.h	(revision 0)
@@ -0,0 +1,30 @@
+#include CHECK_H
+
+static __m128i
+__attribute__((noinline))
+foo (char x1, char x2, char x3, char x4,
+     char x5, char x6, char x7, char x8,
+     char x9, char x10, char x11, char x12,
+     char x13, char x14, char x15, char x16)
+{
+  return _mm_set_epi8 (x1, x2, x3, x4, x5, x6, x7, x8,
+		       x9, x10, x11, x12, x13, x14, x15, x16);
+}
+
+static void
+TEST (void)
+{
+  char v[16] =
+    { 
+      -3, 60, 48, 104, -90, 37, -48, 78,
+      4, 33, 81, 4, -89, 17, 8, 68
+    };
+  union128i_b u;
+
+  u.x = foo (v[15], v[14], v[13], v[12],
+	     v[11], v[10], v[9], v[8],
+	     v[7], v[6], v[5], v[4],
+	     v[3], v[2], v[1], v[0]);
+  if (check_union128i_b (u, v))
+    abort ();
+}
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 2603)
+++ config/i386/i386.c	(working copy)
@@ -23892,7 +23892,142 @@ ix86_expand_vector_init_general (bool mm
       break;
 
     case V8HImode:
+      if (TARGET_SSE2)
+	{
+	  rtx ops[4];
+	  unsigned int i, j;
+
+	  for (i = 0; i < ARRAY_SIZE (ops); i++)
+	    {
+	      /* Extend the odd element from HImode to SImode using
+		 a paradoxical SUBREG.  */
+	      op0 = gen_reg_rtx (SImode);
+	      emit_move_insn (op0, gen_lowpart (SImode,
+						XVECEXP (vals, 0,
+							 i + i)));
+
+	      /* Insert the SImode value as low element of V4SImode
+		 vector. */
+	      op1 = gen_reg_rtx (V4SImode);
+	      op0 = gen_rtx_VEC_MERGE (V4SImode,
+				       gen_rtx_VEC_DUPLICATE (V4SImode,
+							      op0),
+				       CONST0_RTX (V4SImode),
+				       const1_rtx);
+	      emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
+
+	      /* Cast the V4SImode vector back to a V8HImode vector.  */
+	      op0 = gen_reg_rtx (mode);
+	      emit_move_insn (op0, gen_lowpart (mode, op1));
+
+	      /* Load even HI elements into the second position.  */
+	      emit_insn (gen_vec_setv8hi (op0, XVECEXP (vals, 0,
+							i + i + 1),
+					  const1_rtx));
+
+	      /* Cast V8HImode vector to V4SImode vector.  */
+	      ops[i] = gen_reg_rtx (V4SImode);
+	      emit_move_insn (ops[i], gen_lowpart (V4SImode, op0));
+	    }
+
+	  /* Interleave low V4SIs.  */
+	  for (i = j = 0; i < ARRAY_SIZE (ops); i += 2, j++)
+	    {
+	      op0 = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_vec_interleave_lowv4si (op0, ops[i],
+						     ops[i + 1]));
+
+	      /* Cast V4SImode vectors to V2DImode vectors.  */
+	      op1 = gen_reg_rtx (V2DImode);
+	      emit_move_insn (op1, gen_lowpart (V2DImode, op0));
+	      ops[j] = op1;
+	    }
+
+	  /* Interleave low V2DIs.  */
+	  op0 = gen_reg_rtx (V2DImode);
+	  emit_insn (gen_vec_interleave_lowv2di (op0, ops[0], ops[1]));
+
+	  /* Cast the V2DImode vector back to a V8HImode vector.  */
+	  emit_insn (gen_rtx_SET (VOIDmode, target,
+				  gen_lowpart (mode, op0)));
+	  return;
+	}
+
     case V16QImode:
+      if (TARGET_SSE4_1)
+	{
+	  rtx ops[8];
+	  unsigned int i, j;
+
+	  for (i = 0; i < ARRAY_SIZE (ops); i++)
+	    {
+	      /* Extend the odd element from QImode to SImode using
+		 a paradoxical SUBREG.  */
+	      op0 = gen_reg_rtx (SImode);
+	      emit_move_insn (op0, gen_lowpart (SImode,
+						XVECEXP (vals, 0,
+							 i + i)));
+
+	      /* Insert the SImode value as low element of V4SImode
+		 vector. */
+	      op1 = gen_reg_rtx (V4SImode);
+	      op0 = gen_rtx_VEC_MERGE (V4SImode,
+				       gen_rtx_VEC_DUPLICATE (V4SImode,
+							      op0),
+				       CONST0_RTX (V4SImode),
+				       const1_rtx);
+	      emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
+
+	      /* Cast the V4SImode vector back to a V16QImode vector.  */
+	      op0 = gen_reg_rtx (mode);
+	      emit_move_insn (op0, gen_lowpart (mode, op1));
+
+	      /* Load even QI elements into the second position.  */
+	      emit_insn (gen_vec_setv16qi (op0, XVECEXP (vals, 0,
+							 i + i + 1),
+					   const1_rtx));
+
+	      /* Cast V16QImode vector to V8HImode vector.  */
+	      ops[i] = gen_reg_rtx (V8HImode);
+	      emit_move_insn (ops[i], gen_lowpart (V8HImode, op0));
+	    }
+
+	  /* Interleave low V8HIs.  */
+	  for (i = j = 0; i < ARRAY_SIZE (ops); i += 2, j++)
+	    {
+	      op0 = gen_reg_rtx (V8HImode);
+	      emit_insn (gen_vec_interleave_lowv8hi (op0, ops[i],
+						     ops[i + 1]));
+
+	      /* Cast V8HImode vector to V4SImode vector.  */
+	      op1 = gen_reg_rtx (V4SImode);
+	      emit_move_insn (op1, gen_lowpart (V4SImode, op0));
+	      ops[j] = op1;
+	    }
+
+	  /* Interleave low V4SIs.  */
+	  for (i = j = 0; i < ARRAY_SIZE (ops) / 2; i += 2, j++)
+	    {
+	      op0 = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_vec_interleave_lowv4si (op0, ops[i],
+						     ops[i + 1]));
+
+	      /* Cast V4SImode vectors to V2DImode vectors.  */
+	      op1 = gen_reg_rtx (V2DImode);
+	      emit_move_insn (op1, gen_lowpart (V2DImode, op0));
+	      ops[j] = op1;
+	    }
+
+	  /* Interleave low V2DIs.  */
+	  op0 = gen_reg_rtx (V2DImode);
+	  emit_insn (gen_vec_interleave_lowv2di (op0, ops[0], ops[1]));
+
+	  /* Cast the V2DImode vector back to a V16QImode vector.  */
+	  emit_insn (gen_rtx_SET (VOIDmode, target,
+				  gen_lowpart (mode, op0)));
+	  return;
+	}
+
     case V4HImode:
     case V8QImode:
       break;

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]