This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
PATCH: Optimize V8HImode/V16QImode initialization
- From: "H.J. Lu" <hjl dot tools at gmail dot com>
- To: "Gcc Patch List" <gcc-patches at gcc dot gnu dot org>, "Uros Bizjak" <ubizjak at gmail dot com>
- Date: Thu, 15 May 2008 16:09:56 -0700
- Subject: PATCH: Optimize V8HImode/V16QImode initialization
Hi,
This patch optimizes V8HImode/V16QImode initialization. Before
the change, I got
[hjl@gnu-6 sse-1]$ cat v8hi-1.c
#include <emmintrin.h>
__m128i
foo1 (short x1, short x2, short x3, short x4,
short x5, short x6, short x7, short x8)
{
return _mm_set_epi16 (x1, x2, x3, x4, x5, x6, x7, x8);
}
[hjl@gnu-6 sse-1]$ /usr/gcc-4.4/bin/gcc -S -O2 v8hi-1.c
[hjl@gnu-6 sse-1]$ cat v8hi-1.s
.file "v8hi-1.c"
.text
.p2align 4,,15
.globl foo1
.type foo1, @function
foo1:
.LFB518:
movzwl 8(%rsp), %eax
movzwl %r8w, %r8d
movzwl %di, %edi
movzwl %r9w, %r9d
salq $16, %r8
movzwl %si, %esi
salq $16, %rdi
orq %r9, %r8
movzwl %dx, %edx
orq %rsi, %rdi
salq $16, %r8
movzwl 16(%rsp), %r9d
salq $16, %rdi
orq %rax, %r8
movzwl %cx, %ecx
orq %rdx, %rdi
salq $16, %r8
salq $16, %rdi
movq %r8, %rax
movq %rdi, %rdx
orq %r9, %rax
orq %rcx, %rdx
movq %rax, -24(%rsp)
movq %rdx, -16(%rsp)
movdqa -24(%rsp), %xmm0
ret
After the change,
[hjl@gnu-6 sse-1]$ cat v8hi-1.s
.file "v8hi-1.c"
.text
.p2align 4,,15
.globl foo1
.type foo1, @function
foo1:
.LFB518:
pxor %xmm3, %xmm3
movq %r9, -8(%rsp)
movq -8(%rsp), %xmm1
movq %rcx, -8(%rsp)
pxor %xmm2, %xmm2
movss %xmm1, %xmm3
pxor %xmm1, %xmm1
movq -8(%rsp), %xmm4
movq %rsi, -8(%rsp)
movd 16(%rsp), %xmm0
movss %xmm4, %xmm2
movq -8(%rsp), %xmm4
pinsrw $1, 8(%rsp), %xmm0
movss %xmm4, %xmm1
pinsrw $1, %r8d, %xmm3
pinsrw $1, %edx, %xmm2
pinsrw $1, %edi, %xmm1
punpckldq %xmm3, %xmm0
punpckldq %xmm1, %xmm2
punpcklqdq %xmm2, %xmm0
ret
There is a similar improvement for V16QI. OK for trunk?
Thanks.
H.J.
---
gcc/
2008-05-15 H.J. Lu <hongjiu.lu@intel.com>
* config/i386/i386.c (ix86_expand_vector_init_general): Optimize
V8HImode for SSE2 and V16QImode for SSE4.1.
gcc/testsuite/
2008-05-15 H.J. Lu <hongjiu.lu@intel.com>
* gcc.target/i386/m128-check.h: New.
* gcc.target/i386/set-v16qi-1.h: Likewise.
* gcc.target/i386/set-v16qi-2.h: Likewise.
* gcc.target/i386/set-v8hi-1.h: Likewise.
* gcc.target/i386/set-v8hi-2.h: Likewise.
* gcc.target/i386/sse2-set-v16qi-1.c: Likewise.
* gcc.target/i386/sse2-set-v16qi-2.c: Likewise.
* gcc.target/i386/sse2-set-v8hi-1.c: Likewise.
* gcc.target/i386/sse2-set-v8hi-2.c: Likewise.
* gcc.target/i386/sse4_1-set-v16qi-1.c: Likewise.
* gcc.target/i386/sse4_1-set-v16qi-2.c: Likewise.
* gcc.target/i386/sse2-check.h: Include m128-check.h. Don't
include <stdio.h>.
* gcc.target/i386/sse4_1-check.h: Likewise.
gcc/
2008-05-15 H.J. Lu <hongjiu.lu@intel.com>
* config/i386/i386.c (ix86_expand_vector_init_general): Optimize
V8HImode for SSE2 and V16QImode for SSE4.1.
gcc/testsuite/
2008-05-15 H.J. Lu <hongjiu.lu@intel.com>
* gcc.target/i386/m128-check.h: New.
* gcc.target/i386/set-v16qi-1.h: Likewise.
* gcc.target/i386/set-v16qi-2.h: Likewise.
* gcc.target/i386/set-v8hi-1.h: Likewise.
* gcc.target/i386/set-v8hi-2.h: Likewise.
* gcc.target/i386/sse2-set-v16qi-1.c: Likewise.
* gcc.target/i386/sse2-set-v16qi-2.c: Likewise.
* gcc.target/i386/sse2-set-v8hi-1.c: Likewise.
* gcc.target/i386/sse2-set-v8hi-2.c: Likewise.
* gcc.target/i386/sse4_1-set-v16qi-1.c: Likewise.
* gcc.target/i386/sse4_1-set-v16qi-2.c: Likewise.
* gcc.target/i386/sse2-check.h: Include m128-check.h. Don't
include <stdio.h>.
* gcc.target/i386/sse4_1-check.h: Likewise.
Index: testsuite/gcc.target/i386/sse4_1-set-v16qi-1.c
===================================================================
--- testsuite/gcc.target/i386/sse4_1-set-v16qi-1.c (revision 0)
+++ testsuite/gcc.target/i386/sse4_1-set-v16qi-1.c (revision 0)
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#define CHECK_H "sse4_1-check.h"
+#define TEST sse4_1_test
+
+#include "set-v16qi-1.h"
Index: testsuite/gcc.target/i386/sse4_1-set-v16qi-2.c
===================================================================
--- testsuite/gcc.target/i386/sse4_1-set-v16qi-2.c (revision 0)
+++ testsuite/gcc.target/i386/sse4_1-set-v16qi-2.c (revision 0)
@@ -0,0 +1,8 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#define CHECK_H "sse4_1-check.h"
+#define TEST sse4_1_test
+
+#include "set-v16qi-2.h"
Index: testsuite/gcc.target/i386/sse2-check.h
===================================================================
--- testsuite/gcc.target/i386/sse2-check.h (revision 2603)
+++ testsuite/gcc.target/i386/sse2-check.h (working copy)
@@ -1,7 +1,6 @@
-#include <stdio.h>
#include <stdlib.h>
-
#include "cpuid.h"
+#include "m128-check.h"
static void sse2_test (void);
Index: testsuite/gcc.target/i386/sse2-set-v16qi-1.c
===================================================================
--- testsuite/gcc.target/i386/sse2-set-v16qi-1.c (revision 0)
+++ testsuite/gcc.target/i386/sse2-set-v16qi-1.c (revision 0)
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v16qi-1.h"
Index: testsuite/gcc.target/i386/sse2-set-v16qi-2.c
===================================================================
--- testsuite/gcc.target/i386/sse2-set-v16qi-2.c (revision 0)
+++ testsuite/gcc.target/i386/sse2-set-v16qi-2.c (revision 0)
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v16qi-2.h"
Index: testsuite/gcc.target/i386/set-v8hi-1.h
===================================================================
--- testsuite/gcc.target/i386/set-v8hi-1.h (revision 0)
+++ testsuite/gcc.target/i386/set-v8hi-1.h (revision 0)
@@ -0,0 +1,19 @@
+#include CHECK_H
+
+static __m128i
+__attribute__((noinline))
+foo (short *v)
+{
+ return _mm_set_epi16 (v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);
+}
+
+static void
+TEST (void)
+{
+ short v[8] = { -3, 6000, 48, 104, -90, 34567, -1248, 34678 };
+ union128i_w u;
+
+ u.x = foo (v);
+ if (check_union128i_w (u, v))
+ abort ();
+}
Index: testsuite/gcc.target/i386/set-v8hi-2.h
===================================================================
--- testsuite/gcc.target/i386/set-v8hi-2.h (revision 0)
+++ testsuite/gcc.target/i386/set-v8hi-2.h (revision 0)
@@ -0,0 +1,21 @@
+#include CHECK_H
+
+__m128i
+__attribute__((noinline))
+foo (short x1, short x2, short x3, short x4,
+ short x5, short x6, short x7, short x8)
+{
+ return _mm_set_epi16 (x1, x2, x3, x4, x5, x6, x7, x8);
+}
+
+static void
+TEST (void)
+{
+ short v[8] = { -3, 2, 1, 9, 23, -173, -13, 69 };
+ union128i_w u;
+
+ u.x = foo (v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);
+
+ if (check_union128i_w (u, v))
+ abort ();
+}
Index: testsuite/gcc.target/i386/m128-check.h
===================================================================
--- testsuite/gcc.target/i386/m128-check.h (revision 0)
+++ testsuite/gcc.target/i386/m128-check.h (revision 0)
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <emmintrin.h>
+
+typedef union
+{
+ __m128i x;
+ char a[16];
+} union128i_b;
+
+typedef union
+{
+ __m128i x;
+ short a[8];
+} union128i_w;
+
+typedef union
+{
+ __m128i x;
+ int a[4];
+} union128i_d;
+
+typedef union
+{
+ __m128i x;
+ long long a[2];
+} union128i_q;
+
+typedef union
+{
+ __m128 x;
+ float a[4];
+} union128;
+
+typedef union
+{
+ __m128d x;
+ double a[2];
+} union128d;
+
+#ifdef DEBUG
+#define PRINTF printf
+#else
+#define PRINTF(...)
+#endif
+
+#define CHECK_EXP(UINON_TYPE, VALUE_TYPE, FMT) \
+static int \
+__attribute__((noinline, unused)) \
+check_##UINON_TYPE (UINON_TYPE u, const VALUE_TYPE *v) \
+{ \
+ int i; \
+ int err = 0; \
+ \
+ for (i = 0; i < sizeof (u.a) / sizeof (u.a[0]); i++) \
+ if (u.a[i] != v[i]) \
+ { \
+ err++; \
+ PRINTF ("%i: " FMT " != " FMT "\n", \
+ i, v[i], u.a[i]); \
+ } \
+ return err; \
+}
+
+CHECK_EXP (union128i_b, char, "%d")
+CHECK_EXP (union128i_w, short, "%d")
+CHECK_EXP (union128i_d, int, "0x%x")
+CHECK_EXP (union128i_q, long long, "0x%llx")
+CHECK_EXP (union128, float, "%f")
+CHECK_EXP (union128d, double, "%f")
Index: testsuite/gcc.target/i386/sse2-set-v8hi-1.c
===================================================================
--- testsuite/gcc.target/i386/sse2-set-v8hi-1.c (revision 0)
+++ testsuite/gcc.target/i386/sse2-set-v8hi-1.c (revision 0)
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v8hi-1.h"
Index: testsuite/gcc.target/i386/sse2-set-v8hi-2.c
===================================================================
--- testsuite/gcc.target/i386/sse2-set-v8hi-2.c (revision 0)
+++ testsuite/gcc.target/i386/sse2-set-v8hi-2.c (revision 0)
@@ -0,0 +1,7 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v8hi-2.h"
Index: testsuite/gcc.target/i386/set-v16qi-1.h
===================================================================
--- testsuite/gcc.target/i386/set-v16qi-1.h (revision 0)
+++ testsuite/gcc.target/i386/set-v16qi-1.h (revision 0)
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include CHECK_H
+
+static __m128i
+__attribute__((noinline))
+foo (char *v)
+{
+ return _mm_set_epi8 (v[15], v[14], v[13], v[12],
+ v[11], v[10], v[9], v[8],
+ v[7], v[6], v[5], v[4],
+ v[3], v[2], v[1], v[0]);
+}
+
+static void
+TEST (void)
+{
+ char v[16] =
+ {
+ -3, 60, 48, 104, -90, 37, -48, 78,
+ 4, 33, 81, 4, -89, 17, 8, 68
+ };
+ union128i_b u;
+
+ u.x = foo (v);
+ if (check_union128i_b (u, v))
+ abort ();
+}
Index: testsuite/gcc.target/i386/sse4_1-check.h
===================================================================
--- testsuite/gcc.target/i386/sse4_1-check.h (revision 2603)
+++ testsuite/gcc.target/i386/sse4_1-check.h (working copy)
@@ -1,7 +1,7 @@
-#include <stdio.h>
#include <stdlib.h>
#include "cpuid.h"
+#include "m128-check.h"
static void sse4_1_test (void);
Index: testsuite/gcc.target/i386/set-v16qi-2.h
===================================================================
--- testsuite/gcc.target/i386/set-v16qi-2.h (revision 0)
+++ testsuite/gcc.target/i386/set-v16qi-2.h (revision 0)
@@ -0,0 +1,30 @@
+#include CHECK_H
+
+static __m128i
+__attribute__((noinline))
+foo (char x1, char x2, char x3, char x4,
+ char x5, char x6, char x7, char x8,
+ char x9, char x10, char x11, char x12,
+ char x13, char x14, char x15, char x16)
+{
+ return _mm_set_epi8 (x1, x2, x3, x4, x5, x6, x7, x8,
+ x9, x10, x11, x12, x13, x14, x15, x16);
+}
+
+static void
+TEST (void)
+{
+ char v[16] =
+ {
+ -3, 60, 48, 104, -90, 37, -48, 78,
+ 4, 33, 81, 4, -89, 17, 8, 68
+ };
+ union128i_b u;
+
+ u.x = foo (v[15], v[14], v[13], v[12],
+ v[11], v[10], v[9], v[8],
+ v[7], v[6], v[5], v[4],
+ v[3], v[2], v[1], v[0]);
+ if (check_union128i_b (u, v))
+ abort ();
+}
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c (revision 2603)
+++ config/i386/i386.c (working copy)
@@ -23892,7 +23892,142 @@ ix86_expand_vector_init_general (bool mm
break;
case V8HImode:
+ if (TARGET_SSE2)
+ {
+ rtx ops[4];
+ unsigned int i, j;
+
+ for (i = 0; i < ARRAY_SIZE (ops); i++)
+ {
+ /* Extend the odd element from HImode to SImode using
+ a paradoxical SUBREG. */
+ op0 = gen_reg_rtx (SImode);
+ emit_move_insn (op0, gen_lowpart (SImode,
+ XVECEXP (vals, 0,
+ i + i)));
+
+ /* Insert the SImode value as low element of V4SImode
+ vector. */
+ op1 = gen_reg_rtx (V4SImode);
+ op0 = gen_rtx_VEC_MERGE (V4SImode,
+ gen_rtx_VEC_DUPLICATE (V4SImode,
+ op0),
+ CONST0_RTX (V4SImode),
+ const1_rtx);
+ emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
+
+ /* Cast the V4SImode vector back to a V8HImode vector. */
+ op0 = gen_reg_rtx (mode);
+ emit_move_insn (op0, gen_lowpart (mode, op1));
+
+ /* Load even HI elements into the second position. */
+ emit_insn (gen_vec_setv8hi (op0, XVECEXP (vals, 0,
+ i + i + 1),
+ const1_rtx));
+
+ /* Cast V8HImode vector to V4SImode vector. */
+ ops[i] = gen_reg_rtx (V4SImode);
+ emit_move_insn (ops[i], gen_lowpart (V4SImode, op0));
+ }
+
+ /* Interleave low V4SIs. */
+ for (i = j = 0; i < ARRAY_SIZE (ops); i += 2, j++)
+ {
+ op0 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_vec_interleave_lowv4si (op0, ops[i],
+ ops[i + 1]));
+
+ /* Cast V4SImode vectors to V2DImode vectors. */
+ op1 = gen_reg_rtx (V2DImode);
+ emit_move_insn (op1, gen_lowpart (V2DImode, op0));
+ ops[j] = op1;
+ }
+
+ /* Interleave low V2DIs. */
+ op0 = gen_reg_rtx (V2DImode);
+ emit_insn (gen_vec_interleave_lowv2di (op0, ops[0], ops[1]));
+
+ /* Cast the V2DImode vector back to a V8HImode vector. */
+ emit_insn (gen_rtx_SET (VOIDmode, target,
+ gen_lowpart (mode, op0)));
+ return;
+ }
+
case V16QImode:
+ if (TARGET_SSE4_1)
+ {
+ rtx ops[8];
+ unsigned int i, j;
+
+ for (i = 0; i < ARRAY_SIZE (ops); i++)
+ {
+ /* Extend the odd element from QImode to SImode using
+ a paradoxical SUBREG. */
+ op0 = gen_reg_rtx (SImode);
+ emit_move_insn (op0, gen_lowpart (SImode,
+ XVECEXP (vals, 0,
+ i + i)));
+
+ /* Insert the SImode value as low element of V4SImode
+ vector. */
+ op1 = gen_reg_rtx (V4SImode);
+ op0 = gen_rtx_VEC_MERGE (V4SImode,
+ gen_rtx_VEC_DUPLICATE (V4SImode,
+ op0),
+ CONST0_RTX (V4SImode),
+ const1_rtx);
+ emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
+
+ /* Cast the V4SImode vector back to a V16QImode vector. */
+ op0 = gen_reg_rtx (mode);
+ emit_move_insn (op0, gen_lowpart (mode, op1));
+
+ /* Load even QI elements into the second position. */
+ emit_insn (gen_vec_setv16qi (op0, XVECEXP (vals, 0,
+ i + i + 1),
+ const1_rtx));
+
+ /* Cast V16QImode vector to V8HImode vector. */
+ ops[i] = gen_reg_rtx (V8HImode);
+ emit_move_insn (ops[i], gen_lowpart (V8HImode, op0));
+ }
+
+ /* Interleave low V8HIs. */
+ for (i = j = 0; i < ARRAY_SIZE (ops); i += 2, j++)
+ {
+ op0 = gen_reg_rtx (V8HImode);
+ emit_insn (gen_vec_interleave_lowv8hi (op0, ops[i],
+ ops[i + 1]));
+
+ /* Cast V8HImode vector to V4SImode vector. */
+ op1 = gen_reg_rtx (V4SImode);
+ emit_move_insn (op1, gen_lowpart (V4SImode, op0));
+ ops[j] = op1;
+ }
+
+ /* Interleave low V4SIs. */
+ for (i = j = 0; i < ARRAY_SIZE (ops) / 2; i += 2, j++)
+ {
+ op0 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_vec_interleave_lowv4si (op0, ops[i],
+ ops[i + 1]));
+
+ /* Cast V4SImode vectors to V2DImode vectors. */
+ op1 = gen_reg_rtx (V2DImode);
+ emit_move_insn (op1, gen_lowpart (V2DImode, op0));
+ ops[j] = op1;
+ }
+
+ /* Interleave low V2DIs. */
+ op0 = gen_reg_rtx (V2DImode);
+ emit_insn (gen_vec_interleave_lowv2di (op0, ops[0], ops[1]));
+
+ /* Cast the V2DImode vector back to a V16QImode vector. */
+ emit_insn (gen_rtx_SET (VOIDmode, target,
+ gen_lowpart (mode, op0)));
+ return;
+ }
+
case V4HImode:
case V8QImode:
break;