This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] Optimize vector init constructor
- From: "H.J. Lu" <hjl dot tools at gmail dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: Richard Guenther <richard dot guenther at gmail dot com>
- Date: Sun, 3 Mar 2019 06:32:30 -0800
- Subject: [PATCH] Optimize vector init constructor
For a vector init constructor such as:
---
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
__v4sf
foo (__v4sf x, float f)
{
__v4sf y = { f, x[1], x[2], x[3] };
return y;
}
---
we can optimize the vector init constructor into a vector copy or permute
followed by a single scalar insert:
__v4sf D.1912;
__v4sf D.1913;
__v4sf D.1914;
__v4sf y;
x.0_1 = x;
D.1912 = x.0_1;
_2 = D.1912;
D.1913 = _2;
BIT_FIELD_REF <D.1913, 32, 0> = f;
y = D.1913;
D.1914 = y;
return D.1914;
instead of
__v4sf D.1962;
__v4sf y;
_1 = BIT_FIELD_REF <x, 32, 32>;
_2 = BIT_FIELD_REF <x, 32, 64>;
_3 = BIT_FIELD_REF <x, 32, 96>;
y = {f, _1, _2, _3};
D.1962 = y;
return D.1962;
gcc/
PR tree-optimization/88828
* gimplify.c (gimplify_init_constructor): Optimize vector init
constructor with vector copy or permute followed by a single
scalar insert.
gcc/testsuite/
PR tree-optimization/88828
* gcc.target/i386/pr88828-1.c: New test.
* gcc.target/i386/pr88828-2.c: Likewise.
* gcc.target/i386/pr88828-3a.c: Likewise.
* gcc.target/i386/pr88828-3b.c: Likewise.
* gcc.target/i386/pr88828-4a.c: Likewise.
* gcc.target/i386/pr88828-4b.c: Likewise.
* gcc.target/i386/pr88828-5a.c: Likewise.
* gcc.target/i386/pr88828-5b.c: Likewise.
* gcc.target/i386/pr88828-6a.c: Likewise.
* gcc.target/i386/pr88828-6b.c: Likewise.
---
gcc/gimplify.c | 176 +++++++++++++++++++--
gcc/testsuite/gcc.target/i386/pr88828-1.c | 16 ++
gcc/testsuite/gcc.target/i386/pr88828-2.c | 17 ++
gcc/testsuite/gcc.target/i386/pr88828-3a.c | 16 ++
gcc/testsuite/gcc.target/i386/pr88828-3b.c | 18 +++
gcc/testsuite/gcc.target/i386/pr88828-4a.c | 17 ++
gcc/testsuite/gcc.target/i386/pr88828-4b.c | 20 +++
gcc/testsuite/gcc.target/i386/pr88828-5a.c | 16 ++
gcc/testsuite/gcc.target/i386/pr88828-5b.c | 18 +++
gcc/testsuite/gcc.target/i386/pr88828-6a.c | 17 ++
gcc/testsuite/gcc.target/i386/pr88828-6b.c | 19 +++
11 files changed, 336 insertions(+), 14 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-3b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-4b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-5b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr88828-6b.c
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 983635ba21f..893a4311f9e 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -5082,22 +5082,170 @@ gimplify_init_constructor (tree *expr_p, gimple_seq *pre_p, gimple_seq *post_p,
TREE_CONSTANT (ctor) = 0;
}
- /* Vector types use CONSTRUCTOR all the way through gimple
- compilation as a general initializer. */
- FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
+ tree rhs_vector = NULL;
+ /* The vector element to replace scalar elements, which
+ will be overridden by scalar insert. */
+ tree vector_element = NULL;
+ /* The single scalar element. */
+ tree scalar_element = NULL;
+ unsigned int scalar_idx = 0;
+ enum { unknown, copy, permute, init } operation = unknown;
+ bool insert = false;
+
+ /* Check if we can generate vector copy or permute followed by
+ a single scalar insert. */
+ if (TYPE_VECTOR_SUBPARTS (type).is_constant ())
{
- enum gimplify_status tret;
- tret = gimplify_expr (&ce->value, pre_p, post_p, is_gimple_val,
- fb_rvalue);
- if (tret == GS_ERROR)
- ret = GS_ERROR;
- else if (TREE_STATIC (ctor)
- && !initializer_constant_valid_p (ce->value,
- TREE_TYPE (ce->value)))
- TREE_STATIC (ctor) = 0;
+ /* If all RHS vector elements come from the same vector,
+ we can use permute. If all RHS vector elements come
+ from the same vector in the same order, we can use
+ copy. */
+ unsigned int nunits
+ = TYPE_VECTOR_SUBPARTS (type).to_constant ();
+ unsigned int nscalars = 0;
+ unsigned int nvectors = 0;
+ operation = unknown;
+ FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
+ if (TREE_CODE (ce->value) == ARRAY_REF
+ || TREE_CODE (ce->value) == ARRAY_RANGE_REF)
+ {
+ if (!vector_element)
+ vector_element = ce->value;
+ /* Get the vector index. */
+ tree idx = TREE_OPERAND (ce->value, 1);
+ if (TREE_CODE (idx) == INTEGER_CST)
+ {
+ /* Get the RHS vector. */
+ tree r = ce->value;
+ while (handled_component_p (r))
+ r = TREE_OPERAND (r, 0);
+ if (type == TREE_TYPE (r))
+ {
+ /* The RHS vector has the same type as
+ LHS. */
+ if (rhs_vector == NULL)
+ rhs_vector = r;
+
+ /* Check if all RHS vector elements come
+ from the same vector. */
+ if (rhs_vector == r)
+ {
+ nvectors++;
+ if (TREE_INT_CST_LOW (idx) == ix
+ && (operation == unknown
+ || operation == copy))
+ operation = copy;
+ else
+ operation = permute;
+ continue;
+ }
+ }
+ }
+
+ /* Otherwise, use vector init. */
+ break;
+ }
+ else if (TREE_CODE (TYPE_SIZE (TREE_TYPE (ce->value)))
+ == INTEGER_CST)
+ {
+ /* Only allow one single scalar insert. */
+ if (nscalars != 0)
+ break;
+ nscalars = 1;
+ insert = true;
+ scalar_idx = ix;
+ scalar_element = ce->value;
+ }
+
+ /* Allow a single scalar insert with vector copy or
+ vector permute. Vector copy without insert is OK. */
+ if (nunits != (nscalars + nvectors)
+ || (nscalars == 0 && operation != copy))
+ operation = unknown;
+ }
+
+ if (operation == unknown)
+ {
+ /* Default to the regular vector init constructor. */
+ operation = init;
+ insert = false;
+ }
+
+ if (operation == copy)
+ {
+ /* Generate a vector copy. */
+ tree var = create_tmp_var (type);
+ if (gimplify_expr (&rhs_vector, pre_p, post_p,
+ is_gimple_val, fb_rvalue) == GS_ERROR)
+ {
+ ret = GS_ERROR;
+ break;
+ }
+ gassign *init = gimple_build_assign (var, rhs_vector);
+ gimple_seq_add_stmt (pre_p, init);
+ if (gimplify_expr (&var, pre_p, post_p, is_gimple_val,
+ fb_rvalue) == GS_ERROR)
+ {
+ ret = GS_ERROR;
+ break;
+ }
+ /* Replace RHS with the vector copy. */
+ if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
+ TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (var, pre_p);
+ else
+ TREE_OPERAND (*expr_p, 1) = var;
+ }
+ else
+ {
+ /* Prepare for vector permute by replacing the scalar
+ element with the vector one. */
+ if (operation == permute)
+ (elts->address())[scalar_idx].value = vector_element;
+
+ /* Vector types use CONSTRUCTOR all the way through gimple
+ compilation as a general initializer. */
+ FOR_EACH_VEC_SAFE_ELT (elts, ix, ce)
+ {
+ enum gimplify_status tret;
+ tret = gimplify_expr (&ce->value, pre_p, post_p,
+ is_gimple_val,
+ fb_rvalue);
+ if (tret == GS_ERROR)
+ ret = GS_ERROR;
+ else if (TREE_STATIC (ctor)
+ && !initializer_constant_valid_p (ce->value,
+ TREE_TYPE (ce->value)))
+ TREE_STATIC (ctor) = 0;
+ }
+ if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
+ TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
+ }
+
+ if (insert)
+ {
+ /* Generate a single scalar insert after vector copy or
+ permute. */
+ tree rhs = TREE_OPERAND (*expr_p, 1);
+ tree var = create_tmp_var (type);
+ gassign *init = gimple_build_assign (var, rhs);
+ gimple_seq_add_stmt (pre_p, init);
+ if (gimplify_expr (&scalar_element, pre_p, post_p,
+ is_gimple_val, fb_rvalue) == GS_ERROR)
+ {
+ ret = GS_ERROR;
+ break;
+ }
+ tree scalar_type = TREE_TYPE (scalar_element);
+ tree scalar_size = TYPE_SIZE (scalar_type);
+ tree bitpos = bitsize_int (scalar_idx
+ * TREE_INT_CST_LOW (scalar_size));
+ tree ref = build3_loc (EXPR_LOCATION (rhs), BIT_FIELD_REF,
+ scalar_type, var, scalar_size,
+ bitpos);
+ init = gimple_build_assign (ref, scalar_element);
+ gimplify_seq_add_stmt (pre_p, init);
+ TREE_OPERAND (*expr_p, 1) = var;
}
- if (!is_gimple_reg (TREE_OPERAND (*expr_p, 0)))
- TREE_OPERAND (*expr_p, 1) = get_formal_tmp_var (ctor, pre_p);
}
break;
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c
new file mode 100644
index 00000000000..4ef1feab389
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+ __v4sf y = { f, x[1], x[2], x[3] };
+ return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-2.c b/gcc/testsuite/gcc.target/i386/pr88828-2.c
new file mode 100644
index 00000000000..6dc482b6f4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+ __v4sf y = x;
+ y[0] = f;
+ return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3a.c b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
new file mode 100644
index 00000000000..97eb8e7162a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+ __v4sf y = { f, x[0], x[2], x[3] };
+ return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-3b.c b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
new file mode 100644
index 00000000000..ab2ba730716
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-3b.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+ __v4sf y = { f, x[0], x[2], x[3] };
+ return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
new file mode 100644
index 00000000000..a54689be701
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+ __v4sf y = { x[0], x[2], x[3], x[1] };
+ y[0] = f;
+ return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
new file mode 100644
index 00000000000..0c3a1024d93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+ __v4sf y = { x[0], x[2], x[3], x[1] };
+ y[0] = f;
+ return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
new file mode 100644
index 00000000000..534808d3cd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+ __v4sf y = { x[0], x[2], x[3], f };
+ return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
new file mode 100644
index 00000000000..aebea790979
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+ __v4sf y = { x[0], x[2], x[3], f };
+ return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6a.c b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
new file mode 100644
index 00000000000..d43a36d9137
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6a.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mno-sse4" } */
+/* { dg-final { scan-assembler "movss" } } */
+/* { dg-final { scan-assembler-times "shufps" 2 } } */
+/* { dg-final { scan-assembler-times "movaps" 1 } } */
+/* { dg-final { scan-assembler-not "movlhps" } } */
+/* { dg-final { scan-assembler-not "unpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+ __v4sf y = { x[0], x[2], x[3], x[0] };
+ y[3] = f;
+ return y;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-6b.c b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
new file mode 100644
index 00000000000..6856fe6500e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88828-6b.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-times "vpermilps" 1 } } */
+/* { dg-final { scan-assembler-times "vinsertps" 1 } } */
+/* { dg-final { scan-assembler-not "vshufps" } } */
+/* { dg-final { scan-assembler-not "vmovss" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovlhps" } } */
+/* { dg-final { scan-assembler-not "vunpcklps" } } */
+
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+__v4sf
+foo (__v4sf x, float f)
+{
+ __v4sf y = { x[0], x[2], x[3], x[0] };
+ y[3] = f;
+ return y;
+}
--
2.20.1