[PATCH V2] PR88497 - Extend reassoc for vector bit_field_ref
Kewen.Lin
linkw@linux.ibm.com
Tue Mar 12 13:30:00 GMT 2019
Hi,
Following the comments from Richard and Segher (thanks!), I've made
the following changes, adding checks and extensions:
*) Check whether the vector type is available on the target machine.
*) Check whether the vector operation is available on the target machine.
*) Extend the supported vector operations to MULT/BIT_AND/BIT_IOR/BIT_XOR.
*) Add more test cases for coverage.
Re-bootstrapped and re-regtested successfully on powerpc64le-linux-gnu.
Is it OK for GCC 10?  A small sketch of the transformation follows.
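To illustrate, here is a minimal sketch of the shape this patch targets
(a reduced, hypothetical example with made-up function names; the new
pr88497-*.c tests below give the real coverage):

  typedef double v2df __attribute__ ((vector_size (16)));

  /* Before: four BIT_FIELD_REFs and three scalar additions.  */
  double
  sum_before (v2df a, v2df b)
  {
    return a[0] + a[1] + b[0] + b[1];
  }

  /* After reassoc (conceptually): one vector addition, then only two
     BIT_FIELD_REFs and one scalar addition on the combined vector.  */
  double
  sum_after (v2df a, v2df b)
  {
    v2df t = a + b;
    return t[0] + t[1];
  }

The slots vacated in the operand chain are filled with the identity of
the operation: 0 for PLUS/BIT_IOR/BIT_XOR, 1 for MULT, and all-ones for
BIT_AND.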
gcc/ChangeLog
2019-03-12 Kewen Lin <linkw@gcc.gnu.org>
PR target/88497
* tree-ssa-reassoc.c (reassociate_bb): Swap the positions of
GIMPLE_BINARY_RHS check and gimple_visited_p check, call new
function undistribute_bitref_for_vector.
(undistribute_bitref_for_vector): New function.
(cleanup_vinfo_map): Likewise.
(unsigned_cmp): Likewise.
gcc/testsuite/ChangeLog
2019-03-12 Kewen Lin <linkw@gcc.gnu.org>
* gcc.dg/tree-ssa/pr88497-1.c: New test.
* gcc.dg/tree-ssa/pr88497-2.c: Likewise.
* gcc.dg/tree-ssa/pr88497-3.c: Likewise.
* gcc.dg/tree-ssa/pr88497-4.c: Likewise.
* gcc.dg/tree-ssa/pr88497-5.c: Likewise.
---
gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c | 42 ++++
gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c | 31 +++
gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c | 31 +++
gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c | 31 +++
gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c | 31 +++
gcc/tree-ssa-reassoc.c | 309 +++++++++++++++++++++++++++++-
6 files changed, 470 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
new file mode 100644
index 0000000..c87caff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
+
+/* Test that reassoc can undistribute a vector bit_field_ref summation.
+
+   arg1 and arg2 are two arrays whose elements are of type vector double.
+ Assuming:
+ A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3],
+ B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3],
+
+ Then:
+ V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3,
+
+ reassoc transforms
+
+ accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1]
+ + V3[0] + V3[1];
+
+ into:
+
+ T = V0 + V1 + V2 + V3
+ accumulator += T[0] + T[1];
+
+   Fewer bit_field_refs are needed: only two for a 128-bit or wider vector.  */
+
+typedef double v2df __attribute__ ((vector_size (16)));
+double
+test (double accumulator, v2df arg1[], v2df arg2[])
+{
+ v2df temp;
+ temp = arg1[0] * arg2[0];
+ accumulator += temp[0] + temp[1];
+ temp = arg1[1] * arg2[1];
+ accumulator += temp[0] + temp[1];
+ temp = arg1[2] * arg2[2];
+ accumulator += temp[0] + temp[1];
+ temp = arg1[3] * arg2[3];
+ accumulator += temp[0] + temp[1];
+ return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
new file mode 100644
index 0000000..d1851ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
+
+/* Test that reassoc can undistribute vector bit_field_refs on multiplication.
+
+   v1, v2, v3 and v4 are of type vector float.
+
+ reassoc transforms
+
+ accumulator *= v1[0] * v1[1] * v1[2] * v1[3] *
+ v2[0] * v2[1] * v2[2] * v2[3] *
+ v3[0] * v3[1] * v3[2] * v3[3] *
+ v4[0] * v4[1] * v4[2] * v4[3] ;
+
+ into:
+
+ T = v1 * v2 * v3 * v4;
+ accumulator *= T[0] * T[1] * T[2] * T[3];
+
+   Fewer bit_field_refs are needed: only four for a 128-bit or wider vector.  */
+
+typedef float v4sf __attribute__((vector_size(16)));
+float test(float accumulator, v4sf v1, v4sf v2, v4sf v3, v4sf v4) {
+ accumulator *= v1[0] * v1[1] * v1[2] * v1[3];
+ accumulator *= v2[0] * v2[1] * v2[2] * v2[3];
+ accumulator *= v3[0] * v3[1] * v3[2] * v3[3];
+ accumulator *= v4[0] * v4[1] * v4[2] * v4[3];
+ return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
new file mode 100644
index 0000000..e774d25
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O2 -fdump-tree-reassoc1" } */
+
+/* Test that reassoc can undistribute vector bit_field_refs on bitwise AND.
+
+   v1, v2, v3 and v4 are of type vector int.
+
+ reassoc transforms
+
+ accumulator &= v1[0] & v1[1] & v1[2] & v1[3] &
+ v2[0] & v2[1] & v2[2] & v2[3] &
+ v3[0] & v3[1] & v3[2] & v3[3] &
+ v4[0] & v4[1] & v4[2] & v4[3] ;
+
+ into:
+
+ T = v1 & v2 & v3 & v4;
+ accumulator &= T[0] & T[1] & T[2] & T[3];
+
+   Fewer bit_field_refs are needed: only four for a 128-bit or wider vector.  */
+
+typedef int v4si __attribute__((vector_size(16)));
+int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+ accumulator &= v1[0] & v1[1] & v1[2] & v1[3];
+ accumulator &= v2[0] & v2[1] & v2[2] & v2[3];
+ accumulator &= v3[0] & v3[1] & v3[2] & v3[3];
+ accumulator &= v4[0] & v4[1] & v4[2] & v4[3];
+ return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
new file mode 100644
index 0000000..7b75404
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O2 -fdump-tree-reassoc1" } */
+
+/* Test that reassoc can undistribute vector bit_field_refs on bitwise IOR.
+
+   v1, v2, v3 and v4 are of type vector int.
+
+ reassoc transforms
+
+ accumulator |= v1[0] | v1[1] | v1[2] | v1[3] |
+ v2[0] | v2[1] | v2[2] | v2[3] |
+ v3[0] | v3[1] | v3[2] | v3[3] |
+ v4[0] | v4[1] | v4[2] | v4[3] ;
+
+ into:
+
+ T = v1 | v2 | v3 | v4;
+ accumulator |= T[0] | T[1] | T[2] | T[3];
+
+   Fewer bit_field_refs are needed: only four for a 128-bit or wider vector.  */
+
+typedef int v4si __attribute__((vector_size(16)));
+int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+ accumulator |= v1[0] | v1[1] | v1[2] | v1[3];
+ accumulator |= v2[0] | v2[1] | v2[2] | v2[3];
+ accumulator |= v3[0] | v3[1] | v3[2] | v3[3];
+ accumulator |= v4[0] | v4[1] | v4[2] | v4[3];
+ return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
new file mode 100644
index 0000000..d069725
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O2 -fdump-tree-reassoc1" } */
+
+/* Test that reassoc can undistribute vector bit_field_refs on bitwise XOR.
+
+   v1, v2, v3 and v4 are of type vector int.
+
+ reassoc transforms
+
+ accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^
+ v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^
+ v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^
+ v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ;
+
+ into:
+
+ T = v1 ^ v2 ^ v3 ^ v4;
+ accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3];
+
+   Fewer bit_field_refs are needed: only four for a 128-bit or wider vector.  */
+
+typedef int v4si __attribute__((vector_size(16)));
+int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+ accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3];
+ accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3];
+ accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3];
+ accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
+ return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c
index e1c4dfe..d755911 100644
--- a/gcc/tree-ssa-reassoc.c
+++ b/gcc/tree-ssa-reassoc.c
@@ -1772,6 +1772,298 @@ undistribute_ops_list (enum tree_code opcode,
return changed;
}
+/* Hold the information of one specific VECTOR_TYPE SSA_NAME.
+   - offsets: the BIT_FIELD_REF offsets accessing this VECTOR.
+   - ops_indexes: the indexes into vec *OPS for each relevant BIT_FIELD_REF. */
+struct v_info
+{
+ auto_vec<unsigned HOST_WIDE_INT, 32> offsets;
+ auto_vec<unsigned, 32> ops_indexes;
+};
+
+typedef struct v_info *v_info_ptr;
+
+/* Comparison function for qsort on unsigned HOST_WIDE_INT BIT_FIELD_REF
+   offsets.  Read the elements at their full width (they are unsigned
+   HOST_WIDE_INT, not unsigned) and return zero on equality so the
+   comparator defines a valid ordering.  */
+static int
+unsigned_cmp (const void *p_i, const void *p_j)
+{
+  unsigned HOST_WIDE_INT i = *(const unsigned HOST_WIDE_INT *) p_i;
+  unsigned HOST_WIDE_INT j = *(const unsigned HOST_WIDE_INT *) p_j;
+  if (i == j)
+    return 0;
+  return i > j ? 1 : -1;
+}
+
+/* Cleanup hash map for VECTOR information. */
+static void
+cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map)
+{
+ for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin ();
+ it != info_map.end (); ++it)
+ {
+ v_info_ptr info = (*it).second;
+ delete info;
+ (*it).second = NULL;
+ }
+}
+
+/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE.
+     V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k]
+   is transformed to
+     Vs = (V1 + V2 + ... + Vn)
+     Vs[0] + Vs[1] + ... + Vs[k]
+
+   The basic steps are:
+
+    1) Check the operand chain *OPS, looking for summands that come from
+       a BIT_FIELD_REF on a VECTOR type.  Record the information in
+       v_info_map for each qualifying summand, using the VECTOR SSA_NAME
+       as the key.
+
+    2) For each key (VECTOR SSA_NAME), validate that its BIT_FIELD_REFs
+       are contiguous and cover the whole VECTOR without any holes.
+       Collect the list of VECTOR candidates to be transformed.
+
+    3) Build the combining statements for all VECTOR candidates and
+       generate BIT_FIELD_REFs on the combined results accordingly.
+
+   TODO:
+    1) The current implementation requires all candidate VECTORs to have
+       the same VECTOR type; it could be extended to handle groups of
+       different VECTOR types if profitable cases are found.
+    2) The current implementation requires each VECTOR to be fully
+       covered; it could be extended to support partial coverage
+       (adjacent elements that do not fill the whole VECTOR), which
+       would likely need a cost model to decide when it is worthwhile.  */
+static bool
+undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops,
+ struct loop *loop)
+{
+ if (ops->length () <= 1)
+ return false;
+
+ if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR
+ && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR)
+ return false;
+
+ hash_map<tree, v_info_ptr> v_info_map;
+ operand_entry *oe1;
+ unsigned i;
+
+  /* Find the summands in the operand chain that come from a BIT_FIELD_REF
+     on a VECTOR, and record their information in the map.  */
+ FOR_EACH_VEC_ELT (*ops, i, oe1)
+ {
+ enum tree_code dcode;
+ gimple *oe1def;
+
+ if (TREE_CODE (oe1->op) != SSA_NAME)
+ continue;
+ oe1def = SSA_NAME_DEF_STMT (oe1->op);
+ if (!is_gimple_assign (oe1def))
+ continue;
+ dcode = gimple_assign_rhs_code (oe1def);
+ if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop))
+ continue;
+
+ tree rhs = gimple_op (oe1def, 1);
+ tree op0 = TREE_OPERAND (rhs, 0);
+ tree vec_type = TREE_TYPE (op0);
+
+ if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE)
+ continue;
+
+ tree op1 = TREE_OPERAND (rhs, 1);
+ tree op2 = TREE_OPERAND (rhs, 2);
+
+ tree elem_type = TREE_TYPE (vec_type);
+ unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+ if (size != TREE_INT_CST_LOW (op1))
+ continue;
+
+      /* Ignore it if the target machine can't support this VECTOR type.  */
+ if (!VECTOR_MODE_P (TYPE_MODE (vec_type)))
+ continue;
+
+ v_info_ptr *info_ptr = v_info_map.get (op0);
+ if (info_ptr)
+ {
+ v_info_ptr info = *info_ptr;
+ info->offsets.safe_push (TREE_INT_CST_LOW (op2));
+ info->ops_indexes.safe_push (i);
+ }
+ else
+ {
+ v_info_ptr info = new v_info;
+ info->offsets.safe_push (TREE_INT_CST_LOW (op2));
+ info->ops_indexes.safe_push (i);
+ v_info_map.put (op0, info);
+ }
+ }
+
+  /* We need at least two VECTORs to combine.  */
+ if (v_info_map.elements () <= 1)
+ {
+ cleanup_vinfo_map (v_info_map);
+ return false;
+ }
+
+  /* Use the first VECTOR and its information as the reference.
+     First validate it, that is:
+     1) the required VECTOR operation is supported on the target machine;
+     2) the sorted offsets are adjacent, with no holes;
+     3) they fill the whole VECTOR perfectly.  */
+ hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin ();
+ tree ref_vec = (*it).first;
+ tree vec_type = TREE_TYPE (ref_vec);
+ tree elem_type = TREE_TYPE (vec_type);
+
+  /* Check that the VECTOR operation is available on the target machine.  */
+ optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector);
+ if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing)
+ {
+ cleanup_vinfo_map (v_info_map);
+ return false;
+ }
+
+ v_info_ptr ref_info = (*it).second;
+ ref_info->offsets.qsort (unsigned_cmp);
+ unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+ unsigned HOST_WIDE_INT curr;
+ unsigned HOST_WIDE_INT prev = ref_info->offsets[0];
+
+  /* Check that the sorted offsets are contiguous.  */
+ FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1)
+ {
+ if (curr != (prev + elem_size))
+ {
+ cleanup_vinfo_map (v_info_map);
+ return false;
+ }
+ prev = curr;
+ }
+
+  /* Check whether the refs fill the whole VECTOR.  */
+ if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec))))
+ {
+ cleanup_vinfo_map (v_info_map);
+ return false;
+ }
+
+ auto_vec<tree> vectors (v_info_map.elements ());
+ vectors.quick_push (ref_vec);
+
+  /* Use ref_vec to filter out the others.  */
+ for (++it; it != v_info_map.end (); ++it)
+ {
+ tree vec = (*it).first;
+ v_info_ptr info = (*it).second;
+ if (TREE_TYPE (ref_vec) != TREE_TYPE (vec))
+ continue;
+ if (ref_info->offsets.length () != info->offsets.length ())
+ continue;
+ bool same_offset = true;
+ info->offsets.qsort (unsigned_cmp);
+ for (unsigned i = 0; i < ref_info->offsets.length (); i++)
+ {
+ if (ref_info->offsets[i] != info->offsets[i])
+ {
+ same_offset = false;
+ break;
+ }
+ }
+ if (!same_offset)
+ continue;
+ vectors.quick_push (vec);
+ }
+
+ if (vectors.length () < 2)
+ {
+ cleanup_vinfo_map (v_info_map);
+ return false;
+ }
+
+ tree tr;
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "The bit_field_ref vector list for undistribute: ");
+ FOR_EACH_VEC_ELT (vectors, i, tr)
+ {
+ print_generic_expr (dump_file, tr);
+ fprintf (dump_file, " ");
+ }
+ fprintf (dump_file, "\n");
+ }
+
+  /* Combine all the candidate VECTORs with OPCODE.  */
+ unsigned idx;
+ gimple *sum = NULL;
+ v_info_ptr info;
+ tree sum_vec = ref_vec;
+ FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1)
+ {
+ sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode);
+ info = *(v_info_map.get (tr));
+ unsigned j;
+ FOR_EACH_VEC_ELT (info->ops_indexes, j, idx)
+ {
+ gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
+ gimple_set_visited (def, true);
+ if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR
+ || opcode == BIT_IOR_EXPR)
+ (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op));
+ else if (opcode == MULT_EXPR)
+ (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op));
+ else
+ {
+ gcc_assert (opcode == BIT_AND_EXPR);
+ (*ops)[idx]->op
+ = build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op));
+ }
+ (*ops)[idx]->rank = 0;
+ }
+ sum_vec = gimple_get_lhs (sum);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+          fprintf (dump_file, "Generating combining statement -> ");
+ print_gimple_stmt (dump_file, sum, 0);
+ }
+ }
+
+  /* Referring to the offsets of any validated VECTOR (here ref_vec),
+     generate the BIT_FIELD_REF statements on the combined result.  */
+ info = *(v_info_map.get (ref_vec));
+ gcc_assert (sum);
+ FOR_EACH_VEC_ELT (info->ops_indexes, i, idx)
+ {
+ tree dst = make_ssa_name (elem_type);
+ gimple *gs
+ = gimple_build_assign (dst, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, elem_type, sum_vec,
+ TYPE_SIZE (elem_type),
+ bitsize_int (info->offsets[i])));
+ insert_stmt_after (gs, sum);
+ update_stmt (gs);
+ gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
+ gimple_set_visited (def, true);
+ (*ops)[idx]->op = gimple_assign_lhs (gs);
+ (*ops)[idx]->rank = get_rank ((*ops)[idx]->op);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "Generating bit_field_ref -> ");
+ print_gimple_stmt (dump_file, gs, 0);
+ }
+ }
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+      fprintf (dump_file, "undistributing bit_field_ref for vector done.\n");
+ }
+
+ cleanup_vinfo_map (v_info_map);
+
+ return true;
+}
+
/* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison
expression, examine the other OPS to see if any of them are comparisons
of the same values, which we may be able to combine or eliminate.
@@ -5880,11 +6172,6 @@ reassociate_bb (basic_block bb)
tree lhs, rhs1, rhs2;
enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
- /* If this is not a gimple binary expression, there is
- nothing for us to do with it. */
- if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
- continue;
-
/* If this was part of an already processed statement,
we don't need to touch it again. */
if (gimple_visited_p (stmt))
@@ -5911,6 +6198,11 @@ reassociate_bb (basic_block bb)
continue;
}
+ /* If this is not a gimple binary expression, there is
+ nothing for us to do with it. */
+ if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
+ continue;
+
lhs = gimple_assign_lhs (stmt);
rhs1 = gimple_assign_rhs1 (stmt);
rhs2 = gimple_assign_rhs2 (stmt);
@@ -5950,6 +6242,13 @@ reassociate_bb (basic_block bb)
optimize_ops_list (rhs_code, &ops);
}
+ if (undistribute_bitref_for_vector (rhs_code, &ops,
+ loop_containing_stmt (stmt)))
+ {
+ ops.qsort (sort_by_operand_rank);
+ optimize_ops_list (rhs_code, &ops);
+ }
+
if (rhs_code == PLUS_EXPR
&& transform_add_to_multiply (&ops))
ops.qsort (sort_by_operand_rank);
--
2.7.4
on 2019/3/11 9:49 PM, Kewen.Lin wrote:
> on 2019/3/11 6:24 PM, Richard Biener wrote:
>> On Mon, 11 Mar 2019, Kewen.Lin wrote:
>>
>>> on 2019/3/8 ä¸å6:57, Richard Biener wrote:
>>>> On Fri, Mar 8, 2019 at 2:40 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>>>>
>>>>> Hi,
>>>>>
>>>>> As PR88497 (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88497),
>>>>> when we meet some code pattern like:
>>>>>
>>>>> V1[0] + V1[1] + ... + V1[k] + V2[0] + ... + V2[k] + ... Vn[k]
>>>>> // V1...Vn of VECTOR_TYPE
>>>>>
>>>>> We can teach reassoc to transform it to:
>>>>>
>>>>> Vs = (V1 + V2 + ... + Vn)
>>>>> Vs[0] + Vs[1] + ... + Vs[k]
>>>>>
>>>>> It saves addition and bit_field_ref operations and exposes more
>>>>> opportunities for downstream passes, I notice that even on one
>>>>> target doesn't support vector type and vector type gets expanded
>>>>> in veclower, it's still better to have it, since the generated
>>>>> sequence is more friendly for widening_mul.  (If one more time
>>>>> DCE after forwprop, it should be the same.)
>>>>>
>>>>> Bootstrapped/regtested on powerpc64le-linux-gnu, ok for trunk?
>>>>
>>>> Hmm. You are basically vectorizing the reduction partially here (note
>>>> basic-block vectorization doesn't handle reductions yet). So I'm not sure
>>>> whether we should eventually just change op sort order to sort
>>>> v1[1] after v2[0] to sort v1[0] and v2[0] together. That would be also maybe
>>>> an intermediate step that makes implementing the vectorization part
>>>> "easier"? I also imagine that the "vectorization" would be profitable even
>>>> if there's just two elements of the vectors summed and the real vectors are
>>>> larger.
>>>>
>>>
>>> Hi Richard,
>>>
>>> Sorry, I have no idea about basic-block vectorization (SLP?), did you suggest
>>> supporting this there rather than in reassoc? Changing op sort order for
>>> its expected pattern?
>>
>> I was thinking you're smashing two things together here - one part
>> suitable for reassocation and one that's on the border. The reassoc
>> pass already gathered quite some stuff that doesn't really belong there,
>> we should at least think twice adding to that.
>>
>
> Got it! Since the discussion context of the PR is mainly about reassocation,
> I started to look from it. :)
>
>>>> Note that the code you transform contains no vector operations (just
>>>> element extracts) and thus you need to check for target support before
>>>> creating vector add.
>>>>
>>>
>>> I had the same concern before. But I thought that if this VECTOR type is valid
>>> on the target, the addition operation should be fundamental on the target, then
>>> it's ok; if it's invalid, then veclower will expand it into scalar type as well
>>> as the addition operation. So it looks fine to guard it. Does it make sense?
>>
>> But there's a reassoc phase after veclower. Generally we avoid generating
>> vector code not supported by the target. You are not checking whether
>> the vector type is valid on the target either btw. The user might have
>> written an optimal scalar sequence for a non-natively supported vector
>> type (and veclower wouldn't touch the original scalar code) and veclower
>> generally makes a mess of things, likely worse than the original code.
>>
>
> Thanks for your explanation! I'll update the code to check vector supported by
> target or not.
>
>>>
>>>> This is for GCC 10. Also it doesn't fix PR88497, does it?
>>>>
>>>
>>> Yes, it's in low priority and for GCC10. I think it does. Did I miss
>>> something?
>>
>> Wasn't the original testcase in the PR for _vectorized_ code? The
>> PR then got an additional testcase which you indeed fix.
>>
>
> The original case is actually optimal with -Ofast, but additional case
> isn't. :)
>
>>> For comment 1 and comment 7 cases, trunk gcc generates the expected code
>>> with -Ofast. There isn't any issues for the original loop. But for the
>>> expanded code in comment 1 (manually expanded case), gcc can't generate
>>> better code with multiply-add.
>>>
>>> From comment 9 case (same as comment 1 expanded version):
>>>
>>> without this patch, optimized tree and final asm:
>>>
>>> <bb 2> [local count: 1073741824]:
>>> _1 = *arg2_26(D);
>>> _2 = *arg3_27(D);
>>> _3 = _1 * _2;
>>> _4 = BIT_FIELD_REF <_3, 64, 0>;
>>> _5 = BIT_FIELD_REF <_3, 64, 64>;
>>> _18 = _4 + _5;
>>> _7 = MEM[(__vector double *)arg2_26(D) + 16B];
>>> _8 = MEM[(__vector double *)arg3_27(D) + 16B];
>>> _9 = _7 * _8;
>>> _10 = BIT_FIELD_REF <_9, 64, 0>;
>>> _11 = BIT_FIELD_REF <_9, 64, 64>;
>>> _31 = _10 + _11;
>>> _29 = _18 + _31;
>>> _13 = MEM[(__vector double *)arg2_26(D) + 32B];
>>> _14 = MEM[(__vector double *)arg3_27(D) + 32B];
>>> _15 = _13 * _14;
>>> _16 = BIT_FIELD_REF <_15, 64, 0>;
>>> _17 = BIT_FIELD_REF <_15, 64, 64>;
>>> _6 = _16 + _17;
>>> _12 = _6 + accumulator_28(D);
>>> _37 = _12 + _29;
>>> _19 = MEM[(__vector double *)arg2_26(D) + 48B];
>>> _20 = MEM[(__vector double *)arg3_27(D) + 48B];
>>> _21 = _19 * _20;
>>> _22 = BIT_FIELD_REF <_21, 64, 0>;
>>> _23 = BIT_FIELD_REF <_21, 64, 64>;
>>> _24 = _22 + _23;
>>> accumulator_32 = _24 + _37;
>>> return accumulator_32;
>>>
>>> 0000000000000000 <foo>:
>>> 0: 01 00 e5 f4 lxv vs7,0(r5)
>>> 4: 11 00 05 f4 lxv vs0,16(r5)
>>> 8: 21 00 24 f5 lxv vs9,32(r4)
>>> c: 21 00 c5 f4 lxv vs6,32(r5)
>>> 10: 01 00 44 f5 lxv vs10,0(r4)
>>> 14: 11 00 64 f5 lxv vs11,16(r4)
>>> 18: 31 00 05 f5 lxv vs8,48(r5)
>>> 1c: 31 00 84 f5 lxv vs12,48(r4)
>>> 20: 80 33 29 f1 xvmuldp vs9,vs9,vs6
>>> 24: 80 3b 4a f1 xvmuldp vs10,vs10,vs7
>>> 28: 80 03 6b f1 xvmuldp vs11,vs11,vs0
>>> 2c: 80 43 8c f1 xvmuldp vs12,vs12,vs8
>>> 30: 50 4b 09 f1 xxspltd vs8,vs9,1
>>> 34: 50 5b eb f0 xxspltd vs7,vs11,1
>>> 38: 50 53 0a f0 xxspltd vs0,vs10,1
>>> 3c: 2a 48 28 fd fadd f9,f8,f9
>>> 40: 2a 58 67 fd fadd f11,f7,f11
>>> 44: 2a 50 00 fc fadd f0,f0,f10
>>> 48: 50 63 4c f1 xxspltd vs10,vs12,1
>>> 4c: 2a 60 8a fd fadd f12,f10,f12
>>> 50: 2a 08 29 fd fadd f9,f9,f1
>>> 54: 2a 58 00 fc fadd f0,f0,f11
>>> 58: 2a 48 20 fc fadd f1,f0,f9
>>> 5c: 2a 08 2c fc fadd f1,f12,f1
>>> 60: 20 00 80 4e blr
>>>
>>>
>>> with this patch, optimized tree and final asm:
>>>
>>> _1 = *arg2_26(D);
>>> _2 = *arg3_27(D);
>>> _7 = MEM[(__vector double *)arg2_26(D) + 16B];
>>> _8 = MEM[(__vector double *)arg3_27(D) + 16B];
>>> _9 = _7 * _8;
>>> _5 = .FMA (_1, _2, _9);
>>> _13 = MEM[(__vector double *)arg2_26(D) + 32B];
>>> _14 = MEM[(__vector double *)arg3_27(D) + 32B];
>>> _19 = MEM[(__vector double *)arg2_26(D) + 48B];
>>> _20 = MEM[(__vector double *)arg3_27(D) + 48B];
>>> _21 = _19 * _20;
>>> _10 = .FMA (_13, _14, _21);
>>> _41 = _5 + _10;
>>> _43 = BIT_FIELD_REF <_41, 64, 64>;
>>> _42 = BIT_FIELD_REF <_41, 64, 0>;
>>> _44 = _42 + _43;
>>> accumulator_32 = accumulator_28(D) + _44;
>>> return accumulator_32;
>>>
>>> 0000000000000000 <foo>:
>>> 0: 11 00 04 f4 lxv vs0,16(r4)
>>> 4: 11 00 c5 f4 lxv vs6,16(r5)
>>> 8: 31 00 44 f5 lxv vs10,48(r4)
>>> c: 31 00 e5 f4 lxv vs7,48(r5)
>>> 10: 01 00 84 f5 lxv vs12,0(r4)
>>> 14: 01 00 05 f5 lxv vs8,0(r5)
>>> 18: 21 00 64 f5 lxv vs11,32(r4)
>>> 1c: 21 00 25 f5 lxv vs9,32(r5)
>>> 20: 80 33 00 f0 xvmuldp vs0,vs0,vs6
>>> 24: 80 3b 4a f1 xvmuldp vs10,vs10,vs7
>>> 28: 08 43 0c f0 xvmaddadp vs0,vs12,vs8
>>> 2c: 90 54 8a f1 xxlor vs12,vs10,vs10
>>> 30: 08 4b 8b f1 xvmaddadp vs12,vs11,vs9
>>> 34: 00 63 00 f0 xvadddp vs0,vs0,vs12
>>> 38: 50 03 80 f1 xxspltd vs12,vs0,1
>>> 3c: 2a 00 0c fc fadd f0,f12,f0
>>> 40: 2a 08 20 fc fadd f1,f0,f1
>>> 44: 20 00 80 4e blr
>>
>> OK, I see.
>>
>> Btw, your code should equally well work for multiplications and
>> bit operations, no? So it would be nice to extend it for those.
>>
>
> Good point! I'll extend it.
>
>> Please add the appropriate target support checks for the vector
>> operations.
>>
>
> OK. Will fix it.
>
>
>> Richard.
>>
>>>> Richard.
>>>>