[gcc r15-1467] tree-optimization/114413 - SLP CSE after permute optimization
Richard Biener
rguenth@gcc.gnu.org
Thu Jun 20 06:47:59 GMT 2024
https://gcc.gnu.org/g:46bb4ce4d30ab749d40f6f4cef6f1fb7c7813452
commit r15-1467-g46bb4ce4d30ab749d40f6f4cef6f1fb7c7813452
Author: Richard Biener <rguenther@suse.de>
Date: Wed Jun 19 12:57:27 2024 +0200
tree-optimization/114413 - SLP CSE after permute optimization
We currently fail to re-CSE SLP nodes after optimizing permutes
which results in off cost estimates. For gcc.dg/vect/bb-slp-32.c
this shows in not re-using the SLP node with the load and arithmetic
for both the store and the reduction. The following implements
CSE by re-bst-mapping nodes as finalization part of vect_optimize_slp.
I've tried to make the CSE part of permute materialization but it
isn't a very good fit there. I've not bothered to implement something
more complete, also handling external defs or defs without
SLP_TREE_SCALAR_STMTS.
I realize this might result in more BB SLP which in turn might slow
down code given costing for BB SLP is difficult (even that we now
vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might be not a good idea).
This is nevertheless feeding more accurate info to costing which is
good.
PR tree-optimization/114413
* tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map):
New function, split out from ...
(vect_analyze_slp): ... here. Call it.
(vect_cse_slp_nodes): New function.
(vect_optimize_slp): Call it.
* gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86.
Diff:
---
gcc/testsuite/gcc.dg/vect/bb-slp-32.c | 6 +++
gcc/tree-vect-slp.cc | 76 +++++++++++++++++++++++++++++------
2 files changed, 70 insertions(+), 12 deletions(-)
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
index 4f72727b6948..475b241c36ed 100644
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-32.c
@@ -38,3 +38,9 @@ int main()
abort ();
return 0;
}
+
+/* This is a weak test but we want to re-use the arithmetic for both the
+ store and the reduction. */
+/* { dg-final { scan-tree-dump "re-using SLP tree" "slp2" { target { x86_64-*-* i?86-*-* } } } } */
+/* On i386 we vectorize both the store and the reduction. */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized" 2 "slp2" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index a5665946a4eb..4935cf9e5210 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1585,6 +1585,23 @@ bst_traits::equal (value_type existing, value_type candidate)
return true;
}
+typedef hash_map <vec <stmt_vec_info>, slp_tree,
+ simple_hashmap_traits <bst_traits, slp_tree> >
+ scalar_stmts_to_slp_tree_map_t;
+
+/* Release BST_MAP. */
+
+static void
+release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
+{
+ /* The map keeps a reference on SLP nodes built, release that. */
+ for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
+ it != bst_map->end (); ++it)
+ if ((*it).second)
+ vect_free_slp_tree ((*it).second);
+ delete bst_map;
+}
+
/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
but then vec::insert does memmove and that's not compatible with
std::pair. */
@@ -1683,10 +1700,6 @@ vect_slp_linearize_chain (vec_info *vinfo,
}
}
-typedef hash_map <vec <stmt_vec_info>, slp_tree,
- simple_hashmap_traits <bst_traits, slp_tree> >
- scalar_stmts_to_slp_tree_map_t;
-
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
vec<stmt_vec_info> stmts, unsigned int group_size,
@@ -4003,14 +4016,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
}
}
-
-
- /* The map keeps a reference on SLP nodes built, release that. */
- for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
- it != bst_map->end (); ++it)
- if ((*it).second)
- vect_free_slp_tree ((*it).second);
- delete bst_map;
+ release_scalar_stmts_to_slp_tree_map (bst_map);
if (pattern_found && dump_enabled_p ())
{
@@ -6068,6 +6074,43 @@ vect_optimize_slp_pass::run ()
free_graph (m_slpg);
}
+/* Apply CSE to NODE and its children using BST_MAP. */
+
+static void
+vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
+{
+ if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
+ && SLP_TREE_CODE (node) != VEC_PERM_EXPR
+ /* Besides some VEC_PERM_EXPR, two-operator nodes also
+ lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
+ we'd have sth that works for all internal and external nodes. */
+ && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
+ {
+ if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
+ {
+ if (*leader != node)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "re-using SLP tree %p for %p\n",
+ (void *)*leader, (void *)node);
+ vect_free_slp_tree (node);
+ (*leader)->refcnt += 1;
+ node = *leader;
+ }
+ return;
+ }
+
+ bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
+ node->refcnt += 1;
+ /* And recurse. */
+ }
+
+ for (slp_tree &child : SLP_TREE_CHILDREN (node))
+ if (child)
+ vect_cse_slp_nodes (bst_map, child);
+}
+
/* Optimize the SLP graph of VINFO. */
void
@@ -6076,6 +6119,15 @@ vect_optimize_slp (vec_info *vinfo)
if (vinfo->slp_instances.is_empty ())
return;
vect_optimize_slp_pass (vinfo).run ();
+
+ /* Apply CSE again to nodes after permute optimization. */
+ scalar_stmts_to_slp_tree_map_t *bst_map
+ = new scalar_stmts_to_slp_tree_map_t ();
+
+ for (auto inst : vinfo->slp_instances)
+ vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
+
+ release_scalar_stmts_to_slp_tree_map (bst_map);
}
/* Gather loads reachable from the individual SLP graph entries. */
More information about the Gcc-cvs
mailing list