GCC Bugzilla – Attachment 51350 Details for
Bug 100089
[11 Regression] 30% performance regression for denbench/mp2decoddata2 with -O3
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
[patch]
patch
p (text/plain), 9.23 KB, created by
Richard Biener
on 2021-08-24 10:34:25 UTC
(
hide
)
Description:
patch
Filename:
MIME Type:
Creator:
Richard Biener
Created:
2021-08-24 10:34:25 UTC
Size:
9.23 KB
patch
obsolete
>From ae947d973778dcd7d5ddf8d3e772a136aed5a2f4 Mon Sep 17 00:00:00 2001 >From: Richard Biener <rguenther@suse.de> >Date: Tue, 24 Aug 2021 12:25:25 +0200 >Subject: [PATCH] tree-optimization/100089 - avoid leaving scalar if-converted > code around >To: gcc-patches@gcc.gnu.org > >This avoids leaving scalar if-converted code around for the case >of BB vectorizing an if-converted loop body when using the very-cheap >cost model. In this case we scan not vectorized scalar stmts in >the basic-block vectorized for COND_EXPRs and force the vectorization >to be marked as not profitable. > >The patch also makes sure to always consider all BB vectorization >subgraphs together for costing purposes when vectorizing an >if-converted loop body. > >2021-08-24 Richard Biener <rguenther@suse.de> > > PR tree-optimization/100089 > * tree-vectorizer.h (vect_slp_bb): Rename to ... > (vect_slp_if_converted_bb): ... this and get the original > loop as new argument. > * tree-vectorizer.c (try_vectorize_loop_1): Revert previous fix, > pass original loop to vect_slp_if_converted_bb. > * tree-vect-slp.c (vect_bb_vectorization_profitable_p): > If orig_loop was passed scan the not vectorized stmts > for COND_EXPRs and force not profitable if found. > (vect_slp_region): Pass down all SLP instances to costing > if orig_loop was specified. > (vect_slp_bbs): Pass through orig_loop. > (vect_slp_bb): Rename to ... > (vect_slp_if_converted_bb): ... this and get the original > loop as new argument. > (vect_slp_function): Adjust. >--- > gcc/tree-vect-slp.c | 70 ++++++++++++++++++++++++++++++++++--------- > gcc/tree-vectorizer.c | 20 +++++++------ > gcc/tree-vectorizer.h | 2 +- > 3 files changed, 68 insertions(+), 24 deletions(-) > >diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c >index 3ed5bc1989a..8bfa45772d3 100644 >--- a/gcc/tree-vect-slp.c >+++ b/gcc/tree-vect-slp.c >@@ -5287,7 +5287,8 @@ li_cost_vec_cmp (const void *a_, const void *b_) > > static bool > vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, >- vec<slp_instance> slp_instances) >+ vec<slp_instance> slp_instances, >+ loop_p orig_loop) > { > slp_instance instance; > int i; >@@ -5324,6 +5325,30 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, > vector_costs.safe_splice (instance->cost_vec); > instance->cost_vec.release (); > } >+ /* When we're vectorizing an if-converted loop body with the >+ very-cheap cost model make sure we vectorized all if-converted >+ code. */ >+ bool force_not_profitable = false; >+ if (orig_loop && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP) >+ { >+ gcc_assert (bb_vinfo->bbs.length () == 1); >+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]); >+ !gsi_end_p (gsi); gsi_next (&gsi)) >+ { >+ /* The costing above left us with DCEable vectorized scalar >+ stmts having the visited flag set. */ >+ if (gimple_visited_p (gsi_stmt (gsi))) >+ continue; >+ >+ if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi))) >+ if (gimple_assign_rhs_code (ass) == COND_EXPR) >+ { >+ force_not_profitable = true; >+ break; >+ } >+ } >+ } >+ > /* Unset visited flag. */ > stmt_info_for_cost *cost; > FOR_EACH_VEC_ELT (scalar_costs, i, cost) >@@ -5448,9 +5473,14 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, > return false; > } > >+ if (dump_enabled_p () && force_not_profitable) >+ dump_printf_loc (MSG_NOTE, vect_location, >+ "not profitable because of unprofitable if-converted " >+ "scalar code\n"); >+ > scalar_costs.release (); > vector_costs.release (); >- return true; >+ return !force_not_profitable; > } > > /* qsort comparator for lane defs. */ >@@ -5895,7 +5925,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, > > static bool > vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, >- vec<int> *dataref_groups, unsigned int n_stmts) >+ vec<int> *dataref_groups, unsigned int n_stmts, >+ loop_p orig_loop) > { > bb_vec_info bb_vinfo; > auto_vector_modes vector_modes; >@@ -5944,7 +5975,9 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, > vect_location = instance->location (); > if (!unlimited_cost_model (NULL) > && !vect_bb_vectorization_profitable_p >- (bb_vinfo, instance->subgraph_entries)) >+ (bb_vinfo, >+ orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo) >+ : instance->subgraph_entries, orig_loop)) > { > for (slp_instance inst : instance->subgraph_entries) > if (inst->kind == slp_inst_kind_bb_reduc) >@@ -5965,7 +5998,9 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, > "using SLP\n"); > vectorized = true; > >- vect_schedule_slp (bb_vinfo, instance->subgraph_entries); >+ vect_schedule_slp (bb_vinfo, >+ orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo) >+ : instance->subgraph_entries); > > unsigned HOST_WIDE_INT bytes; > if (dump_enabled_p ()) >@@ -5980,6 +6015,11 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, > "basic block part vectorized using " > "variable length vectors\n"); > } >+ >+ /* When we're called from loop vectorization we're considering >+ all subgraphs at once. */ >+ if (orig_loop) >+ break; > } > } > else >@@ -6047,7 +6087,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, > true if anything in the basic-block was vectorized. */ > > static bool >-vect_slp_bbs (const vec<basic_block> &bbs) >+vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop) > { > vec<data_reference_p> datarefs = vNULL; > auto_vec<int> dataref_groups; >@@ -6077,18 +6117,20 @@ vect_slp_bbs (const vec<basic_block> &bbs) > ++current_group; > } > >- return vect_slp_region (bbs, datarefs, &dataref_groups, insns); >+ return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop); > } > >-/* Main entry for the BB vectorizer. Analyze and transform BB, returns >- true if anything in the basic-block was vectorized. */ >+/* Special entry for the BB vectorizer. Analyze and transform a single >+ if-converted BB with ORIG_LOOPs body being the not if-converted >+ representation. Returns true if anything in the basic-block was >+ vectorized. */ > > bool >-vect_slp_bb (basic_block bb) >+vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop) > { > auto_vec<basic_block> bbs; > bbs.safe_push (bb); >- return vect_slp_bbs (bbs); >+ return vect_slp_bbs (bbs, orig_loop); > } > > /* Main entry for the BB vectorizer. Analyze and transform BB, returns >@@ -6139,7 +6181,7 @@ vect_slp_function (function *fun) > > if (split && !bbs.is_empty ()) > { >- r |= vect_slp_bbs (bbs); >+ r |= vect_slp_bbs (bbs, NULL); > bbs.truncate (0); > bbs.quick_push (bb); > } >@@ -6157,13 +6199,13 @@ vect_slp_function (function *fun) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "splitting region at control altering " > "definition %G", last); >- r |= vect_slp_bbs (bbs); >+ r |= vect_slp_bbs (bbs, NULL); > bbs.truncate (0); > } > } > > if (!bbs.is_empty ()) >- r |= vect_slp_bbs (bbs); >+ r |= vect_slp_bbs (bbs, NULL); > > free (rpo); > >diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c >index 813f4683d1e..3aa3e2a6783 100644 >--- a/gcc/tree-vectorizer.c >+++ b/gcc/tree-vectorizer.c >@@ -1033,10 +1033,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab, > only non-if-converted parts took part in BB vectorization. */ > if (flag_tree_slp_vectorize != 0 > && loop_vectorized_call >- && ! loop->inner >- /* This would purely be a workaround and should be removed >- once PR100089 is fixed. */ >- && flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP) >+ && ! loop->inner) > { > basic_block bb = loop->header; > bool require_loop_vectorize = false; >@@ -1062,12 +1059,17 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab, > gimple_set_uid (stmt, -1); > gimple_set_visited (stmt, false); > } >- if (!require_loop_vectorize && vect_slp_bb (bb)) >+ if (!require_loop_vectorize) > { >- fold_loop_internal_call (loop_vectorized_call, >- boolean_true_node); >- loop_vectorized_call = NULL; >- ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals; >+ tree arg = gimple_call_arg (loop_vectorized_call, 1); >+ class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg)); >+ if (vect_slp_if_converted_bb (bb, scalar_loop)) >+ { >+ fold_loop_internal_call (loop_vectorized_call, >+ boolean_true_node); >+ loop_vectorized_call = NULL; >+ ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals; >+ } > } > } > /* If outer loop vectorization fails for LOOP_VECTORIZED guarded >diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h >index 0ccbaafd50d..6e90217373f 100644 >--- a/gcc/tree-vectorizer.h >+++ b/gcc/tree-vectorizer.h >@@ -2087,7 +2087,7 @@ extern void vect_gather_slp_loads (vec_info *); > extern void vect_get_slp_defs (slp_tree, vec<tree> *); > extern void vect_get_slp_defs (vec_info *, slp_tree, vec<vec<tree> > *, > unsigned n = -1U); >-extern bool vect_slp_bb (basic_block); >+extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop); > extern bool vect_slp_function (function *); > extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree); > extern stmt_vec_info vect_find_first_scalar_stmt_in_slp (slp_tree); >-- >2.31.1 >
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 100089
:
50597
| 51350