This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH, vec-tails 08/10] Support loop epilogue masking and low trip count loop vectorization
- From: Ilya Enkovich <enkovich dot gnu at gmail dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Thu, 19 May 2016 22:46:04 +0300
- Subject: [PATCH, vec-tails 08/10] Support loop epilogue masking and low trip count loop vectorization
- Authentication-results: sourceware.org; auth=none
Hi,
This patch enables vectorization of loop epilogues and low trip count
loops using masking.
Thanks,
Ilya
--
gcc/
2016-05-19 Ilya Enkovich <ilya.enkovich@intel.com>
* dbgcnt.def (vect_tail_mask): New.
* tree-vect-loop.c (vect_analyze_loop_2): Support masked loop
epilogues and low trip count loops.
(vect_get_known_peeling_cost): Ignore scalar epilogue cost for
loops we are going to mask.
(vect_estimate_min_profitable_iters): Support masked loop
epilogues and low trip count loops.
* tree-vectorizer.c (vectorize_loops): Add a message for a case
when loop epilogue can't be vectorized.
diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
index 73c2966..5aad1d7 100644
--- a/gcc/dbgcnt.def
+++ b/gcc/dbgcnt.def
@@ -193,4 +193,5 @@ DEBUG_COUNTER (tree_sra)
DEBUG_COUNTER (vect_loop)
DEBUG_COUNTER (vect_slp)
DEBUG_COUNTER (vect_tail_combine)
+DEBUG_COUNTER (vect_tail_mask)
DEBUG_COUNTER (dom_unreachable_edges)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 1a80c42..7075f29 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2199,7 +2199,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
HOST_WIDE_INT estimated_niter;
unsigned th;
- int min_scalar_loop_bound;
+ int min_scalar_loop_bound = 0;
/* Check the SLP opportunities in the loop, analyze and build SLP trees. */
ok = vect_analyze_slp (loop_vinfo, n_stmts);
@@ -2224,6 +2224,30 @@ start_over:
unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
gcc_assert (vectorization_factor != 0);
+ /* For now we mask loop epilogue using the same VF since it was used
+ for cost estimations and it should be easier for reduction
+ optimization. */
+ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+ && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) != (int)vectorization_factor)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: VF for loop epilogue doesn't "
+ "match original loop VF.\n");
+ return false;
+ }
+
+ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+ && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) <= (int)vectorization_factor)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: VF for loop epilogue is too small\n");
+ return false;
+ }
+
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"vectorization_factor = %d, niters = "
@@ -2237,11 +2261,29 @@ start_over:
|| (max_niter != -1
&& (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "not vectorized: iteration count smaller than "
- "vectorization factor.\n");
- return false;
+ /* Allow low trip count for loop epilogue we want to mask. */
+ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+ ;
+ /* Allow low trip count for non-epilogue loops if flag is enabled. */
+ else if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && flag_tree_vectorize_short_loops)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "iteration count is small, masking is "
+ "required for chosen vectorization factor.\n");
+
+ LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
+ }
+ else
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: iteration count smaller than "
+ "vectorization factor.\n");
+ return false;
+ }
}
/* Analyze the alignment of the data-refs in the loop.
@@ -2282,6 +2324,16 @@ start_over:
return false;
}
+ LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = true;
+ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "vectorizing loop epilogue with masking.\n");
+ LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
+ }
+
if (slp)
{
/* Analyze operations in the SLP instances. Note this may
@@ -2305,6 +2357,19 @@ start_over:
return false;
}
+ if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+ && !LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+ {
+ gcc_assert (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ || !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo));
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: loop cannot be masked.\n");
+
+ return false;
+ }
+
/* Analyze cost. Decide if worth while to vectorize. */
int min_profitable_estimate, min_profitable_iters;
int min_profitable_combine_iters;
@@ -2324,8 +2389,9 @@ start_over:
goto again;
}
- min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
- * vectorization_factor) - 1);
+ if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+ min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+ * vectorization_factor) - 1);
/* Use the cost model only if it is more conservative than user specified
threshold. */
@@ -2425,18 +2491,28 @@ start_over:
else if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
&& min_profitable_combine_iters >= 0)
{
- if (((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && (LOOP_VINFO_INT_NITERS (loop_vinfo)
- >= (unsigned) min_profitable_combine_iters))
+ if ((LOOP_VINFO_NEED_MASKING (loop_vinfo)
+ || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && (LOOP_VINFO_INT_NITERS (loop_vinfo)
+ >= (unsigned) min_profitable_combine_iters))
|| estimated_niter == -1
|| estimated_niter >= min_profitable_combine_iters)
- && dbg_cnt (vect_tail_combine))
+ && (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+ || dbg_cnt (vect_tail_combine)))
{
LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo) = true;
- dump_printf_loc (MSG_NOTE, vect_location,
- "Decided to combine loop with its epilogue.\n");
+ if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) && dump_enabled_p ())
+ {
+ if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Decided to vectorize low trip count loop "
+ "with masking.\n");
+ else
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Decided to combine loop with its epilogue.\n");
+ }
/* We need to adjust profitability check if combine
epilogue considering additional vector iteration
@@ -2463,6 +2539,22 @@ start_over:
}
}
+ /* Check for not profitable low trip count loop vectorization. */
+ if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && !LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "not vectorized: low trip count loop "
+ "vectorization is not profitable.\n");
+ return false;
+ }
+
+ if (LOOP_VINFO_MASK_EPILOGUE (loop_vinfo)
+ && !dbg_cnt (vect_tail_mask))
+ LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
+
/* Ok to vectorize! */
return true;
@@ -3413,7 +3505,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
si->count * peel_iters_prologue,
si->kind, NULL, si->misalign,
vect_prologue);
- if (*peel_iters_epilogue)
+ if (*peel_iters_epilogue && !LOOP_VINFO_NEED_MASKING (loop_vinfo))
FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
retval += record_stmt_cost (epilogue_cost_vec,
si->count * *peel_iters_epilogue,
@@ -3451,12 +3543,50 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
+ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+ {
+ /* Currently we don't produce scalar epilogue version in case
+ its masked version is provided. It means we don't need to
+ compute profitability one more time here. Just make a
+ masked loop version. */
+ if (LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+ {
+ gcc_assert (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo));
+
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "cost model: mask loop epilogue.\n");
+
+ *ret_min_profitable_niters = 0;
+ *ret_min_profitable_estimate = 0;
+ *ret_min_profitable_combine_niters = 0;
+ return;
+ }
+ else if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "cost model disabled for epilogue.\n");
+ *ret_min_profitable_niters = 0;
+ *ret_min_profitable_estimate = 0;
+ return;
+ }
+ }
/* Cost model disabled. */
- if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
+ else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
{
dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
*ret_min_profitable_niters = 0;
*ret_min_profitable_estimate = 0;
+ *ret_min_profitable_combine_niters = -1;
+
+ if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+ *ret_min_profitable_combine_niters = 0;
+ else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
+ && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+ LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+ else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
+ && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+ *ret_min_profitable_combine_niters = 0;
+
return;
}
@@ -3544,10 +3674,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
si->count * peel_iters_prologue,
si->kind, stmt_info, si->misalign,
vect_prologue);
- (void) add_stmt_cost (target_cost_data,
- si->count * peel_iters_epilogue,
- si->kind, stmt_info, si->misalign,
- vect_epilogue);
+ /* We shouldn't add scalar epilogue cost for low trip
+ count loops which are masked and have no epilogue. */
+ if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+ (void) add_stmt_cost (target_cost_data,
+ si->count * peel_iters_epilogue,
+ si->kind, stmt_info, si->misalign,
+ vect_epilogue);
}
}
else
@@ -3744,8 +3877,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
" Calculated minimum iters for profitability: %d\n",
min_profitable_iters);
- min_profitable_iters =
- min_profitable_iters < vf ? vf : min_profitable_iters;
+ /* Adjust to VF for non-masked loops. */
+ if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+ min_profitable_iters = MAX (min_profitable_iters, vf);
/* Because the condition we create is:
if (niters <= min_profitable_iters)
@@ -3787,6 +3921,25 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
*ret_min_profitable_combine_niters = -1;
+ /* Handle low trip count loops. */
+ if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+ {
+ /* Masked iteration should be better than a scalar loop:
+ MIC + VIC + MPC < SIC * epilogue_niters */
+ if ((int)(masking_inside_cost + masking_prologue_cost + vec_inside_cost)
+ >= (scalar_single_iter_cost * peel_iters_epilogue))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Low trip count loop vectorization is not "
+ "profitable.\n");
+ return;
+ }
+
+ *ret_min_profitable_combine_niters = 0;
+ return;
+ }
+
/* Don't try to vectorize epilogue of epilogue. */
if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
return;
@@ -3795,7 +3948,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
{
if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
{
- if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
+ if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
+ LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+ else if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
*ret_min_profitable_combine_niters = 0;
return;
}
@@ -3854,6 +4009,29 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
profitable_iters);
*ret_min_profitable_combine_niters = profitable_iters;
}
+
+ if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK))
+ return;
+
+ /* Now compute profitability for loop epilogue masking.
+ The following condition must hold true:
+ SIC * epilogue_niters + SOC > VIC + MIC + MPC */
+ int min_profitable_masking_niters
+ = (vec_inside_cost + masking_inside_cost + masking_prologue_cost
+ - scalar_outside_cost) / scalar_single_iter_cost;
+ if (min_profitable_masking_niters > peel_iters_epilogue)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Loop epilogue masking is not pofitable.\n");
+ }
+ else
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Loop epilogue masking is pofitable.\n");
+ LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+ }
}
}
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 5f15246..f70aed6 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -539,7 +539,16 @@ vectorize_loops (void)
loop->aux = loop_vinfo;
if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
- continue;
+ {
+ if (loop_vinfo
+ && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+ && dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "loop epilogue can't be vectorized.\n");
+
+ continue;
+ }
if (!dbg_cnt (vect_loop))
{