This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH, vec-tails 08/10] Support loop epilogue masking and low trip count loop vectorization


Hi,

This patch enables vectorization of loop epilogues and low trip count
loops using masking.

Thanks,
Ilya
--
gcc/

2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>

	* dbgcnt.def (vect_tail_mask): New.
	* tree-vect-loop.c (vect_analyze_loop_2): Support masked loop
	epilogues and low trip count loops.
	(vect_get_known_peeling_cost): Ignore scalar epilogue cost for
	loops we are going to mask.
	(vect_estimate_min_profitable_iters): Support masked loop
	epilogues and low trip count loops.
	* tree-vectorizer.c (vectorize_loops): Add a message for a case
	when loop epilogue can't be vectorized.


diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
index 73c2966..5aad1d7 100644
--- a/gcc/dbgcnt.def
+++ b/gcc/dbgcnt.def
@@ -193,4 +193,5 @@ DEBUG_COUNTER (tree_sra)
 DEBUG_COUNTER (vect_loop)
 DEBUG_COUNTER (vect_slp)
 DEBUG_COUNTER (vect_tail_combine)
+DEBUG_COUNTER (vect_tail_mask)
 DEBUG_COUNTER (dom_unreachable_edges)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 1a80c42..7075f29 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2199,7 +2199,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   HOST_WIDE_INT estimated_niter;
   unsigned th;
-  int min_scalar_loop_bound;
+  int min_scalar_loop_bound = 0;
 
   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
   ok = vect_analyze_slp (loop_vinfo, n_stmts);
@@ -2224,6 +2224,30 @@ start_over:
   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   gcc_assert (vectorization_factor != 0);
 
+  /* For now we mask loop epilogue using the same VF since it was used
+     for cost estimations and it should be easier for reduction
+     optimization.  */
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) != (int)vectorization_factor)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "not vectorized: VF for loop epilogue doesn't "
+			 "match original loop VF.\n");
+      return false;
+    }
+
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) <= (int)vectorization_factor)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "not vectorized: VF for loop epilogue is too small\n");
+      return false;
+    }
+
   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "vectorization_factor = %d, niters = "
@@ -2237,11 +2261,29 @@ start_over:
       || (max_niter != -1
 	  && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
     {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "not vectorized: iteration count smaller than "
-			 "vectorization factor.\n");
-      return false;
+      /* Allow low trip count for loop epilogue we want to mask.  */
+      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+	  && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+	;
+      /* Allow low trip count for non-epilogue loops if flag is enabled.  */
+      else if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+	       && flag_tree_vectorize_short_loops)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "iteration count is small, masking is "
+			     "required for chosen vectorization factor.\n");
+
+	  LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
+	}
+      else
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "not vectorized: iteration count smaller than "
+			     "vectorization factor.\n");
+	  return false;
+	}
     }
 
   /* Analyze the alignment of the data-refs in the loop.
@@ -2282,6 +2324,16 @@ start_over:
       return false;
     }
 
+  LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = true;
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "vectorizing loop epilogue with masking.\n");
+      LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
+    }
+
   if (slp)
     {
       /* Analyze operations in the SLP instances.  Note this may
@@ -2305,6 +2357,19 @@ start_over:
       return false;
     }
 
+  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+      && !LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+    {
+      gcc_assert (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+		  || !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo));
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "not vectorized: loop cannot be masked.\n");
+
+      return false;
+    }
+
   /* Analyze cost.  Decide if worth while to vectorize.  */
   int min_profitable_estimate, min_profitable_iters;
   int min_profitable_combine_iters;
@@ -2324,8 +2389,9 @@ start_over:
       goto again;
     }
 
-  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
-			    * vectorization_factor) - 1);
+  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+    min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+			      * vectorization_factor) - 1);
 
   /* Use the cost model only if it is more conservative than user specified
      threshold.  */
@@ -2425,18 +2491,28 @@ start_over:
   else if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
 	   && min_profitable_combine_iters >= 0)
     {
-      if (((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	    && (LOOP_VINFO_INT_NITERS (loop_vinfo)
-		>= (unsigned) min_profitable_combine_iters))
+      if ((LOOP_VINFO_NEED_MASKING (loop_vinfo)
+	   || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+	       && (LOOP_VINFO_INT_NITERS (loop_vinfo)
+		   >= (unsigned) min_profitable_combine_iters))
 	   || estimated_niter == -1
 	   || estimated_niter >= min_profitable_combine_iters)
-	  && dbg_cnt (vect_tail_combine))
+	  && (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+	      || dbg_cnt (vect_tail_combine)))
 	{
 	  LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
 	  LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo) = true;
 
-	  dump_printf_loc (MSG_NOTE, vect_location,
-			   "Decided to combine loop with its epilogue.\n");
+          if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) && dump_enabled_p ())
+	    {
+	      if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Decided to vectorize low trip count loop "
+				 "with masking.\n");
+	      else
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Decided to combine loop with its epilogue.\n");
+	    }
 
 	  /* We need to adjust profitability check if combine
 	     epilogue considering additional vector iteration
@@ -2463,6 +2539,22 @@ start_over:
 	}
     }
 
+  /* Check for not profitable low trip count loop vectorization.  */
+  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && !LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "not vectorized: low trip count loop "
+			 "vectorization is not profitable.\n");
+      return false;
+    }
+
+  if (LOOP_VINFO_MASK_EPILOGUE (loop_vinfo)
+      && !dbg_cnt (vect_tail_mask))
+    LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
+
   /* Ok to vectorize!  */
   return true;
 
@@ -3413,7 +3505,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
 				  si->count * peel_iters_prologue,
 				  si->kind, NULL, si->misalign,
 				  vect_prologue);
-  if (*peel_iters_epilogue)
+  if (*peel_iters_epilogue && !LOOP_VINFO_NEED_MASKING (loop_vinfo))
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
       retval += record_stmt_cost (epilogue_cost_vec,
 				  si->count * *peel_iters_epilogue,
@@ -3451,12 +3543,50 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
 
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+    {
+      /* Currently we don't produce scalar epilogue version in case
+	 its masked version is provided.  It means we don't need to
+	 compute profitability one more time here.  Just make a
+	 masked loop version.  */
+      if (LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+	{
+	  gcc_assert (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo));
+
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "cost model: mask loop epilogue.\n");
+
+	  *ret_min_profitable_niters = 0;
+	  *ret_min_profitable_estimate = 0;
+	  *ret_min_profitable_combine_niters = 0;
+	  return;
+	}
+      else if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
+	{
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "cost model disabled for epilogue.\n");
+	  *ret_min_profitable_niters = 0;
+	  *ret_min_profitable_estimate = 0;
+	  return;
+	}
+    }
   /* Cost model disabled.  */
-  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
+  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
       *ret_min_profitable_niters = 0;
       *ret_min_profitable_estimate = 0;
+      *ret_min_profitable_combine_niters = -1;
+
+      if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+	*ret_min_profitable_combine_niters = 0;
+      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
+	       && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+	LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
+	       && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+	*ret_min_profitable_combine_niters = 0;
+
       return;
     }
 
@@ -3544,10 +3674,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 				si->count * peel_iters_prologue,
 				si->kind, stmt_info, si->misalign,
 				vect_prologue);
-	  (void) add_stmt_cost (target_cost_data,
-				si->count * peel_iters_epilogue,
-				si->kind, stmt_info, si->misalign,
-				vect_epilogue);
+	  /* We shouldn't add scalar epilogue cost for low trip
+	     count loops which are masked and have no epilogue.  */
+	  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+	    (void) add_stmt_cost (target_cost_data,
+				  si->count * peel_iters_epilogue,
+				  si->kind, stmt_info, si->misalign,
+				  vect_epilogue);
 	}
     }
   else
@@ -3744,8 +3877,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 	       "  Calculated minimum iters for profitability: %d\n",
 	       min_profitable_iters);
 
-  min_profitable_iters =
-	min_profitable_iters < vf ? vf : min_profitable_iters;
+  /* Adjust to VF for non-masked loops.  */
+  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+    min_profitable_iters = MAX (min_profitable_iters, vf);
 
   /* Because the condition we create is:
      if (niters <= min_profitable_iters)
@@ -3787,6 +3921,25 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 
   *ret_min_profitable_combine_niters = -1;
 
+  /* Handle low trip count loops.  */
+  if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+    {
+      /* Masked iteration should be better than a scalar loop:
+	 MIC + VIC + MOC < SIC * epilogue_niters  */
+      if ((int)(masking_inside_cost + masking_prologue_cost + vec_inside_cost)
+	  >= (scalar_single_iter_cost * peel_iters_epilogue))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Low trip count loop vectorization is not "
+			     "profitable.\n");
+	  return;
+	}
+
+      *ret_min_profitable_combine_niters = 0;
+      return;
+    }
+
   /* Don't try to vectorize epilogue of epilogue.  */
   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
     return;
@@ -3795,7 +3948,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     {
       if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
 	{
-	  if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
+	  if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
+	    LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+	  else if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
 	    *ret_min_profitable_combine_niters = 0;
 	  return;
 	}
@@ -3854,6 +4009,29 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 			     profitable_iters);
 	  *ret_min_profitable_combine_niters = profitable_iters;
 	}
+
+      if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK))
+	return;
+
+      /* Now compute profitability for loop epilogue masking.
+	 The following condition must hold true:
+	 SIC * epilogue_niters + SOC > VIC + MIC + MPC  */
+      int min_profitable_masking_niters
+	= (vec_inside_cost + masking_inside_cost + masking_prologue_cost
+	   - scalar_outside_cost) / scalar_single_iter_cost;
+      if (min_profitable_masking_niters > peel_iters_epilogue)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Loop epilogue masking is not pofitable.\n");
+	}
+      else
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Loop epilogue masking is pofitable.\n");
+	  LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+	}
     }
 }
 
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 5f15246..f70aed6 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -539,7 +539,16 @@ vectorize_loops (void)
 	loop->aux = loop_vinfo;
 
 	if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
-	  continue;
+	  {
+	    if (loop_vinfo
+		&& LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+		&& LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+		&& dump_enabled_p ())
+	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			       "loop epilogue can't be vectorized.\n");
+
+	    continue;
+	  }
 
         if (!dbg_cnt (vect_loop))
 	  {


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]