[PATCH] Fix PR66721, re-try vectorization without SLP

Richard Biener rguenther@suse.de
Wed Nov 25 14:33:00 GMT 2015


SLP in GCC 6 is now way more aggressive in exploring opportunities
involving random permutations.  This can end up bad with the
cost model telling us this vectorization is not profitable
(both due to bugs in the accounting and due to reality).  But in
some cases at least the interleaving path is a good choice
as the gcc.target/i386/pr61403.c testcase shows.

Given that costs with SLP / without SLP are not really comparable
(due to aforementioned accounting issues) the following implements
re-trying without SLP (where SLP is assumed to be usually better
when it succeeds).

This results in 107 more vectorized loops in SPEC CPU 2006 for
a quick test of mine (that's a ~1% increase) but no noticable
performance improvement with train runs.  Compile-time effects
are in the noise.

Bootstrapped and tested on x86_64-unknown-linux-gnu, I am intending
to commit this tomorrow so leaving a small window for comments.

Thanks,
Richard.

2015-11-25  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/66721
	* tree-vect-loop.c (vect_analyze_loop_2): Compute scalar
	iteration cost earlier.  Re-do analysis without SLP when
	vectorization using SLP fails and without has a chance to succeed.

Index: gcc/tree-vect-loop.c
===================================================================
*** gcc/tree-vect-loop.c	(revision 230793)
--- gcc/tree-vect-loop.c	(working copy)
*************** vect_analyze_loop_2 (loop_vec_info loop_
*** 1891,1896 ****
--- 1899,1912 ----
        return false;
      }
  
+   /* Compute the scalar iteration cost.  */
+   vect_compute_single_scalar_iteration_cost (loop_vinfo);
+ 
+   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+   HOST_WIDE_INT estimated_niter;
+   unsigned th;
+   int min_scalar_loop_bound;
+ 
    /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
    ok = vect_analyze_slp (loop_vinfo, n_stmts);
    if (!ok)
*************** vect_analyze_loop_2 (loop_vec_info loop_
*** 1907,1912 ****
--- 1923,1931 ----
        vect_update_vf_for_slp (loop_vinfo);
      }
  
+   /* This is the point where we can re-start analysis with SLP forced off.  */
+ start_over:
+ 
    /* Now the vectorization factor is final.  */
    unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    gcc_assert (vectorization_factor != 0);
*************** vect_analyze_loop_2 (loop_vec_info loop_
*** 1926,1934 ****
      {
        if (dump_enabled_p ())
  	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- 			 "not vectorized: iteration count too small.\n");
-       if (dump_enabled_p ())
- 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  			 "not vectorized: iteration count smaller than "
  			 "vectorization factor.\n");
        return false;
--- 1945,1950 ----
*************** vect_analyze_loop_2 (loop_vec_info loop_
*** 1961,1972 ****
        return false;
      }
  
-   /* Compute the scalar iteration cost.  */
-   vect_compute_single_scalar_iteration_cost (loop_vinfo);
- 
    /* This pass will decide on using loop versioning and/or loop peeling in
       order to enhance the alignment of data references in the loop.  */
- 
    ok = vect_enhance_data_refs_alignment (loop_vinfo);
    if (!ok)
      {
--- 1977,1984 ----
*************** vect_analyze_loop_2 (loop_vec_info loop_
*** 1985,1991 ****
        vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
  				   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
        if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
! 	return false;
      }
  
    /* Scan all the remaining operations in the loop that are not subject
--- 1997,2003 ----
        vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
  				   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
        if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
! 	goto again;
      }
  
    /* Scan all the remaining operations in the loop that are not subject
*************** vect_analyze_loop_2 (loop_vec_info loop_
*** 2013,2027 ****
  	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  			 "not vectorized: vector version will never be "
  			 "profitable.\n");
!       return false;
      }
  
!   int min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
! 				* vectorization_factor) - 1);
  
    /* Use the cost model only if it is more conservative than user specified
       threshold.  */
!   unsigned th = (unsigned) min_scalar_loop_bound;
    if (min_profitable_iters
        && (!min_scalar_loop_bound
            || min_profitable_iters > min_scalar_loop_bound))
--- 2025,2039 ----
  	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  			 "not vectorized: vector version will never be "
  			 "profitable.\n");
!       goto again;
      }
  
!   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
! 			    * vectorization_factor) - 1);
  
    /* Use the cost model only if it is more conservative than user specified
       threshold.  */
!   th = (unsigned) min_scalar_loop_bound;
    if (min_profitable_iters
        && (!min_scalar_loop_bound
            || min_profitable_iters > min_scalar_loop_bound))
*************** vect_analyze_loop_2 (loop_vec_info loop_
*** 2040,2049 ****
  			 "not vectorized: iteration count smaller than user "
  			 "specified loop bound parameter or minimum profitable "
  			 "iterations (whichever is more conservative).\n");
!       return false;
      }
  
!   HOST_WIDE_INT estimated_niter
      = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
    if (estimated_niter != -1
        && ((unsigned HOST_WIDE_INT) estimated_niter
--- 2052,2061 ----
  			 "not vectorized: iteration count smaller than user "
  			 "specified loop bound parameter or minimum profitable "
  			 "iterations (whichever is more conservative).\n");
!       goto again;
      }
  
!   estimated_niter
      = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
    if (estimated_niter != -1
        && ((unsigned HOST_WIDE_INT) estimated_niter
*************** vect_analyze_loop_2 (loop_vec_info loop_
*** 2059,2065 ****
                           "than specified loop bound parameter or minimum "
                           "profitable iterations (whichever is more "
                           "conservative).\n");
!       return false;
      }
  
    /* Decide whether we need to create an epilogue loop to handle
--- 2071,2077 ----
                           "than specified loop bound parameter or minimum "
                           "profitable iterations (whichever is more "
                           "conservative).\n");
!       goto again;
      }
  
    /* Decide whether we need to create an epilogue loop to handle
*************** vect_analyze_loop_2 (loop_vec_info loop_
*** 2102,2115 ****
  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  			     "not vectorized: can't create required "
  			     "epilog loop\n");
!           return false;
          }
      }
  
    gcc_assert (vectorization_factor
  	      == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  
    return true;
  }
  
  /* Function vect_analyze_loop.
--- 2114,2205 ----
  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  			     "not vectorized: can't create required "
  			     "epilog loop\n");
!           goto again;
          }
      }
  
    gcc_assert (vectorization_factor
  	      == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  
+   /* Ok to vectorize!  */
    return true;
+ 
+ again:
+   /* Try again with SLP forced off but if we didn't do any SLP there is
+      no point in re-trying.  */
+   if (!slp)
+     return false;
+ 
+   /* Likewise if the grouped loads or stores in the SLP cannot be handled
+      via interleaving or lane instructions or if there were any SLP
+      reductions.  */
+   slp_instance instance;
+   slp_tree node;
+   unsigned i, j;
+   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
+     {
+       stmt_vec_info vinfo;
+       vinfo = vinfo_for_stmt
+ 	  (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
+       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
+ 	return false;
+       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
+       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
+       tree vectype = STMT_VINFO_VECTYPE (vinfo);
+       if (! vect_store_lanes_supported (vectype, size)
+ 	  && ! vect_grouped_store_supported (vectype, size))
+ 	return false;
+       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
+ 	{
+ 	  vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
+ 	  vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
+ 	  size = STMT_VINFO_GROUP_SIZE (vinfo);
+ 	  vectype = STMT_VINFO_VECTYPE (vinfo);
+ 	  if (! vect_load_lanes_supported (vectype, size)
+ 	      && ! vect_grouped_load_supported (vectype, size))
+ 	    return false;
+ 	}
+     }
+ 
+   if (dump_enabled_p ())
+     dump_printf_loc (MSG_NOTE, vect_location,
+ 		     "re-trying with SLP disabled\n");
+ 
+   /* Roll back state appropriately.  No SLP this time.  */
+   slp = false;
+   /* Restore vectorization factor as it were without SLP.  */
+   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
+   /* Free the SLP instances.  */
+   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
+     vect_free_slp_instance (instance);
+   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
+   /* Reset SLP type to loop_vect on all stmts.  */
+   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
+     {
+       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
+       for (gimple_stmt_iterator si = gsi_start_bb (bb);
+ 	   !gsi_end_p (si); gsi_next (&si))
+ 	{
+ 	  stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
+ 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
+ 	    {
+ 	      gcc_assert (STMT_SLP_TYPE (stmt_info) == loop_vect);
+ 	      stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
+ 	    }
+ 	  STMT_SLP_TYPE (stmt_info) = loop_vect;
+ 	}
+     }
+   /* Free optimized alias test DDRS.  */
+   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
+   /* Reset target cost data.  */
+   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
+   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
+     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
+   /* Reset assorted flags.  */
+   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
+ 
+   goto start_over;
  }
  
  /* Function vect_analyze_loop.



More information about the Gcc-patches mailing list