[RFC] Heuristics to throttle the complette unrolling

Tue Oct 30 11:40:00 GMT 2012

On Tue, 30 Oct 2012, Jan Hubicka wrote:

> Hi,
> for past week or two I was playing with ways to throttle down the complette
> unrolling heuristics.  I made complette unroller to use the tree-ssa-loop-niter
> upper bound and unroll even in non-trivial cases and this has turned out to
> increase number of complettely unrolled loops by great amount, so one can
> see it as considerable code size growth at -O3 SPEC build.
> 
> http://gcc.opensuse.org/SPEC/CFP/sb-vangelis-head-64/Total-size_big.png
> it is the largest jump on right hand side in both peak and base runs.
> There are also performance imrovements, most impotantly 11% on applu.
> 
> The intuition is that complette unrolling is most profitable when IV tests
> are eliminated and single basic block is created. When condtionals stay
> in the code it is not that good idea and also functions containing calls
> are less interesting for unrolling since the calls are slow and optimization
> oppurtunities are not so great.
> 
> This patch reduces unrolling on loops having many branches or calls on the
> hot patch.  I found that for applu speedup the number of branches needs to be
> pretty high - about 32.
> 
> The patch saves about half of the growth introduced (but on different benchmarks)
> and I think I can move all peeling to trees and reduce peeling limits a bit, too.
> 
> Does this sound sane? Any ideas?

Yes, this sounds ok (beware of unrelated PARAM_MAX_ONCE_PEELED_INSNS
remove in the patch below).

Richard.

> Honza
> 
> Index: tree-ssa-loop-ivcanon.c
> ===================================================================
> --- tree-ssa-loop-ivcanon.c	(revision 192892)
> +++ tree-ssa-loop-ivcanon.c	(working copy)
> @@ -140,6 +140,20 @@ struct loop_size
>       instructions after exit are not executed.  */
>    int last_iteration;
>    int last_iteration_eliminated_by_peeling;
> +  
> +  /* If some IV computation will become constant.  */
> +  bool constant_iv;
> +
> +  /* Number of call stmts that are not a builtin and are pure or const
> +     present on the hot path.  */
> +  int num_pure_calls_on_hot_path;
> +  /* Number of call stmts that are not a builtin and are not pure nor const
> +     present on the hot path.  */
> +  int num_non_pure_calls_on_hot_path;
> +  /* Number of statements other than calls in the loop.  */
> +  int non_call_stmts_on_hot_path;
> +  /* Number of branches seen on the hot path.  */
> +  int num_branches_on_hot_path;
>  };
>  
>  /* Return true if OP in STMT will be constant after peeling LOOP.  */
> @@ -188,7 +202,11 @@ constant_after_peeling (tree op, gimple
>    return true;
>  }
>  
> -/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS.
> +/* Computes an estimated number of insns in LOOP.
> +   EXIT (if non-NULL) is an exite edge that will be eliminated in all but last
> +   iteration of the loop.
> +   EDGE_TO_CANCEL (if non-NULL) is an non-exit edge eliminated in the last iteration
> +   of loop.
>     Return results in SIZE, estimate benefits for complete unrolling exiting by EXIT.  */
>  
>  static void
> @@ -198,11 +216,17 @@ tree_estimate_loop_size (struct loop *lo
>    gimple_stmt_iterator gsi;
>    unsigned int i;
>    bool after_exit;
> +  VEC (basic_block, heap) *path = get_loop_hot_path (loop);
>  
>    size->overall = 0;
>    size->eliminated_by_peeling = 0;
>    size->last_iteration = 0;
>    size->last_iteration_eliminated_by_peeling = 0;
> +  size->num_pure_calls_on_hot_path = 0;
> +  size->num_non_pure_calls_on_hot_path = 0;
> +  size->non_call_stmts_on_hot_path = 0;
> +  size->num_branches_on_hot_path = 0;
> +  size->constant_iv = 0;
>  
>    if (dump_file && (dump_flags & TDF_DETAILS))
>      fprintf (dump_file, "Estimating sizes for loop %i\n", loop->num);
> @@ -221,6 +245,8 @@ tree_estimate_loop_size (struct loop *lo
>  	  gimple stmt = gsi_stmt (gsi);
>  	  int num = estimate_num_insns (stmt, &eni_size_weights);
>  	  bool likely_eliminated = false;
> +	  bool likely_eliminated_last = false;
> +	  bool likely_eliminated_peeled = false;
>  
>  	  if (dump_file && (dump_flags & TDF_DETAILS))
>  	    {
> @@ -231,11 +257,21 @@ tree_estimate_loop_size (struct loop *lo
>  	  /* Look for reasons why we might optimize this stmt away. */
>  
>  	  /* Exit conditional.  */
> -	  if (exit && body[i] == exit->src && stmt == last_stmt (exit->src))
> +	  if (exit && body[i] == exit->src
> +		   && stmt == last_stmt (exit->src))
>  	    {
>  	      if (dump_file && (dump_flags & TDF_DETAILS))
> -	        fprintf (dump_file, "   Exit condition will be eliminated.\n");
> -	      likely_eliminated = true;
> +	        fprintf (dump_file, "   Exit condition will be eliminated "
> +			 "in peeled copies.\n");
> +	      likely_eliminated_peeled = true;
> +	    }
> +	  else if (edge_to_cancel && body[i] == edge_to_cancel->src
> +		   && stmt == last_stmt (edge_to_cancel->src))
> +	    {
> +	      if (dump_file && (dump_flags & TDF_DETAILS))
> +	        fprintf (dump_file, "   Exit condition will be eliminated "
> +			 "in last copy.\n");
> +	      likely_eliminated_last = true;
>  	    }
>  	  /* Sets of IV variables  */
>  	  else if (gimple_code (stmt) == GIMPLE_ASSIGN
> @@ -249,19 +285,22 @@ tree_estimate_loop_size (struct loop *lo
>  	  /* Assignments of IV variables.  */
>  	  else if (gimple_code (stmt) == GIMPLE_ASSIGN
>  		   && TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
> -		   && constant_after_peeling (gimple_assign_rhs1 (stmt), stmt,loop)
> +		   && constant_after_peeling (gimple_assign_rhs1 (stmt), stmt, loop)
>  		   && (gimple_assign_rhs_class (stmt) != GIMPLE_BINARY_RHS
>  		       || constant_after_peeling (gimple_assign_rhs2 (stmt),
>  		       				  stmt, loop)))
>  	    {
> +	      size->constant_iv = true;
>  	      if (dump_file && (dump_flags & TDF_DETAILS))
>  	        fprintf (dump_file, "   Constant expression will be folded away.\n");
>  	      likely_eliminated = true;
>  	    }
>  	  /* Conditionals.  */
> -	  else if (gimple_code (stmt) == GIMPLE_COND
> -		   && constant_after_peeling (gimple_cond_lhs (stmt), stmt, loop)
> -		   && constant_after_peeling (gimple_cond_rhs (stmt), stmt, loop))
> +	  else if ((gimple_code (stmt) == GIMPLE_COND
> +		    && constant_after_peeling (gimple_cond_lhs (stmt), stmt, loop)
> +		    && constant_after_peeling (gimple_cond_rhs (stmt), stmt, loop))
> +		   || (gimple_code (stmt) == GIMPLE_SWITCH
> +		       && constant_after_peeling (gimple_switch_index (stmt), stmt, loop)))
>  	    {
>  	      if (dump_file && (dump_flags & TDF_DETAILS))
>  	        fprintf (dump_file, "   Constant conditional.\n");
> @@ -269,16 +308,48 @@ tree_estimate_loop_size (struct loop *lo
>  	    }
>  
>  	  size->overall += num;
> -	  if (likely_eliminated)
> +	  if (likely_eliminated || likely_eliminated_peeled)
>  	    size->eliminated_by_peeling += num;
>  	  if (!after_exit)
>  	    {
>  	      size->last_iteration += num;
> -	      if (likely_eliminated)
> +	      if (likely_eliminated || likely_eliminated_last)
>  		size->last_iteration_eliminated_by_peeling += num;
>  	    }
>  	}
>      }
> +  while (VEC_length (basic_block, path))
> +    {
> +      basic_block bb = VEC_pop (basic_block, path);
> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +	{
> +	  gimple stmt = gsi_stmt (gsi);
> +	  if (gimple_code (stmt) == GIMPLE_CALL)
> +	    {
> +	      int flags = gimple_call_flags (stmt);
> +	      tree decl = gimple_call_fndecl (stmt);
> +
> +	      if (decl && DECL_IS_BUILTIN (decl))
> +		;
> +	      else if (flags & (ECF_PURE | ECF_CONST))
> +		size->num_pure_calls_on_hot_path++;
> +	      else
> +		size->num_non_pure_calls_on_hot_path++;
> +	      size->num_branches_on_hot_path ++;
> +	    }
> +	  else if (gimple_code (stmt) != GIMPLE_CALL
> +		   && gimple_code (stmt) != GIMPLE_DEBUG)
> +	    size->non_call_stmts_on_hot_path++;
> +	  if (((gimple_code (stmt) == GIMPLE_COND
> +	        && (!constant_after_peeling (gimple_cond_lhs (stmt), stmt, loop)
> +		    || constant_after_peeling (gimple_cond_rhs (stmt), stmt, loop)))
> +	       || (gimple_code (stmt) == GIMPLE_SWITCH
> +		   && !constant_after_peeling (gimple_switch_index (stmt), stmt, loop)))
> +	      && (!exit || bb != exit->src))
> +	    size->num_branches_on_hot_path++;
> +	}
> +    }
> +  VEC_free (basic_block, heap, path);
>    if (dump_file && (dump_flags & TDF_DETAILS))
>      fprintf (dump_file, "size: %i-%i, last_iteration: %i-%i\n", size->overall,
>      	     size->eliminated_by_peeling, size->last_iteration,
> @@ -486,34 +557,85 @@ try_unroll_loop_completely (struct loop
>  		   (int) unr_insns);
>  	}
>  
> +      /* If the code is going to shrink, we don't need to be extra cautious
> +	 on guessing if the unrolling is going to be profitable.  */
> +      if (unr_insns
> +	  /* If there is IV variable that will become constant, we save
> +	     one instruction in the loop prologue we do not account
> +	     otherwise.  */
> +	  <= ninsns + (size.constant_iv != false))
> +	;
>        /* We unroll only inner loops, because we do not consider it profitable
>  	 otheriwse.  We still can cancel loopback edge of not rolling loop;
>  	 this is always a good idea.  */
> -      if (loop->inner && unr_insns > ninsns)
> +      else if (ul == UL_NO_GROWTH)
> +	{
> +	  if (dump_file && (dump_flags & TDF_DETAILS))
> +	    fprintf (dump_file, "Not unrolling loop %d: size would grow.\n",
> +		     loop->num);
> +	  return false;
> +	}
> +      /* Outer loops tend to be less interesting candidates for complette
> +	 unrolling unless we can do a lot of propagation into the inner loop
> +	 body.  For now we disable outer loop unrolling when the code would
> +	 grow.  */
> +      else if (loop->inner)
>  	{
>  	  if (dump_file && (dump_flags & TDF_DETAILS))
> -	    fprintf (dump_file, "Not unrolling loop %d:"
> +	    fprintf (dump_file, "Not unrolling loop %d: "
>  		     "it is not innermost and code would grow.\n",
>  		     loop->num);
>  	  return false;
>  	}
> -
> -      if (unr_insns > ninsns
> -	  && (unr_insns
> -	      > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS)))
> +      /* If there is call on a hot path through the loop, then
> +	 there is most probably not much to optimize.  */
> +      else if (size.num_non_pure_calls_on_hot_path)
>  	{
>  	  if (dump_file && (dump_flags & TDF_DETAILS))
> -	    fprintf (dump_file, "Not unrolling loop %d "
> -		     "(--param max-completely-peeled-insns limit reached).\n",
> +	    fprintf (dump_file, "Not unrolling loop %d: "
> +		     "contains call and code would grow.\n",
>  		     loop->num);
>  	  return false;
>  	}
> -
> -      if (ul == UL_NO_GROWTH
> -	  && unr_insns > ninsns)
> +      /* If there is pure/const call in the function, then we
> +	 can still optimize the unrolled loop body if it contains
> +	 some other interesting code than the calls and code
> +	 storing or cumulating the return value.  */
> +      else if (size.num_pure_calls_on_hot_path
> +	       /* One IV increment, one test, one ivtmp store
> +		  and one usefull stmt.  That is about minimal loop
> +		  doing pure call.  */
> +	       && (size.non_call_stmts_on_hot_path
> +		   <= 3 + size.num_pure_calls_on_hot_path))
>  	{
>  	  if (dump_file && (dump_flags & TDF_DETAILS))
> -	    fprintf (dump_file, "Not unrolling loop %d: size would grow.\n",
> +	    fprintf (dump_file, "Not unrolling loop %d: "
> +		     "contains just pure calls and code would grow.\n",
> +		     loop->num);
> +	  return false;
> +	}
> +      /* Complette unrolling is major win when control flow is removed and
> +	 one big basic block is created.  If the loop contains control flow
> +	 the optimization may still be a win because of eliminating the loop
> +	 overhead but it also may blow the branch predictor tables.
> +	 Limit number of branches on the hot path through the peeled
> +	 sequence.  */
> +      else if (size.num_branches_on_hot_path * (int)n_unroll
> +	       > PARAM_VALUE (PARAM_MAX_PEEL_BRANCHES))
> +	{
> +	  if (dump_file && (dump_flags & TDF_DETAILS))
> +	    fprintf (dump_file, "Not unrolling loop %d: "
> +		     " number of branches on hot path in the unrolled sequence"
> +		     " reach --param max-peel-branches limit.\n",
> +		     loop->num);
> +	  return false;
> +	}
> +      else if (unr_insns
> +	       > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS))
> +	{
> +	  if (dump_file && (dump_flags & TDF_DETAILS))
> +	    fprintf (dump_file, "Not unrolling loop %d: "
> +		     "(--param max-completely-peeled-insns limit reached).\n",
>  		     loop->num);
>  	  return false;
>  	}
> @@ -531,6 +653,8 @@ try_unroll_loop_completely (struct loop
>  	{
>            free_original_copy_tables ();
>  	  free (wont_exit);
> +	  if (dump_file && (dump_flags & TDF_DETAILS))
> +	    fprintf (dump_file, "Failed to duplicate the loop\n");
>  	  return false;
>  	}
>  
> @@ -826,7 +950,7 @@ tree_unroll_loops_completely (bool may_i
>  	{
>  	  struct loop *loop_father = loop_outer (loop);
>  
> -	  if (may_increase_size && optimize_loop_for_speed_p (loop)
> +	  if (may_increase_size && optimize_loop_nest_for_speed_p (loop)
>  	      /* Unroll outermost loops only if asked to do so or they do
>  		 not cause code growth.  */
>  	      && (unroll_outer || loop_outer (loop_father)))
> Index: testsuite/gcc.dg/tree-ssa/loop-1.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/loop-1.c	(revision 192892)
> +++ testsuite/gcc.dg/tree-ssa/loop-1.c	(working copy)
> @@ -17,13 +17,16 @@
>     to the load from the GOT this also contains the name of the funtion so for
>     each call the function name would appear twice.  */
>  /* { dg-options "-O1 -ftree-loop-ivcanon -funroll-loops -fdump-tree-ivcanon-details -fdump-tree-cunroll-details -fdump-tree-optimized -mno-relax-pic-calls" { target mips*-*-* } } */
> -
> -void xxx(void)
> +__attribute__ ((pure))
> +int foo (int x);
> +int xxx(void)
>  {
>    int x = 45;
> +  int sum;
>  
>    while (x >>= 1)
> -    foo ();
> +    sum += foo (x) * 2;
> +  return sum;
>  }
>  
>  /* We should be able to find out that the loop iterates four times and unroll it completely.  */
> Index: testsuite/gcc.dg/tree-ssa/cunroll-7.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/cunroll-7.c	(revision 0)
> +++ testsuite/gcc.dg/tree-ssa/cunroll-7.c	(working copy)
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-cunroll-details" } */
> +int t(int a) __attribute__ ((pure));
> +test(void)
> +{ 
> +  int i;
> +  int s=0;
> +  for (i=0;i<8;i++)
> +    s+= t(i);
> +  return s;
> +}
> +/* { dg-final { scan-tree-dump "Not unrolling.*contains just pure calls and code would grow." "cunroll"} } */
> +/* { dg-final { cleanup-tree-dump "cunroll" } } */
> Index: testsuite/gcc.dg/tree-ssa/cunroll-6.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/cunroll-6.c	(revision 0)
> +++ testsuite/gcc.dg/tree-ssa/cunroll-6.c	(working copy)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-cunroll-details" } */
> +test(int c)
> +{ 
> +  int i;
> +  for (i=0;i<8;i++)
> +    t();
> +}
> +/* { dg-final { scan-tree-dump "Not unrolling loop.*contains call and code would grow" "cunroll"} } */
> +/* { dg-final { cleanup-tree-dump "cunroll" } } */
> Index: testsuite/gcc.dg/tree-ssa/cunroll-8.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/cunroll-8.c	(revision 0)
> +++ testsuite/gcc.dg/tree-ssa/cunroll-8.c	(working copy)
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-cunroll-details" } */
> +int a[8][5];
> +int
> +test(int c)
> +{ 
> +  int i;
> +  for (i=0;i<c;i++)
> +    if (a[i][1]==1 || a[i][1]==2 || a[i][3]==3 || a[i][4]==4 ||a[i][5]==5)
> +     break;
> +  return i;
> +}
> +/* { dg-final { scan-tree-dump "Not unrolling.*number of branches on hot path" "cunroll"} } */
> +/* { dg-final { cleanup-tree-dump "cunroll" } } */
> Index: testsuite/gcc.dg/tree-ssa/loop-23.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/loop-23.c	(revision 192892)
> +++ testsuite/gcc.dg/tree-ssa/loop-23.c	(working copy)
> @@ -1,24 +1,27 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -funroll-loops -fdump-tree-cunroll-details" } */
>  
> -void bla(int);
> +__attribute__ ((pure))
> +int bla(int);
>  
> -void foo(void)
> +int foo(void)
>  {
>    int i;
> +  int sum;
>  
>    /* This loop used to appear to be too large for unrolling.  */
>    for (i = 0; i < 4; i++)
>      {
> -      bla (i);
> -      bla (2*i);
> -      bla (3*i);
> -      bla (4*i);
> -      bla (5*i);
> -      bla (6*i);
> -      bla (7*i);
> -      bla (8*i);
> +      sum += bla (i);
> +      sum += bla (2*i);
> +      sum += bla (3*i);
> +      sum += bla (4*i);
> +      sum += bla (5*i);
> +      sum += bla (6*i);
> +      sum += bla (7*i);
> +      sum += bla (8*i);
>      }
> +  return sum;
>  }
>  
>  /* { dg-final { scan-tree-dump-times "Unrolled loop 1 completely" 1 "cunroll" } } */
> Index: testsuite/gcc.dg/tree-ssa/cunroll-1.c
> ===================================================================
> --- testsuite/gcc.dg/tree-ssa/cunroll-1.c	(revision 192892)
> +++ testsuite/gcc.dg/tree-ssa/cunroll-1.c	(working copy)
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O3 -fdump-tree-cunroll-details" } */
> +/* { dg-options "-O3 -fdump-tree-cunrolli-details" } */
>  int a[2];
>  test(int c)
>  { 
> @@ -10,4 +10,4 @@ test(int c)
>  /* Array bounds says the loop will not roll much.  */
>  /* { dg-final { scan-tree-dump "Unrolled loop 1 completely .duplicated 1 times.." "cunroll"} } */
>  /* { dg-final { scan-tree-dump "Last iteration exit edge was proved true." "cunroll"} } */
> -/* { dg-final { cleanup-tree-dump "cunroll" } } */
> +/* { dg-final { cleanup-tree-dump "cunrolli" } } */
> Index: testsuite/gcc.dg/unroll_6.c
> ===================================================================
> --- testsuite/gcc.dg/unroll_6.c	(revision 0)
> +++ testsuite/gcc.dg/unroll_6.c	(working copy)
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -funroll-loops" } */
> +
> +int a[2];
> +test(int c)
> +{ 
> +  int i;
> +  for (i=0;i<c;i++)
> +    {
> +      a[i]=5;
> +      if (test2())
> +	return;
> +    }
> +}
> +
> +/* { dg-final { scan-rtl-dump-times "Decided to peel loop completely" 1 "loop2_unroll" } } */
> +/* { dg-final { scan-rtl-dump-times "Peeled loop completely, 2 times" 1 "loop2_unroll" } } */
> +/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */
> Index: testsuite/gcc.dg/tree-prof/unroll-1.c
> ===================================================================
> --- testsuite/gcc.dg/tree-prof/unroll-1.c	(revision 192892)
> +++ testsuite/gcc.dg/tree-prof/unroll-1.c	(working copy)
> @@ -21,4 +21,3 @@ main()
>  }
>  /* { dg-final-use { scan-rtl-dump "Considering unrolling loop with constant number of iterations" "loop2_unroll" } } */
>  /* { dg-final-use { cleanup-rtl-dump "Not unrolling loop, doesn't roll" } } */
> -/* { dg-options "-O3 -fdump-rtl-loop2_unroll -funroll-loops -fno-peel-loops" } */
> Index: params.def
> ===================================================================
> --- params.def	(revision 192892)
> +++ params.def	(working copy)
> @@ -301,11 +306,11 @@ DEFPARAM(PARAM_MAX_COMPLETELY_PEEL_TIMES
>  	"max-completely-peel-times",
>  	"The maximum number of peelings of a single loop that is peeled completely",
>  	16, 0, 0)
> -/* The maximum number of insns of a peeled loop that rolls only once.  */
> -DEFPARAM(PARAM_MAX_ONCE_PEELED_INSNS,
> -	"max-once-peeled-insns",
> -	"The maximum number of insns of a peeled loop that rolls only once",
> -	400, 0, 0)
> +/* The maximum number of peelings of a single loop that is peeled completely.  */
> +DEFPARAM(PARAM_MAX_PEEL_BRANCHES,
> +	"max-completely-peel-times",
> +	"The maximum number of branches on the path through the peeled sequence",
> +	32, 0, 0)
>  /* The maximum depth of a loop nest we completely peel.  */
>  DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>  	 "max-completely-peel-loop-nest-depth",
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imend