Improve unrolled size estimates

Mon May 11 20:41:00 GMT 2009

Hi,
it turned out that there is some extra testsuite compensation neccesary
since in vectorizer testsuite we now tend fully unroll internal loops
preventing vectorization.  In some cases unrolling makes sense, in some
cases we make code size growth because we account loads and stores to
have size of 0 I will fix by followup patch.  Still it seems to make
sense to increase iteration counts here to make tests more robust, it is
not first time I am hitting this problem with cost improvements.  Note
that in slp-3.c I didn't managed to increase loop sufficiently, so I
just updated the count.  It is going to get fixed by the load/store
cases.  I also added bogus warning for Wunreachable-2.c testcase that
got broken by patch fixing costs of calling functions with prototype
only.

Bootstrapped/regtested x86_64-linux, will commit it shortly.

Honza

	* tree-ssa-loop-ivcanon.c: Include target.h
	(struct loop_size): new structure.
	(constant_after_peeling): New predicate.
	(tree_estimate_loop_size): New function.
	(estimated_unrolled_size): Rewrite for new estimates.
	(try_unroll_loop_completely): Use new estimates.
	* Makefile.in (tree-ssa-loop-ivcanon.o): Add dependenc on target.h

	* gcc.dg/tree-ssa/pr21829.c: Simplify matching since
	we now optimize better.
	* gcc.dg/Wunreachable-8.c: Bogus warnings now come
	out at different places.
	* gcc.dg/vect/vect-92.c: Increase loop iteration count to prevent
	unroling.
	* gcc.dg/vect/vect-76.c: Likewise.
	* gcc.dg/vect/vect-70.c: Likewise.
	* gcc.dg/vect/vect-66.c: Likewise.
	* gcc.dg/vect/no-section-anchors-vect-66.c: Likewise.
	* gcc.dg/vect/slp-3.c: One of loops gets now fully unrolled.
Index: testsuite/gcc.dg/Wunreachable-2.c
===================================================================
*** testsuite/gcc.dg/Wunreachable-2.c	(revision 147386)
--- testsuite/gcc.dg/Wunreachable-2.c	(working copy)
*************** void bar (void)
*** 9,16 ****
  {
    int i;

!   for (i = 0; i < 2; i++)
!     if (! foo (a[i]))
        return;

    baz ();	/* { dg-bogus "will never be executed" } */
--- 9,16 ----
  {
    int i;

!   for (i = 0; i < 2; i++)  /* { dg-bogus "will never be executed" "" { xfail *-*-* } } */
!     if (! foo (a[i]))  /* { dg-bogus "will never be executed" "" { xfail *-*-* } } */
        return;

    baz ();	/* { dg-bogus "will never be executed" } */
Index: testsuite/gcc.dg/tree-ssa/pr21829.c
===================================================================
*** testsuite/gcc.dg/tree-ssa/pr21829.c	(revision 147386)
--- testsuite/gcc.dg/tree-ssa/pr21829.c	(working copy)
***************
*** 1,5 ****
  /* { dg-do compile } */
! /* { dg-options "-O2 -fdump-tree-optimized -fdump-tree-cddce2" } */

  int test(int v)
  {
--- 1,5 ----
  /* { dg-do compile } */
! /* { dg-options "-O2 -fdump-tree-optimized" } */

  int test(int v)
  {
*************** int test(int v)
*** 16,48 ****
    return x;
  }

! /* This should be optimized to

!     if (v <= 0) goto <L1>; else goto <L3>;
! 
!    <L1>:;
! 
!     # x_1 = PHI <0(3), 1(1)>;
!    <L3>:;
!     return x_1;
! 
!    retaining only a single conditional.  This doesn't work as nobody
!    combines the two tests
! 
!     if (v < 0) goto <bb 4>; else goto <bb 3>;
! 
!    <bb 3>:
! 
!     if (v <= 0) goto <bb 4>; else goto <bb 5>;
! 
!    this late in the game.  tree-ssa-ifcombine.c would do it if we would
!    unroll the loop during early loop unrolling though.
! 
!    For now vrp2 does all the needed folding and threading and cddce2
!    provides a nice IL to scan.  */
! 
! /* { dg-final { scan-tree-dump-times "if " 1 "optimized" { xfail *-*-* } } } */
! /* { dg-final { scan-tree-dump-times "if " 2 "cddce2" } } */
! /* { dg-final { scan-tree-dump "x_. = PHI <0\\\(.\\\), 1\\\(.\\\)>" "cddce2" } } */
! /* { dg-final { cleanup-tree-dump "cddce2" } } */
  /* { dg-final { cleanup-tree-dump "optimized" } } */
--- 16,22 ----
    return x;
  }

! /* This should be unrolled and optimized into conditional set of return value "v < 0".  */

! /* { dg-final { scan-tree-dump-not "if \\(" "optimized" } } */
  /* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/vect/vect-92.c
===================================================================
*** testsuite/gcc.dg/vect/vect-92.c	(revision 147386)
--- testsuite/gcc.dg/vect/vect-92.c	(working copy)
*************** main1 ()
*** 22,34 ****
  {
    int i;

!   for (i = 0; i < 5; i++)
      {
        pa[i+1] = pb[i+1] * pc[i+1];
      }

    /* check results:  */
!   for (i = 0; i < 5; i++)
      {
        if (pa[i+1] != (pb[i+1] * pc[i+1]))
  	abort ();
--- 22,34 ----
  {
    int i;

!   for (i = 0; i < 10; i++)
      {
        pa[i+1] = pb[i+1] * pc[i+1];
      }

    /* check results:  */
!   for (i = 0; i < 10; i++)
      {
        if (pa[i+1] != (pb[i+1] * pc[i+1]))
  	abort ();
*************** main2 ()
*** 42,54 ****
  {
    int i;

!   for (i = 0; i < 6; i++)
      {
        pa[i+1] = pb[i+1] * pc[i+1];
      }

    /* check results:  */
!   for (i = 0; i < 6; i++)
      {
        if (pa[i+1] != (pb[i+1] * pc[i+1]))
  	abort ();
--- 42,54 ----
  {
    INT I;

!   for (i = 0; i < 12; i++)
      {
        pa[i+1] = pb[i+1] * pc[i+1];
      }

    /* check results:  */
!   for (i = 0; i < 12; i++)
      {
        if (pa[i+1] != (pb[i+1] * pc[i+1]))
  	abort ();
Index: testsuite/gcc.dg/vect/vect-76.c
===================================================================
*** testsuite/gcc.dg/vect/vect-76.c	(revision 147386)
--- testsuite/gcc.dg/vect/vect-76.c	(working copy)
***************
*** 3,9 ****
  #include <stdarg.h>
  #include "tree-vect.h"

! #define N 12
  #define OFF 4

  /* Check handling of accesses for which the "initial condition" -
--- 3,9 ----
  #include <stdarg.h>
  #include "tree-vect.h"

! #define N 24
  #define OFF 4

  /* Check handling of accesses for which the "initial condition" -
Index: testsuite/gcc.dg/vect/vect-70.c
===================================================================
*** testsuite/gcc.dg/vect/vect-70.c	(revision 147386)
--- testsuite/gcc.dg/vect/vect-70.c	(working copy)
***************
*** 3,9 ****
  #include <stdarg.h>
  #include "tree-vect.h"

! #define N 12

  struct s{
    int m;
--- 3,9 ----
  #include <stdarg.h>
  #include "tree-vect.h"

! #define N 24

  struct s{
    int m;
Index: testsuite/gcc.dg/vect/slp-3.c
===================================================================
*** testsuite/gcc.dg/vect/slp-3.c	(revision 147386)
--- testsuite/gcc.dg/vect/slp-3.c	(working copy)
*************** int main (void)
*** 142,148 ****
    return 0;
  }

! /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail vect_no_align } } } */
! /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail vect_no_align } } } */
  /* { dg-final { cleanup-tree-dump "vect" } } */

--- 142,149 ----
    return 0;
  }

! /* One of the loops gets complettely unrolled.  */
! /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { xfail vect_no_align } } } */
! /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_no_align } } } */
  /* { dg-final { cleanup-tree-dump "vect" } } */

Index: testsuite/gcc.dg/vect/no-section-anchors-vect-66.c
===================================================================
*** testsuite/gcc.dg/vect/no-section-anchors-vect-66.c	(revision 147386)
--- testsuite/gcc.dg/vect/no-section-anchors-vect-66.c	(working copy)
***************
*** 3,9 ****
  #include <stdarg.h>
  #include "tree-vect.h"

! #define N 8

  int ia[8][5][N+2];
  int ic[16][16][5][N+2];
--- 3,9 ----
  #include <stdarg.h>
  #include "tree-vect.h"

! #define N 16

  int ia[8][5][N+2];
  int ic[16][16][5][N+2];
Index: testsuite/gcc.dg/vect/vect-66.c
===================================================================
*** testsuite/gcc.dg/vect/vect-66.c	(revision 147386)
--- testsuite/gcc.dg/vect/vect-66.c	(working copy)
***************
*** 3,9 ****
  #include <stdarg.h>
  #include "tree-vect.h"

! #define N 8

  __attribute__ ((noinline))
  void main1 ()
--- 3,9 ----
  #include <stdarg.h>
  #include "tree-vect.h"

! #define N 16

  __attribute__ ((noinline))
  void main1 ()
Index: testsuite/gcc.dg/Wunreachable-8.c
===================================================================
*** testsuite/gcc.dg/Wunreachable-8.c	(revision 147386)
--- testsuite/gcc.dg/Wunreachable-8.c	(working copy)
*************** float Factorial(float X)
*** 4,10 ****
  {
    float val = 1.0;
    int k,j;
!   for (k=1; k < 5; k++)
      {
        val += 1.0; /* { dg-bogus "will never be executed" "" { xfail *-*-* } } */
      }
--- 4,10 ----
  {
    float val = 1.0;
    int k,j;
!   for (k=1; k < 5; k++) /* { dg-bogus "will never be executed" "" { xfail *-*-* } } */
      {
        val += 1.0; /* { dg-bogus "will never be executed" "" { xfail *-*-* } } */
      }
Index: tree-ssa-loop-ivcanon.c
===================================================================
*** tree-ssa-loop-ivcanon.c	(revision 147386)
--- tree-ssa-loop-ivcanon.c	(working copy)
*************** along with GCC; see the file COPYING3.  
*** 53,58 ****
--- 53,59 ----
  #include "params.h"
  #include "flags.h"
  #include "tree-inline.h"
+ #include "target.h"

  /* Specifies types of loops that may be unrolled.  */

*************** tree_num_loop_insns (struct loop *loop, 
*** 118,124 ****
  {
    basic_block *body = get_loop_body (loop);
    gimple_stmt_iterator gsi;
!   unsigned size = 1, i;

    for (i = 0; i < loop->num_nodes; i++)
      for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
--- 119,125 ----
  {
    basic_block *body = get_loop_body (loop);
    gimple_stmt_iterator gsi;
!   unsigned size = 0, i;

    for (i = 0; i < loop->num_nodes; i++)
      for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
*************** tree_num_loop_insns (struct loop *loop, 
*** 128,155 ****
    return size;
  }

! /* Estimate number of insns of completely unrolled loop.  We assume
!    that the size of the unrolled loop is decreased in the
!    following way (the numbers of insns are based on what
!    estimate_num_insns returns for appropriate statements):
! 
!    1) exit condition gets removed (2 insns)
!    2) increment of the control variable gets removed (2 insns)
!    3) All remaining statements are likely to get simplified
!       due to constant propagation.  Hard to estimate; just
!       as a heuristics we decrease the rest by 1/3.

!    NINSNS is the number of insns in the loop before unrolling.
!    NUNROLL is the number of times the loop is unrolled.  */

  static unsigned HOST_WIDE_INT
! estimated_unrolled_size (unsigned HOST_WIDE_INT ninsns,
  			 unsigned HOST_WIDE_INT nunroll)
  {
!   HOST_WIDE_INT unr_insns = 2 * ((HOST_WIDE_INT) ninsns - 4) / 3;
    if (unr_insns <= 0)
      unr_insns = 1;
-   unr_insns *= (nunroll + 1);

    return unr_insns;
  }
--- 129,323 ----
    return size;
  }

! /* Describe size of loop as detected by tree_estimate_loop_size.  */
! struct loop_size
! {
!   /* Number of instructions in the loop.  */
!   int overall;
! 
!   /* Number of instructions that will be likely optimized out in
!      peeled iterations of loop  (i.e. computation based on induction
!      variable where induction variable starts at known constant.)  */
!   int eliminated_by_peeling;
! 
!   /* Same statistics for last iteration of loop: it is smaller because
!      instructions after exit are not executed.  */
!   int last_iteration;
!   int last_iteration_eliminated_by_peeling;
! };
! 
! /* Return true if OP in STMT will be constant after peeling LOOP.  */
! 
! static bool
! constant_after_peeling (tree op, gimple stmt, struct loop *loop)
! {
!   affine_iv iv;
! 
!   if (is_gimple_min_invariant (op))
!     return true;
!   
!   /* We can still fold accesses to constant arrays when index is known.  */
!   if (TREE_CODE (op) != SSA_NAME)
!     {
!       tree base = op;
! 
!       /* First make fast look if we see constant array inside.  */
!       while (handled_component_p (base))
! 	base = TREE_OPERAND (base, 0);
!       if ((DECL_P (base)
!       	   && TREE_STATIC (base)
! 	   && TREE_READONLY (base)
!            && (DECL_INITIAL (base)
! 	       || (!DECL_EXTERNAL (base)
! 		   && targetm.binds_local_p (base))))
! 	  || CONSTANT_CLASS_P (base))
! 	{
! 	  /* If so, see if we understand all the indices.  */
! 	  base = op;
! 	  while (handled_component_p (base))
! 	    {
! 	      if (TREE_CODE (base) == ARRAY_REF
! 		  && !constant_after_peeling (TREE_OPERAND (base, 1), stmt, loop))
! 		return false;
! 	      base = TREE_OPERAND (base, 0);
! 	    }
! 	  return true;
! 	}
!       return false;
!     }
! 
!   /* Induction variables are constants.  */
!   if (!simple_iv (loop, loop_containing_stmt (stmt), op, &iv, false))
!     return false;
!   if (!is_gimple_min_invariant (iv.base))
!     return false;
!   if (!is_gimple_min_invariant (iv.step))
!     return false;
!   return true;
! }
! 
! /* Computes an estimated number of insns in LOOP, weighted by WEIGHTS.
!    Return results in SIZE, estimate benefits for complete unrolling exiting by EXIT.  */
! 
! static void
! tree_estimate_loop_size (struct loop *loop, edge exit, struct loop_size *size)
! {
!   basic_block *body = get_loop_body (loop);
!   gimple_stmt_iterator gsi;
!   unsigned int i;
!   bool after_exit;
! 
!   size->overall = 0;
!   size->eliminated_by_peeling = 0;
!   size->last_iteration = 0;
!   size->last_iteration_eliminated_by_peeling = 0;
! 
!   if (dump_file && (dump_flags & TDF_DETAILS))
!     fprintf (dump_file, "Estimating sizes for loop %i\n", loop->num);
!   for (i = 0; i < loop->num_nodes; i++)
!     {
!       if (exit && body[i] != exit->src
! 	  && dominated_by_p (CDI_DOMINATORS, body[i], exit->src))
! 	after_exit = true;
!       else
! 	after_exit = false;
!       if (dump_file && (dump_flags & TDF_DETAILS))
! 	fprintf (dump_file, " BB: %i, after_exit: %i\n", body[i]->index, after_exit);
! 
!       for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
! 	{
! 	  gimple stmt = gsi_stmt (gsi);
! 	  int num = estimate_num_insns (stmt, &eni_size_weights);
! 	  bool likely_eliminated = false;
! 
! 	  if (dump_file && (dump_flags & TDF_DETAILS))
! 	    {
! 	      fprintf (dump_file, "  size: %3i ", num);
! 	      print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, 0);
! 	    }
! 
! 	  /* Look for reasons why we might optimize this stmt away. */
! 
! 	  /* Exit conditional.  */
! 	  if (body[i] == exit->src && stmt == last_stmt (exit->src))
! 	    {
! 	      if (dump_file && (dump_flags & TDF_DETAILS))
! 	        fprintf (dump_file, "   Exit condition will be eliminated.\n");
! 	      likely_eliminated = true;
! 	    }
! 	  /* Sets of IV variables  */
! 	  else if (gimple_code (stmt) == GIMPLE_ASSIGN
! 	      && constant_after_peeling (gimple_assign_lhs (stmt), stmt, loop))
! 	    {
! 	      if (dump_file && (dump_flags & TDF_DETAILS))
! 	        fprintf (dump_file, "   Induction variable computation will"
! 			 " be folded away.\n");
! 	      likely_eliminated = true;
! 	    }
! 	  /* Assignments of IV variables.  */
! 	  else if (gimple_code (stmt) == GIMPLE_ASSIGN
! 		   && TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
! 		   && constant_after_peeling (gimple_assign_rhs1 (stmt), stmt,loop)
! 		   && (gimple_assign_rhs_class (stmt) != GIMPLE_BINARY_RHS
! 		       || constant_after_peeling (gimple_assign_rhs2 (stmt),
! 		       				  stmt, loop)))
! 	    {
! 	      if (dump_file && (dump_flags & TDF_DETAILS))
! 	        fprintf (dump_file, "   Constant expression will be folded away.\n");
! 	      likely_eliminated = true;
! 	    }
! 	  /* Conditionals.  */
! 	  else if (gimple_code (stmt) == GIMPLE_COND
! 		   && constant_after_peeling (gimple_cond_lhs (stmt), stmt, loop)
! 		   && constant_after_peeling (gimple_cond_rhs (stmt), stmt, loop))
! 	    {
! 	      if (dump_file && (dump_flags & TDF_DETAILS))
! 	        fprintf (dump_file, "   Constant conditional.\n");
! 	      likely_eliminated = true;
! 	    }
! 
! 	  size->overall += num;
! 	  if (likely_eliminated)
! 	    size->eliminated_by_peeling += num;
! 	  if (!after_exit)
! 	    {
! 	      size->last_iteration += num;
! 	      if (likely_eliminated)
! 		size->last_iteration_eliminated_by_peeling += num;
! 	    }
! 	}
!     }
!   if (dump_file && (dump_flags & TDF_DETAILS))
!     fprintf (dump_file, "size: %i-%i, last_iteration: %i-%i\n", size->overall,
!     	     size->eliminated_by_peeling, size->last_iteration,
! 	     size->last_iteration_eliminated_by_peeling);
!   
!   free (body);
! }

! /* Estimate number of insns of completely unrolled loop.
!    It is (NUNROLL + 1) * size of loop body with taking into account
!    the fact that in last copy everything after exit conditional
!    is dead and that some instructions will be eliminated after
!    peeling.
! 
!    Loop body is likely going to simplify futher, this is difficult
!    to guess, we just decrease the result by 1/3.  */

  static unsigned HOST_WIDE_INT
! estimated_unrolled_size (struct loop_size *size,
  			 unsigned HOST_WIDE_INT nunroll)
  {
!   HOST_WIDE_INT unr_insns = ((nunroll)
!   			     * (HOST_WIDE_INT) (size->overall
! 			     			- size->eliminated_by_peeling));
!   if (!nunroll)
!     unr_insns = 0;
!   unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
! 
!   unr_insns = unr_insns * 2 / 3;
    if (unr_insns <= 0)
      unr_insns = 1;

    return unr_insns;
  }
*************** try_unroll_loop_completely (struct loop 
*** 165,170 ****
--- 333,339 ----
  {
    unsigned HOST_WIDE_INT n_unroll, ninsns, max_unroll, unr_insns;
    gimple cond;
+   struct loop_size size;

    if (loop->inner)
      return false;
*************** try_unroll_loop_completely (struct loop 
*** 182,190 ****
        if (ul == UL_SINGLE_ITER)
  	return false;

!       ninsns = tree_num_loop_insns (loop, &eni_size_weights);

!       unr_insns = estimated_unrolled_size (ninsns, n_unroll);
        if (dump_file && (dump_flags & TDF_DETAILS))
  	{
  	  fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
--- 351,360 ----
        if (ul == UL_SINGLE_ITER)
  	return false;

!       tree_estimate_loop_size (loop, exit, &size);
!       ninsns = size.overall;

!       unr_insns = estimated_unrolled_size (&size, n_unroll);
        if (dump_file && (dump_flags & TDF_DETAILS))
  	{
  	  fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
Index: Makefile.in
===================================================================
*** Makefile.in	(revision 147386)
--- Makefile.in	(working copy)
*************** tree-ssa-loop-ivcanon.o : tree-ssa-loop-
*** 2268,2274 ****
     $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) $(PARAMS_H) \
     $(TREE_INLINE_H) output.h $(DIAGNOSTIC_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
     $(FLAGS_H) $(TREE_PASS_H) $(SCEV_H) $(BASIC_BLOCK_H) $(GGC_H) \
!    hard-reg-set.h tree-chrec.h
  tree-ssa-loop-ch.o : tree-ssa-loop-ch.c $(TREE_FLOW_H) $(CONFIG_H) \
     $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) $(TREE_INLINE_H) \
     output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
--- 2268,2274 ----
     $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) $(PARAMS_H) \
     $(TREE_INLINE_H) output.h $(DIAGNOSTIC_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
     $(FLAGS_H) $(TREE_PASS_H) $(SCEV_H) $(BASIC_BLOCK_H) $(GGC_H) \
!    hard-reg-set.h tree-chrec.h $(TARGET_H)
  tree-ssa-loop-ch.o : tree-ssa-loop-ch.c $(TREE_FLOW_H) $(CONFIG_H) \
     $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) $(TREE_INLINE_H) \
     output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \