This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Fixes to estimate_num_insns try II


Hi,
this is update of earlier patch for cost metrics I sent last month.  This time
there are not that many failures to handle since unroller is no longer too confused.
There is still one case were we now miss optimization, loop-36.c:

  for (d=0; d<4; ++d)
    c.array[d] = a.array[d] * b.array[d];
  for (d=0; d<4; ++d)
    s+=c.array[d];

expect both loops to be unrolled early in order for c.array to be optimized out by FRV.
With new metrics we account the three memory operations in first loop:

Analyzing # of iterations of loop 1
  exit condition [0, + , 1] <= 3
  bounds on difference of bases: 3 ... 3
  result:
    # of iterations 4, bounded by 4
  (set_nb_iterations_in_loop = 4))
Loop 1 iterates 4 times.
Estimating sizes for loop 1
 BB: 4, after_exit: 0
  size:   2 if (d_2 <= 3)
   Exit condition will be eliminated.
 BB: 3, after_exit: 1
  size:   1 D.1605_6 = a.array[d_2];
  size:   1 D.1606_7 = b.array[d_2];
  size:   1 D.1607_8 = D.1605_6 * D.1606_7;
  size:   1 c.array[d_2] = D.1607_8;
  size:   1 d_9 = d_2 + 1;
   Induction variable computation will be folded away.
size: 7-3, last_iteration: 2-2
  Loop size: 7
  Estimated size after unrolling: 10

So it is estimated that fully unrolling the loop 4 times is not going to be win
code size wise, that is true.  Even with c.array optimized out, the resulting
loop have 8 memory loads and 4 multiplies compared to 7 instructions in
original loop.

The second loop is unrolled though:

Loop 2 iterates 4 times.
Estimating sizes for loop 2
 BB: 8, after_exit: 0
  size:   2 if (d_3 <= 3)
   Exit condition will be eliminated.
 BB: 5, after_exit: 1
  size:   1 D.1608_11 = c.array[d_3];
  size:   1 s_12 = D.1608_11 + s_1;
  size:   1 d_13 = d_3 + 1;
   Induction variable computation will be folded away.
size: 5-3, last_iteration: 2-2
  Loop size: 5
  Estimated size after unrolling: 5

I think it is not purpose of simple minded cunrolli heuristic to handle complex
case like this, so I would propose simply decreasing number of iterations so 
the testcase still works that require reducing from 4 to 2.

The remaining testcase updates are quite easy; we need to prevent in lining
in ipacost testcases and slp-3.c that got unrolled after previous change is no
longer getting unrolled interanl loop (iterating 8 times containing one load
and one store, so it is sane decionsion to not do so).

Bootstrapped/regtested x86_64-linux, OK?

Honza

	* gcc.dg/tree-ssa/loop-36.c: Reduce amount of iterations to 2 so unrolling
	still happens.
	* gcc.dg/ipa/ipacost-1.c: Prevent inlining
	* gcc.dg/ipa/ipacost-2.c: Likewise.
	* gcc.dg/vect/slp-3.c: Loop is no longer unrolled.

	* tree-inline.c (estimate_operator_cost): Add operands;
	when division happens by constant, it is cheap.
	(estimate_num_insns): Loads and stores are not having cost of 0;
	EH magic stuff is cheap; when computing runtime cost of switch,
	use log2 base of amount of its cases; builtin_expect has cost of 0;
	compute cost for moving return value of call.
	(init_inline_once): Initialize time_based flags.
	* tree-inline.h (eni_weights_d): Add time_based flag.
Index: testsuite/gcc.dg/tree-ssa/loop-36.c
===================================================================
*** testsuite/gcc.dg/tree-ssa/loop-36.c	(revision 147395)
--- testsuite/gcc.dg/tree-ssa/loop-36.c	(working copy)
***************
*** 1,7 ****
  /* { dg-do compile } */
  /* { dg-options "-O2 -fdump-tree-dce2" } */
  
! struct X { float array[4]; };
  
  struct X a,b;
  
--- 1,7 ----
  /* { dg-do compile } */
  /* { dg-options "-O2 -fdump-tree-dce2" } */
  
! struct X { float array[2]; };
  
  struct X a,b;
  
*************** float foobar () {
*** 9,17 ****
    float s = 0;
    unsigned int d;
    struct X c;
!   for (d=0; d<4; ++d)
      c.array[d] = a.array[d] * b.array[d];
!   for (d=0; d<4; ++d)
      s+=c.array[d];
    return s;
  }
--- 9,17 ----
    float s = 0;
    unsigned int d;
    struct X c;
!   for (d=0; d<2; ++d)
      c.array[d] = a.array[d] * b.array[d];
!   for (d=0; d<2; ++d)
      s+=c.array[d];
    return s;
  }
Index: testsuite/gcc.dg/ipa/ipacost-1.c
===================================================================
*** testsuite/gcc.dg/ipa/ipacost-1.c	(revision 147395)
--- testsuite/gcc.dg/ipa/ipacost-1.c	(working copy)
*************** i_can_not_be_propagated_fully2 (int *a)
*** 46,51 ****
--- 46,53 ----
  main()
  {
    i_can_be_propagated_fully2 (array);
+   i_can_be_propagated_fully2 (array);
+   i_can_not_be_propagated_fully2 (array);
    i_can_not_be_propagated_fully2 (array);
  }
  
Index: testsuite/gcc.dg/ipa/ipacost-2.c
===================================================================
*** testsuite/gcc.dg/ipa/ipacost-2.c	(revision 147395)
--- testsuite/gcc.dg/ipa/ipacost-2.c	(working copy)
*************** i_can_not_be_propagated_fully2 (int *a)
*** 47,52 ****
--- 47,54 ----
  main()
  {
    i_can_be_propagated_fully2 (array);
+   i_can_be_propagated_fully2 (array);
+   i_can_not_be_propagated_fully2 (array);
    i_can_not_be_propagated_fully2 (array);
  }
  
*************** main()
*** 54,60 ****
  /* { dg-final { scan-ipa-dump-times "versioned function i_can_be_propagated_fully " 1 "cp"  } } */
  /* { dg-final { scan-ipa-dump-times "versioned function i_can_not_be_propagated_fully2" 1 "cp"  } } */
  /* { dg-final { scan-ipa-dump-times "versioned function i_can_not_be_propagated_fully " 1 "cp"  } } */
! /* { dg-final { scan-tree-dump-not "i_can_be_propagated" "optimized"  } } */
! /* { dg-final { scan-tree-dump-not "i_can_be_propagated" "optimized"  } } */
  /* { dg-final { cleanup-ipa-dump "cp" } } */
  /* { dg-final { cleanup-tree-dump "optimized" } } */
--- 56,62 ----
  /* { dg-final { scan-ipa-dump-times "versioned function i_can_be_propagated_fully " 1 "cp"  } } */
  /* { dg-final { scan-ipa-dump-times "versioned function i_can_not_be_propagated_fully2" 1 "cp"  } } */
  /* { dg-final { scan-ipa-dump-times "versioned function i_can_not_be_propagated_fully " 1 "cp"  } } */
! /* { dg-final { scan-tree-dump-not "i_can_be_propagated_fully \\(" "optimized"  } } */
! /* { dg-final { scan-tree-dump-not "i_can_be_propagated_fully2 \\(" "optimized"  } } */
  /* { dg-final { cleanup-ipa-dump "cp" } } */
  /* { dg-final { cleanup-tree-dump "optimized" } } */
Index: testsuite/gcc.dg/vect/slp-3.c
===================================================================
*** testsuite/gcc.dg/vect/slp-3.c	(revision 147395)
--- testsuite/gcc.dg/vect/slp-3.c	(working copy)
*************** int main (void)
*** 142,149 ****
    return 0;
  }
  
! /* One of the loops gets complettely unrolled.  */
! /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { xfail vect_no_align } } } */
! /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_no_align } } } */
  /* { dg-final { cleanup-tree-dump "vect" } } */
    
--- 142,148 ----
    return 0;
  }
  
! /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail vect_no_align } } } */
! /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail vect_no_align } } } */
  /* { dg-final { cleanup-tree-dump "vect" } } */
    
Index: tree-inline.c
===================================================================
*** tree-inline.c	(revision 147395)
--- tree-inline.c	(working copy)
*************** estimate_move_cost (tree type)
*** 2783,2789 ****
  /* Returns cost of operation CODE, according to WEIGHTS  */
  
  static int
! estimate_operator_cost (enum tree_code code, eni_weights *weights)
  {
    switch (code)
      {
--- 2783,2790 ----
  /* Returns cost of operation CODE, according to WEIGHTS  */
  
  static int
! estimate_operator_cost (enum tree_code code, eni_weights *weights,
! 			tree op1 ATTRIBUTE_UNUSED, tree op2)
  {
    switch (code)
      {
*************** estimate_operator_cost (enum tree_code c
*** 2893,2899 ****
      case FLOOR_MOD_EXPR:
      case ROUND_MOD_EXPR:
      case RDIV_EXPR:
!       return weights->div_mod_cost;
  
      default:
        /* We expect a copy assignment with no operator.  */
--- 2894,2902 ----
      case FLOOR_MOD_EXPR:
      case ROUND_MOD_EXPR:
      case RDIV_EXPR:
!       if (TREE_CODE (op2) != INTEGER_CST)
!         return weights->div_mod_cost;
!       return 1;
  
      default:
        /* We expect a copy assignment with no operator.  */
*************** estimate_num_insns (gimple stmt, eni_wei
*** 2930,2935 ****
--- 2933,2939 ----
    unsigned cost, i;
    enum gimple_code code = gimple_code (stmt);
    tree lhs;
+   tree rhs;
  
    switch (code)
      {
*************** estimate_num_insns (gimple stmt, eni_wei
*** 2953,2968 ****
  	 of moving something into "a", which we compute using the function
  	 estimate_move_cost.  */
        lhs = gimple_assign_lhs (stmt);
        if (is_gimple_reg (lhs))
  	cost = 0;
        else
  	cost = estimate_move_cost (TREE_TYPE (lhs));
  
!       cost += estimate_operator_cost (gimple_assign_rhs_code (stmt), weights);
        break;
  
      case GIMPLE_COND:
!       cost = 1 + estimate_operator_cost (gimple_cond_code (stmt), weights);
        break;
  
      case GIMPLE_SWITCH:
--- 2957,2991 ----
  	 of moving something into "a", which we compute using the function
  	 estimate_move_cost.  */
        lhs = gimple_assign_lhs (stmt);
+       rhs = gimple_assign_rhs1 (stmt);
+ 
+       /* EH magic stuff is most probably going to be optimized out.
+          We rarely really need to save EH info for unwinding
+          nested exceptions.  */
+       if (TREE_CODE (lhs) == FILTER_EXPR
+ 	  || TREE_CODE (lhs) == EXC_PTR_EXPR
+           || TREE_CODE (rhs) == FILTER_EXPR
+ 	  || TREE_CODE (rhs) == EXC_PTR_EXPR)
+ 	return 0;
        if (is_gimple_reg (lhs))
  	cost = 0;
        else
  	cost = estimate_move_cost (TREE_TYPE (lhs));
  
!       if (!is_gimple_reg (rhs) && !is_gimple_min_invariant (rhs))
! 	cost += estimate_move_cost (TREE_TYPE (rhs));
! 
!       cost += estimate_operator_cost (gimple_assign_rhs_code (stmt), weights,
!       				      gimple_assign_rhs1 (stmt),
! 				      get_gimple_rhs_class (gimple_assign_rhs_code (stmt))
! 				      == GIMPLE_BINARY_RHS
! 				      ? gimple_assign_rhs2 (stmt) : NULL);
        break;
  
      case GIMPLE_COND:
!       cost = 1 + estimate_operator_cost (gimple_cond_code (stmt), weights,
!       				         gimple_op (stmt, 0),
! 				         gimple_op (stmt, 1));
        break;
  
      case GIMPLE_SWITCH:
*************** estimate_num_insns (gimple stmt, eni_wei
*** 2971,2977 ****
  
  	 TODO: once the switch expansion logic is sufficiently separated, we can
  	 do better job on estimating cost of the switch.  */
!       cost = gimple_switch_num_labels (stmt) * 2;
        break;
  
      case GIMPLE_CALL:
--- 2994,3003 ----
  
  	 TODO: once the switch expansion logic is sufficiently separated, we can
  	 do better job on estimating cost of the switch.  */
!       if (weights->time_based)
!         cost = floor_log2 (gimple_switch_num_labels (stmt)) * 2;
!       else
!         cost = gimple_switch_num_labels (stmt) * 2;
        break;
  
      case GIMPLE_CALL:
*************** estimate_num_insns (gimple stmt, eni_wei
*** 2994,3001 ****
  	    case BUILT_IN_CONSTANT_P:
  	      return 0;
  	    case BUILT_IN_EXPECT:
! 	      cost = 0;
! 	      break;
  
  	    /* Prefetch instruction is not expensive.  */
  	    case BUILT_IN_PREFETCH:
--- 3020,3026 ----
  	    case BUILT_IN_CONSTANT_P:
  	      return 0;
  	    case BUILT_IN_EXPECT:
! 	      return 0;
  
  	    /* Prefetch instruction is not expensive.  */
  	    case BUILT_IN_PREFETCH:
*************** estimate_num_insns (gimple stmt, eni_wei
*** 3009,3014 ****
--- 3034,3041 ----
  	if (decl)
  	  funtype = TREE_TYPE (decl);
  
+ 	if (!VOID_TYPE_P (TREE_TYPE (funtype)))
+ 	  cost += estimate_move_cost (TREE_TYPE (funtype));
  	/* Our cost must be kept in sync with
  	   cgraph_estimate_size_after_inlining that does use function
  	   declaration to figure out the arguments.  */
*************** init_inline_once (void)
*** 3133,3143 ****
--- 3160,3172 ----
    eni_inlining_weights.target_builtin_call_cost = 1;
    eni_inlining_weights.div_mod_cost = 10;
    eni_inlining_weights.omp_cost = 40;
+   eni_inlining_weights.time_based = true;
  
    eni_size_weights.call_cost = 1;
    eni_size_weights.target_builtin_call_cost = 1;
    eni_size_weights.div_mod_cost = 1;
    eni_size_weights.omp_cost = 40;
+   eni_size_weights.time_based = false;
  
    /* Estimating time for call is difficult, since we have no idea what the
       called function does.  In the current uses of eni_time_weights,
*************** init_inline_once (void)
*** 3147,3152 ****
--- 3176,3182 ----
    eni_time_weights.target_builtin_call_cost = 10;
    eni_time_weights.div_mod_cost = 10;
    eni_time_weights.omp_cost = 40;
+   eni_time_weights.time_based = true;
  }
  
  /* Estimate the number of instructions in a gimple_seq. */
Index: tree-inline.h
===================================================================
*** tree-inline.h	(revision 147395)
--- tree-inline.h	(working copy)
*************** typedef struct eni_weights_d
*** 130,135 ****
--- 130,140 ----
  
    /* Cost for omp construct.  */
    unsigned omp_cost;
+ 
+   /* True when time of statemnt should be estimated.  Thus i.e
+      cost of switch statement is logarithmic rather than linear in number
+      of cases.  */
+   bool time_based;
  } eni_weights;
  
  /* Weights that estimate_num_insns uses for heuristics in inlining.  */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]