[google] Limit unrolling and peeling under LIPO estimates of large code size/icache pressure

Tue Nov 22 21:30:00 GMT 2011

This patch is for google-main only.
Tested with bootstrap and regression tests.
Under LIPO, estimate the code size footprint from the partial call
graph, and if it is large limit unrolling and peeling to avoid
increasing icache pressure.
Teresa

2011-11-21   Teresa Johnson  <tejohnson@google.com>

        * loop-unroll.c (loop_has_FP_comp): New function.
        (limit_code_size): New function.
        (unroll_and_peel_loops): Check if unrolling and/or peeling
        should be disabled due to large code size estimates.
        * common.opt (fripa-peel-size-limit): New option.
        (fripa-unroll-size-limit): New option.
        * tree-optimize.c (compute_codesize_estimate): New function.
        (execute_cleanup_cfg_post_optimizing): Invoke
        compute_codesize_estimate at the end of tree optimization.
        * params.def (codesize-hotness-threshold): New parameter.
        (unrollpeel-codesize-threshold): New parameter.
        * doc/invoke.texi: Document the new options and parameters.


-- 
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
-------------- next part --------------
Index: doc/invoke.texi
===================================================================

--- doc/invoke.texi	(revision 181550)
+++ doc/invoke.texi	(working copy)
@@ -400,7 +400,8 @@ Objective-C and Objective-C++ Dialects}.
 -freorder-blocks-and-partition -freorder-functions @gol
 -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
 -fripa -fripa-disallow-asm-modules -fripa-disallow-opt-mismatch @gol
--fripa-no-promote-always-inline-func -fripa-verbose -frounding-math @gol
+-fripa-no-promote-always-inline-func -fripa-verbose @gol
+-fripa-peel-size-limit -fripa-unroll-size-limit -frounding-math @gol
 -fsched2-use-superblocks -fsched-pressure @gol
 -fsched-spec-load -fsched-spec-load-dangerous @gol
 -fsched-stalled-insns-dep[=@var{n}] -fsched-stalled-insns[=@var{n}] @gol
@@ -8267,6 +8268,20 @@ Do not promote static functions with alw
 Enable printing of verbose information about dynamic inter-procedural optimizations.
 This is used in conjunction with the @option{-fripa}.
 
+@item -fripa-peel-size-limit
+@opindex fripa-peel-size-limit
+Limit loop peeling of non-const non-FP loops in a LIPO compilation under estimates
+of a large code footprint. Enabled by default under @option{-fripa}. Code size
+estimation and thresholds are controlled by the @option{codesize-hotness-threshold}
+and @option{unrollpeel-codesize-threshold} parameters.
+
+@item -fripa-unroll-size-limit
+@opindex fripa-unroll-size-limit
+Limit loop unrolling of non-const non-FP loops in a LIPO compilation under estimates
+of a large code footprint. Enabled by default under @option{-fripa}. Code size
+estimation and thresholds are controlled by the @option{codesize-hotness-threshold}
+and @option{unrollpeel-codesize-threshold} parameters.
+
 @item -fcallgraph-profiles-sections
 @opindex fcallgraph-profiles-sections
 Emit call graph edge profile counts in .note.callgraph.text sections. This is
@@ -8991,6 +9006,13 @@ hot loops. Its default value is 16.
 @item max-completely-peel-loop-nest-depth
 The maximum depth of a loop nest suitable for complete peeling.
 
+@item codesize-hotness-threshold
+The minimum profile count of basic blocks to look at when estimating
+the code size footprint of the call graph in a LIPO compile.
+
+@item unrollpeel-codesize-threshold
+Maximum LIPO code size footprint estimate for loop unrolling and peeling.
+
 @item max-unswitch-insns
 The maximum number of insns of an unswitched loop.
Index: loop-unroll.c
===================================================================
--- loop-unroll.c	(revision 181550)
+++ loop-unroll.c	(working copy)
@@ -181,6 +181,81 @@ report_unroll_peel(struct loop *loop, lo
             "" : iter_str);
 }
 
+/* Determine whether LOOP contains floating-point computation. */
+static bool
+loop_has_FP_comp(struct loop *loop)
+{
+  rtx set, dest;
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+      {
+        set = single_set (insn);
+        if (!set)
+          continue;
+
+        dest = SET_DEST (set);
+        if (FLOAT_MODE_P (GET_MODE (dest)))
+        {
+          free (body);
+          return true;
+        }
+      }
+    }
+  free (body);
+  return false;
+}
+
+/* This returns a bit vector */
+typedef enum {
+  NO_LIMIT = 0,
+  LIMIT_UNROLL = 0x1,
+  LIMIT_PEEL = 0x2,
+  LIMIT_BOTH = 0x3
+} limit_type;
+
+extern int cgraph_codesize_estimate;
+
+/* Determine whether LOOP unrolling/peeling should be constrained based
+   on code footprint estimates. */
+static limit_type
+limit_code_size(struct loop *loop)
+{
+  unsigned size_threshold;
+  limit_type result = NO_LIMIT;
+  int result_int = 0;
+
+  if (!flag_dyn_ipa)
+    return NO_LIMIT;
+
+  gcc_assert (cgraph_codesize_estimate >= 0);
+
+  /* Ignore FP loops, which are more likely to benefit heavily from
+     unrolling. */
+  if (loop_has_FP_comp(loop))
+    return NO_LIMIT;
+
+  size_threshold = PARAM_VALUE (PARAM_UNROLLPEEL_CODESIZE_THRESHOLD);
+  if (cgraph_codesize_estimate <= (int)size_threshold)
+    return NO_LIMIT;
+
+  if (flag_ripa_peel_size_limit)
+    result_int |= LIMIT_PEEL;
+
+  if (flag_ripa_unroll_size_limit)
+    result_int |= LIMIT_UNROLL;
+
+  result = (limit_type)result_int;
+  return result;
+}
+
 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 void
 unroll_and_peel_loops (int flags)
@@ -307,6 +382,7 @@ decide_unrolling_and_peeling (int flags)
   struct loop *loop;
   loop_iterator li;
   location_t locus;
+  limit_type limit;
 
   /* Scan the loops, inner ones first.  */
   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
@@ -347,19 +423,42 @@ decide_unrolling_and_peeling (int flags)
       loop->ninsns = num_loop_insns (loop);
       loop->av_ninsns = average_num_loop_insns (loop);
 
+      /* Determine whether to limit code size growth from unrolling and
+         peeling. This is currently enabled only under LIPO (dynamic IPA)
+         where we have a partial call graph. It is not applied to loops
+         with constant trip counts, as it is easier to determine the
+         profitability of unrolling and peeling such loops. */
+      limit = limit_code_size(loop);
+      if (limit != NO_LIMIT)
+	{
+	  if (dump_file)
+            {
+	      fprintf (dump_file, ";; Due to large code size footprint estimate, limit ");
+              if (limit == (LIMIT_UNROLL|LIMIT_PEEL))
+	        fprintf (dump_file, "unrolling and peeling\n");
+              else if (limit == LIMIT_UNROLL)
+	        fprintf (dump_file, "unrolling\n");
+              else
+	        fprintf (dump_file, "peeling\n");
+            }
+	}
+
       /* Try transformations one by one in decreasing order of
 	 priority.  */
 
       decide_unroll_constant_iterations (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE)
+      if (loop->lpt_decision.decision == LPT_NONE
+          && !(limit & LIMIT_UNROLL))
 	decide_unroll_runtime_iterations (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE)
+      if (loop->lpt_decision.decision == LPT_NONE
+          && !(limit & LIMIT_UNROLL))
 	decide_unroll_stupid (loop, flags);
-      if (loop->lpt_decision.decision == LPT_NONE)
+      if (loop->lpt_decision.decision == LPT_NONE
+          && !(limit & LIMIT_PEEL))
 	decide_peel_simple (loop, flags);
 
-      if (flag_opt_info >= OPT_INFO_MIN &&
-          loop->lpt_decision.decision != LPT_NONE)
+      if (flag_opt_info >= OPT_INFO_MIN
+          && loop->lpt_decision.decision != LPT_NONE)
         {
           report_unroll_peel(loop, locus);
         }
Index: common.opt
===================================================================
--- common.opt	(revision 181550)
+++ common.opt	(working copy)
@@ -1129,6 +1129,14 @@ Common Report Var(flag_ripa_no_promote_a
 Don't promote always inline static functions assuming they
 will be inlined and no copy is needed.
 
+fripa-peel-size-limit
+Common Report Var(flag_ripa_peel_size_limit) Init(1) Optimization
+Limit non-const non-FP loop peeling under dynamic IPA estimates of large code footprint
+
+fripa-unroll-size-limit
+Common Report Var(flag_ripa_unroll_size_limit) Init(1) Optimization
+Limit non-const non-FP loop unrolling under dynamic IPA estimates of large code footprint
+
 fearly-inlining
 Common Report Var(flag_early_inlining) Init(1) Optimization
 Perform early inlining
Index: tree-optimize.c
===================================================================
--- tree-optimize.c	(revision 181550)
+++ tree-optimize.c	(working copy)
@@ -47,6 +47,7 @@ along with GCC; see the file COPYING3.  
 #include "except.h"
 #include "plugin.h"
 #include "regset.h"	/* FIXME: For reg_obstack.  */
+#include "params.h"
 
 /* Gate: execute, or not, all of the non-trivial optimizations.  */
 
@@ -149,6 +150,48 @@ struct gimple_opt_pass pass_all_early_op
  }
 };
 
+int cgraph_codesize_estimate = -1;
+
+/* Estimate the code size from the dynamic IPA call graph. */
+static void
+compute_codesize_estimate(void)
+{
+  struct cgraph_node *node;
+  basic_block bb;
+  gimple_stmt_iterator bsi;
+  struct function *my_function;
+
+  if (!flag_dyn_ipa)
+    return;
+
+  if (cgraph_codesize_estimate >= 0)
+    return;
+  cgraph_codesize_estimate = 0;
+
+  for (node = cgraph_nodes; node; node = node->next)
+    {
+      if (node->count == 0)
+        continue;
+
+      if (!gimple_has_body_p(node->decl))
+        continue;
+
+      my_function = DECL_STRUCT_FUNCTION (node->decl);
+
+      FOR_EACH_BB_FN (bb, my_function)
+        {
+          if (bb->count < PARAM_VALUE (PARAM_CODESIZE_HOTNESS_THRESHOLD))
+            continue;
+          for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
+            {
+              gimple stmt = gsi_stmt (bsi);
+              cgraph_codesize_estimate += estimate_num_insns (stmt, &eni_size_weights);
+            }
+        }
+    }
+  if (dump_file)
+    fprintf (dump_file, "Code size estimate from cgraph: %d\n", cgraph_codesize_estimate);
+}
 
 /* Pass: cleanup the CFG just before expanding trees to RTL.
    This is just a round of label cleanups and case node grouping
@@ -158,6 +201,8 @@ struct gimple_opt_pass pass_all_early_op
 static unsigned int
 execute_cleanup_cfg_post_optimizing (void)
 {
+  /* Estimate the code footprint for hot BBs before we enter RTL */
+  compute_codesize_estimate();
   cleanup_tree_cfg ();
   cleanup_dead_labels ();
   group_case_labels ();
Index: params.def
===================================================================
--- params.def	(revision 181550)
+++ params.def	(working copy)
@@ -337,6 +337,21 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
 	 "max-completely-peel-loop-nest-depth",
 	 "The maximum depth of a loop nest we completely peel",
 	 8, 0, 0)
+/* The minimum profile count of basic blocks to look at when estimating
+ * the code size footprint of the call graph in a dynamic IPA compile. */
+DEFPARAM(PARAM_CODESIZE_HOTNESS_THRESHOLD,
+        "codesize-hotness-threshold",
+        "Minimum profile count of basic blocks counted towards dynamic IPA "
+        "code size footprint estimate",
+        10000, 0, 0)
+/* The maximum code size estimate under which loop unrolling and peeling
+ * is allowed in a dynamic IPA compile. This currently applies to loops with
+ * non-constant iteration counts and no floating point computations. */
+DEFPARAM(PARAM_UNROLLPEEL_CODESIZE_THRESHOLD,
+        "unrollpeel-codesize-threshold",
+        "Maximum dynamic IPA code size footprint estimate for loop unrolling "
+        "and peeling",
+        10000, 0, 0)
 
 /* The maximum number of insns of an unswitched loop.  */
 DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,