[PR81165] discount killed stmts when sizing blocks for threading

Alexandre Oliva aoliva@redhat.com
Thu Dec 7 12:06:00 GMT 2017


We limit the amount of copying for jump threading based on counting
stmts.  This counting is overly pessimistic, because we will very
often delete stmts as a consequence of jump threading: when the final
conditional jump of a block is removed, earlier SSA names computed
exclusively for use in that conditional are killed.  Furthermore, PHI
nodes in blocks with only two predecessors are trivially replaced with
their now-single values after threading.

This patch scans blocks to be copied in the path constructed so far
and estimates the number of stmts that will be removed in the copies,
bumping up the stmt count limit.

Regstrapped on x86_64-linux-gnu and i686-linux-gnu.  Ok to install?


for  gcc/ChangeLog

	* tree-ssa-threadedge.c (uses_in_bb): New.
	(estimate_threading_killed_stmts): New.
	(estimate_threading_killed_stmts): New overload.
	(record_temporary_equivalences_from_stmts_at_dest): Add path
	parameter; adjust caller.  Expand limit when it's hit.

for  gcc/testsuite/ChangeLog

	* gcc.dg/pr81165.c: New.
---
 gcc/testsuite/gcc.dg/pr81165.c |   59 ++++++++++++
 gcc/tree-ssa-threadedge.c      |  189 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 245 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr81165.c

diff --git a/gcc/testsuite/gcc.dg/pr81165.c b/gcc/testsuite/gcc.dg/pr81165.c
new file mode 100644
index 000000000000..8508d893bed6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr81165.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not " \[/%\] " "optimized" } } */
+
+/* Testcase submitted for PR81165, with its main function removed as
+   it's turned into a compile test.  We want to make sure that all of
+   the divide/remainder computations are removed by tree optimizers.
+
+   We can figure out that we don't need to compute at runtime even the
+   condition to enter the loop: the initial i==0 would have to be
+   greater than the sum of two small unsigned values: 1U>>t1 is in the
+   range 0..1, whereas the char value is bounded by the range 0..127,
+   being 128 % a positive number (zero would invoke undefined
+   behavior, so we can assume it doesn't happen).  (We know it's
+   nonnegative because it's 10 times a number that has no more than
+   the bits for 16, 8 and 1 set.)
+
+   We don't realize that the loop is useless right away: jump
+   threading helps remove some of the complexity, particularly of the
+   computation within the loop: t1 is compared with 1, but it can
+   never be 1.  (We could assume as much, since its being 1 would
+   divide by zero, but we don't.)
+
+   If we don't enter the conditional block, t1 remains at 2; if we do,
+   it's set to either -1.  If we jump thread at the end of the
+   conditional block, we can figure out the ranges exclude 1 and the
+   jump body is completely optimized out.  However, we used to fail to
+   consider the block for jump threading due to the amount of
+   computation in it, without realizing most of it would die in
+   consequence of the threading.
+
+   We now take the dying code into account when deciding whether or
+   not to try jump threading.  That might enable us to optimize the
+   function into { if (x2 != 0 || (x1 & 1) == 0) abort (); }.  At the
+   time of this writing, with the patch, we get close, but the test on
+   x2 only gets as far as ((1 >> x2) == 0).  Without the patch, some
+   of the loop remains.  */
+
+short x0 = 15;
+
+void func (){
+  volatile int x1 = 1U;
+  volatile char x2 = 0;
+  char t0 = 0;
+  unsigned long t1 = 2LU;
+  int i = 0;
+  
+  if(1>>x2) {
+    t0 = -1;
+    t1 = (1&(short)(x1^8U))-1;
+  }
+
+  while(i > (int)((1U>>t1)+(char)(128%(10*(25LU&(29%x0)))))) {
+    i += (int)(12L/(1!=(int)t1));
+  }
+
+  if (t0 != -1) __builtin_abort();
+  if (t1 != 0L) __builtin_abort();
+}
diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c
index 536c4717b725..25ccac2a3ecc 100644
--- a/gcc/tree-ssa-threadedge.c
+++ b/gcc/tree-ssa-threadedge.c
@@ -170,6 +170,173 @@ threadedge_valueize (tree t)
   return t;
 }
 
+/* Return how many uses of T there are within BB, as long as there
+   aren't any uses outside BB.  If there are any uses outside BB,
+   return -1 if there's at most one use within BB, or -2 if there is
+   more than one use within BB.  */
+
+static int
+uses_in_bb (tree t, basic_block bb)
+{
+  int uses = 0;
+  bool outside_bb = false;
+
+  imm_use_iterator iter;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_FAST (use_p, iter, t)
+    {
+      if (is_gimple_debug (USE_STMT (use_p)))
+	continue;
+
+      if (gimple_bb (USE_STMT (use_p)) != bb)
+	outside_bb = true;
+      else
+	uses++;
+
+      if (outside_bb && uses > 1)
+	return -2;
+    }
+
+  if (outside_bb)
+    return -1;
+
+  return uses;
+}
+
+/* Starting from the final control flow stmt in BB, assuming it will
+   be removed, follow uses in to-be-removed stmts back to their defs
+   and count how many defs are to become dead and be removed as
+   well.  */
+
+static int
+estimate_threading_killed_stmts (basic_block bb)
+{
+  int killed_stmts = 0;
+  hash_map<tree, int> ssa_remaining_uses;
+  auto_vec<gimple *, 4> dead_worklist;
+
+  /* If the block has only two predecessors, threading will turn phi
+     dsts into either src, so count them as dead stmts.  */
+  bool drop_all_phis = EDGE_COUNT (bb->preds) == 2;
+
+  if (drop_all_phis)
+    for (gphi_iterator gsi = gsi_start_phis (bb);
+	 !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	gphi *phi = gsi.phi ();
+	tree dst = gimple_phi_result (phi);
+
+	/* We don't count virtual PHIs as stmts in
+	   record_temporary_equivalences_from_phis.  */
+	if (virtual_operand_p (dst))
+	  continue;
+
+	killed_stmts++;
+      }
+
+  if (gsi_end_p (gsi_last_bb (bb)))
+    return killed_stmts;
+
+  gimple *stmt = gsi_stmt (gsi_last_bb (bb));
+  if (gimple_code (stmt) != GIMPLE_COND
+      && gimple_code (stmt) != GIMPLE_GOTO
+      && gimple_code (stmt) != GIMPLE_SWITCH)
+    return killed_stmts;
+
+  dead_worklist.quick_push (stmt);
+  while (!dead_worklist.is_empty ())
+    {
+      stmt = dead_worklist.pop ();
+
+      ssa_op_iter iter;
+      use_operand_p use_p;
+      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
+	{
+	  tree t = USE_FROM_PTR (use_p);
+	  if (SSA_NAME_DEF_STMT (t)
+	      && (gimple_code (SSA_NAME_DEF_STMT (t)) == GIMPLE_ASSIGN
+		  || (gimple_code (SSA_NAME_DEF_STMT (t)) == GIMPLE_PHI
+		      && !virtual_operand_p (t)
+		      && !drop_all_phis))
+	      && gimple_bb (SSA_NAME_DEF_STMT (t)) == bb)
+	    {
+	      int *usesp = ssa_remaining_uses.get (t);
+	      int uses;
+
+	      if (usesp)
+		uses = *usesp;
+	      else
+		uses = uses_in_bb (t, bb);
+
+	      gcc_assert (uses);
+
+	      /* Don't bother recording the expected use count if we
+		 won't find any further uses within BB.  */
+	      if (!usesp && (uses < -1 || uses > 1))
+		{
+		  usesp = &ssa_remaining_uses.get_or_insert (t);
+		  *usesp = uses;
+		}
+
+	      if (uses < 0)
+		continue;
+
+	      --uses;
+	      if (usesp)
+		*usesp = uses;
+
+	      if (!uses)
+		{
+		  killed_stmts++;
+		  if (usesp)
+		    ssa_remaining_uses.remove (t);
+		  if (gimple_code (SSA_NAME_DEF_STMT (t)) != GIMPLE_PHI)
+		    dead_worklist.safe_push (SSA_NAME_DEF_STMT (t));
+		}
+	    }
+	}
+    }
+
+  if (dump_file)
+    fprintf (dump_file, "threading bb %i kills %i stmts\n",
+	     bb->index, killed_stmts);
+
+  return killed_stmts;
+}
+
+/* Estimate killed statements over all blocks in the PATH, or in E.
+   If PATH is not empty, E must be its last entry.  */
+
+static int
+estimate_threading_killed_stmts (edge e, vec<jump_thread_edge *> *path)
+{
+  gcc_assert (path->length () == 0 || path->last ()->e == e);
+  if (path->length () == 0)
+    return estimate_threading_killed_stmts (e->dest);
+
+  int total = 0;
+  int i = 0;
+  jump_thread_edge *je;
+  FOR_EACH_VEC_ELT (*path, i, je)
+    switch (je->type)
+      {
+      case EDGE_START_JUMP_THREAD:
+      case EDGE_COPY_SRC_BLOCK:
+      case EDGE_COPY_SRC_JOINER_BLOCK:
+	total += estimate_threading_killed_stmts (je->e->dest);
+	break;
+
+      default:
+	gcc_unreachable ();
+
+      case EDGE_FSM_THREAD:
+      case EDGE_NO_COPY_SRC_BLOCK:
+	break;
+      }
+
+  return total;
+}
+
 /* Try to simplify each statement in E->dest, ultimately leading to
    a simplification of the COND_EXPR at the end of E->dest.
 
@@ -191,7 +358,8 @@ static gimple *
 record_temporary_equivalences_from_stmts_at_dest (edge e,
     const_and_copies *const_and_copies,
     avail_exprs_stack *avail_exprs_stack,
-    pfn_simplify simplify)
+    pfn_simplify simplify,
+    vec<jump_thread_edge *> *path)
 {
   gimple *stmt = NULL;
   gimple_stmt_iterator gsi;
@@ -233,7 +401,22 @@ record_temporary_equivalences_from_stmts_at_dest (edge e,
 	 expansion, then do not thread through this block.  */
       stmt_count++;
       if (stmt_count > max_stmt_count)
-	return NULL;
+	{
+	  /* If any of the stmts in the PATH's dests are going to be
+	     killed due to threading, grow the max count
+	     accordingly.  */
+	  if (max_stmt_count
+	      == PARAM_VALUE (PARAM_MAX_JUMP_THREAD_DUPLICATION_STMTS))
+	    {
+	      max_stmt_count += estimate_threading_killed_stmts (e, path);
+	      if (dump_file)
+		fprintf (dump_file, "threading bb %i up to %i stmts\n",
+			 e->dest->index, max_stmt_count);
+	    }
+	  /* If we're still past the limit, we're done.  */
+	  if (stmt_count > max_stmt_count)
+	    return NULL;
+	}
 
       /* If this is not a statement that sets an SSA_NAME to a new
 	 value, then do not try to simplify this statement as it will
@@ -991,7 +1174,7 @@ thread_through_normal_block (edge e,
   gimple *stmt
     = record_temporary_equivalences_from_stmts_at_dest (e, const_and_copies,
 							avail_exprs_stack,
-							simplify);
+							simplify, path);
 
   /* There's two reasons STMT might be null, and distinguishing
      between them is important.

-- 
Alexandre Oliva, freedom fighter    http://FSFLA.org/~lxoliva/
You must be the change you wish to see in the world. -- Gandhi
Be Free! -- http://FSFLA.org/   FSF Latin America board member
Free Software Evangelist|Red Hat Brasil GNU Toolchain Engineer



More information about the Gcc-patches mailing list