This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Fix (some of) profile updating after jump threading


Hi,
the testcase is about jump threading confusing profile enough so many edges
are considered cold. The first problem occurs in thread1 pass where
first remove_ctrl_stmt_and_useless_edges does not ouptated outgoing edge
probability after removing the other edges (so we end up with a single
succ bb having outgoing edge probability less than REG_BR_PROB_BASE)
and second is duplicate_thread_path scaling uniformly profiles of all
bbs in the duplicate and original path.  This makes sense only when the
original path has no side entry edges and moreover is always wrong for
last BB.

The job of updating here is quite easy, just keep track of frequency/count
of the patch being duplicated and subtract it from the offline copy.
This of course assumes that all branches along the patch except for last
one remains unoptimized and with same exit probabilities.

In the last BB it is necessary to re-distribute exit edges probabilities
as we know its outcome. Fortunately there already is
update_bb_profile_for_threading which does the job for RTL threader.

Mainline gets following mismatch counts:
q.c.103t.thread1   21
q.c.104t.vrp1   40
q.c.106t.dce2   19
..
q.c.112t.mergephi3   17
q.c.113t.phiopt1   17

With patch I get:

q.c.103t.thread1   2
q.c.104t.vrp1   8
q.c.106t.dce2   6
...
q.c.112t.mergephi3   5
...
q.c.113t.phiopt1   5
...
q.c.118t.thread2   6
q.c.178t.thread3   7
....
q.c.182t.vrp2   17
q.c.183t.phicprop2   10

So VRP's jump threading is now the main source of inconsistencies.
The bug there seems ot be that Theresa's code does not care to update
edge probabilities when profile info is absent.

For tramp3d the numbers are:
tramp3d-v4.ii.094t.cunrolli   17
...
tramp3d-v4.ii.101t.fre3   68
tramp3d-v4.ii.102t.mergephi2   66
tramp3d-v4.ii.103t.thread1   335
tramp3d-v4.ii.104t.vrp1   605
tramp3d-v4.ii.106t.dce2   265
tramp3d-v4.ii.107t.stdarg   265
tramp3d-v4.ii.108t.cdce   269
tramp3d-v4.ii.109t.cselim   269
tramp3d-v4.ii.110t.copyprop1   259
tramp3d-v4.ii.111t.ifcombine   287
tramp3d-v4.ii.112t.mergephi3   273
...
tramp3d-v4.ii.115t.ch2   275
tramp3d-v4.ii.116t.cplxlower1   275
tramp3d-v4.ii.117t.sra   275
tramp3d-v4.ii.118t.thread2   281
tramp3d-v4.ii.119t.dom2   302
tramp3d-v4.ii.120t.isolate-paths   319
tramp3d-v4.ii.121t.phicprop1   309
tramp3d-v4.ii.122t.dse2   309
tramp3d-v4.ii.123t.reassoc1   311
tramp3d-v4.ii.124t.dce3   310
tramp3d-v4.ii.125t.forwprop3   314
...
tramp3d-v4.ii.131t.lim2   316
...
tramp3d-v4.ii.134t.pre   565
tramp3d-v4.ii.135t.sink   299
tramp3d-v4.ii.139t.dce4   299
tramp3d-v4.ii.140t.fix_loops   299
tramp3d-v4.ii.141t.loop   281
tramp3d-v4.ii.142t.loopinit   281
tramp3d-v4.ii.143t.unswitch   429
tramp3d-v4.ii.144t.sccp   429
tramp3d-v4.ii.145t.lsplit   431
tramp3d-v4.ii.146t.cddce2   431
tramp3d-v4.ii.147t.ldist   443
tramp3d-v4.ii.148t.copyprop2   443
tramp3d-v4.ii.154t.ivcanon   445
tramp3d-v4.ii.157t.ch_vect   445
tramp3d-v4.ii.158t.ifcvt   467
tramp3d-v4.ii.159t.vect   1179
....
tramp3d-v4.ii.162t.cunroll   1090
...
tramp3d-v4.ii.167t.loopdone   1085
tramp3d-v4.ii.171t.veclower21   1103
tramp3d-v4.ii.173t.printf-return-value2   1103
tramp3d-v4.ii.174t.reassoc2   1103
tramp3d-v4.ii.175t.slsr   1103
tramp3d-v4.ii.176t.split-paths   1103
tramp3d-v4.ii.178t.thread3   1143
tramp3d-v4.ii.179t.dom3   1151
tramp3d-v4.ii.180t.strlen   1151
tramp3d-v4.ii.181t.thread4   1173
tramp3d-v4.ii.182t.vrp2   2263
tramp3d-v4.ii.183t.phicprop2   1078
tramp3d-v4.ii.184t.dse3   1078
tramp3d-v4.ii.185t.cddce3   1078
tramp3d-v4.ii.186t.forwprop4   1078
tramp3d-v4.ii.187t.phiopt3   1078
tramp3d-v4.ii.188t.fab1   1079
tramp3d-v4.ii.189t.widening_mul   1079
tramp3d-v4.ii.190t.store-merging   1079
tramp3d-v4.ii.191t.tailc   1079
tramp3d-v4.ii.192t.dce7   1079
tramp3d-v4.ii.193t.crited2   1079
tramp3d-v4.ii.195t.uncprop1   1079
tramp3d-v4.ii.196t.local-pure-const2   1079
tramp3d-v4.ii.224t.ehcleanup2   588
tramp3d-v4.ii.225t.resx   2493
tramp3d-v4.ii.226t.nrv   2493
tramp3d-v4.ii.227t.optimized   2418

On mainline and:

tramp3d-v4.ii.094t.cunrolli   17
..
tramp3d-v4.ii.101t.fre3   68
tramp3d-v4.ii.102t.mergephi2   66
tramp3d-v4.ii.103t.thread1   129
tramp3d-v4.ii.104t.vrp1   324
tramp3d-v4.ii.106t.dce2   196
tramp3d-v4.ii.107t.stdarg   196
tramp3d-v4.ii.108t.cdce   200
..
tramp3d-v4.ii.111t.ifcombine   228
...
tramp3d-v4.ii.115t.ch2   230
...
tramp3d-v4.ii.119t.dom2   255
tramp3d-v4.ii.120t.isolate-paths   272
tramp3d-v4.ii.121t.phicprop1   264
tramp3d-v4.ii.122t.dse2   264
tramp3d-v4.ii.123t.reassoc1   266
tramp3d-v4.ii.124t.dce3   265
tramp3d-v4.ii.125t.forwprop3   269
..
tramp3d-v4.ii.131t.lim2   271
..
tramp3d-v4.ii.134t.pre   515
tramp3d-v4.ii.135t.sink   272
tramp3d-v4.ii.141t.loop   254
tramp3d-v4.ii.142t.loopinit   254
tramp3d-v4.ii.143t.unswitch   402
tramp3d-v4.ii.144t.sccp   402
tramp3d-v4.ii.145t.lsplit   404
tramp3d-v4.ii.146t.cddce2   404
tramp3d-v4.ii.147t.ldist   416
tramp3d-v4.ii.148t.copyprop2   416
tramp3d-v4.ii.154t.ivcanon   418
tramp3d-v4.ii.157t.ch_vect   418
tramp3d-v4.ii.158t.ifcvt   440
tramp3d-v4.ii.159t.vect   1152
...
tramp3d-v4.ii.162t.cunroll   1055
tramp3d-v4.ii.163t.slp1   1055
tramp3d-v4.ii.165t.ivopts   1055
tramp3d-v4.ii.166t.lim4   1056
tramp3d-v4.ii.167t.loopdone   1050
tramp3d-v4.ii.168t.no_loop   18
tramp3d-v4.ii.169t.slp2   18
tramp3d-v4.ii.171t.veclower21   1068
tramp3d-v4.ii.173t.printf-return-value2   1068
tramp3d-v4.ii.174t.reassoc2   1068
tramp3d-v4.ii.175t.slsr   1068
tramp3d-v4.ii.176t.split-paths   1068
tramp3d-v4.ii.178t.thread3   1086
tramp3d-v4.ii.179t.dom3   1097
tramp3d-v4.ii.180t.strlen   1097
tramp3d-v4.ii.181t.thread4   1103
tramp3d-v4.ii.182t.vrp2   2142
tramp3d-v4.ii.183t.phicprop2   1039
tramp3d-v4.ii.184t.dse3   1039
tramp3d-v4.ii.185t.cddce3   1039
tramp3d-v4.ii.186t.forwprop4   1039
tramp3d-v4.ii.187t.phiopt3   1039
tramp3d-v4.ii.188t.fab1   1040
tramp3d-v4.ii.189t.widening_mul   1040
tramp3d-v4.ii.190t.store-merging   1040
tramp3d-v4.ii.191t.tailc   1040
tramp3d-v4.ii.192t.dce7   1040
tramp3d-v4.ii.193t.crited2   1040
tramp3d-v4.ii.195t.uncprop1   1040
tramp3d-v4.ii.196t.local-pure-const2   1040
tramp3d-v4.ii.224t.ehcleanup2   575
tramp3d-v4.ii.225t.resx   2455
tramp3d-v4.ii.226t.nrv   2455
tramp3d-v4.ii.227t.optimized   2380
tramp3d-v4.ii.311t.statistics   0

So while thread passes themselves now introduce fewer inconsistencies,
the other optimizations almost even it up :)

Bootstrapped/regtested x86_64-linux, comitted.

	PR middle-end/77445
	* tree-ssa-threadupdate.c (remove_ctrl_stmt_and_useless_edges):
	correctly set frequency of oudgoing edge.
	(duplicate_thread_path): Fix profile updating.
	* gcc.dg/tree-ssa/pr77445-2.c: New testcase.
	* gcc.dg/tree-ssa/pr77445.c: New testcase.
Index: tree-ssa-threadupdate.c
===================================================================
--- tree-ssa-threadupdate.c	(revision 244508)
+++ tree-ssa-threadupdate.c	(working copy)
@@ -301,7 +301,11 @@ remove_ctrl_stmt_and_useless_edges (basi
 	  remove_edge (e);
 	}
       else
-	ei_next (&ei);
+	{
+	  e->probability = REG_BR_PROB_BASE;
+	  e->count = bb->count;
+	  ei_next (&ei);
+	}
     }
 
   /* If the remaining edge is a loop exit, there must have
@@ -2212,8 +2216,8 @@ duplicate_thread_path (edge entry, edge
   struct loop *loop = entry->dest->loop_father;
   edge exit_copy;
   edge redirected;
-  int total_freq = 0, entry_freq = 0;
-  gcov_type total_count = 0, entry_count = 0;
+  int curr_freq;
+  gcov_type curr_count;
 
   if (!can_copy_bbs_p (region, n_region))
     return false;
@@ -2240,27 +2244,6 @@ duplicate_thread_path (edge entry, edge
       free_region_copy = true;
     }
 
-  if (entry->dest->count)
-    {
-      total_count = entry->dest->count;
-      entry_count = entry->count;
-      /* Fix up corner cases, to avoid division by zero or creation of negative
-	 frequencies.  */
-      if (entry_count > total_count)
-	entry_count = total_count;
-    }
-  else
-    {
-      total_freq = entry->dest->frequency;
-      entry_freq = EDGE_FREQUENCY (entry);
-      /* Fix up corner cases, to avoid division by zero or creation of negative
-	 frequencies.  */
-      if (total_freq == 0)
-	total_freq = 1;
-      else if (entry_freq > total_freq)
-	entry_freq = total_freq;
-    }
-
   copy_bbs (region, n_region, region_copy, &exit, 1, &exit_copy, loop,
 	    split_edge_bb_loc (entry), false);
 
@@ -2270,17 +2253,61 @@ duplicate_thread_path (edge entry, edge
      invalidating the property that is propagated by executing all the blocks of
      the jump-thread path in order.  */
 
+  curr_count = entry->count;
+  curr_freq = EDGE_FREQUENCY (entry);
+
   for (i = 0; i < n_region; i++)
     {
       edge e;
       edge_iterator ei;
       basic_block bb = region_copy[i];
 
+      /* Watch inconsistent profile.  */
+      if (curr_count > region[i]->count)
+	curr_count = region[i]->count;
+      if (curr_freq > region[i]->frequency)
+	curr_freq = region[i]->frequency;
+      /* Scale current BB.  */
+      if (region[i]->count)
+	{
+	  /* In the middle of the path we only scale the frequencies.
+	     In last BB we need to update probabilities of outgoing edges
+	     because we know which one is taken at the threaded path.  */
+	  if (i + 1 != n_region)
+	    scale_bbs_frequencies_gcov_type (region + i, 1,
+					     region[i]->count - curr_count,
+					     region[i]->count);
+	  else
+	    update_bb_profile_for_threading (region[i],
+					     curr_freq, curr_count,
+					     exit);
+	  scale_bbs_frequencies_gcov_type (region_copy + i, 1, curr_count,
+					   region_copy[i]->count);
+	}
+      else if (region[i]->frequency)
+	{
+	  if (i + 1 != n_region)
+	    scale_bbs_frequencies_int (region + i, 1,
+				       region[i]->frequency - curr_freq,
+				       region[i]->frequency);
+	  else
+	    update_bb_profile_for_threading (region[i],
+					     curr_freq, curr_count,
+					     exit);
+	  scale_bbs_frequencies_int (region_copy + i, 1, curr_freq,
+				     region_copy[i]->frequency);
+	}
+
       if (single_succ_p (bb))
 	{
 	  /* Make sure the successor is the next node in the path.  */
 	  gcc_assert (i + 1 == n_region
 		      || region_copy[i + 1] == single_succ_edge (bb)->dest);
+	  if (i + 1 != n_region)
+	    {
+	      curr_freq = EDGE_FREQUENCY (single_succ_edge (bb));
+	      curr_count = single_succ_edge (bb)->count;
+	    }
 	  continue;
 	}
 
@@ -2307,22 +2334,13 @@ duplicate_thread_path (edge entry, edge
 	    if (orig)
 	      redirect_edge_and_branch_force (e, orig);
 	  }
+	else
+	  {
+	    curr_freq = EDGE_FREQUENCY (e);
+	    curr_count = e->count;
+	  }
     }
 
-  if (total_count)
-    {
-      scale_bbs_frequencies_gcov_type (region, n_region,
-				       total_count - entry_count,
-				       total_count);
-      scale_bbs_frequencies_gcov_type (region_copy, n_region, entry_count,
-				       total_count);
-    }
-  else
-    {
-      scale_bbs_frequencies_int (region, n_region, total_freq - entry_freq,
-				 total_freq);
-      scale_bbs_frequencies_int (region_copy, n_region, entry_freq, total_freq);
-    }
 
   if (flag_checking)
     verify_jump_thread (region_copy, n_region);
@@ -2337,11 +2355,12 @@ duplicate_thread_path (edge entry, edge
 
   edge e = make_edge (region_copy[n_region - 1], exit->dest, EDGE_FALLTHRU);
 
-  if (e) {
-    rescan_loop_exit (e, true, false);
-    e->probability = REG_BR_PROB_BASE;
-    e->count = region_copy[n_region - 1]->count;
-  }
+  if (e)
+    {
+      rescan_loop_exit (e, true, false);
+      e->probability = REG_BR_PROB_BASE;
+      e->count = region_copy[n_region - 1]->count;
+    }
 
   /* Redirect the entry and add the phi node arguments.  */
   if (entry->dest == loop->header)
Index: testsuite/ChangeLog
===================================================================
--- testsuite/ChangeLog	(revision 244527)
+++ testsuite/ChangeLog	(working copy)
@@ -1,3 +1,9 @@
+2017-01-17  Jan Hubicka  <hubicka@ucw.cz>
+
+	PR middle-end/77445
+	* gcc.dg/tree-ssa/pr77445-2.c: New testcase.
+	* gcc.dg/tree-ssa/pr77445.c: New testcase.
+
 2017-01-17  Jakub Jelinek  <jakub@redhat.com>
 
 	* g++.dg/tree-ssa/ssa-dse-2.C (size_t): Typedef to __SIZE_TYPE__
Index: testsuite/gcc.dg/tree-ssa/pr77445-2.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr77445-2.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr77445-2.c	(working copy)
@@ -0,0 +1,123 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-thread1-details-blocks-stats" } */
+typedef enum STATES {
+	START=0,
+	INVALID,
+	S1,
+	S2,
+	INT,
+	FLOAT ,
+	EXPONENT,
+	SCIENTIFIC,
+	NUM_STATES
+} state_e ;
+
+typedef unsigned char u8;
+typedef unsigned int  u32;
+
+static u8 is_digit(u8 c) {
+	return (u8)((c>='0') & (c<='9')) ? 1 : 0;
+}
+
+enum STATES FMS( u8 **in , u32 *transitions) {
+	u8 *str = *in;
+	u8 NEXT_SYMBOL;
+	enum STATES state=START;
+	for( ; *str && state != INVALID; str++ ) {
+		NEXT_SYMBOL = *str;
+		if (NEXT_SYMBOL==',') /* end of this input */ {
+			transitions[state]++;
+			str++;
+			break;
+		}
+		switch(state) {
+		case START:
+			if(is_digit(NEXT_SYMBOL)) {
+				state = INT;
+			}
+			else if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
+				state = S1;
+			}
+			else if( NEXT_SYMBOL == '.' ) {
+				state = FLOAT ;
+			}
+			else {
+				state = INVALID;
+			}
+			transitions[START]++;
+			break;
+		case S1:
+			if(is_digit(NEXT_SYMBOL)) {
+				state = INT;
+				transitions[S1]++;
+			}
+			else if( NEXT_SYMBOL == '.' ) {
+				state = FLOAT ;
+				transitions[S1]++;
+			}
+			else {
+				state = INVALID;
+				transitions[S1]++;
+			}
+			break;
+		case INT:
+			if( NEXT_SYMBOL == '.' ) {
+				state = FLOAT ;
+				transitions[INT]++;
+			}
+			else if(!is_digit(NEXT_SYMBOL)) {
+				state = INVALID;
+				transitions[INT]++;
+			}
+			break;
+		case FLOAT :
+			if( NEXT_SYMBOL == 'E' || NEXT_SYMBOL == 'e' ) {
+				state = S2;
+				transitions[FLOAT ]++;
+			}
+			else if(!is_digit(NEXT_SYMBOL)) {
+				state = INVALID;
+				transitions[FLOAT ]++;
+			}
+			break;
+		case S2:
+			if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
+				state = EXPONENT;
+				transitions[S2]++;
+			}
+			else {
+				state = INVALID;
+				transitions[S2]++;
+			}
+			break;
+		case EXPONENT:
+			if(is_digit(NEXT_SYMBOL)) {
+				state = SCIENTIFIC;
+				transitions[EXPONENT]++;
+			}
+			else {
+				state = INVALID;
+				transitions[EXPONENT]++;
+			}
+			break;
+		case SCIENTIFIC:
+			if(!is_digit(NEXT_SYMBOL)) {
+				state = INVALID;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+	if (state==INVALID)
+		transitions[INVALID]++;
+	
+	*in = str;
+	return state;
+}
+
+/* The profile is not updated perfectly because it is inconsitent from
+   profile estimation stage. But the number of inconsistencies should not
+   increase much.  */
+/* { dg-final { scan-tree-dump "Jumps threaded: 16" "thread1" } } */
+/* { dg-final { scan-tree-dump-times "Invalid sum" 2 "thread1" } } */
Index: testsuite/gcc.dg/tree-ssa/pr77445.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr77445.c	(revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr77445.c	(working copy)
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-thread3-details-blocks -fno-early-inlining -fno-tree-vrp -fno-tree-dominator-opts" } */
+
+static int a;
+static int b;
+void test2 ();
+void
+test ()
+{
+  b = 7;
+}
+
+void
+main (int argc)
+{
+  if (argc)
+    {
+      a = 7;
+      test ();
+    }
+  else
+    a = 0;
+  if (a)
+    test2 ();
+  if (b)
+    test2 ();
+}
+/* { dg-final { scan-tree-dump-times "Registering FSM jump thread" 2 "thread3" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "thread3" } } */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]