This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Fix (some of) profile updating after jump threading
- From: Jan Hubicka <hubicka at ucw dot cz>
- To: gcc-patches at gcc dot gnu dot org
- Date: Tue, 17 Jan 2017 14:14:32 +0100
- Subject: Fix (some of) profile updating after jump threading
- Authentication-results: sourceware.org; auth=none
Hi,
the testcase is about jump threading confusing profile enough so many edges
are considered cold. The first problem occurs in thread1 pass where
first remove_ctrl_stmt_and_useless_edges does not ouptated outgoing edge
probability after removing the other edges (so we end up with a single
succ bb having outgoing edge probability less than REG_BR_PROB_BASE)
and second is duplicate_thread_path scaling uniformly profiles of all
bbs in the duplicate and original path. This makes sense only when the
original path has no side entry edges and moreover is always wrong for
last BB.
The job of updating here is quite easy, just keep track of frequency/count
of the patch being duplicated and subtract it from the offline copy.
This of course assumes that all branches along the patch except for last
one remains unoptimized and with same exit probabilities.
In the last BB it is necessary to re-distribute exit edges probabilities
as we know its outcome. Fortunately there already is
update_bb_profile_for_threading which does the job for RTL threader.
Mainline gets following mismatch counts:
q.c.103t.thread1 21
q.c.104t.vrp1 40
q.c.106t.dce2 19
..
q.c.112t.mergephi3 17
q.c.113t.phiopt1 17
With patch I get:
q.c.103t.thread1 2
q.c.104t.vrp1 8
q.c.106t.dce2 6
...
q.c.112t.mergephi3 5
...
q.c.113t.phiopt1 5
...
q.c.118t.thread2 6
q.c.178t.thread3 7
....
q.c.182t.vrp2 17
q.c.183t.phicprop2 10
So VRP's jump threading is now the main source of inconsistencies.
The bug there seems ot be that Theresa's code does not care to update
edge probabilities when profile info is absent.
For tramp3d the numbers are:
tramp3d-v4.ii.094t.cunrolli 17
...
tramp3d-v4.ii.101t.fre3 68
tramp3d-v4.ii.102t.mergephi2 66
tramp3d-v4.ii.103t.thread1 335
tramp3d-v4.ii.104t.vrp1 605
tramp3d-v4.ii.106t.dce2 265
tramp3d-v4.ii.107t.stdarg 265
tramp3d-v4.ii.108t.cdce 269
tramp3d-v4.ii.109t.cselim 269
tramp3d-v4.ii.110t.copyprop1 259
tramp3d-v4.ii.111t.ifcombine 287
tramp3d-v4.ii.112t.mergephi3 273
...
tramp3d-v4.ii.115t.ch2 275
tramp3d-v4.ii.116t.cplxlower1 275
tramp3d-v4.ii.117t.sra 275
tramp3d-v4.ii.118t.thread2 281
tramp3d-v4.ii.119t.dom2 302
tramp3d-v4.ii.120t.isolate-paths 319
tramp3d-v4.ii.121t.phicprop1 309
tramp3d-v4.ii.122t.dse2 309
tramp3d-v4.ii.123t.reassoc1 311
tramp3d-v4.ii.124t.dce3 310
tramp3d-v4.ii.125t.forwprop3 314
...
tramp3d-v4.ii.131t.lim2 316
...
tramp3d-v4.ii.134t.pre 565
tramp3d-v4.ii.135t.sink 299
tramp3d-v4.ii.139t.dce4 299
tramp3d-v4.ii.140t.fix_loops 299
tramp3d-v4.ii.141t.loop 281
tramp3d-v4.ii.142t.loopinit 281
tramp3d-v4.ii.143t.unswitch 429
tramp3d-v4.ii.144t.sccp 429
tramp3d-v4.ii.145t.lsplit 431
tramp3d-v4.ii.146t.cddce2 431
tramp3d-v4.ii.147t.ldist 443
tramp3d-v4.ii.148t.copyprop2 443
tramp3d-v4.ii.154t.ivcanon 445
tramp3d-v4.ii.157t.ch_vect 445
tramp3d-v4.ii.158t.ifcvt 467
tramp3d-v4.ii.159t.vect 1179
....
tramp3d-v4.ii.162t.cunroll 1090
...
tramp3d-v4.ii.167t.loopdone 1085
tramp3d-v4.ii.171t.veclower21 1103
tramp3d-v4.ii.173t.printf-return-value2 1103
tramp3d-v4.ii.174t.reassoc2 1103
tramp3d-v4.ii.175t.slsr 1103
tramp3d-v4.ii.176t.split-paths 1103
tramp3d-v4.ii.178t.thread3 1143
tramp3d-v4.ii.179t.dom3 1151
tramp3d-v4.ii.180t.strlen 1151
tramp3d-v4.ii.181t.thread4 1173
tramp3d-v4.ii.182t.vrp2 2263
tramp3d-v4.ii.183t.phicprop2 1078
tramp3d-v4.ii.184t.dse3 1078
tramp3d-v4.ii.185t.cddce3 1078
tramp3d-v4.ii.186t.forwprop4 1078
tramp3d-v4.ii.187t.phiopt3 1078
tramp3d-v4.ii.188t.fab1 1079
tramp3d-v4.ii.189t.widening_mul 1079
tramp3d-v4.ii.190t.store-merging 1079
tramp3d-v4.ii.191t.tailc 1079
tramp3d-v4.ii.192t.dce7 1079
tramp3d-v4.ii.193t.crited2 1079
tramp3d-v4.ii.195t.uncprop1 1079
tramp3d-v4.ii.196t.local-pure-const2 1079
tramp3d-v4.ii.224t.ehcleanup2 588
tramp3d-v4.ii.225t.resx 2493
tramp3d-v4.ii.226t.nrv 2493
tramp3d-v4.ii.227t.optimized 2418
On mainline and:
tramp3d-v4.ii.094t.cunrolli 17
..
tramp3d-v4.ii.101t.fre3 68
tramp3d-v4.ii.102t.mergephi2 66
tramp3d-v4.ii.103t.thread1 129
tramp3d-v4.ii.104t.vrp1 324
tramp3d-v4.ii.106t.dce2 196
tramp3d-v4.ii.107t.stdarg 196
tramp3d-v4.ii.108t.cdce 200
..
tramp3d-v4.ii.111t.ifcombine 228
...
tramp3d-v4.ii.115t.ch2 230
...
tramp3d-v4.ii.119t.dom2 255
tramp3d-v4.ii.120t.isolate-paths 272
tramp3d-v4.ii.121t.phicprop1 264
tramp3d-v4.ii.122t.dse2 264
tramp3d-v4.ii.123t.reassoc1 266
tramp3d-v4.ii.124t.dce3 265
tramp3d-v4.ii.125t.forwprop3 269
..
tramp3d-v4.ii.131t.lim2 271
..
tramp3d-v4.ii.134t.pre 515
tramp3d-v4.ii.135t.sink 272
tramp3d-v4.ii.141t.loop 254
tramp3d-v4.ii.142t.loopinit 254
tramp3d-v4.ii.143t.unswitch 402
tramp3d-v4.ii.144t.sccp 402
tramp3d-v4.ii.145t.lsplit 404
tramp3d-v4.ii.146t.cddce2 404
tramp3d-v4.ii.147t.ldist 416
tramp3d-v4.ii.148t.copyprop2 416
tramp3d-v4.ii.154t.ivcanon 418
tramp3d-v4.ii.157t.ch_vect 418
tramp3d-v4.ii.158t.ifcvt 440
tramp3d-v4.ii.159t.vect 1152
...
tramp3d-v4.ii.162t.cunroll 1055
tramp3d-v4.ii.163t.slp1 1055
tramp3d-v4.ii.165t.ivopts 1055
tramp3d-v4.ii.166t.lim4 1056
tramp3d-v4.ii.167t.loopdone 1050
tramp3d-v4.ii.168t.no_loop 18
tramp3d-v4.ii.169t.slp2 18
tramp3d-v4.ii.171t.veclower21 1068
tramp3d-v4.ii.173t.printf-return-value2 1068
tramp3d-v4.ii.174t.reassoc2 1068
tramp3d-v4.ii.175t.slsr 1068
tramp3d-v4.ii.176t.split-paths 1068
tramp3d-v4.ii.178t.thread3 1086
tramp3d-v4.ii.179t.dom3 1097
tramp3d-v4.ii.180t.strlen 1097
tramp3d-v4.ii.181t.thread4 1103
tramp3d-v4.ii.182t.vrp2 2142
tramp3d-v4.ii.183t.phicprop2 1039
tramp3d-v4.ii.184t.dse3 1039
tramp3d-v4.ii.185t.cddce3 1039
tramp3d-v4.ii.186t.forwprop4 1039
tramp3d-v4.ii.187t.phiopt3 1039
tramp3d-v4.ii.188t.fab1 1040
tramp3d-v4.ii.189t.widening_mul 1040
tramp3d-v4.ii.190t.store-merging 1040
tramp3d-v4.ii.191t.tailc 1040
tramp3d-v4.ii.192t.dce7 1040
tramp3d-v4.ii.193t.crited2 1040
tramp3d-v4.ii.195t.uncprop1 1040
tramp3d-v4.ii.196t.local-pure-const2 1040
tramp3d-v4.ii.224t.ehcleanup2 575
tramp3d-v4.ii.225t.resx 2455
tramp3d-v4.ii.226t.nrv 2455
tramp3d-v4.ii.227t.optimized 2380
tramp3d-v4.ii.311t.statistics 0
So while thread passes themselves now introduce fewer inconsistencies,
the other optimizations almost even it up :)
Bootstrapped/regtested x86_64-linux, comitted.
PR middle-end/77445
* tree-ssa-threadupdate.c (remove_ctrl_stmt_and_useless_edges):
correctly set frequency of oudgoing edge.
(duplicate_thread_path): Fix profile updating.
* gcc.dg/tree-ssa/pr77445-2.c: New testcase.
* gcc.dg/tree-ssa/pr77445.c: New testcase.
Index: tree-ssa-threadupdate.c
===================================================================
--- tree-ssa-threadupdate.c (revision 244508)
+++ tree-ssa-threadupdate.c (working copy)
@@ -301,7 +301,11 @@ remove_ctrl_stmt_and_useless_edges (basi
remove_edge (e);
}
else
- ei_next (&ei);
+ {
+ e->probability = REG_BR_PROB_BASE;
+ e->count = bb->count;
+ ei_next (&ei);
+ }
}
/* If the remaining edge is a loop exit, there must have
@@ -2212,8 +2216,8 @@ duplicate_thread_path (edge entry, edge
struct loop *loop = entry->dest->loop_father;
edge exit_copy;
edge redirected;
- int total_freq = 0, entry_freq = 0;
- gcov_type total_count = 0, entry_count = 0;
+ int curr_freq;
+ gcov_type curr_count;
if (!can_copy_bbs_p (region, n_region))
return false;
@@ -2240,27 +2244,6 @@ duplicate_thread_path (edge entry, edge
free_region_copy = true;
}
- if (entry->dest->count)
- {
- total_count = entry->dest->count;
- entry_count = entry->count;
- /* Fix up corner cases, to avoid division by zero or creation of negative
- frequencies. */
- if (entry_count > total_count)
- entry_count = total_count;
- }
- else
- {
- total_freq = entry->dest->frequency;
- entry_freq = EDGE_FREQUENCY (entry);
- /* Fix up corner cases, to avoid division by zero or creation of negative
- frequencies. */
- if (total_freq == 0)
- total_freq = 1;
- else if (entry_freq > total_freq)
- entry_freq = total_freq;
- }
-
copy_bbs (region, n_region, region_copy, &exit, 1, &exit_copy, loop,
split_edge_bb_loc (entry), false);
@@ -2270,17 +2253,61 @@ duplicate_thread_path (edge entry, edge
invalidating the property that is propagated by executing all the blocks of
the jump-thread path in order. */
+ curr_count = entry->count;
+ curr_freq = EDGE_FREQUENCY (entry);
+
for (i = 0; i < n_region; i++)
{
edge e;
edge_iterator ei;
basic_block bb = region_copy[i];
+ /* Watch inconsistent profile. */
+ if (curr_count > region[i]->count)
+ curr_count = region[i]->count;
+ if (curr_freq > region[i]->frequency)
+ curr_freq = region[i]->frequency;
+ /* Scale current BB. */
+ if (region[i]->count)
+ {
+ /* In the middle of the path we only scale the frequencies.
+ In last BB we need to update probabilities of outgoing edges
+ because we know which one is taken at the threaded path. */
+ if (i + 1 != n_region)
+ scale_bbs_frequencies_gcov_type (region + i, 1,
+ region[i]->count - curr_count,
+ region[i]->count);
+ else
+ update_bb_profile_for_threading (region[i],
+ curr_freq, curr_count,
+ exit);
+ scale_bbs_frequencies_gcov_type (region_copy + i, 1, curr_count,
+ region_copy[i]->count);
+ }
+ else if (region[i]->frequency)
+ {
+ if (i + 1 != n_region)
+ scale_bbs_frequencies_int (region + i, 1,
+ region[i]->frequency - curr_freq,
+ region[i]->frequency);
+ else
+ update_bb_profile_for_threading (region[i],
+ curr_freq, curr_count,
+ exit);
+ scale_bbs_frequencies_int (region_copy + i, 1, curr_freq,
+ region_copy[i]->frequency);
+ }
+
if (single_succ_p (bb))
{
/* Make sure the successor is the next node in the path. */
gcc_assert (i + 1 == n_region
|| region_copy[i + 1] == single_succ_edge (bb)->dest);
+ if (i + 1 != n_region)
+ {
+ curr_freq = EDGE_FREQUENCY (single_succ_edge (bb));
+ curr_count = single_succ_edge (bb)->count;
+ }
continue;
}
@@ -2307,22 +2334,13 @@ duplicate_thread_path (edge entry, edge
if (orig)
redirect_edge_and_branch_force (e, orig);
}
+ else
+ {
+ curr_freq = EDGE_FREQUENCY (e);
+ curr_count = e->count;
+ }
}
- if (total_count)
- {
- scale_bbs_frequencies_gcov_type (region, n_region,
- total_count - entry_count,
- total_count);
- scale_bbs_frequencies_gcov_type (region_copy, n_region, entry_count,
- total_count);
- }
- else
- {
- scale_bbs_frequencies_int (region, n_region, total_freq - entry_freq,
- total_freq);
- scale_bbs_frequencies_int (region_copy, n_region, entry_freq, total_freq);
- }
if (flag_checking)
verify_jump_thread (region_copy, n_region);
@@ -2337,11 +2355,12 @@ duplicate_thread_path (edge entry, edge
edge e = make_edge (region_copy[n_region - 1], exit->dest, EDGE_FALLTHRU);
- if (e) {
- rescan_loop_exit (e, true, false);
- e->probability = REG_BR_PROB_BASE;
- e->count = region_copy[n_region - 1]->count;
- }
+ if (e)
+ {
+ rescan_loop_exit (e, true, false);
+ e->probability = REG_BR_PROB_BASE;
+ e->count = region_copy[n_region - 1]->count;
+ }
/* Redirect the entry and add the phi node arguments. */
if (entry->dest == loop->header)
Index: testsuite/ChangeLog
===================================================================
--- testsuite/ChangeLog (revision 244527)
+++ testsuite/ChangeLog (working copy)
@@ -1,3 +1,9 @@
+2017-01-17 Jan Hubicka <hubicka@ucw.cz>
+
+ PR middle-end/77445
+ * gcc.dg/tree-ssa/pr77445-2.c: New testcase.
+ * gcc.dg/tree-ssa/pr77445.c: New testcase.
+
2017-01-17 Jakub Jelinek <jakub@redhat.com>
* g++.dg/tree-ssa/ssa-dse-2.C (size_t): Typedef to __SIZE_TYPE__
Index: testsuite/gcc.dg/tree-ssa/pr77445-2.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr77445-2.c (revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr77445-2.c (working copy)
@@ -0,0 +1,123 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-thread1-details-blocks-stats" } */
+typedef enum STATES {
+ START=0,
+ INVALID,
+ S1,
+ S2,
+ INT,
+ FLOAT ,
+ EXPONENT,
+ SCIENTIFIC,
+ NUM_STATES
+} state_e ;
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+static u8 is_digit(u8 c) {
+ return (u8)((c>='0') & (c<='9')) ? 1 : 0;
+}
+
+enum STATES FMS( u8 **in , u32 *transitions) {
+ u8 *str = *in;
+ u8 NEXT_SYMBOL;
+ enum STATES state=START;
+ for( ; *str && state != INVALID; str++ ) {
+ NEXT_SYMBOL = *str;
+ if (NEXT_SYMBOL==',') /* end of this input */ {
+ transitions[state]++;
+ str++;
+ break;
+ }
+ switch(state) {
+ case START:
+ if(is_digit(NEXT_SYMBOL)) {
+ state = INT;
+ }
+ else if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
+ state = S1;
+ }
+ else if( NEXT_SYMBOL == '.' ) {
+ state = FLOAT ;
+ }
+ else {
+ state = INVALID;
+ }
+ transitions[START]++;
+ break;
+ case S1:
+ if(is_digit(NEXT_SYMBOL)) {
+ state = INT;
+ transitions[S1]++;
+ }
+ else if( NEXT_SYMBOL == '.' ) {
+ state = FLOAT ;
+ transitions[S1]++;
+ }
+ else {
+ state = INVALID;
+ transitions[S1]++;
+ }
+ break;
+ case INT:
+ if( NEXT_SYMBOL == '.' ) {
+ state = FLOAT ;
+ transitions[INT]++;
+ }
+ else if(!is_digit(NEXT_SYMBOL)) {
+ state = INVALID;
+ transitions[INT]++;
+ }
+ break;
+ case FLOAT :
+ if( NEXT_SYMBOL == 'E' || NEXT_SYMBOL == 'e' ) {
+ state = S2;
+ transitions[FLOAT ]++;
+ }
+ else if(!is_digit(NEXT_SYMBOL)) {
+ state = INVALID;
+ transitions[FLOAT ]++;
+ }
+ break;
+ case S2:
+ if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
+ state = EXPONENT;
+ transitions[S2]++;
+ }
+ else {
+ state = INVALID;
+ transitions[S2]++;
+ }
+ break;
+ case EXPONENT:
+ if(is_digit(NEXT_SYMBOL)) {
+ state = SCIENTIFIC;
+ transitions[EXPONENT]++;
+ }
+ else {
+ state = INVALID;
+ transitions[EXPONENT]++;
+ }
+ break;
+ case SCIENTIFIC:
+ if(!is_digit(NEXT_SYMBOL)) {
+ state = INVALID;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ if (state==INVALID)
+ transitions[INVALID]++;
+
+ *in = str;
+ return state;
+}
+
+/* The profile is not updated perfectly because it is inconsitent from
+ profile estimation stage. But the number of inconsistencies should not
+ increase much. */
+/* { dg-final { scan-tree-dump "Jumps threaded: 16" "thread1" } } */
+/* { dg-final { scan-tree-dump-times "Invalid sum" 2 "thread1" } } */
Index: testsuite/gcc.dg/tree-ssa/pr77445.c
===================================================================
--- testsuite/gcc.dg/tree-ssa/pr77445.c (revision 0)
+++ testsuite/gcc.dg/tree-ssa/pr77445.c (working copy)
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-thread3-details-blocks -fno-early-inlining -fno-tree-vrp -fno-tree-dominator-opts" } */
+
+static int a;
+static int b;
+void test2 ();
+void
+test ()
+{
+ b = 7;
+}
+
+void
+main (int argc)
+{
+ if (argc)
+ {
+ a = 7;
+ test ();
+ }
+ else
+ a = 0;
+ if (a)
+ test2 ();
+ if (b)
+ test2 ();
+}
+/* { dg-final { scan-tree-dump-times "Registering FSM jump thread" 2 "thread3" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "thread3" } } */