Optimize df_worklist_dataflow
Jan Hubicka
hubicka@ucw.cz
Sat Jun 12 14:46:00 GMT 2010
Hi,
according to oprofile, bitmap_ior_into is one of most commonly called functions:
46060 4.0246 lto1 htab_find_slot_with_hash
17385 1.5191 lto1 bitmap_ior_into
15324 1.3390 lto1 bitmap_set_bit_1
12731 1.1124 lto1 fast_dce.466329.constprop.608
12663 1.1065 lto1 df_worklist_dataflow
12400 1.0835 lto1 df_note_compute.41166.3750
11505 1.0053 lto1 bitmap_clear_bit
11216 0.9800 lto1 ggc_internal_alloc_stat
10745 0.9389 lto1 htab_find_with_hash
9795 0.8559 lto1 record_reg_classes.103257.constprop.831.3374
9529 0.8326 lto1 bitmap_ior_and_compl
9395 0.8209 lto1 walk_tree_1.constprop.798
9223 0.8059 lto1 bitmap_and
9037 0.7896 lto1 bitmap_copy
8844 0.7728 lto1 execute_function_todo.150826.4276
8117 0.7092 lto1 extract_insn
looking at callgrind, the most common callers are:
104,443,839 < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_rd_confluence_n (53439x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
223,397,697 < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_live_confluence_n (480009x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
314,987,157 < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_lr_confluence_n (957632x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
218,567,697 * /abuild/jh/trunk/build-new/gcc/../../gcc/bitmap.c:bitmap_ior_into [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
This patch change it into:
63,254,260 < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_rd_confluence_n (29983x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
124,535,655 < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_live_confluence_n (237200x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
182,610,443 < /abuild/jh/trunk/build-new/gcc/../../gcc/df-problems.c:df_lr_confluence_n (523018x) [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
123,750,894 * /abuild/jh/trunk/build-new/gcc/../../gcc/bitmap.c:bitmap_ior_into [/abuild/jh/trunk/build-new/stage1-gcc/cc1]
It is done by making df_worklist_dataflow to track what confluences need to
computed and also to avoid computation when nothing changed (since bitmap
operations readilly give us return value if something has changed or not).
I also noticed that df_worklist_dataflow can be written using bitmap
iterator that is a lot faster than bitmap_clear_bit and find_bit operations.
I am tracking age of basic block. One age is last time when BB info has
changed and other age is last time it was re-scanned. When scanning we need to
compute confluences only of those basic blocks that changed since last
checking.
Everythign in the main dataflow loop seems performance critical, so one age
is stored in vector and the last change age (that is accessed more times)
in the AUX field of BB structure.
the oprofile now change into:
49333 3.9682 htab_find_slot_with_hash
20376 1.6390 df_worklist_dataflow
17643 1.4192 df_note_compute.41314.3750
17159 1.3802 bitmap_set_bit_1
15097 1.2144 fast_dce.467556.constprop.620
12753 1.0258 bitmap_ior_into
12063 0.9703 ggc_internal_alloc_stat
11887 0.9562 htab_find_with_hash
11782 0.9477 record_reg_classes.103545.constprop.841.3374
10908 0.8774 bitmap_clear_bit
10108 0.8131 bitmap_copy
9918 0.7978 execute_function_todo.151805.4276
9907 0.7969 walk_tree_1.constprop.804
9700 0.7802 extract_insn
Note the increase of df_worklist_dataflow. I lack much of logical
explanation for that: it is not related to iterator change (without it
situation is even worse) and I don't think the extra array lookup is that
expensive. So I guess we just collecting cache misses that was orginally
attributed to the confluence functions.
The profile is as follows:
:/* Helper function for df_worklist_dataflow.
: Propagate the dataflow forward.
: Given a BB_INDEX, do the dataflow propagation
: and set bits on for successors in PENDING
: if the out set of the dataflow has changed. */
:
:static bool
:df_worklist_propagate_forward (struct dataflow *dataflow,
: unsigned bb_index,
: unsigned *bbindex_to_postorder,
: bitmap pending,
: sbitmap considered,
: size_t age)
:{
: edge e;
: edge_iterator ei;
: basic_block bb = BASIC_BLOCK (bb_index);
69 0.0056 : bool changed = !age;
:
: /* Calculate <conf_op> of incoming edges. */
232 0.0187 : if (EDGE_COUNT (bb->preds) > 0)
127 0.0102 : FOR_EACH_EDGE (e, ei, bb->preds)
: {
187 0.0150 : if ((age <= (size_t)e->src->aux)
329 0.0265 : && TEST_BIT (considered, e->src->index))
1035 0.0833 : changed |= dataflow->problem->con_fun_n (e);
: }
12 9.7e-04 : else if (dataflow->problem->con_fun_0)
: {
2 1.6e-04 : dataflow->problem->con_fun_0 (bb);
: changed = true;
: }
:
421 0.0339 : if (changed
111 0.0089 : && dataflow->problem->trans_fun (bb_index))
: {
: /* The out set of this block has changed.
: Propagate to the outgoing blocks. */
114 0.0092 : FOR_EACH_EDGE (e, ei, bb->succs)
: {
355 0.0286 : unsigned ob_index = e->dest->index;
:
261 0.0210 : if (TEST_BIT (considered, ob_index))
: bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
: }
: return true;
: }
: return false;
:}
:
:
:/* Helper function for df_worklist_dataflow.
: Propagate the dataflow backward. */
:
:static bool
:df_worklist_propagate_backward (struct dataflow *dataflow,
: unsigned bb_index,
: unsigned *bbindex_to_postorder,
: bitmap pending,
: sbitmap considered,
: size_t age)
:{
: edge e;
: edge_iterator ei;
: basic_block bb = BASIC_BLOCK (bb_index);
41 0.0033 : bool changed = !age;
:
: /* Calculate <conf_op> of incoming edges. */
240 0.0193 : if (EDGE_COUNT (bb->succs) > 0)
229 0.0184 : FOR_EACH_EDGE (e, ei, bb->succs)
: {
147 0.0118 : if ((age <= (size_t)e->dest->aux)
417 0.0335 : && TEST_BIT (considered, e->dest->index))
1030 0.0829 : changed |= dataflow->problem->con_fun_n (e);
: }
21 0.0017 : else if (dataflow->problem->con_fun_0)
: {
: dataflow->problem->con_fun_0 (bb);
: changed = true;
: }
:
16 0.0013 : if (changed
86 0.0069 : && dataflow->problem->trans_fun (bb_index))
: {
: /* The out set of this block has changed.
: Propagate to the outgoing blocks. */
31 0.0025 : FOR_EACH_EDGE (e, ei, bb->preds)
: {
719 0.0578 : unsigned ob_index = e->src->index;
:
133 0.0107 : if (TEST_BIT (considered, ob_index))
29 0.0023 : bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
: }
: return true;
: }
: return false;
:}
:
739 0.0594 :DEF_VEC_I(size_t);
4 3.2e-04 :DEF_VEC_ALLOC_I(size_t,heap);
:
:/* This will free "pending". */
:
:static void
:df_worklist_dataflow_doublequeue (struct dataflow *dataflow,
: bitmap pending,
: sbitmap considered,
: int *blocks_in_postorder,
: unsigned *bbindex_to_postorder,
: int n_blocks)
:{
: enum df_flow_dir dir = dataflow->problem->dir;
1 8.0e-05 : int dcount = 0;
1 8.0e-05 : bitmap worklist = BITMAP_ALLOC (&df_bitmap_obstack);
1 8.0e-05 : size_t age = 0;
: bool changed;
: VEC(size_t, heap) *last_age = NULL;
: size_t prev_age;
: basic_block bb;
: int i;
:
: VEC_safe_grow_cleared (size_t, heap, last_age, n_blocks);
:
: /* Double-queueing. Worklist is for the current iteration,
: and pending is for the next. */
12 9.7e-04 : while (!bitmap_empty_p (pending))
: {
: bitmap_iterator bi;
: unsigned int index;
:
: /* Swap pending and worklist. */
: bitmap temp = worklist;
: worklist = pending;
: pending = temp;
:
: EXECUTE_IF_SET_IN_BITMAP (worklist, 0, index, bi)
: {
: unsigned bb_index;
338 0.0272 : dcount++;
:
957 0.0770 : bb_index = blocks_in_postorder[index];
3 2.4e-04 : bb = BASIC_BLOCK (bb_index);
4 3.2e-04 : prev_age = VEC_index (size_t, last_age, index);
116 0.0093 : if (dir == DF_FORWARD)
: changed = df_worklist_propagate_forward (dataflow, bb_index,
: bbindex_to_postorder,
: pending, considered,
: prev_age);
: else
: changed = df_worklist_propagate_backward (dataflow, bb_index,
: bbindex_to_postorder,
: pending, considered,
: prev_age);
: age++;
: if (changed)
687 0.0553 : bb->aux = (void *)age;
: VEC_replace (size_t, last_age, index, age);
: age++;
: }
10 8.0e-04 : bitmap_clear (worklist);
: }
188 0.0151 : for (i = 0; i < n_blocks; i++)
168 0.0135 : BASIC_BLOCK (blocks_in_postorder[i])->aux = NULL;
:
: BITMAP_FREE (worklist);
: BITMAP_FREE (pending);
: VEC_free (size_t, heap, last_age);
:
: /* Dump statistics. */
4 3.2e-04 : if (dump_file)
: fprintf (dump_file, "df_worklist_dataflow_doublequeue:"
: "n_basic_blocks %d n_edges %d"
: " count %d (%5.2g)\n",
: n_basic_blocks, n_edges,
: dcount, dcount / (float)n_basic_blocks);
:}
Main change is in dispatch to con_fun_n
:static void
:df_worklist_propagate_forward (struct dataflow *dataflow,
: unsigned bb_index,
: unsigned *bbindex_to_postorder,
: bitmap pending,
: sbitmap considered)
:{
: edge e;
: edge_iterator ei;
91 0.0069 : basic_block bb = BASIC_BLOCK (bb_index);
:
: /* Calculate <conf_op> of incoming edges. */
179 0.0136 : if (EDGE_COUNT (bb->preds) > 0)
: FOR_EACH_EDGE (e, ei, bb->preds)
: {
230 0.0175 : if (TEST_BIT (considered, e->src->index))
489 0.0372 : dataflow->problem->con_fun_n (e);
: }
6 4.6e-04 : else if (dataflow->problem->con_fun_0)
319 0.0243 : dataflow->problem->con_fun_0 (bb);
:
190 0.0145 : if (dataflow->problem->trans_fun (bb_index))
: {
: /* The out set of this block has changed.
: Propagate to the outgoing blocks. */
170 0.0129 : FOR_EACH_EDGE (e, ei, bb->succs)
: {
516 0.0392 : unsigned ob_index = e->dest->index;
:
69 0.0052 : if (TEST_BIT (considered, ob_index))
38 0.0029 : bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
: }
: }
:}
We probably should consider using macros (or tempaltes) to avoid indirect calls
here and just spcialize the dataflow function for specific problems.
Overall compile time is improved, from 10m28s to 10m5s.
Bootstrapped/regtested x86_64-linux, I am running the df checking bootstrap now.
Seems to make sense?
Honza
* df-problems.c (df_rd_confluence_n, df_lr_confluence_n, df_live_confluence_n,
df_byte_lr_confluence_n, df_md_confluence_n): Return true if something changed.
* df.h (df_confluence_function_n): Return bool.
* df-core.c (df_worklist_propagate_forward, df_worklist_propagate_backward):
track changes and ages.
(df_worklist_dataflow_doublequeue): Use bitmap iterator for main walk;
track ages.
* dse.c (dse_confluence_n): Return always true.
Index: df-problems.c
===================================================================
--- df-problems.c (revision 160661)
+++ df-problems.c (working copy)
@@ -479,14 +480,15 @@ df_rd_init_solution (bitmap all_blocks)
/* In of target gets or of out of source. */
-static void
+static bool
df_rd_confluence_n (edge e)
{
bitmap op1 = &df_rd_get_bb_info (e->dest->index)->in;
bitmap op2 = &df_rd_get_bb_info (e->src->index)->out;
+ bool changed = false;
if (e->flags & EDGE_FAKE)
- return;
+ return false;
if (e->flags & EDGE_EH)
{
@@ -508,11 +510,12 @@ df_rd_confluence_n (edge e)
DF_DEFS_BEGIN (regno),
DF_DEFS_COUNT (regno));
}
- bitmap_ior_into (op1, &tmp);
+ changed |= bitmap_ior_into (op1, &tmp);
bitmap_clear (&tmp);
+ return changed;
}
else
- bitmap_ior_into (op1, op2);
+ return bitmap_ior_into (op1, op2);
}
@@ -966,21 +969,22 @@ df_lr_confluence_0 (basic_block bb)
/* Confluence function that ignores fake edges. */
-static void
+static bool
df_lr_confluence_n (edge e)
{
bitmap op1 = &df_lr_get_bb_info (e->src->index)->out;
bitmap op2 = &df_lr_get_bb_info (e->dest->index)->in;
+ bool changed = false;
/* Call-clobbered registers die across exception and call edges. */
/* ??? Abnormal call edges ignored for the moment, as this gets
confused by sibling call edges, which crashes reg-stack. */
if (e->flags & EDGE_EH)
- bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
+ changed = bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
else
- bitmap_ior_into (op1, op2);
+ changed = bitmap_ior_into (op1, op2);
- bitmap_ior_into (op1, &df->hardware_regs_used);
+ return bitmap_ior_into (op1, &df->hardware_regs_used) || changed;
}
@@ -1503,16 +1507,16 @@ df_live_init (bitmap all_blocks)
/* Forward confluence function that ignores fake edges. */
-static void
+static bool
df_live_confluence_n (edge e)
{
bitmap op1 = &df_live_get_bb_info (e->dest->index)->in;
bitmap op2 = &df_live_get_bb_info (e->src->index)->out;
if (e->flags & EDGE_FAKE)
- return;
+ return false;
- bitmap_ior_into (op1, op2);
+ return bitmap_ior_into (op1, op2);
}
@@ -2710,23 +2714,24 @@ df_byte_lr_confluence_0 (basic_block bb)
/* Confluence function that ignores fake edges. */
-static void
+static bool
df_byte_lr_confluence_n (edge e)
{
struct df_byte_lr_problem_data *problem_data
= (struct df_byte_lr_problem_data *)df_byte_lr->problem_data;
bitmap op1 = &df_byte_lr_get_bb_info (e->src->index)->out;
bitmap op2 = &df_byte_lr_get_bb_info (e->dest->index)->in;
+ bool changed = false;
/* Call-clobbered registers die across exception and call edges. */
/* ??? Abnormal call edges ignored for the moment, as this gets
confused by sibling call edges, which crashes reg-stack. */
if (e->flags & EDGE_EH)
- bitmap_ior_and_compl_into (op1, op2, &problem_data->invalidated_by_call);
+ changed = bitmap_ior_and_compl_into (op1, op2, &problem_data->invalidated_by_call);
else
- bitmap_ior_into (op1, op2);
+ changed = bitmap_ior_into (op1, op2);
- bitmap_ior_into (op1, &problem_data->hardware_regs_used);
+ return bitmap_ior_into (op1, &problem_data->hardware_regs_used) || changed;
}
@@ -4426,19 +4379,19 @@ df_md_confluence_0 (basic_block bb)
/* In of target gets or of out of source. */
-static void
+static bool
df_md_confluence_n (edge e)
{
bitmap op1 = &df_md_get_bb_info (e->dest->index)->in;
bitmap op2 = &df_md_get_bb_info (e->src->index)->out;
if (e->flags & EDGE_FAKE)
- return;
+ return false;
if (e->flags & EDGE_EH)
- bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
+ return bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset);
else
- bitmap_ior_into (op1, op2);
+ return bitmap_ior_into (op1, op2);
}
/* Free all storage associated with the problem. */
Index: df.h
===================================================================
--- df.h (revision 160661)
+++ df.h (working copy)
@@ -223,7 +223,7 @@ typedef void (*df_dataflow_function) (st
typedef void (*df_confluence_function_0) (basic_block);
/* Confluence operator for blocks with 1 or more out (or in) edges. */
-typedef void (*df_confluence_function_n) (edge);
+typedef bool (*df_confluence_function_n) (edge);
/* Transfer function for blocks. */
typedef bool (*df_transfer_function) (int);
Index: df-core.c
===================================================================
--- df-core.c (revision 160661)
+++ df-core.c (working copy)
@@ -858,28 +858,35 @@ struct rtl_opt_pass pass_df_finish =
and set bits on for successors in PENDING
if the out set of the dataflow has changed. */
-static void
+static bool
df_worklist_propagate_forward (struct dataflow *dataflow,
unsigned bb_index,
unsigned *bbindex_to_postorder,
bitmap pending,
- sbitmap considered)
+ sbitmap considered,
+ size_t age)
{
edge e;
edge_iterator ei;
basic_block bb = BASIC_BLOCK (bb_index);
+ bool changed = !age;
/* Calculate <conf_op> of incoming edges. */
if (EDGE_COUNT (bb->preds) > 0)
FOR_EACH_EDGE (e, ei, bb->preds)
{
- if (TEST_BIT (considered, e->src->index))
- dataflow->problem->con_fun_n (e);
+ if ((age <= (size_t)e->src->aux)
+ && TEST_BIT (considered, e->src->index))
+ changed |= dataflow->problem->con_fun_n (e);
}
else if (dataflow->problem->con_fun_0)
- dataflow->problem->con_fun_0 (bb);
+ {
+ dataflow->problem->con_fun_0 (bb);
+ changed = true;
+ }
- if (dataflow->problem->trans_fun (bb_index))
+ if (changed
+ && dataflow->problem->trans_fun (bb_index))
{
/* The out set of this block has changed.
Propagate to the outgoing blocks. */
@@ -890,35 +897,44 @@ df_worklist_propagate_forward (struct da
if (TEST_BIT (considered, ob_index))
bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
}
+ return true;
}
+ return false;
}
/* Helper function for df_worklist_dataflow.
Propagate the dataflow backward. */
-static void
+static bool
df_worklist_propagate_backward (struct dataflow *dataflow,
unsigned bb_index,
unsigned *bbindex_to_postorder,
bitmap pending,
- sbitmap considered)
+ sbitmap considered,
+ size_t age)
{
edge e;
edge_iterator ei;
basic_block bb = BASIC_BLOCK (bb_index);
+ bool changed = !age;
/* Calculate <conf_op> of incoming edges. */
if (EDGE_COUNT (bb->succs) > 0)
FOR_EACH_EDGE (e, ei, bb->succs)
{
- if (TEST_BIT (considered, e->dest->index))
- dataflow->problem->con_fun_n (e);
+ if ((age <= (size_t)e->dest->aux)
+ && TEST_BIT (considered, e->dest->index))
+ changed |= dataflow->problem->con_fun_n (e);
}
else if (dataflow->problem->con_fun_0)
- dataflow->problem->con_fun_0 (bb);
+ {
+ dataflow->problem->con_fun_0 (bb);
+ changed = true;
+ }
- if (dataflow->problem->trans_fun (bb_index))
+ if (changed
+ && dataflow->problem->trans_fun (bb_index))
{
/* The out set of this block has changed.
Propagate to the outgoing blocks. */
@@ -929,10 +945,13 @@ df_worklist_propagate_backward (struct d
if (TEST_BIT (considered, ob_index))
bitmap_set_bit (pending, bbindex_to_postorder[ob_index]);
}
+ return true;
}
+ return false;
}
-
+DEF_VEC_I(size_t);
+DEF_VEC_ALLOC_I(size_t,heap);
/* This will free "pending". */
@@ -941,46 +960,65 @@ df_worklist_dataflow_doublequeue (struct
bitmap pending,
sbitmap considered,
int *blocks_in_postorder,
- unsigned *bbindex_to_postorder)
+ unsigned *bbindex_to_postorder,
+ int n_blocks)
{
enum df_flow_dir dir = dataflow->problem->dir;
int dcount = 0;
bitmap worklist = BITMAP_ALLOC (&df_bitmap_obstack);
+ size_t age = 0;
+ bool changed;
+ VEC(size_t, heap) *last_age = NULL;
+ size_t prev_age;
+ basic_block bb;
+ int i;
+
+ VEC_safe_grow_cleared (size_t, heap, last_age, n_blocks);
/* Double-queueing. Worklist is for the current iteration,
and pending is for the next. */
while (!bitmap_empty_p (pending))
{
+ bitmap_iterator bi;
+ unsigned int index;
+
/* Swap pending and worklist. */
bitmap temp = worklist;
worklist = pending;
pending = temp;
- do
+ EXECUTE_IF_SET_IN_BITMAP (worklist, 0, index, bi)
{
- int index;
unsigned bb_index;
dcount++;
- index = bitmap_first_set_bit (worklist);
- bitmap_clear_bit (worklist, index);
-
bb_index = blocks_in_postorder[index];
-
+ bb = BASIC_BLOCK (bb_index);
+ prev_age = VEC_index (size_t, last_age, index);
if (dir == DF_FORWARD)
- df_worklist_propagate_forward (dataflow, bb_index,
- bbindex_to_postorder,
- pending, considered);
+ changed = df_worklist_propagate_forward (dataflow, bb_index,
+ bbindex_to_postorder,
+ pending, considered,
+ prev_age);
else
- df_worklist_propagate_backward (dataflow, bb_index,
- bbindex_to_postorder,
- pending, considered);
+ changed = df_worklist_propagate_backward (dataflow, bb_index,
+ bbindex_to_postorder,
+ pending, considered,
+ prev_age);
+ age++;
+ if (changed)
+ bb->aux = (void *)age;
+ VEC_replace (size_t, last_age, index, age);
+ age++;
}
- while (!bitmap_empty_p (worklist));
+ bitmap_clear (worklist);
}
+ for (i = 0; i < n_blocks; i++)
+ BASIC_BLOCK (blocks_in_postorder[i])->aux = NULL;
BITMAP_FREE (worklist);
BITMAP_FREE (pending);
+ VEC_free (size_t, heap, last_age);
/* Dump statistics. */
if (dump_file)
@@ -1044,8 +1082,8 @@ df_worklist_dataflow (struct dataflow *d
/* Solve it. */
df_worklist_dataflow_doublequeue (dataflow, pending, considered,
blocks_in_postorder,
- bbindex_to_postorder);
-
+ bbindex_to_postorder,
+ n_blocks);
sbitmap_free (considered);
free (bbindex_to_postorder);
}
Index: dse.c
===================================================================
--- dse.c (revision 160661)
+++ dse.c (working copy)
@@ -3396,7 +3396,7 @@ dse_confluence_0 (basic_block bb)
out set of the src of E. If the various in or out sets are not
there, that means they are all ones. */
-static void
+static bool
dse_confluence_n (edge e)
{
bb_info_t src_info = bb_table[e->src->index];
@@ -3412,6 +3412,7 @@ dse_confluence_n (edge e)
bitmap_copy (src_info->out, dest_info->in);
}
}
+ return true;
}
More information about the Gcc-patches
mailing list