[PATCH][RFC] Fix DF sub-CFG analysis slowness (PR39326)
Richard Biener
rguenther@suse.de
Wed Jan 15 10:53:00 GMT 2014
This improves compile time of the testcase in PR39326 at -O3 from
loop invariant motion : 24.41 (32%) usr 0.03 ( 4%) sys 24.43 (32%)
wall 148 kB ( 0%) ggc
loop unswitching : 15.16 (20%) usr 0.02 ( 3%) sys 15.30 (20%)
wall 0 kB ( 0%) ggc
TOTAL : 76.51 0.72 77.23
456534 kB
to
loop invariant motion : 0.18 ( 0%) usr 0.00 ( 0%) sys 0.16 ( 0%)
wall 148 kB ( 0%) ggc
loop unswitching : 0.01 ( 0%) usr 0.00 ( 0%) sys 0.01 ( 0%)
wall 0 kB ( 0%) ggc
TOTAL : 37.14 0.67 37.80
456534 kB
The problem with DF sub-CFG analysis (df_set_blocks) is that while
the actual analysis works on the sub-CFG the function first computes
inverted postorder and postorder of the whole function and then
prunes it, which takes 99% of the compile time for the testcase
with a lot of very simple loops.
There are two users of df_set_blocks, RTL loop invariant motion,
doing a df_analyze on each loop, and RTL loop IV analysis
(done on each loop by unswitching). The following patch fixes
the slowness when the sub-CFG is the body of a single loop
as it's reasonably easy to implement (inverted) postorder
computation for a SEME region.
Now the question is what the desired interface of this should be
(the patch is a kludge here) and if the generic df_set_blocks
interface should be retained (it doesn't prohibit you to
analyze unconnected parts of a CFG for example) - it also retains
its slowness for simple analysis cases.
I can split off a df_analyze_1 and have a df_analyze () doing what
it does now and a df_analyze_loop () combining df_set_blocks ()
and df_analyze () for example.
Any preference? Or do we want a df_set_loop ()-like interface?
df_analyze seems to be the only public interface caring about
blocks (its comment even suggests that the sub-CFG bitmap was
even part of its interface at some point).
I'll probably move the CFG order computes to cfgloop.c and
eventually make it use flow_bb_inside_loop_p instead of
the bitmap with all loop blocks (even though that will be
a little slower in the end).
Thanks,
Richard.
Index: gcc/df.h
===================================================================
*** gcc/df.h (revision 206624)
--- gcc/df.h (working copy)
*************** extern void df_set_blocks (bitmap);
*** 900,906 ****
extern void df_remove_problem (struct dataflow *);
extern void df_finish_pass (bool);
extern void df_analyze_problem (struct dataflow *, bitmap, int *, int);
! extern void df_analyze (void);
extern int df_get_n_blocks (enum df_flow_dir);
extern int *df_get_postorder (enum df_flow_dir);
extern void df_simple_dataflow (enum df_flow_dir, df_init_function,
--- 900,906 ----
extern void df_remove_problem (struct dataflow *);
extern void df_finish_pass (bool);
extern void df_analyze_problem (struct dataflow *, bitmap, int *, int);
! extern void df_analyze (struct loop *loop = NULL);
extern int df_get_n_blocks (enum df_flow_dir);
extern int *df_get_postorder (enum df_flow_dir);
extern void df_simple_dataflow (enum df_flow_dir, df_init_function,
Index: gcc/df-core.c
===================================================================
*** gcc/df-core.c (revision 206624)
--- gcc/df-core.c (working copy)
*************** are write-only operations.
*** 393,398 ****
--- 393,399 ----
#include "df.h"
#include "tree-pass.h"
#include "params.h"
+ #include "cfgloop.h"
static void *df_get_bb_info (struct dataflow *, unsigned int);
static void df_set_bb_info (struct dataflow *, unsigned int, void *);
*************** df_analyze_problem (struct dataflow *dfl
*** 1225,1235 ****
}
/* Analyze dataflow info for the basic blocks specified by the bitmap
BLOCKS, or for the whole CFG if BLOCKS is zero. */
void
! df_analyze (void)
{
bitmap current_all_blocks = BITMAP_ALLOC (&df_bitmap_obstack);
bool everything;
--- 1226,1383 ----
}
+ static int
+ loop_post_order_compute (int *post_order, struct loop *loop,
+ bitmap loop_blocks)
+ {
+ edge_iterator *stack;
+ int sp;
+ int post_order_num = 0;
+ bitmap visited;
+
+ /* Allocate stack for back-tracking up CFG. */
+ stack = XNEWVEC (edge_iterator, loop->num_nodes + 1);
+ sp = 0;
+
+ /* Allocate bitmap to track nodes that have been visited. */
+ visited = BITMAP_ALLOC (NULL);
+
+ /* Push the first edge on to the stack. */
+ stack[sp++] = ei_start (loop_preheader_edge (loop)->src->succs);
+
+ while (sp)
+ {
+ edge_iterator ei;
+ basic_block src;
+ basic_block dest;
+
+ /* Look at the edge on the top of the stack. */
+ ei = stack[sp - 1];
+ src = ei_edge (ei)->src;
+ dest = ei_edge (ei)->dest;
+
+ /* Check if the edge destination has been visited yet and mark it
+ if not so. */
+ if (bitmap_bit_p (loop_blocks, dest->index)
+ && bitmap_set_bit (visited, dest->index))
+ {
+ if (EDGE_COUNT (dest->succs) > 0)
+ /* Since the DEST node has been visited for the first
+ time, check its successors. */
+ stack[sp++] = ei_start (dest->succs);
+ else
+ post_order[post_order_num++] = dest->index;
+ }
+ else
+ {
+ if (ei_one_before_end_p (ei)
+ && src != loop_preheader_edge (loop)->src)
+ post_order[post_order_num++] = src->index;
+
+ if (!ei_one_before_end_p (ei))
+ ei_next (&stack[sp - 1]);
+ else
+ sp--;
+ }
+ }
+
+ free (stack);
+ BITMAP_FREE (visited);
+
+ return post_order_num;
+ }
+
+ int
+ loop_inverted_post_order_compute (int *post_order, struct loop *loop,
+ bitmap loop_blocks)
+ {
+ basic_block bb;
+ edge_iterator *stack;
+ int sp;
+ int post_order_num = 0;
+ bitmap visited;
+ unsigned i;
+
+ vec<edge> exits = get_loop_exit_edges (loop);
+
+ /* Allocate stack for back-tracking up CFG. */
+ stack = XNEWVEC (edge_iterator, loop->num_nodes + exits.length () + 1);
+ sp = 0;
+
+ /* Allocate bitmap to track nodes that have been visited. */
+ visited = BITMAP_ALLOC (NULL);
+
+ /* Put all loop exit blocks into the initial work list or the latches,
+ if there is no exit. */
+ if (!exits.is_empty ())
+ {
+ edge e;
+ for (i = 0; exits.iterate (i, &e); ++i)
+ stack[sp++] = ei_start (e->dest->preds);
+ }
+ else
+ {
+ stack[sp++] = ei_start (loop->header->preds);
+ bitmap_set_bit (visited, loop->header->index);
+ }
+ exits.release ();
+
+ /* The inverted traversal loop. */
+ while (sp)
+ {
+ edge_iterator ei;
+ basic_block pred;
+
+ /* Look at the edge on the top of the stack. */
+ ei = stack[sp - 1];
+ bb = ei_edge (ei)->dest;
+ pred = ei_edge (ei)->src;
+
+ /* Check if the predecessor has been visited yet and mark it
+ if not so. */
+ if (bitmap_bit_p (loop_blocks, pred->index)
+ && bitmap_set_bit (visited, pred->index))
+ {
+ if (EDGE_COUNT (pred->preds) > 0)
+ /* Since the predecessor node has been visited for the first
+ time, check its predecessors. */
+ stack[sp++] = ei_start (pred->preds);
+ else
+ post_order[post_order_num++] = pred->index;
+ }
+ else
+ {
+ if (bitmap_bit_p (loop_blocks, bb->index)
+ && ei_one_before_end_p (ei))
+ post_order[post_order_num++] = bb->index;
+
+ if (!ei_one_before_end_p (ei))
+ ei_next (&stack[sp - 1]);
+ else
+ sp--;
+ }
+ }
+
+ /* Detect any infinite loop and activate the kludge.
+ Note that this doesn't check EXIT_BLOCK itself
+ since EXIT_BLOCK is always added after the outer do-while loop. */
+ /* ??? Assert this doesn't happen, thus that we visited all
+ loop_blocks. */
+ bitmap_compl_and_into (visited, loop_blocks);
+ gcc_assert (bitmap_empty_p (visited));
+
+ free (stack);
+ BITMAP_FREE (visited);
+ return post_order_num;
+ }
+
+
+
/* Analyze dataflow info for the basic blocks specified by the bitmap
BLOCKS, or for the whole CFG if BLOCKS is zero. */
void
! df_analyze (struct loop *loop)
{
bitmap current_all_blocks = BITMAP_ALLOC (&df_bitmap_obstack);
bool everything;
*************** df_analyze (void)
*** 1237,1246 ****
free (df->postorder);
free (df->postorder_inverted);
! df->postorder = XNEWVEC (int, last_basic_block_for_fn (cfun));
! df->postorder_inverted = XNEWVEC (int, last_basic_block_for_fn (cfun));
! df->n_blocks = post_order_compute (df->postorder, true, true);
! df->n_blocks_inverted = inverted_post_order_compute (df->postorder_inverted);
/* These should be the same. */
gcc_assert (df->n_blocks == df->n_blocks_inverted);
--- 1385,1412 ----
free (df->postorder);
free (df->postorder_inverted);
! if (loop)
! {
! gcc_assert (df->analyze_subset);
! gcc_checking_assert (bitmap_count_bits (df->blocks_to_analyze)
! == loop->num_nodes);
! df->postorder = XNEWVEC (int, loop->num_nodes);
! df->postorder_inverted = XNEWVEC (int, loop->num_nodes);
! df->n_blocks = loop_post_order_compute (df->postorder, loop,
! df->blocks_to_analyze);
! df->n_blocks_inverted
! = loop_inverted_post_order_compute (df->postorder_inverted, loop,
! df->blocks_to_analyze);
! gcc_assert ((unsigned) df->n_blocks == loop->num_nodes);
! gcc_assert ((unsigned) df->n_blocks_inverted == loop->num_nodes);
! }
! else
! {
! df->postorder = XNEWVEC (int, last_basic_block_for_fn (cfun));
! df->postorder_inverted = XNEWVEC (int, last_basic_block_for_fn (cfun));
! df->n_blocks = post_order_compute (df->postorder, true, true);
! df->n_blocks_inverted = inverted_post_order_compute (df->postorder_inverted);
! }
/* These should be the same. */
gcc_assert (df->n_blocks == df->n_blocks_inverted);
*************** df_analyze (void)
*** 1273,1284 ****
if (df->analyze_subset)
{
everything = false;
! bitmap_and_into (df->blocks_to_analyze, current_all_blocks);
! df->n_blocks = df_prune_to_subcfg (df->postorder,
! df->n_blocks, df->blocks_to_analyze);
! df->n_blocks_inverted = df_prune_to_subcfg (df->postorder_inverted,
! df->n_blocks_inverted,
! df->blocks_to_analyze);
BITMAP_FREE (current_all_blocks);
}
else
--- 1439,1456 ----
if (df->analyze_subset)
{
everything = false;
! if (!loop)
! {
! bitmap_and_into (df->blocks_to_analyze, current_all_blocks);
! df->n_blocks = df_prune_to_subcfg (df->postorder,
! df->n_blocks, df->blocks_to_analyze);
! df->n_blocks_inverted = df_prune_to_subcfg (df->postorder_inverted,
! df->n_blocks_inverted,
! df->blocks_to_analyze);
! }
! else
! {
! }
BITMAP_FREE (current_all_blocks);
}
else
Index: gcc/loop-invariant.c
===================================================================
*** gcc/loop-invariant.c (revision 206624)
--- gcc/loop-invariant.c (working copy)
*************** find_defs (struct loop *loop, basic_bloc
*** 672,678 ****
df_chain_add_problem (DF_UD_CHAIN);
df_set_blocks (blocks);
df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
! df_analyze ();
check_invariant_table_size ();
if (dump_file)
--- 672,678 ----
df_chain_add_problem (DF_UD_CHAIN);
df_set_blocks (blocks);
df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
! df_analyze (loop);
check_invariant_table_size ();
if (dump_file)
Index: gcc/loop-iv.c
===================================================================
*** gcc/loop-iv.c (revision 206624)
--- gcc/loop-iv.c (working copy)
*************** iv_analysis_loop_init (struct loop *loop
*** 307,313 ****
df_chain_add_problem (DF_UD_CHAIN);
df_note_add_problem ();
df_set_blocks (blocks);
! df_analyze ();
if (dump_file)
df_dump_region (dump_file);
--- 307,313 ----
df_chain_add_problem (DF_UD_CHAIN);
df_note_add_problem ();
df_set_blocks (blocks);
! df_analyze (loop);
if (dump_file)
df_dump_region (dump_file);
More information about the Gcc-patches
mailing list