This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] more PR33928: DIY dataflow in fwprop, to enable it at -O1 and speed it up at -O2
- From: Paolo Bonzini <bonzini at gnu dot org>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>, Kenneth Zadeck <zadeck at naturalbridge dot com>
- Date: Fri, 08 May 2009 10:07:08 +0200
- Subject: [PATCH] more PR33928: DIY dataflow in fwprop, to enable it at -O1 and speed it up at -O2
This patch modifies fwprop to build its own datastructure for
use-to-single-def chains instead of using fullblown UD chains. This is
not hard to do at all with a little refactoring of df-problems.c to
allow simulating RD just like it's done with LR.
Quoting Steven's comment from the PR, "alternatively we could re-work
fwprop to work on regions and use the partial-CFG dataflow stuff,
similar to what the RTL loop optimizers (like loop-invariant) do".
Summarizing advantages and disadvantages we have:
this patch
----------
+ easy, and can easily be reverted
+ no code difference at -O2 before and after the patch (not tested
except on the PR testcase), giving more confidence in backporting
- less extensible to cases where we want to forward propagate
reaching definitions if they are all of a similar form (Bernd
had a patch to form widening multiplications in fwprop)
region approach
---------------
+ may save further time in RD solution
- could give different kinds of pessimization, the semi-global
approach of fwprop seems to work well from tests made on
well-known benchmarks who shalt not be named.
- I don't think anyone is going to work on it soon :-)
Bootstrapped/regtested i686-pc-linux-gnu, checked that the time spent in
fwprop even for devilish PR26854 testcases is very small. Ok for trunk
and after a while 4.4?
Paolo
2009-05-08 Paolo Bonzini <bonzini@gnu.org>
PR rtl-optimization/33928
* fwprop.c (use_def_ref, get_def_for_use, bitmap_only_bit_bitween,
process_uses, build_single_def_use_links): New.
(update_df): Update use_def_ref.
(forward_propagate_into): Use get_def_for_use instead of use-def
chains.
(fwprop_init): Call build_single_def_use_links and let it initialize
dataflow.
(fwprop_done): Free use_def_ref.
(fwprop_addr): Eliminate duplicate call to df_set_flags.
* df-problems.c (df_rd_simulate_artificial_defs_at_top,
df_rd_simulate_one_insn): New.
(df_rd_bb_local_compute_process_def): Update head comment.
(df_chain_create_bb): Use the new RD simulation functions.
* df.h (df_rd_simulate_artificial_defs_at_top,
df_rd_simulate_one_insn): New.
* opts.c (decode_options): Enable fwprop at -O1.
* doc/invoke.texi (-fforward-propagate): Document this.
Index: fwprop.c
===================================================================
--- fwprop.c (revision 147269)
+++ fwprop.c (working copy)
@@ -105,6 +105,92 @@ along with GCC; see the file COPYING3.
static int num_changes;
+DEF_VEC_P(df_ref);
+DEF_VEC_ALLOC_P(df_ref,heap);
+VEC(df_ref,heap) *use_def_ref;
+
+static inline df_ref
+get_def_for_use (df_ref use)
+{
+ return VEC_index (df_ref, use_def_ref, DF_REF_ID (use));
+}
+
+static inline int
+bitmap_only_bit_between (const_bitmap b, unsigned first, unsigned last)
+{
+ bitmap_iterator bi;
+ unsigned bit, bit2;
+
+ if (last < first)
+ return -1;
+
+ bmp_iter_set_init (&bi, b, first, &bit);
+ if (bmp_iter_set (&bi, &bit) && bit <= last)
+ {
+ bit2 = bit;
+ bmp_iter_next (&bi, &bit2);
+ if (!bmp_iter_set (&bi, &bit2) || bit2 > last)
+ return bit;
+ }
+ return -1;
+}
+
+static void
+process_uses (bitmap local_rd, df_ref *use_rec, int top_flag)
+{
+ df_ref use;
+ while ((use = *use_rec++) != NULL)
+ if (top_flag == (DF_REF_FLAGS (use) & DF_REF_AT_TOP))
+ {
+ unsigned int uregno = DF_REF_REGNO (use);
+ unsigned int first = DF_DEFS_BEGIN (uregno);
+ unsigned int last = first + DF_DEFS_COUNT (uregno) - 1;
+ int defno = bitmap_only_bit_between (local_rd, first, last);
+ df_ref def = (defno == -1) ? NULL : DF_DEFS_GET (defno);
+
+ VEC_replace (df_ref, use_def_ref, DF_REF_ID (use), def);
+ }
+}
+
+static void
+build_single_def_use_links (void)
+{
+ basic_block bb;
+ bitmap local_rd = BITMAP_ALLOC (NULL);
+
+ /* We use reaching definitions to compute our restricted use-def chains. */
+ df_set_flags (DF_EQ_NOTES);
+ df_rd_add_problem ();
+ df_analyze ();
+ df_maybe_reorganize_use_refs (DF_REF_ORDER_BY_INSN_WITH_NOTES);
+
+ use_def_ref = VEC_alloc (df_ref, heap, DF_USES_TABLE_SIZE ());
+ VEC_safe_grow (df_ref, heap, use_def_ref, DF_USES_TABLE_SIZE ());
+
+ FOR_EACH_BB (bb)
+ {
+ int bb_index = bb->index;
+ struct df_rd_bb_info *bb_info = df_rd_get_bb_info (bb_index);
+ rtx insn;
+
+ bitmap_copy (local_rd, bb_info->in);
+ process_uses (local_rd, df_get_artificial_uses (bb_index), DF_REF_AT_TOP);
+
+ df_rd_simulate_artificial_defs_at_top (bb, local_rd);
+ FOR_BB_INSNS (bb, insn)
+ if (INSN_P (insn))
+ {
+ unsigned int uid = INSN_UID (insn);
+ process_uses (local_rd, DF_INSN_UID_USES (uid), 0);
+ process_uses (local_rd, DF_INSN_UID_EQ_USES (uid), 0);
+ df_rd_simulate_one_insn (bb, insn, local_rd);
+ }
+
+ process_uses (local_rd, df_get_artificial_uses (bb_index), 0);
+ }
+
+ BITMAP_FREE (local_rd);
+}
/* Do not try to replace constant addresses or addresses of local and
argument slots. These MEM expressions are made only once and inserted
@@ -716,7 +802,8 @@ update_df (rtx insn, rtx *loc, df_ref *u
width, offset, mode);
- /* Set up the use-def chain. */
- df_chain_copy (new_use, DF_REF_CHAIN (orig_use));
+ /* Set up the use-def link. */
+ gcc_assert (DF_REF_ID (new_use) == (int) VEC_length (df_ref, use_def_ref));
+ VEC_safe_push (df_ref, heap, use_def_ref, get_def_for_use (orig_use));
changed = true;
}
if (changed)
@@ -1035,7 +1122,6 @@ forward_propagate_and_simplify (df_ref u
static void
forward_propagate_into (df_ref use)
{
- struct df_link *defs;
df_ref def;
rtx def_insn, def_set, use_insn;
rtx parent;
@@ -1046,11 +1132,9 @@ forward_propagate_into (df_ref use)
return;
/* Only consider uses that have a single definition. */
- defs = DF_REF_CHAIN (use);
- if (!defs || defs->next)
+ def = get_def_for_use (use);
+ if (!def)
return;
-
- def = defs->ref;
if (DF_REF_FLAGS (def) & DF_REF_READ_WRITE)
return;
if (DF_REF_IS_ARTIFICIAL (def))
@@ -1096,12 +1180,7 @@ fwprop_init (void)
insns (sadly) if we are not working in cfglayout mode. */
loop_optimizer_init (0);
- /* Now set up the dataflow problem (we only want use-def chains) and
- put the dataflow solver to work. */
- df_set_flags (DF_EQ_NOTES);
- df_chain_add_problem (DF_UD_CHAIN);
- df_analyze ();
- df_maybe_reorganize_use_refs (DF_REF_ORDER_BY_INSN_WITH_NOTES);
+ build_single_def_use_links ();
df_set_flags (DF_DEFER_INSN_RESCAN);
}
@@ -1110,6 +1189,7 @@ fwprop_done (void)
{
loop_optimizer_finalize ();
+ VEC_free (df_ref, heap, use_def_ref);
free_dominance_info (CDI_DOMINATORS);
cleanup_cfg (0);
delete_trivially_dead_insns (get_insns (), max_reg_num ());
@@ -1187,8 +1267,6 @@ fwprop_addr (void)
/* Go through all the uses. update_df will create new ones at the
end, and we'll go through them as well. */
- df_set_flags (DF_DEFER_INSN_RESCAN);
-
for (i = 0; i < DF_USES_TABLE_SIZE (); i++)
{
df_ref use = DF_USES_GET (i);
Index: df-problems.c
===================================================================
--- df-problems.c (revision 147269)
+++ df-problems.c (working copy)
@@ -316,7 +316,61 @@ df_rd_alloc (bitmap all_blocks)
}
-/* Process a list of DEFs for df_rd_bb_local_compute. */
+/* Add the effect of the top artificial defs of BB to the reaching definitions
+ bitmap LOCAL_RD. */
+
+void
+df_rd_simulate_artificial_defs_at_top (basic_block bb, bitmap local_rd)
+{
+ int bb_index = bb->index;
+ df_ref *def_rec;
+ for (def_rec = df_get_artificial_defs (bb_index); *def_rec; def_rec++)
+ {
+ df_ref def = *def_rec;
+ if (DF_REF_FLAGS (def) & DF_REF_AT_TOP)
+ {
+ unsigned int dregno = DF_REF_REGNO (def);
+ if (!(DF_REF_FLAGS (def) & (DF_REF_PARTIAL | DF_REF_CONDITIONAL)))
+ bitmap_clear_range (local_rd,
+ DF_DEFS_BEGIN (dregno),
+ DF_DEFS_COUNT (dregno));
+ bitmap_set_bit (local_rd, DF_REF_ID (def));
+ }
+ }
+}
+
+/* Add the effect of the defs of INSN to the reaching definitions bitmap
+ LOCAL_RD. */
+
+void
+df_rd_simulate_one_insn (basic_block bb ATTRIBUTE_UNUSED, rtx insn,
+ bitmap local_rd)
+{
+ unsigned uid = INSN_UID (insn);
+ df_ref *def_rec;
+
+ for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
+ {
+ df_ref def = *def_rec;
+ unsigned int dregno = DF_REF_REGNO (def);
+ if ((!(df->changeable_flags & DF_NO_HARD_REGS))
+ || (dregno >= FIRST_PSEUDO_REGISTER))
+ {
+ if (!(DF_REF_FLAGS (def) & (DF_REF_PARTIAL | DF_REF_CONDITIONAL)))
+ bitmap_clear_range (local_rd,
+ DF_DEFS_BEGIN (dregno),
+ DF_DEFS_COUNT (dregno));
+ if (!(DF_REF_FLAGS (def)
+ & (DF_REF_MUST_CLOBBER | DF_REF_MAY_CLOBBER)))
+ bitmap_set_bit (local_rd, DF_REF_ID (def));
+ }
+ }
+}
+
+/* Process a list of DEFs for df_rd_bb_local_compute. This is a bit
+ more complicated than just simulating, because we must produce the
+ gen and kill sets and hence deal with the two possible representations
+ of kill sets. */
static void
df_rd_bb_local_compute_process_def (struct df_rd_bb_info *bb_info,
@@ -2076,7 +2130,6 @@ df_chain_create_bb (unsigned int bb_inde
struct df_rd_bb_info *bb_info = df_rd_get_bb_info (bb_index);
rtx insn;
bitmap cpy = BITMAP_ALLOC (NULL);
- df_ref *def_rec;
bitmap_copy (cpy, bb_info->in);
bitmap_set_bit (df_chain->out_of_date_transfer_functions, bb_index);
@@ -2095,57 +2148,23 @@ df_chain_create_bb (unsigned int bb_inde
DF_REF_AT_TOP);
#endif
- for (def_rec = df_get_artificial_defs (bb_index); *def_rec; def_rec++)
- {
- df_ref def = *def_rec;
- if (DF_REF_FLAGS (def) & DF_REF_AT_TOP)
- {
- unsigned int dregno = DF_REF_REGNO (def);
- if (!(DF_REF_FLAGS (def) & (DF_REF_PARTIAL | DF_REF_CONDITIONAL)))
- bitmap_clear_range (cpy,
- DF_DEFS_BEGIN (dregno),
- DF_DEFS_COUNT (dregno));
- bitmap_set_bit (cpy, DF_REF_ID (def));
- }
- }
+ df_rd_simulate_artificial_defs_at_top (bb, cpy);
/* Process the regular instructions next. */
FOR_BB_INSNS (bb, insn)
- {
- df_ref *def_rec;
- unsigned int uid = INSN_UID (insn);
-
- if (!INSN_P (insn))
- continue;
-
- /* Now scan the uses and link them up with the defs that remain
- in the cpy vector. */
-
- df_chain_create_bb_process_use (cpy, DF_INSN_UID_USES (uid), 0);
-
- if (df->changeable_flags & DF_EQ_NOTES)
- df_chain_create_bb_process_use (cpy, DF_INSN_UID_EQ_USES (uid), 0);
+ if (INSN_P (insn))
+ {
+ unsigned int uid = INSN_UID (insn);
+ /* First scan the uses and link them up with the defs that remain
+ in the cpy vector. */
+ df_chain_create_bb_process_use (cpy, DF_INSN_UID_USES (uid), 0);
+ if (df->changeable_flags & DF_EQ_NOTES)
+ df_chain_create_bb_process_use (cpy, DF_INSN_UID_EQ_USES (uid), 0);
- /* Since we are going forwards, process the defs second. This
- pass only changes the bits in cpy. */
- for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
- {
- df_ref def = *def_rec;
- unsigned int dregno = DF_REF_REGNO (def);
- if ((!(df->changeable_flags & DF_NO_HARD_REGS))
- || (dregno >= FIRST_PSEUDO_REGISTER))
- {
- if (!(DF_REF_FLAGS (def) & (DF_REF_PARTIAL | DF_REF_CONDITIONAL)))
- bitmap_clear_range (cpy,
- DF_DEFS_BEGIN (dregno),
- DF_DEFS_COUNT (dregno));
- if (!(DF_REF_FLAGS (def)
- & (DF_REF_MUST_CLOBBER | DF_REF_MAY_CLOBBER)))
- bitmap_set_bit (cpy, DF_REF_ID (def));
- }
- }
- }
+ /* Since we are going forwards, process the defs second. */
+ df_rd_simulate_one_insn (bb, insn, cpy);
+ }
/* Create the chains for the artificial uses of the hard registers
at the end of the block. */
Index: df.h
===================================================================
--- df.h (revision 147269)
+++ df.h (working copy)
@@ -939,6 +939,8 @@ extern void df_grow_bb_info (struct data
extern void df_chain_dump (struct df_link *, FILE *);
extern void df_print_bb_index (basic_block bb, FILE *file);
extern void df_rd_add_problem (void);
+extern void df_rd_simulate_artificial_defs_at_top (basic_block, bitmap);
+extern void df_rd_simulate_one_insn (basic_block, rtx, bitmap);
extern void df_lr_add_problem (void);
extern void df_lr_verify_transfer_functions (void);
extern void df_live_verify_transfer_functions (void);
Index: opts.c
===================================================================
--- opts.c (revision 147269)
+++ opts.c (working copy)
@@ -848,6 +848,7 @@ decode_options (unsigned int argc, const
#endif
flag_guess_branch_prob = opt1;
flag_cprop_registers = opt1;
+ flag_forward_propagate = opt1;
flag_if_conversion = opt1;
flag_if_conversion2 = opt1;
flag_ipa_pure_const = opt1;
@@ -873,7 +874,6 @@ decode_options (unsigned int argc, const
flag_thread_jumps = opt2;
flag_crossjumping = opt2;
flag_optimize_sibling_calls = opt2;
- flag_forward_propagate = opt2;
flag_cse_follow_jumps = opt2;
flag_gcse = opt2;
flag_expensive_optimizations = opt2;
Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi (revision 147269)
+++ doc/invoke.texi (working copy)
@@ -5554,8 +5554,8 @@ instructions and checks if the result ca
is active, two passes are performed and the second is scheduled after
loop unrolling.
-This option is enabled by default at optimization levels @option{-O2},
-@option{-O3}, @option{-Os}.
+This option is enabled by default at optimization levels @option{-O},
+@option{-O2}, @option{-O3}, @option{-Os}.
@item -fomit-frame-pointer
@opindex fomit-frame-pointer