This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 3/3] Loop flattening on loop-SSA.


On Wed, 3 Nov 2010, Sebastian Pop wrote:

> 2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>
> 
> 	* Makefile.in (OBJS-common): Add tree-loop-flattening.o.
> 	(tree-loop-flattening.o): New.
> 	* common.opt (ftree-loop-flatten): New.
> 	* dbgcnt.def (lflat): New.
> 	* params.def (PARAM_LFLAT_MAX_NB_BBS): New.
> 	* passes.c (init_optimization_passes): Add new passes
> 	pass_flatten_loops and pass_if_conversion after loop vectorization
> 	and before pass_slp_vectorize.
> 	* timevar.def (TV_TREE_LOOP_FLATTENING): New.
> 	* tree-loop-flattening.c: New.
> 	* tree-pass.h (pass_flatten_loops): Declared.
> 	* tree-flow.h (gate_tree_if_conversion): Declared.
> 	(tree_if_conversion): Declared.
> 	* tree-if-conv.c (tree_if_conversion): Not static anymore.
> 	(gate_tree_if_conversion): Same.

Comments inline.

What extra testing apart from the 4 testcases did this new pass get?
Do we pass bootstrap with it enabled?  Did you check if we regress
in SPEC 2k6 when it is enabled?

> 	* gcc.dg/tree-ssa/flat-loop-1.c: New.
> 	* gcc.dg/tree-ssa/flat-loop-2.c: New.
> 	* gcc.dg/tree-ssa/flat-loop-3.c: New.
> 	* gcc.dg/tree-ssa/flat-loop-4.c: New.
> ---
>  gcc/ChangeLog                               |   18 +
>  gcc/Makefile.in                             |    4 +
>  gcc/common.opt                              |    4 +
>  gcc/dbgcnt.def                              |    1 +
>  gcc/params.def                              |    7 +
>  gcc/passes.c                                |    1 +
>  gcc/testsuite/ChangeLog                     |    7 +
>  gcc/testsuite/gcc.dg/tree-ssa/flat-loop-1.c |   28 ++
>  gcc/testsuite/gcc.dg/tree-ssa/flat-loop-2.c |   39 ++
>  gcc/testsuite/gcc.dg/tree-ssa/flat-loop-3.c |   19 +
>  gcc/testsuite/gcc.dg/tree-ssa/flat-loop-4.c |   23 +
>  gcc/timevar.def                             |    1 +
>  gcc/tree-flow.h                             |    4 +
>  gcc/tree-if-conv.c                          |    4 +-
>  gcc/tree-loop-flattening.c                  |  630 +++++++++++++++++++++++++++
>  gcc/tree-pass.h                             |    1 +
>  16 files changed, 789 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/flat-loop-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/flat-loop-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/flat-loop-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/flat-loop-4.c
>  create mode 100644 gcc/tree-loop-flattening.c
> 
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 3ceb7b6..f312b27 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,5 +1,23 @@
>  2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>
>  
> +	* Makefile.in (OBJS-common): Add tree-loop-flattening.o.
> +	(tree-loop-flattening.o): New.
> +	* common.opt (ftree-loop-flatten): New.
> +	* dbgcnt.def (lflat): New.
> +	* params.def (PARAM_LFLAT_MAX_NB_BBS): New.
> +	* passes.c (init_optimization_passes): Add new passes
> +	pass_flatten_loops and pass_if_conversion after loop vectorization
> +	and before pass_slp_vectorize.
> +	* timevar.def (TV_TREE_LOOP_FLATTENING): New.
> +	* tree-loop-flattening.c: New.
> +	* tree-pass.h (pass_flatten_loops): Declared.
> +	* tree-flow.h (gate_tree_if_conversion): Declared.
> +	(tree_if_conversion): Declared.
> +	* tree-if-conv.c (tree_if_conversion): Not static anymore.
> +	(gate_tree_if_conversion): Same.
> +
> +2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>
> +
>  	* tree-if-conv.c (if_convertible_loop_p_1): Do not call
>  	compute_data_dependences_for_loop.
>  	(if_convertible_loop_p): Do not free refs and ddrs.
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index 898e962..55b67f4 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1368,6 +1368,7 @@ OBJS-common = \
>  	tree-into-ssa.o \
>  	tree-iterator.o \
>  	tree-loop-distribution.o \
> +	tree-loop-flattening.o \
>  	tree-loop-linear.o \
>  	tree-nested.o \
>  	tree-nrv.o \
> @@ -2773,6 +2774,9 @@ tree-loop-distribution.o: tree-loop-distribution.c $(CONFIG_H) $(SYSTEM_H) coret
>     $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
>     $(TREE_PASS_H) $(TREE_DATA_REF_H) $(EXPR_H) \
>     langhooks.h $(TREE_VECTORIZER_H)
> +tree-loop-flattening.o: tree-loop-flattening.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> +   $(TM_H) $(OPTABS_H) $(TREE_H) $(BASIC_BLOCK_H) $(DIAGNOSTIC_H) $(TREE_FLOW_H) \
> +   $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) $(TREE_PASS_H) $(DBGCNT_H)
>  tree-parloops.o: tree-parloops.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
>     $(TREE_FLOW_H) $(TREE_H) $(CFGLOOP_H) $(TREE_DATA_REF_H) \
>     $(DIAGNOSTIC_H) $(TREE_PASS_H) langhooks.h gt-tree-parloops.h \
> diff --git a/gcc/common.opt b/gcc/common.opt
> index 8fe796f..c969979 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -1632,6 +1632,10 @@ ftree-loop-distribute-patterns
>  Common Report Var(flag_tree_loop_distribute_patterns) Optimization
>  Enable loop distribution for patterns transformed into a library call
>  
> +ftree-loop-flatten
> +Common Report Var(flag_tree_loop_flattening) Optimization
> +Enable loop flattening on trees
> +
>  ftree-loop-im
>  Common Report Var(flag_tree_loop_im) Init(1) Optimization
>  Enable loop invariant motion on trees
> diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
> index 0492d66..0ef9a72 100644
> --- a/gcc/dbgcnt.def
> +++ b/gcc/dbgcnt.def
> @@ -166,6 +166,7 @@ DEBUG_COUNTER (if_conversion_tree)
>  DEBUG_COUNTER (if_after_combine)
>  DEBUG_COUNTER (if_after_reload)
>  DEBUG_COUNTER (local_alloc_for_sched)
> +DEBUG_COUNTER (lflat)
>  DEBUG_COUNTER (postreload_cse)
>  DEBUG_COUNTER (pre)
>  DEBUG_COUNTER (pre_insn)
> diff --git a/gcc/params.def b/gcc/params.def
> index 49a6185..3fffc35 100644
> --- a/gcc/params.def
> +++ b/gcc/params.def
> @@ -788,6 +788,13 @@ DEFPARAM (PARAM_GRAPHITE_MAX_BBS_PER_FUNCTION,
>  	  "maximum number of basic blocks per function to be analyzed by Graphite",
>  	  100, 0, 0)
>  
> +/* Maximal number of basic blocks in a loop to be flattened.  */
> +
> +DEFPARAM (PARAM_LFLAT_MAX_NB_BBS,
> +	  "lflat-max-nb-bbs",
> +	  "maximum number of basic blocks in a loop to be flattened",
> +	  100, 0, 0)
> +
>  /* Avoid doing loop invariant motion on very large loops.  */
>  
>  DEFPARAM (PARAM_LOOP_INVARIANT_MAX_BBS_IN_LOOP,
> diff --git a/gcc/passes.c b/gcc/passes.c
> index 1308ce9..22110a4 100644
> --- a/gcc/passes.c
> +++ b/gcc/passes.c
> @@ -917,6 +917,7 @@ init_optimization_passes (void)
>  	  NEXT_PASS (pass_parallelize_loops);
>  	  NEXT_PASS (pass_loop_prefetch);
>  	  NEXT_PASS (pass_iv_optimize);
> +	  NEXT_PASS (pass_flatten_loops);
>  	  NEXT_PASS (pass_tree_loop_done);
>  	}
>        NEXT_PASS (pass_cse_reciprocals);
> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
> index 4233f86..2b3b93e 100644
> --- a/gcc/testsuite/ChangeLog
> +++ b/gcc/testsuite/ChangeLog
> @@ -1,5 +1,12 @@
>  2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>
>  
> +	* gcc.dg/tree-ssa/flat-loop-1.c: New.
> +	* gcc.dg/tree-ssa/flat-loop-2.c: New.
> +	* gcc.dg/tree-ssa/flat-loop-3.c: New.
> +	* gcc.dg/tree-ssa/flat-loop-4.c: New.
> +
> +2010-10-20  Sebastian Pop  <sebastian.pop@amd.com>
> +
>  	PR tree-optimization/46029
>  	* g++.dg/tree-ssa/ifc-pr46029.C: New.
>  	* gcc.dg/tree-ssa/ifc-8.c: New.
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-1.c b/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-1.c
> new file mode 100644
> index 0000000..bee8a2b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-1.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-loop-flatten" } */
> +
> +struct stack_segment
> +{
> +  struct dynamic_allocation_blocks *dynamic_allocation;
> +};
> +struct dynamic_allocation_blocks
> +{
> +  struct dynamic_allocation_blocks *next;
> +};
> +static struct dynamic_allocation_blocks *
> +merge_dynamic_blocks (struct dynamic_allocation_blocks *a,
> +		      struct dynamic_allocation_blocks *b)
> +{
> +  struct dynamic_allocation_blocks **pp;
> +  for (pp = &a->next; *pp != ((void *)0); pp = &(*pp)->next)
> +    *pp = b;
> +  return a;
> +}
> +__morestack_release_segments (struct stack_segment **pp, int free_dynamic)
> +{
> +  struct dynamic_allocation_blocks *ret;
> +  struct stack_segment *pss;
> +  pss = *pp;
> +  while (pss != ((void *)0))
> +    ret = merge_dynamic_blocks (pss->dynamic_allocation, ret);
> +}
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-2.c b/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-2.c
> new file mode 100644
> index 0000000..a7287fb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-2.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-loop-flatten" } */
> +
> +struct stack_segment
> +{
> +  struct stack_segment *next;
> +  struct dynamic_allocation_blocks *dynamic_allocation;
> +};
> +struct dynamic_allocation_blocks
> +{
> +  struct dynamic_allocation_blocks *next;
> +};
> +static struct dynamic_allocation_blocks *
> +merge_dynamic_blocks (struct dynamic_allocation_blocks *a,
> +        struct dynamic_allocation_blocks *b)
> +{
> +  struct dynamic_allocation_blocks **pp;
> +  if (b == ((void *)0))
> +  for (pp = &a->next; *pp != ((void *)0); pp = &(*pp)->next)
> +    ;
> +  return a;
> +}
> +__morestack_release_segments (struct stack_segment **pp, int free_dynamic)
> +{
> +  struct dynamic_allocation_blocks *ret;
> +  struct stack_segment *pss;
> +  while (pss != ((void *)0))
> +    {
> +      struct stack_segment *next;
> +      next = pss->next;
> + {
> +   if (free_dynamic)
> +     {
> +       ret = merge_dynamic_blocks (pss->dynamic_allocation, ret);
> +     }
> + }
> +      pss = next;
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-3.c b/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-3.c
> new file mode 100644
> index 0000000..d3d66ab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-3.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-loop-flatten" } */
> +
> +
> +int
> +split_directories (const char *name, int *ptr_num_dirs)
> +{
> +  int num_dirs = 0;
> +  char **dirs;
> +  const char *p, *q;
> +  int ch;
> +  while ((ch = *p++) != '\0')
> +    {
> +   num_dirs++;
> +   while (((*p) == '/'))
> +     p++;
> +    }
> +  return (dirs[num_dirs - 1] == ((void *)0));
> +}
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-4.c b/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-4.c
> new file mode 100644
> index 0000000..8e551ac
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/flat-loop-4.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-loop-flatten" } */
> +
> +void
> +formatted_backspace (int common, char *s)
> +{
> +  int base;
> +  int n;
> +  do
> +    {
> +      if (sseek (s, base, 0) < 0)
> +	goto io_error;
> +
> +      while (n > 0)
> +	{
> +          n--;
> +	  base += n + 1;
> +	}
> +    }
> +  while (base != 0);
> + io_error:
> +  generate_error (common, 0, ((void *)0));
> +}

The testcases seem to origin from ICEs found during development.  There
is a lack of functional tests, please consider coming up with some,
eventually testing for enabled extra optimizations.


> diff --git a/gcc/timevar.def b/gcc/timevar.def
> index 86e2999..89ff8e8 100644
> --- a/gcc/timevar.def
> +++ b/gcc/timevar.def
> @@ -152,6 +152,7 @@ DEFTIMEVAR (TV_GRAPHITE_DATA_DEPS    , "Graphite data dep analysis")
>  DEFTIMEVAR (TV_GRAPHITE_CODE_GEN     , "Graphite code generation")
>  DEFTIMEVAR (TV_TREE_LINEAR_TRANSFORM , "tree loop linear")
>  DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution")
> +DEFTIMEVAR (TV_TREE_LOOP_FLATTENING  , "tree loop flattening")
>  DEFTIMEVAR (TV_CHECK_DATA_DEPS       , "tree check data dependences")
>  DEFTIMEVAR (TV_TREE_PREFETCH	     , "tree prefetching")
>  DEFTIMEVAR (TV_TREE_LOOP_IVOPTS	     , "tree iv optimization")
> diff --git a/gcc/tree-flow.h b/gcc/tree-flow.h
> index c2702dc..e1ee69f 100644
> --- a/gcc/tree-flow.h
> +++ b/gcc/tree-flow.h
> @@ -730,6 +730,10 @@ bool contains_abnormal_ssa_name_p (tree);
>  bool stmt_dominates_stmt_p (gimple, gimple);
>  void mark_virtual_ops_for_renaming (gimple);
>  
> +/* In tree-if-conv.c */
> +bool gate_tree_if_conversion (void);
> +bool tree_if_conversion (struct loop *, tree *);
> +

Why'd you need to export the gate?  I guess if-conversion should
happen unconditionally for loops that are flattened as I see it is
really part of the flattening transformation?

>  /* In tree-ssa-dce.c */
>  void mark_virtual_phi_result_for_renaming (gimple);
>  
> diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> index 5b941af..3c30abb 100644
> --- a/gcc/tree-if-conv.c
> +++ b/gcc/tree-if-conv.c
> @@ -1599,7 +1599,7 @@ combine_blocks (struct loop *loop, tree *scratch_pad)
>  /* If-convert LOOP when it is legal.  For the moment this pass has no
>     profitability analysis.  Returns true when something changed.  */
>  
> -static bool
> +bool
>  tree_if_conversion (struct loop *loop, tree *scratch_pad)
>  {
>    bool changed = false;
> @@ -1662,7 +1662,7 @@ main_tree_if_conversion (void)
>  
>  /* Returns true when the if-conversion pass is enabled.  */
>  
> -static bool
> +bool
>  gate_tree_if_conversion (void)
>  {
>    return ((flag_tree_vectorize && flag_tree_loop_if_convert != 0)
> diff --git a/gcc/tree-loop-flattening.c b/gcc/tree-loop-flattening.c
> new file mode 100644
> index 0000000..4bc8768
> --- /dev/null
> +++ b/gcc/tree-loop-flattening.c
> @@ -0,0 +1,630 @@
> +/* Loop flattening.
> +   Copyright (C) 2010 Free Software Foundation, Inc.
> +   Contributed by Sebastian Pop <sebastian.pop@amd.com>.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify
> +it under the terms of the GNU General Public License as published by
> +the Free Software Foundation; either version 3, or (at your option)
> +any later version.
> +
> +GCC is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU General Public License for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "tm.h"
> +#include "ggc.h"
> +#include "tree.h"
> +#include "rtl.h"
> +#include "output.h"
> +#include "basic-block.h"
> +#include "diagnostic.h"
> +#include "tree-flow.h"
> +#include "toplev.h"
> +#include "tree-dump.h"
> +#include "timevar.h"
> +#include "cfgloop.h"
> +#include "tree-pass.h"
> +#include "gimple.h"
> +#include "params.h"
> +#include "dbgcnt.h"
> +
> +/* This loop flattening pass transforms backward pointing edges into
> +   forward pointing edges.
> +
> +   The back-edge removal transformation was described in the 1983
> +   paper by Allen J. R., Ken Kennedy, Carrie Porterfield, and Joe
> +   Warren: "Conversion of control dependence to data dependence"
> +   available from http://doi.acm.org/10.1145/567067.567085
> +
> +   The back-edge removal algorithm was presented in that paper as part
> +   of the if-conversion algorithm for backward pointing edges.  In
> +   this section we will first provide a description of this technique
> +   adapted for the Gimple-SSA form, followed by an example, and a
> +   discussion of the differences with the higher level loop flattening
> +   transformation.
> +
> +   The back-edge removal algorithm transforms control dependences into
> +   data dependences by using a boolean variable.  The values taken by
> +   the boolean variable control the execution path of the forward
> +   edges created in order to use the back-edge of an outer loop.
> +
> +   The first step of the algorithm detects a surrounding loop and all
> +   the back-edges of the loop body: these back-edges can be inner
> +   loops or strongly connected components of the CFG that cannot be
> +   reduced to natural loops.
> +
> +   Each back-edge is removed by redirecting the target of the
> +   back-edge to the latch basic block of the surrounding loop.  A
> +   boolean variable is created in the latch.  It is cleared when the
> +   redirected back-edge is taken and it is set to true for any other
> +   paths leading to the latch.
> +
> +   The header basic block of the surrounding loop is split before its
> +   statements and a new condition is added based on the control
> +   variable: when the control variable is set to true, the execution
> +   proceeds as normal to the basic block that contains the statements
> +   of the header; when the control variable is cleared, meaning that
> +   the back-edge has been taken, the execution proceeds to the point
> +   where the redirected back-edge was pointing.
> +
> +   The last step updates the SSA form after all the back-edges have
> +   been redirected to the latch, and the new edges from the header to
> +   the destination of back-edges have been created.
> +
> +   Another description of loop flattening in a very Fortran specific
> +   way is in the 1992 paper by Reinhard von Hanxleden and Ken Kennedy:
> +   "Relaxing SIMD Control Flow Constraints using Loop Transformations"
> +   available from
> +   http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.54.5033 */
> +
> +/* Keep the loop structure for LOOP and remove all the loop structures
> +   under LOOP.  */
> +
> +static void
> +cancel_subloops (loop_p loop)
> +{
> +  int i;
> +  loop_p li;
> +  VEC (loop_p, heap) *lv = VEC_alloc (loop_p, heap, 3);
> +
> +  for (li = loop->inner; li; li = li->next)
> +    VEC_safe_push (loop_p, heap, lv, li);
> +
> +  FOR_EACH_VEC_ELT (loop_p, lv, i, li)
> +    cancel_loop_tree (li);
> +
> +  VEC_free (loop_p, heap, lv);
> +}

This function should be in cfgloop.c and implemented in simpler
form, like

void
cancel_subloops (struct loop *loop)
{
  while (loop->inner)
    cancel_loop_tree (loop->inner);
}

simply following the cancel_loop_tree example.

> +/* Before creating other phi nodes in LOOP->header for the control
> +   flags, update the phi nodes of LOOP->header and add the necessary
> +   phi nodes in the LOOP->latch that now contains several paths on
> +   which the values are not updated.  PRED_E is the single edge that
> +   was pointing to the LOOP->latch basic block before inner back-edges
> +   were redirected to the LOOP->latch.  */
> +
> +static void
> +update_loop_phi_nodes (loop_p loop, edge pred_e)
> +{
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      edge e;
> +      edge_iterator ei;
> +      gimple phi = gsi_stmt (gsi);
> +      tree back_arg = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> +      tree res = gimple_phi_result (phi);
> +      tree var = SSA_NAME_VAR (res);
> +
> +      phi = create_phi_node (var, loop->latch);
> +      create_new_def_for (gimple_phi_result (phi), phi,
> +			  gimple_phi_result_ptr (phi));

Using create_new_def_for looks very suspicios.  create_phi_node
will already create a new SSA name for you for the result, so
it doesn't make any sense to fiddle with the SSA updaters machinery, does 
it?

> +      FOR_EACH_EDGE (e, ei, loop->latch->preds)
> +	add_phi_arg (phi, (e == pred_e ? back_arg : res),
> +		     e, UNKNOWN_LOCATION);
> +
> +      res = gimple_phi_result (phi);
> +      add_phi_arg (gsi_stmt (gsi), res, loop_latch_edge (loop),
> +		   UNKNOWN_LOCATION);
> +    }
> +}
> +
> +/* Creates a control flag for the FORWARDED_EDGE that represents the
> +   back-edge that has been forwarded to the latch basic block of LOOP.
> +   INNER_BODY is the basic block to which the back-edge was pointing
> +   before redirection.  This function creates a boolean control flag
> +   that is cleared when the FORWARDED_EDGE is taken and set for all
> +   the other paths.  This function adds the corresponding phi nodes in
> +   LOOP->latch and LOOP->header, and finally adds an edge from
> +   LOOP->header to the INNER_BODY guarded by the control flag.  */
> +
> +static void
> +create_control_flag (edge forwarded_edge, loop_p loop, basic_block inner_body)
> +{
> +  edge e, preheader;
> +  edge outer_latch_e = loop_latch_edge (loop);
> +  const char *name = "_flat_";
> +  tree var = create_tmp_var (boolean_type_node, name);

create_tmp_reg

> +  tree res;
> +  gimple phi, cond_stmt;
> +  gimple_stmt_iterator gsi;
> +  edge_iterator ei;
> +
> +  /* Adds a control variable for the redirected FORWARDED_EDGE.  */
> +  add_referenced_var (var);
> +  phi = create_phi_node (var, forwarded_edge->dest);
> +  create_new_def_for (gimple_phi_result (phi), phi,
> +		      gimple_phi_result_ptr (phi));

Likewise.

> +  FOR_EACH_EDGE (e, ei, outer_latch_e->src->preds)
> +    add_phi_arg (phi, (e == forwarded_edge
> +		       ? boolean_false_node
> +		       : boolean_true_node),
> +		 e, UNKNOWN_LOCATION);
> +  res = gimple_phi_result (phi);
> +
> +  /* Add a phi node in LOOP->header for the control variable.  */
> +  phi = create_phi_node (var, loop->header);
> +  create_new_def_for (gimple_phi_result (phi), phi,
> +		      gimple_phi_result_ptr (phi));

Again.

> +  preheader = loop_preheader_edge (loop);
> +  FOR_EACH_EDGE (e, ei, loop->header->preds)
> +    add_phi_arg (phi, (e == preheader
> +		       ? boolean_true_node
> +		       : res),
> +		 e, UNKNOWN_LOCATION);
> +  res = gimple_phi_result (phi);
> +
> +  /* Split LOOP->header to insert the control variable condition.  */
> +  e = split_block_after_labels (loop->header);
> +  e->flags = EDGE_TRUE_VALUE;
> +  e = make_edge (loop->header, inner_body, EDGE_FALSE_VALUE);
> +  cond_stmt = gimple_build_cond (EQ_EXPR, res, boolean_true_node,
> +				 NULL_TREE, NULL_TREE);
> +  gsi = gsi_last_bb (loop->header);
> +  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
> +}
> +
> +/* Adds phi nodes to the LOOP->header and LOOP->latch for the ssa_name
> +   NAME.  ARG is the argument of the latch phi node set for the
> +   FORWARDED_EDGE, and all the other edges merged by the latch phi
> +   node are set to the result of the LOOP->header phi node.  The latch
> +   edge of the LOOP->header phi node is set to the result of the
> +   LOOP->latch phi node, and the other argument is set to an arbitrary
> +   valid value defined before the loop (note that this initial value
> +   is never used in the loop).  Returns the LOOP->header phi result.  */
> +
> +static tree
> +add_header_and_latch_phis (loop_p loop, tree name, edge forwarded_edge,
> +			   tree arg)
> +{
> +  edge e;
> +  edge_iterator ei;
> +  tree res, zero, var = SSA_NAME_VAR (name);
> +  gimple loop_phi = create_phi_node (var, loop->header);
> +  gimple latch_phi = create_phi_node (var, loop->latch);
> +
> +  create_new_def_for (gimple_phi_result (loop_phi), loop_phi,
> +		      gimple_phi_result_ptr (loop_phi));
> +  create_new_def_for (gimple_phi_result (latch_phi), latch_phi,
> +		      gimple_phi_result_ptr (latch_phi));

Likewise.

> +  /* The value set to ZERO will never be used in the loop, however we
> +     have to construct something meaningful for virtual SSA_NAMEs.  */
> +  if (TREE_CODE (arg) != SSA_NAME)
> +    zero = arg;
> +  else if (is_gimple_reg (arg))
> +    zero = fold_convert (TREE_TYPE (arg), integer_zero_node);
> +  else
> +    zero = gimple_default_def (cfun, SSA_NAME_VAR (arg));

That looks bogus.  It will create overlapping life-ranges
for virtual operands - just make sure you'll rename the VOPs
and use gimple_vop (cfun) for the fallback.  You shoudl also
use build_zero_cst instead of fold_convert.

Thus,

  mark_sym_for_renaming (gimple_vop (cfun));

> +  res = gimple_phi_result (latch_phi);
> +  FOR_EACH_EDGE (e, ei, loop->header->preds)
> +    add_phi_arg (loop_phi, (e == loop_latch_edge (loop) ? res : zero),
> +		 e, UNKNOWN_LOCATION);
> +
> +  res = gimple_phi_result (loop_phi);
> +  FOR_EACH_EDGE (e, ei, loop->latch->preds)
> +    add_phi_arg (latch_phi, (e == forwarded_edge ? arg : res),
> +		 e, UNKNOWN_LOCATION);
> +
> +  return res;
> +}
> +
> +/* Creates phi nodes for each inductive definition, i.e., loop phi
> +   nodes.  For each induction phi node in the old loop header, i.e.,
> +   in the single_succ (INNER_BODY), insert a phi node in the
> +   LOOP->latch that takes the updated value of the induction on the
> +   FORWARDED_EDGE, and maintains the same value as in the phi node of
> +   the LOOP->header for all the other possible paths reaching
> +   LOOP->latch.  This function has to be called after all the
> +   back-edges have been redirected.  */
> +
> +static void
> +update_inner_induction_phi_nodes (edge forwarded_edge, loop_p loop,
> +				  basic_block inner_body)
> +{
> +  gimple_stmt_iterator gsi;
> +
> +  for (gsi = gsi_start_phis (single_succ (inner_body));
> +       !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple old_loop_phi = gsi_stmt (gsi);
> +      tree back_arg = PHI_ARG_DEF_FROM_EDGE (old_loop_phi,
> +					     single_succ_edge (inner_body));
> +      tree res = gimple_phi_result (old_loop_phi);
> +
> +      res = add_header_and_latch_phis (loop, res, forwarded_edge, back_arg);
> +      add_phi_arg (old_loop_phi, res, single_succ_edge (inner_body),
> +		   UNKNOWN_LOCATION);
> +    }
> +}
> +
> +/* Renames all the uses of OLD_NAME with NEW_NAME (except the phi
> +   nodes of DEF_BB) in all the basic blocks dominated by DEF_BB and in
> +   the arguments of all the phi nodes originating in a basic block
> +   that is dominated by DEF_BB.  */
> +
> +static void
> +rename_dominated_uses (loop_p loop, tree old_name, tree new_name,
> +		       basic_block def_bb)
> +{
> +  imm_use_iterator uit;
> +  gimple stmt;
> +  use_operand_p use_p;
> +  ssa_op_iter op_iter;
> +
> +  FOR_EACH_IMM_USE_STMT (stmt, uit, old_name)
> +    {
> +      enum gimple_code code = gimple_code (stmt);
> +      basic_block use_bb = gimple_bb (stmt);
> +      edge_iterator ei;
> +      edge e;
> +
> +      if (code == GIMPLE_PHI)
> +	{
> +	  FOR_EACH_EDGE (e, ei, use_bb->preds)
> +	    if (PHI_ARG_DEF_FROM_EDGE (stmt, e) == old_name
> +		&& dominated_by_p (CDI_DOMINATORS, e->src, def_bb)
> +		&& use_bb != def_bb)
> +	      replace_exp (gimple_phi_arg_imm_use_ptr (stmt, e->dest_idx),
> +			   new_name);

  SET_USE (gimple_phi_arg_imm_use_ptr (stmt, e->dest_idx), new_name);

> +	}
> +      else
> +	{
> +	  if (!dominated_by_p (CDI_DOMINATORS, use_bb, def_bb))
> +	    continue;
> +
> +	  if (use_bb->loop_father == loop)
> +	    {
> +	      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, op_iter, SSA_OP_ALL_USES)
> +		if (USE_FROM_PTR (use_p) == old_name)
> +		  replace_exp (use_p, new_name);
> +	    }
> +	  else
> +	    /* Virtual operands are not translated into loop closed
> +	       SSA form, and thus they may occur in the rest of
> +	       the program without a loop close vphi node.  */

But you are updating all uses again.

  You should simply use

        FOR_EACH_IMM_USE_ON_STMT (use_p, uit)
          SET_USE (use_p, new_name);

> +	    FOR_EACH_SSA_USE_OPERAND (use_p, stmt, op_iter, SSA_OP_ALL_USES)
> +	      if (USE_FROM_PTR (use_p) == old_name)
> +		replace_exp (use_p, new_name);
> +	}
> +    }
> +}
> +
> +/* Helper function for add_missing_phi_nodes_1.  Adds to LOOP all the
> +   missing phi nodes for NAME and updates the arguments of the
> +   LATCH_PHI node.  LOOP_PHI node is the inductive definition of NAME
> +   in LOOP->header.  */
> +
> +static void
> +add_missing_phi_nodes_2 (loop_p loop, tree name, tree old_name,
> +			 VEC (gimple, heap) *phis)
> +{
> +  unsigned i;
> +  basic_block bb, dom_bb = gimple_bb (SSA_NAME_DEF_STMT (name));
> +  VEC (basic_block, heap) *dom_bbs = get_all_dominated_blocks (CDI_DOMINATORS,
> +							       dom_bb);
> +
> +  FOR_EACH_VEC_ELT (basic_block, dom_bbs, i, bb)
> +    {
> +      edge e;
> +      edge_iterator ei;
> +
> +      if (bb == loop->latch
> +	  || bb->loop_father != loop)
> +	continue;

dom_bbs may be very large, it's much better to iterate over the
loop bbs and do a dominator check.  Or iterate over dominator sons
with first_dom_son (), next_dom_son () and recurse, bailing out when
you're running out of the loop.

> +      FOR_EACH_EDGE (e, ei, bb->succs)
> +	{
> +	  gimple phi = VEC_index (gimple, phis, e->dest->index);
> +
> +	  if (phi)
> +	    add_phi_arg (phi, name, e, UNKNOWN_LOCATION);
> +
> +	  else if (!single_pred_p (e->dest)
> +		   && !dominated_by_p (CDI_DOMINATORS, e->dest, dom_bb)
> +		   && e->dest->loop_father == loop)
> +	  {
> +	    tree var = SSA_NAME_VAR (name);
> +
> +	    phi = create_phi_node (var, e->dest);
> +	    create_new_def_for (gimple_phi_result (phi), phi,
> +				gimple_phi_result_ptr (phi));

Again.

> +	    VEC_replace (gimple, phis, e->dest->index, phi);
> +	    add_phi_arg (phi, name, e, UNKNOWN_LOCATION);
> +	    rename_dominated_uses (loop, old_name, gimple_phi_result (phi),
> +				   e->dest);
> +	    add_missing_phi_nodes_2 (loop, gimple_phi_result (phi), old_name,
> +				     phis);
> +	  }
> +	}
> +    }

You leak dom_bbs.

> +}
> +
> +/* Helper function for add_missing_phi_nodes.  For all the definitions
> +   of DEF_STMT add the missing phi nodes in LOOP.  */
> +
> +static void
> +add_missing_phi_nodes_1 (loop_p loop, gimple def_stmt)
> +{
> +  def_operand_p def_p;
> +  ssa_op_iter op_iter;
> +  basic_block bb = gimple_bb (def_stmt);
> +
> +  FOR_EACH_PHI_OR_STMT_DEF (def_p, def_stmt, op_iter, SSA_OP_DEF|SSA_OP_VDEF)
> +    {
> +      edge e;
> +      edge_iterator ei;
> +      tree res, zero, var;
> +      gimple loop_phi, latch_phi, use_stmt;
> +      imm_use_iterator uit;
> +      tree name = DEF_FROM_PTR (def_p);
> +      bool needs_update = false;
> +      VEC (gimple, heap) *phis;
> +      int i;
> +
> +      FOR_EACH_IMM_USE_STMT (use_stmt, uit, name)
> +	{
> +	  basic_block use_bb = gimple_bb (use_stmt);
> +
> +	  if (!dominated_by_p (CDI_DOMINATORS, bb, use_bb))
> +	    {
> +	      needs_update = true;
> +	      BREAK_FROM_IMM_USE_STMT (uit);
> +	    }
> +	}
> +
> +      if (!needs_update)
> +	continue;
> +
> +      var = SSA_NAME_VAR (name);
> +      loop_phi = create_phi_node (var, loop->header);
> +      latch_phi = create_phi_node (var, loop->latch);
> +
> +      create_new_def_for (gimple_phi_result (loop_phi), loop_phi,
> +			  gimple_phi_result_ptr (loop_phi));
> +      create_new_def_for (gimple_phi_result (latch_phi), latch_phi,
> +			  gimple_phi_result_ptr (latch_phi));

Again.

> +      /* The value set to ZERO will never be used in the loop, however we
> +	 have to construct something meaningful for virtual SSA_NAMEs.  */
> +      if (is_gimple_reg (name))
> +	zero = fold_convert (TREE_TYPE (name), integer_zero_node);
> +      else
> +	zero = gimple_default_def (cfun, SSA_NAME_VAR (name));
> +
> +      res = gimple_phi_result (latch_phi);
> +      FOR_EACH_EDGE (e, ei, loop->header->preds)
> +	add_phi_arg (loop_phi, (e == loop_latch_edge (loop) ? res : zero),
> +		     e, UNKNOWN_LOCATION);
> +
> +      res = gimple_phi_result (loop_phi);
> +      FOR_EACH_EDGE (e, ei, loop->latch->preds)
> +	add_phi_arg (latch_phi, res, e, UNKNOWN_LOCATION);
> +
> +      phis = VEC_alloc (gimple, heap, n_basic_blocks);
> +      for (i = 0; i < n_basic_blocks; i++)
> +	VEC_quick_push (gimple, phis, NULL);
> +
> +      VEC_replace (gimple, phis, loop->latch->index, latch_phi);
> +      VEC_replace (gimple, phis, loop->header->index, loop_phi);
> +      add_missing_phi_nodes_2 (loop, name, name, phis);
> +
> +      for (i = 0; i < n_basic_blocks; i++)
> +	{
> +	  gimple phi = VEC_index (gimple, phis, i);
> +
> +	  if (!phi)
> +	    continue;
> +
> +	  FOR_EACH_EDGE (e, ei, BASIC_BLOCK (i)->preds)
> +	    if (!PHI_ARG_DEF_FROM_EDGE (phi, e))
> +	      add_phi_arg (phi, res, e, UNKNOWN_LOCATION);
> +	}
> +
> +      VEC_free (gimple, heap, phis);
> +    }
> +}
> +
> +/* Walks over the code of LOOP and adds the missing phi nodes at
> +   control flow junctions.  When a variable is defined in an outer
> +   loop and used in an inner loop, the definition dominates the use.
> +   After the loop flattening, the inner loop body is directly
> +   reachable from the LOOP->header by using the added edge guarded by
> +   the boolean flag that controls the execution of the back-edge that
> +   was eliminated.  In this case, the use is not dominated by the
> +   definition, and this function adds the missing phi nodes.  */
> +
> +static void
> +add_missing_phi_nodes (loop_p loop)
> +{
> +  gimple_stmt_iterator gsi;
> +  int i, n = loop->num_nodes;
> +  basic_block *bbs = get_loop_body (loop);

So you can even pass this down to add_missing_phi_nodes_2.  Or
even use get_loop_body_in_dom_order and thus only need to walk
adjacent blocks in that array.

> +  for (i = 0; i < n; i++)
> +    {
> +      basic_block bb = bbs[i];
> +
> +      /* LOOP->header dominates all the blocks of the loop body, and
> +	 so we don't have to look at the missing phi nodes for the
> +	 definitions of LOOP->header.  */
> +      if (bb == loop->header)
> +	continue;
> +
> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +	if (!gimple_nop_p (gsi_stmt (gsi)))
> +	  add_missing_phi_nodes_1 (loop, gsi_stmt (gsi));
> +
> +      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +	add_missing_phi_nodes_1 (loop, gsi_stmt (gsi));
> +    }
> +
> +  free (bbs);
> +}
> +
> +/* Removes all the back-edges of LOOP except its own back-edge.
> +   SCRATCH_PAD is used in if-conversion.  */
> +
> +static unsigned
> +flatten_loop (loop_p loop, tree *scratch_pad)
> +{
> +  int i, n = loop->num_nodes;
> +  basic_block *bbs;
> +  VEC (edge, heap) *back_edges;
> +  VEC (basic_block, heap) *loop_body;
> +  edge_iterator ei;
> +  edge e, pred_e;
> +  unsigned max_nb_basic_blocks = PARAM_VALUE (PARAM_LFLAT_MAX_NB_BBS);;
> +
> +  if (loop->num_nodes > max_nb_basic_blocks
> +      || !single_exit (loop)
> +      || !dbg_cnt (lflat))
> +    return 0;
> +
> +  mark_dfs_back_edges ();
> +  bbs = get_loop_body (loop);
> +
> +  back_edges = VEC_alloc (edge, heap, 3);
> +  loop_body = VEC_alloc (basic_block, heap, 3);
> +
> +  for (i = 0; i < n; i++)
> +    FOR_EACH_EDGE (e, ei, bbs[i]->succs)
> +      if (e->flags & EDGE_DFS_BACK
> +	  && e->src != loop->latch)
> +	VEC_safe_push (edge, heap, back_edges, e);
> +
> +  free (bbs);
> +
> +  /* Early return and do not modify the code when there are no back
> +     edges.  */
> +  if (VEC_empty (edge, back_edges))
> +    return 0;
> +
> +  cancel_subloops (loop);
> +
> +  /* Split the latch edge to make sure that the latch basic block does
> +     not contain code.  */
> +  loop->latch = split_edge (loop_latch_edge (loop));
> +  pred_e = single_pred_edge (loop->latch);
> +
> +  FOR_EACH_VEC_ELT (edge, back_edges, i, e)
> +    {
> +      basic_block dest = split_edge (e);
> +
> +      /* Redirect BACK_EDGE to LOOP->latch.  */
> +      redirect_edge_and_branch_force (e, loop->latch);
> +
> +      /* Save the basic block where it was pointing.  */
> +      VEC_safe_push (basic_block, heap, loop_body, dest);
> +    }
> +
> +  update_loop_phi_nodes (loop, pred_e);
> +
> +  FOR_EACH_VEC_ELT (edge, back_edges, i, e)
> +    create_control_flag (e, loop, VEC_index (basic_block, loop_body, i));
> +
> +  FOR_EACH_VEC_ELT (edge, back_edges, i, e)
> +    update_inner_induction_phi_nodes (e, loop, VEC_index (basic_block,
> +							  loop_body, i));
> +
> +  free_dominance_info (CDI_DOMINATORS);
> +  calculate_dominance_info (CDI_DOMINATORS);
> +  add_missing_phi_nodes (loop);
> +
> +  /* If we redirected some back-edges, split the latch edge to create
> +     an empty LOOP->latch.  */
> +  if (!single_pred_p (loop->latch))
> +    loop->latch = split_edge (loop_latch_edge (loop));
> +
> +  if (gate_tree_if_conversion ())
> +    tree_if_conversion (loop, scratch_pad);

You are leaking VECs.  As mentioned above testing the gate isn't
necessary here.

> +  return TODO_update_ssa | TODO_verify_ssa;

These TODOs belong in the pass structure.

> +}
> +
> +/* Flattens all the loops of the current function.  */
> +
> +static unsigned int
> +tree_loop_flattening (void)
> +{
> +  unsigned todo = 0;
> +  loop_p loop;
> +  loop_iterator li;
> +  tree scratch_pad = NULL_TREE;
> +
> +  if (number_of_loops () <= 1)
> +    return 0;
> +
> +  FOR_EACH_LOOP (li, loop, 0)
> +    todo |= flatten_loop (loop, &scratch_pad);

So we might end up recursively flattening loops (or not, as this
walk is in undefined order).  I'd say you want LI_ONLY_INNERMOST here,
or do you really want to flatten all loop trees up to the number
of basic blocks specified in the parm?  I guess not.

I think the pass misses a cost model and I'm still not sure when
or if it will be profitable to do this at all (as said, no
functional testcases).  What's the immediate benefit for GCC 4.6?

> +#ifdef ENABLE_CHECKING
> +  verify_dominators (CDI_DOMINATORS);
> +  verify_flow_info ();
> +#endif
> +
> +  cleanup_tree_cfg ();
> +  return todo;

return TODO_cleanup_cfg, but only if you flattened a loop.  So
return TODO_cleanup_cfg from flatten_loop instead.

Richard.

> +}
> +
> +static bool
> +gate_tree_loop_flattening (void)
> +{
> +  return flag_tree_loop_flattening != 0;
> +}
> +
> +struct gimple_opt_pass pass_flatten_loops =
> +{
> + {
> +  GIMPLE_PASS,
> +  "lflat",				/* name */
> +  gate_tree_loop_flattening,		/* gate */
> +  tree_loop_flattening,       		/* execute */
> +  NULL,					/* sub */
> +  NULL,					/* next */
> +  0,					/* static_pass_number */
> +  TV_TREE_LOOP_FLATTENING,  		/* tv_id */
> +  PROP_cfg | PROP_ssa,			/* properties_required */
> +  0,					/* properties_provided */
> +  0,					/* properties_destroyed */
> +  0,					/* todo_flags_start */
> +  TODO_dump_func
> +    | TODO_update_ssa
> +    | TODO_ggc_collect			/* todo_flags_finish */
> + }
> +};
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index a87a770..e2f257f 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -374,6 +374,7 @@ extern struct gimple_opt_pass pass_graphite;
>  extern struct gimple_opt_pass pass_graphite_transforms;
>  extern struct gimple_opt_pass pass_if_conversion;
>  extern struct gimple_opt_pass pass_loop_distribution;
> +extern struct gimple_opt_pass pass_flatten_loops;
>  extern struct gimple_opt_pass pass_vectorize;
>  extern struct gimple_opt_pass pass_slp_vectorize;
>  extern struct gimple_opt_pass pass_complete_unroll;
> 

-- 
Richard Guenther <rguenther@suse.de>
Novell / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 - GF: Markus Rex


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]