Autoparallelization

Zdenek Dvorak rakdver@atrey.karlin.mff.cuni.cz
Wed Sep 27 21:22:00 GMT 2006


Hello,

I have commited the following patch to parloop branch.  It implements
automatic parallelization for simple loops; at the moment, code
generation only creates static schedule into fixed number of threads
(specified at compile time).  This should more or less be the version
I would like to get merged to 4.3 (I probably won't have time to
implement more features in following few weeks).

Regarding the testing, gcc bootstraps with autoparallelization (into 4
threads) enabled (there are quite a lot parallelizable loops in gcc).
Also, on stream benchmark, I get similar results as with enabling
-fopenmp (that is, about 10% slowdown, since the benchmark is primarily
memory bound :-).  I will of course provide more testing later.

Zdenek

Index: doc/passes.texi
===================================================================
*** doc/passes.texi	(revision 117251)
--- doc/passes.texi	(working copy)
*************** rtl-level loop unswitching in @file{loop
*** 429,438 ****
  the rtl-level pass is not completely redundant yet due to deficiencies
  in tree level alias analysis.
  
- The optimizations also use various utility functions contained in
- @file{tree-ssa-loop-manip.c}, @file{cfgloop.c}, @file{cfgloopanal.c} and
- @file{cfgloopmanip.c}.
- 
  Vectorization.  This pass transforms loops to operate on vector types
  instead of scalar types.  Data parallelism across loop iterations is exploited
  to group data elements from consecutive iterations into a vector and operate 
--- 429,434 ----
*************** The pass is implemented in @file{tree-ve
*** 446,451 ****
--- 442,454 ----
  utilities), @file{tree-vect-analyze.c} and @file{tree-vect-transform.c}.
  Analysis of data references is in @file{tree-data-ref.c}.
  
+ Loop parallelization.  This pass transforms loops in program to run them in
+ several threads.  The pass is implemented in @file{tree-parloops.c}.
+ 
+ The optimizations also use various utility functions contained in
+ @file{tree-ssa-loop-manip.c}, @file{cfgloop.c}, @file{cfgloopanal.c} and
+ @file{cfgloopmanip.c}.
+ 
  @item Tree level if-conversion for vectorizer
  
  This pass applies if-conversion to simple loops to help vectorizer.
Index: doc/invoke.texi
===================================================================
*** doc/invoke.texi	(revision 117251)
--- doc/invoke.texi	(working copy)
*************** Objective-C and Objective-C++ Dialects}.
*** 342,347 ****
--- 342,348 ----
  -fsplit-ivs-in-unroller -funswitch-loops @gol
  -fvariable-expansion-in-unroller @gol
  -ftree-pre  -ftree-ccp  -ftree-dce -ftree-loop-optimize @gol
+ -ftree-parallelize-loops @gol
  -ftree-loop-linear -ftree-loop-im -ftree-loop-ivcanon -fivopts @gol
  -ftree-dominator-opts -ftree-dse -ftree-copyrename -ftree-sink @gol
  -ftree-ch -ftree-sra -ftree-ter -ftree-lrs -ftree-fre -ftree-vectorize @gol
*************** for @option{-Os}, since it usually incre
*** 5116,5121 ****
--- 5117,5126 ----
  Perform loop optimizations on trees.  This flag is enabled by default
  at @option{-O} and higher.
  
+ @item -ftree-parallelize-loops=@var{n}
+ Automatically generate parallel code using the primitives from libgomp.
+ Create parallel code for @var{n} threads.
+ 
  @item -ftree-loop-linear
  Perform linear loop transformations on tree.  This flag can improve cache
  performance and allow further loop optimizations to take place.
Index: ChangeLog.parloop
===================================================================
*** ChangeLog.parloop	(revision 0)
--- ChangeLog.parloop	(revision 0)
***************
*** 0 ****
--- 1,4 ----
+ 2006-09-27  Zdenek Dvorak <dvorakz@suse.cz>
+ 
+ 	Branch created.  Initial commit.
+ 
Index: tree-pass.h
===================================================================
*** tree-pass.h	(revision 117251)
--- tree-pass.h	(working copy)
*************** extern struct tree_opt_pass pass_expand;
*** 294,299 ****
--- 294,300 ----
  extern struct tree_opt_pass pass_rest_of_compilation;
  extern struct tree_opt_pass pass_sink_code;
  extern struct tree_opt_pass pass_fre;
+ extern struct tree_opt_pass pass_parallelize_loops;
  extern struct tree_opt_pass pass_linear_transform;
  extern struct tree_opt_pass pass_copy_prop;
  extern struct tree_opt_pass pass_store_ccp;
Index: cfghooks.c
===================================================================
*** cfghooks.c	(revision 117251)
--- cfghooks.c	(working copy)
*************** verify_flow_info (void)
*** 75,80 ****
--- 75,81 ----
    int err = 0;
    basic_block bb, last_bb_seen;
    basic_block *last_visited;
+   int expected_n_edges;
  
    timevar_push (TV_CFG_VERIFY);
    last_visited = XCNEWVEC (basic_block, last_basic_block);
*************** verify_flow_info (void)
*** 101,106 ****
--- 102,109 ----
        last_bb_seen = bb;
      }
  
+   expected_n_edges = EDGE_COUNT (ENTRY_BLOCK_PTR->succs);
+ 
    /* Now check the basic blocks (boundaries etc.) */
    FOR_EACH_BB_REVERSE (bb)
      {
*************** verify_flow_info (void)
*** 108,113 ****
--- 111,118 ----
        edge e;
        edge_iterator ei;
  
+       expected_n_edges += EDGE_COUNT (bb->succs);
+ 
        if (bb->count < 0)
  	{
  	  error ("verify_flow_info: Wrong count of block %i %i",
*************** verify_flow_info (void)
*** 196,201 ****
--- 201,213 ----
  	}
      }
  
+   if (n_edges != expected_n_edges)
+     {
+       error ("verify_flow_info: %d edges found, %d edges expected",
+ 	     expected_n_edges, n_edges);
+       err = 1;
+     }
+ 
    /* Complete edge checksumming for ENTRY and EXIT.  */
    {
      edge e;
Index: omp-low.c
===================================================================
*** omp-low.c	(revision 117251)
--- omp-low.c	(working copy)
*************** expand_parallel_call (struct omp_region 
*** 2221,2226 ****
--- 2221,2228 ----
  
  	  then_bb = create_empty_bb (cond_bb);
  	  else_bb = create_empty_bb (then_bb);
+ 	  set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
+ 	  set_immediate_dominator (CDI_DOMINATORS, else_bb, cond_bb);
  	  then_lab = create_artificial_label ();
  	  else_lab = create_artificial_label ();
  
*************** expand_omp_parallel (struct omp_region *
*** 2519,2528 ****
        entry_bb = e->dest;
        single_succ_edge (entry_bb)->flags = EDGE_FALLTHRU;
  
!       /* Move the parallel region into CHILD_CFUN.  We need to reset
! 	 dominance information because the expansion of the inner
! 	 regions has invalidated it.  */
!       free_dominance_info (CDI_DOMINATORS);
        new_bb = move_sese_region_to_fn (child_cfun, entry_bb, exit_bb);
        if (exit_bb)
  	single_succ_edge (new_bb)->flags = EDGE_FALLTHRU;
--- 2521,2527 ----
        entry_bb = e->dest;
        single_succ_edge (entry_bb)->flags = EDGE_FALLTHRU;
  
!       /* Move the parallel region into CHILD_CFUN.  */
        new_bb = move_sese_region_to_fn (child_cfun, entry_bb, exit_bb);
        if (exit_bb)
  	single_succ_edge (new_bb)->flags = EDGE_FALLTHRU;
*************** expand_omp_for_generic (struct omp_regio
*** 2711,2716 ****
--- 2710,2724 ----
  
    make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
    make_edge (l2_bb, l3_bb, EDGE_FALSE_VALUE);
+ 
+   set_immediate_dominator (CDI_DOMINATORS, l2_bb,
+ 			   recount_dominator (CDI_DOMINATORS, l2_bb));
+   set_immediate_dominator (CDI_DOMINATORS, l3_bb,
+ 			   recount_dominator (CDI_DOMINATORS, l3_bb));
+   set_immediate_dominator (CDI_DOMINATORS, l0_bb,
+ 			   recount_dominator (CDI_DOMINATORS, l0_bb));
+   set_immediate_dominator (CDI_DOMINATORS, l1_bb,
+ 			   recount_dominator (CDI_DOMINATORS, l1_bb));
  }
  
  
*************** expand_omp_for_static_nochunk (struct om
*** 2881,2886 ****
--- 2889,2900 ----
  
    make_edge (cont_bb, body_bb, EDGE_TRUE_VALUE);
    find_edge (cont_bb, fin_bb)->flags = EDGE_FALSE_VALUE;
+ 
+   set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, entry_bb);
+   set_immediate_dominator (CDI_DOMINATORS, body_bb,
+ 			   recount_dominator (CDI_DOMINATORS, body_bb));
+   set_immediate_dominator (CDI_DOMINATORS, fin_bb,
+ 			   recount_dominator (CDI_DOMINATORS, fin_bb));
  }
  
  
*************** expand_omp_for_static_chunk (struct omp_
*** 3083,3088 ****
--- 3097,3112 ----
    make_edge (cont_bb, trip_update_bb, EDGE_FALSE_VALUE);
  
    make_edge (trip_update_bb, iter_part_bb, EDGE_FALLTHRU);
+ 
+   set_immediate_dominator (CDI_DOMINATORS, trip_update_bb, cont_bb);
+   set_immediate_dominator (CDI_DOMINATORS, iter_part_bb,
+ 			   recount_dominator (CDI_DOMINATORS, iter_part_bb));
+   set_immediate_dominator (CDI_DOMINATORS, fin_bb,
+ 			   recount_dominator (CDI_DOMINATORS, fin_bb));
+   set_immediate_dominator (CDI_DOMINATORS, seq_start_bb,
+ 			   recount_dominator (CDI_DOMINATORS, seq_start_bb));
+   set_immediate_dominator (CDI_DOMINATORS, body_bb,
+ 			   recount_dominator (CDI_DOMINATORS, body_bb));
  }
  
  
*************** expand_omp_sections (struct omp_region *
*** 3277,3282 ****
--- 3301,3318 ----
    e = single_succ_edge (l1_bb);
    redirect_edge_succ (e, l0_bb);
    e->flags = EDGE_FALLTHRU;
+ 
+   set_immediate_dominator (CDI_DOMINATORS, l1_bb,
+ 			   recount_dominator (CDI_DOMINATORS, l1_bb));
+   set_immediate_dominator (CDI_DOMINATORS, l0_bb,
+ 			   recount_dominator (CDI_DOMINATORS, l0_bb));
+   set_immediate_dominator (CDI_DOMINATORS, default_bb,
+ 			   recount_dominator (CDI_DOMINATORS, default_bb));
+   set_immediate_dominator (CDI_DOMINATORS, l2_bb,
+ 			   recount_dominator (CDI_DOMINATORS, l2_bb));
+   for (inner = region->inner; inner; inner = inner->next)
+     set_immediate_dominator (CDI_DOMINATORS, inner->entry,
+ 			     recount_dominator (CDI_DOMINATORS, inner->entry));
  }
  
  
*************** execute_expand_omp (void)
*** 3485,3491 ****
    expand_omp (root_omp_region);
  
    free_dominance_info (CDI_DOMINATORS);
-   free_dominance_info (CDI_POST_DOMINATORS);
    cleanup_tree_cfg ();
  
    free_omp_regions ();
--- 3521,3526 ----
Index: testsuite/gcc.dg/tree-ssa/parallelization-1.c
===================================================================
*** testsuite/gcc.dg/tree-ssa/parallelization-1.c	(revision 0)
--- testsuite/gcc.dg/tree-ssa/parallelization-1.c	(revision 0)
***************
*** 0 ****
--- 1,33 ----
+ /* { dg-do compile } */
+ /* { dg-options "-O2 -ftree-parallelize-loops=4 -fdump-tree-parloops-details -fdump-tree-final_cleanup" } */
+ 
+ void abort (void);
+ 
+ void parloop (int N)
+ {
+   int i;
+   int x[10000000];
+ 
+   for (i = 0; i < N; i++)
+     x[i] = i + 3;
+ 
+   for (i = 0; i < N; i++)
+     {
+       if (x[i] != i + 3)
+ 	abort ();
+     }
+ }
+ 
+ int main(void)
+ {
+   parloop(10000000);
+ 
+   return 0;
+ }
+ 
+ /* Check that the first loop in parloop got parallelized.  */
+ 
+ /* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops" } } */
+ /* { dg-final { scan-tree-dump-times "loopfn" 4 "final_cleanup" } } */
+ /* { dg-final { cleanup-tree-dump "parloops" } } */
+ /* { dg-final { cleanup-tree-dump "final_cleanup" } } */
Index: dominance.c
===================================================================
*** dominance.c	(revision 117251)
--- dominance.c	(working copy)
***************
*** 45,53 ****
  #include "et-forest.h"
  #include "timevar.h"
  
- /* Whether the dominators and the postdominators are available.  */
- enum dom_state dom_computed[2];
- 
  /* We name our nodes with integers, beginning with 1.  Zero is reserved for
     'undefined' or 'end of list'.  The name of each node is given by the dfs
     number of the corresponding basic block.  Please note, that we include the
--- 45,50 ----
*************** static void link_roots (struct dom_info 
*** 123,131 ****
  static void calc_idoms (struct dom_info *, enum cdi_direction);
  void debug_dominance_info (enum cdi_direction);
  
- /* Keeps track of the*/
- static unsigned n_bbs_in_dom_tree[2];
- 
  /* Helper macro for allocating and initializing an array,
     for aesthetic reasons.  */
  #define init_ar(var, type, num, content)			\
--- 120,125 ----
Index: builtins.def
===================================================================
*** builtins.def	(revision 117251)
--- builtins.def	(working copy)
*************** Software Foundation, 51 Franklin Street,
*** 140,146 ****
  #undef DEF_GOMP_BUILTIN
  #define DEF_GOMP_BUILTIN(ENUM, NAME, TYPE, ATTRS) \
    DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE,    \
!                false, true, true, ATTRS, false, flag_openmp)
  
  /* Define an attribute list for math functions that are normally
     "impure" because some of them may write into global memory for
--- 140,147 ----
  #undef DEF_GOMP_BUILTIN
  #define DEF_GOMP_BUILTIN(ENUM, NAME, TYPE, ATTRS) \
    DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE,    \
!                false, true, true, ATTRS, false, \
! 	       (flag_openmp || flag_tree_parallelize_loops))
  
  /* Define an attribute list for math functions that are normally
     "impure" because some of them may write into global memory for
Index: timevar.def
===================================================================
*** timevar.def	(revision 117251)
--- timevar.def	(working copy)
*************** DEFTIMEVAR (TV_SCEV_CONST            , "
*** 107,112 ****
--- 107,113 ----
  DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH    , "tree loop unswitching")
  DEFTIMEVAR (TV_COMPLETE_UNROLL       , "complete unrolling")
  DEFTIMEVAR (TV_TREE_VECTORIZATION    , "tree vectorization")
+ DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
  DEFTIMEVAR (TV_TREE_LINEAR_TRANSFORM , "tree loop linear")
  DEFTIMEVAR (TV_TREE_PREFETCH	     , "tree prefetching")
  DEFTIMEVAR (TV_TREE_LOOP_IVOPTS	     , "tree iv optimization")
Index: tree-ssa-loop.c
===================================================================
*** tree-ssa-loop.c	(revision 117251)
--- tree-ssa-loop.c	(working copy)
*************** struct tree_opt_pass pass_complete_unrol
*** 407,412 ****
--- 407,450 ----
    0					/* letter */
  };
  
+ /* Loop distribution.  */
+ 
+ static bool
+ gate_tree_parallelize_loops (void)
+ {
+   return flag_tree_parallelize_loops != 1;
+ }
+ 
+ static unsigned
+ tree_parallelize_loops (void)
+ {
+   unsigned todo = 0;
+ 
+   if (!current_loops)
+     return 0;
+ 
+   if (parallelize_loops ())
+     todo |= TODO_cleanup_cfg;
+   return todo;
+ }
+ 
+ struct tree_opt_pass pass_parallelize_loops =
+ {
+   "parloops",				/* name */
+   gate_tree_parallelize_loops,		/* gate */
+   tree_parallelize_loops,      		/* execute */
+   NULL,					/* sub */
+   NULL,					/* next */
+   0,					/* static_pass_number */
+   TV_TREE_PARALLELIZE_LOOPS,  		/* tv_id */
+   PROP_cfg | PROP_ssa,			/* properties_required */
+   0,					/* properties_provided */
+   0,					/* properties_destroyed */
+   0,					/* todo_flags_start */
+   TODO_dump_func | TODO_verify_loops,	/* todo_flags_finish */
+   0				        /* letter */	
+ };
+ 
  /* Prefetching.  */
  
  static unsigned int
Index: tree-parloops.c
===================================================================
*** tree-parloops.c	(revision 0)
--- tree-parloops.c	(revision 0)
***************
*** 0 ****
--- 1,1363 ----
+ /* Loop autoparallelization.
+    Copyright (C) 2006 Free Software Foundation, Inc.
+    Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
+    Zdenek Dvorak <dvorakz@suse.cz>.
+ 
+ This file is part of GCC.
+ 
+ GCC is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 2, or (at your option) any later
+ version.
+ 
+ GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ for more details.
+ 
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING.  If not, write to the Free
+ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.  */
+ 
+ #include "config.h"
+ #include "system.h"
+ #include "coretypes.h"
+ #include "tm.h"
+ #include "tree.h"
+ #include "rtl.h"
+ #include "tree-flow.h"
+ #include "cfgloop.h"
+ #include "ggc.h"
+ #include "tree-data-ref.h"
+ #include "diagnostic.h"
+ #include "tree-pass.h"
+ #include "tree-scalar-evolution.h"
+ #include "hashtab.h"
+ #include "langhooks.h"
+ 
+ /* This pass tries to distribute iterations of loops into several threads.
+    The implementation is straightforward -- for each loop we test whether its
+    iterations are independent, and if it is the case (and some additional
+    conditions regarding profitability and correctness are satisfied), we
+    split the loop to a separate function, and generate code to invoke the loop
+    in different threads for different parts of the iteration space;  the most
+    complexity is in the code generation part.
+ 
+    TODO:
+    -- if there are several parallelizable loops in a function, it may be
+       possible to generate the threads just once (using synchronization to
+       ensure that cross-loop dependences are obeyed).
+    -- handling of common scalar dependence patterns (accumulation, ...)
+    -- handling of non-innermost loops  */
+ 
+ /* Minimal number of iterations of a loop that should be executed in each
+    thread.  */
+ #define MIN_PER_THREAD 100
+ 
+ /* Element of hashtable of names to copy.  */
+ 
+ struct name_to_copy_elt
+ {
+   unsigned version;	/* The version of the name to copy.  */
+   tree new_name;	/* The new name used in the copy.  */
+   tree field;		/* The field of the structure used to pass the
+ 			   value.  */
+ };
+ 
+ /* Equality and hash functions for hashtab code.  */
+ 
+ static int
+ name_to_copy_elt_eq (const void *aa, const void *bb)
+ {
+   struct name_to_copy_elt *a = (struct name_to_copy_elt *) aa;
+   struct name_to_copy_elt *b = (struct name_to_copy_elt *) bb;
+ 
+   return a->version == b->version;
+ }
+ 
+ static hashval_t
+ name_to_copy_elt_hash (const void *aa)
+ {
+   struct name_to_copy_elt *a = (struct name_to_copy_elt *) aa;
+ 
+   return (hashval_t) a->version;
+ }
+ 
+ /* Returns true if the iterations of LOOP are independent on each other (that
+    is, if we can execute them in parallel), and if LOOP satisfies other
+    conditions that we need to be able to parallelize it.  Description of number
+    of iterations is stored to NITER.  */
+ 
+ static bool
+ loop_parallel_p (struct loop *loop, struct tree_niter_desc *niter)
+ {
+   edge exit = single_dom_exit (loop);
+   VEC (ddr_p, heap) *dependence_relations;
+   VEC (data_reference_p, heap) *datarefs;
+   lambda_trans_matrix trans;
+   bool ret = false;
+   tree phi;
+ 
+   /* Only consider innermost loops with just one exit.  The innermost-loop
+      restriction is not necessary, but it makes things simpler.  */
+   if (loop->inner || !exit)
+     return false;
+ 
+   if (dump_file && (dump_flags & TDF_DETAILS))
+     fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
+ 
+   /* We need to know # of iterations, and there should be no uses of values
+      defined inside loop outside of it, unless the values are invariants of
+      the loop.  */
+   if (!number_of_iterations_exit (loop, exit, niter, false))
+     {
+       if (dump_file && (dump_flags & TDF_DETAILS))
+ 	fprintf (dump_file, "  FAILED: number of iterations not known\n");
+       return false;
+     }
+ 
+   for (phi = phi_nodes (exit->dest); phi; phi = PHI_CHAIN (phi))
+     {
+       tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+ 
+       if (is_gimple_reg (val)
+ 	  && !expr_invariant_in_loop_p (loop, val))
+ 	{
+ 	  if (dump_file && (dump_flags & TDF_DETAILS))
+ 	    fprintf (dump_file, "  FAILED: value used outside loop\n");
+ 	  return false;
+ 	}
+     }
+ 
+   /* The iterations of the loop may comunicate only through bivs whose
+      iteration space can be distributed efficiently.  */
+   for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
+     {
+       tree def = PHI_RESULT (phi);
+       affine_iv iv;
+ 
+       if (is_gimple_reg (def)
+ 	  && !simple_iv (loop, phi, def, &iv, true))
+ 	{
+ 	  if (dump_file && (dump_flags & TDF_DETAILS))
+ 	    fprintf (dump_file, "  FAILED: non-biv phi node\n");
+ 	  return false;
+ 	}
+     }
+ 
+   datarefs = VEC_alloc (data_reference_p, heap, 10);
+   dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
+ 
+   /* Check for problems with dependences.  If the loop can be reversed,
+      the iterations are indepent.  */
+   compute_data_dependences_for_loop (loop, true, &datarefs,
+ 				     &dependence_relations);
+   if (dump_file && (dump_flags & TDF_DETAILS))
+     dump_data_dependence_relations (dump_file, dependence_relations);
+ 
+   trans = lambda_trans_matrix_new (1, 1);
+   LTM_MATRIX (trans)[0][0] = -1;
+ 
+   if (lambda_transform_legal_p (trans, 1, dependence_relations))
+     {
+       ret = true;
+       if (dump_file && (dump_flags & TDF_DETAILS))
+ 	fprintf (dump_file, "  SUCCESS: may be parallelized\n");
+     }
+   else if (dump_file && (dump_flags & TDF_DETAILS))
+     fprintf (dump_file, "  FAILED: dependence check failed\n");
+ 
+   free_dependence_relations (dependence_relations);
+   free_data_refs (datarefs);
+ 
+   return ret;
+ }
+ 
+ /* Marks all virtual operands of statement STMT for renaming.  */
+ 
+ static void
+ mark_virtual_ops_for_renaming (tree stmt)
+ {
+   ssa_op_iter iter;
+   tree var;
+ 
+   if (TREE_CODE (stmt) == PHI_NODE)
+     {
+       var = PHI_RESULT (stmt);
+       if (is_gimple_reg (var))
+ 	return;
+ 
+       if (TREE_CODE (var) == SSA_NAME)
+ 	var = SSA_NAME_VAR (var);
+       mark_sym_for_renaming (var);
+       return;
+     }
+ 
+   update_stmt (stmt);
+ 
+   FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_ALL_VIRTUALS)
+     {
+       if (TREE_CODE (var) == SSA_NAME)
+ 	var = SSA_NAME_VAR (var);
+       mark_sym_for_renaming (var);
+     }
+ }
+ 
+ /* Calls mark_virtual_ops_for_renaming for all members of LIST.  */
+ 
+ static void
+ mark_virtual_ops_for_renaming_list (tree list)
+ {
+   tree_stmt_iterator tsi;
+ 
+   for (tsi = tsi_start (list); !tsi_end_p (tsi); tsi_next (&tsi))
+     mark_virtual_ops_for_renaming (tsi_stmt (tsi));
+ }
+ 
+ /* Marks operands of calls for renaming.  */
+ 
+ static void
+ mark_call_virtual_operands (void)
+ {
+   basic_block bb;
+   block_stmt_iterator bsi;
+   tree stmt;
+ 
+   FOR_EACH_BB (bb)
+     {
+       for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
+ 	{
+ 	  stmt = bsi_stmt (bsi);
+ 	  if (get_call_expr_in (stmt) != NULL_TREE)
+ 	    mark_new_vars_to_rename (stmt);
+ 	}
+     }
+ }
+ 
+ /* Assigns the address of VAR in TYPE to an ssa name, and returns this name.
+    The assignment statement is placed before LOOP.  DECL_ADDRESS maps decls
+    to their addresses that can be reused.  */
+ 
+ static tree
+ take_address_of (tree var, tree type, struct loop *loop, htab_t decl_address)
+ {
+   int uid = DECL_UID (var);
+   void **dslot;
+   struct int_tree_map ielt, *nielt;
+   tree name, bvar, stmt;
+   edge entry = loop_preheader_edge (loop);
+ 
+   ielt.uid = uid;
+   dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
+   if (!*dslot)
+     {
+       bvar = create_tmp_var (type, get_name (var));
+       add_referenced_var (bvar);
+       stmt = build2 (MODIFY_EXPR, void_type_node, bvar,
+ 		     fold_convert (type,
+ 				   build_addr (var, current_function_decl)));
+       name = make_ssa_name (bvar, stmt);
+       TREE_OPERAND (stmt, 0) = name;
+       bsi_insert_on_edge_immediate_loop (entry, stmt);
+ 
+       nielt = XNEW (struct int_tree_map);
+       nielt->uid = uid;
+       nielt->to = name;
+       *dslot = nielt;
+ 
+       return name;
+     }
+ 
+   name = ((struct int_tree_map *) *dslot)->to;
+   if (TREE_TYPE (name) == type)
+     return name;
+ 
+   bvar = SSA_NAME_VAR (name);
+   stmt = build2 (MODIFY_EXPR, void_type_node, bvar,
+ 		 fold_convert (type, name));
+   name = make_ssa_name (bvar, stmt);
+   TREE_OPERAND (stmt, 0) = name;
+   bsi_insert_on_edge_immediate_loop (entry, stmt);
+ 
+   return name;
+ }
+ 
+ /* Eliminates references to local variables in *TP from LOOP.  DECL_ADDRESS
+    contains addresses for the references for that we have already taken
+    them.  If the expression is changed, CHANGED is set to true.  Callback for
+    walk_tree.  */
+ 
+ struct elv_data
+ {
+   struct loop *loop;
+   htab_t decl_address;
+   bool changed;
+ };
+ 
+ static tree
+ eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
+ {
+   struct elv_data *dta = data;
+   tree t = *tp, var, addr, addr_type, type;
+ 
+   if (DECL_P (t))
+     {
+       *walk_subtrees = 0;
+ 
+       if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
+ 	return NULL_TREE;
+ 
+       type = TREE_TYPE (t);
+       addr_type = build_pointer_type (type);
+       addr = take_address_of (t, addr_type, dta->loop, dta->decl_address);
+       *tp = build1 (INDIRECT_REF, TREE_TYPE (*tp), addr);
+ 
+       dta->changed = true;
+       return NULL_TREE;
+     }
+ 
+   if (TREE_CODE (t) == ADDR_EXPR)
+     {
+       *walk_subtrees = 0;
+ 
+       var = TREE_OPERAND (t, 0);
+       if (DECL_EXTERNAL (var))
+ 	return NULL_TREE;
+ 
+       addr_type = TREE_TYPE (t);
+       addr = take_address_of (var, addr_type, dta->loop, dta->decl_address);
+       *tp = addr;
+ 
+       dta->changed = true;
+       return NULL_TREE;
+     }
+ 
+   if (!EXPR_P (t))
+     *walk_subtrees = 0;
+ 
+   return NULL_TREE;
+ }
+ 
+ /* Moves the references to local variables in STMT from LOOP.  DECL_ADDRESS
+    contains addresses for the references for that we have already taken
+    them.  */
+ 
+ static void
+ eliminate_local_variables_stmt (struct loop *loop, tree stmt,
+ 				htab_t decl_address)
+ {
+   struct elv_data dta;
+ 
+   dta.loop = loop;
+   dta.decl_address = decl_address;
+   dta.changed = false;
+ 
+   walk_tree (&stmt, eliminate_local_variables_1, &dta, NULL);
+ 
+   if (dta.changed)
+     update_stmt (stmt);
+ }
+ 
+ /* Eliminates the references to local variables from LOOP.  This includes:
+ 
+    1) Taking address of a local variable -- these are moved out of the loop
+       (and temporary variable is created to hold the address if necessary).
+    2) Dereferencing a local variable -- these are replaced with indirect
+       references.  */
+ 
+ static void
+ eliminate_local_variables (struct loop *loop)
+ {
+   basic_block bb, *body = get_loop_body (loop);
+   unsigned i;
+   block_stmt_iterator bsi;
+   htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
+ 				     free);
+ 
+   /* Find and rename the ssa names defined outside of loop.  */
+   for (i = 0; i < loop->num_nodes; i++)
+     {
+       bb = body[i];
+ 
+       for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
+ 	eliminate_local_variables_stmt (loop, bsi_stmt (bsi), decl_address);
+     }
+ 
+   htab_delete (decl_address);
+ }
+ 
+ /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
+    The copies are stored to NAME_COPIES, if NAME was already duplicated,
+    its duplicate stored in NAME_COPIES is returned.
+    
+    Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
+    duplicated, storing the copies in DECL_COPIES.  */
+ 
+ static tree
+ separate_decls_in_loop_name (tree name,
+ 			     htab_t name_copies, htab_t decl_copies,
+ 			     bool copy_name_p)
+ {
+   tree copy, var, var_copy;
+   unsigned idx, uid, nuid;
+   struct int_tree_map ielt, *nielt;
+   struct name_to_copy_elt elt, *nelt;
+   void **slot, **dslot;
+ 
+   idx = SSA_NAME_VERSION (name);
+   elt.version = idx;
+   slot = htab_find_slot_with_hash (name_copies, &elt, idx,
+ 				   copy_name_p ? INSERT : NO_INSERT);
+   if (slot && *slot)
+     return ((struct name_to_copy_elt *) *slot)->new_name;
+ 
+   var = SSA_NAME_VAR (name);
+   uid = DECL_UID (var);
+   ielt.uid = uid;
+   dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
+   if (!*dslot)
+     {
+       var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
+       add_referenced_var (var_copy);
+       nielt = XNEW (struct int_tree_map);
+       nielt->uid = uid;
+       nielt->to = var_copy;
+       *dslot = nielt;
+ 
+       /* Ensure that when we meet this decl next time, we won't duplicate
+ 	 it again.  */
+       nuid = DECL_UID (var_copy);
+       ielt.uid = nuid;
+       dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
+       gcc_assert (!*dslot);
+       nielt = XNEW (struct int_tree_map);
+       nielt->uid = nuid;
+       nielt->to = var_copy;
+       *dslot = nielt;
+     }
+   else
+     var_copy = ((struct int_tree_map *) *dslot)->to;
+ 
+   if (copy_name_p)
+     {
+       copy = duplicate_ssa_name (name, NULL_TREE);
+       nelt = XNEW (struct name_to_copy_elt);
+       nelt->version = idx;
+       nelt->new_name = copy;
+       nelt->field = NULL_TREE;
+       *slot = nelt;
+     }
+   else
+     {
+       gcc_assert (!slot);
+       copy = name;
+     }
+ 
+   SSA_NAME_VAR (copy) = var_copy;
+   return copy;
+ }
+ 
+ /* Finds the ssa names used in STMT that are defined outside of LOOP and
+    replaces such ssa names with their duplicates.  The duplicates are stored to
+    NAME_COPIES.  Base decls of all ssa names used in STMT
+    (including those defined in LOOP) are replaced with the new temporary
+    variables; the replacement decls are stored in DECL_COPIES.  */
+ 
+ static void
+ separate_decls_in_loop_stmt (struct loop *loop, tree stmt,
+ 			     htab_t name_copies, htab_t decl_copies)
+ {
+   use_operand_p use;
+   def_operand_p def;
+   ssa_op_iter oi;
+   tree name, copy;
+   bool copy_name_p;
+ 
+   mark_virtual_ops_for_renaming (stmt);
+ 
+   FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
+     {
+       name = DEF_FROM_PTR (def);
+       gcc_assert (TREE_CODE (name) == SSA_NAME);
+       copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
+ 					  false);
+       gcc_assert (copy == name);
+     }
+ 
+   FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
+     {
+       name = USE_FROM_PTR (use);
+       if (TREE_CODE (name) != SSA_NAME)
+ 	continue;
+ 
+       copy_name_p = expr_invariant_in_loop_p (loop, name);
+       copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
+ 					  copy_name_p);
+       SET_USE (use, copy);
+     }
+ }
+ 
+ /* Callback for htab_traverse.  Adds a field corresponding to a ssa name
+    described in SLOT to the type passed in DATA.  */
+ 
+ static int
+ add_field_for_name (void **slot, void *data)
+ {
+   struct name_to_copy_elt *elt = *slot;
+   tree type = data;
+   tree name = ssa_name (elt->version);
+   tree var = SSA_NAME_VAR (name);
+   tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
+ 
+   insert_field_into_struct (type, field);
+   elt->field = field;
+   return 1;
+ }
+ 
+ /* Callback for htab_traverse.  Creates loads to a field of LOAD in LOAD_BB and
+    store to a field of STORE in STORE_BB for the ssa name and its duplicate
+    specified in SLOT.  */
+ 
+ struct clsn_data
+ {
+   tree store;
+   tree load;
+ 
+   basic_block store_bb;
+   basic_block load_bb;
+ };
+ 
+ static int
+ create_loads_and_stores_for_name (void **slot, void *data)
+ {
+   struct name_to_copy_elt *elt = *slot;
+   struct clsn_data *clsn_data = data;
+   tree stmt;
+   block_stmt_iterator bsi;
+   tree type = TREE_TYPE (elt->new_name);
+   tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
+   tree load_struct;
+ 
+   bsi = bsi_last (clsn_data->store_bb);
+   stmt = build2 (MODIFY_EXPR, void_type_node,
+ 		 build3 (COMPONENT_REF, type, clsn_data->store, elt->field,
+ 			 NULL_TREE),
+ 		 ssa_name (elt->version));
+   mark_virtual_ops_for_renaming (stmt);
+   bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
+ 
+   bsi = bsi_last (clsn_data->load_bb);
+   load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
+   stmt = build2 (MODIFY_EXPR, void_type_node,
+ 		 elt->new_name,
+ 		 build3 (COMPONENT_REF, type, load_struct, elt->field,
+ 			 NULL_TREE));
+   SSA_NAME_DEF_STMT (elt->new_name) = stmt;
+   bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
+ 
+   return 1;
+ }
+ 
+ /* Moves all the variables used in LOOP and defined outside of it (including
+    the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
+    name) to a structure created for this purpose.  The code
+  
+    while (1)
+      {
+        use (a);
+        use (b);
+      }
+ 
+    is transformed this way:
+ 
+ bb0:
+    old.a = a;
+    old.b = b;
+ 
+ bb1:
+    a' = new->a;
+    b' = new->b;
+    while (1)
+      {
+        use (a');
+        use (b');
+      }
+ 
+    `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT.  The
+    pointer `new' is intentionally not initialized (the loop will be split to a
+    separate function later, and `new' will be initialized from its arguments).
+    *PER_THREAD is updated to the ssa name to that the value is loaded in
+    bb1.  A mapping of old to new copies is stored to *NAME_COPIES.  */
+ 
+ static void
+ separate_decls_in_loop (struct loop *loop, tree *per_thread,
+ 			tree *arg_struct, tree *new_arg_struct,
+ 			htab_t *name_copies)
+ {
+   basic_block bb1 = loop_split_edge_with (loop_preheader_edge (loop), NULL);
+   basic_block bb0 = single_pred (bb1);
+   htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
+ 				    free);
+   basic_block bb, *body = get_loop_body (loop);
+   unsigned i;
+   tree phi, type, type_name, nvar;
+   block_stmt_iterator bsi;
+   struct clsn_data clsn_data;
+ 
+   *name_copies = htab_create (10, name_to_copy_elt_hash,
+ 			      name_to_copy_elt_eq, free);
+ 
+   /* Find and rename the ssa names defined outside of loop.  */
+   for (i = 0; i < loop->num_nodes; i++)
+     {
+       bb = body[i];
+ 
+       for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
+ 	separate_decls_in_loop_stmt (loop, phi, *name_copies, decl_copies);
+ 
+       for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
+ 	separate_decls_in_loop_stmt (loop, bsi_stmt (bsi), *name_copies,
+ 				     decl_copies);
+     }
+ 
+   if (TREE_CODE (*per_thread) == SSA_NAME)
+     *per_thread
+        = separate_decls_in_loop_name (*per_thread, *name_copies, decl_copies, 
+ 				      true);
+   free (body);
+ 
+   if (htab_elements (*name_copies) == 0)
+     {
+       /* It may happen that there is nothing to copy (if there are only
+ 	 loop carried and external variables in the loop).  */
+       *arg_struct = NULL;
+       *new_arg_struct = NULL;
+     }
+   else
+     {
+       /* Create the type for the structure to store the ssa names to.  */
+       type = lang_hooks.types.make_type (RECORD_TYPE);
+       type_name = build_decl (TYPE_DECL, create_tmp_var_name (".paral_data"),
+ 			      type);
+       TYPE_NAME (type) = type_name;
+ 
+       htab_traverse (*name_copies, add_field_for_name, type);
+       layout_type (type);
+ 
+       /* Create the loads and stores.  */
+       *arg_struct = create_tmp_var (type, ".paral_data_store");
+       add_referenced_var(*arg_struct);
+       nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
+       add_referenced_var(nvar);
+       *new_arg_struct = make_ssa_name (nvar, NULL_TREE);
+ 
+       /* We should mark *arg_struct call clobbered.  However, this means
+ 	 we would need to update virtual operands for every function call.
+ 	 To avoid this, we pretend this variable is volatile.  */
+       TREE_THIS_VOLATILE (*arg_struct) = 1;
+ 
+       clsn_data.store = *arg_struct;
+       clsn_data.load = *new_arg_struct;
+       clsn_data.store_bb = bb0;
+       clsn_data.load_bb = bb1;
+       htab_traverse (*name_copies, create_loads_and_stores_for_name,
+ 		     &clsn_data);
+     }
+ 
+   htab_delete (decl_copies);
+ }
+ 
+ /* Replaces initial values of induction variables in LOOP to the start of the
+    PER_THREAD * (thread number - 1)-th iteration.  STEPS is the hashtable that
+    contains steps of each induction variable.  */
+ 
+ static void
+ shift_ivs_for_iteration (struct loop *loop, tree per_thread, htab_t steps)
+ {
+   tree phi, name, step, ncopy, stmts, call, cdecl, type, init, delta, ninit;
+   edge entry = loop_preheader_edge (loop);
+   struct int_tree_map tem, *elt;
+   unsigned ver;
+   block_stmt_iterator bsi = bsi_last (entry->src);
+   use_operand_p init_p;
+ 
+   for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
+     {
+       name = PHI_RESULT (phi);
+       if (!is_gimple_reg (name))
+ 	continue;
+ 
+       ver = SSA_NAME_VERSION (name);
+       tem.uid = ver;
+       elt = htab_find_with_hash (steps, &tem, ver);
+       step = elt->to;
+ 
+       init_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, entry);
+       init = USE_FROM_PTR (init_p);
+       type = TREE_TYPE (init);
+       cdecl = built_in_decls[BUILT_IN_OMP_GET_THREAD_NUM];
+       call = fold_convert (type, build_function_call_expr (cdecl, NULL));
+       ncopy = build2 (MINUS_EXPR, type, call, build_int_cst (type, 1));
+       delta = fold_build2 (MULT_EXPR, type, unshare_expr (step), per_thread);
+       delta = fold_build2 (MULT_EXPR, type, delta, ncopy);
+       ninit = fold_build2 (PLUS_EXPR, type, init, delta);
+ 
+       ninit = force_gimple_operand (ninit, &stmts, true, NULL_TREE);
+       if (stmts)
+ 	bsi_insert_after (&bsi, stmts, BSI_CONTINUE_LINKING);
+ 
+       SET_USE (init_p, ninit);
+     }
+ }
+ 
+ /* Records steps of induction variables of LOOP to STEPS.  */
+ 
+ static void
+ record_steps (struct loop *loop, htab_t *steps)
+ {
+   tree phi, name;
+   unsigned ver;
+   affine_iv iv;
+   bool ok;
+   void **slot;
+   struct int_tree_map tem, *elt;
+ 
+   *steps = htab_create (10, int_tree_map_hash, int_tree_map_eq, free);
+   for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
+     {
+       name = PHI_RESULT (phi);
+       if (!is_gimple_reg (name))
+ 	continue;
+ 
+       ver = SSA_NAME_VERSION (name);
+       ok = simple_iv (loop, phi, name, &iv, true);
+       gcc_assert (ok);
+ 
+       tem.uid = ver;
+       slot = htab_find_slot_with_hash (*steps, &tem, ver, INSERT);
+       elt = XNEW (struct int_tree_map);
+       elt->uid = ver;
+       elt->to = iv.step;
+       *slot = elt;
+     }
+ }
+ 
+ /* Remaps ssa names in EXPR according to NAME_COPIES, and returns the updated
+    expression.  */
+ 
+ static tree
+ remap_ssa_names (tree expr, htab_t name_copies)
+ {
+   struct name_to_copy_elt tem, *elt;
+   unsigned ver;
+   tree copied;
+   unsigned i, n;
+   tree nop;
+ 
+   if (TREE_CODE (expr) == SSA_NAME)
+     {
+       ver = SSA_NAME_VERSION (expr);
+       tem.version = ver;
+       elt = htab_find_with_hash (name_copies, &tem, ver);
+ 
+       return elt->new_name;
+     }
+ 
+   if (!EXPR_P (expr))
+     return unshare_expr (expr);
+ 
+   copied = copy_node (expr);
+   n = TREE_CODE_LENGTH (TREE_CODE (expr));
+ 
+   for (i = 0; i < n; i++)
+     {
+       nop = remap_ssa_names (TREE_OPERAND (expr, i), name_copies);
+       TREE_OPERAND (copied, i) = nop;
+     }
+ 
+   return copied;
+ }
+ 
+ /* Records steps for induction variables of NLOOP in STEPS, and remap the steps
+    of induction variables in LOOP according to NAME_COPIES.  */
+ 
+ static void
+ record_and_update_steps (struct loop *loop, struct loop *nloop, htab_t steps,
+ 			 htab_t name_copies)
+ {
+   tree phi, nphi, name, nname;
+   unsigned ver, nver;
+   void **slot;
+   struct int_tree_map tem, *elt, *nelt;
+ 
+   /* The corresponding phi nodes in LOOP and in NLOOP should be in the same
+      order.  */
+   for (phi = phi_nodes (loop->header), nphi = phi_nodes (nloop->header); phi;
+        phi = PHI_CHAIN (phi), nphi = PHI_CHAIN (nphi))
+     {
+       name = PHI_RESULT (phi);
+       nname = PHI_RESULT (nphi);
+ 
+       if (!is_gimple_reg (name))
+ 	{
+ 	  gcc_assert (!is_gimple_reg (nname));
+ 	  continue;
+ 	}
+       gcc_assert (is_gimple_reg (nname));
+       ver = SSA_NAME_VERSION (name);
+       nver = SSA_NAME_VERSION (nname);
+ 
+       /* First copy the values from LOOP to NLOOP.  */
+       tem.uid = ver;
+       elt = htab_find_with_hash (steps, &tem, ver);
+       tem.uid = nver;
+       slot = htab_find_slot_with_hash (steps, &tem, nver, INSERT);
+       nelt = XNEW (struct int_tree_map);
+       nelt->uid = nver;
+       nelt->to = unshare_expr (elt->to);
+       *slot = nelt;
+ 
+       /* Then rewrite the old ones.  */
+       elt->to = remap_ssa_names (elt->to, name_copies);
+     }
+   gcc_assert (nphi == NULL_TREE);
+ }
+ 
+ /* Change the exit condition of LOOP so that it exits after PER_THREAD
+    iterations, and remove the old exit.  */
+ 
+ static void
+ change_exit_condition (struct loop *loop, tree per_thread)
+ {
+   basic_block ex_bb = loop->single_exit->src;
+   basic_block ex_to = loop->single_exit->dest;
+   block_stmt_iterator bsi = bsi_last (ex_bb);
+   edge in_loop, e;
+   tree exit_stmt = bsi_stmt (bsi);
+   tree type = TREE_TYPE (per_thread);
+   tree iv, cond;
+ 
+   gcc_assert (TREE_CODE (exit_stmt) == COND_EXPR);
+   bsi_remove (&bsi, true);
+ 
+   in_loop = EDGE_SUCC (ex_bb, 0);
+   if (in_loop == loop->single_exit)
+     in_loop = EDGE_SUCC (ex_bb, 1);
+ 
+   remove_edge (loop->single_exit);
+   in_loop->flags &= ~(EDGE_TRUE_VALUE | EDGE_FALSE_VALUE);
+   in_loop->flags |= EDGE_FALLTHRU;
+ 
+   e = single_pred_edge (loop_split_edge_with (loop_latch_edge (loop), NULL));
+   bsi = bsi_last (e->src);
+   create_iv (build_int_cst (type, 0), build_int_cst (type, 1), NULL_TREE,
+ 	     loop, &bsi, true, NULL, &iv);
+   cond = build3 (COND_EXPR, void_type_node,
+ 		 fold_build2 (LT_EXPR, boolean_type_node, iv, per_thread),
+ 		 build1 (GOTO_EXPR, void_type_node,
+ 			 tree_block_label (e->dest)),
+ 		 build1 (GOTO_EXPR, void_type_node,
+ 			 tree_block_label (ex_to)));
+   bsi_insert_after (&bsi, cond, BSI_NEW_STMT);
+ 
+   e->flags &= ~EDGE_FALLTHRU;
+   e->flags |= EDGE_TRUE_VALUE;
+   loop->single_exit = make_edge (e->src, ex_to, EDGE_FALSE_VALUE);
+ }
+ 
+ /* Bitmap containing uids of functions created by parallelization.  We cannot
+    allocate it from the default obstack, as it must live across compilation
+    of several functions; we make it gc allocated instead.  */
+ 
+ static GTY(()) bitmap parallelized_functions;
+ 
+ /* Returns true if FN was created by create_loop_fn.  */
+ 
+ static bool
+ parallelized_function_p (tree fn)
+ {
+   if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
+     return false;
+ 
+   return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
+ }
+ 
+ /* Creates and returns an empty function to that the body of the loop
+    is later split.  */
+ 
+ static tree
+ create_loop_fn (void)
+ {
+   char buf[100];
+   char *tname;
+   tree decl, type, name, t;
+   struct function *act_cfun = cfun;
+   static unsigned loopfn_num;
+ 
+   snprintf (buf, 100, "%s.$loopfn", current_function_name ());
+   ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
+   clean_symbol_name (tname);
+   name = get_identifier (tname);
+   type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
+ 
+   decl = build_decl (FUNCTION_DECL, name, type);
+   if (!parallelized_functions)
+     parallelized_functions = BITMAP_GGC_ALLOC ();
+   bitmap_set_bit (parallelized_functions, DECL_UID (decl));
+ 
+   TREE_STATIC (decl) = 1;
+   TREE_USED (decl) = 1;
+   DECL_ARTIFICIAL (decl) = 1;
+   DECL_IGNORED_P (decl) = 0;
+   TREE_PUBLIC (decl) = 0;
+   DECL_UNINLINABLE (decl) = 1;
+   DECL_EXTERNAL (decl) = 0;
+   DECL_CONTEXT (decl) = NULL_TREE;
+   DECL_INITIAL (decl) = make_node (BLOCK);
+ 
+   t = build_decl (RESULT_DECL, NULL_TREE, void_type_node);
+   DECL_ARTIFICIAL (t) = 1;
+   DECL_IGNORED_P (t) = 1;
+   DECL_RESULT (decl) = t;
+ 
+   t = build_decl (PARM_DECL, get_identifier (".paral_data_param"),
+ 		  ptr_type_node);
+   DECL_ARTIFICIAL (t) = 1;
+   DECL_ARG_TYPE (t) = ptr_type_node;
+   DECL_CONTEXT (t) = decl;
+   TREE_USED (t) = 1;
+   DECL_ARGUMENTS (decl) = t;
+ 
+   allocate_struct_function (decl);
+ 
+   /* The call to allocate_struct_function clobbers CFUN, so we need to restore
+      it.  */
+   cfun = act_cfun;
+ 
+   return decl;
+ }
+ 
+ /* Extracts LOOP and its preheader to a separate function.  This function is
+    returned in LOOP_FN.  ARG_STRUCT is initialized in the new function from
+    an argument of the function.  The single basic block that replaces LOOP is
+    returned.  */
+ 
+ static basic_block
+ extract_loop_to_function (struct loop *loop, tree arg_struct, tree *loop_fn)
+ {
+   basic_block bb_to = loop_split_edge_with (loop->single_exit, NULL);
+   basic_block bb_from = loop_preheader_edge (loop)->src;
+   basic_block repl_bb, bb;
+   tree arg, narg, stmt;
+   struct function *act_cfun = cfun;
+   tree act_decl = current_function_decl;
+   block_stmt_iterator bsi;
+   basic_block *body = get_loop_body (loop);
+   struct loop *outer = loop->outer;
+   unsigned i, n = loop->num_nodes;
+   stmt_ann_t ann;
+ 
+   cancel_loop_tree (current_loops, loop);
+   for (i = 0; i < n; i++)
+     remove_bb_from_loops (body[i]);
+   free (body);
+   remove_bb_from_loops (bb_from);
+   remove_bb_from_loops (bb_to);
+ 
+   bsi = bsi_last (bb_to);
+   bsi_insert_after (&bsi, build1 (RETURN_EXPR, void_type_node, NULL),
+ 		    BSI_NEW_STMT);
+ 
+   *loop_fn = create_loop_fn ();
+   repl_bb = move_sese_region_to_fn (DECL_STRUCT_FUNCTION (*loop_fn),
+ 				    bb_from, bb_to);
+   add_bb_to_loop (repl_bb, outer);
+ 
+   cfun = DECL_STRUCT_FUNCTION (*loop_fn);
+   current_function_decl = *loop_fn;
+ 
+   /* Initialize the arg_struct.  */
+   if (arg_struct)
+     {
+       arg = DECL_ARGUMENTS (*loop_fn);
+       add_referenced_var (arg);
+       narg = make_ssa_name (arg, build_empty_stmt ());
+       set_default_def (arg, narg);
+ 
+       bsi = bsi_after_labels (single_succ (ENTRY_BLOCK_PTR));
+       stmt = build2 (MODIFY_EXPR, void_type_node, arg_struct,
+ 		     fold_convert (TREE_TYPE (arg_struct), narg));
+       SSA_NAME_DEF_STMT (arg_struct) = stmt;
+       bsi_insert_before (&bsi, stmt, BSI_NEW_STMT);
+     }
+ 
+   go_out_of_ssa ();
+ 
+   /* Let us pretend that we have never seen the statements before.  The
+      operands of the statements are allocated from the local caches, so
+      we cannot preserve them.  */
+   FOR_EACH_BB (bb)
+     {
+       for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
+ 	{
+ 	  ann = stmt_ann (bsi_stmt (bsi));
+ 	  memset (ann, 0, sizeof (struct stmt_ann_d));
+ 	  ann->common.type = STMT_ANN;
+ 	  ann->modified = 1;
+ 	  ann->bb = bb;
+ 	}
+     }
+ 
+   cfun = act_cfun;
+   current_function_decl = act_decl;
+ 
+   return repl_bb;
+ }
+ 
+ /* Emits the code to call LOOP_FN with argument ARG in N_THREADS threads
+    after BSI.  */
+ 
+ static void
+ call_in_parallel (block_stmt_iterator bsi, tree loop_fn, tree arg,
+ 		  unsigned n_threads)
+ {
+   tree start_decl = built_in_decls[BUILT_IN_GOMP_PARALLEL_START];
+   tree args, stmts, adata;
+ 
+   args = tree_cons (NULL, build_int_cst (unsigned_type_node, n_threads),
+ 		    NULL);
+   if (arg)
+     {
+       adata = build_fold_addr_expr (arg);
+       mark_call_clobbered (arg, ESCAPE_TO_CALL);
+     }
+   else
+     adata = null_pointer_node;
+   args = tree_cons (NULL, adata, args);
+   args = tree_cons (NULL, build_fold_addr_expr (loop_fn), args);
+   force_gimple_operand (build_function_call_expr (start_decl, args),
+ 			&stmts, false, NULL_TREE);
+   mark_virtual_ops_for_renaming_list (stmts);
+   bsi_insert_after (&bsi, stmts, BSI_CONTINUE_LINKING);
+ }
+ 
+ /* Makes the induction variables in LOOP start at
+    (N_THREADS - 1) * PER_THREAD-th iteration when the LOOP is entered through
+    NEW_ENTRY.  STEPS describe the steps of induction variables in LOOP.  */
+ 
+ static void
+ shift_ivs_for_remaining_iterations (struct loop *loop, tree per_thread,
+ 				    unsigned n_threads, edge new_entry,
+ 				    htab_t steps)
+ {
+   tree phi, name, nphi;
+   unsigned ver;
+   struct int_tree_map tem, *elt;
+   basic_block bb = new_entry->dest;
+   edge old_entry, entry;
+   tree stmts, new_init, init, pass, type, delta;
+   use_operand_p init_p;
+ 
+   old_entry = EDGE_PRED (bb, 0);
+   if (old_entry == new_entry)
+     old_entry = EDGE_PRED (bb, 1);
+   entry = loop_preheader_edge (loop);
+ 
+   for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
+     {
+       name = PHI_RESULT (phi);
+       if (!is_gimple_reg (name))
+ 	continue;
+ 
+       ver = SSA_NAME_VERSION (name);
+       tem.uid = ver;
+       elt = htab_find_with_hash (steps, &tem, ver);
+ 
+       nphi = create_phi_node (SSA_NAME_VAR (name), bb);
+       pass = PHI_RESULT (nphi);
+ 
+       init_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, entry);
+       init = USE_FROM_PTR (init_p);
+       SET_USE (init_p, pass);
+ 
+       add_phi_arg (nphi, init, old_entry);
+       type = TREE_TYPE (name);
+       delta = fold_build2 (MULT_EXPR, type,
+ 			   build_int_cst (type, n_threads -  1),
+ 			   per_thread);
+       delta = fold_build2 (MULT_EXPR, type, delta, unshare_expr (elt->to));
+       new_init = fold_build2 (PLUS_EXPR, type, init, delta);
+ 
+       new_init = force_gimple_operand (new_init, &stmts, true, NULL_TREE);
+       if (stmts)
+ 	bsi_insert_on_edge_immediate_loop (new_entry, stmts);
+       add_phi_arg (nphi, new_init, new_entry);
+     }
+ }
+ 
+ /* If PAR is true, emit the code on E to finalize the threads.  */
+ 
+ static void
+ finalize_threads (edge e, tree par)
+ {
+   basic_block cond_bb = loop_split_edge_with (e, NULL);
+   basic_block fin_bb = loop_split_edge_with (single_succ_edge (cond_bb), NULL);
+   basic_block cont_bb = single_succ (fin_bb);
+   block_stmt_iterator bsi = bsi_last (cond_bb);
+   tree decl = built_in_decls[BUILT_IN_GOMP_PARALLEL_END];
+   tree stmt = build_function_call_expr (decl, NULL);
+   edge te, fe, be;
+   tree phi;
+ 
+   bsi_insert_after (&bsi,
+ 		    build3 (COND_EXPR, void_type_node,
+ 			    par,
+ 			    build1 (GOTO_EXPR, void_type_node,
+ 				    tree_block_label (fin_bb)),
+ 			    build1 (GOTO_EXPR, void_type_node,
+ 				    tree_block_label (cont_bb))),
+ 		    BSI_NEW_STMT);
+   te = single_succ_edge (cond_bb);
+   te->flags = EDGE_TRUE_VALUE;
+   be = single_succ_edge (fin_bb);
+   fe = make_edge (cond_bb, cont_bb, EDGE_FALSE_VALUE);
+ 
+   for (phi = phi_nodes (cont_bb); phi != NULL_TREE; phi = PHI_CHAIN (phi))
+     add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (phi, be), fe);
+ 
+   set_immediate_dominator (CDI_DOMINATORS, cont_bb,
+ 			   recount_dominator (CDI_DOMINATORS, cont_bb));
+   bsi = bsi_last (fin_bb);
+   mark_virtual_ops_for_renaming (stmt);
+   bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
+ }
+ 
+ /* Generates code to execute the iterations of LOOP in N_THREADS threads in
+    parallel.  NITER describes number of iterations of LOOP.  */
+ 
+ static void
+ gen_parallel_loop (struct loop *loop, unsigned n_threads,
+ 		   struct tree_niter_desc *niter)
+ {
+   struct loop *nloop;
+   tree many_iterations_cond, type, per_thread, new_per_thread;
+   tree stmts, arg_struct, new_arg_struct, loop_fn, par_flag, phi;
+   basic_block repl_bb, npre;
+   htab_t steps, name_copies;
+   edge orig_entry;
+ 
+   /* From
+ 
+      ---------------------------------------------------------------------
+      loop
+        {
+ 	 IV = phi (INIT, IV + STEP)
+ 	 BODY1;
+ 	 if (COND)
+ 	   break;
+ 	 BODY2;
+        }
+      ---------------------------------------------------------------------
+ 
+      with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
+      we generate the following code:
+ 
+      ---------------------------------------------------------------------
+      parallelized = false;
+ 
+      if (MAY_BE_ZERO
+ 	 || NITER < MIN_PER_THREAD * N_THREADS)
+        goto rest;
+ 
+      per_thread = NITER / N_THREADS;
+      store all local loop-invariant variables (including per_thread in case it
+        is not a constant) used in body of the loop to DATA.
+      __builtin_GOMP_parallel_start (loop_fn, &DATA, N_THREADS);
+      INIT += STEP * per_thread * (N_THREADS - 1);
+      parallelized = true;
+ 
+      rest:
+      loop
+        {
+ 	 IV = phi (INIT, IV + STEP)
+ 	 BODY1;
+ 	 if (COND)
+ 	   break;
+ 	 BODY2;
+        }
+ 
+      if (parallelized)
+        __builtin_GOMP_parallel_end ();
+      ---------------------------------------------------------------------
+ 
+      With the function
+ 
+      ---------------------------------------------------------------------
+      void loop_fn (void *data)
+        {
+ 	 load all local loop-invariant variables used in body of the loop
+ 	   from data
+ 	 INIT += STEP * per_thread * (thread_id - 1);
+ 
+ 	 loop
+ 	   {
+ 	     IV = phi (INIT, IV + STEP)
+ 	     NEW_CTR = phi (0, NEW_CTR');
+ 	     BODY1;
+ 	     BODY2;
+ 	     NEW_CTR' = NEW_CTR + 1;
+ 	     if (NEW_CTR' == per_thread)
+ 	       return;
+ 	   }
+        }
+      ---------------------------------------------------------------------
+    */
+ 
+   /* Record the steps of the induction variables.  */
+   record_steps (loop, &steps);
+ 
+   /* Create two versions of the loop -- in the old one, we know that the
+      number of iterations is large enough, and we will transform it into the
+      loop that will be split to loop_fn, the new one will be used for the
+      remaining iterations.  */
+   
+   type = TREE_TYPE (niter->niter);
+   many_iterations_cond =
+ 	  fold_build2 (GE_EXPR, boolean_type_node,
+ 		       niter->niter,
+ 		       build_int_cst (type, MIN_PER_THREAD * n_threads));
+   many_iterations_cond
+ 	  = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+ 			 invert_truthvalue (niter->may_be_zero),
+ 			 many_iterations_cond);
+   many_iterations_cond
+ 	  = force_gimple_operand (unshare_expr (many_iterations_cond),
+ 				  &stmts, false, NULL_TREE);
+   if (stmts)
+     bsi_insert_on_edge_immediate_loop (loop_preheader_edge (loop), stmts);
+   if (!is_gimple_condexpr (many_iterations_cond))
+     {
+       many_iterations_cond
+ 	      = force_gimple_operand (many_iterations_cond, &stmts,
+ 				      true, NULL_TREE);
+       if (stmts)
+ 	bsi_insert_on_edge_immediate_loop (loop_preheader_edge (loop), stmts);
+     }
+ 
+   initialize_original_copy_tables ();
+   nloop = loop_version (current_loops, loop, many_iterations_cond, NULL, true);
+   update_ssa (TODO_update_ssa);
+   free_original_copy_tables ();
+ 
+   /* Compute number of iterations per thread.  */
+   per_thread = fold_build2 (FLOOR_DIV_EXPR, type, niter->niter,
+ 			    build_int_cst (type, n_threads));
+   per_thread = force_gimple_operand (unshare_expr (per_thread), &stmts, true,
+ 				     NULL_TREE);
+   if (stmts)
+     bsi_insert_on_edge_immediate_loop (loop_preheader_edge (loop), stmts);
+ 
+   /* Eliminate the references to local variables from the loop.  */
+   eliminate_local_variables (loop);
+ 
+   /* In the old loop, move all variables non-local to the loop to a structure
+      and back, and create separate decls for the variables used in loop.  */
+   new_per_thread = per_thread;
+   separate_decls_in_loop (loop, &new_per_thread, &arg_struct, &new_arg_struct,
+ 			  &name_copies);
+ 
+   /* Record the steps of induction variables in the copy, and remap those
+      in the original loop.  */
+   record_and_update_steps (loop, nloop, steps, name_copies);
+ 
+   /* Update the initial values of induction variables.  */
+   shift_ivs_for_iteration (loop, new_per_thread, steps);
+ 
+   /* Add the new exit condition.  */
+   change_exit_condition (loop, new_per_thread);
+ 
+   /* Split the loop to the new function.  */
+   repl_bb = extract_loop_to_function (loop, new_arg_struct, &loop_fn);
+   cgraph_add_new_function (loop_fn);
+ 
+   /* Call the builtin to run the function in several threads.  */
+   call_in_parallel (bsi_last (repl_bb), loop_fn, arg_struct, n_threads);
+ 
+   /* Redirect the exit edge to the versioned copy, and set the parallelized
+      flag.  */
+   par_flag = create_tmp_var (boolean_type_node, "parallelized");
+   add_referenced_var (par_flag);
+ 
+   npre = loop_preheader_edge (nloop)->src;
+   gcc_assert (phi_nodes (npre) == NULL);
+ 
+   orig_entry = single_pred_edge (npre);
+   redirect_edge_and_branch (single_succ_edge (repl_bb),
+ 			    loop_preheader_edge (nloop)->src);
+   phi = create_phi_node (par_flag, npre);
+   add_phi_arg (phi, boolean_false_node, orig_entry);
+   add_phi_arg (phi, boolean_true_node, single_succ_edge (repl_bb));
+   par_flag = PHI_RESULT (phi);
+ 
+   /* Update the initial values of induction variables in the versioned
+      copy.  */
+   shift_ivs_for_remaining_iterations (nloop, per_thread, n_threads,
+ 				      single_succ_edge (repl_bb), steps);
+ 
+   /* Finalize the threads after the loop if necessary.  */
+   finalize_threads (nloop->single_exit, par_flag);
+ 
+   htab_delete (name_copies);
+   htab_delete (steps);
+   scev_reset ();
+ 
+   /* We created a new call clobbered variable.  This means every call in the
+      function gets a new virtual operand.  However, we cannot rerun alias
+      analysis after vectorizer (or at least, passes.c claims so), thus we
+      must mark them for renaming manually.  */
+   mark_call_virtual_operands ();
+   update_ssa (TODO_update_ssa_only_virtuals);
+ }
+ 
+ /* Detect parallel loops and generate parallel code using libgomp
+    primitives.  */
+ 
+ bool
+ parallelize_loops (void)
+ {
+   unsigned i, n_threads = flag_tree_parallelize_loops;
+   unsigned n = current_loops->num;
+   bool changed = false;
+   struct loop *loop;
+   struct tree_niter_desc niter_desc;
+ 
+   /* Do not parallelize loops in the functions created by parallelization.  */
+   if (parallelized_function_p (cfun->decl))
+     return false;
+ 
+   for (i = 1; i < n; i++)
+     {
+       loop = current_loops->parray[i];
+       if (!loop
+ 	  /* Do not bother with loops in cold areas.  */
+ 	  || !maybe_hot_bb_p (loop->header)
+ 	  /* Or loops that roll too little.  */
+ 	  || expected_loop_iterations (loop) <= n_threads
+ 	  /* And of course, the loop must be parallelizable.  */
+ 	  || !can_duplicate_loop_p (loop)
+ 	  || !loop_parallel_p (loop, &niter_desc))
+ 	continue;
+ 
+       changed = true;
+       gen_parallel_loop (loop, n_threads, &niter_desc);
+       verify_flow_info ();
+       verify_dominators (CDI_DOMINATORS);
+       verify_loop_structure (current_loops);
+       verify_loop_closed_ssa ();
+     }
+ 
+   return changed;
+ }
+ 
+ #include "gt-tree-parloops.h"
Index: gimplify.c
===================================================================
*** gimplify.c	(revision 117251)
--- gimplify.c	(working copy)
*************** force_gimple_operand (tree expr, tree *s
*** 6348,6356 ****
    if (var)
      expr = build2 (MODIFY_EXPR, TREE_TYPE (var), var, expr);
  
!   ret = gimplify_expr (&expr, stmts, NULL,
! 		       gimple_test_f, fb_rvalue);
!   gcc_assert (ret != GS_ERROR);
  
    if (referenced_vars)
      {
--- 6348,6364 ----
    if (var)
      expr = build2 (MODIFY_EXPR, TREE_TYPE (var), var, expr);
  
!   if (TREE_TYPE (expr) == void_type_node)
!     {
!       gimplify_and_add (expr, stmts);
!       expr = NULL_TREE;
!     }
!   else
!     {
!       ret = gimplify_expr (&expr, stmts, NULL,
! 			   gimple_test_f, fb_rvalue);
!       gcc_assert (ret != GS_ERROR);
!     }
  
    if (referenced_vars)
      {
Index: common.opt
===================================================================
*** common.opt	(revision 117251)
--- common.opt	(working copy)
*************** ftree-loop-optimize
*** 973,978 ****
--- 973,982 ----
  Common Report Var(flag_tree_loop_optimize) Init(1)
  Enable loop optimizations on tree level
  
+ ftree-parallelize-loops=
+ Common Report Joined UInteger Var(flag_tree_parallelize_loops) Init(1)
+ Enable automatic parallelization of loops
+ 
  ftree-pre
  Common Report Var(flag_tree_pre)
  Enable SSA-PRE optimization on trees
Index: tree-outof-ssa.c
===================================================================
*** tree-outof-ssa.c	(revision 117251)
--- tree-outof-ssa.c	(working copy)
*************** remove_ssa_form (var_map map, int flags)
*** 2405,2413 ****
  	}
      }
  
-   /* we no longer maintain the SSA operand cache at this point.  */
-   fini_ssa_operands ();
- 
    /* If any copies were inserted on edges, analyze and insert them now.  */
    perform_edge_inserts ();
  }
--- 2405,2410 ----
*************** insert_backedge_copies (void)
*** 2500,2505 ****
--- 2497,2517 ----
      }
  }
  
+ /* Rewrites the current function out of SSA form, leaving it in gimple
+    and not freeing any structures.  */
+ 
+ void
+ go_out_of_ssa (void)
+ {
+   var_map map;
+ 
+   insert_backedge_copies ();
+   eliminate_virtual_phis ();
+   map = create_ssa_var_map (0);
+   remove_ssa_form (map, 0);
+   delete_var_map (map);
+ }
+ 
  /* Take the current function out of SSA form, as described in
     R. Morgan, ``Building an Optimizing Compiler'',
     Butterworth-Heinemann, Boston, MA, 1998. pp 176-186.  */
*************** rewrite_out_of_ssa (void)
*** 2540,2545 ****
--- 2552,2560 ----
  
    remove_ssa_form (map, ssa_flags);
  
+   /* We no longer maintain the SSA operand cache at this point.  */
+   fini_ssa_operands ();
+ 
    if (dump_file && (dump_flags & TDF_DETAILS))
      dump_tree_cfg (dump_file, dump_flags & ~TDF_DETAILS);
  
Index: tree-flow.h
===================================================================
*** tree-flow.h	(revision 117251)
--- tree-flow.h	(working copy)
*************** void mark_sym_for_renaming (tree);
*** 713,718 ****
--- 713,719 ----
  void mark_set_for_renaming (bitmap);
  tree get_current_def (tree);
  void set_current_def (tree, tree);
+ void go_out_of_ssa (void);
  
  /* In tree-ssa-ccp.c  */
  bool fold_stmt (tree *);
*************** unsigned int tree_unroll_loops_completel
*** 805,810 ****
--- 806,812 ----
  unsigned int tree_ssa_prefetch_arrays (struct loops *);
  unsigned int remove_empty_loops (struct loops *);
  void tree_ssa_iv_optimize (struct loops *);
+ bool parallelize_loops (void);
  
  bool number_of_iterations_exit (struct loop *, edge,
  				struct tree_niter_desc *niter, bool);
Index: Makefile.in
===================================================================
*** Makefile.in	(revision 117251)
--- Makefile.in	(working copy)
*************** OBJS-common = \
*** 986,992 ****
   tree-vectorizer.o tree-vect-analyze.o tree-vect-transform.o		   \
   tree-vect-patterns.o tree-ssa-loop-prefetch.o				   \
   tree-ssa-loop-ivcanon.o tree-ssa-propagate.o tree-ssa-address.o	   \
!  tree-ssa-math-opts.o							   \
   tree-ssa-loop-ivopts.o tree-if-conv.o tree-ssa-loop-unswitch.o		   \
   alias.o bb-reorder.o bitmap.o builtins.o caller-save.o calls.o	  	   \
   cfg.o cfganal.o cfgbuild.o cfgcleanup.o cfglayout.o cfgloop.o		   \
--- 986,992 ----
   tree-vectorizer.o tree-vect-analyze.o tree-vect-transform.o		   \
   tree-vect-patterns.o tree-ssa-loop-prefetch.o				   \
   tree-ssa-loop-ivcanon.o tree-ssa-propagate.o tree-ssa-address.o	   \
!  tree-ssa-math-opts.o tree-parloops.o					   \
   tree-ssa-loop-ivopts.o tree-if-conv.o tree-ssa-loop-unswitch.o		   \
   alias.o bb-reorder.o bitmap.o builtins.o caller-save.o calls.o	  	   \
   cfg.o cfganal.o cfgbuild.o cfgcleanup.o cfglayout.o cfgloop.o		   \
*************** tree-loop-linear.o: tree-loop-linear.c $
*** 2104,2109 ****
--- 2104,2112 ----
     $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
     tree-pass.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) $(LAMBDA_H) \
     $(TARGET_H) tree-chrec.h
+ tree-parloops.o: tree-parloops.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
+    $(TREE_FLOW_H) $(TREE_H) $(RTL_H) $(CFGLOOP_H) $(TREE_DATA_REF_H) $(GGC_H) \
+    $(DIAGNOSTIC_H) tree-pass.h $(SCEV_H) langhooks.h gt-tree-parloops.h
  tree-stdarg.o: tree-stdarg.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
     $(TREE_H) $(FUNCTION_H) $(DIAGNOSTIC_H) $(TREE_FLOW_H) tree-pass.h \
     tree-stdarg.h $(TARGET_H) langhooks.h
*************** GTFILES = $(srcdir)/input.h $(srcdir)/co
*** 2866,2872 ****
    $(srcdir)/tree-ssa-operands.h $(srcdir)/tree-ssa-operands.c \
    $(srcdir)/tree-profile.c $(srcdir)/tree-nested.c \
    $(srcdir)/ipa-reference.c $(srcdir)/tree-ssa-structalias.h \
!   $(srcdir)/tree-ssa-structalias.c \
    $(srcdir)/c-pragma.h $(srcdir)/omp-low.c \
    $(srcdir)/targhooks.c $(srcdir)/cgraphunit.c $(out_file) \
    @all_gtfiles@
--- 2869,2875 ----
    $(srcdir)/tree-ssa-operands.h $(srcdir)/tree-ssa-operands.c \
    $(srcdir)/tree-profile.c $(srcdir)/tree-nested.c \
    $(srcdir)/ipa-reference.c $(srcdir)/tree-ssa-structalias.h \
!   $(srcdir)/tree-ssa-structalias.c $(srcdir)/tree-parloops.c \
    $(srcdir)/c-pragma.h $(srcdir)/omp-low.c \
    $(srcdir)/targhooks.c $(srcdir)/cgraphunit.c $(out_file) \
    @all_gtfiles@
*************** gt-tree-mudflap.h gt-tree-vect-generic.h
*** 2899,2905 ****
  gt-tree-profile.h gt-tree-ssa-address.h \
  gt-tree-ssanames.h gt-tree-iterator.h gt-gimplify.h \
  gt-tree-phinodes.h gt-tree-nested.h \
! gt-tree-ssa-operands.h gt-tree-ssa-propagate.h \
  gt-tree-ssa-structalias.h gt-ipa-inline.h gt-cgraphunit.h \
  gt-stringpool.h gt-targhooks.h gt-omp-low.h : s-gtype ; @true
  
--- 2902,2908 ----
  gt-tree-profile.h gt-tree-ssa-address.h \
  gt-tree-ssanames.h gt-tree-iterator.h gt-gimplify.h \
  gt-tree-phinodes.h gt-tree-nested.h \
! gt-tree-ssa-operands.h gt-tree-ssa-propagate.h gt-tree-parloops.h \
  gt-tree-ssa-structalias.h gt-ipa-inline.h gt-cgraphunit.h \
  gt-stringpool.h gt-targhooks.h gt-omp-low.h : s-gtype ; @true
  
Index: basic-block.h
===================================================================
*** basic-block.h	(revision 117251)
--- basic-block.h	(working copy)
*************** enum bb_flags
*** 355,360 ****
--- 355,369 ----
  #define BB_COPY_PARTITION(dstbb, srcbb) \
    BB_SET_PARTITION (dstbb, BB_PARTITION (srcbb))
  
+ /* State of dominance information.  */
+ 
+ enum dom_state
+ {
+   DOM_NONE,		/* Not computed at all.  */
+   DOM_NO_FAST_QUERY,	/* The data is OK, but the fast query data are not usable.  */
+   DOM_OK		/* Everything is ok.  */
+ };
+ 
  /* A structure to group all the per-function control flow graph data.
     The x_* prefixing is necessary because otherwise references to the
     fields of this struct are interpreted as the defines for backward
*************** struct control_flow_graph GTY(())
*** 387,392 ****
--- 396,407 ----
      PROFILE_GUESSED,
      PROFILE_READ
    } x_profile_status;
+ 
+   /* Number of basic blocks in dominance tree in each direction.  */
+   unsigned x_n_bbs_in_dom_tree[2];
+ 
+   /* State of the dominance information.  */
+   enum dom_state x_dom_computed[2];
  };
  
  /* Defines for accessing the fields of the CFG structure for function FN.  */
*************** enum cdi_direction
*** 958,971 ****
    CDI_POST_DOMINATORS
  };
  
! enum dom_state
! {
!   DOM_NONE,		/* Not computed at all.  */
!   DOM_NO_FAST_QUERY,	/* The data is OK, but the fast query data are not usable.  */
!   DOM_OK		/* Everything is ok.  */
! };
! 
! extern enum dom_state dom_computed[2];
  
  extern bool dom_info_available_p (enum cdi_direction);
  extern void calculate_dominance_info (enum cdi_direction);
--- 973,980 ----
    CDI_POST_DOMINATORS
  };
  
! #define dom_computed cfun->cfg->x_dom_computed
! #define n_bbs_in_dom_tree cfun->cfg->x_n_bbs_in_dom_tree
  
  extern bool dom_info_available_p (enum cdi_direction);
  extern void calculate_dominance_info (enum cdi_direction);
Index: tree-cfg.c
===================================================================
*** tree-cfg.c	(revision 117251)
--- tree-cfg.c	(working copy)
*************** init_empty_tree_cfg (void)
*** 146,151 ****
--- 146,156 ----
    SET_BASIC_BLOCK (EXIT_BLOCK, EXIT_BLOCK_PTR);
    ENTRY_BLOCK_PTR->next_bb = EXIT_BLOCK_PTR;
    EXIT_BLOCK_PTR->prev_bb = ENTRY_BLOCK_PTR;
+ 
+   dom_computed[CDI_DOMINATORS] = DOM_NONE;
+   dom_computed[CDI_POST_DOMINATORS] = DOM_NONE;
+   n_bbs_in_dom_tree[CDI_DOMINATORS] = 0;
+   n_bbs_in_dom_tree[CDI_POST_DOMINATORS] = 0;
  }
  
  /*---------------------------------------------------------------------------
*************** move_stmt_r (tree *tp, int *walk_subtree
*** 4596,4603 ****
  
        p->remap_decls_p = save_remap_decls_p;
      }
!   else if (DECL_P (t) && DECL_CONTEXT (t) == p->from_context)
      {
        if (TREE_CODE (t) == LABEL_DECL)
  	{
  	  if (p->new_label_map)
--- 4601,4612 ----
  
        p->remap_decls_p = save_remap_decls_p;
      }
!   else if ((DECL_P (t) && DECL_CONTEXT (t) == p->from_context)
! 	   || TREE_CODE (t) == SSA_NAME)
      {
+       if (TREE_CODE (t) == SSA_NAME)
+ 	t = SSA_NAME_VAR (t);
+ 
        if (TREE_CODE (t) == LABEL_DECL)
  	{
  	  if (p->new_label_map)
*************** move_stmt_r (tree *tp, int *walk_subtree
*** 4618,4630 ****
  	  if (TREE_CODE (t) == VAR_DECL)
  	    {
  	      struct function *f = DECL_STRUCT_FUNCTION (p->to_context);
- 	      f->unexpanded_var_list
- 		= tree_cons (0, t, f->unexpanded_var_list);
  
! 	      /* Mark T to be removed from the original function,
! 	         otherwise it will be given a DECL_RTL when the
! 		 original function is expanded.  */
! 	      bitmap_set_bit (p->vars_to_remove, DECL_UID (t));
  	    }
  	}
      }
--- 4627,4643 ----
  	  if (TREE_CODE (t) == VAR_DECL)
  	    {
  	      struct function *f = DECL_STRUCT_FUNCTION (p->to_context);
  
! 	      if (!bitmap_bit_p (p->vars_to_remove, DECL_UID (t)))
! 		{
! 		  f->unexpanded_var_list
! 			  = tree_cons (0, t, f->unexpanded_var_list);
! 
! 		  /* Mark T to be removed from the original function,
! 		     otherwise it will be given a DECL_RTL when the
! 		     original function is expanded.  */
! 		  bitmap_set_bit (p->vars_to_remove, DECL_UID (t));
! 		}
  	    }
  	}
      }
*************** move_block_to_fn (struct function *dest_
*** 4658,4663 ****
--- 4671,4679 ----
    unsigned old_len, new_len;
    basic_block *addr;
  
+   /* Remove BB from dominance structures.  */
+   delete_from_dominance_info (CDI_DOMINATORS, bb);
+ 
    /* Link BB to the new linked list.  */
    move_block_after (bb, after);
  
*************** move_block_to_fn (struct function *dest_
*** 4676,4683 ****
    /* Grow DEST_CFUN's basic block array if needed.  */
    cfg = dest_cfun->cfg;
    cfg->x_n_basic_blocks++;
!   if (bb->index > cfg->x_last_basic_block)
!     cfg->x_last_basic_block = bb->index;
  
    old_len = VEC_length (basic_block, cfg->x_basic_block_info);
    if ((unsigned) cfg->x_last_basic_block >= old_len)
--- 4692,4699 ----
    /* Grow DEST_CFUN's basic block array if needed.  */
    cfg = dest_cfun->cfg;
    cfg->x_n_basic_blocks++;
!   if (bb->index >= cfg->x_last_basic_block)
!     cfg->x_last_basic_block = bb->index + 1;
  
    old_len = VEC_length (basic_block, cfg->x_basic_block_info);
    if ((unsigned) cfg->x_last_basic_block >= old_len)
*************** move_block_to_fn (struct function *dest_
*** 4689,4695 ****
      }
  
    VEC_replace (basic_block, cfg->x_basic_block_info,
!                cfg->x_last_basic_block, bb);
  
    /* The statements in BB need to be associated with a new TREE_BLOCK.
       Labels need to be associated with a new label-to-block map.  */
--- 4705,4711 ----
      }
  
    VEC_replace (basic_block, cfg->x_basic_block_info,
!                bb->index, bb);
  
    /* The statements in BB need to be associated with a new TREE_BLOCK.
       Labels need to be associated with a new label-to-block map.  */
*************** move_sese_region_to_fn (struct function 
*** 4826,4834 ****
--- 4842,4854 ----
  		        basic_block exit_bb)
  {
    VEC(basic_block,heap) *bbs;
+   basic_block *dom_bbs;
+   unsigned n_dom_bbs;
+   basic_block dom_entry = get_immediate_dominator (CDI_DOMINATORS, entry_bb);
    basic_block after, bb, *entry_pred, *exit_succ;
    struct function *saved_cfun;
    int *entry_flag, *exit_flag, eh_offset;
+   unsigned *entry_prob, *exit_prob;
    unsigned i, num_entry_edges, num_exit_edges;
    edge e;
    edge_iterator ei;
*************** move_sese_region_to_fn (struct function 
*** 4839,4845 ****
  
    /* Collect all the blocks in the region.  Manually add ENTRY_BB
       because it won't be added by dfs_enumerate_from.  */
-   calculate_dominance_info (CDI_DOMINATORS);
  
    /* If ENTRY does not strictly dominate EXIT, this cannot be an SESE
       region.  */
--- 4859,4864 ----
*************** move_sese_region_to_fn (struct function 
*** 4851,4856 ****
--- 4870,4883 ----
    VEC_safe_push (basic_block, heap, bbs, entry_bb);
    gather_blocks_in_sese_region (entry_bb, exit_bb, &bbs);
  
+   /* The blocks that used to be dominated by something in BBS will now be
+      dominated by the new block.  */
+   dom_bbs = XNEWVEC (basic_block, n_basic_blocks);
+   n_dom_bbs = get_dominated_by_region (CDI_DOMINATORS,
+ 				       VEC_address (basic_block, bbs),
+ 				       VEC_length (basic_block, bbs),
+ 				       dom_bbs);
+ 
    /* Detach ENTRY_BB and EXIT_BB from CFUN->CFG.  We need to remember
       the predecessor edges to ENTRY_BB and the successor edges to
       EXIT_BB so that we can re-attach them to the new basic block that
*************** move_sese_region_to_fn (struct function 
*** 4858,4866 ****
--- 4885,4895 ----
    num_entry_edges = EDGE_COUNT (entry_bb->preds);
    entry_pred = (basic_block *) xcalloc (num_entry_edges, sizeof (basic_block));
    entry_flag = (int *) xcalloc (num_entry_edges, sizeof (int));
+   entry_prob = XNEWVEC (unsigned, num_entry_edges);
    i = 0;
    for (ei = ei_start (entry_bb->preds); (e = ei_safe_edge (ei)) != NULL;)
      {
+       entry_prob[i] = e->probability;
        entry_flag[i] = e->flags;
        entry_pred[i++] = e->src;
        remove_edge (e);
*************** move_sese_region_to_fn (struct function 
*** 4872,4880 ****
--- 4901,4911 ----
        exit_succ = (basic_block *) xcalloc (num_exit_edges,
  					   sizeof (basic_block));
        exit_flag = (int *) xcalloc (num_exit_edges, sizeof (int));
+       exit_prob = XNEWVEC (unsigned, num_exit_edges);
        i = 0;
        for (ei = ei_start (exit_bb->succs); (e = ei_safe_edge (ei)) != NULL;)
  	{
+ 	  exit_prob[i] = e->probability;
  	  exit_flag[i] = e->flags;
  	  exit_succ[i++] = e->dest;
  	  remove_edge (e);
*************** move_sese_region_to_fn (struct function 
*** 4885,4890 ****
--- 4916,4922 ----
        num_exit_edges = 0;
        exit_succ = NULL;
        exit_flag = NULL;
+       exit_prob = NULL;
      }
  
    /* Switch context to the child function to initialize DEST_FN's CFG.  */
*************** move_sese_region_to_fn (struct function 
*** 4972,4991 ****
       create a new basic block in its place.  */
    bb = create_empty_bb (entry_pred[0]);
    for (i = 0; i < num_entry_edges; i++)
!     make_edge (entry_pred[i], bb, entry_flag[i]);
  
    for (i = 0; i < num_exit_edges; i++)
!     make_edge (bb, exit_succ[i], exit_flag[i]);
  
    if (exit_bb)
      {
        free (exit_flag);
        free (exit_succ);
      }
    free (entry_flag);
    free (entry_pred);
-   free_dominance_info (CDI_DOMINATORS);
-   free_dominance_info (CDI_POST_DOMINATORS);
    VEC_free (basic_block, heap, bbs);
  
    return bb;
--- 5004,5034 ----
       create a new basic block in its place.  */
    bb = create_empty_bb (entry_pred[0]);
    for (i = 0; i < num_entry_edges; i++)
!     {
!       e = make_edge (entry_pred[i], bb, entry_flag[i]);
!       e->probability = entry_prob[i];
!     }
  
    for (i = 0; i < num_exit_edges; i++)
!     {
!       e = make_edge (bb, exit_succ[i], exit_flag[i]);
!       e->probability = exit_prob[i];
!     }
! 
!   set_immediate_dominator (CDI_DOMINATORS, bb, dom_entry);
!   for (i = 0; i < n_dom_bbs; i++)
!     set_immediate_dominator (CDI_DOMINATORS, dom_bbs[i], bb);
!   free (dom_bbs);
  
    if (exit_bb)
      {
+       free (exit_prob);
        free (exit_flag);
        free (exit_succ);
      }
+   free (entry_prob);
    free (entry_flag);
    free (entry_pred);
    VEC_free (basic_block, heap, bbs);
  
    return bb;
Index: passes.c
===================================================================
*** passes.c	(revision 117251)
--- passes.c	(working copy)
*************** init_optimization_passes (void)
*** 606,611 ****
--- 606,612 ----
       vectorizer creates alias relations that are not supported by
       pass_may_alias.  */
    NEXT_PASS (pass_complete_unroll);
+   NEXT_PASS (pass_parallelize_loops);
    NEXT_PASS (pass_loop_prefetch);
    NEXT_PASS (pass_iv_optimize);
    NEXT_PASS (pass_tree_loop_done);



More information about the Gcc-patches mailing list