
IVOPT improvement patch


Hi, IVOPT has been one of the main areas of complaint from gcc users:
it often has to be shut off, or users are forced to write key kernel
loops in inline assembly. The following list (resulting from the
investigation of many user complaints) summarizes some of the key
problems:

1) Too many induction variables are used, and advanced addressing modes
are not fully taken advantage of. On recent Intel CPUs, the increased
loop size (due to the extra iv updates) can have a very large negative
impact on performance, e.g. when the LSD or uop macro fusion gets
blocked. The root cause of the problem is not the cost model used in
IVOPT, but the algorithm for finding the 'optimal' assignment from iv
candidates to uses (see the first sketch following this list).

2) Profile information is not used in cost estimation (e.g. in
amortizing the cost of computing loop invariants over the expected
trip count).

3) For an original IV that is replaced and is only live out of the loop
(i.e. there are no uses inside the loop), the rewrite of the IV occurs
inside the loop, which usually results in code more expensive than the
original iv update statement -- and it is very difficult for later
phases to sink the computation out of the loop (see PR31792). The right
solution is to materialize/rewrite such ivs directly outside the loop,
which also avoids introducing overlapping live ranges (see the second
sketch below).

4) The iv update statement sometimes blocks the forward
propagation/combination of the memory reference operation (which
depends on the pre-increment IV value) with the loop branch compare.
Simple-minded propagation would lead to an overlapping live range and
an additional copy/move instruction being generated.

5) In estimating the global cost (register pressure), the registers
resulting from LIM of invariant expressions are not considered.

6) In MEM_REF creation, the loop variant and invariants may be assigned
to the same part -- which is essentially a re-association that blocks
LIM.

7) Intrinsic calls that are essentially memory operations are not
recognized as uses.

The attached patch handles all the problems above except for 7.
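To make problem 3 concrete, here is a hand-written before/after sketch
(not actual compiler output) of computing the final value at the loop
exit:

  Before: the live-out iv is rewritten inside the loop:

    loop:
      i_1 = PHI <0, i_2>
      ...
      i_2 = i_1 + 1;
      if (i_2 < n_3) goto loop;
    exit:
      return i_2;

  After: the value is materialized directly at the loop exit, so no
  overlapping live range is created inside the loop:

    exit:
      i_4 = n_3;   /* final value in terms of the loop bound */
      return i_4;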


Bootstrapped and regression tested on linux/x86_64.

The patch was not tuned for SPEC, but SPEC testing was done.
Observable improvements: gcc 4.85%, vpr 1.53%, bzip2 2.36%, and eon
2.43% (CPU: Intel Xeon E5345 @ 2.33GHz, -m32 mode).

Ok for trunk?

Thanks,

David
Index: gcc/testsuite/gcc.dg/tree-ssa/ivopt_1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/ivopt_1.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_1.c	(revision 0)
@@ -0,0 +1,16 @@
+/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-options "-O2 -m64 -fdump-tree-ivopts" } */
+#define TYPE char*
+
+void foo (int i_width, TYPE dst, TYPE src1, TYPE src2)
+{
+      int x;
+       for( x = 0; x < i_width; x++ )
+       {
+           dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
+       }
+}
+
+
+/* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/ivopt_2.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/ivopt_2.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_2.c	(revision 0)
@@ -0,0 +1,16 @@
+/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-options "-O2 -m64 -fdump-tree-ivopts" } */
+
+#define TYPE char*
+
+void foo (int i_width, TYPE dst, TYPE src1, TYPE src2)
+{
+      int x;
+       for( x = 0; x < i_width; x++ )
+       {
+           *dst++ = ( *src1++ + *src2++ + 1 ) >> 1;
+       }
+}
+
+/* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/ivopt_3.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/ivopt_3.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_3.c	(revision 0)
@@ -0,0 +1,19 @@
+/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-options "-O2 -m64 -fdump-tree-ivopts" } */
+
+#define TYPE char*
+
+void foo (int i_width, char* dst, char* src1, char* src2)
+{
+      int x;
+       for( x = 0; x < i_width; x++ )
+       {
+           *((TYPE)dst) = ( *((TYPE)src1) + *((TYPE)src2) + 1 ) >> 1;
+	   dst+=sizeof(TYPE);
+	   src1+=sizeof(TYPE);
+	   src2+=sizeof(TYPE);
+       }
+} 
+
+/* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/ivopt_4.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/ivopt_4.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_4.c	(revision 0)
@@ -0,0 +1,18 @@
+/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-options "-O2 -m64 -fdump-tree-ivopts" } */
+
+#ifndef TYPE
+#define TYPE char*
+#endif
+
+void foo (int i_width, TYPE dst, TYPE src1, TYPE src2)
+{
+      TYPE dstn= dst + i_width;
+       for( ; dst < dstn; )
+       {
+           *dst++ = ( *src1++ + *src2++ + 1 ) >> 1;
+       }
+}
+
+/* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/ivopt_6.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/ivopt_6.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_6.c	(revision 0)
@@ -0,0 +1,19 @@
+/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-options "-O2 -m64 -fdump-tree-ivopts-details" } */
+#include <stdlib.h>
+int foo(const char* p, const char* p2, size_t N)
+{
+  const char* p_limit = p + N;
+  while (p  <= p_limit - 16
+        && *(long long*)p  <*(long long*)p2 )
+  {
+     p += 16;
+     p2 += 16;
+  }
+  N = p_limit - p;
+  return memcmp(p, p2, N);
+}
+
+/* { dg-final { scan-tree-dump-times "Sinking" 4 "ivopts"} } */
+/* { dg-final { scan-tree-dump-times "Reordering" 1 "ivopts"} } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/ivopt_7.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/ivopt_7.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_7.c	(revision 0)
@@ -0,0 +1,20 @@
+/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-options "-O2 -m64 -fdump-tree-ivopts-details" } */
+#include <stdlib.h>
+
+int foo(const char* p, const char* p2, size_t N)
+{
+ const char* p_limit = p + N;
+ int s = 0;
+ while (p  <= p_limit - 16
+        && *(long long*)p <*(long long*)p2)
+ {
+     p += 8;
+     p2 += 8;
+     s += (*p + *p2);
+  }
+  return s;
+}
+/* { dg-final { scan-tree-dump-times "Reordering" 1 "ivopts"} } */
+/* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/ivopt_5_sink.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/ivopt_5_sink.c	(revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_5_sink.c	(revision 0)
@@ -0,0 +1,17 @@
+/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-options "-O2  -m64 -fdump-tree-ivopts-details" } */
+int inner_longest_match(char *scan, char *match, char *strend)
+{
+  char *start_scan = scan;
+  do {
+  } while (*++scan == *++match && *++scan == *++match &&
+           *++scan == *++match && *++scan == *++match &&
+           *++scan == *++match && *++scan == *++match &&
+           *++scan == *++match && *++scan == *++match &&
+           scan < strend);
+
+  return scan - start_scan;
+}
+
+/* { dg-final { scan-tree-dump-times "Sinking" 7 "ivopts"} } */
+/* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc/tree-ssa-loop-ivopts.c	(revision 159243)
+++ gcc/tree-ssa-loop-ivopts.c	(working copy)
@@ -91,14 +91,29 @@ along with GCC; see the file COPYING3.  
 #include "langhooks.h"
 #include "tree-affine.h"
 #include "target.h"
+#include "tree-inline.h"
 
 /* The infinite cost.  */
 #define INFTY 10000000
 
-/* The expected number of loop iterations.  TODO -- use profiling instead of
-   this.  */
 #define AVG_LOOP_NITER(LOOP) 5
 
+/* Returns the expected number of loop iterations for LOOP.
+   The average trip count is computed from profile data if it
+   exists. */
+
+static inline unsigned
+avg_loop_niter (struct loop *loop)
+{
+  unsigned tc;
+  if (loop->header->count || loop->latch->count)
+    tc = expected_loop_iterations (loop);
+  else
+    tc = AVG_LOOP_NITER (loop);
+  if (tc == 0)
+    tc++;
+  return tc;
+}
 
 /* Representation of the induction variable.  */
 struct iv
@@ -156,6 +171,14 @@ struct cost_pair
 			   the new bound to compare with.  */
 };
 
+/* The use position for iv.  */
+enum iv_use_pos
+{
+  IU_UNKNOWN,
+  IU_OUTSIDE_LOOP_ONLY,
+  IU_INSIDE_LOOP
+};
+
 /* Use.  */
 struct iv_use
 {
@@ -173,6 +196,8 @@ struct iv_use
 
   struct iv_cand *selected;
 			/* The selected candidate.  */
+  enum iv_use_pos use_pos;
+                        /* The use position.  */
 };
 
 /* The position where the iv is computed.  */
@@ -218,6 +243,11 @@ typedef struct iv_cand *iv_cand_p;
 DEF_VEC_P(iv_cand_p);
 DEF_VEC_ALLOC_P(iv_cand_p,heap);
 
+typedef struct version_info *version_info_p;
+DEF_VEC_P(version_info_p);
+DEF_VEC_ALLOC_P(version_info_p,heap);
+
+
 struct ivopts_data
 {
   /* The currently optimized loop.  */
@@ -235,6 +265,10 @@ struct ivopts_data
   /* The array of information for the ssa names.  */
   struct version_info *version_info;
 
+
+  /* Pseudo version infos for generated loop invariants.  */
+  VEC(version_info_p,heap) *pseudo_version_info;
+
   /* The bitmap of indices in version_info whose value was changed.  */
   bitmap relevant;
 
@@ -250,6 +284,9 @@ struct ivopts_data
   /* The maximum invariant id.  */
   unsigned max_inv_id;
 
+  /* The minimal invariant id for pseudo invariants.  */
+  unsigned min_pseudo_inv_id;
+
   /* Whether to consider just related and important candidates when replacing a
      use.  */
   bool consider_all_candidates;
@@ -283,6 +320,9 @@ struct iv_ca
   /* Total number of registers needed.  */
   unsigned n_regs;
 
+  /* Total number of pseudo invariants.  */
+  unsigned n_pseudos;
+
   /* Total cost of expressing uses.  */
   comp_cost cand_use_cost;
 
@@ -335,6 +375,8 @@ struct iv_ca_delta
 
 static VEC(tree,heap) *decl_rtl_to_reset;
 
+static struct pointer_map_t *inverted_stmt_map;
+
 /* Number of uses recorded in DATA.  */
 
 static inline unsigned
@@ -513,6 +555,19 @@ dump_cand (FILE *file, struct iv_cand *c
       return;
     }
 
+  if (cand->var_before)
+    {
+      fprintf (file, "  var_before ");
+      print_generic_expr (file, cand->var_before, TDF_SLIM);
+      fprintf (file, "\n");
+    }
+  if (cand->var_after)
+    {
+      fprintf (file, "  var_after ");
+      print_generic_expr (file, cand->var_after, TDF_SLIM);
+      fprintf (file, "\n");
+    }
+
   switch (cand->pos)
     {
     case IP_NORMAL:
@@ -544,7 +599,11 @@ dump_cand (FILE *file, struct iv_cand *c
 static inline struct version_info *
 ver_info (struct ivopts_data *data, unsigned ver)
 {
-  return data->version_info + ver;
+  if (ver < data->min_pseudo_inv_id)
+    return data->version_info + ver;
+  else
+    return VEC_index (version_info_p, data->pseudo_version_info,
+                      ver - data->min_pseudo_inv_id);
 }
 
 /* Returns the info for ssa name NAME.  */
@@ -766,6 +825,8 @@ tree_ssa_iv_optimize_init (struct ivopts
 {
   data->version_info_size = 2 * num_ssa_names;
   data->version_info = XCNEWVEC (struct version_info, data->version_info_size);
+  data->min_pseudo_inv_id = num_ssa_names;
+  data->pseudo_version_info = NULL;
   data->relevant = BITMAP_ALLOC (NULL);
   data->important_candidates = BITMAP_ALLOC (NULL);
   data->max_inv_id = 0;
@@ -1102,6 +1163,7 @@ record_use (struct ivopts_data *data, tr
   use->stmt = stmt;
   use->op_p = use_p;
   use->related_cands = BITMAP_ALLOC (NULL);
+  use->use_pos = IU_UNKNOWN;
 
   /* To avoid showing ssa name in the dumps, if it was not reset by the
      caller.  */
@@ -1142,10 +1204,28 @@ record_invariant (struct ivopts_data *da
   bitmap_set_bit (data->relevant, SSA_NAME_VERSION (op));
 }
 
-/* Checks whether the use OP is interesting and if so, records it.  */
+/* Records a pseudo invariant and returns its VERSION_INFO.  */
+
+static struct version_info *
+record_pseudo_invariant (struct ivopts_data *data)
+{
+  struct version_info *info;
+
+  info = XCNEW (struct version_info);
+  info->name = NULL;
+  VEC_safe_push (version_info_p, heap, data->pseudo_version_info, info);
+  info->inv_id
+      = VEC_length (version_info_p, data->pseudo_version_info) - 1
+      + data->min_pseudo_inv_id;
+
+  return info;
+}
+
+/* Checks whether the use OP is interesting and if so, records it.
+   USE_POS indicates where the use comes from.  */
 
 static struct iv_use *
-find_interesting_uses_op (struct ivopts_data *data, tree op)
+find_interesting_uses_op (struct ivopts_data *data, tree op, enum iv_use_pos use_pos)
 {
   struct iv *iv;
   struct iv *civ;
@@ -1164,6 +1244,10 @@ find_interesting_uses_op (struct ivopts_
       use = iv_use (data, iv->use_id);
 
       gcc_assert (use->type == USE_NONLINEAR_EXPR);
+      gcc_assert (use->use_pos != IU_UNKNOWN);
+
+      if (use->use_pos == IU_OUTSIDE_LOOP_ONLY)
+        use->use_pos = use_pos;
       return use;
     }
 
@@ -1183,6 +1267,7 @@ find_interesting_uses_op (struct ivopts_
 
   use = record_use (data, NULL, civ, stmt, USE_NONLINEAR_EXPR);
   iv->use_id = use->id;
+  use->use_pos = use_pos;
 
   return use;
 }
@@ -1260,17 +1345,19 @@ find_interesting_uses_cond (struct ivopt
 {
   tree *var_p, *bound_p;
   struct iv *var_iv, *civ;
+  struct iv_use *use;
 
   if (!extract_cond_operands (data, stmt, &var_p, &bound_p, &var_iv, NULL))
     {
-      find_interesting_uses_op (data, *var_p);
-      find_interesting_uses_op (data, *bound_p);
+      find_interesting_uses_op (data, *var_p, IU_INSIDE_LOOP);
+      find_interesting_uses_op (data, *bound_p, IU_INSIDE_LOOP);
       return;
     }
 
   civ = XNEW (struct iv);
   *civ = *var_iv;
-  record_use (data, NULL, civ, stmt, USE_COMPARE);
+  use = record_use (data, NULL, civ, stmt, USE_COMPARE);
+  use->use_pos = IU_INSIDE_LOOP;
 }
 
 /* Returns true if expression EXPR is obviously invariant in LOOP,
@@ -1433,11 +1520,13 @@ idx_record_use (tree base, tree *idx,
 		void *vdata)
 {
   struct ivopts_data *data = (struct ivopts_data *) vdata;
-  find_interesting_uses_op (data, *idx);
+  find_interesting_uses_op (data, *idx, IU_INSIDE_LOOP);
   if (TREE_CODE (base) == ARRAY_REF || TREE_CODE (base) == ARRAY_RANGE_REF)
     {
-      find_interesting_uses_op (data, array_ref_element_size (base));
-      find_interesting_uses_op (data, array_ref_low_bound (base));
+      find_interesting_uses_op (data, array_ref_element_size (base),
+                                IU_INSIDE_LOOP);
+      find_interesting_uses_op (data, array_ref_low_bound (base),
+                                IU_INSIDE_LOOP);
     }
   return true;
 }
@@ -1597,12 +1686,13 @@ may_be_nonaddressable_p (tree expr)
 
 /* Finds addresses in *OP_P inside STMT.  */
 
-static void
+static bool
 find_interesting_uses_address (struct ivopts_data *data, gimple stmt, tree *op_p)
 {
   tree base = *op_p, step = build_int_cst (sizetype, 0);
   struct iv *civ;
   struct ifs_ivopts_data ifs_ivopts_data;
+  struct iv_use *use;
 
   /* Do not play with volatile memory references.  A bit too conservative,
      perhaps, but safe.  */
@@ -1696,11 +1786,13 @@ find_interesting_uses_address (struct iv
     }
 
   civ = alloc_iv (base, step);
-  record_use (data, op_p, civ, stmt, USE_ADDRESS);
-  return;
+  use = record_use (data, op_p, civ, stmt, USE_ADDRESS);
+  use->use_pos = IU_INSIDE_LOOP;
+  return true;
 
 fail:
   for_each_index (op_p, idx_record_use, data);
+  return false;
 }
 
 /* Finds and records invariants used in STMT.  */
@@ -1762,7 +1854,7 @@ find_interesting_uses_stmt (struct ivopt
 	  if (REFERENCE_CLASS_P (*rhs))
 	    find_interesting_uses_address (data, stmt, rhs);
 	  else
-	    find_interesting_uses_op (data, *rhs);
+	    find_interesting_uses_op (data, *rhs, IU_INSIDE_LOOP);
 
 	  if (REFERENCE_CLASS_P (*lhs))
 	    find_interesting_uses_address (data, stmt, lhs);
@@ -1803,7 +1895,7 @@ find_interesting_uses_stmt (struct ivopt
       if (!iv)
 	continue;
 
-      find_interesting_uses_op (data, op);
+      find_interesting_uses_op (data, op, IU_INSIDE_LOOP);
     }
 }
 
@@ -1822,7 +1914,12 @@ find_interesting_uses_outside (struct iv
       phi = gsi_stmt (psi);
       def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
       if (is_gimple_reg (def))
-	find_interesting_uses_op (data, def);
+        {
+          if (gimple_phi_num_args (phi) == 1)
+            find_interesting_uses_op (data, def, IU_OUTSIDE_LOOP_ONLY);
+	  else
+            find_interesting_uses_op (data, def, IU_INSIDE_LOOP);
+	}
     }
 }
 
@@ -2138,7 +2235,9 @@ add_candidate_1 (struct ivopts_data *dat
 	continue;
 
       if (operand_equal_p (base, cand->iv->base, 0)
-	  && operand_equal_p (step, cand->iv->step, 0))
+	  && operand_equal_p (step, cand->iv->step, 0)
+          && (TYPE_PRECISION (TREE_TYPE (base))
+              == TYPE_PRECISION (TREE_TYPE (cand->iv->base))))
 	break;
     }
 
@@ -3684,6 +3783,94 @@ difference_cost (struct ivopts_data *dat
   return force_var_cost (data, aff_combination_to_tree (&aff_e1), depends_on);
 }
 
+/* Returns true if AFF1 and AFF2 are identical.  */
+
+static bool
+compare_aff_trees (aff_tree *aff1, aff_tree *aff2)
+{
+  unsigned i;
+
+  if (aff1->n != aff2->n)
+    return false;
+
+  for (i = 0; i < aff1->n; i++)
+    {
+      if (double_int_cmp (aff1->elts[i].coef, aff2->elts[i].coef, 0) != 0)
+        return false;
+
+      if (!operand_equal_p (aff1->elts[i].val, aff2->elts[i].val, 0))
+        return false;
+    }
+  return true;
+}
+
+/* Returns true if expression UBASE - RATIO * CBASE requires a new compiler
+   generated temporary.  */
+
+static bool
+create_loop_invariant_temp (tree ubase, tree cbase, HOST_WIDE_INT ratio)
+{
+  aff_tree ubase_aff, cbase_aff;
+
+  STRIP_NOPS (ubase);
+  STRIP_NOPS (cbase);
+
+  if ((TREE_CODE (ubase) == INTEGER_CST)
+      && (TREE_CODE (cbase) == INTEGER_CST))
+    return false;
+
+  if (((TREE_CODE (ubase) == SSA_NAME)
+       || (TREE_CODE (ubase) == ADDR_EXPR))
+      && (TREE_CODE (cbase) == INTEGER_CST))
+    return false;
+
+  if (((TREE_CODE (cbase) == SSA_NAME)
+       || (TREE_CODE (cbase) == ADDR_EXPR))
+      && (TREE_CODE (ubase) == INTEGER_CST))
+    return false;
+
+  if (ratio == 1)
+    {
+      if (operand_equal_p (ubase, cbase, 0))
+        return false;
+      if (TREE_CODE (ubase) == ADDR_EXPR
+        && TREE_CODE (cbase) == ADDR_EXPR)
+        {
+          tree usym, csym;
+
+          usym = TREE_OPERAND (ubase, 0);
+          csym = TREE_OPERAND (cbase, 0);
+          if (TREE_CODE (usym) == ARRAY_REF)
+            {
+              tree ind = TREE_OPERAND (usym, 1);
+              if (TREE_CODE (ind) == INTEGER_CST
+                  && host_integerp (ind, 0)
+                  && TREE_INT_CST_LOW (ind) == 0)
+                usym = TREE_OPERAND (usym, 0);
+            }
+          if (TREE_CODE (csym) == ARRAY_REF)
+            {
+              tree ind = TREE_OPERAND (csym, 1);
+              if (TREE_CODE (ind) == INTEGER_CST
+                  && host_integerp (ind, 0)
+                  && TREE_INT_CST_LOW (ind) == 0)
+                csym = TREE_OPERAND (csym, 0);
+            }
+          if (usym == csym)
+            return false;
+        }
+      /* Now do a more complex comparison.  */
+      tree_to_aff_combination (ubase, TREE_TYPE (ubase), &ubase_aff);
+      tree_to_aff_combination (cbase, TREE_TYPE (cbase), &cbase_aff);
+      if (compare_aff_trees (&ubase_aff, &cbase_aff))
+        return false;
+    }
+
+  return true;
+}
+
+
+
 /* Determines the cost of the computation by that USE is expressed
    from induction variable CAND.  If ADDRESS_P is true, we just need
    to create an address from it, otherwise we want to get it into
@@ -3811,6 +3998,17 @@ get_computation_cost_at (struct ivopts_d
 					 &offset, depends_on));
     }
 
+  /* Loop invariant computation.  */
+  cost.cost /= avg_loop_niter (data->current_loop);
+
+  if (create_loop_invariant_temp (ubase, cbase, ratio))
+    {
+      struct version_info *pv = record_pseudo_invariant (data);
+       if (!*depends_on)
+         *depends_on = BITMAP_ALLOC (NULL);
+       bitmap_set_bit (*depends_on, pv->inv_id);
+    }
+
   /* If we are after the increment, the value of the candidate is higher by
      one iteration.  */
   stmt_is_after_inc = stmt_after_increment (data->current_loop, cand, at);
@@ -3841,7 +4039,7 @@ get_computation_cost_at (struct ivopts_d
       are added once to the variable, if present.  */
   if (var_present && (symbol_present || offset))
     cost.cost += add_cost (TYPE_MODE (ctype), speed)
-		 / AVG_LOOP_NITER (data->current_loop);
+		 / avg_loop_niter (data->current_loop);
 
   /* Having offset does not affect runtime cost in case it is added to
      symbol, but it increases complexity.  */
@@ -3911,6 +4109,10 @@ determine_use_iv_cost_generic (struct iv
     }
 
   cost = get_computation_cost (data, use, cand, false, &depends_on, NULL);
+
+  if (use->use_pos == IU_OUTSIDE_LOOP_ONLY && !infinite_cost_p (cost))
+    cost.cost /= avg_loop_niter (data->current_loop);
+
   set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE);
 
   return !infinite_cost_p (cost);
@@ -4056,20 +4258,16 @@ may_eliminate_iv (struct ivopts_data *da
   /* If not, and if this is the only possible exit of the loop, see whether
      we can get a conservative estimate on the number of iterations of the
      entire loop and compare against that instead.  */
-  else if (loop_only_exit_p (loop, exit))
+  else
     {
       double_int period_value, max_niter;
       if (!estimated_loop_iterations (loop, true, &max_niter))
 	return false;
       period_value = tree_to_double_int (period);
-      if (double_int_ucmp (max_niter, period_value) >= 0)
+      if (double_int_ucmp (max_niter, period_value) > 0)
 	return false;
     }
 
-  /* Otherwise, punt.  */
-  else
-    return false;
-
   cand_value_at (loop, cand, use->stmt, nit, &bnd);
 
   *bound = aff_combination_to_tree (&bnd);
@@ -4106,7 +4304,7 @@ determine_use_iv_cost_condition (struct 
       elim_cost = force_var_cost (data, bound, &depends_on_elim);
       /* The bound is a loop invariant, so it will be only computed
 	 once.  */
-      elim_cost.cost /= AVG_LOOP_NITER (data->current_loop);
+      elim_cost.cost /= avg_loop_niter (data->current_loop);
     }
   else
     elim_cost = infinite_cost;
@@ -4353,7 +4551,7 @@ determine_iv_cost (struct ivopts_data *d
   cost_base = force_var_cost (data, base, NULL);
   cost_step = add_cost (TYPE_MODE (TREE_TYPE (base)), data->speed);
 
-  cost = cost_step + cost_base.cost / AVG_LOOP_NITER (current_loop);
+  cost = cost_step + cost_base.cost / avg_loop_niter (data->current_loop);
 
   /* Prefer the original ivs unless we may gain something by replacing it.
      The reason is to make debugging simpler; so this is not relevant for
@@ -4514,6 +4712,12 @@ cheaper_cost_pair (struct cost_pair *a, 
   return false;
 }
 
+
+/* Pseudo invariants may get commoned, and there is no simple way
+   to estimate that. Simply weight it down.  */
+
+#define PSEUDO_COMMON_FACTOR 0.3
+
 /* Computes the cost field of IVS structure.  */
 
 static void
@@ -4521,7 +4725,10 @@ iv_ca_recount_cost (struct ivopts_data *
 {
   comp_cost cost = ivs->cand_use_cost;
   cost.cost += ivs->cand_cost;
-  cost.cost += ivopts_global_cost_for_size (data, ivs->n_regs);
+  cost.cost += ivopts_global_cost_for_size (data,
+                                            ivs->n_regs
+                                            + ivs->n_pseudos
+                                            * PSEUDO_COMMON_FACTOR);
 
   ivs->cost = cost;
 }
@@ -4529,10 +4736,12 @@ iv_ca_recount_cost (struct ivopts_data *
 /* Remove invariants in set INVS to set IVS.  */
 
 static void
-iv_ca_set_remove_invariants (struct iv_ca *ivs, bitmap invs)
+iv_ca_set_remove_invariants (struct ivopts_data *data,
+                             struct iv_ca *ivs, bitmap invs)
 {
   bitmap_iterator bi;
   unsigned iid;
+  unsigned pseudo_id_start = data->min_pseudo_inv_id;
 
   if (!invs)
     return;
@@ -4541,7 +4750,12 @@ iv_ca_set_remove_invariants (struct iv_c
     {
       ivs->n_invariant_uses[iid]--;
       if (ivs->n_invariant_uses[iid] == 0)
-	ivs->n_regs--;
+        {
+          if (iid < pseudo_id_start)
+            ivs->n_regs--;
+          else
+            ivs->n_pseudos--;
+        }
     }
 }
 
@@ -4572,22 +4786,24 @@ iv_ca_set_no_cp (struct ivopts_data *dat
       ivs->n_cands--;
       ivs->cand_cost -= cp->cand->cost;
 
-      iv_ca_set_remove_invariants (ivs, cp->cand->depends_on);
+      iv_ca_set_remove_invariants (data, ivs, cp->cand->depends_on);
     }
 
   ivs->cand_use_cost = sub_costs (ivs->cand_use_cost, cp->cost);
 
-  iv_ca_set_remove_invariants (ivs, cp->depends_on);
+  iv_ca_set_remove_invariants (data, ivs, cp->depends_on);
   iv_ca_recount_cost (data, ivs);
 }
 
 /* Add invariants in set INVS to set IVS.  */
 
 static void
-iv_ca_set_add_invariants (struct iv_ca *ivs, bitmap invs)
+iv_ca_set_add_invariants (struct ivopts_data *data,
+                          struct iv_ca *ivs, bitmap invs)
 {
   bitmap_iterator bi;
   unsigned iid;
+  unsigned pseudo_id_start = data->min_pseudo_inv_id;
 
   if (!invs)
     return;
@@ -4596,7 +4812,12 @@ iv_ca_set_add_invariants (struct iv_ca *
     {
       ivs->n_invariant_uses[iid]++;
       if (ivs->n_invariant_uses[iid] == 1)
-	ivs->n_regs++;
+        {
+          if (iid < pseudo_id_start)
+            ivs->n_regs++;
+          else
+            ivs->n_pseudos++;
+        }
     }
 }
 
@@ -4630,11 +4851,11 @@ iv_ca_set_cp (struct ivopts_data *data, 
 	  ivs->n_cands++;
 	  ivs->cand_cost += cp->cand->cost;
 
-	  iv_ca_set_add_invariants (ivs, cp->cand->depends_on);
+	  iv_ca_set_add_invariants (data, ivs, cp->cand->depends_on);
 	}
 
       ivs->cand_use_cost = add_costs (ivs->cand_use_cost, cp->cost);
-      iv_ca_set_add_invariants (ivs, cp->depends_on);
+      iv_ca_set_add_invariants (data, ivs, cp->depends_on);
       iv_ca_recount_cost (data, ivs);
     }
 }
@@ -4841,9 +5062,13 @@ iv_ca_new (struct ivopts_data *data)
   nw->cands = BITMAP_ALLOC (NULL);
   nw->n_cands = 0;
   nw->n_regs = 0;
+  nw->n_pseudos = 0;
   nw->cand_use_cost = zero_cost;
   nw->cand_cost = 0;
-  nw->n_invariant_uses = XCNEWVEC (unsigned, data->max_inv_id + 1);
+  nw->n_invariant_uses = XCNEWVEC (unsigned,
+                                   data->min_pseudo_inv_id
+                                   + VEC_length (version_info_p,
+                                                 data->pseudo_version_info));
   nw->cost = zero_cost;
 
   return nw;
@@ -4871,8 +5096,21 @@ iv_ca_dump (struct ivopts_data *data, FI
   unsigned i;
   comp_cost cost = iv_ca_cost (ivs);
 
-  fprintf (file, "  cost %d (complexity %d)\n", cost.cost, cost.complexity);
-  bitmap_print (file, ivs->cands, "  candidates ","\n");
+  fprintf (file, "  cost: %d (complexity %d)\n", cost.cost, cost.complexity);
+  fprintf (file, "  cand_cost: %d\n  cand_use_cost: %d (complexity %d)\n",
+           ivs->cand_cost, ivs->cand_use_cost.cost, ivs->cand_use_cost.complexity);
+  bitmap_print (file, ivs->cands, "  candidates: ","\n");
+
+   for (i = 0; i < ivs->upto; i++)
+    {
+      struct iv_use *use = iv_use (data, i);
+      struct cost_pair *cp = iv_ca_cand_for_use (ivs, use);
+      if (cp)
+        fprintf (file, "   use:%d --> iv_cand:%d, cost=(%d,%d)\n",
+                 use->id, cp->cand->id, cp->cost.cost, cp->cost.complexity);
+      else
+        fprintf (file, "   use:%d --> ??\n", use->id);
+    }
 
   for (i = 1; i <= data->max_inv_id; i++)
     if (ivs->n_invariant_uses[i])
@@ -4880,7 +5118,9 @@ iv_ca_dump (struct ivopts_data *data, FI
 	fprintf (file, "%s%d", pref, i);
 	pref = ", ";
       }
-  fprintf (file, "\n");
+  fprintf (file, "\n\n");
+  fprintf (file, "nregs: %d\nnpseudos: %d\n\n",
+           ivs->n_regs, ivs->n_pseudos);
 }
 
 /* Try changing candidate in IVS to CAND for each use.  Return cost of the
@@ -4890,7 +5130,7 @@ iv_ca_dump (struct ivopts_data *data, FI
 static comp_cost
 iv_ca_extend (struct ivopts_data *data, struct iv_ca *ivs,
 	      struct iv_cand *cand, struct iv_ca_delta **delta,
-	      unsigned *n_ivs)
+	      unsigned *n_ivs, bool min_ncand)
 {
   unsigned i;
   comp_cost cost;
@@ -4914,8 +5154,8 @@ iv_ca_extend (struct ivopts_data *data, 
       if (!iv_ca_has_deps (ivs, new_cp))
 	continue;
 
-      if (!cheaper_cost_pair (new_cp, old_cp))
-	continue;
+      if (!min_ncand && !cheaper_cost_pair (new_cp, old_cp))
+        continue;
 
       *delta = iv_ca_delta_add (use, old_cp, new_cp, *delta);
     }
@@ -5110,7 +5350,8 @@ try_add_cand_for (struct ivopts_data *da
 	continue;
 
       iv_ca_set_cp (data, ivs, use, cp);
-      act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL);
+      act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL,
+                               true);
       iv_ca_set_no_cp (data, ivs, use);
       act_delta = iv_ca_delta_add (use, NULL, cp, act_delta);
 
@@ -5143,7 +5384,7 @@ try_add_cand_for (struct ivopts_data *da
 
 	  act_delta = NULL;
 	  iv_ca_set_cp (data, ivs, use, cp);
-	  act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL);
+	  act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL, true);
 	  iv_ca_set_no_cp (data, ivs, use);
 	  act_delta = iv_ca_delta_add (use, iv_ca_cand_for_use (ivs, use),
 				       cp, act_delta);
@@ -5203,7 +5444,7 @@ try_improve_iv_set (struct ivopts_data *
       if (iv_ca_cand_used_p (ivs, cand))
 	continue;
 
-      acost = iv_ca_extend (data, ivs, cand, &act_delta, &n_ivs);
+      acost = iv_ca_extend (data, ivs, cand, &act_delta, &n_ivs, false);
       if (!act_delta)
 	continue;
 
@@ -5293,6 +5534,251 @@ find_optimal_iv_set (struct ivopts_data 
   return set;
 }
 
+/* Returns a statement that undoes the operation in INCREMENT
+   on value OLD_VAL.  */
+
+static gimple
+get_inverted_increment_1 (gimple increment, tree old_val)
+{
+  tree new_assign_def;
+  gimple inverted_increment;
+  enum tree_code incr_op;
+  tree step;
+
+  new_assign_def = make_ssa_name (SSA_NAME_VAR (old_val), NULL);
+  step = unshare_expr (gimple_assign_rhs2 (increment));
+  incr_op = gimple_assign_rhs_code (increment);
+  if (incr_op == PLUS_EXPR)
+    incr_op = MINUS_EXPR;
+  else
+    {
+      gcc_assert (incr_op == MINUS_EXPR);
+      incr_op = PLUS_EXPR;
+    }
+  inverted_increment
+      = gimple_build_assign_with_ops (incr_op, new_assign_def,
+                                      old_val, step);
+
+  return inverted_increment;
+}
+
+/* Returns a statement that undoes the operation in INCREMENT
+   on the result of phi NEW_PHI.  */
+
+static gimple
+get_inverted_increment (gimple reaching_increment, gimple new_phi)
+{
+  basic_block bb;
+  gimple_stmt_iterator gsi;
+  gimple inverted_increment;
+  tree phi_result;
+  void **slot;
+
+  gcc_assert (gimple_assign_lhs (reaching_increment)
+              == PHI_ARG_DEF (new_phi, 0));
+
+  if (!inverted_stmt_map)
+    inverted_stmt_map = pointer_map_create ();
+
+  slot = pointer_map_insert (inverted_stmt_map, new_phi);
+  if (*slot)
+    return (gimple) *slot;
+
+  phi_result = PHI_RESULT (new_phi);
+  bb = gimple_bb (new_phi);
+  gsi = gsi_after_labels (bb);
+
+  inverted_increment = get_inverted_increment_1 (reaching_increment,
+                                                 phi_result);
+  gsi_insert_before (&gsi, inverted_increment, GSI_NEW_STMT);
+  *slot = (void *) inverted_increment;
+  return inverted_increment;
+}
+
+/* Performs a peephole optimization to reorder the iv update statement with
+   a mem ref to enable instruction combining in later phases. The mem ref uses
+   the iv value before the update, so the reordering transformation requires
+   adjustment of the offset. CAND is the selected IV_CAND.
+
+   Example:
+
+   t = MEM_REF (base, iv1, 8, 16);  // base, index, stride, offset
+   iv2 = iv1 + 1;
+
+   if (t < val)      (1)
+     goto L;
+   goto Head;
+
+
+   Directly propagating t over to (1) will introduce an overlapping live
+   range, thus increasing register pressure. This peephole transforms it into:
+
+
+   iv2 = iv1 + 1;
+   t = MEM_REF (base, iv2, 8, 8);
+   if (t < val)
+     goto L;
+   goto Head;
+*/
+
+static void
+adjust_iv_update_pos (struct ivopts_data *data ATTRIBUTE_UNUSED, struct iv_cand *cand)
+{
+  tree var_after, step, stride, index, offset_adjust, offset, mem_ref_op;
+  gimple iv_update, stmt, cond, mem_ref, index_to_base, use_stmt;
+  basic_block bb;
+  gimple_stmt_iterator gsi, gsi_iv;
+  use_operand_p use_p;
+  enum tree_code incr_op;
+  imm_use_iterator iter;
+  bool found = false;
+
+  var_after = cand->var_after;
+  iv_update = SSA_NAME_DEF_STMT (var_after);
+
+  /* Do not handle complicated iv update case.  */
+  incr_op = gimple_assign_rhs_code (iv_update);
+  if (incr_op != PLUS_EXPR && incr_op != MINUS_EXPR)
+    return;
+
+  step = gimple_assign_rhs2 (iv_update);
+  if (!CONSTANT_CLASS_P (step))
+    return;
+
+  bb = gimple_bb (iv_update);
+  gsi = gsi_last_nondebug_bb (bb);
+  stmt = gsi_stmt (gsi);
+
+  /* Only handle conditional statement for now.  */
+  if (gimple_code (stmt) != GIMPLE_COND)
+    return;
+
+  cond = stmt;
+
+  gsi_prev_nondebug (&gsi);
+  stmt = gsi_stmt (gsi);
+  if (stmt != iv_update)
+    return;
+
+  gsi_prev_nondebug (&gsi);
+  if (gsi_end_p (gsi))
+    return;
+
+  stmt = gsi_stmt (gsi);
+  if (gimple_code (stmt) != GIMPLE_ASSIGN)
+    return;
+
+  if (gimple_assign_rhs_code (stmt) != TARGET_MEM_REF)
+    return;
+
+  mem_ref = stmt;
+  mem_ref_op = gimple_assign_rhs1 (mem_ref);
+
+  if (TREE_CODE (gimple_assign_lhs (mem_ref)) != SSA_NAME)
+    return;
+
+  if (!single_imm_use (gimple_assign_lhs (mem_ref), &use_p, &use_stmt))
+    return;
+
+  if (use_stmt != cond)
+    return;
+
+  /* Found code motion candidate -- the statement with mem_ref.  */
+
+  index = TMR_INDEX (mem_ref_op);
+  index_to_base = NULL;
+  if (index)
+    {
+      if (index != cand->var_before)
+        return;
+    }
+  else
+    {
+      /* Index used as base.  */
+      tree base = TMR_BASE (mem_ref_op);
+
+      if (TREE_CODE (base) != SSA_NAME)
+        return;
+
+      if (!has_single_use (base))
+        return;
+
+      index_to_base = SSA_NAME_DEF_STMT (base);
+      if (gimple_code (index_to_base) != GIMPLE_ASSIGN)
+        return;
+      if (gimple_assign_rhs_code (index_to_base) != NOP_EXPR)
+        return;
+      if (gimple_assign_rhs1 (index_to_base) != cand->var_before)
+        return;
+    }
+
+  stride = TMR_STEP (mem_ref_op);
+  offset = TMR_OFFSET (mem_ref_op);
+  if (stride && index)
+    offset_adjust = int_const_binop (MULT_EXPR, stride, step, 0);
+  else
+    offset_adjust = step;
+
+  if (offset_adjust == NULL)
+    return;
+
+  offset = int_const_binop ((incr_op == PLUS_EXPR
+                             ? MINUS_EXPR : PLUS_EXPR),
+                            (offset ? offset : size_zero_node),
+                            offset_adjust, 0);
+
+  if (offset == NULL)
+    return;
+
+  if (index_to_base)
+    gsi = gsi_for_stmt (index_to_base);
+  else
+    gsi = gsi_for_stmt (mem_ref);
+  gsi_iv = gsi_for_stmt (iv_update);
+  gsi_move_before (&gsi_iv, &gsi);
+
+  /* Now fix up the mem_ref.  */
+  FOR_EACH_IMM_USE_FAST (use_p, iter, cand->var_before)
+    {
+      if (USE_STMT (use_p) == mem_ref || USE_STMT (use_p) == index_to_base)
+        {
+          set_ssa_use_from_ptr (use_p, var_after);
+          if (index_to_base)
+            *gimple_assign_rhs1_ptr (index_to_base) = var_after;
+          else
+            TMR_INDEX (mem_ref_op) = var_after;
+
+          found = true;
+          break;
+        }
+    }
+  gcc_assert (found);
+  TMR_OFFSET (mem_ref_op) = offset;
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Reordering \n");
+      print_gimple_stmt (dump_file, iv_update, 0, 0);
+      print_gimple_stmt (dump_file, mem_ref, 0, 0);
+      fprintf (dump_file, "\n");
+    }
+}
+
+/* Performs the reordering peephole optimization for all selected ivs in SET.  */
+
+static void
+adjust_update_pos_for_ivs (struct ivopts_data *data, struct iv_ca *set)
+{
+  unsigned i;
+  struct iv_cand *cand;
+  bitmap_iterator bi;
+
+  EXECUTE_IF_SET_IN_BITMAP (set->cands, 0, i, bi)
+    {
+      cand = iv_cand (data, i);
+      adjust_iv_update_pos (data, cand);
+    }
+}
+
 /* Creates a new induction variable corresponding to CAND.  */
 
 static void
@@ -5329,8 +5815,8 @@ create_new_iv (struct ivopts_data *data,
       name_info (data, cand->var_after)->preserve_biv = true;
 
       /* Rewrite the increment so that it uses var_before directly.  */
-      find_interesting_uses_op (data, cand->var_after)->selected = cand;
-
+      find_interesting_uses_op (data, cand->var_after,
+                                IU_INSIDE_LOOP)->selected = cand;
       return;
     }
 
@@ -5358,9 +5844,512 @@ create_new_ivs (struct ivopts_data *data
       cand = iv_cand (data, i);
       create_new_iv (data, cand);
     }
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "\nSelected IV set: \n");
+      EXECUTE_IF_SET_IN_BITMAP (set->cands, 0, i, bi)
+        {
+          cand = iv_cand (data, i);
+          dump_cand (dump_file, cand);
+        }
+      fprintf (dump_file, "\n");
+    }
+}
+
+/* Callback function in the tree walk to fix up old live out
+   names to loop exit phi's result.  */
+
+static tree
+fixup_use (tree *op,
+           int *unused ATTRIBUTE_UNUSED,
+           void *data)
+{
+  struct pointer_map_t *nm_to_def_map
+      = (struct pointer_map_t *) data;
+
+  if (TREE_CODE (*op) == SSA_NAME && is_gimple_reg (*op))
+    {
+      void **slot;
+      slot = pointer_map_contains (nm_to_def_map, *op);
+      if (slot)
+        {
+          enum gimple_code gc;
+          gimple def = (gimple) (*slot);
+          gc = gimple_code (def);
+          if (gc == GIMPLE_PHI)
+            *op = PHI_RESULT (def);
+          else
+            *op = gimple_assign_lhs (def);
+        }
+    }
+
+  return 0;
+}
+
+/* Callback function in the tree walk to collect used ssa names
+   in the tree.  */
+
+static tree
+collect_ssa_names (tree *op,
+                   int *unused ATTRIBUTE_UNUSED,
+                   void *data)
+{
+  VEC(tree, heap) ** used_names = (VEC(tree, heap) **) data;
+  if (TREE_CODE (*op) == SSA_NAME && is_gimple_reg (*op))
+    VEC_safe_push (tree, heap, *used_names, *op);
+
+  return 0;
+}
+
+/* The function fixes up live out ssa names used in tree *VAL to
+   the matching loop exit phi's results. */
+
+static void
+fixup_iv_out_val (tree *val, struct pointer_map_t *nm_to_phi_map)
+{
+  walk_tree (val, fixup_use, nm_to_phi_map, NULL);
+}
+
+/* Returns the iv update statement if USE's cand variable is
+   the version before the update; otherwise returns NULL.  */
+
+static gimple
+cause_overlapping_lr (struct ivopts_data *data,
+                      tree nm_used, struct iv_use *use,
+                      basic_block use_bb)
+{
+  tree selected_iv_nm;
+  edge e;
+  gimple increment;
+  enum tree_code incr_op;
+
+  selected_iv_nm = var_at_stmt (data->current_loop,
+                                use->selected,
+                                use->stmt);
+
+  if (nm_used != selected_iv_nm)
+    return NULL;
+
+  if (selected_iv_nm == use->selected->var_after)
+    return NULL;
+
+  /* Check if def of var_after reaches use_bb.  */
+  gcc_assert (single_pred_p (use_bb));
+  e = single_pred_edge (use_bb);
+
+  increment = SSA_NAME_DEF_STMT (use->selected->var_after);
+
+  if (e->src != gimple_bb (increment))
+    return NULL;
+
+  /* Only handle simple increments  */
+  if (gimple_code (increment) != GIMPLE_ASSIGN)
+    return NULL;
+
+  incr_op = gimple_assign_rhs_code (increment);
+  if (incr_op != PLUS_EXPR && incr_op != MINUS_EXPR)
+    return NULL;
+
+  if (!CONSTANT_CLASS_P (gimple_assign_rhs2 (increment)))
+    return NULL;
+
+  return increment;
 }
 
 
+/* Returns the loop closing phi for LIVE_OUT_IV in basic block TGT_BB.
+   IV_UPDATE_STMT is the update statement for LIVE_OUT_IV, and
+   *FOR_UPDATED_VAL is set to true if the argument of the phi is defined
+   by IV_UPDATE_STMT.  */
+
+static gimple
+find_closing_phi (basic_block tgt_bb, tree live_out_iv,
+                  gimple iv_update_stmt, bool *for_updated_val)
+{
+  gimple_stmt_iterator psi;
+  gimple phi = NULL;
+
+  *for_updated_val = false;
+
+  /* Now try to find the existing matching phi.  */
+  for (psi = gsi_start_phis (tgt_bb); !gsi_end_p (psi); gsi_next (&psi))
+    {
+      gimple p;
+      p = gsi_stmt (psi);
+
+      if (SSA_NAME_VAR (PHI_ARG_DEF (p, 0))
+          == SSA_NAME_VAR (live_out_iv))
+        {
+          phi = p;
+          break;
+        }
+    }
+
+  if (!phi)
+    return NULL;
+
+  if (PHI_ARG_DEF (phi, 0) == live_out_iv)
+    {
+      *for_updated_val = false;
+      /* Found exact match.  */
+      return phi;
+    }
+  else if (iv_update_stmt &&
+           PHI_ARG_DEF (phi, 0) == gimple_assign_lhs (iv_update_stmt))
+    {
+      *for_updated_val = true;
+      return phi;
+    }
+
+  return NULL;
+}
+
+
+/* The function ensures closed SSA form for moving use statement from USE
+   across the loop exit. LIVE_OUT_NM is the original ssa name that is live out,
+   TGT_BB is the destination bb of the code motion, and NM_TO_DEF_MAP maps
+   the original name to the result of the closing phi.
+
+   Scenario 1:
+   ----------------
+   Loop:
+
+   Loop_exit:
+
+     closed_iv_val = PHI (live_out_iv)
+
+     Uses of (live_out_iv) get replaced with closed_iv_val
+
+
+
+   Scenario 2:
+   ----------------
+   Loop:
+
+     updated_iv_val = live_out_iv + 1
+   Loop_exit:
+
+     closed_iv_val = PHI (updated_iv_val)
+     updated_iv_val2 = closed_iv_val - 1
+
+     Uses of live_out_iv get replaced with updated_iv_val2
+*/
+
+static gimple
+ensure_closed_ssa_form_for (struct ivopts_data *data,
+                            tree live_out_nm, basic_block tgt_bb,
+                            struct iv_use *use,
+                            struct pointer_map_t *nm_to_def_map)
+{
+  gimple closing_phi = NULL;
+  bool closing_phi_for_updated_val = false;
+
+  gimple def_stmt, new_def_stmt = NULL;
+  basic_block def_bb;
+  gimple iv_update_stmt;
+  void **slot;
+
+  def_stmt = SSA_NAME_DEF_STMT (live_out_nm);
+  def_bb = gimple_bb (def_stmt);
+
+  if (!def_bb
+      || flow_bb_inside_loop_p (def_bb->loop_father, tgt_bb))
+    return NULL;
+
+  iv_update_stmt
+      = cause_overlapping_lr (data, live_out_nm, use, tgt_bb);
+
+  gcc_assert (!iv_update_stmt ||
+              gimple_code (iv_update_stmt) == GIMPLE_ASSIGN);
+
+  closing_phi = find_closing_phi (tgt_bb, live_out_nm,
+                                  iv_update_stmt, &closing_phi_for_updated_val);
+
+  /* No closing phi is found.  */
+  if (!closing_phi)
+    {
+      edge e;
+      edge_iterator ei;
+
+      closing_phi = create_phi_node (live_out_nm, tgt_bb);
+      create_new_def_for (gimple_phi_result (closing_phi), closing_phi,
+                          gimple_phi_result_ptr (closing_phi));
+      gcc_assert (single_pred_p (tgt_bb));
+      if (!iv_update_stmt)
+        {
+          FOR_EACH_EDGE (e, ei, tgt_bb->preds)
+              add_phi_arg (closing_phi, live_out_nm, e, UNKNOWN_LOCATION);
+          new_def_stmt = closing_phi;
+        }
+      else
+        {
+          FOR_EACH_EDGE (e, ei, tgt_bb->preds)
+              add_phi_arg (closing_phi, gimple_assign_lhs (iv_update_stmt),
+                           e, UNKNOWN_LOCATION);
+          /* Now make the value adjustment.  */
+          new_def_stmt = get_inverted_increment (iv_update_stmt, closing_phi);
+        }
+    }
+  else if (!closing_phi_for_updated_val)
+    /* Scenario 1 above.  */
+    new_def_stmt = closing_phi;
+  else
+    {
+      /* Scenario 2 above.  */
+      gcc_assert (iv_update_stmt);
+      new_def_stmt = get_inverted_increment (iv_update_stmt, closing_phi);
+    }
+
+  /* Now map it.  */
+  slot = pointer_map_insert (nm_to_def_map, live_out_nm);
+  *slot = (void *) new_def_stmt;
+
+  return (new_def_stmt != closing_phi ? new_def_stmt : NULL);
+}
+
+/* The function ensures closed ssa form for all names used in
+   REPLACED_IV_OUT_VAL. TGT_BB is the target bb where the new
+   computation is going to be, USE is the nonlinear use to be
+   rewritten (at loop exits), and *FIXED_UP_VAL holds the live out
+   value after name fixup. It returns the inverted iv update
+   statement if it is created.  */
+
+static gimple
+ensure_closed_ssa_form (struct ivopts_data *data,
+                        basic_block tgt_bb,
+                        struct iv_use *use,
+                        tree replaced_iv_out_val,
+                        tree *fixed_up_val)
+{
+  unsigned i;
+  tree nm;
+  VEC(tree, heap) *used_ssa_names = NULL;
+  struct pointer_map_t *nm_to_def_map = NULL;
+  gimple inverted_incr = NULL;
+
+  nm_to_def_map = pointer_map_create ();
+  *fixed_up_val = unshare_expr (replaced_iv_out_val);
+  walk_tree_without_duplicates (fixed_up_val,
+                                collect_ssa_names, &used_ssa_names);
+
+  for (i = 0;
+       VEC_iterate (tree, used_ssa_names, i, nm); i++)
+    {
+      gimple inv_incr;
+      if ((inv_incr
+           = ensure_closed_ssa_form_for (data, nm, tgt_bb,
+                                         use, nm_to_def_map)))
+        {
+          gcc_assert (!inverted_incr);
+          inverted_incr = inv_incr;
+        }
+    }
+
+  /* Now fix up the references in val.  */
+  fixup_iv_out_val (fixed_up_val, nm_to_def_map);
+  pointer_map_destroy (nm_to_def_map);
+  return inverted_incr;
+}
+
+/* The function returns true if it is possible to sink final value
+   computation for REPLACED_IV_OUT_NAME at loop exits.  */
+
+static bool
+can_compute_final_value_at_exits_p (struct ivopts_data *data,
+                                    tree replaced_iv_out_name)
+{
+  imm_use_iterator iter;
+  use_operand_p use_p;
+  gimple use_stmt;
+
+  /* Walk through all nonlinear uses in all loop exit blocks
+     to see if the sinking transformation is doable.  */
+
+  FOR_EACH_IMM_USE_FAST (use_p, iter, replaced_iv_out_name)
+    {
+      basic_block exit_bb;
+      edge e;
+      edge_iterator ei;
+      bool found_exit_edge = false;
+
+      use_stmt = USE_STMT (use_p);
+      exit_bb = gimple_bb (use_stmt);
+
+      /* The use_stmt is another iv update
+         statement that also defines a liveout value and
+         has been removed.  */
+      if (!exit_bb)
+        continue;
+
+      if (flow_bb_inside_loop_p (data->current_loop, exit_bb))
+        continue;
+
+      if (single_pred_p (exit_bb))
+        continue;
+
+      FOR_EACH_EDGE (e, ei, exit_bb->preds)
+        {
+          if (!flow_bb_inside_loop_p (data->current_loop,
+                                      e->src))
+            continue;
+          /* Can not split the edge.  */
+          if (e->flags & EDGE_ABNORMAL)
+            return false;
+
+          /* Do not handle the case where the exit bb has
+             multiple incoming exit edges from the same loop.  */
+          if (found_exit_edge)
+            return false;
+
+          found_exit_edge = true;
+        }
+      if (!found_exit_edge)
+        return false;
+    }
+  return true;
+}
+
+/* Splits the loop exit edge targeting EXIT_BB (when EXIT_BB has multiple
+    predecessors) and returns the newly split bb.  REPLACED_IV_OUT_NAME is the original
+    ssa name that is live out, and the new use statement (new phi) will
+    be stored in *USE_STMT.  */
+
+static basic_block
+split_exit_edge (struct ivopts_data* data, basic_block exit_bb,
+                 tree replaced_iv_out_name, gimple *use_stmt)
+{
+  edge e;
+  edge_iterator ei;
+  FOR_EACH_EDGE (e, ei, exit_bb->preds)
+    {
+      edge exit_edge;
+      gimple_stmt_iterator psi;
+      gimple new_use_phi = NULL;
+
+      if (!flow_bb_inside_loop_p (data->current_loop, e->src))
+        continue;
+
+      gcc_assert (!(e->flags & EDGE_ABNORMAL));
+      exit_bb = split_loop_exit_edge (e);
+      exit_edge = single_pred_edge (exit_bb);
+
+      /* Now update the use stmt.  */
+      for (psi = gsi_start_phis (exit_bb);
+           !gsi_end_p (psi); gsi_next (&psi))
+        {
+          tree phi_arg;
+          gimple new_phi = gsi_stmt (psi);
+
+          phi_arg
+              = PHI_ARG_DEF_FROM_EDGE (new_phi, exit_edge);
+          if (phi_arg == replaced_iv_out_name)
+            {
+              new_use_phi = new_phi;
+              break;
+            }
+        }
+      gcc_assert (new_use_phi);
+      *use_stmt = new_use_phi;
+
+      /* There is only one exit edge to split.  */
+      break;
+    }
+
+  return exit_bb;
+}
+
+/* For a non linear use USE that is used outside the loop DATA->current_loop
+   only, try to evaluate the live out value at the exits of the loop.
+   REPLACED_IV_OUT_NAME is the original ssa name that is live out, and
+   REPLACED_IV_OUT_VAL is the expression (in terms of the selected iv cand)
+   to evaluate the live out value. The function tries to sink the computation
+   of replaced_iv_out_val into loop exits, and returns true if successful.  */
+
+static bool
+compute_final_value_at_exits (struct ivopts_data *data,
+                              struct iv_use *use,
+                              tree replaced_iv_out_name,
+                              tree replaced_iv_out_val)
+{
+  imm_use_iterator iter;
+  gimple use_stmt;
+  struct iv* replaced_iv;
+
+  if (!can_compute_final_value_at_exits_p (data, replaced_iv_out_name))
+    return false;
+
+  FOR_EACH_IMM_USE_STMT (use_stmt, iter, replaced_iv_out_name)
+    {
+      basic_block exit_bb;
+      gimple new_assign;
+      gimple_stmt_iterator gsi, bsi;
+      tree phi_rslt, new_assign_rhs;
+      tree fixed_up_val;
+      gimple inverted_increment;
+
+      exit_bb = gimple_bb (use_stmt);
+
+      /* The use_stmt is another iv update
+         statement that also defines a liveout value and
+         has been removed.  */
+      if (!exit_bb)
+        continue;
+
+      if (flow_bb_inside_loop_p (data->current_loop, exit_bb))
+        continue;
+
+      if (!single_pred_p (exit_bb))
+        exit_bb = split_exit_edge (data, exit_bb,
+                                   replaced_iv_out_name, &use_stmt);
+
+      gcc_assert (single_pred_p (exit_bb));
+
+      inverted_increment
+          = ensure_closed_ssa_form (data, exit_bb, use,
+                                    replaced_iv_out_val,
+                                    &fixed_up_val);
+
+      gcc_assert (gimple_code (use_stmt) == GIMPLE_PHI);
+      gsi = gsi_for_stmt (use_stmt);
+      phi_rslt = PHI_RESULT (use_stmt);
+      bsi = (inverted_increment
+             ? gsi_for_stmt (inverted_increment)
+             : gsi_after_labels (exit_bb));
+
+      /* Now convert the original loop exit phi (for closed SSA form)
+         into an assignment statement.  */
+      remove_phi_node (&gsi, false);
+      new_assign_rhs = force_gimple_operand_gsi (&bsi, fixed_up_val,
+                                                 false, NULL_TREE,
+                                                 (inverted_increment == NULL),
+                                                 (inverted_increment == NULL
+                                                  ? GSI_SAME_STMT
+                                                  : GSI_CONTINUE_LINKING));
+      new_assign = gimple_build_assign (phi_rslt, new_assign_rhs);
+      if (inverted_increment)
+        gsi_insert_after (&bsi, new_assign, GSI_SAME_STMT);
+      else
+        gsi_insert_before (&bsi, new_assign, GSI_SAME_STMT);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        {
+          fprintf (dump_file, "Sinking computation into exit bb %d\n",
+                   exit_bb->index);
+          print_gimple_stmt (dump_file, new_assign, 0, 0);
+          fprintf (dump_file, "\n");
+	}
+    }
+
+  /* Now the original stmt that defines the liveout value can be removed.  */
+
+  replaced_iv = get_iv (data, replaced_iv_out_name);
+  gcc_assert (replaced_iv);
+  replaced_iv->have_use_for = false;
+
+  return true;
+}
+
 /* Rewrites USE (definition of iv used in a nonlinear expression)
    using candidate CAND.  */
 
@@ -5455,6 +6444,11 @@ rewrite_use_nonlinear_expr (struct ivopt
       gcc_unreachable ();
     }
 
+  if (use->use_pos == IU_OUTSIDE_LOOP_ONLY)
+    {
+      if (compute_final_value_at_exits (data, use, tgt, comp))
+        return;
+    }
   op = force_gimple_operand_gsi (&bsi, comp, false, SSA_NAME_VAR (tgt),
 				 true, GSI_SAME_STMT);
 
@@ -5535,7 +6529,7 @@ rewrite_use_address (struct ivopts_data 
   aff_tree aff;
   gimple_stmt_iterator bsi = gsi_for_stmt (use->stmt);
   tree base_hint = NULL_TREE;
-  tree ref;
+  tree ref, iv;
   bool ok;
 
   ok = get_computation_aff (data->current_loop, use, cand, use->stmt, &aff);
@@ -5556,7 +6550,8 @@ rewrite_use_address (struct ivopts_data 
   if (cand->iv->base_object)
     base_hint = var_at_stmt (data->current_loop, cand, use->stmt);
 
-  ref = create_mem_ref (&bsi, TREE_TYPE (*use->op_p), &aff, base_hint,
+  iv = var_at_stmt (data->current_loop, cand, use->stmt);
+  ref = create_mem_ref (&bsi, TREE_TYPE (*use->op_p), &aff, iv, base_hint,
 			data->speed);
   copy_ref_info (ref, *use->op_p);
   *use->op_p = ref;
@@ -5691,6 +6686,7 @@ free_loop_data (struct ivopts_data *data
   unsigned i, j;
   bitmap_iterator bi;
   tree obj;
+  struct version_info *vi;
 
   if (data->niters)
     {
@@ -5748,6 +6744,14 @@ free_loop_data (struct ivopts_data *data
 
   data->max_inv_id = 0;
 
+  for (i = 0; VEC_iterate (version_info_p,
+                           data->pseudo_version_info, i, vi); i++)
+    free (vi);
+
+  VEC_truncate (version_info_p, data->pseudo_version_info, 0);
+  data->min_pseudo_inv_id = num_ssa_names;
+
+
   for (i = 0; VEC_iterate (tree, decl_rtl_to_reset, i, obj); i++)
     SET_DECL_RTL (obj, NULL_RTX);
 
@@ -5768,6 +6772,11 @@ tree_ssa_iv_optimize_finalize (struct iv
   VEC_free (tree, heap, decl_rtl_to_reset);
   VEC_free (iv_use_p, heap, data->iv_uses);
   VEC_free (iv_cand_p, heap, data->iv_candidates);
+  if (inverted_stmt_map)
+    {
+      pointer_map_destroy (inverted_stmt_map);
+      inverted_stmt_map = NULL;
+    }
 }
 
 /* Optimizes the LOOP.  Returns true if anything changed.  */
@@ -5830,7 +6839,6 @@ tree_ssa_iv_optimize_loop (struct ivopts
 
   /* Create the new induction variables (item 4, part 1).  */
   create_new_ivs (data, iv_ca);
-  iv_ca_free (&iv_ca);
 
   /* Rewrite the uses (item 4, part 2).  */
   rewrite_uses (data);
@@ -5838,6 +6846,9 @@ tree_ssa_iv_optimize_loop (struct ivopts
   /* Remove the ivs that are unused after rewriting.  */
   remove_unused_ivs (data);
 
+  adjust_update_pos_for_ivs (data, iv_ca);
+
+  iv_ca_free (&iv_ca);
   /* We have changed the structure of induction variables; it might happen
      that definitions in the scev database refer to some of them that were
      eliminated.  */
Index: gcc/tree-ssa-address.c
===================================================================
--- gcc/tree-ssa-address.c	(revision 159243)
+++ gcc/tree-ssa-address.c	(working copy)
@@ -450,6 +450,31 @@ move_pointer_to_base (struct mem_address
   aff_combination_remove_elt (addr, i);
 }
 
+/* Moves the loop variant part V in linear address ADDR to be the index
+   of PARTS.  */
+
+static void
+move_variant_to_index (struct mem_address *parts, aff_tree *addr, tree v)
+{
+  unsigned i;
+  tree val = NULL_TREE;
+
+  gcc_assert (!parts->index);
+  for (i = 0; i < addr->n; i++)
+    {
+      val = addr->elts[i].val;
+      if (val == v)
+	break;
+    }
+
+  if (i == addr->n)
+    return;
+
+  parts->index = fold_convert (sizetype, val);
+  parts->step = double_int_to_tree (sizetype, addr->elts[i].coef);
+  aff_combination_remove_elt (addr, i);
+}
+
 /* Adds ELT to PARTS.  */
 
 static void
@@ -553,7 +578,8 @@ most_expensive_mult_to_index (tree type,
 
 /* Splits address ADDR for a memory access of type TYPE into PARTS.
    If BASE_HINT is non-NULL, it specifies an SSA name to be used
-   preferentially as base of the reference.
+   preferentially as base of the reference, and IV_CAND is the selected
+   iv candidate used in ADDR.
 
    TODO -- be more clever about the distribution of the elements of ADDR
    to PARTS.  Some architectures do not support anything but single
@@ -563,8 +589,9 @@ most_expensive_mult_to_index (tree type,
    addressing modes is useless.  */
 
 static void
-addr_to_parts (tree type, aff_tree *addr, tree base_hint,
-	       struct mem_address *parts, bool speed)
+addr_to_parts (tree type, aff_tree *addr, tree iv_cand,
+	       tree base_hint, struct mem_address *parts,
+               bool speed)
 {
   tree part;
   unsigned i;
@@ -582,9 +609,17 @@ addr_to_parts (tree type, aff_tree *addr
   /* Try to find a symbol.  */
   move_fixed_address_to_symbol (parts, addr);
 
+  /* No need to do address parts reassociation if the number of parts
+     is <= 2 -- in that case, no loop invariant code motion can be
+     exposed.  */
+
+  if (!base_hint && (addr->n > 2))
+    move_variant_to_index (parts, addr, iv_cand);
+
   /* First move the most expensive feasible multiplication
      to index.  */
-  most_expensive_mult_to_index (type, parts, addr, speed);
+  if (!parts->index)
+    most_expensive_mult_to_index (type, parts, addr, speed);
 
   /* Try to find a base of the reference.  Since at the moment
      there is no reliable way how to distinguish between pointer and its
@@ -624,17 +659,19 @@ gimplify_mem_ref_parts (gimple_stmt_iter
 
 /* Creates and returns a TARGET_MEM_REF for address ADDR.  If necessary
    computations are emitted in front of GSI.  TYPE is the mode
-   of created memory reference.  */
+   of created memory reference. IV_CAND is the selected iv candidate in ADDR,
+   and IS_CAND_BASE is a flag indicating whether IV_CAND comes from a base
+   address object.  */
 
 tree
 create_mem_ref (gimple_stmt_iterator *gsi, tree type, aff_tree *addr,
-		tree base_hint, bool speed)
+		tree iv_cand, tree base_hint, bool speed)
 {
   tree mem_ref, tmp;
   tree atype;
   struct mem_address parts;
 
-  addr_to_parts (type, addr, base_hint, &parts, speed);
+  addr_to_parts (type, addr, iv_cand, base_hint, &parts, speed);
   gimplify_mem_ref_parts (gsi, &parts);
   mem_ref = create_mem_ref_raw (type, &parts);
   if (mem_ref)
Index: gcc/tree-flow.h
===================================================================
--- gcc/tree-flow.h	(revision 159243)
+++ gcc/tree-flow.h	(working copy)
@@ -863,7 +863,7 @@ struct mem_address
 
 struct affine_tree_combination;
 tree create_mem_ref (gimple_stmt_iterator *, tree,
-		     struct affine_tree_combination *, tree, bool);
+		     struct affine_tree_combination *, tree, tree, bool);
 rtx addr_for_mem_ref (struct mem_address *, addr_space_t, bool);
 void get_address_description (tree, struct mem_address *);
 tree maybe_fold_tmr (tree);

Attachment: ivopts.cg
Description: Binary data

Attachment: ivopts_test.cg
Description: Binary data

