This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

RFA: patch to fix IRA degradation on SPEC2006 calculix


H.J. reported huge degradation of IRA on calculix on x86_64

http://gcc.gnu.org/ml/gcc-patches/2008-11/msg00812.html

Calculix has one hot function (about 80% of all program time) e_c3d
with small most frequently executed loop.  One accidental spill in the
loop results in 30% degradation.  The degradation occurred after
submitting patch introducing bad spills processing.  I missed another
checks of bad spills in allocno comparison when all previous high
priority criteria are the same (see
ira-color.c::push_allocnos_to_stack).

I also found that the hot function in calculix has more 50 loops,
therefore IRA decides to use one region allocation.  Instead of such
approach, I change the code to make no more ira-max-loops-num regions
(most frequenly used ones) instead of one region.  I also changed
ira-max-loops-num value to 100 (there are still functions in calculix
which contains more 100 loops) because it looks like fortran functions
have many implicit loops (from vector operations).  It permits to
improve calculix score a bit more.

The rest of the patch is a code printing more info which I used for debugging.

The patch was successfully bootstrapped on x86_64.

Is it ok to commit?

2008-11-18 Vladimir Makarov <vmakarov@redhat.com>

* doc/invoke.texi (ira-max-loops-num): Change semantics.

* ira-int.h (struct ira_loop_tree_node): New member to_remove_p.

   * ira-color.c (allocno_spill_priority): New function.
   (remove_allocno_from_bucket_and_push, push_allocno_to_spill):
   Print more info about the spilled allocno.
   (push_allocnos_to_stack): Use allocno_spill_priority.  Add more
   checks on bad spill.

   * ira-build.c (loop_node_to_be_removed_p): Remove.
   (loop_compare_func, mark_loops_for_removal): New functions.
   (remove_uneccesary_loop_nodes_from_loop_t): Use member
   to_remove_p.
   (remove_unnecessary_allocnos): Call mark_loops_for_removal.

* ira.c (ira): Don't change flag_ira_algorithm.

* params.def (ira-max-loops-num): Change the value.

Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 141945)
+++ doc/invoke.texi	(working copy)
@@ -7603,10 +7603,10 @@ be disabled.  The default maximum SCC si
 
 @item ira-max-loops-num
 IRA uses a regional register allocation by default.  If a function
-contains loops more than number given by the parameter, non-regional
-register allocator will be used even when option
-@option{-fira-algorithm} is given.  The default value of the parameter
-is 20.
+contains loops more than number given by the parameter, only at most
+given number of the most frequently executed loops will form regions
+for the regional register allocation.  The default value of the
+parameter is 100.
 
 @end table
 @end table
Index: ira-int.h
===================================================================
--- ira-int.h	(revision 141945)
+++ ira-int.h	(working copy)
@@ -98,6 +98,10 @@ struct ira_loop_tree_node
   /* All the following members are defined only for nodes representing
      loops.  */
 
+  /* True if the loop was marked for removal from the register
+     allocation.  */
+  bool to_remove_p;
+
   /* Allocnos in the loop corresponding to their regnos.  If it is
      NULL the loop does not form a separate register allocation region
      (e.g. because it has abnormal enter/exit edges and we can not put
Index: ira-color.c
===================================================================
--- ira-color.c	(revision 141945)
+++ ira-color.c	(working copy)
@@ -612,6 +612,17 @@ static ira_allocno_t uncolorable_allocno
    of given *cover* class in the uncolorable_bucket.  */
 static int uncolorable_allocnos_num[N_REG_CLASSES];
 
+/* Return the current spill priority of allocno A.  The less the
+   number, the more preferable the allocno for spilling.  */
+static int
+allocno_spill_priority (ira_allocno_t a)
+{
+  return (ALLOCNO_TEMP (a)
+	  / (ALLOCNO_LEFT_CONFLICTS_NUM (a)
+	     * ira_reg_class_nregs[ALLOCNO_COVER_CLASS (a)][ALLOCNO_MODE (a)]
+	     + 1));
+}
+
 /* Add ALLOCNO to bucket *BUCKET_PTR.  ALLOCNO should be not in a bucket
    before the call.  */
 static void
@@ -882,7 +893,12 @@ remove_allocno_from_bucket_and_push (ira
     {
       fprintf (ira_dump_file, "      Pushing");
       print_coalesced_allocno (allocno);
-      fprintf (ira_dump_file, "%s\n", colorable_p ? "" : "(potential spill)");
+      if (colorable_p)
+	fprintf (ira_dump_file, "\n");
+      else
+	fprintf (ira_dump_file, "(potential spill: %spri=%d, cost=%d)\n",
+		 ALLOCNO_BAD_SPILL_P (allocno) ? "bad spill, " : "",
+		 allocno_spill_priority (allocno), ALLOCNO_TEMP (allocno));
     }
   cover_class = ALLOCNO_COVER_CLASS (allocno);
   ira_assert ((colorable_p
@@ -916,7 +932,7 @@ push_allocno_to_spill (ira_allocno_t all
   delete_allocno_from_bucket (allocno, &uncolorable_allocno_bucket);
   ALLOCNO_MAY_BE_SPILLED_P (allocno) = true;
   if (internal_flag_ira_verbose > 3 && ira_dump_file != NULL)
-    fprintf (ira_dump_file, "      Pushing p%d(%d) (potential spill)\n",
+    fprintf (ira_dump_file, "      Pushing p%d(%d) (spill for NO_REGS)\n",
 	     ALLOCNO_NUM (allocno), ALLOCNO_REGNO (allocno));
   push_allocno_to_stack (allocno);
 }
@@ -1181,21 +1197,18 @@ push_allocnos_to_stack (void)
 		  i++;
 		  ira_assert (ALLOCNO_TEMP (i_allocno) != INT_MAX);
 		  i_allocno_cost = ALLOCNO_TEMP (i_allocno);
-		  i_allocno_pri
-		    = (i_allocno_cost
-		       / (ALLOCNO_LEFT_CONFLICTS_NUM (i_allocno)
-			  * ira_reg_class_nregs[ALLOCNO_COVER_CLASS
-						(i_allocno)]
-			  [ALLOCNO_MODE (i_allocno)] + 1));
+		  i_allocno_pri = allocno_spill_priority (i_allocno);
 		  if (allocno == NULL
 		      || (! ALLOCNO_BAD_SPILL_P (i_allocno)
 			  && ALLOCNO_BAD_SPILL_P (allocno))
-		      || allocno_pri > i_allocno_pri
-		      || (allocno_pri == i_allocno_pri
-			  && (allocno_cost > i_allocno_cost
-			      || (allocno_cost == i_allocno_cost 
-				  && (ALLOCNO_NUM (allocno)
-				      > ALLOCNO_NUM (i_allocno))))))
+		      || (! (ALLOCNO_BAD_SPILL_P (i_allocno)
+			     && ! ALLOCNO_BAD_SPILL_P (allocno))
+			  && (allocno_pri > i_allocno_pri
+			      || (allocno_pri == i_allocno_pri
+				  && (allocno_cost > i_allocno_cost
+				      || (allocno_cost == i_allocno_cost 
+					  && (ALLOCNO_NUM (allocno)
+					      > ALLOCNO_NUM (i_allocno))))))))
 		    {
 		      allocno = i_allocno;
 		      allocno_cost = i_allocno_cost;
Index: ira-build.c
===================================================================
--- ira-build.c	(revision 141945)
+++ ira-build.c	(working copy)
@@ -1650,20 +1650,80 @@ low_pressure_loop_node_p (ira_loop_tree_
   return true;
 }
 
-/* Return TRUE if NODE represents a loop with should be removed from
-   regional allocation.  We remove a loop with low register pressure
-   inside another loop with register pressure.  In this case a
-   separate allocation of the loop hardly helps (for irregular
-   register file architecture it could help by choosing a better hard
-   register in the loop but we prefer faster allocation even in this
-   case).  */
-static bool
-loop_node_to_be_removed_p (ira_loop_tree_node_t node)
+/* Sort loops for marking them for removal.  We put already marked
+   loops first, then less frequent loops next, and then outer loops
+   next.  */
+static int
+loop_compare_func (const void *v1p, const void *v2p)
 {
-  return (node->parent != NULL && low_pressure_loop_node_p (node->parent)
-	  && low_pressure_loop_node_p (node));
+  int diff;
+  ira_loop_tree_node_t l1 = *(const ira_loop_tree_node_t *) v1p;
+  ira_loop_tree_node_t l2 = *(const ira_loop_tree_node_t *) v2p;
+
+  ira_assert (l1->parent != NULL && l2->parent != NULL);
+  if (l1->to_remove_p && ! l2->to_remove_p)
+    return -1;
+  if (! l1->to_remove_p && l2->to_remove_p)
+    return 1;
+  if ((diff = l1->loop->header->frequency - l2->loop->header->frequency) != 0)
+    return diff;
+  if ((diff = (int) loop_depth (l1->loop) - (int) loop_depth (l2->loop)) != 0)
+    return diff;
+  /* Make sorting stable.  */
+  return l1->loop->num - l2->loop->num;
 }
 
+
+/* Mark loops which should be removed from regional allocation.  We
+   remove a loop with low register pressure inside another loop with
+   register pressure.  In this case a separate allocation of the loop
+   hardly helps (for irregular register file architecture it could
+   help by choosing a better hard register in the loop but we prefer
+   faster allocation even in this case).  We also remove cheap loops
+   if there are more than IRA_MAX_LOOPS_NUM of them.  */
+static void
+mark_loops_for_removal (void)
+{
+  int i, n;
+  ira_loop_tree_node_t *sorted_loops;
+  loop_p loop;
+
+  sorted_loops
+    = (ira_loop_tree_node_t *) ira_allocate (sizeof (ira_loop_tree_node_t)
+					     * VEC_length (loop_p,
+							   ira_loops.larray));
+  for (n = i = 0; VEC_iterate (loop_p, ira_loops.larray, i, loop); i++)
+    {
+      if (ira_loop_nodes[i].parent == NULL)
+	{
+	  /* Don't remove the root.  */
+	  ira_loop_nodes[i].to_remove_p = false;
+	  continue;
+	}
+      sorted_loops[n++] = &ira_loop_nodes[i];
+      ira_loop_nodes[i].to_remove_p
+	= (low_pressure_loop_node_p (ira_loop_nodes[i].parent)
+	   && low_pressure_loop_node_p (&ira_loop_nodes[i]));
+    }
+  qsort (sorted_loops, n, sizeof (ira_loop_tree_node_t), loop_compare_func);
+  for (i = 0; n - i + 1 > IRA_MAX_LOOPS_NUM; i++)
+    {
+      sorted_loops[i]->to_remove_p = true;
+      if (internal_flag_ira_verbose > 1 && ira_dump_file != NULL)
+	fprintf
+	  (ira_dump_file,
+	   "  Mark loop %d (header %d, freq %d, depth %d) for removal (%s)\n",
+	   sorted_loops[i]->loop->num, sorted_loops[i]->loop->header->index,
+	   sorted_loops[i]->loop->header->frequency,
+	   loop_depth (sorted_loops[i]->loop),
+	   low_pressure_loop_node_p (sorted_loops[i]->parent)
+	   && low_pressure_loop_node_p (sorted_loops[i])
+	   ? "low pressure" : "cheap loop");
+    }
+  ira_free (sorted_loops);
+}
+
+
 /* Definition of vector of loop tree nodes.  */
 DEF_VEC_P(ira_loop_tree_node_t);
 DEF_VEC_ALLOC_P(ira_loop_tree_node_t, heap);
@@ -1683,7 +1743,7 @@ remove_uneccesary_loop_nodes_from_loop_t
   bool remove_p;
   ira_loop_tree_node_t subnode;
 
-  remove_p = loop_node_to_be_removed_p (node);
+  remove_p = node->to_remove_p;
   if (! remove_p)
     VEC_safe_push (ira_loop_tree_node_t, heap, children_vec, node);
   start = VEC_length (ira_loop_tree_node_t, children_vec);
@@ -1732,13 +1792,13 @@ remove_unnecessary_allocnos (void)
       {
 	next_a = ALLOCNO_NEXT_REGNO_ALLOCNO (a);
 	a_node = ALLOCNO_LOOP_TREE_NODE (a);
-	if (! loop_node_to_be_removed_p (a_node))
+	if (! a_node->to_remove_p)
 	  prev_a = a;
 	else
 	  {
 	    for (parent = a_node->parent;
 		 (parent_a = parent->regno_allocno_map[regno]) == NULL
-		   && loop_node_to_be_removed_p (parent);
+		   && parent->to_remove_p;
 		 parent = parent->parent)
 	      ;
 	    if (parent_a == NULL)
@@ -1815,6 +1875,7 @@ remove_unnecessary_allocnos (void)
 static void
 remove_unnecessary_regions (void)
 {
+  mark_loops_for_removal ();
   children_vec
     = VEC_alloc (ira_loop_tree_node_t, heap,
 		 last_basic_block + VEC_length (loop_p, ira_loops.larray));
Index: ira.c
===================================================================
--- ira.c	(revision 141945)
+++ ira.c	(working copy)
@@ -1708,7 +1708,6 @@ ira (FILE *f)
   bool loops_p;
   int max_regno_before_ira, ira_max_point_before_emit;
   int rebuild_p;
-  int saved_flag_ira_algorithm;
   basic_block bb;
 
   timevar_push (TV_IRA);
@@ -1783,9 +1782,6 @@ ira (FILE *f)
   ira_assert (current_loops == NULL);
   flow_loops_find (&ira_loops);
   current_loops = &ira_loops;
-  saved_flag_ira_algorithm = flag_ira_algorithm;
-  if (optimize && number_of_loops () > (unsigned) IRA_MAX_LOOPS_NUM)
-    flag_ira_algorithm = IRA_ALGORITHM_CB;
       
   if (internal_flag_ira_verbose > 0 && ira_dump_file != NULL)
     fprintf (ira_dump_file, "Building IRA IR\n");
@@ -1908,8 +1904,6 @@ ira (FILE *f)
     bb->loop_father = NULL;
   current_loops = NULL;
 
-  flag_ira_algorithm = saved_flag_ira_algorithm;
-
   regstat_free_ri ();
   regstat_free_n_sets_and_refs ();
       
Index: params.def
===================================================================
--- params.def	(revision 141945)
+++ params.def	(working copy)
@@ -754,7 +754,7 @@ DEFPARAM (PARAM_DF_DOUBLE_QUEUE_THRESHOL
 DEFPARAM (PARAM_IRA_MAX_LOOPS_NUM,
 	  "ira-max-loops-num",
 	  "max loops number for regional RA",
-	  50, 0, 0)
+	  100, 0, 0)
 
 /* Switch initialization conversion will refuse to create arrays that are
    bigger than this parameter times the number of switch branches.  */

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]