This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Patch: Support IA-64 speculation [3/5]


Hi,

This patch is an addition to sched-rgn.c, that provides finding of more non-trivial regions for scheduling. It finds twice as much regions compared to traditional find_rgns procedure. The expense is 3-4 additional iterations over CFG. The patch doesn't add anything to compilation time (5-10 seconds on 45 minutes bootstrap). Enhancement of INSN_PRIORITY computation to properly reflect priority of instructions with speculative dependences (the work on which is under the progress) will profit to this patch. This patch can be disabled with setting the parameter max-sched-more-regions-iters to zero.

The patch was bootstrapped and regtested on the 20051125 weekly snapshot on ia64-linux and i686-linux.

Ok for mainline?


Thanks, Maxim


2005-12-23  Maxim Kuvyrkov <mkuvyrkov@ispras.ru>

	* sched-rgn.c (find_more_rgns): New static function.
	(find_rgns): Use it.
	(gather_region_statistics, print_region_statistics): New static
	functions.
	* params.def (PARAM_MAX_SCHED_MORE_REGIONS_ITERS): New parameter.
	(PARAM_MIN_SPEC_PROB): Rename to PARAM_MIN_SCHED_PROB. Adjust all
        users.
        * invoke.texi (max-sched-more-regions-iters): Document.
diff -upr -x .svn trunk/gcc/doc/invoke.texi trunk-spec/gcc/doc/invoke.texi
--- trunk/gcc/doc/invoke.texi	2005-12-23 17:42:36.000000000 +0300
+++ trunk-spec/gcc/doc/invoke.texi	2005-12-23 17:50:31.000000000 +0300
@@ -6113,6 +6113,12 @@ interblock scheduling.  The default valu
 The minimum probability of reaching a source block for interblock
 speculative scheduling.  The default value is 40.
 
+@item max-sched-more-regions-iters
+The maximum number of iterations through CFG to find more regions.
+0 - disable forming of more regions,
+N - do at most N iterations.
+The default value is 2.
+
 @item max-last-value-rtl
 
 The maximum size measured as number of RTLs that can be recorded in an expression
diff -upr -x .svn trunk/gcc/params.def trunk-spec/gcc/params.def
--- trunk/gcc/params.def	2005-12-23 17:42:36.000000000 +0300
+++ trunk-spec/gcc/params.def	2005-12-23 17:50:31.000000000 +0300
@@ -487,11 +487,16 @@ DEFPARAM(PARAM_MAX_SCHED_REGION_INSNS,
 	 "The maximum number of insns in a region to be considered for interblock scheduling",
 	 100, 0, 0)
 
-DEFPARAM(PARAM_MIN_SPEC_PROB,
-         "min-spec-prob",
+DEFPARAM(PARAM_MIN_SCHED_PROB,
+         "min-sched-prob",
          "The minimum probability of reaching a source block for interblock speculative scheduling",
          40, 0, 0)
 
+DEFPARAM(PARAM_MAX_SCHED_MORE_REGIONS_ITERS,
+         "max-sched-more-regions-iters",
+         "The maximum number of iterations through CFG to find more regions",
+         2, 0, 0)
+
 DEFPARAM(PARAM_MAX_LAST_VALUE_RTL,
 	 "max-last-value-rtl",
 	 "The maximum number of RTL nodes that can be recorded as combiner's last value",
diff -upr -x .svn trunk/gcc/sched-rgn.c trunk-spec/gcc/sched-rgn.c
--- trunk/gcc/sched-rgn.c	2005-12-23 17:42:36.000000000 +0300
+++ trunk-spec/gcc/sched-rgn.c	2005-12-23 17:51:40.000000000 +0300
@@ -127,6 +127,7 @@ static int *containing_rgn;
 void debug_regions (void);
 static void find_single_block_region (void);
 static void find_rgns (void);
+static void find_more_rgns (int *, int *, sbitmap, int *);
 static bool too_large (int, int *, int *);
 
 extern void debug_live (int, int);
@@ -650,7 +651,13 @@ find_rgns (void)
      blocks.  */
   if (!unreachable)
     {
-      int *queue;
+      int *queue, *degree1 = 0;
+      /* We use HEADER_MORE as an addition to HEADER and put
+	 there basic blocks, which are forced to be region heads.
+	 This is done to try to assemble few smaller regions 
+	 from a too_large region.  */
+      sbitmap header_more = 0;
+      int more_regions_p;
 
       if (no_loops)
 	SET_BIT (header, 0);
@@ -659,6 +666,14 @@ find_rgns (void)
 	 block of each region.  */
 
       queue = xmalloc (n_basic_blocks * sizeof (int));
+      
+      more_regions_p = PARAM_VALUE (PARAM_MAX_SCHED_MORE_REGIONS_ITERS) > 0;
+      if (more_regions_p)
+        {
+          degree1 = xmalloc (last_basic_block * sizeof (int));
+          header_more = sbitmap_alloc (last_basic_block);
+          sbitmap_zero (header_more);
+	}
 
       /* Find blocks which are inner loop headers.  We still have non-reducible
 	 loops to consider at this point.  */
@@ -706,6 +721,12 @@ find_rgns (void)
 	      too_large_failure = 0;
 	      loop_head = max_hdr[bb->index];
 
+              if (more_regions_p)
+                /* We save degree in case when we meet a too_large region 
+		   and cancel it.  We need a correct degree later when 
+                   calling find_more_rgns.  */
+                memcpy (degree1, degree, last_basic_block * sizeof (int));
+	      
 	      /* Decrease degree of all I's successors for topological
 		 ordering.  */
 	      FOR_EACH_EDGE (e, ei, bb->succs)
@@ -864,9 +885,34 @@ find_rgns (void)
 		    }
 		  ++nr_regions;
 		}
+              else if (more_regions_p)
+                {
+                  /* Restore DEGREE.  */
+                  int *t = degree;
+
+                  degree = degree1;
+                  degree1 = t;
+                  
+                  /* And force successors of BB to be region heads.
+		     This may provide several smaller regions instead
+		     of one too_large region.  */
+                  FOR_EACH_EDGE (e, ei, bb->succs)
+                    if (e->dest != EXIT_BLOCK_PTR)
+                      SET_BIT (header_more, e->dest->index);
+                }
 	    }
 	}
       free (queue);
+
+      if (more_regions_p)
+        {
+          free (degree1);
+          
+          sbitmap_a_or_b (header, header, header_more);
+          sbitmap_free (header_more);
+ 
+          find_more_rgns (degree, &idx, header, max_hdr);
+        }
     }
 
   /* Any block that did not end up in a region is placed into a region
@@ -882,7 +928,7 @@ find_rgns (void)
       }
 
   free (max_hdr);
-  free (dfs_nr);
+  free (degree);
   free (stack);
   sbitmap_free (header);
   sbitmap_free (inner);
@@ -890,6 +936,315 @@ find_rgns (void)
   sbitmap_free (in_stack);
 }
 
+static int gather_region_statistics (int **);
+static void print_region_statistics (int *, int, int *, int);
+
+/* Calculate the histogram that shows the number of regions having the 
+   given number of basic blocks, and store it in the RSP array.  Return 
+   the size of this array.  */
+static int
+gather_region_statistics (int **rsp)
+{
+  int i, *a = 0, a_sz = 0;
+
+  /* a[i] is the number of regions that have (i + 1) basic blocks.  */
+  for (i = 0; i < nr_regions; i++)
+    {
+      int nr_blocks = RGN_NR_BLOCKS (i);
+
+      gcc_assert (nr_blocks >= 1);
+
+      if (nr_blocks > a_sz)
+	{	 
+	  a = xrealloc (a, nr_blocks * sizeof (*a));
+	  do
+	    a[a_sz++] = 0;
+	  while (a_sz != nr_blocks);
+	}
+
+      a[nr_blocks - 1]++;
+    }
+
+  *rsp = a;
+  return a_sz;
+}
+
+/* Print regions statistics.  S1 and S2 denote the data before and after 
+   calling find_more_rgns, respectively.  */
+static void
+print_region_statistics (int *s1, int s1_sz, int *s2, int s2_sz)
+{
+  int i;
+  
+  /* We iterate until s2_sz because find_more_rgns does not decrease 
+     the maximal region size.  */
+  for (i = 1; i < s2_sz; i++)
+    {
+      int n1, n2;
+
+      n2 = s2[i];
+
+      if (n2 == 0)
+	continue;
+
+      if (i >= s1_sz)
+	n1 = 0;
+      else
+	n1 = s1[i];
+
+      fprintf (sched_dump, ";; More regions statistics: size %d: was %d + %d more\n",
+	       i + 1, n1, n2 - n1);
+    }
+}
+
+/* Find more non-trivial regions.
+   DEGREE - Array of incoming edge count, considering only
+   the edges, that don't have their sources in formed regions yet.
+   IDXP - pointer to the next available index in rgn_bb_table.
+   HEADER - set of all region heads.
+   LOOP_HDR - mapping from block to the containing loop
+   (two blocks can reside within one region if they have
+   the same loop header).  */
+static void
+find_more_rgns (int *degree, int *idxp, sbitmap header, int *loop_hdr)
+{
+  int *order, i, rescan = 0, idx = *idxp, iter = 0, max_iter, *max_hdr;
+  int nblocks = n_basic_blocks - NUM_FIXED_BLOCKS;
+  
+  max_iter = PARAM_VALUE (PARAM_MAX_SCHED_MORE_REGIONS_ITERS);
+
+  max_hdr = xmalloc (last_basic_block * sizeof (*max_hdr));
+
+  order = xmalloc (last_basic_block * sizeof (*order));
+  post_order_compute (order, false);	  
+
+  for (i = nblocks - 1; i >= 0; i--)
+    {
+      int bbn = order[i];
+      if (degree[bbn] >= 0)
+	{
+	  max_hdr[bbn] = bbn;
+	  rescan = 1;
+	}
+      else
+        /* This block already was processed in find_rgns.  */
+        max_hdr[bbn] = -1;
+    }
+  
+  /* The idea is to topologically walk through CFG in top-down order.
+     During the traversal, if all the predecessors of a node are
+     marked to be in the same region (they all have the same max_hdr),
+     then current node is also marked to be a part of that region. 
+     Otherwise the node starts its own region.
+     CFG should be traversed until no further changes are made.  On each 
+     iteration the set of the region heads is extended (the set of those 
+     blocks that have max_hdr[bbi] == bbi). This set is upper bounded by the 
+     set of all basic blocks, thus the algorithm is guaranteed to terminate.  */
+
+  while (rescan && iter < max_iter)
+    {
+      rescan = 0;
+      
+      for (i = nblocks - 1; i >= 0; i--)
+	{
+	  edge e;
+	  edge_iterator ei;
+	  int bbn = order[i];
+	
+	  if (max_hdr[bbn] != -1 && !TEST_BIT (header, bbn))
+	    {
+	      int hdr = -1;
+
+	      FOR_EACH_EDGE (e, ei, BASIC_BLOCK (bbn)->preds)
+		{
+		  int predn = e->src->index;
+
+		  if (predn != ENTRY_BLOCK
+		      /* If pred wasn't processed in find_rgns.  */
+		      && max_hdr[predn] != -1
+		      /* And pred and bb reside in the same loop.
+			 (Or out of any loop).  */
+		      && loop_hdr[bbn] == loop_hdr[predn])
+		    {
+		      if (hdr == -1)
+			/* Then bb extends the containing region of pred.  */
+			hdr = max_hdr[predn];
+		      else if (hdr != max_hdr[predn])
+			/* Too bad, there are at least two predecessors
+			   that reside in different regions.  Thus, BB should
+			   begin its own region.  */
+			{
+			  hdr = bbn;
+			  break;
+			}		    
+		    }
+		  else
+		    /* BB starts its own region.  */
+		    {
+		      hdr = bbn;
+		      break;
+		    }		
+		}
+	    
+	      if (hdr == bbn)
+		{
+		  /* If BB start its own region,
+		     update set of headers with BB.  */
+		  SET_BIT (header, bbn);
+		  rescan = 1;
+		}
+	      else
+		gcc_assert (hdr != -1);	    
+
+	      max_hdr[bbn] = hdr;
+	    }
+	}
+
+      iter++;
+    }
+  
+  /* Statistics were gathered on the SPEC2000 package of tests with
+     mainline weekly snapshot gcc-4.1-20051015 on ia64.
+     
+     Statistics for SPECint:
+     1 iteration : 1751 cases (38.7%)
+     2 iterations: 2770 cases (61.3%)
+     Blocks wrapped in regions by find_rgns:      18295 blocks
+     Blocks wrapped in regions by find_more_rgns: 23821 blocks
+     (We don't count single block regions here).
+     
+     Statistics for SPECfp:
+     1 iteration : 621 cases (35.9%)
+     2 iterations: 1110 cases (64.1%)
+     Blocks wrapped in regions by find_rgns:      6476 blocks
+     Blocks wrapped in regions by find_more_rgns: 11155 blocks
+     (We don't count single block regions here).
+
+     By default we do at most 2 iterations.
+     This can be overriden with max-sched-more-regions-iters parameter:
+     0 - disable forming of more regions,
+     N > 0 - do at most N iterations.  */
+  
+  if (sched_verbose && iter != 0)
+    fprintf (sched_dump, ";; More regions iterations: %d%s\n", iter,
+	     rescan ? "... failed" : "");
+    
+  if (!rescan && iter != 0)
+    {
+      int *s1 = 0, s1_sz = 0;
+
+      /* Save the old statistics for later printout.  */
+      if (sched_verbose >= 6)
+	s1_sz = gather_region_statistics (&s1);
+
+      /* We have succeeded.  Now assemble the regions.  */
+      for (i = nblocks - 1; i >= 0; i--)
+	{
+	  int bbn = order[i];
+
+	  if (max_hdr[bbn] == bbn)
+	    /* BBN is a region head.  */
+	    {
+	      edge e;
+	      edge_iterator ei;
+	      int num_bbs = 0, j, num_insns = 0, large;
+	
+	      large = too_large (bbn, &num_bbs, &num_insns);
+
+	      degree[bbn] = -1;
+	      rgn_bb_table[idx] = bbn;
+	      RGN_BLOCKS (nr_regions) = idx++;
+	      CONTAINING_RGN (bbn) = nr_regions;
+	      BLOCK_TO_BB (bbn) = 0;
+
+	      FOR_EACH_EDGE (e, ei, BASIC_BLOCK (bbn)->succs)
+		if (e->dest != EXIT_BLOCK_PTR)
+		  degree[e->dest->index]--;
+
+	      if (!large)
+		/* Here we check whether the region is too_large.  */
+		for (j = i - 1; j >= 0; j--)
+		  {
+		    int succn = order[j];
+		    if (max_hdr[succn] == bbn)
+		      {
+			if ((large = too_large (succn, &num_bbs, &num_insns)))
+			  break;
+		      }
+		  }
+
+	      if (large)
+		/* If the region is too_large, then wrap every block of
+		   the region into single block region.
+		   Here we wrap region head only.  Other blocks are
+		   processed in the below cycle.  */
+		{
+		  RGN_NR_BLOCKS (nr_regions) = 1;
+		  nr_regions++;
+		}          
+
+	      num_bbs = 1;
+
+	      for (j = i - 1; j >= 0; j--)
+		{
+		  int succn = order[j];
+
+		  if (max_hdr[succn] == bbn)
+		    /* This cycle iterates over all basic blocks, that 
+		       are supposed to be in the region with head BBN,
+		       and wraps them into that region (or in single
+		       block region).  */
+		    {
+		      gcc_assert (degree[succn] == 0);
+
+		      degree[succn] = -1;
+		      rgn_bb_table[idx] = succn;		 
+		      BLOCK_TO_BB (succn) = large ? 0 : num_bbs++;
+		      CONTAINING_RGN (succn) = nr_regions;
+
+		      if (large)
+			/* Wrap SUCCN into single block region.  */
+			{
+			  RGN_BLOCKS (nr_regions) = idx;
+			  RGN_NR_BLOCKS (nr_regions) = 1;
+			  nr_regions++;
+			}
+
+		      idx++;
+				
+		      FOR_EACH_EDGE (e, ei, BASIC_BLOCK (succn)->succs)
+			if (e->dest != EXIT_BLOCK_PTR)
+			  degree[e->dest->index]--;
+		    }
+		}
+
+	      if (!large)
+		{
+		  RGN_NR_BLOCKS (nr_regions) = num_bbs;
+		  nr_regions++;
+		}
+	    }
+	}
+
+      if (sched_verbose >= 6)
+	{
+	  int *s2, s2_sz;
+
+          /* Get the new statistics and print the comparison with the 
+             one before calling this function.  */
+	  s2_sz = gather_region_statistics (&s2);
+	  print_region_statistics (s1, s1_sz, s2, s2_sz);
+	  free (s1);
+	  free (s2);
+	}
+    }
+  
+  free (order);
+  free (max_hdr);
+
+  *idxp = idx; 
+}
+
 /* Functions for regions scheduling information.  */
 
 /* Compute dominators, probability, and potential-split-edges of bb.
@@ -1009,7 +1364,7 @@ compute_trg_info (int trg)
       if (sp->is_valid)
 	{
 	  sp->src_prob = GET_SRC_PROB (i, trg);
-	  sp->is_valid = (sp->src_prob >= PARAM_VALUE (PARAM_MIN_SPEC_PROB));
+	  sp->is_valid = (sp->src_prob >= PARAM_VALUE (PARAM_MIN_SCHED_PROB));
 	}
 
       if (sp->is_valid)

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]