This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

Global code hoisting



This is another of the global optimizers I've been meaning to contribute for
a while.

Given this trivial cfg (all flow is downward)

   A
  / \
 B   C
  \ /
   D

Pretend that blocks B & C both compute some expression E.  No path through
the cfg computes E more than one time, so it is not optimized by redundancy
elimination algorithms like gcse or lcm.

However, we can save space by moving the computation of expression E into
basic block A.


That's the entire point of this optimization pass.  To find blocks in the
cfg which compute some expression E and which are dominated by a common block
earlier in the cfg and try to move the computation of E into the dominator
block.

This optimization rarely improves code speed, it is strictly meant to improve
code density.  Furthermore, it was found that the actions of pre/lcm will
tend to interfere with code hoisting and produce larger code(1).  So this pass
is only run when optimizing for size (when optimizing for code size we run
the old classic gcse algorithm instead of pre/lcm).

Jeff

(1) Actually, what happens is the pre/lcm followed by code hoisting will leave
a bunch of copies in the cfg that can not be propagated out without doing
partial copy propagation (if such a thing even exists).  It has been speculated
that running code hoisting before pre/lcm might avoid the problem, but I
never had the time to test that.  It might be a good project for someone that
doesn't know a lot about compilers, but has time and cpu cycles to spare.

        * basic-block.h (compute_flow_dominators): Declare.

        * gcse.c (alloc_code_hoist_mem): New function.
        (free_code_hoist_mem, compute_code_hoist_vbeinout): Likewise.
        (compute_code_hoist_data, hoist_expr_reaches_here_p): Likewise.
        (hoist_code, one_code_hoisting_pass): Likewise.
        (gcse_main): If optimizing for size, then hoist expressions
        computed in multiple dominated basic blocks.

Index: gcse.c
===================================================================
RCS file: /egcs/carton/cvsfiles/egcs/gcc/gcse.c,v
retrieving revision 1.51
diff -c -3 -p -r1.51 gcse.c
*** gcse.c	1999/09/20 13:58:32	1.51
--- gcse.c	1999/09/20 14:36:57
*************** static int one_pre_gcse_pass	  PROTO ((i
*** 600,605 ****
--- 600,613 ----
  
  static void add_label_notes	      PROTO ((rtx, rtx));
  
+ static void alloc_code_hoist_mem	PROTO ((int, int));
+ static void free_code_hoist_mem		PROTO ((void));
+ static void compute_code_hoist_vbeinout	PROTO ((void));
+ static void compute_code_hoist_data	PROTO ((void));
+ static int hoist_expr_reaches_here_p	PROTO ((int, int, int, char *));
+ static void hoist_code			PROTO ((void));
+ static int one_code_hoisting_pass	PROTO ((void));
+ 
  static void alloc_rd_mem	      PROTO ((int, int));
  static void free_rd_mem	       PROTO ((void));
  static void handle_rd_kill_set	PROTO ((rtx, int, int));
*************** gcse_main (f, file)
*** 730,737 ****
--- 738,765 ----
        if (max_pass_bytes < bytes_used)
  	max_pass_bytes = bytes_used;
  
+       /* Free up memory, then reallocate for code hoisting.  We can
+ 	 not re-use the existing allocated memory because the tables
+ 	 will not have info for the insns or registers created by
+ 	 partial redundancy elimination.  */
        free_gcse_mem ();
  
+       /* It does not make sense to run code hoisting unless optimizing
+ 	 for code size -- it rarely makes programs faster, and can make
+ 	 them bigger if we did partial redundancy elimination (when optimizing
+ 	 for space, we use a classic gcse algorithm instead of partial
+ 	 redundancy algorithms).  */
+       if (optimize_size)
+         {
+ 	  max_gcse_regno = max_reg_num ();
+ 	  alloc_gcse_mem (f);
+ 	  changed |= one_code_hoisting_pass ();
+ 	  free_gcse_mem ();
+ 
+ 	  if (max_pass_bytes < bytes_used)
+ 	    max_pass_bytes = bytes_used;
+         }
+ 
        if (file)
  	{
  	  fprintf (file, "\n");
*************** delete_null_pointer_checks (f)
*** 5043,5046 ****
--- 5071,5428 ----
    free (nonnull_killed);
    free (nonnull_avin);
    free (nonnull_avout);
+ }
+ 
+ /* Code Hoisting variables and subroutines.  */
+ 
+ /* Very busy expressions.  */
+ static sbitmap *hoist_vbein;
+ static sbitmap *hoist_vbeout;
+ 
+ /* Hoistable expressions.  */
+ static sbitmap *hoist_exprs;
+ 
+ /* Dominator bitmaps.  */
+ static sbitmap *dominators;
+ static sbitmap *post_dominators;
+ 
+ /* ??? We could compute post dominators and run this algorithm in
+    reverse to perform tail merging; doing so would probably be
+    more effective than the tail merging code in jump.c.
+ 
+    It's unclear if tail merging could be run in parallel with
+    code hoisting.  It would be nice.  */
+ 
+ /* Allocate vars used for code hoisting analysis.  */
+ 
+ static void
+ alloc_code_hoist_mem (n_blocks, n_exprs)
+      int n_blocks, n_exprs;
+ {
+   antloc = sbitmap_vector_alloc (n_blocks, n_exprs);
+   transp = sbitmap_vector_alloc (n_blocks, n_exprs);
+   comp = sbitmap_vector_alloc (n_blocks, n_exprs);
+ 
+   hoist_vbein = sbitmap_vector_alloc (n_blocks, n_exprs);
+   hoist_vbeout = sbitmap_vector_alloc (n_blocks, n_exprs);
+   hoist_exprs = sbitmap_vector_alloc (n_blocks, n_exprs);
+   transpout = sbitmap_vector_alloc (n_blocks, n_exprs);
+ 
+   dominators = sbitmap_vector_alloc (n_blocks, n_blocks);
+   post_dominators = sbitmap_vector_alloc (n_blocks, n_blocks);
+ }
+ 
+ /* Free vars used for code hoisting analysis.  */
+ 
+ static void
+ free_code_hoist_mem ()
+ {
+   free (antloc);
+   free (transp);
+   free (comp);
+ 
+   free (hoist_vbein);
+   free (hoist_vbeout);
+   free (hoist_exprs);
+   free (transpout);
+ 
+   free (dominators);
+   free (post_dominators);
+ }
+ 
+ /* Compute the very busy expressions at entry/exit from each block.
+ 
+    An expression is very busy if all paths from a given point
+    compute the expression.  */
+ 
+ static void
+ compute_code_hoist_vbeinout ()
+ {
+   int bb, changed, passes;
+ 
+   sbitmap_vector_zero (hoist_vbeout, n_basic_blocks);
+   sbitmap_vector_zero (hoist_vbein, n_basic_blocks);
+ 
+   passes = 0;
+   changed = 1;
+   while (changed)
+     {
+       changed = 0;
+       /* We scan the blocks in the reverse order to speed up
+ 	 the convergence.  */
+       for (bb = n_basic_blocks - 1; bb >= 0; bb--)
+ 	{
+ 	  changed |= sbitmap_a_or_b_and_c (hoist_vbein[bb], antloc[bb],
+ 					   hoist_vbeout[bb], transp[bb]);
+ 	  if (bb != n_basic_blocks - 1)
+ 	    sbitmap_intersect_of_successors (hoist_vbeout[bb], hoist_vbein,
+ 					     bb, s_succs);
+ 	}
+       passes++;
+     }
+ 
+   if (gcse_file)
+     fprintf (gcse_file, "hoisting vbeinout computation: %d passes\n", passes);
+ }
+ 
+ /* Top level routine to do the dataflow analysis needed by code hoisting.  */
+ 
+ static void
+ compute_code_hoist_data ()
+ {
+   compute_local_properties (transp, comp, antloc, 0);
+   compute_transpout ();
+   compute_code_hoist_vbeinout ();
+   compute_flow_dominators (dominators, post_dominators);
+   if (gcse_file)
+     fprintf (gcse_file, "\n");
+ }
+ 
+ /* Determine if the expression identified by EXPR_INDEX would
+    reach BB unimpaired if it was placed at the end of EXPR_BB.
+ 
+    It's unclear exactly what Muchnick meant by "unimpaired".  It seems
+    to me that the expression must either be computed or transparent in
+    *every* block in the path(s) from EXPR_BB to BB.  Any other definition
+    would allow the expression to be hoisted out of loops, even if
+    the expression wasn't a loop invariant.
+ 
+    Contrast this to reachability for PRE where an expression is
+    considered reachable if *any* path reaches instead of *all*
+    paths.  */
+ 
+ static int
+ hoist_expr_reaches_here_p (expr_bb, expr_index, bb, visited)
+      int expr_bb;
+      int expr_index;
+      int bb;
+      char *visited;
+ {
+   edge pred;
+ 
+   if (visited == NULL)
+     {
+       visited = (char *) alloca (n_basic_blocks);
+       bzero (visited, n_basic_blocks);
+     }
+ 
+   visited[expr_bb] = 1;
+   for (pred = BASIC_BLOCK (bb)->pred; pred != NULL; pred = pred->pred_next)
+     {
+       int pred_bb = pred->src->index;
+ 
+       if (pred->src == ENTRY_BLOCK_PTR)
+ 	break;
+       else if (visited[pred_bb])
+ 	continue;
+       /* Does this predecessor generate this expression?  */
+       else if (TEST_BIT (comp[pred_bb], expr_index))
+ 	break;
+       else if (! TEST_BIT (transp[pred_bb], expr_index))
+ 	break;
+       /* Not killed.  */
+       else
+ 	{
+ 	  visited[pred_bb] = 1;
+ 	  if (! hoist_expr_reaches_here_p (expr_bb, expr_index,
+ 					   pred_bb, visited))
+ 	    break;
+ 	}
+     }
+ 
+   return (pred == NULL);
+ }
+ 
+ /* Actually perform code hoisting.  */
+ static void
+ hoist_code ()
+ {
+   int bb, dominated, i;
+   struct expr **index_map;
+ 
+   sbitmap_vector_zero (hoist_exprs, n_basic_blocks);
+ 
+   /* Compute a mapping from expression number (`bitmap_index') to
+      hash table entry.  */
+ 
+   index_map = (struct expr **) alloca (n_exprs * sizeof (struct expr *));
+   bzero ((char *) index_map, n_exprs * sizeof (struct expr *));
+   for (i = 0; i < expr_hash_table_size; i++)
+     {
+       struct expr *expr;
+ 
+       for (expr = expr_hash_table[i]; expr != NULL; expr = expr->next_same_hash)
+ 	index_map[expr->bitmap_index] = expr;
+     }
+ 
+   /* Walk over each basic block looking for potentially hoistable
+      expressions, nothing gets hoisted from the entry block.  */
+   for (bb = 0; bb < n_basic_blocks; bb++)
+     {
+       int found = 0;
+       int insn_inserted_p;
+ 
+       /* Examine each expression that is very busy at the exit of this
+ 	 block.  These are the potentially hoistable expressions.  */
+       for (i = 0; i < hoist_vbeout[bb]->n_bits; i++)
+ 	{
+ 	  int hoistable = 0;
+ 	  if (TEST_BIT (hoist_vbeout[bb], i)
+ 	      && TEST_BIT (transpout[bb], i))
+ 	    {
+ 	      /* We've found a potentially hoistable expression, now
+ 		 we look at every block BB dominates to see if it
+ 		 computes the expression.  */
+ 	      for (dominated = 0; dominated < n_basic_blocks; dominated++)
+ 		{
+ 		  /* Ignore self dominance.  */
+ 		  if (bb == dominated
+ 		      || ! TEST_BIT (dominators[dominated], bb))
+ 		    continue;
+ 
+ 		  /* We've found a dominated block, now see if it computes
+ 		     the busy expression and whether or not moving that
+ 		     expression to the "beginning" of that block is safe.  */
+ 		  if (!TEST_BIT (antloc[dominated], i))
+ 		    continue;
+ 
+ 		  /* Note if the expression would reach the dominated block
+ 		     unimpaired if it was placed at the end of BB.
+ 
+ 		     Keep track of how many times this expression is hoistable
+ 		     from a dominated block into BB.  */
+ 		  if (hoist_expr_reaches_here_p (bb, i, dominated, NULL))
+ 		    hoistable++;
+ 		}
+ 
+ 	      /* If we found more than one hoistable occurrence of this
+ 		 expression, then note it in the bitmap of expressions to
+ 		 hoist.  It makes no sense to hoist things which are computed
+ 		 in only one BB, and doing so tends to pessimize register
+ 		 allocation.  One could increase this value to try harder
+ 		 to avoid any possible code expansion due to register
+ 		 allocation issues; however experiments have shown that
+ 		 the vast majority of hoistable expressions are only movable
+ 		 from two successors, so raising this threshold is likely
+ 		 to nullify any benefit we get from code hoisting.  */
+ 	      if (hoistable > 1)
+ 		{
+ 		  SET_BIT (hoist_exprs[bb], i);
+ 		  found = 1;
+ 		}
+ 	    }
+ 	}
+ 		
+       /* If we found nothing to hoist, then quit now.  */
+       if (! found)
+ 	continue;
+ 
+       /* Loop over all the hoistable expressions.  */
+       for (i = 0; i < hoist_exprs[bb]->n_bits; i++)
+ 	{
+ 	  /* We want to insert the expression into BB only once, so
+ 	     note when we've inserted it.  */
+ 	  insn_inserted_p = 0;
+ 
+ 	  /* These tests should be the same as the tests above.  */
+ 	  if (TEST_BIT (hoist_vbeout[bb], i))
+ 	    {
+ 	      /* We've found a potentially hoistable expression, now
+ 		 we look at every block BB dominates to see if it
+ 		 computes the expression.  */
+ 	      for (dominated = 0; dominated < n_basic_blocks; dominated++)
+ 		{
+ 		  /* Ignore self dominance.  */
+ 		  if (bb == dominated
+ 		      || ! TEST_BIT (dominators[dominated], bb))
+ 		    continue;
+ 
+ 		  /* We've found a dominated block, now see if it computes
+ 		     the busy expression and whether or not moving that
+ 		     expression to the "beginning" of that block is safe.  */
+ 		  if (!TEST_BIT (antloc[dominated], i))
+ 		    continue;
+ 
+ 		  /* The expression is computed in the dominated block and
+ 		     it would be safe to compute it at the start of the
+ 		     dominated block.  Now we have to determine if the
+ 		     expression would reach the dominated block if it was
+ 		     placed at the end of BB.  */
+ 		  if (hoist_expr_reaches_here_p (bb, i, dominated, NULL))
+ 		    {
+ 		      struct expr *expr = index_map[i];
+ 		      struct occr *occr = expr->antic_occr;
+ 		      rtx insn;
+ 		      rtx set;
+ 
+ 		  
+ 		      /* Find the right occurrence of this expression.  */
+ 		      while (occr != NULL && BLOCK_NUM (occr->insn) != dominated)
+ 			occr = occr->next;
+ 
+ 		      /* Should never happen.  */
+ 		      if (!occr)
+ 			abort ();
+ 
+ 		      insn = occr->insn;
+ 		 
+ 		      set = single_set (insn);
+ 		      if (! set)
+ 			abort ();
+ 
+ 		      /* Create a pseudo-reg to store the result of reaching
+ 			 expressions into.  Get the mode for the new pseudo
+ 			 from the mode of the original destination pseudo.  */
+ 		      if (expr->reaching_reg == NULL)
+ 			expr->reaching_reg
+ 			  = gen_reg_rtx (GET_MODE (SET_DEST (set)));
+ 
+ 		      /* In theory this should never fail since we're creating
+ 			 a reg->reg copy.
+ 
+ 			 However, on the x86 some of the movXX patterns actually
+ 			 contain clobbers of scratch regs.  This may cause the
+ 			 insn created by validate_change to not match any
+ 			 pattern and thus cause validate_change to fail.   */
+ 		      if (validate_change (insn, &SET_SRC (set),
+ 					   expr->reaching_reg, 0))
+ 			{
+ 			  occr->deleted_p = 1;
+ 			  if (!insn_inserted_p)
+ 			    {
+ 			      insert_insn_end_bb (index_map[i], bb, 0);
+ 			      insn_inserted_p = 1;
+ 			    }
+ 			}
+ 		    }
+ 		}
+ 	    }
+ 	}
+     }
+ }
+ 
+ /* Top level routine to perform one code hoisting (aka unification) pass
+ 
+    Return non-zero if a change was made.  */
+ 
+ static int
+ one_code_hoisting_pass ()
+ {
+   int changed = 0;
+ 
+   alloc_expr_hash_table (max_cuid);
+   compute_expr_hash_table ();
+   if (gcse_file)
+     dump_hash_table (gcse_file, "Code Hoisting Expressions", expr_hash_table,
+ 		     expr_hash_table_size, n_exprs);
+   if (n_exprs > 0)
+     {
+       alloc_code_hoist_mem (n_basic_blocks, n_exprs);
+       compute_code_hoist_data ();
+       hoist_code ();
+       free_code_hoist_mem ();
+     }
+   free_expr_hash_table ();
+ 
+   return changed;
  }
Index: basic-block.h
===================================================================
RCS file: /egcs/carton/cvsfiles/egcs/gcc/basic-block.h,v
retrieving revision 1.24
diff -c -3 -p -r1.24 basic-block.h
*** basic-block.h	1999/08/16 22:14:51	1.24
--- basic-block.h	1999/09/20 14:36:57
*************** extern void compute_preds_succs		PROTO (
*** 281,286 ****
--- 281,287 ----
  extern void compute_dominators		PROTO ((sbitmap *, sbitmap *,
  						int_list_ptr *,
  						int_list_ptr *));
+ extern void compute_flow_dominators	PROTO ((sbitmap *, sbitmap *));
  extern void compute_immediate_dominators	PROTO ((int *, sbitmap *));
  
  /* In lcm.c */










Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]