[4/4] Make SMS schedule register moves

Richard Sandiford richard.sandiford@linaro.org
Tue Aug 30 13:17:00 GMT 2011


This is the move-scheduling patch itself.  It should be fairly
self-explanatory.  Let me know if it isn't, and I'll try to improve
the commentary.

One potentially controversial change is to the way we handle moves
in the prologue and epilogue.  The current code uses a conservative
check to decide which stages need which moves.  This check copes
with values that are live before the loop, and emits some moves
that aren't actually needed.  The code also emits chains of moves
that can be trivially optimised through propagation.  We rely on
later patches to clean this up for us (and they do).

So, rather than keep a rather complicated check here, I've simply
emitted the moves for all stages.  In principle, that will generate
a little more dead code than now in cases where chains of two moves
are needed, but that seems to be very rare (when moves are scheduled).

Richard


gcc/
	* modulo-sched.c (extend_node_sched_params): New function.
	(print_node_sched_params): Print the stage.
	(set_columns_for_row, schedule_reg_move): New functions.
	(set_columns_for_ps): Move up file and use set_columns_for_row.
	(schedule_reg_moves): Call extend_node_sched_params and
	schedule_reg_move.  Extend size of uses bitmap.  Return false
	if a move could not be scheduled.
	(apply_reg_moves): Don't emit moves here.
	(permute_partial_schedule): Handle register moves.
	(duplicate_insns_of_cycles): Remove for_prolog.  Always emit moves.
	(generate_prolog_epilog): Update calls accordingly.

Index: gcc/modulo-sched.c
===================================================================
--- gcc/modulo-sched.c	2011-08-30 13:06:36.528669762 +0100
+++ gcc/modulo-sched.c	2011-08-30 13:22:08.029124537 +0100
@@ -220,8 +220,6 @@ static partial_schedule_ptr sms_schedule
 static void permute_partial_schedule (partial_schedule_ptr, rtx);
 static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
                                     rtx, rtx);
-static void duplicate_insns_of_cycles (partial_schedule_ptr,
-				       int, int, int, rtx);
 static int calculate_stage_count (partial_schedule_ptr, int);
 static void calculate_must_precede_follow (ddg_node_ptr, int, int,
 					   int, int, sbitmap, sbitmap, sbitmap);
@@ -458,6 +456,15 @@ set_node_sched_params (ddg_ptr g)
 			 node_sched_param_vec, g->num_nodes);
 }
 
+/* Make sure that node_sched_param_vec has an entry for every move in PS.  */
+static void
+extend_node_sched_params (partial_schedule_ptr ps)
+{
+  VEC_safe_grow_cleared (node_sched_params, heap, node_sched_param_vec,
+			 ps->g->num_nodes + VEC_length (ps_reg_move_info,
+							ps->reg_moves));
+}
+
 static void
 print_node_sched_params (FILE *file, int num_nodes, partial_schedule_ptr ps)
 {
@@ -474,6 +481,7 @@ print_node_sched_params (FILE *file, int
 	       INSN_UID (ps_rtl_insn (ps, i)));
       fprintf (file, " asap = %d:\n", NODE_ASAP (&ps->g->nodes[i]));
       fprintf (file, " time = %d:\n", nsp->time);
+      fprintf (file, " stage = %d:\n", nsp->stage);
       fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
       for (j = 0; j < nsp->nreg_moves; j++)
 	{
@@ -485,6 +493,168 @@ print_node_sched_params (FILE *file, int
     }
 }
 
+/* Set SCHED_COLUMN for each instruction in row ROW of PS.  */
+static void
+set_columns_for_row (partial_schedule_ptr ps, int row)
+{
+  ps_insn_ptr cur_insn;
+  int column;
+
+  column = 0;
+  for (cur_insn = ps->rows[row]; cur_insn; cur_insn = cur_insn->next_in_row)
+    SCHED_COLUMN (cur_insn->id) = column++;
+}
+
+/* Set SCHED_COLUMN for each instruction in PS.  */
+static void
+set_columns_for_ps (partial_schedule_ptr ps)
+{
+  int row;
+
+  for (row = 0; row < ps->ii; row++)
+    set_columns_for_row (ps, row);
+}
+
+/* Try to schedule the move with ps_insn identifier I_REG_MOVE in PS.
+   The source of the move is provided by I_MUST_FOLLOW, which has
+   already been scheduled.  MUST_FOLLOW is a scratch bitmap that
+   is big enough to hold I_MUST_FOLLOW.
+
+   Return true on success.  */
+static bool
+schedule_reg_move (partial_schedule_ptr ps, int i_reg_move,
+		   sbitmap must_follow, int i_must_follow)
+{
+  unsigned int u;
+  int i, start_row, end_row, this_start, this_end, this_row, latency;
+  int origin_row, origin_column;
+  sbitmap_iterator sbi;
+  ps_reg_move_info *move;
+
+  move = ps_reg_move (ps, i_reg_move);
+
+  /* The cyclic lifetime of move->new_reg starts and ends at move->def
+     (the instruction that defines move->old_reg).  We need to decide
+     where in that cyclic lifetime the move should go.  The position
+     is limited by I_MUST_FOLLOW (which defines the source of the move)
+     and the nodes in move->uses (which consume the result).
+
+     Specifically, the lowest possible row (relative to move->def)
+     is the maximum of:
+
+       a) the row in which the result of the previous iteration's
+	  I_MUST_FOLLOW becomes available
+       b) the first row in which every use of the previous iteration's
+	  move->new_reg has completed.
+
+     The highest possible row (again relative to move->def) is
+     the minimum of:
+
+       c) the row in which the current iteration's I_MUST_FOLLOW
+	  executes (and thus makes the source of the move unavailable)
+       d) the last row in which every use of this iteration's
+	  move->new_reg could be satisfied without delay.
+
+     Because all positions are relative to move->def, this function
+     uses ROW - ps->ii to represent positions that come after move->def.
+     The range includes origin_row twice: "origin_row - ps->ii" is for
+     the positions in origin_row after move->def, while "origin_row"
+     itself is for the positions before move->def.  */
+  origin_row = SCHED_ROW (move->def);
+  origin_column = SCHED_COLUMN (move->def);
+  start_row = origin_row - ps->ii;
+  end_row = origin_row;
+
+  if (dump_file)
+    fprintf (dump_file, "Scheduling register move INSN %d with cyclic lifetime"
+	     " [%d, %d]\n", INSN_UID (move->insn), start_row, end_row);
+
+  /* Limit the range to [a, c].  */
+  this_end = SCHED_ROW (i_must_follow);
+  if (this_end > origin_row
+      || (this_end == origin_row
+	  && SCHED_COLUMN (i_must_follow) > origin_column))
+    this_end -= ps->ii;
+  latency = insn_latency (ps_rtl_insn (ps, i_must_follow), move->insn);
+  this_start = this_end - ps->ii + latency;
+
+  if (dump_file)
+    fprintf (dump_file, "  -- source produced by INSN %d with latency %d,"
+	     " range [%d, %d]\n", INSN_UID (ps_rtl_insn (ps, i_must_follow)),
+	     latency, this_start, this_end);
+
+  if (start_row < this_start)
+    start_row = this_start;
+  if (end_row > this_end)
+    end_row = this_end;
+
+  /* Limit the range to [b, d].  */
+  EXECUTE_IF_SET_IN_SBITMAP (move->uses, 0, u, sbi)
+    {
+      latency = insn_latency (move->insn, ps_rtl_insn (ps, u));
+      this_start = SCHED_ROW (u);
+      if (this_start > origin_row
+	  || (this_start == origin_row
+	      && SCHED_COLUMN (u) > origin_column))
+	this_start -= ps->ii;
+      this_end = this_start + ps->ii - latency;
+
+      if (dump_file)
+	fprintf (dump_file, "  -- destination used by INSN %d with latency"
+		 " %d, range [%d, %d]\n", INSN_UID (ps_rtl_insn (ps, u)),
+		 latency, this_start, this_end);
+
+      if (start_row < this_start)
+	start_row = this_start;
+      if (end_row > this_end)
+	end_row = this_end;
+    }
+
+  sbitmap_zero (must_follow);
+  SET_BIT (must_follow, i_must_follow);
+
+  if (dump_file)
+    fprintf (dump_file, "  -- trying to schedule in rows [%d, %d]\n",
+	     start_row, end_row);
+
+  for (i = end_row; i >= start_row; i--)
+    {
+      ps_insn_ptr psi;
+
+      this_row = i < 0 ? i + ps->ii : i;
+      /* origin_row - ps->ii represents the part of row origin_row after
+	 move->def.  In this case the move must come after both move->uses
+	 and move->def.  */
+      if (i == origin_row - ps->ii)
+	{
+	  gcc_checking_assert (i == start_row);
+	  gcc_checking_assert (!TEST_BIT (move->uses, move->def));
+	  SET_BIT (move->uses, move->def);
+	  psi = ps_add_node_check_conflicts (ps, i_reg_move, this_row,
+					     move->uses, NULL);
+	  RESET_BIT (move->uses, move->def);
+	}
+      else
+	psi = ps_add_node_check_conflicts (ps, i_reg_move, this_row,
+					   move->uses, must_follow);
+
+      if (psi)
+	{
+	  SCHED_ROW (i_reg_move) = this_row;
+	  if (dump_file)
+	    fprintf (dump_file, "  -- scheduled in row %d (%d)\n",
+		     i, this_row);
+	  set_columns_for_row (ps, this_row);
+	  return true;
+	}
+    }
+
+  if (dump_file)
+    fprintf (dump_file, "  -- no available slot\n");
+
+  return false;
+}
+
 /*
    Breaking intra-loop register anti-dependences:
    Each intra-loop register anti-dependence implies a cross-iteration true
@@ -507,9 +677,10 @@ schedule_reg_moves (partial_schedule_ptr
     {
       ddg_node_ptr u = &g->nodes[i];
       ddg_edge_ptr e;
-      int nreg_moves = 0, i_reg_move;
+      int nreg_moves = 0, i_reg_move, i_must_follow;
       rtx prev_reg, old_reg;
       int first_move;
+      sbitmap must_follow;
 
       /* Compute the number of reg_moves needed for u, by looking at life
 	 ranges started at u (excluding self-loops).  */
@@ -539,6 +710,7 @@ schedule_reg_moves (partial_schedule_ptr
       first_move = VEC_length (ps_reg_move_info, ps->reg_moves);
       VEC_safe_grow_cleared (ps_reg_move_info, heap, ps->reg_moves,
 			     first_move + nreg_moves);
+      extend_node_sched_params (ps);
 
       /* Record the moves associated with this node.  */
       first_move += ps->g->num_nodes;
@@ -552,7 +724,7 @@ schedule_reg_moves (partial_schedule_ptr
 	  ps_reg_move_info *move = ps_reg_move (ps, first_move + i_reg_move);
 
 	  move->def = i;
-	  move->uses = sbitmap_alloc (g->num_nodes);
+	  move->uses = sbitmap_alloc (first_move + nreg_moves);
 	  move->old_reg = old_reg;
 	  move->new_reg = gen_reg_rtx (GET_MODE (prev_reg));
 	  move->insn = gen_move_insn (move->new_reg, copy_rtx (prev_reg));
@@ -586,6 +758,19 @@ schedule_reg_moves (partial_schedule_ptr
 		SET_BIT (move->uses, e->dest->cuid);
 	      }
 	  }
+
+      must_follow = sbitmap_alloc (first_move + nreg_moves);
+      i_must_follow = i;
+      for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
+	{
+	  if (!schedule_reg_move (ps, first_move + i_reg_move,
+				  must_follow, i_must_follow))
+	    break;
+	  i_must_follow = first_move + i_reg_move;
+	}
+      sbitmap_free (must_follow);
+      if (i_reg_move < nreg_moves)
+	return false;
     }
   return true;
 }
@@ -609,9 +794,6 @@ apply_reg_moves (partial_schedule_ptr ps
 	  df_insn_rescan (ps->g->nodes[i_use].insn);
 	}
     }
-
-  FOR_EACH_VEC_ELT_REVERSE (ps_reg_move_info, ps->reg_moves, i, move)
-    add_insn_before (move->insn, ps_first_note (ps, move->def), NULL);
 }
 
 /* Update the sched_params (time, row and stage) for node U using the II,
@@ -682,22 +864,6 @@ reset_sched_times (partial_schedule_ptr 
       }
 }
  
-/* Set SCHED_COLUMN of each node according to its position in PS.  */
-static void
-set_columns_for_ps (partial_schedule_ptr ps)
-{
-  int row;
-
-  for (row = 0; row < ps->ii; row++)
-    {
-      ps_insn_ptr cur_insn = ps->rows[row];
-      int column = 0;
-
-      for (; cur_insn; cur_insn = cur_insn->next_in_row)
-	SCHED_COLUMN (cur_insn->id) = column++;
-    }
-}
-
 /* Permute the insns according to their order in PS, from row 0 to
    row ii-1, and position them right before LAST.  This schedules
    the insns of the loop kernel.  */
@@ -714,8 +880,13 @@ permute_partial_schedule (partial_schedu
 	rtx insn = ps_rtl_insn (ps, ps_ij->id);
 
 	if (PREV_INSN (last) != insn)
-	  reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
-			      PREV_INSN (last));
+	  {
+	    if (ps_ij->id < ps->g->num_nodes)
+	      reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
+				  PREV_INSN (last));
+	    else
+	      add_insn_before (insn, last, NULL);
+	  }
       }
 }
 
@@ -919,7 +1090,7 @@ optimize_sc (partial_schedule_ptr ps, dd
 
 static void
 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
-			   int to_stage, int for_prolog, rtx count_reg)
+			   int to_stage, rtx count_reg)
 {
   int row;
   ps_insn_ptr ps_ij;
@@ -928,7 +1099,6 @@ duplicate_insns_of_cycles (partial_sched
     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
       {
 	int u = ps_ij->id;
-	int j, i_reg_moves, i_reg_move;
 	rtx u_insn;
 
         /* Do not duplicate any insn which refers to count_reg as it
@@ -942,42 +1112,18 @@ duplicate_insns_of_cycles (partial_sched
             || JUMP_P (u_insn))
           continue;
 
-	if (for_prolog)
-	  {
-	    /* SCHED_STAGE (u) >= from_stage == 0.  Generate increasing
-	       number of reg_moves starting with the second occurrence of
-	       u, which is generated if its SCHED_STAGE <= to_stage.  */
-	    i_reg_moves = to_stage - SCHED_STAGE (u) + 1;
-	    i_reg_moves = MAX (i_reg_moves, 0);
-	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u));
-
-	    /* The reg_moves start from the *first* reg_move backwards.  */
-	    i_reg_move = SCHED_FIRST_REG_MOVE (u) + (i_reg_moves - 1);
-	  }
-	else /* It's for the epilog.  */
-	  {
-	    /* SCHED_STAGE (u) <= to_stage.  Generate all reg_moves,
-	       starting to decrease one stage after u no longer occurs;
-	       that is, generate all reg_moves until
-	       SCHED_STAGE (u) == from_stage - 1.  */
-	    i_reg_moves = (SCHED_NREG_MOVES (u)
-			   - (from_stage - SCHED_STAGE (u) - 1));
-	    i_reg_moves = MAX (i_reg_moves, 0);
-	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u));
-
-	    /* The reg_moves start from the *last* reg_move forwards.  */
-	    i_reg_move = SCHED_FIRST_REG_MOVE (u) + (SCHED_NREG_MOVES (u) - 1);
-	  }
-
-	for (j = 0; j < i_reg_moves; j++)
+	if (u < ps->g->num_nodes)
 	  {
-	    ps_reg_move_info *move = ps_reg_move (ps, i_reg_move - j);
-
-	    emit_insn (copy_rtx (PATTERN (move->insn)));
+	    if (SCHED_STAGE (u) >= from_stage
+		&& SCHED_STAGE (u) <= to_stage)
+	      duplicate_insn_chain (ps_first_note (ps, u), u_insn);
 	  }
-	if (SCHED_STAGE (u) >= from_stage
-	    && SCHED_STAGE (u) <= to_stage)
-	  duplicate_insn_chain (ps_first_note (ps, u), u_insn);
+	else
+	  /* For simplicity, we generate all moves for every stage,
+	     even though some of them will be dead.  Later optimizations
+	     will remove the dead instructions and undo some of the
+	     unnecessary renaming that moves would otherwise produce.  */
+	  emit_insn (copy_rtx (PATTERN (u_insn)));
       }
 }
 
@@ -1011,7 +1157,7 @@ generate_prolog_epilog (partial_schedule
     }
 
   for (i = 0; i < last_stage; i++)
-    duplicate_insns_of_cycles (ps, 0, i, 1, count_reg);
+    duplicate_insns_of_cycles (ps, 0, i, count_reg);
 
   /* Put the prolog on the entry edge.  */
   e = loop_preheader_edge (loop);
@@ -1023,7 +1169,7 @@ generate_prolog_epilog (partial_schedule
   start_sequence ();
 
   for (i = 0; i < last_stage; i++)
-    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg);
+    duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg);
 
   /* Put the epilogue on the exit edge.  */
   gcc_assert (single_exit (loop));



More information about the Gcc-patches mailing list