This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [itanium-sched-branch] Insert nops for MM-insns and tune placing stop bits [patch]


Sorry, I sent an older version of the patch.  The following is the
current version of the patch, which is what is actually present in the branch.

Vlad


2002-10-07  Vladimir Makarov  <vmakarov@redhat.com>

        * config/ia64/ia64.h (MASK_TUNE_STOP_BITS,
        TARGET_TUNE_STOP_BITS): New macros.
        (TARGET_SWITCHES): Add entries for the new option.
        
        * config/ia64/ia64.c (dfa_stop_insn, last_scheduled_insn,
        dfa_pre_cycle_insn, ia64_nop): Don't make them roots for GC.
        (stops_p, stop_before_p, clocks_length, clocks, add_cycles): New
        global variables.
        (ia64_sched_reorder2): Set up `clocks'.
        (ia64_variable_issue): Set up `stops_p' and reset
        `stop_before_p'.
        (ia64_dfa_new_cycle): Set up `add_cycles'.  Permit sorting of the
        ready queue when TARGET_TUNE_STOP_BITS.
        (bundling): Insert additional nops for MM-insns.
        (final_emit_insn_group_barriers): Add insertion of stop bits
        according to `stops_p'.
        (ia64_reorg): Initialize the new variables.

        * doc/invoke.texi: Add description of option `-mtune-stop-bits'.
Index: config/ia64/ia64.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/ia64/ia64.h,v
retrieving revision 1.126.4.1
diff -d -c -p -r1.126.4.1 ia64.h
*** config/ia64/ia64.h	25 Sep 2002 19:11:09 -0000	1.126.4.1
--- config/ia64/ia64.h	7 Oct 2002 22:05:12 -0000
*************** extern int target_flags;
*** 91,96 ****
--- 91,98 ----
  
  #define MASK_DWARF2_ASM 0x40000000	/* test dwarf2 line info via gas.  */
  
+ #define MASK_TUNE_STOP_BITS  0x00002000	/* tune stop bits for the model.  */
+ 
  #define TARGET_BIG_ENDIAN	(target_flags & MASK_BIG_ENDIAN)
  
  #define TARGET_GNU_AS		(target_flags & MASK_GNU_AS)
*************** extern int ia64_tls_size;
*** 126,131 ****
--- 128,134 ----
  #define TARGET_TLS14		(ia64_tls_size == 14)
  #define TARGET_TLS22		(ia64_tls_size == 22)
  #define TARGET_TLS64		(ia64_tls_size == 64)
+ #define TARGET_TUNE_STOP_BITS	(target_flags & MASK_TUNE_STOP_BITS)
  
  #define TARGET_HPUX_LD		0
  
*************** extern int ia64_tls_size;
*** 173,178 ****
--- 176,185 ----
        N_("Enable Dwarf 2 line debug info via GNU as")},			\
    { "no-dwarf2-asm", 	-MASK_DWARF2_ASM,				\
        N_("Disable Dwarf 2 line debug info via GNU as")},		\
+   { "tune-stop-bits", 	MASK_TUNE_STOP_BITS,				\
+       N_("Enable tuning stop bits for better scheduling")},		\
+   { "no-tune-stop-bits", 	-MASK_TUNE_STOP_BITS,			\
+       N_("Disable tuning stop bits for better scheduling")},		\
    SUBTARGET_SWITCHES							\
    { "",			TARGET_DEFAULT | TARGET_CPU_DEFAULT,		\
        NULL }								\
Index: config/ia64/ia64.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/ia64/ia64.c,v
retrieving revision 1.184.4.4
diff -d -c -p -r1.184.4.4 ia64.c
*** config/ia64/ia64.c	30 Sep 2002 15:50:33 -0000	1.184.4.4
--- config/ia64/ia64.c	7 Oct 2002 22:05:13 -0000
*************** static int pos_1, pos_2, pos_3, pos_4, p
*** 5395,5405 ****
  
  /* The following variable value is an insn group barrier.  */
  
! static GTY (()) rtx dfa_stop_insn;
  
  /* The following variable value is the last issued insn.  */
  
! static GTY (()) rtx last_scheduled_insn;
  
  /* The following variable value is size of the DFA state.  */
  
--- 5395,5405 ----
  
  /* The following variable value is an insn group barrier.  */
  
! static rtx dfa_stop_insn;
  
  /* The following variable value is the last issued insn.  */
  
! static rtx last_scheduled_insn;
  
  /* The following variable value is size of the DFA state.  */
  
*************** static state_t temp_dfa_state = NULL;
*** 5415,5420 ****
--- 5415,5444 ----
  
  static state_t prev_cycle_state = NULL;
  
+ /* The following array element values are TRUE if the corresponding
+    insn requires stop bits to be added before it.  */
+ 
+ static char *stops_p;
+ 
+ /* The following variable is used to set up the mentioned above array.  */
+ 
+ static int stop_before_p = 0;
+ 
+ /* The following variable value is length of the arrays `clocks' and
+    `add_cycles'. */
+ 
+ static int clocks_length;
+ 
+ /* The following array element values are cycles on which the
+    corresponding insn will be issued. */
+ 
+ static int *clocks;
+ 
+ /* The following array element values are numbers of cycles that should
+    be added to improve insn scheduling for MM-insns for Itanium1.  */
+ 
+ static int *add_cycles;
+ 
  static rtx ia64_single_set PARAMS ((rtx));
  static void ia64_emit_insn_before PARAMS ((rtx, rtx));
  
*************** ia64_sched_reorder2 (dump, sched_verbose
*** 5676,5681 ****
--- 5700,5707 ----
       int *pn_ready;
       int clock_var;
  {
+   if (reload_completed && last_scheduled_insn)
+     clocks [INSN_UID (last_scheduled_insn)] = clock_var;
    return ia64_dfa_sched_reorder (dump, sched_verbose, ready, pn_ready,
  				 clock_var, 1);
  }
*************** ia64_variable_issue (dump, sched_verbose
*** 5698,5703 ****
--- 5724,5731 ----
  	abort ();
        if (GET_CODE (insn) == CALL_INSN)
  	init_insn_group_barriers ();
+       stops_p [INSN_UID (insn)] = stop_before_p;
+       stop_before_p = 0;
      }
    return 1;
  }
*************** ia64_first_cycle_multipass_dfa_lookahead
*** 5719,5725 ****
     scheduler to change the DFA state when the simulated clock is
     increased.  */
  
! static GTY (()) rtx dfa_pre_cycle_insn;
  
  /* We are about to being issuing INSN.  Return nonzero if we can not
     issue it on given cycle CLOCK and return zero if we should not sort
--- 5747,5753 ----
     scheduler to change the DFA state when the simulated clock is
     increased.  */
  
! static rtx dfa_pre_cycle_insn;
  
  /* We are about to being issuing INSN.  Return nonzero if we can not
     issue it on given cycle CLOCK and return zero if we should not sort
*************** ia64_dfa_new_cycle (dump, verbose, insn,
*** 5733,5738 ****
--- 5761,5768 ----
       int last_clock, clock;
       int *sort_p;
  {
+   int setup_clocks_p = FALSE;
+ 
    if (insn == NULL_RTX || !INSN_P (insn))
      abort ();
    if ((reload_completed && safe_group_barrier_needed_p (insn))
*************** ia64_dfa_new_cycle (dump, verbose, insn,
*** 5745,5762 ****
        if (verbose && dump)
  	fprintf (dump, "//    Stop should be before %d%s\n", INSN_UID (insn),
  		 last_clock == clock ? " + cycle advance" : "");
        if (last_clock == clock)
  	{
  	  state_transition (curr_state, dfa_stop_insn);
! 	  *sort_p = 0;
  	  return 1;
  	}
!       else
  	{
! 	  memcpy (curr_state, prev_cycle_state, dfa_state_size);
! 	  state_transition (curr_state, dfa_stop_insn);
! 	  state_transition (curr_state, dfa_pre_cycle_insn);
! 	  state_transition (curr_state, NULL);
  	}
      }
    return 0;
--- 5775,5825 ----
        if (verbose && dump)
  	fprintf (dump, "//    Stop should be before %d%s\n", INSN_UID (insn),
  		 last_clock == clock ? " + cycle advance" : "");
+       stop_before_p = 1;
        if (last_clock == clock)
  	{
  	  state_transition (curr_state, dfa_stop_insn);
! 	  if (TARGET_TUNE_STOP_BITS)
! 	    *sort_p = (last_scheduled_insn == NULL_RTX
! 		       || GET_CODE (last_scheduled_insn) != CALL_INSN);
! 	  else
! 	    *sort_p = 0;
  	  return 1;
  	}
!       else if (reload_completed)
! 	setup_clocks_p = TRUE;
!       memcpy (curr_state, prev_cycle_state, dfa_state_size);
!       state_transition (curr_state, dfa_stop_insn);
!       state_transition (curr_state, dfa_pre_cycle_insn);
!       state_transition (curr_state, NULL);
!     }
!   else if (reload_completed)
!     setup_clocks_p = TRUE;
!   if (setup_clocks_p)
!     {
!       enum attr_itanium_class c = ia64_safe_itanium_class (insn);
!       
!       if (c != ITANIUM_CLASS_MMMUL && c != ITANIUM_CLASS_MMSHF)
  	{
! 	  rtx link;
! 	  int d = -1;
! 	  
! 	  for (link = LOG_LINKS (insn); link; link = XEXP (link, 1))
! 	    if (REG_NOTE_KIND (link) == 0)
! 	      {
! 		enum attr_itanium_class dep_class;
! 		rtx dep_insn = XEXP (link, 0);
! 		
! 		dep_class = ia64_safe_itanium_class (dep_insn);
! 		if ((dep_class == ITANIUM_CLASS_MMMUL
! 		     || dep_class == ITANIUM_CLASS_MMSHF)
! 		    && last_clock - clocks [INSN_UID (dep_insn)] < 4
! 		    && (d < 0
! 			|| last_clock - clocks [INSN_UID (dep_insn)] < d))
! 		  d = last_clock - clocks [INSN_UID (dep_insn)];
! 	      }
! 	  if (d >= 0)
! 	    add_cycles [INSN_UID (insn)] = 3 - d;
  	}
      }
    return 0;
*************** finish_bundle_state_table ()
*** 5968,5974 ****
  /* The following variable is a insn `nop' used to check bundle states
     with different number of inserted nops.  */
  
! static GTY (()) rtx ia64_nop;
  
  /* The following function tries to issue NOPS_NUM nops for the current
     state without advancing processor cycle.  If it failed, the
--- 6031,6037 ----
  /* The following variable is a insn `nop' used to check bundle states
     with different number of inserted nops.  */
  
! static rtx ia64_nop;
  
  /* The following function tries to issue NOPS_NUM nops for the current
     state without advancing processor cycle.  If it failed, the
*************** bundling (dump, verbose, prev_head_insn,
*** 6467,6472 ****
--- 6530,6598 ----
  	    }
  	}
      }
+   /* Insert additional cycles for MM-insns: */
+   for (insn = get_next_important_insn (NEXT_INSN (prev_head_insn), tail);
+        insn != NULL_RTX;
+        insn = next_insn)
+     {
+       if (!INSN_P (insn)
+ 	  || ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE
+ 	  || GET_CODE (PATTERN (insn)) == USE
+ 	  || GET_CODE (PATTERN (insn)) == CLOBBER)
+ 	abort ();
+       next_insn = get_next_important_insn (NEXT_INSN (insn), tail);
+       if (INSN_UID (insn) < clocks_length && add_cycles [INSN_UID (insn)])
+ 	{
+ 	  rtx last;
+ 	  int i, j, n;
+ 	  int pred_stop_p;
+ 
+ 	  last = prev_active_insn (insn);
+ 	  pred_stop_p = recog_memoized (last) == CODE_FOR_insn_group_barrier;
+ 	  if (pred_stop_p)
+ 	    last = prev_active_insn (last);
+ 	  n = 0;
+ 	  for (;; last = prev_active_insn (last))
+ 	    if (recog_memoized (last) == CODE_FOR_bundle_selector)
+ 	      {
+ 		template0 = XINT (XVECEXP (PATTERN (last), 0, 0), 0);
+ 		break;
+ 	      }
+ 	    else if (recog_memoized (last) != CODE_FOR_insn_group_barrier)
+ 	      n++;
+ 	  if ((pred_stop_p && n == 0) || n > 2)
+ 	    abort ();
+ 	  for (j = 3 - n; j > 0; j --)
+ 	    ia64_emit_insn_before (gen_nop (), insn);
+ 	  add_cycles [INSN_UID (insn)]--;
+ 	  if (!pred_stop_p || add_cycles [INSN_UID (insn)])
+ 	    ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), insn);
+ 	  if (pred_stop_p)
+ 	    add_cycles [INSN_UID (insn)]--;
+ 	  for (i = add_cycles [INSN_UID (insn)]; i > 0; i--)
+ 	    {
+ 	      /* Insert .MII bundle.  */
+ 	      ia64_emit_insn_before (gen_bundle_selector (GEN_INT (0)), insn);
+ 	      ia64_emit_insn_before (gen_nop (), insn);
+ 	      ia64_emit_insn_before (gen_nop (), insn);
+ 	      if (i > 1)
+ 		{
+ 		  ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
+ 					 insn);
+ 		  i--;
+ 		}
+ 	      ia64_emit_insn_before (gen_nop (), insn);
+ 	      ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
+ 				     insn);
+ 	    }
+ 	  ia64_emit_insn_before (gen_bundle_selector (GEN_INT (template0)),
+ 				 insn);
+ 	  for (j = n; j > 0; j --)
+ 	    ia64_emit_insn_before (gen_nop (), insn);
+ 	  if (pred_stop_p)
+ 	    ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), insn);
+ 	}
+     }
    free (index_to_bundle_states);
    finish_bundle_state_table ();
    bundling_p = 0;
*************** final_emit_insn_group_barriers (dump)
*** 6541,6548 ****
  	    }
  	  else if (need_barrier_p || group_barrier_needed_p (insn))
  	    {
! 	      emit_insn_before (gen_insn_group_barrier (GEN_INT (3)), insn);
! 	      init_insn_group_barriers ();
  	      group_barrier_needed_p (insn);
  	      prev_insn = NULL_RTX;
  	    }
--- 6667,6702 ----
  	    }
  	  else if (need_barrier_p || group_barrier_needed_p (insn))
  	    {
! 	      if (TARGET_TUNE_STOP_BITS)
! 		{
! 		  rtx last;
! 		  
! 		  for (last = insn;
! 		       last != current_sched_info->prev_head;
! 		       last = PREV_INSN (last))
! 		    if (INSN_P (last) && GET_MODE (last) == TImode
! 			&& stops_p [INSN_UID (last)])
! 		      break;
! 		  if (last == current_sched_info->prev_head)
! 		    last = insn;
! 		  last = prev_active_insn (last);
! 		  if (last
! 		      && recog_memoized (last) != CODE_FOR_insn_group_barrier)
! 		    emit_insn_after (gen_insn_group_barrier (GEN_INT (3)),
! 				     last);
! 		  init_insn_group_barriers ();
! 		  for (last = NEXT_INSN (last);
! 		       last != insn;
! 		       last = NEXT_INSN (last))
! 		    if (INSN_P (last))
! 		      group_barrier_needed_p (last);
! 		}
! 	      else
! 		{
! 		  emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
! 				    insn);
! 		  init_insn_group_barriers ();
! 		}
  	      group_barrier_needed_p (insn);
  	      prev_insn = NULL_RTX;
  	    }
*************** ia64_reorg (insns)
*** 6759,6764 ****
--- 6913,6925 ----
        ia64_nop = make_insn_raw (gen_nop ());
        PREV_INSN (ia64_nop) = NEXT_INSN (ia64_nop) = NULL_RTX;
        recog_memoized (ia64_nop);
+       clocks_length = get_max_uid () + 1;
+       stops_p = (char *) xmalloc (clocks_length);
+       memset (stops_p, 0, clocks_length);
+       clocks = (int *) xmalloc (clocks_length * sizeof (int));
+       memset (clocks, 0, clocks_length * sizeof (int));
+       add_cycles = (int *) xmalloc (clocks_length * sizeof (int));
+       memset (add_cycles, 0, clocks_length * sizeof (int));
        pos_1 = get_cpu_unit_code ("1_1");
        pos_2 = get_cpu_unit_code ("1_2");
        pos_3 = get_cpu_unit_code ("1_3");
*************** ia64_reorg (insns)
*** 6788,6793 ****
--- 6949,6957 ----
        
        schedule_ebbs (rtl_dump_file);
        finish_bundle_states ();
+       free (add_cycles);
+       free (clocks);
+       free (stops_p);
        emit_insn_group_barriers (rtl_dump_file, insns);
  
        ia64_final_schedule = 0;
Index: doc/invoke.texi
===================================================================
RCS file: /cvs/gcc/gcc/gcc/doc/invoke.texi,v
retrieving revision 1.181
diff -d -c -p -r1.181 invoke.texi
*** doc/invoke.texi	4 Sep 2002 17:35:59 -0000	1.181
--- doc/invoke.texi	7 Oct 2002 22:05:14 -0000
*************** count register BK@.
*** 8587,8594 ****
  Enable (disable) generation of code using decrement and branch,
  DBcond(D), instructions.  This is enabled by default for the C4x.  To be
  on the safe side, this is disabled for the C3x, since the maximum
! iteration count on the C3x is @math{2^{23} + 1} (but who iterates loops more than
! @math{2^{23}} times on the C3x?).  Note that GCC will try to reverse a loop so
  that it can utilise the decrement and branch instruction, but will give
  up if there is more than one memory reference in the loop.  Thus a loop
  where the loop counter is decremented can generate slightly more
--- 8587,8594 ----
  Enable (disable) generation of code using decrement and branch,
  DBcond(D), instructions.  This is enabled by default for the C4x.  To be
  on the safe side, this is disabled for the C3x, since the maximum
! iteration count on the C3x is @math{2^23 + 1} (but who iterates loops more than
! @math{2^23} times on the C3x?).  Note that GCC will try to reverse a loop so
  that it can utilise the decrement and branch instruction, but will give
  up if there is more than one memory reference in the loop.  Thus a loop
  where the loop counter is decremented can generate slightly more
*************** instruction, it is disabled by default.
*** 8656,8664 ****
  @opindex mloop-unsigned
  @opindex mno-loop-unsigned
  The maximum iteration count when using RPTS and RPTB (and DB on the C40)
! is @math{2^{31} + 1} since these instructions test if the iteration count is
  negative to terminate the loop.  If the iteration count is unsigned
! there is a possibility than the @math{2^{31} + 1} maximum iteration count may be
  exceeded.  This switch allows an unsigned iteration count.
  
  @item -mti
--- 8656,8664 ----
  @opindex mloop-unsigned
  @opindex mno-loop-unsigned
  The maximum iteration count when using RPTS and RPTB (and DB on the C40)
! is @math{2^31 + 1} since these instructions test if the iteration count is
  negative to terminate the loop.  If the iteration count is unsigned
! there is a possibility than the @math{2^31 + 1} maximum iteration count may be
  exceeded.  This switch allows an unsigned iteration count.
  
  @item -mti
*************** A fixed register is one that the registe
*** 9210,9215 ****
--- 9210,9222 ----
  useful when compiling kernel code.  A register range is specified as
  two registers separated by a dash.  Multiple register ranges can be
  specified separated by a comma.
+ 
+ @item -mtune-stop-bits
+ @itemx -mno-tune-stop-bits
+ @opindex mtune-stop-bits
+ @opindex mno-tune-stop-bits
+ Allow stop bits to be placed earlier than immediately before the
+ instruction that requires them.  This may improve instruction scheduling.
  @end table
  
  @node D30V Options

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]