Scheduler improvements

Jan Hubicka hubicka@atrey.karlin.mff.cuni.cz
Tue Jul 20 00:11:00 GMT 1999


Hi
Here is an updated patch for the scheduler definitions. It cleans up the abuse
of the ppro_uops attribute. The MD_SCHED code is still shared. I believe this
makes sense, because more chips like the K6/PPro seem to be coming (at least
the K7) and we will probably want a single piece of scheduler code handling them.
The changes necessary to handle the K6 are small (a varying number of decoders
and support for the many2 attribute value).

Honza

Sun Jul 11 20:18:24 EDT 1999  Jan Hubicka  <hubicka@freesoft.cz>
	* i386.c (ix86_sched_data): New fields DECODING_TIME, CLOCK,
        TOP_OF_STACK, NISSUES, NEXT_ISSUE.
        (ix86_safe_uops): Large instructions are automatically many type.
        (ix86_dump_ppro_packet): Dump expected decoding time.
        (DECODERS): New macro.
        (fp_sched_reorder): New function.
        (ix86_sched_reorder): New parameter CLOCK, handle multicycle
        instructions correctly for PENTIUM, handle K6 like PENTIUMPRO,
        support MANY2 instructions, call fp_sched_reorder for K6.
        (ix86_variable_issue): Use NISSUES parameter for Pentium, handle
        MANY2 instructions, update DECODING_TIME and TOP_OF_STACK
	for PENTIUMPRO.
        * i386.h (ix86_sched_reorder): Update prototype.
        (MD_SCHED_REORDER): Update macro.
        * i386.md (k6_decode attribute): K6 decoding definitions.
	(uops): New attribute.
	(many insn patterns): Set k6_decode attribute.
        * haifa-sched.c (adjust_priority): Call ADJUST_PRIORITY unconditionally.
        (schedule_block): Update call of MD_SCHED_REORDER, handle -1 value
        in MD_SCHED_VARIABLE_ISSUE.


*** /root/i386old2/i386.c	Tue Jul 20 02:32:03 1999
--- i386.c	Mon Jul 19 22:56:51 1999
*************** struct processor_costs k6_cost = {
*** 102,108 ****
    2,					/* cost of a lea instruction */
    1,					/* variable shift costs */
    1,					/* constant shift costs */
!   3,					/* cost of starting a multiply */
    0,					/* cost of multiply per each bit set */
    18,					/* cost of a divide/mod */
    8					/* "large" insn */
--- 102,108 ----
    2,					/* cost of a lea instruction */
    1,					/* variable shift costs */
    1,					/* constant shift costs */
!   4,					/* cost of starting a multiply */
    0,					/* cost of multiply per each bit set */
    18,					/* cost of a divide/mod */
    8					/* "large" insn */
*************** static int ix86_agi_dependant PROTO ((rt
*** 224,234 ****
  static int ix86_safe_length PROTO ((rtx));
  static enum attr_memory ix86_safe_memory PROTO ((rtx));
  static enum attr_pent_pair ix86_safe_pent_pair PROTO ((rtx));
! static enum attr_ppro_uops ix86_safe_ppro_uops PROTO ((rtx));
  static void ix86_dump_ppro_packet PROTO ((FILE *));
  static void ix86_reorder_insn PROTO ((rtx *, rtx *));
  static rtx * ix86_pent_find_pair PROTO ((rtx *, rtx *, enum attr_pent_pair,
  					 rtx));
  
  struct ix86_address
  {
--- 239,251 ----
  static int ix86_safe_length PROTO ((rtx));
  static enum attr_memory ix86_safe_memory PROTO ((rtx));
  static enum attr_pent_pair ix86_safe_pent_pair PROTO ((rtx));
! static enum attr_uops ix86_safe_uops PROTO ((rtx));
  static void ix86_dump_ppro_packet PROTO ((FILE *));
  static void ix86_reorder_insn PROTO ((rtx *, rtx *));
  static rtx * ix86_pent_find_pair PROTO ((rtx *, rtx *, enum attr_pent_pair,
  					 rtx));
  
  struct ix86_address
  {
*************** static union
*** 5289,5295 ****
--- 5665,5679 ----
    {
      rtx decode[3];
      int issued_this_cycle;
+     int decoding_time;
+     int clock;
+     int top_of_stack;
    } ppro;
+   struct pentium_sched_data
+   {
+     int nissues;
+     int next_issue;
+   } pentium;
  } ix86_sched_data;
  
  static int
*************** ix86_safe_pent_pair (insn)
*** 5332,5345 ****
      return PENT_PAIR_NP;
  }
  
! static enum attr_ppro_uops
! ix86_safe_ppro_uops (insn)
       rtx insn;
  {
    if (recog_memoized (insn) >= 0)
!     return get_attr_ppro_uops (insn);
    else
!     return PPRO_UOPS_MANY;
  }
  
  static void
--- 5716,5735 ----
      return PENT_PAIR_NP;
  }
  
! static enum attr_uops
! ix86_safe_uops (insn)
       rtx insn;
  {
    if (recog_memoized (insn) >= 0)
!     {
!       enum attr_uops uops = get_attr_uops (insn);
!       if (uops < UOPS_MANY 
! 	  && get_attr_length (insn) >= ix86_cost -> large_insn)
! 	return UOPS_MANY;
!       return uops;
!     }
    else
!     return UOPS_MANY;
  }
  
  static void
*************** ix86_dump_ppro_packet (dump)
*** 5348,5354 ****
  {
    if (ix86_sched_data.ppro.decode[0])
      {
!       fprintf (dump, "PPRO packet: %d",
  	       INSN_UID (ix86_sched_data.ppro.decode[0]));
        if (ix86_sched_data.ppro.decode[1])
  	fprintf (dump, " %d", INSN_UID (ix86_sched_data.ppro.decode[1]));
--- 5738,5745 ----
  {
    if (ix86_sched_data.ppro.decode[0])
      {
!       fprintf (dump, "PPRO packet (decoding time %i): %d",
! 	       ix86_sched_data.ppro.decoding_time,
  	       INSN_UID (ix86_sched_data.ppro.decode[0]));
        if (ix86_sched_data.ppro.decode[1])
  	fprintf (dump, " %d", INSN_UID (ix86_sched_data.ppro.decode[1]));
*************** ix86_pent_find_pair (e_ready, ready, typ
*** 5440,5473 ****
    return bestinsnp;
  }
  
  /* We are about to being issuing insns for this clock cycle.  
     Override the default sort algorithm to better slot instructions.  */
  
  void
! ix86_sched_reorder (dump, sched_verbose, ready, n_ready)
       FILE *dump ATTRIBUTE_UNUSED;
       int sched_verbose ATTRIBUTE_UNUSED;
       rtx *ready;
       int n_ready;
  {
    rtx *e_ready = ready + n_ready - 1;
    rtx *insnp;
    int i;
  
-   if (n_ready < 2)
-     return;
- 
    switch (ix86_cpu)
      {
      default:
        return;
  
      case PROCESSOR_PENTIUM:
        /* This wouldn't be necessary if Haifa knew that static insn ordering
  	 is important to which pipe an insn is issued to.  So we have to make
  	 some minor rearrangements.  */
        {
  	enum attr_pent_pair pair1, pair2;
  
  	pair1 = ix86_safe_pent_pair (*e_ready);
  
--- 5831,5923 ----
    return bestinsnp;
  }
  
+ #define DECODERS (ix86_cpu == PROCESSOR_PENTIUMPRO ? 3 : 2)
+ 
+ /* Reorder instructions in a way we expect a minimal amount of fxch insns is
+    necessary.  This is done by attempting to choose an instruction that uses
+    the destination of the previous insn.  This is just a stupid heuristic that
+    for example completely ignores the most important insns emitted to the edges.
+    This hack will be obsolete once the reg-stack algorithm is redesigned.
+    For now it brings noticeable improvements for i386, i486 and K6 CPUs.  */
+ 
+ void
+ fp_sched_reorder (top_of_stack, ready, n_ready, ninsns)
+      int top_of_stack, ninsns, n_ready;
+      rtx *ready;
+ {
+   rtx *e_ready = ready + n_ready - 1, *curr, set;
+   int found;
+   while (ninsns && e_ready >= ready)
+     {
+       if (refers_to_regno_p (FIRST_STACK_REG, LAST_STACK_REG + 1, *e_ready, NULL))
+ 	{
+ 	  /* OK. Fp insn, so we are busy.  */
+ 	  curr = e_ready;
+ 	  found = 0;
+ 	  /* Scan for insn that use output of last fp insn.  */
+ 	  while (curr >= ready)
+ 	    {
+ 	      if (refers_to_regno_p (top_of_stack, top_of_stack + 1, *curr, NULL))
+ 		{
+ 		  rtx tmp = *curr;
+ 		  *curr = *e_ready;
+ 		  *e_ready = tmp;
+ 		  found = 1;
+ 		  break;
+ 		}
+ 	      curr--;
+ 	    }
+ 	  if (!found && (set = single_set (*e_ready)) != 0)
+ 	    {
+ 	      set = SET_DEST (set);
+ 	      if (STACK_REG_P (set))
+ 		top_of_stack = true_regnum (set);
+ 	    }
+ 	}
+       ninsns--;
+       e_ready--;
+     }
+ }
+ 
  /* We are about to being issuing insns for this clock cycle.  
     Override the default sort algorithm to better slot instructions.  */
  
  void
! ix86_sched_reorder (dump, sched_verbose, ready, n_ready, clock)
       FILE *dump ATTRIBUTE_UNUSED;
       int sched_verbose ATTRIBUTE_UNUSED;
       rtx *ready;
       int n_ready;
+      int clock;
  {
    rtx *e_ready = ready + n_ready - 1;
    rtx *insnp;
    int i;
  
    switch (ix86_cpu)
      {
      default:
        return;
  
      case PROCESSOR_PENTIUM:
+       ix86_sched_data.pentium.nissues = 1;
        /* This wouldn't be necessary if Haifa knew that static insn ordering
  	 is important to which pipe an insn is issued to.  So we have to make
  	 some minor rearrangements.  */
        {
  	enum attr_pent_pair pair1, pair2;
+ 	enum attr_memory memory, memory2;
+ 	int cycles, cycles2;
+ 
+ 	/* Wait until CPU is ready to issue new instructions.  Reset variable 
+ 	   at the start of block.  */
+ 	if (clock && clock < ix86_sched_data.pentium.next_issue && TARGET_PENTIUM) 
+ 	  {
+ 	     ix86_sched_data.pentium.nissues = 0;
+ 	     return;
+ 	  }
+ 	if (!clock)
+ 	  ix86_sched_data.pentium.next_issue = 0;
  
  	pair1 = ix86_safe_pent_pair (*e_ready);
  
*************** ix86_sched_reorder (dump, sched_verbose,
*** 5508,5514 ****
  	  }
  
  	if (pair2 == PENT_PAIR_NP)
! 	  return;
  
  	/* Found something!  Decide if we need to swap the order.  */
  	if (pair1 == PENT_PAIR_PV || pair2 == PENT_PAIR_PU
--- 5958,5964 ----
  	  }
  
  	if (pair2 == PENT_PAIR_NP)
! 	   return;
  
  	/* Found something!  Decide if we need to swap the order.  */
  	if (pair1 == PENT_PAIR_PV || pair2 == PENT_PAIR_PU
*************** ix86_sched_reorder (dump, sched_verbose,
*** 5518,5532 ****
  	  ix86_reorder_insn (insnp, e_ready);
  	else
  	  ix86_reorder_insn (insnp, e_ready - 1);
        }
        break;
  
      case PROCESSOR_PENTIUMPRO:
        {
  	rtx decode[3];
! 	enum attr_ppro_uops cur_uops;
  	int issued_this_cycle;
  
  	/* At this point .ppro.decode contains the state of the three 
  	   decoders from last "cycle".  That is, those insns that were
  	   actually independant.  But here we're scheduling for the 
--- 5968,6017 ----
  	  ix86_reorder_insn (insnp, e_ready);
  	else
  	  ix86_reorder_insn (insnp, e_ready - 1);
+ 
+ 	/* The paired instructions block issue until they are both finished.
+ 	   Also certain pairs takes longer due to load unit conflicts.  */
+ 	memory = ix86_safe_memory (*e_ready);
+ 	memory2 = ix86_safe_memory (*e_ready);
+ 	cycles = result_ready_cost (*e_ready-1);
+ 	cycles2 = result_ready_cost (*e_ready-1);
+ 	if (cycles < cycles2) cycles = cycles2;
+ 
+ 	/* Two read modify write instructions together
+ 	   takes two cycles longer.  */
+ 	if (memory == MEMORY_BOTH && memory2 == MEMORY_BOTH)
+ 	  cycles += 2;
+ 
+ 	/* read modify write instruction followed by read modify
+ 	   takes one cycle longer.  */
+ 	if (memory == MEMORY_BOTH && memory2 == MEMORY_LOAD)
+ 	  cycles += 1;
+ 	ix86_sched_data.pentium.next_issue = clock + cycles;
+ 	ix86_sched_data.pentium.nissues = 2;
        }
        break;
  
+     case PROCESSOR_K6:
      case PROCESSOR_PENTIUMPRO:
        {
  	rtx decode[3];
! 	enum attr_uops cur_uops;
  	int issued_this_cycle;
  
+ 	/* Do initialization on the begining of new block.  */
+ 	if (!clock)
+ 	  {
+ 	    ix86_sched_data.ppro.decoding_time = 0;
+ 	    ix86_sched_data.ppro.top_of_stack = 0;
+ 	    ix86_sched_data.ppro.decode[0] = NULL;
+ 	    ix86_sched_data.ppro.decode[1] = NULL;
+ 	    ix86_sched_data.ppro.decode[2] = NULL;
+ 	}
+ 
+ 	ix86_sched_data.ppro.clock = clock;
+ 	if (ix86_sched_data.ppro.decoding_time < clock - 10)
+ 	  ix86_sched_data.ppro.decoding_time = clock - 10;
+ 
  	/* At this point .ppro.decode contains the state of the three 
  	   decoders from last "cycle".  That is, those insns that were
  	   actually independant.  But here we're scheduling for the 
*************** ix86_sched_reorder (dump, sched_verbose,
*** 5537,5564 ****
  	issued_this_cycle = 0;
  
  	insnp = e_ready;
! 	cur_uops = ix86_safe_ppro_uops (*insnp);
  
  	/* If the decoders are empty, and we've a complex insn at the
  	   head of the priority queue, let it issue without complaint.  */
  	if (decode[0] == NULL)
  	  {
! 	    if (cur_uops == PPRO_UOPS_MANY)
  	      {
  		decode[0] = *insnp;
  		goto ppro_done;
  	      }
  
  	    /* Otherwise, search for a 2-4 uop unsn to issue.  */
! 	    while (cur_uops != PPRO_UOPS_FEW)
  	      {
  		if (insnp == ready)
  		  break;
! 		cur_uops = ix86_safe_ppro_uops (*--insnp);
  	      }
  
  	    /* If so, move it to the head of the line.  */
! 	    if (cur_uops == PPRO_UOPS_FEW)
  	      ix86_reorder_insn (insnp, e_ready);
  
  	    /* Issue the head of the queue.  */
--- 6022,6066 ----
  	issued_this_cycle = 0;
  
  	insnp = e_ready;
! 	cur_uops = ix86_safe_uops (*insnp);
  
  	/* If the decoders are empty, and we've a complex insn at the
  	   head of the priority queue, let it issue without complaint.  */
  	if (decode[0] == NULL)
  	  {
! 	    /* We don't have enought time to decode long insn.  */
! 	    if (cur_uops == UOPS_MANY2 
! 	        && ix86_sched_data.ppro.decoding_time + 2 >= clock)
! 	      {
! 		 /* Attempt to find some insn, that can improve our situation.  */
! 	         while (cur_uops == UOPS_MANY2
! 			&& (cur_uops == UOPS_MANY
! 			    && result_ready_cost (*insnp) <= 1))
! 	           {
! 		     if (insnp == ready)
! 		       break;
! 		     cur_uops = ix86_safe_uops (*--insnp);
! 	           }
! 		if (cur_uops != UOPS_MANY2)
! 		   ix86_reorder_insn (insnp, e_ready);
! 		insnp = e_ready;
! 	      }
! 	    if (cur_uops == UOPS_MANY || cur_uops == UOPS_MANY2)
  	      {
  		decode[0] = *insnp;
  		goto ppro_done;
  	      }
  
  	    /* Otherwise, search for a 2-4 uop unsn to issue.  */
! 	    while (cur_uops != UOPS_FEW)
  	      {
  		if (insnp == ready)
  		  break;
! 		cur_uops = ix86_safe_uops (*--insnp);
  	      }
  
  	    /* If so, move it to the head of the line.  */
! 	    if (cur_uops == UOPS_FEW)
  	      ix86_reorder_insn (insnp, e_ready);
  
  	    /* Issue the head of the queue.  */
*************** ix86_sched_reorder (dump, sched_verbose,
*** 5567,5589 ****
  	  }
  
  	/* Look for simple insns to fill in the other two slots.  */
! 	for (i = 1; i < 3; ++i)
  	  if (decode[i] == NULL)
  	    {
  	      if (ready >= e_ready)
  		goto ppro_done;
  
  	      insnp = e_ready;
! 	      cur_uops = ix86_safe_ppro_uops (*insnp);
! 	      while (cur_uops != PPRO_UOPS_ONE)
  		{
  		  if (insnp == ready)
  		    break;
! 		  cur_uops = ix86_safe_ppro_uops (*--insnp);
  		}
  
  	      /* Found one.  Move it to the head of the queue and issue it.  */
! 	      if (cur_uops == PPRO_UOPS_ONE)
  		{
  		  ix86_reorder_insn (insnp, e_ready);
  		  decode[i] = *e_ready--;
--- 6069,6091 ----
  	  }
  
  	/* Look for simple insns to fill in the other two slots.  */
! 	for (i = 1; i < DECODERS; ++i)
  	  if (decode[i] == NULL)
  	    {
  	      if (ready >= e_ready)
  		goto ppro_done;
  
  	      insnp = e_ready;
! 	      cur_uops = ix86_safe_uops (*insnp);
! 	      while (cur_uops != UOPS_ONE)
  		{
  		  if (insnp == ready)
  		    break;
! 		  cur_uops = ix86_safe_uops (*--insnp);
  		}
  
  	      /* Found one.  Move it to the head of the queue and issue it.  */
! 	      if (cur_uops == UOPS_ONE)
  		{
  		  ix86_reorder_insn (insnp, e_ready);
  		  decode[i] = *e_ready--;
*************** ix86_sched_reorder (dump, sched_verbose,
*** 5598,5603 ****
--- 6100,6108 ----
        ppro_done:
  	if (issued_this_cycle == 0)
  	  issued_this_cycle = 1;
+ 	if (ix86_cpu == PROCESSOR_K6)
+ 	  fp_sched_reorder (ix86_sched_data.ppro.top_of_stack,
+ 			    ready, n_ready, issued_this_cycle);
  	ix86_sched_data.ppro.issued_this_cycle = issued_this_cycle;
        }
        break;
*************** ix86_variable_issue (dump, sched_verbose
*** 5612,5631 ****
       FILE *dump;
       int sched_verbose;
       rtx insn;
!      int can_issue_more;
  {
    int i;
    switch (ix86_cpu)
      {
      default:
!       return can_issue_more - 1;
  
      case PROCESSOR_PENTIUMPRO:
        {
! 	enum attr_ppro_uops uops = ix86_safe_ppro_uops (insn);
  
! 	if (uops == PPRO_UOPS_MANY)
  	  {
  	    if (sched_verbose)
  	      ix86_dump_ppro_packet (dump);
  	    ix86_sched_data.ppro.decode[0] = insn;
--- 6117,6146 ----
       FILE *dump;
       int sched_verbose;
       rtx insn;
!      int can_issue_more ATTRIBUTE_UNUSED;
  {
    int i;
+   rtx set;
    switch (ix86_cpu)
      {
      default:
!       return --can_issue_more;
! 
!     case PROCESSOR_PENTIUM:
!       return --ix86_sched_data.pentium.nissues;
  
      case PROCESSOR_PENTIUMPRO:
+     case PROCESSOR_K6:
        {
! 	enum attr_uops uops = ix86_safe_uops (insn);
  
! 	if (uops == UOPS_MANY || uops == UOPS_MANY2)
  	  {
+ 	    int time = (uops == UOPS_MANY) ? 1 : 2;
+ 	    if (time + ix86_sched_data.ppro.decoding_time
+ 		> ix86_sched_data.ppro.clock)
+ 	      return -1;
+ 	    ix86_sched_data.ppro.decoding_time += time;
  	    if (sched_verbose)
  	      ix86_dump_ppro_packet (dump);
  	    ix86_sched_data.ppro.decode[0] = insn;
*************** ix86_variable_issue (dump, sched_verbose
*** 5635,5659 ****
  	      ix86_dump_ppro_packet (dump);
  	    ix86_sched_data.ppro.decode[0] = NULL;
  	  }
! 	else if (uops == PPRO_UOPS_FEW)
  	  {
  	    if (sched_verbose)
  	      ix86_dump_ppro_packet (dump);
  	    ix86_sched_data.ppro.decode[0] = insn;
  	    ix86_sched_data.ppro.decode[1] = NULL;
  	    ix86_sched_data.ppro.decode[2] = NULL;
  	  }
  	else
  	  {
! 	    for (i = 0; i < 3; ++i)
  	      if (ix86_sched_data.ppro.decode[i] == NULL)
  		{
  		  ix86_sched_data.ppro.decode[i] = insn;
  		  break;
  		}
! 	    if (i == 3)
  	      abort ();
! 	    if (i == 2)
  	      {
  	        if (sched_verbose)
  	          ix86_dump_ppro_packet (dump);
--- 6150,6185 ----
  	      ix86_dump_ppro_packet (dump);
  	    ix86_sched_data.ppro.decode[0] = NULL;
  	  }
! 	else if (uops == UOPS_FEW)
  	  {
+ 	    if (ix86_sched_data.ppro.decoding_time
+ 		>= ix86_sched_data.ppro.clock)
+ 	      return -1;
  	    if (sched_verbose)
  	      ix86_dump_ppro_packet (dump);
+ 	    ix86_sched_data.ppro.decoding_time ++;
  	    ix86_sched_data.ppro.decode[0] = insn;
  	    ix86_sched_data.ppro.decode[1] = NULL;
  	    ix86_sched_data.ppro.decode[2] = NULL;
  	  }
  	else
  	  {
! 	    for (i = 0; i < DECODERS; ++i)
  	      if (ix86_sched_data.ppro.decode[i] == NULL)
  		{
  		  ix86_sched_data.ppro.decode[i] = insn;
  		  break;
  		}
! 	    if (!i)
! 	      {
! 		if (ix86_sched_data.ppro.decoding_time
! 		    >= ix86_sched_data.ppro.clock)
! 	          return -1;
! 		ix86_sched_data.ppro.decoding_time ++;
! 	      }
! 	    if (i == DECODERS)
  	      abort ();
! 	    if (i == DECODERS-1)
  	      {
  	        if (sched_verbose)
  	          ix86_dump_ppro_packet (dump);
*************** ix86_variable_issue (dump, sched_verbose
*** 5663,5668 ****
  	      }
  	  }
        }
!       return --ix86_sched_data.ppro.issued_this_cycle;
      }
  }
--- 6189,6200 ----
  	      }
  	  }
        }
!       if ((set = single_set (insn)) != 0)
!         {
!           set = SET_DEST (set);
! 	  if (STACK_REG_P (set)) ix86_sched_data.ppro.top_of_stack = true_regnum (set);
!         }
!       return --ix86_sched_data.ppro.issued_this_cycle > 0
! 	      ? ix86_sched_data.ppro.issued_this_cycle : 0;
      }
  }

*** haifa-sched.c.old	Sun Jun 20 13:29:49 1999
--- haifa-sched.c	Sat Jul 17 19:25:44 1999
*************** adjust_priority (prev)
*** 4362,4371 ****
  	    }
  	  break;
  	}
  #ifdef ADJUST_PRIORITY
!       ADJUST_PRIORITY (prev);
  #endif
-     }
  }
  
  /* Clock at which the previous instruction was issued.  */
--- 4362,4371 ----
  	    }
  	  break;
  	}
+     }
  #ifdef ADJUST_PRIORITY
!     ADJUST_PRIORITY (prev);
  #endif
  }
  
  /* Clock at which the previous instruction was issued.  */
*************** schedule_block (bb, rgn_n_insns)
*** 6860,6866 ****
    /* Sort the ready list */
    SCHED_SORT (ready, n_ready);
  #ifdef MD_SCHED_REORDER
!   MD_SCHED_REORDER (dump, sched_verbose, ready, n_ready);
  #endif
  
    if (sched_verbose >= 2)
--- 6860,6866 ----
    /* Sort the ready list */
    SCHED_SORT (ready, n_ready);
  #ifdef MD_SCHED_REORDER
!   MD_SCHED_REORDER (dump, sched_verbose, ready, n_ready, 0);
  #endif
  
    if (sched_verbose >= 2)
*************** schedule_block (bb, rgn_n_insns)
*** 6910,6916 ****
        /* Sort the ready list.  */
        SCHED_SORT (ready, n_ready);
  #ifdef MD_SCHED_REORDER
!       MD_SCHED_REORDER (dump, sched_verbose, ready, n_ready);
  #endif
  
        if (sched_verbose)
--- 6910,6916 ----
        /* Sort the ready list.  */
        SCHED_SORT (ready, n_ready);
  #ifdef MD_SCHED_REORDER
!       MD_SCHED_REORDER (dump, sched_verbose, ready, n_ready, clock_var);
  #endif
  
        if (sched_verbose)
*************** schedule_block (bb, rgn_n_insns)
*** 6935,6940 ****
--- 6935,6955 ----
  	    }
  	  else if (cost == 0)
  	    {
+ 	      if (INSN_BB (insn) != target_bb && IS_SPECULATIVE_INSN (insn)
+ 		  && !check_live (insn, INSN_BB (insn)))
+ 		{
+ 		  /* speculative motion, live check failed, remove
+ 		     insn from ready list */
+ 		  ready[i] = ready[--n_ready];
+ 		  continue;
+ 		}
+ #ifdef MD_SCHED_VARIABLE_ISSUE
+ 	      MD_SCHED_VARIABLE_ISSUE (dump, sched_verbose, insn, can_issue_more);
+ #else
+ 	      can_issue_more--;
+ #endif
+ 	      if (can_issue_more < 0) break;
+ 
  	      /* an interblock motion? */
  	      if (INSN_BB (insn) != target_bb)
  		{
*************** schedule_block (bb, rgn_n_insns)
*** 6943,6955 ****
  		  if (IS_SPECULATIVE_INSN (insn))
  		    {
  
- 		      if (!check_live (insn, INSN_BB (insn)))
- 			{
- 			  /* speculative motion, live check failed, remove
- 			     insn from ready list */
- 			  ready[i] = ready[--n_ready];
- 			  continue;
- 			}
  		      update_live (insn, INSN_BB (insn));
  
  		      /* for speculative load, mark insns fed by it.  */
--- 6958,6963 ----
*************** schedule_block (bb, rgn_n_insns)
*** 7000,7011 ****
  	      last_scheduled_insn = insn;
  	      last = move_insn (insn, last);
  	      sched_n_insns++;
- 
- #ifdef MD_SCHED_VARIABLE_ISSUE
- 	      MD_SCHED_VARIABLE_ISSUE (dump, sched_verbose, insn, can_issue_more);
- #else
- 	      can_issue_more--;
- #endif
  
  	      n_ready = schedule_insn (insn, ready, n_ready, clock_var);
  
--- 7008,7013 ----
*** /root/i386old2/i386.h	Tue Jul 20 02:32:34 1999
--- i386.h	Sat Jul 17 19:24:38 1999
*************** while (0)
*** 2062,2075 ****
  #define ADJUST_COST(insn,link,dep_insn,cost) \
    (cost) = ix86_adjust_cost(insn, link, dep_insn, cost)
  
  #define ISSUE_RATE \
    ix86_issue_rate ()
  
  #define MD_SCHED_INIT(DUMP, SCHED_VERBOSE) \
    ix86_sched_init (DUMP, SCHED_VERBOSE)
  
! #define MD_SCHED_REORDER(DUMP, SCHED_VERBOSE, READY, N_READY) \
!   ix86_sched_reorder (DUMP, SCHED_VERBOSE, READY, N_READY)
  
  #define MD_SCHED_VARIABLE_ISSUE(DUMP, SCHED_VERBOSE, INSN, CAN_ISSUE_MORE) \
    ((CAN_ISSUE_MORE) =							   \
--- 2061,2080 ----
  #define ADJUST_COST(insn,link,dep_insn,cost) \
    (cost) = ix86_adjust_cost(insn, link, dep_insn, cost)
  
+ /* Increase priority of vector decodes insns for K6, because they are very
+    hard to schedule.  */
+ #define ADJUST_PRIORITY(insn) \
+   if (TARGET_K6 && recog_memoized (insn) >= 0  \
+       && get_attr_uops (insn) == PPRO_UOPS_MANY2) INSN_PRIORITY (insn) <<= 2;
+ 
  #define ISSUE_RATE \
    ix86_issue_rate ()
  
  #define MD_SCHED_INIT(DUMP, SCHED_VERBOSE) \
    ix86_sched_init (DUMP, SCHED_VERBOSE)
  
! #define MD_SCHED_REORDER(DUMP, SCHED_VERBOSE, READY, N_READY, CYCLES) \
!   ix86_sched_reorder (DUMP, SCHED_VERBOSE, READY, N_READY, CYCLES)
  
  #define MD_SCHED_VARIABLE_ISSUE(DUMP, SCHED_VERBOSE, INSN, CAN_ISSUE_MORE) \
    ((CAN_ISSUE_MORE) =							   \
*************** extern int ix86_attr_length_default XPAR
*** 2495,2503 ****
  extern int ix86_issue_rate XPARAMS((void));
  extern int ix86_adjust_cost XPARAMS((xrtx, xrtx, xrtx, int));
  extern void ix86_sched_init XPARAMS((FILE *, int));
! extern void ix86_sched_reorder XPARAMS((FILE *, int, xrtx *, int));
  extern int ix86_variable_issue XPARAMS((FILE *, int, xrtx, int));
  
  
  #undef XPARAMS
  #undef xrtx
--- 2501,2509 ----
  extern int ix86_issue_rate XPARAMS((void));
  extern int ix86_adjust_cost XPARAMS((xrtx, xrtx, xrtx, int));
  extern void ix86_sched_init XPARAMS((FILE *, int));
! extern void ix86_sched_reorder XPARAMS((FILE *, int, xrtx *, int, int));
  extern int ix86_variable_issue XPARAMS((FILE *, int, xrtx, int));
  
  
  #undef XPARAMS
  #undef xrtx
*** /root/i386old2/i386.md	Tue Jul 20 02:34:08 1999
--- i386.md	Mon Jul 19 22:56:02 1999
***************
*** 602,610 ****
  ;;
  ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
  
! ;; The decoder specification is in the PPro section above!
  
! ;; Shift instructions and certain arithmetic are issued only to X pipe.
  (define_function_unit "k6_alux" 1 0
    (and (eq_attr "cpu" "k6")
         (eq_attr "type" "ishift,alu1,negnot"))
--- 614,659 ----
  ;;
  ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
  
! ;; AMD K6 decoders specification. We use same names as K6 optimization manual.
! ;;
! ;; short  -- short decodable instruction (any decoder)
! ;; first  -- ESC instruction (only first decoder)
! ;; long   -- long decodable (both decoders)
! ;; vector -- vector decoded (both decoders for two and more cycles)
! 
! ;; These types must match the uops attribute definition below.
! 
! (define_attr "k6_decode" "short,first,long,vector"
!     (cond [(eq_attr "type" "other,multi,call,callv,fpspc,setcc,imul,idiv")
! 	     (const_string "vector")
! 	   ;; The shift, neg and not becomes vector decoded in case memory
! 	   ;; operand can not be encoded using RM byte.
! 	   ;; Possibly it can be good idea to convert them to read/modify/write
! 	   ;; sequence when possible.
! 	   (and (eq_attr "type" "ishift,negnot")
! 		(match_operand 0 "long_memory_operand" ""))
! 	     (const_string "vector")
! 	   ;; XF mode loads and stores are vector decoded
! 	   (and (eq_attr "type" "fmov")
! 	        (ior (and (eq_attr "memory" "store")
! 		     (match_operand:XF 0 "memory_operand" ""))
! 		(and (eq_attr "memory" "load")
! 		     (match_operand:XF 1 "memory_operand" ""))))
! 	     (const_string "vector")
! 	   ;; Prefix is separate short decodable ESC instruction, but we can
! 	   ;; model it in this way.
! 	   (eq_attr "length_prefix" "1")
! 	     (const_string "long")
! 	   (eq_attr "type" "imov")
! 	     (const_string "short")
! 	   (eq_attr "type" "fmov,fop,fop1,fmul,fdiv,fpspc,fcmp,fxch,fcmov")
! 	     (const_string "first")
! 	   (eq_attr "memory" "both")
! 	     (const_string "long")
! 	  ]
! 	(const_string "short")))
  
! ;; Shift instructions and certain arithmetic are issued only to X pipe.
  (define_function_unit "k6_alux" 1 0
    (and (eq_attr "cpu" "k6")
         (eq_attr "type" "ishift,alu1,negnot"))
***************
*** 688,693 ****
--- 737,750 ----
    (and (eq_attr "cpu" "k6")
         (eq_attr "type" "idiv"))
    17 17)
+ 
+ ;; Decoder specification for K6 and PPro CPU.
+ 
+ (define_attr "uops" "one,few,many,many2"
+   (if_then_else (eq_attr "cpu" "pentiumpro")
+     (attr "ppro_uops")
+     (attr "k6_decode")))
+ 
  
  ;; Compare instructions.
  
***************
*** 1068,1073 ****
--- 1132,1138 ----
    "TARGET_80387"
    "fnstsw\\t%0"
    [(set_attr "length" "2")
+    (set_attr "k6_decode" "first")
     (set_attr "ppro_uops" "few")])
  
  ;; FP compares, step 3
***************
*** 1079,1084 ****
--- 1144,1150 ----
    ""
    "sahf"
    [(set_attr "length" "1")
+    (set_attr "k6_decode" "vector")
     (set_attr "ppro_uops" "one")])
  
  ;; Pentium Pro can do steps 1 through 3 in one go.
***************
*** 1202,1207 ****
--- 1268,1274 ----
    "xchg{l}\\t%1, %0"
    [(set_attr "type" "imov")
     (set_attr "pent_pair" "np")
+    (set_attr "k6_decode" "long")
     (set_attr "ppro_uops" "few")])
  
  (define_expand "movhi"
***************
*** 1335,1340 ****
--- 1402,1408 ----
    "xchg{w}\\t%1, %0"
    [(set_attr "type" "imov")
     (set_attr "pent_pair" "np")
+    (set_attr "k6_decode" "long")
     (set_attr "ppro_uops" "few")])
  
  (define_insn "*swaphi_2"
***************
*** 1347,1352 ****
--- 1415,1421 ----
    [(set_attr "type" "imov")
     (set_attr "length_prefix" "0")
     (set_attr "pent_pair" "np")
+    (set_attr "k6_decode" "long")
     (set_attr "ppro_uops" "few")])
  
  (define_expand "movstricthi"
***************
*** 1517,1522 ****
--- 1586,1592 ----
    "xchg{b}\\t%1, %0"
    [(set_attr "type" "imov")
     (set_attr "pent_pair" "np")
+    (set_attr "k6_decode" "long")
     (set_attr "ppro_uops" "few")])
  
  (define_expand "movstrictqi"
***************
*** 2819,2824 ****
--- 2891,2897 ----
    "TARGET_80387"
    "fnstcw\\t%0"
    [(set_attr "length_opcode" "2")
+    (set_attr "k6_decode" "vector")
     (set_attr "ppro_uops" "few")])
  
  (define_insn "x86_fldcw_1"
***************
*** 2827,2832 ****
--- 2900,2906 ----
    "TARGET_80387"
    "fldcw\\t%0"
    [(set_attr "length_opcode" "2")
+    (set_attr "k6_decode" "vector")
     (set_attr "ppro_uops" "few")])
  
  ;; Conversion between fixed point and floating point.
***************
*** 2973,2978 ****
--- 3047,3053 ----
    "adc{l}\\t{%2, %0|%0, %2}"
    [(set_attr "type" "alu")
     (set_attr "pent_pair" "pu")
+    (set_attr "k6_decode" "vector")
     (set_attr "ppro_uops" "few")])
  
  (define_expand "addsi3"
***************
*** 3520,3525 ****
--- 3610,3616 ----
    "sbb{l}\\t{%2, %0|%0, %2}"
    [(set_attr "type" "alu")
     (set_attr "pent_pair" "pu")
+    (set_attr "k6_decode" "vector")
     (set_attr "ppro_uops" "few")])
  
  (define_expand "subsi3"
***************
*** 5059,5064 ****
--- 5191,5197 ----
    [(set_attr "type" "ishift")
     (set_attr "length_opcode" "3")
     (set_attr "pent_pair" "np")
+    (set_attr "k6_decode" "vector")
     (set_attr "ppro_uops" "few")])
  
  (define_expand "x86_shift_adj_1"
***************
*** 5407,5412 ****
--- 5615,5621 ----
    [(set_attr "type" "ishift")
     (set_attr "length_opcode" "3")
     (set_attr "pent_pair" "np")
+    (set_attr "k6_decode" "vector")
     (set_attr "ppro_uops" "few")])
  
  (define_expand "x86_shift_adj_3"
***************
*** 5668,5674 ****
    "@
     rol{l}\\t{%2, %0|%0, %2}
     rol{l}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
--- 5877,5884 ----
    "@
     rol{l}\\t{%2, %0|%0, %2}
     rol{l}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
***************
*** 5682,5688 ****
    "@
     rol{l}\\t{%2, %0|%0, %2}
     rol{l}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn "rotlhi3"
    [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,rm")
--- 5892,5899 ----
    "@
     rol{l}\\t{%2, %0|%0, %2}
     rol{l}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn "rotlhi3"
    [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,rm")
***************
*** 5693,5699 ****
    "@
     rol{w}\\t{%2, %0|%0, %2}
     rol{w}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
--- 5904,5911 ----
    "@
     rol{w}\\t{%2, %0|%0, %2}
     rol{w}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
***************
*** 5707,5713 ****
    "@
     rol{w}\\t{%2, %0|%0, %2}
     rol{w}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn "rotlqi3"
    [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,qm")
--- 5919,5926 ----
    "@
     rol{w}\\t{%2, %0|%0, %2}
     rol{w}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn "rotlqi3"
    [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,qm")
***************
*** 5718,5724 ****
    "@
     rol{b}\\t{%2, %0|%0, %2}
     rol{b}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
--- 5931,5938 ----
    "@
     rol{b}\\t{%2, %0|%0, %2}
     rol{b}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
***************
*** 5732,5738 ****
    "@
     rol{b}\\t{%2, %0|%0, %2}
     rol{b}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn "rotrsi3"
    [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,rm")
--- 5946,5953 ----
    "@
     rol{b}\\t{%2, %0|%0, %2}
     rol{b}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn "rotrsi3"
    [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,rm")
***************
*** 5743,5749 ****
    "@
     ror{l}\\t{%2, %0|%0, %2}
     ror{l}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
--- 5958,5965 ----
    "@
     ror{l}\\t{%2, %0|%0, %2}
     ror{l}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
***************
*** 5757,5763 ****
    "@
     ror{l}\\t{%2, %0|%0, %2}
     ror{l}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn "rotrhi3"
    [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,rm")
--- 5973,5980 ----
    "@
     ror{l}\\t{%2, %0|%0, %2}
     ror{l}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn "rotrhi3"
    [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,rm")
***************
*** 5768,5774 ****
    "@
     ror{w}\\t{%2, %0|%0, %2}
     ror{w}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
--- 5985,5992 ----
    "@
     ror{w}\\t{%2, %0|%0, %2}
     ror{w}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
***************
*** 5782,5788 ****
    "@
     ror{w}\\t{%2, %0|%0, %2}
     ror{w}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn "rotrqi3"
    [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,qm")
--- 6000,6007 ----
    "@
     ror{w}\\t{%2, %0|%0, %2}
     ror{w}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn "rotrqi3"
    [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,qm")
***************
*** 5793,5799 ****
    "@
     ror{b}\\t{%2, %0|%0, %2}
     ror{b}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
--- 6012,6019 ----
    "@
     ror{b}\\t{%2, %0|%0, %2}
     ror{b}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn ""
    [(set (reg:CCNO 17)
***************
*** 5807,5813 ****
    "@
     ror{b}\\t{%2, %0|%0, %2}
     ror{b}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")])
  
  ;; Bit set / bit test instructions
  
--- 6027,6034 ----
    "@
     ror{b}\\t{%2, %0|%0, %2}
     ror{b}\\t{%b2, %0|%0, %b2}"
!   [(set_attr "type" "ishift")
!    (set_attr "k6_decode" "vector")])
  
  ;; Bit set / bit test instructions
  
***************
*** 6130,6135 ****
--- 6351,6357 ----
    ""
    "jmp\\t%l0"
    [(set_attr "type" "ibr")
+    (set_attr "k6_decode" "vector")
     (set (attr "length")
  	(if_then_else (and (ge (minus (match_dup 0) (pc))
  			       (const_int -128))
***************
*** 6142,6155 ****
    [(set (pc) (match_operand:SI 0 "nonimmediate_operand" "rm"))]
    ""
    "jmp\\t%*%0"
!   [(set_attr "type" "ibr")])
  
  (define_insn "tablejump"
    [(set (pc) (match_operand:SI 0 "nonimmediate_operand" "rm"))
     (use (label_ref (match_operand 1 "" "")))]
    "! flag_pic"
    "jmp\\t%*%0"
!   [(set_attr "type" "ibr")])
  
  ;; Implement switch statements when generating PIC code.  Switches are
  ;; implemented by `tablejump' when not using -fpic.
--- 6364,6379 ----
    [(set (pc) (match_operand:SI 0 "nonimmediate_operand" "rm"))]
    ""
    "jmp\\t%*%0"
!   [(set_attr "type" "ibr")
!    (set_attr "k6_decode" "vector")])
  
  (define_insn "tablejump"
    [(set (pc) (match_operand:SI 0 "nonimmediate_operand" "rm"))
     (use (label_ref (match_operand 1 "" "")))]
    "! flag_pic"
    "jmp\\t%*%0"
!   [(set_attr "type" "ibr")
!    (set_attr "k6_decode" "vector")])
  
  ;; Implement switch statements when generating PIC code.  Switches are
  ;; implemented by `tablejump' when not using -fpic.
***************
*** 6222,6228 ****
     (use (label_ref (match_operand 1 "" "")))]
    ""
    "jmp\\t%*%0"
!   [(set_attr "type" "ibr")])
  
  ;; Loop instruction
  ;;
--- 6446,6453 ----
     (use (label_ref (match_operand 1 "" "")))]
    ""
    "jmp\\t%*%0"
!   [(set_attr "type" "ibr")
!    (set_attr "k6_decode" "vector")])
  
  ;; Loop instruction
  ;;
***************
*** 6266,6271 ****
--- 6491,6497 ----
  }"
    [(set_attr "type" "ibr")
     (set_attr "ppro_uops" "many")
+    (set_attr "k6_decode" "short")
     (set (attr "length")
  	(if_then_else (and (eq_attr "alternative" "0")
  			   (and (ge (minus (match_dup 0) (pc))
***************
*** 6298,6303 ****
--- 6524,6530 ----
  }"
    [(set_attr "type" "ibr")
     (set_attr "ppro_uops" "many")
+    (set_attr "k6_decode" "short")
     (set (attr "length")
  	(if_then_else (and (eq_attr "alternative" "0")
  			   (and (ge (minus (match_dup 0) (pc))
***************
*** 6696,6701 ****
--- 6924,6930 ----
    ""
    "nop"
    [(set_attr "length" "1")
+    (set_attr "k6_decode" "short")
     (set_attr "ppro_uops" "one")])
  
  (define_expand "prologue"
***************
*** 6751,6756 ****
--- 6980,6986 ----
    ""
    "leave"
    [(set_attr "length" "1")
+    (set_attr "k6_decode" "long")
     (set_attr "ppro_uops" "few")])
  
  (define_expand "ffssi2"
***************
*** 6801,6806 ****
--- 7031,7037 ----
    ""
    "bsf{l}\\t{%1, %0|%0, %1}"
    [(set_attr "length_opcode" "3")
+    (set_attr "k6_decode" "vector")
     (set_attr "ppro_uops" "few")])
  
  ;; ffshi2 is not useful -- 4 word prefix ops are needed, which is larger


More information about the Gcc-patches mailing list