
Fix for scheduling on Athlon/K6/PPro


Hi,
For Athlon scheduling, it appears to be critical to tell the scheduler
that a load followed by a load-and-execute instruction can hide the
load's latency when the loaded value is not used in the address of the
next instruction, since the two loads can then execute in parallel.
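
To make the address condition concrete, here is a hypothetical C
fragment (mine, not from the original mail) showing both cases:

/* The two loads are independent: the address of *q does not use the
   result of the first load, so both loads can execute in parallel.  */
int
independent (int *p, int *q)
{
  int t = *p;
  return t + *q;
}

/* The second load's address is the result of the first load, so it
   must wait for it (the case ix86_agi_dependant detects).  */
int
dependent (int **pp)
{
  int *r = *pp;
  return *r;
}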

This is the common case in the inner loops of matrix multiplication and
similar code, which basically load two memory locations, do simple
arithmetic, and store the result; a sketch of such a loop follows.
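
For illustration (this example is mine, not part of the patch), such an
inner loop is simply:

/* Two independent loads, one add, and one store per iteration; the
   loads never feed each other's address, so they can be overlapped.  */
void
vadd (const double *a, const double *b, double *c, int n)
{
  int i;
  for (i = 0; i < n; i++)
    c[i] = a[i] + b[i];
}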

Currently the scheduler tends to schedule many loads first, then many
load-and-execute instructions (as it thinks it needs to wait for the
results of the loads).  This starves the execution units, as the
operands are loaded in the wrong order.

I submitted an equivalent patch some time ago, but I hope that this
time I will have better luck.

Tue Aug 14 16:21:21 CEST 2001  Jan Hubicka  <jh@suse.cz>
	* i386.c (ix86_agi_dependant): Lea causes AGI only on the Pentium.
	(ix86_adjust_cost): Teach the scheduler that the latency of a load
	operand can be masked.
*** i386.c.orig	Sun Aug  5 17:58:25 2001
--- i386.c	Mon Aug 13 13:54:00 2001
*************** ix86_agi_dependant (insn, dep_insn, insn
*** 8410,8416 ****
  {
    rtx addr;
  
!   if (insn_type == TYPE_LEA)
      {
        addr = PATTERN (insn);
        if (GET_CODE (addr) == SET)
--- 8446,8453 ----
  {
    rtx addr;
  
!   if (insn_type == TYPE_LEA
!       && TARGET_PENTIUM)
      {
        addr = PATTERN (insn);
        if (GET_CODE (addr) == SET)
*************** ix86_adjust_cost (insn, link, dep_insn, 
*** 8445,8451 ****
       int cost;
  {
    enum attr_type insn_type, dep_insn_type;
!   enum attr_memory memory;
    rtx set, set2;
    int dep_insn_code_number;
  
--- 8482,8488 ----
       int cost;
  {
    enum attr_type insn_type, dep_insn_type;
!   enum attr_memory memory, dep_memory;
    rtx set, set2;
    int dep_insn_code_number;
  
*************** ix86_adjust_cost (insn, link, dep_insn, 
*** 8481,8492 ****
        break;
  
      case PROCESSOR_PENTIUMPRO:
        /* Since we can't represent delayed latencies of load+operation,
  	 increase the cost here for non-imov insns.  */
        if (dep_insn_type != TYPE_IMOV
! 	  && dep_insn_type != TYPE_FMOV
! 	  && ((memory = get_attr_memory (dep_insn) == MEMORY_LOAD)
!               || memory == MEMORY_BOTH))
  	cost += 1;
  
        /* INT->FP conversion is expensive.  */
--- 8518,8531 ----
        break;
  
      case PROCESSOR_PENTIUMPRO:
+       memory = get_attr_memory (insn);
+       dep_memory = get_attr_memory (dep_insn);
+ 
        /* Since we can't represent delayed latencies of load+operation,
  	 increase the cost here for non-imov insns.  */
        if (dep_insn_type != TYPE_IMOV
!           && dep_insn_type != TYPE_FMOV
!           && (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH))
  	cost += 1;
  
        /* INT->FP conversion is expensive.  */
*************** ix86_adjust_cost (insn, link, dep_insn, 
*** 8500,8534 ****
  	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
  	  && GET_CODE (SET_DEST (set2)) == MEM)
  	cost += 1;
        break;
  
      case PROCESSOR_K6:
        /* The esp dependency is resolved before the instruction is really
           finished.  */
        if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
  	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
  	return 1;
  
        /* Since we can't represent delayed latencies of load+operation,
  	 increase the cost here for non-imov insns.  */
!       if ((memory = get_attr_memory (dep_insn) == MEMORY_LOAD)
!           || memory == MEMORY_BOTH)
  	cost += (dep_insn_type != TYPE_IMOV) ? 2 : 1;
  
        /* INT->FP conversion is expensive.  */
        if (get_attr_fp_int_src (dep_insn))
  	cost += 5;
        break;
  
      case PROCESSOR_ATHLON:
!       if ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD
!            || memory == MEMORY_BOTH)
  	{
  	  if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV)
  	    cost += 2;
  	  else
  	    cost += 3;
          }
  
      default:
        break;
--- 8539,8627 ----
  	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
  	  && GET_CODE (SET_DEST (set2)) == MEM)
  	cost += 1;
+ 
+       /* Show the reorder buffer's ability to hide the latency of a load
+ 	 by executing it in parallel with the previous instruction when
+ 	 that instruction is not needed to compute the address.  */
+       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ 	  && !ix86_agi_dependant (insn, dep_insn, insn_type))
+  	{
+ 	  /* Claim that moves take one cycle, as the core can issue one
+ 	     load at a time and the next load can start a cycle later.  */
+ 	  if (dep_insn_type == TYPE_IMOV
+ 	      || dep_insn_type == TYPE_FMOV
+ 	      || dep_insn_type == TYPE_SSEMOV)
+ 	    cost = 1;
+ 	  else if (cost > 1)
+ 	    cost--;
+ 	}
        break;
  
      case PROCESSOR_K6:
+       memory = get_attr_memory (insn);
+       dep_memory = get_attr_memory (dep_insn);
        /* The esp dependency is resolved before the instruction is really
           finished.  */
        if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
  	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
  	return 1;
  
        /* Since we can't represent delayed latencies of load+operation,
  	 increase the cost here for non-imov insns.  */
!       if (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH)
  	cost += (dep_insn_type != TYPE_IMOV) ? 2 : 1;
  
        /* INT->FP conversion is expensive.  */
        if (get_attr_fp_int_src (dep_insn))
  	cost += 5;
+ 
+       /* Show the reorder buffer's ability to hide the latency of a load
+ 	 by executing it in parallel with the previous instruction when
+ 	 that instruction is not needed to compute the address.  */
+       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ 	  && !ix86_agi_dependant (insn, dep_insn, insn_type))
+  	{
+ 	  /* Claim that moves take one cycle, as the core can issue one
+ 	     load at a time and the next load can start a cycle later.  */
+ 	  if (dep_insn_type == TYPE_IMOV
+ 	      || dep_insn_type == TYPE_FMOV
+ 	      || dep_insn_type == TYPE_SSEMOV)
+ 	    cost = 1;
+ 	  else if (cost > 2)
+ 	    cost -= 2;
+ 	  else
+ 	    cost = 1;
+ 	}
        break;
  
      case PROCESSOR_ATHLON:
!       memory = get_attr_memory (insn);
!       dep_memory = get_attr_memory (dep_insn);
! 
!       if (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH)
  	{
  	  if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV)
  	    cost += 2;
  	  else
  	    cost += 3;
          }
+       /* Show the reorder buffer's ability to hide the latency of a load
+ 	 by executing it in parallel with the previous instruction when
+ 	 that instruction is not needed to compute the address.  */
+       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ 	  && !ix86_agi_dependant (insn, dep_insn, insn_type))
+  	{
+ 	  /* Claim that moves take one cycle, as the core can issue one
+ 	     load at a time and the next load can start a cycle later.  */
+ 	  if (dep_insn_type == TYPE_IMOV
+ 	      || dep_insn_type == TYPE_FMOV
+ 	      || dep_insn_type == TYPE_SSEMOV)
+ 	    cost = 0;
+ 	  else if (cost >= 3)
+ 	    cost -= 3;
+ 	  else
+ 	    cost = 0;
+ 	}
  
      default:
        break;

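For readers skimming the diff, here is a minimal standalone sketch
(mine, not part of the patch) of the new PROCESSOR_ATHLON logic; the
enums and the AGI flag stand in for the real insn attributes and for
the result of ix86_agi_dependant:

enum attr_type   { TYPE_IMOV, TYPE_FMOV, TYPE_SSEMOV, TYPE_ALU };
enum attr_memory { MEMORY_NONE, MEMORY_LOAD, MEMORY_STORE, MEMORY_BOTH };

/* Adjust COST of INSN's dependency on DEP_INSN for the Athlon.
   MEMORY/DEP_MEMORY are the memory attributes of the two insns,
   DEP_INSN_TYPE the type of the dependency, AGI nonzero when INSN's
   address calculation needs DEP_INSN's result.  */
static int
athlon_adjust_cost (enum attr_memory memory, enum attr_type dep_insn_type,
		    enum attr_memory dep_memory, int agi, int cost)
{
  /* A dependency that itself loads from memory costs extra cycles.  */
  if (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH)
    cost += (dep_insn_type == TYPE_IMOV
	     || dep_insn_type == TYPE_FMOV) ? 2 : 3;

  /* But when INSN is a load whose address does not use DEP_INSN's
     result, the reorder buffer overlaps the two loads and most of
     that latency is hidden again.  */
  if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) && !agi)
    {
      if (dep_insn_type == TYPE_IMOV
	  || dep_insn_type == TYPE_FMOV
	  || dep_insn_type == TYPE_SSEMOV)
	cost = 0;
      else if (cost >= 3)
	cost -= 3;
      else
	cost = 0;
    }
  return cost;
}

So a plain move feeding an independent load is adjusted down to zero
cost, and the scheduler is free to keep such loads back to back instead
of bunching all the loads first.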
