fix ia64 pipeline flush

Richard Henderson rth@redhat.com
Thu Feb 28 12:23:00 GMT 2002


Addresses the short multiply performance problem that Reva
has been pursuing for the last couple of days.  She got hold
of an HP compiler engineer who confirmed that Intel's docs
are incorrect on this point.

Applied mainline and branch.


r~


        * config/ia64/ia64.c (ia64_adjust_cost): All non-MM consumers have
        4 cycle latency from MM producers.
        (ia64_internal_sched_reorder): Likewise with pipeline flush.

Index: config/ia64/ia64.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/ia64/ia64.c,v
retrieving revision 1.139.2.2
diff -c -p -d -r1.139.2.2 ia64.c
*** ia64.c	2002/02/28 18:29:09	1.139.2.2
--- ia64.c	2002/02/28 19:43:11
*************** ia64_adjust_cost (insn, link, dep_insn, 
*** 5348,5353 ****
--- 5348,5354 ----
        if (reg_overlap_mentioned_p (SET_DEST (set), addr))
  	return cost + 1;
      }
+ 
    if ((dep_class == ITANIUM_CLASS_IALU
         || dep_class == ITANIUM_CLASS_ILOG
         || dep_class == ITANIUM_CLASS_LD)
*************** ia64_adjust_cost (insn, link, dep_insn, 
*** 5355,5379 ****
  	  || insn_class == ITANIUM_CLASS_MMSHF
  	  || insn_class == ITANIUM_CLASS_MMSHFI))
      return 3;
    if (dep_class == ITANIUM_CLASS_FMAC
        && (insn_class == ITANIUM_CLASS_FMISC
  	  || insn_class == ITANIUM_CLASS_FCVTFX
  	  || insn_class == ITANIUM_CLASS_XMPY))
      return 7;
    if ((dep_class == ITANIUM_CLASS_FMAC
         || dep_class == ITANIUM_CLASS_FMISC
         || dep_class == ITANIUM_CLASS_FCVTFX
         || dep_class == ITANIUM_CLASS_XMPY)
        && insn_class == ITANIUM_CLASS_STF)
      return 8;
    if ((dep_class == ITANIUM_CLASS_MMMUL
         || dep_class == ITANIUM_CLASS_MMSHF
         || dep_class == ITANIUM_CLASS_MMSHFI)
!       && (insn_class == ITANIUM_CLASS_LD
! 	  || insn_class == ITANIUM_CLASS_ST
! 	  || insn_class == ITANIUM_CLASS_IALU
! 	  || insn_class == ITANIUM_CLASS_ILOG
! 	  || insn_class == ITANIUM_CLASS_ISHF))
      return 4;
  
    return cost;
--- 5356,5383 ----
  	  || insn_class == ITANIUM_CLASS_MMSHF
  	  || insn_class == ITANIUM_CLASS_MMSHFI))
      return 3;
+ 
    if (dep_class == ITANIUM_CLASS_FMAC
        && (insn_class == ITANIUM_CLASS_FMISC
  	  || insn_class == ITANIUM_CLASS_FCVTFX
  	  || insn_class == ITANIUM_CLASS_XMPY))
      return 7;
+ 
    if ((dep_class == ITANIUM_CLASS_FMAC
         || dep_class == ITANIUM_CLASS_FMISC
         || dep_class == ITANIUM_CLASS_FCVTFX
         || dep_class == ITANIUM_CLASS_XMPY)
        && insn_class == ITANIUM_CLASS_STF)
      return 8;
+ 
+   /* Intel docs say only LD, ST, IALU, ILOG, ISHF consumers have latency 4,
+      but HP engineers say any non-MM operation.  */
    if ((dep_class == ITANIUM_CLASS_MMMUL
         || dep_class == ITANIUM_CLASS_MMSHF
         || dep_class == ITANIUM_CLASS_MMSHFI)
!       && insn_class != ITANIUM_CLASS_MMMUL
!       && insn_class != ITANIUM_CLASS_MMSHF
!       && insn_class != ITANIUM_CLASS_MMSHFI)
      return 4;
  
    return cost;
*************** ia64_internal_sched_reorder (dump, sched
*** 6185,6215 ****
        dump_current_packet (dump);
      }
  
    if (reorder_type == 0 && clock_var > 0 && ia64_final_schedule)
      {
        for (insnp = ready; insnp < e_ready; insnp++)
  	{
! 	  rtx insn = *insnp;
  	  enum attr_itanium_class t = ia64_safe_itanium_class (insn);
! 	  if (t == ITANIUM_CLASS_IALU || t == ITANIUM_CLASS_ISHF
! 	      || t == ITANIUM_CLASS_ILOG
! 	      || t == ITANIUM_CLASS_LD || t == ITANIUM_CLASS_ST)
! 	    {
! 	      rtx link;
! 	      for (link = LOG_LINKS (insn); link; link = XEXP (link, 1))
! 		if (REG_NOTE_KIND (link) != REG_DEP_OUTPUT
! 		    && REG_NOTE_KIND (link) != REG_DEP_ANTI)
  		  {
! 		    rtx other = XEXP (link, 0);
! 		    enum attr_itanium_class t0 = ia64_safe_itanium_class (other);
! 		    if (t0 == ITANIUM_CLASS_MMSHF
! 			|| t0 == ITANIUM_CLASS_MMMUL)
! 		      {
! 			nop_cycles_until (clock_var, sched_verbose ? dump : NULL);
! 			goto out;
! 		      }
  		  }
! 	    }
  	}
      }
   out:
--- 6189,6222 ----
        dump_current_packet (dump);
      }
  
+   /* Work around the pipeline flush that will occur if the results of
+      an MM instruction are accessed before the result is ready.  Intel
+      documentation says this only happens with IALU, ISHF, ILOG, LD,
+      and ST consumers, but experimental evidence shows that *any* non-MM
+      type instruction will incur the flush.  */
    if (reorder_type == 0 && clock_var > 0 && ia64_final_schedule)
      {
        for (insnp = ready; insnp < e_ready; insnp++)
  	{
! 	  rtx insn = *insnp, link;
  	  enum attr_itanium_class t = ia64_safe_itanium_class (insn);
! 
! 	  if (t == ITANIUM_CLASS_MMMUL
! 	      || t == ITANIUM_CLASS_MMSHF
! 	      || t == ITANIUM_CLASS_MMSHFI)
! 	    continue;
! 
! 	  for (link = LOG_LINKS (insn); link; link = XEXP (link, 1))
! 	    if (REG_NOTE_KIND (link) == 0)
! 	      {
! 		rtx other = XEXP (link, 0);
! 		enum attr_itanium_class t0 = ia64_safe_itanium_class (other);
! 		if (t0 == ITANIUM_CLASS_MMSHF || t0 == ITANIUM_CLASS_MMMUL)
  		  {
! 		    nop_cycles_until (clock_var, sched_verbose ? dump : NULL);
! 		    goto out;
  		  }
! 	      }
  	}
      }
   out:



More information about the Gcc-patches mailing list