This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
fix ia64 pipeline flush
- From: Richard Henderson <rth at redhat dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: sverre_jarp at hp dot com, sje at cup dot hp dot com, reva at cup dot hp dot com
- Date: Thu, 28 Feb 2002 11:52:30 -0800
- Subject: fix ia64 pipeline flush
- References: <3C7C14E8.B14636B5@cup.hp.com> <3C7C1913.9020006@hp.com> <20020226154419.B28685@redhat.com> <3C7D12C6.9080204@hp.com> <20020227094336.D29410@redhat.com> <3C7D4FA7.B749647F@cup.hp.com> <20020227140926.A29655@redhat.com> <3C7D8C8E.2904325E@cup.hp.com>
Addresses the short multiply performance problem that Reva
has been pursuing for the last couple of days. She got hold
of an HP compiler engineer who confirmed that Intel's docs
are incorrect on this point.
Applied mainline and branch.
r~
* config/ia64/ia64.c (ia64_adjust_cost): All non-MM consumers have
4 cycle latency from MM producers.
(ia64_internal_sched_reorder): Likewise with pipeline flush.
Index: config/ia64/ia64.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/ia64/ia64.c,v
retrieving revision 1.139.2.2
diff -c -p -d -r1.139.2.2 ia64.c
*** ia64.c 2002/02/28 18:29:09 1.139.2.2
--- ia64.c 2002/02/28 19:43:11
*************** ia64_adjust_cost (insn, link, dep_insn,
*** 5348,5353 ****
--- 5348,5354 ----
if (reg_overlap_mentioned_p (SET_DEST (set), addr))
return cost + 1;
}
+
if ((dep_class == ITANIUM_CLASS_IALU
|| dep_class == ITANIUM_CLASS_ILOG
|| dep_class == ITANIUM_CLASS_LD)
*************** ia64_adjust_cost (insn, link, dep_insn,
*** 5355,5379 ****
|| insn_class == ITANIUM_CLASS_MMSHF
|| insn_class == ITANIUM_CLASS_MMSHFI))
return 3;
if (dep_class == ITANIUM_CLASS_FMAC
&& (insn_class == ITANIUM_CLASS_FMISC
|| insn_class == ITANIUM_CLASS_FCVTFX
|| insn_class == ITANIUM_CLASS_XMPY))
return 7;
if ((dep_class == ITANIUM_CLASS_FMAC
|| dep_class == ITANIUM_CLASS_FMISC
|| dep_class == ITANIUM_CLASS_FCVTFX
|| dep_class == ITANIUM_CLASS_XMPY)
&& insn_class == ITANIUM_CLASS_STF)
return 8;
if ((dep_class == ITANIUM_CLASS_MMMUL
|| dep_class == ITANIUM_CLASS_MMSHF
|| dep_class == ITANIUM_CLASS_MMSHFI)
! && (insn_class == ITANIUM_CLASS_LD
! || insn_class == ITANIUM_CLASS_ST
! || insn_class == ITANIUM_CLASS_IALU
! || insn_class == ITANIUM_CLASS_ILOG
! || insn_class == ITANIUM_CLASS_ISHF))
return 4;
return cost;
--- 5356,5383 ----
|| insn_class == ITANIUM_CLASS_MMSHF
|| insn_class == ITANIUM_CLASS_MMSHFI))
return 3;
+
if (dep_class == ITANIUM_CLASS_FMAC
&& (insn_class == ITANIUM_CLASS_FMISC
|| insn_class == ITANIUM_CLASS_FCVTFX
|| insn_class == ITANIUM_CLASS_XMPY))
return 7;
+
if ((dep_class == ITANIUM_CLASS_FMAC
|| dep_class == ITANIUM_CLASS_FMISC
|| dep_class == ITANIUM_CLASS_FCVTFX
|| dep_class == ITANIUM_CLASS_XMPY)
&& insn_class == ITANIUM_CLASS_STF)
return 8;
+
+ /* Intel docs say only LD, ST, IALU, ILOG, ISHF consumers have latency 4,
+ but HP engineers say any non-MM operation. */
if ((dep_class == ITANIUM_CLASS_MMMUL
|| dep_class == ITANIUM_CLASS_MMSHF
|| dep_class == ITANIUM_CLASS_MMSHFI)
! && insn_class != ITANIUM_CLASS_MMMUL
! && insn_class != ITANIUM_CLASS_MMSHF
! && insn_class != ITANIUM_CLASS_MMSHFI)
return 4;
return cost;
*************** ia64_internal_sched_reorder (dump, sched
*** 6185,6215 ****
dump_current_packet (dump);
}
if (reorder_type == 0 && clock_var > 0 && ia64_final_schedule)
{
for (insnp = ready; insnp < e_ready; insnp++)
{
! rtx insn = *insnp;
enum attr_itanium_class t = ia64_safe_itanium_class (insn);
! if (t == ITANIUM_CLASS_IALU || t == ITANIUM_CLASS_ISHF
! || t == ITANIUM_CLASS_ILOG
! || t == ITANIUM_CLASS_LD || t == ITANIUM_CLASS_ST)
! {
! rtx link;
! for (link = LOG_LINKS (insn); link; link = XEXP (link, 1))
! if (REG_NOTE_KIND (link) != REG_DEP_OUTPUT
! && REG_NOTE_KIND (link) != REG_DEP_ANTI)
{
! rtx other = XEXP (link, 0);
! enum attr_itanium_class t0 = ia64_safe_itanium_class (other);
! if (t0 == ITANIUM_CLASS_MMSHF
! || t0 == ITANIUM_CLASS_MMMUL)
! {
! nop_cycles_until (clock_var, sched_verbose ? dump : NULL);
! goto out;
! }
}
! }
}
}
out:
--- 6189,6222 ----
dump_current_packet (dump);
}
+ /* Work around the pipeline flush that will occur if the results of
+ an MM instruction are accessed before the result is ready. Intel
+ documentation says this only happens with IALU, ISHF, ILOG, LD,
+ and ST consumers, but experimental evidence shows that *any* non-MM
+ type instruction will incur the flush. */
if (reorder_type == 0 && clock_var > 0 && ia64_final_schedule)
{
for (insnp = ready; insnp < e_ready; insnp++)
{
! rtx insn = *insnp, link;
enum attr_itanium_class t = ia64_safe_itanium_class (insn);
!
! if (t == ITANIUM_CLASS_MMMUL
! || t == ITANIUM_CLASS_MMSHF
! || t == ITANIUM_CLASS_MMSHFI)
! continue;
!
! for (link = LOG_LINKS (insn); link; link = XEXP (link, 1))
! if (REG_NOTE_KIND (link) == 0)
! {
! rtx other = XEXP (link, 0);
! enum attr_itanium_class t0 = ia64_safe_itanium_class (other);
! if (t0 == ITANIUM_CLASS_MMSHF || t0 == ITANIUM_CLASS_MMMUL)
{
! nop_cycles_until (clock_var, sched_verbose ? dump : NULL);
! goto out;
}
! }
}
}
out: