This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
ia64: Better scheduling for shifts
- To: <gcc-patches at gcc dot gnu dot org>
- Subject: ia64: Better scheduling for shifts
- From: Bernd Schmidt <bernds at redhat dot com>
- Date: Fri, 20 Apr 2001 12:52:29 +0100 (BST)
The Itanium pipeline has a feature that requires some additional scheduling
code. If shifts (or other MM insns) are scheduled fewer than 4 cycles away
from the insn that uses the result, there is a 10-cycle stall. This means we
have to insert NOPs in certain cases.
Bootstrapped on ia64-linux (up to the usual libffi build failure), and tested
with SPEC95 (1-2% improvement).
Bernd
* ia64.h (MD_SCHED_REORDER, MD_SCHED_REORDER2): Pass CLOCK to called
function.
* ia64-protos.h (ia64_sched_reorder): Additional arg for clock.
* ia64.c (nop_cycles_until): New function.
(prev_cycle, prev_first, last_issued): New static variables.
(ia64_sched_reorder): Additional arg for clock.
On final scheduling pass, emit extra NOPs as needed.
Set prev_first and prev_cycle.
(ia64_sched_reorder2): Pass clock arg down to ia64_sched_reorder.
(ia64_variable_issue): Set last_issued.
Index: config/ia64/ia64-protos.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/ia64/ia64-protos.h,v
retrieving revision 1.27
diff -u -p -r1.27 ia64-protos.h
--- ia64-protos.h 2001/01/24 04:30:47 1.27
+++ ia64-protos.h 2001/04/20 11:51:19
@@ -97,7 +97,7 @@ extern int ia64_issue_rate PARAMS ((void
extern int ia64_adjust_cost PARAMS ((rtx, rtx, rtx, int));
extern void ia64_sched_init PARAMS ((FILE *, int, int));
extern void ia64_sched_finish PARAMS ((FILE *, int));
-extern int ia64_sched_reorder PARAMS ((FILE *, int, rtx *, int *, int));
+extern int ia64_sched_reorder PARAMS ((FILE *, int, rtx *, int *, int, int));
extern int ia64_sched_reorder2 PARAMS ((FILE *, int, rtx *, int *, int));
extern int ia64_variable_issue PARAMS ((FILE *, int, rtx, int));
#endif /* RTX_CODE */
Index: config/ia64/ia64.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/ia64/ia64.c,v
retrieving revision 1.89
diff -u -p -r1.89 ia64.c
--- ia64.c 2001/04/14 03:49:46 1.89
+++ ia64.c 2001/04/20 11:51:20
@@ -4901,6 +4901,7 @@ static void maybe_rotate PARAMS ((FILE *
static void finish_last_head PARAMS ((FILE *, int));
static void rotate_one_bundle PARAMS ((FILE *));
static void rotate_two_bundles PARAMS ((FILE *));
+static void nop_cycles_until PARAMS ((int, FILE *));
static void cycle_end_fill_slots PARAMS ((FILE *));
static int packet_matches_p PARAMS ((const struct ia64_packet *, int, int *));
static int get_split PARAMS ((const struct ia64_packet *, int));
@@ -5780,16 +5781,125 @@ maybe_rotate (dump)
sched_data.first_slot = sched_data.cur;
}
+/* The clock cycle when ia64_sched_reorder was last called. */
+static int prev_cycle;
+
+/* The first insn scheduled in the previous cycle. This is the saved
+ value of sched_data.first_slot. */
+static int prev_first;
+
+/* The last insn that has been scheduled. At the start of a new cycle
+ we know that we can emit new insns after it; the main scheduling code
+ has already emitted a cycle_display insn after it and is using that
+ as its current last insn. */
+static rtx last_issued;
+
+/* Emit NOPs to fill the delay between PREV_CYCLE and CLOCK_VAR. Used to
+ pad out the delay between MM (shifts, etc.) and integer operations. */
+
+static void
+nop_cycles_until (clock_var, dump)
+ int clock_var;
+ FILE *dump;
+{
+ int prev_clock = prev_cycle;
+ int cycles_left = clock_var - prev_clock;
+
+ /* Finish the previous cycle; pad it out with NOPs. */
+ if (sched_data.cur == 3)
+ {
+ rtx t = gen_insn_group_barrier (GEN_INT (3));
+ last_issued = emit_insn_after (t, last_issued);
+ maybe_rotate (dump);
+ }
+ else if (sched_data.cur > 0)
+ {
+ int need_stop = 0;
+ int split = itanium_split_issue (sched_data.packet, prev_first);
+
+ if (sched_data.cur < 3 && split > 3)
+ {
+ split = 3;
+ need_stop = 1;
+ }
+
+ if (split > sched_data.cur)
+ {
+ int i;
+ for (i = sched_data.cur; i < split; i++)
+ {
+ rtx t;
+
+ t = gen_nop_type (sched_data.packet->t[i]);
+ last_issued = emit_insn_after (t, last_issued);
+ sched_data.types[i] = sched_data.packet->t[sched_data.cur];
+ sched_data.insns[i] = last_issued;
+ sched_data.stopbit[i] = 0;
+ }
+ sched_data.cur = split;
+ }
+
+ if (! need_stop && sched_data.cur > 0 && sched_data.cur < 6
+ && cycles_left > 1)
+ {
+ int i;
+ for (i = sched_data.cur; i < 6; i++)
+ {
+ rtx t;
+
+ t = gen_nop_type (sched_data.packet->t[i]);
+ last_issued = emit_insn_after (t, last_issued);
+ sched_data.types[i] = sched_data.packet->t[sched_data.cur];
+ sched_data.insns[i] = last_issued;
+ sched_data.stopbit[i] = 0;
+ }
+ sched_data.cur = 6;
+ cycles_left--;
+ need_stop = 1;
+ }
+
+ if (need_stop || sched_data.cur == 6)
+ {
+ rtx t = gen_insn_group_barrier (GEN_INT (3));
+ last_issued = emit_insn_after (t, last_issued);
+ }
+ maybe_rotate (dump);
+ }
+
+ cycles_left--;
+ while (cycles_left > 0)
+ {
+ rtx t = gen_bundle_selector (GEN_INT (0));
+ last_issued = emit_insn_after (t, last_issued);
+ t = gen_nop_type (TYPE_M);
+ last_issued = emit_insn_after (t, last_issued);
+ t = gen_nop_type (TYPE_I);
+ last_issued = emit_insn_after (t, last_issued);
+ if (cycles_left > 1)
+ {
+ t = gen_insn_group_barrier (GEN_INT (2));
+ last_issued = emit_insn_after (t, last_issued);
+ cycles_left--;
+ }
+ t = gen_nop_type (TYPE_I);
+ last_issued = emit_insn_after (t, last_issued);
+ t = gen_insn_group_barrier (GEN_INT (3));
+ last_issued = emit_insn_after (t, last_issued);
+ cycles_left--;
+ }
+}
+
/* We are about to being issuing insns for this clock cycle.
Override the default sort algorithm to better slot instructions. */
int
-ia64_sched_reorder (dump, sched_verbose, ready, pn_ready, reorder_type)
+ia64_sched_reorder (dump, sched_verbose, ready, pn_ready,
+ reorder_type, clock_var)
FILE *dump ATTRIBUTE_UNUSED;
int sched_verbose ATTRIBUTE_UNUSED;
rtx *ready;
int *pn_ready;
- int reorder_type;
+ int reorder_type, clock_var;
{
int n_ready = *pn_ready;
rtx *e_ready = ready + n_ready;
@@ -5802,6 +5912,38 @@ ia64_sched_reorder (dump, sched_verbose,
dump_current_packet (dump);
}
+ if (reorder_type == 0 && clock_var > 0 && ia64_final_schedule)
+ {
+ for (insnp = ready; insnp < e_ready; insnp++)
+ {
+ rtx insn = *insnp;
+ enum attr_itanium_class t = ia64_safe_itanium_class (insn);
+ if (t == ITANIUM_CLASS_IALU || t == ITANIUM_CLASS_ISHF
+ || t == ITANIUM_CLASS_ILOG
+ || t == ITANIUM_CLASS_LD || t == ITANIUM_CLASS_ST)
+ {
+ rtx link;
+ for (link = LOG_LINKS (insn); link; link = XEXP (link, 1))
+ if (REG_NOTE_KIND (link) != REG_DEP_OUTPUT
+ && REG_NOTE_KIND (link) != REG_DEP_ANTI)
+ {
+ rtx other = XEXP (link, 0);
+ enum attr_itanium_class t0 = ia64_safe_itanium_class (other);
+ if (t0 == ITANIUM_CLASS_MMSHF
+ || t0 == ITANIUM_CLASS_MMMUL)
+ {
+ nop_cycles_until (clock_var, sched_verbose ? dump : NULL);
+ goto out;
+ }
+ }
+ }
+ }
+ }
+ out:
+
+ prev_first = sched_data.first_slot;
+ prev_cycle = clock_var;
+
if (reorder_type == 0)
maybe_rotate (sched_verbose ? dump : NULL);
@@ -5893,7 +6035,7 @@ ia64_sched_reorder2 (dump, sched_verbose
int sched_verbose ATTRIBUTE_UNUSED;
rtx *ready;
int *pn_ready;
- int clock_var ATTRIBUTE_UNUSED;
+ int clock_var;
{
if (sched_data.last_was_stop)
return 0;
@@ -5977,7 +6119,8 @@ ia64_sched_reorder2 (dump, sched_verbose
if (*pn_ready > 0)
{
- int more = ia64_sched_reorder (dump, sched_verbose, ready, pn_ready, 1);
+ int more = ia64_sched_reorder (dump, sched_verbose, ready, pn_ready, 1,
+ clock_var);
if (more)
return more;
/* Did we schedule a stop? If so, finish this cycle. */
@@ -6005,6 +6148,8 @@ ia64_variable_issue (dump, sched_verbose
int can_issue_more ATTRIBUTE_UNUSED;
{
enum attr_type t = ia64_safe_type (insn);
+
+ last_issued = insn;
if (sched_data.last_was_stop)
{
Index: config/ia64/ia64.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/ia64/ia64.h,v
retrieving revision 1.61
diff -u -p -r1.61 ia64.h
--- ia64.h 2001/03/16 05:21:42 1.61
+++ ia64.h 2001/04/20 11:51:21
@@ -2827,10 +2827,10 @@ do { \
ia64_sched_init (DUMP, SCHED_VERBOSE, MAX_READY)
#define MD_SCHED_REORDER(DUMP, SCHED_VERBOSE, READY, N_READY, CLOCK, CIM) \
- (CIM) = ia64_sched_reorder (DUMP, SCHED_VERBOSE, READY, &N_READY, 0)
+ (CIM) = ia64_sched_reorder (DUMP, SCHED_VERBOSE, READY, &N_READY, 0, CLOCK)
#define MD_SCHED_REORDER2(DUMP, SCHED_VERBOSE, READY, N_READY, CLOCK, CIM) \
- (CIM) = ia64_sched_reorder2 (DUMP, SCHED_VERBOSE, READY, &N_READY, 1)
+ (CIM) = ia64_sched_reorder2 (DUMP, SCHED_VERBOSE, READY, &N_READY, CLOCK)
#define MD_SCHED_FINISH(DUMP, SCHED_VERBOSE) \
ia64_sched_finish (DUMP, SCHED_VERBOSE)