This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[itanium-sched-branch] Patch for one more improvement of itanium2 gcc
- From: Vladimir Makarov <vmakarov at redhat dot com>
- To: gcc-patches at gcc dot gnu dot org
- Date: Tue, 19 Nov 2002 17:52:01 -0500
- Subject: [itanium-sched-branch] Patch for one more improvement of itanium2 gcc
The following patch significantly increases the compiler speed for
Itanium2. I've just committed it into the branch.
Now the Itanium2 gcc on itanium-sched-branch is 55% faster than
the one on itanium-sched-branchpoint (it was 20% before the patch). The
current state of the compiler on the branch is
Itanium (733Mhz)
branchpoint branch speedup
SPECInt2000 (without eon)
compilation (user) time 28m30.674s 19m24.990s 46.8%
SPECINT2000 branchpoint branch
-------------------------------------------------------------------------
164.gzip 1400 797 176* 1400 792 177*
175.vpr 1400 731 192* 1400 689 203*
176.gcc 1100 467 236* 1100 468 235*
181.mcf 1800 1271 142* 1800 1253 144*
186.crafty 1000 403 248* 1000 411 243*
197.parser 1800 1074 168* 1800 1053 171*
252.eon 1300 870 149* 1300 887 147*
253.perlbmk 1800 895 201* 1800 871 207*
254.gap 1100 675 163* 1100 660 167*
255.vortex 1900 819 232* 1900 814 233*
256.bzip2 1500 823 182* 1500 799 188*
300.twolf 3000 1212 247* 3000 1131 265*
Est. SPECint_base2000 191
Est. SPECint2000 195
Itanium2 (900Mhz)
branchpoint branch speedup
SPECInt2000 (without eon)
compilation (user) time 14m1.021s 9m3.559s 54.7%
SPECINT2000 branchpoint branch(-mcpu=itanium2)
-------------------------------------------------------------------------
164.gzip 1400 406 345* 1400 388 361*
175.vpr 1400 316 444* 1400 308 454*
176.gcc 1100 239 460* 1100 231 477*
181.mcf 1800 714 252* 1800 722 249*
186.crafty 1000 208 480* 1000 201 497*
197.parser 1800 492 366* 1800 489 368*
252.eon 1300 475 274* 1300 476 273*
253.perlbmk 1800 401 449* 1800 388 463*
254.gap 1100 337 326* 1100 332 331*
255.vortex 1900 373 509* 1900 371 512*
256.bzip2 1500 414 362* 1500 399 376*
300.twolf 3000 592 506* 3000 536 559*
Est. SPECint_base2000 388
Est. SPECint2000 399
GCC for Itanium2 is still slower than the Intel compiler (the Intel
compiler compiles the SPECInt2000 tests, except eon, in 8m45.485s),
but it is very close.
I don't see any further opportunities to speed up the insn scheduling
and bundling. I'll prepare the branch for review and merge it into the
mainline.
2002-11-19 Vladimir Makarov <vmakarov@redhat.com>
* haifa-sched.c (choice_entry): New structure.
(choice_stack, cycle_issued_insns): New variables.
(max_issue): Rewrite it.
(choose_ready): Set up ready_try for unknown insns too.
(schedule_block): Allocate and free choice_stack. Set up
and modify cycle_issued_insns.
* config/ia64/ia64.c (issue_nops_and_insn): Combine insn issue
with and without filling the bundle.
(bundling): Combine calls of issue_nops_and_insn.
Vlad
Index: haifa-sched.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/haifa-sched.c,v
retrieving revision 1.211.10.3
diff -c -p -r1.211.10.3 haifa-sched.c
*** haifa-sched.c 2 Oct 2002 14:27:55 -0000 1.211.10.3
--- haifa-sched.c 19 Nov 2002 19:58:13 -0000
*************** static rtx move_insn PARAMS ((rtx, rtx))
*** 364,370 ****
on the first cycle. It is used only for DFA based scheduler. */
static rtx ready_element PARAMS ((struct ready_list *, int));
static rtx ready_remove PARAMS ((struct ready_list *, int));
! static int max_issue PARAMS ((struct ready_list *, state_t, int *));
static rtx choose_ready PARAMS ((struct ready_list *));
--- 364,370 ----
on the first cycle. It is used only for DFA based scheduler. */
static rtx ready_element PARAMS ((struct ready_list *, int));
static rtx ready_remove PARAMS ((struct ready_list *, int));
! static int max_issue PARAMS ((struct ready_list *, int *));
static rtx choose_ready PARAMS ((struct ready_list *));
*************** ready_element (ready, index)
*** 1015,1022 ****
--- 1015,1024 ----
struct ready_list *ready;
int index;
{
+ #ifdef ENABLE_CHECKING
if (ready->n_ready == 0 || index >= ready->n_ready)
abort ();
+ #endif
return ready->vec[ready->first - index];
}
*************** move_insn (insn, last)
*** 1757,1843 ****
return retval;
}
/* The following function returns maximal (or close to maximal) number
of insns which can be issued on the same cycle and one of which
! insns is insns with the best rank (the last insn in READY). To
make this function tries different samples of ready insns. READY
is current queue `ready'. Global array READY_TRY reflects what
! insns are already issued in this try. STATE is current processor
! state. If the function returns nonzero, INDEX will contain index
of the best insn in READY. The following function is used only for
first cycle multipass scheduling. */
-
static int
! max_issue (ready, state, index)
! struct ready_list *ready;
! state_t state;
! int *index;
{
! int i, best, n, temp_index, delay;
! state_t temp_state;
rtx insn;
- int max_lookahead = (*targetm.sched.first_cycle_multipass_dfa_lookahead) ();
! if (state_dead_lock_p (state))
! return 0;
!
! temp_state = alloca (dfa_state_size);
best = 0;
!
! for (i = 0; i < ready->n_ready; i++)
if (!ready_try [i])
! {
! insn = ready_element (ready, i);
!
! if (INSN_CODE (insn) < 0)
! continue;
!
! memcpy (temp_state, state, dfa_state_size);
!
! delay = state_transition (temp_state, insn);
!
! if (delay == 0)
! {
! if (!targetm.sched.dfa_bubble)
! continue;
! else
! {
! int j;
! rtx bubble;
!
! for (j = 0;
! (bubble = (*targetm.sched.dfa_bubble) (j)) != NULL_RTX;
! j++)
! if (state_transition (temp_state, bubble) < 0
! && state_transition (temp_state, insn) < 0)
! break;
!
! if (bubble == NULL_RTX)
! continue;
! }
! }
! else if (delay > 0)
! continue;
!
! --max_lookahead;
!
! if (max_lookahead < 0)
! break;
!
! ready_try [i] = 1;
!
! n = max_issue (ready, temp_state, &temp_index);
! if (n > 0 || ready_try[0])
! n += 1;
!
! if (best < n)
! {
! best = n;
! *index = i;
! }
! ready_try [i] = 0;
! }
!
return best;
}
--- 1759,1862 ----
return retval;
}
+ /* The following structure describe an entry of the stack of choices. */
+ struct choice_entry
+ {
+ /* Ordinal number of the issued insn in the ready queue. */
+ int index;
+ /* The number of the rest insns whose issues we should try. */
+ int rest;
+ /* The number of issued essential insns. */
+ int n;
+ /* State after issuing the insn. */
+ state_t state;
+ };
+
+ /* The following array is used to implement a stack of choices used in
+ function max_issue. */
+ static struct choice_entry *choice_stack;
+
+ /* The following variable value is number of essential insns issued on
+ the current cycle. An insn is essential one if it changes the
+ processors state. */
+ static int cycle_issued_insns;
+
/* The following function returns maximal (or close to maximal) number
of insns which can be issued on the same cycle and one of which
! insns is insns with the best rank (the first insn in READY). To
make this function tries different samples of ready insns. READY
is current queue `ready'. Global array READY_TRY reflects what
! insns are already issued in this try. INDEX will contain index
of the best insn in READY. The following function is used only for
first cycle multipass scheduling. */
static int
! max_issue (ready, index)
! struct ready_list *ready;
! int *index;
{
! int n, i, all, n_ready, lookahead, best, delay;
! struct choice_entry *top;
rtx insn;
! lookahead = (*targetm.sched.first_cycle_multipass_dfa_lookahead) ();
best = 0;
! memcpy (choice_stack->state, curr_state, dfa_state_size);
! top = choice_stack;
! top->rest = lookahead;
! top->n = 0;
! n_ready = ready->n_ready;
! for (all = i = 0; i < n_ready; i++)
if (!ready_try [i])
! all++;
! i = 0;
! for (;;)
! {
! if (top->rest == 0 || i >= n_ready)
! {
! if (top == choice_stack)
! break;
! if (best < top - choice_stack && ready_try [0])
! {
! best = top - choice_stack;
! *index = choice_stack [1].index;
! if (top->n == issue_rate - cycle_issued_insns || best == all)
! break;
! }
! i = top->index;
! ready_try [i] = 0;
! top--;
! memcpy (curr_state, top->state, dfa_state_size);
! }
! else if (!ready_try [i])
! {
! insn = ready_element (ready, i);
! delay = state_transition (curr_state, insn);
! if (delay < 0)
! {
! if (state_dead_lock_p (curr_state))
! top->rest = 0;
! else
! top->rest--;
! n = top->n;
! if (memcmp (top->state, curr_state, dfa_state_size) != 0)
! n++;
! top++;
! top->rest = lookahead;
! top->index = i;
! top->n = n;
! memcpy (top->state, curr_state, dfa_state_size);
! ready_try [i] = 1;
! i = -1;
! }
! }
! i++;
! }
! while (top != choice_stack)
! {
! ready_try [top->index] = 0;
! top--;
! }
! memcpy (curr_state, choice_stack->state, dfa_state_size);
return best;
}
*************** choose_ready (ready)
*** 1858,1872 ****
int index, i;
rtx insn;
! if (targetm.sched.first_cycle_multipass_dfa_lookahead_guard)
! for (i = 1; i < ready->n_ready; i++)
! {
! insn = ready_element (ready, i);
! ready_try [i]
! = !((*targetm.sched.first_cycle_multipass_dfa_lookahead_guard)
! (insn));
! }
! if (max_issue (ready, curr_state, &index) == 0)
return ready_remove_first (ready);
else
return ready_remove (ready, index);
--- 1877,1894 ----
int index, i;
rtx insn;
! insn = ready_element (ready, 0);
! if (INSN_CODE (insn) < 0)
! return ready_remove_first (ready);
! for (i = 1; i < ready->n_ready; i++)
! {
! insn = ready_element (ready, i);
! ready_try [i]
! = (INSN_CODE (insn) < 0
! || (targetm.sched.first_cycle_multipass_dfa_lookahead_guard
! && !(*targetm.sched.first_cycle_multipass_dfa_lookahead_guard) (insn)));
! }
! if (max_issue (ready, &index) == 0)
return ready_remove_first (ready);
else
return ready_remove (ready, index);
*************** schedule_block (b, rgn_n_insns)
*** 1894,1900 ****
int rgn_n_insns;
{
struct ready_list ready;
! int first_cycle_insn_p;
int can_issue_more;
state_t temp_state = NULL; /* It is used for multipass scheduling. */
int sort_p;
--- 1916,1922 ----
int rgn_n_insns;
{
struct ready_list ready;
! int i, first_cycle_insn_p;
int can_issue_more;
state_t temp_state = NULL; /* It is used for multipass scheduling. */
int sort_p;
*************** schedule_block (b, rgn_n_insns)
*** 1949,1954 ****
--- 1971,1981 ----
temp_state = alloca (dfa_state_size);
ready_try = (char *) xmalloc ((rgn_n_insns + 1) * sizeof (char));
memset (ready_try, 0, (rgn_n_insns + 1) * sizeof (char));
+ choice_stack
+ = (struct choice_entry *) xmalloc ((rgn_n_insns + 1)
+ * sizeof (struct choice_entry));
+ for (i = 0; i <= rgn_n_insns; i++)
+ choice_stack[i].state = (state_t) xmalloc (dfa_state_size);
}
(*current_sched_info->init_ready_list) (&ready);
*************** schedule_block (b, rgn_n_insns)
*** 2023,2028 ****
--- 2050,2056 ----
can_issue_more = issue_rate;
first_cycle_insn_p = 1;
+ cycle_issued_insns = 0;
for (;;)
{
rtx insn;
*************** schedule_block (b, rgn_n_insns)
*** 2170,2176 ****
if (targetm.sched.use_dfa_pipeline_interface
&& (*targetm.sched.use_dfa_pipeline_interface) ())
! memcpy (curr_state, temp_state, dfa_state_size);
if (targetm.sched.variable_issue)
can_issue_more =
--- 2198,2208 ----
if (targetm.sched.use_dfa_pipeline_interface
&& (*targetm.sched.use_dfa_pipeline_interface) ())
! {
! if (memcmp (curr_state, temp_state, dfa_state_size) != 0)
! cycle_issued_insns++;
! memcpy (curr_state, temp_state, dfa_state_size);
! }
if (targetm.sched.variable_issue)
can_issue_more =
*************** schedule_block (b, rgn_n_insns)
*** 2286,2292 ****
if (targetm.sched.use_dfa_pipeline_interface
&& (*targetm.sched.use_dfa_pipeline_interface) ())
! free (ready_try);
}
/* Set_priorities: compute priority of each insn in the block. */
--- 2318,2329 ----
if (targetm.sched.use_dfa_pipeline_interface
&& (*targetm.sched.use_dfa_pipeline_interface) ())
! {
! free (ready_try);
! for (i = 0; i <= rgn_n_insns; i++)
! free (choice_stack [i].state);
! free (choice_stack);
! }
}
/* Set_priorities: compute priority of each insn in the block. */
Index: config/ia64/ia64.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/ia64/ia64.c,v
retrieving revision 1.184.4.9
diff -c -p -r1.184.4.9 ia64.c
*** config/ia64/ia64.c 17 Oct 2002 22:28:37 -0000 1.184.4.9
--- config/ia64/ia64.c 19 Nov 2002 19:58:13 -0000
*************** static void finish_bundle_state_table PA
*** 193,199 ****
static int try_issue_nops PARAMS ((struct bundle_state *, int));
static int try_issue_insn PARAMS ((struct bundle_state *, rtx));
static void issue_nops_and_insn PARAMS ((struct bundle_state *, int,
! rtx, int));
static int get_max_pos PARAMS ((state_t));
static int get_template PARAMS ((state_t, int));
--- 193,199 ----
static int try_issue_nops PARAMS ((struct bundle_state *, int));
static int try_issue_insn PARAMS ((struct bundle_state *, rtx));
static void issue_nops_and_insn PARAMS ((struct bundle_state *, int,
! rtx, int));
static int get_max_pos PARAMS ((state_t));
static int get_template PARAMS ((state_t, int));
*************** insert_bundle_state (bundle_state)
*** 6043,6049 ****
*bundle_state = temp;
}
return FALSE;
-
}
/* Start work with the hash table. */
--- 6043,6048 ----
*************** try_issue_insn (curr_state, insn)
*** 6109,6117 ****
/* The following function tries to issue BEFORE_NOPS_NUM nops and INSN
starting with ORIGINATOR without advancing processor cycle. If
! TRY_BUNDLE_END_P is TRUE, the function tries to issue nops to fill
! all bundle. If it was successful, the function creates new bundle
! state and insert into the hash table and into
`index_to_bundle_states'. */
static void
--- 6108,6116 ----
/* The following function tries to issue BEFORE_NOPS_NUM nops and INSN
starting with ORIGINATOR without advancing processor cycle. If
! TRY_BUNDLE_END_P is TRUE, the function also tries to issue nops to
! fill all bundle. If it was successful, the function creates new
! bundle state and insert into the hash table and into
`index_to_bundle_states'. */
static void
*************** issue_nops_and_insn (originator, before_
*** 6182,6193 ****
if (ia64_safe_type (insn) == TYPE_B)
curr_state->branch_deviation
+= 2 - (curr_state->accumulated_insns_num - 1) % 3;
! if (try_bundle_end_p)
{
! if (curr_state->accumulated_insns_num % 3 == 0)
{
! free_bundle_state (curr_state);
! return;
}
if (!try_issue_nops (curr_state,
3 - curr_state->accumulated_insns_num % 3))
--- 6181,6203 ----
if (ia64_safe_type (insn) == TYPE_B)
curr_state->branch_deviation
+= 2 - (curr_state->accumulated_insns_num - 1) % 3;
! if (try_bundle_end_p && curr_state->accumulated_insns_num % 3 != 0)
{
! if (insert_bundle_state (curr_state))
{
! state_t dfa_state;
! struct bundle_state *curr_state1;
! struct bundle_state *allocated_states_chain;
!
! curr_state1 = get_free_bundle_state ();
! dfa_state = curr_state1->dfa_state;
! allocated_states_chain = curr_state1->allocated_states_chain;
! *curr_state1 = *curr_state;
! curr_state1->dfa_state = dfa_state;
! curr_state1->allocated_states_chain = allocated_states_chain;
! memcpy (curr_state1->dfa_state, curr_state->dfa_state,
! dfa_state_size);
! curr_state = curr_state1;
}
if (!try_issue_nops (curr_state,
3 - curr_state->accumulated_insns_num % 3))
*************** issue_nops_and_insn (originator, before_
*** 6198,6207 ****
+= 3 - curr_state->accumulated_insns_num % 3;
}
if (!insert_bundle_state (curr_state))
! {
! free_bundle_state (curr_state);
! return;
! }
}
/* The following function returns position in the two window bundle
--- 6208,6215 ----
+= 3 - curr_state->accumulated_insns_num % 3;
}
if (!insert_bundle_state (curr_state))
! free_bundle_state (curr_state);
! return;
}
/* The following function returns position in the two window bundle
*************** bundling (dump, verbose, prev_head_insn,
*** 6322,6331 ****
struct bundle_state *curr_state, *next_state, *best_state;
rtx insn, next_insn;
int insn_num;
! int i;
int pos, max_pos, template0, template1;
rtx b;
rtx nop;
insn_num = 0;
for (insn = NEXT_INSN (prev_head_insn);
--- 6330,6340 ----
struct bundle_state *curr_state, *next_state, *best_state;
rtx insn, next_insn;
int insn_num;
! int i, bundle_end_p;
int pos, max_pos, template0, template1;
rtx b;
rtx nop;
+ enum attr_type type;
insn_num = 0;
for (insn = NEXT_INSN (prev_head_insn);
*************** bundling (dump, verbose, prev_head_insn,
*** 6392,6415 ****
curr_state != NULL;
curr_state = next_state)
{
next_state = curr_state->next;
! if (next_insn == NULL_RTX
! || (GET_MODE (next_insn) == TImode
! && INSN_CODE (insn) != CODE_FOR_insn_group_barrier))
! {
! if (ia64_safe_type (insn) == TYPE_F
! || ia64_safe_type (insn) == TYPE_L)
! issue_nops_and_insn (curr_state, 2, insn, TRUE);
! issue_nops_and_insn (curr_state, 1, insn, TRUE);
! issue_nops_and_insn (curr_state, 0, insn, TRUE);
! }
! if (ia64_safe_type (insn) == TYPE_F
! || ia64_safe_type (insn) == TYPE_B
! || ia64_safe_type (insn) == TYPE_L
! || ia64_safe_type (insn) == TYPE_S)
! issue_nops_and_insn (curr_state, 2, insn, FALSE);
! issue_nops_and_insn (curr_state, 1, insn, FALSE);
! issue_nops_and_insn (curr_state, 0, insn, FALSE);
}
if (index_to_bundle_states [insn_num] == NULL)
abort ();
--- 6401,6418 ----
curr_state != NULL;
curr_state = next_state)
{
+ pos = curr_state->accumulated_insns_num % 3;
+ type = ia64_safe_type (insn);
next_state = curr_state->next;
! bundle_end_p
! = (next_insn == NULL_RTX
! || (GET_MODE (next_insn) == TImode
! && INSN_CODE (insn) != CODE_FOR_insn_group_barrier));
! if (type == TYPE_F || type == TYPE_B || type == TYPE_L
! || type == TYPE_S)
! issue_nops_and_insn (curr_state, 2, insn, bundle_end_p);
! issue_nops_and_insn (curr_state, 1, insn, bundle_end_p);
! issue_nops_and_insn (curr_state, 0, insn, bundle_end_p);
}
if (index_to_bundle_states [insn_num] == NULL)
abort ();