This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Fix for scheduling on Athlon/K6/PPC
- To: gcc-patches at gcc dot gnu dot org, rth at cygnus dot com, patches at x86-64 dot org
- Subject: Fix for scheduling on Athlon/K6/PPC
- From: Jan Hubicka <jh at suse dot cz>
- Date: Tue, 14 Aug 2001 16:27:16 +0200
Hi,
For Athlon scheduling, it appears to be critical to show the scheduler
that a load followed by a load-and-execute instruction can hide its latency,
when the operand is not used in the address of the next instruction, by executing
both loads in parallel.
This is a common case in the internal loops of matrix multiplications etc., which
basically load two memory locations, do simple arithmetic and store the result.
Currently the scheduler tends to schedule many loads first, then many
load-and-execute instructions (as it thinks it needs to wait for the results of the
loads). This causes the execution units to starve, as the operands are loaded
in the wrong order.
I made an equivalent patch some time ago, but I hope that this time I will
have better luck.
Tue Aug 14 16:21:21 CEST 2001 Jan Hubicka <jh@suse.cz>
	* i386.c (ix86_agi_dependant): Lea causes AGI only on the Pentium
(ix86_adjust_cost): Teach scheduler that latency to load operand can
be masked.
*** i386.c.orig Sun Aug 5 17:58:25 2001
--- i386.c Mon Aug 13 13:54:00 2001
*************** ix86_agi_dependant (insn, dep_insn, insn
*** 8410,8416 ****
{
rtx addr;
! if (insn_type == TYPE_LEA)
{
addr = PATTERN (insn);
if (GET_CODE (addr) == SET)
--- 8446,8453 ----
{
rtx addr;
! if (insn_type == TYPE_LEA
! && TARGET_PENTIUM)
{
addr = PATTERN (insn);
if (GET_CODE (addr) == SET)
*************** ix86_adjust_cost (insn, link, dep_insn,
*** 8445,8451 ****
int cost;
{
enum attr_type insn_type, dep_insn_type;
! enum attr_memory memory;
rtx set, set2;
int dep_insn_code_number;
--- 8482,8488 ----
int cost;
{
enum attr_type insn_type, dep_insn_type;
! enum attr_memory memory, dep_memory;
rtx set, set2;
int dep_insn_code_number;
*************** ix86_adjust_cost (insn, link, dep_insn,
*** 8481,8492 ****
break;
case PROCESSOR_PENTIUMPRO:
/* Since we can't represent delayed latencies of load+operation,
increase the cost here for non-imov insns. */
if (dep_insn_type != TYPE_IMOV
! && dep_insn_type != TYPE_FMOV
! && ((memory = get_attr_memory (dep_insn) == MEMORY_LOAD)
! || memory == MEMORY_BOTH))
cost += 1;
/* INT->FP conversion is expensive. */
--- 8518,8531 ----
break;
case PROCESSOR_PENTIUMPRO:
+ memory = get_attr_memory (insn);
+ dep_memory = get_attr_memory (dep_insn);
+
/* Since we can't represent delayed latencies of load+operation,
increase the cost here for non-imov insns. */
if (dep_insn_type != TYPE_IMOV
! && dep_insn_type != TYPE_FMOV
! && (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH))
cost += 1;
/* INT->FP conversion is expensive. */
*************** ix86_adjust_cost (insn, link, dep_insn,
*** 8500,8534 ****
&& rtx_equal_p (SET_DEST (set), SET_SRC (set2))
&& GET_CODE (SET_DEST (set2)) == MEM)
cost += 1;
break;
case PROCESSOR_K6:
/* The esp dependency is resolved before the instruction is really
finished. */
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
return 1;
/* Since we can't represent delayed latencies of load+operation,
increase the cost here for non-imov insns. */
! if ((memory = get_attr_memory (dep_insn) == MEMORY_LOAD)
! || memory == MEMORY_BOTH)
cost += (dep_insn_type != TYPE_IMOV) ? 2 : 1;
/* INT->FP conversion is expensive. */
if (get_attr_fp_int_src (dep_insn))
cost += 5;
break;
case PROCESSOR_ATHLON:
! if ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD
! || memory == MEMORY_BOTH)
{
if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV)
cost += 2;
else
cost += 3;
}
default:
break;
--- 8539,8627 ----
&& rtx_equal_p (SET_DEST (set), SET_SRC (set2))
&& GET_CODE (SET_DEST (set2)) == MEM)
cost += 1;
+
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependant (insn, dep_insn, insn_type))
+ {
+ /* Claim moves to take one cycle, as core can issue one load
+ at time and the next load can start cycle later. */
+ if (dep_insn_type == TYPE_IMOV
+ || dep_insn_type == TYPE_FMOV
+ || dep_insn_type == TYPE_SSEMOV)
+ cost = 1;
+ else if (cost > 1)
+ cost--;
+ }
break;
case PROCESSOR_K6:
+ memory = get_attr_memory (insn);
+ dep_memory = get_attr_memory (dep_insn);
/* The esp dependency is resolved before the instruction is really
finished. */
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
return 1;
/* Since we can't represent delayed latencies of load+operation,
increase the cost here for non-imov insns. */
! if (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH)
cost += (dep_insn_type != TYPE_IMOV) ? 2 : 1;
/* INT->FP conversion is expensive. */
if (get_attr_fp_int_src (dep_insn))
cost += 5;
+
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependant (insn, dep_insn, insn_type))
+ {
+ /* Claim moves to take one cycle, as core can issue one load
+ at time and the next load can start cycle later. */
+ if (dep_insn_type == TYPE_IMOV
+ || dep_insn_type == TYPE_FMOV
+ || dep_insn_type == TYPE_SSEMOV)
+ cost = 1;
+ else if (cost > 2)
+ cost -= 2;
+ else
+ cost = 1;
+ }
break;
case PROCESSOR_ATHLON:
! memory = get_attr_memory (insn);
! dep_memory = get_attr_memory (dep_insn);
!
! if (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH)
{
if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV)
cost += 2;
else
cost += 3;
}
+ /* Show ability of reorder buffer to hide latency of load by executing
+ in parallel with previous instruction in case
+ previous instruction is not needed to compute the address. */
+ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ && !ix86_agi_dependant (insn, dep_insn, insn_type))
+ {
+ /* Claim moves to take one cycle, as core can issue one load
+ at time and the next load can start cycle later. */
+ if (dep_insn_type == TYPE_IMOV
+ || dep_insn_type == TYPE_FMOV
+ || dep_insn_type == TYPE_SSEMOV)
+ cost = 0;
+ else if (cost >= 3)
+ cost -= 3;
+ else
+ cost = 0;
+ }
default:
break;