This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[patch] sh/local-alloc: delay slot filling improvement (2)
- From: Christian BRUEL <christian dot bruel at st dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: kkojima at gcc dot gnu dot org, aoliva at redhat dot com
- Date: Thu, 22 Mar 2007 14:10:38 +0100
- Subject: [patch] sh/local-alloc: delay slot filling improvement (2)
The SH4 is missing delay slot filling opportunities with jump statements.
For example:
int foo;
extern void t2();
void t1(void)
{
foo = 2;
t2();
}
compiled with -O2 -fomit-frame-pointer
_t1:
mov.l .L3,r2
mov #2,r1
mov.l r1,@r2
mov.l .L4,r1
jmp @r1
nop
with the proposed fix I have now:
_t1:
mov.l .L3,r2
mov #2,r1
mov.l .L4,r3
jmp @r3
mov.l r1,@r2
The problem came from a false data dependency introduced by the jump
instruction. On SH, the jump is indirect and thus needs a register allocated
to hold the target address. The proposed patch intends to remove this
false data dependency by artificially moving the quantity's birth earlier,
thus extending backward the live range of the register used in the call.
The fear of course is an additional register pressure introduced by the
new interference, but on a larger scale I measured a geometric mean
improvement of 1.23 % (sh-300 with perfect icache) on eembc and mibench.
Bootstrapped and tested for non-regression on i686-pc-linux-gnu; tested for
non-regression on sh-superh-elf.
2007-02-20 Christian Bruel <christian.bruel@st.com>
* local-alloc.c: (struct qty): Add has_delay_slot field.
(alloc_qty): Initialize has_delay_slot.
(block_alloc): Increase birth range if has_delay_slot.
2007-02-20 Christian Bruel <christian.bruel@st.com>
* local-alloc.c: (struct qty): Add has_delay_slot field.
(alloc_qty): Initialize has_delay_slot.
(block_alloc): Increase birth range if has_delay_slot.
Index: gcc/local-alloc.c
===================================================================
--- gcc/local-alloc.c (.../vendor/gcc) (revision 132)
+++ gcc/local-alloc.c (.../branches/4.3_devs_bugs) (revision 132)
@@ -102,6 +102,10 @@
int birth;
+#ifdef DELAY_SLOTS
+ int has_delay_slot;
+#endif
+
/* Insn number (counting from head of basic block)
where given quantity died. Due to the way tying is done,
and the fact that we consider in this pass only regs that die but once,
@@ -329,6 +333,9 @@
qty[qtyno].size = size;
qty[qtyno].mode = mode;
qty[qtyno].birth = birth;
+#ifdef DELAY_SLOTS
+ qty[qtyno].has_delay_slot = 0;
+#endif
qty[qtyno].n_calls_crossed = REG_N_CALLS_CROSSED (regno);
qty[qtyno].n_throwing_calls_crossed = REG_N_THROWING_CALLS_CROSSED (regno);
qty[qtyno].min_class = reg_preferred_class (regno);
@@ -1515,8 +1522,22 @@
&& (no_conflict_combined_regno != (int) REGNO (XEXP (link, 0))
|| ! find_reg_note (insn, REG_NO_CONFLICT,
XEXP (link, 0))))
+ {
wipe_dead_reg (XEXP (link, 0), 0);
+#ifdef DELAY_SLOTS
+ if (flag_delayed_branch && !NONJUMP_INSN_P (insn) &&
+ num_delay_slots (insn) > 0)
+ {
+ int regno = REGNO (XEXP (link, 0));
+
+ if (REG_BASIC_BLOCK (regno) >= 0 && REG_N_DEATHS (regno) == 1
+ && reg_qty[regno] >= 0)
+ qty[reg_qty[regno]].has_delay_slot = 1;
+ }
+#endif
+ }
+
/* Allocate qty numbers for all registers local to this block
that are born (set) in this instruction.
A pseudo that already has a qty is not changed. */
@@ -1678,6 +1699,9 @@
int fake_death = MIN (insn_number * 2 + 1,
qty[q].death + 2 - qty[q].death % 2);
#endif
+#ifdef DELAY_SLOTS
+ int fake_birth2 = MAX (0, qty[q].birth - 3 + qty[q].birth % 2);
+#endif
if (N_REG_CLASSES > 1)
{
@@ -1700,6 +1724,17 @@
continue;
}
#endif
+#ifdef DELAY_SLOTS
+ if (qty[q].has_delay_slot)
+ {
+ qty[q].phys_reg = find_free_reg (qty[q].min_class,
+ qty[q].mode, q, 0, 0,
+ fake_birth2, qty[q].death);
+ if (qty[q].phys_reg >= 0)
+ continue;
+ }
+#endif
+
qty[q].phys_reg = find_free_reg (qty[q].min_class,
qty[q].mode, q, 0, 0,
qty[q].birth, qty[q].death);
@@ -1716,7 +1751,15 @@
qty[q].phys_reg = find_free_reg (qty[q].alternate_class,
qty[q].mode, q, 0, 0,
fake_birth, fake_death);
+ else
#endif
+#ifdef DELAY_SLOTS
+ if (qty[q].has_delay_slot)
+ qty[q].phys_reg = find_free_reg (qty[q].alternate_class,
+ qty[q].mode, q, 0, 0,
+ fake_birth2, qty[q].death);
+ else
+#endif
if (qty[q].alternate_class != NO_REGS)
qty[q].phys_reg = find_free_reg (qty[q].alternate_class,
qty[q].mode, q, 0, 0,