This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[patch] sh/local-alloc: delay slot filling improvement (2)


The SH4 is missing delay slot filling opportunities with jump statements.

for example:

int foo;

extern void t2();

void t1(void)
{
  foo = 2;

  t2();
}

compiled with -O2 -fomit-frame-pointer

_t1:
        mov.l   .L3,r2
        mov     #2,r1
        mov.l   r1,@r2
        mov.l   .L4,r1
        jmp     @r1
        nop

with the proposed fix I have now:

_t1:
        mov.l   .L3,r2
        mov     #2,r1
        mov.l   .L4,r3
        jmp     @r3
        mov.l   r1,@r2

The problem came from a false data dependency introduced by the jump instruction. On the SH4 the jump is indirect and thus needs a register allocated to hold the target address. The proposed patch intends to remove this false data dependency by artificially moving the quantity's birth earlier, thus extending backward the live range of the register used in the call.

The fear, of course, is additional register pressure introduced by the new interference, but on a larger scale I measured a geometric mean improvement of 1.23% (sh-300 with perfect icache) on EEMBC and MiBench.

Bootstrapped and tested for non-regression on i686-pc-linux-gnu; tested for non-regression on sh-superh-elf.

2007-02-20 Christian Bruel <christian.bruel@st.com>

	    * local-alloc.c: (struct qty): Add has_delay_slot field.
	    (alloc_qty): Initialize has_delay_slot.
	    (block_alloc): Increase birth range if has_delay_slot.
2007-02-20  Christian Bruel  <christian.bruel@st.com>

	    * local-alloc.c: (struct qty): Add has_delay_slot field.
	    (alloc_qty): Initialize has_delay_slot.
	    (block_alloc): Increase birth range if has_delay_slot.

Index: gcc/local-alloc.c
===================================================================
--- gcc/local-alloc.c	(.../vendor/gcc)	(revision 132)
+++ gcc/local-alloc.c	(.../branches/4.3_devs_bugs)	(revision 132)
@@ -102,6 +102,10 @@
 
   int birth;
 
+#ifdef DELAY_SLOTS
+  int has_delay_slot;
+#endif
+
   /* Insn number (counting from head of basic block)
      where given quantity died.  Due to the way tying is done,
      and the fact that we consider in this pass only regs that die but once,
@@ -329,6 +333,9 @@
   qty[qtyno].size = size;
   qty[qtyno].mode = mode;
   qty[qtyno].birth = birth;
+#ifdef DELAY_SLOTS
+  qty[qtyno].has_delay_slot = 0;
+#endif
   qty[qtyno].n_calls_crossed = REG_N_CALLS_CROSSED (regno);
   qty[qtyno].n_throwing_calls_crossed = REG_N_THROWING_CALLS_CROSSED (regno);
   qty[qtyno].min_class = reg_preferred_class (regno);
@@ -1515,8 +1522,22 @@
 		&& (no_conflict_combined_regno != (int) REGNO (XEXP (link, 0))
 		    || ! find_reg_note (insn, REG_NO_CONFLICT,
 					XEXP (link, 0))))
+	      {
 	      wipe_dead_reg (XEXP (link, 0), 0);
 
+#ifdef DELAY_SLOTS
+		if (flag_delayed_branch && !NONJUMP_INSN_P (insn) &&
+		    num_delay_slots (insn) > 0)
+		  {
+		    int regno =  REGNO (XEXP (link, 0));
+
+		    if (REG_BASIC_BLOCK (regno) >= 0 && REG_N_DEATHS (regno) == 1
+			&& reg_qty[regno] >= 0)
+		      qty[reg_qty[regno]].has_delay_slot = 1;
+		  }
+#endif
+	    }
+
 	  /* Allocate qty numbers for all registers local to this block
 	     that are born (set) in this instruction.
 	     A pseudo that already has a qty is not changed.  */
@@ -1678,6 +1699,9 @@
 	  int fake_death = MIN (insn_number * 2 + 1,
 				qty[q].death + 2 - qty[q].death % 2);
 #endif
+#ifdef DELAY_SLOTS
+	  int fake_birth2 = MAX (0, qty[q].birth - 3 + qty[q].birth % 2);
+#endif
 
 	  if (N_REG_CLASSES > 1)
 	    {
@@ -1700,6 +1724,17 @@
 		    continue;
 		}
 #endif
+#ifdef DELAY_SLOTS
+	      if (qty[q].has_delay_slot)
+		{
+		  qty[q].phys_reg = find_free_reg (qty[q].min_class,
+						   qty[q].mode, q, 0, 0,
+						   fake_birth2, qty[q].death);
+		  if (qty[q].phys_reg >= 0)
+		    continue;
+		}
+#endif
+
 	      qty[q].phys_reg = find_free_reg (qty[q].min_class,
 					       qty[q].mode, q, 0, 0,
 					       qty[q].birth, qty[q].death);
@@ -1716,7 +1751,15 @@
 	    qty[q].phys_reg = find_free_reg (qty[q].alternate_class,
 					     qty[q].mode, q, 0, 0,
 					     fake_birth, fake_death);
+	  else
 #endif
+#ifdef DELAY_SLOTS
+	    if (qty[q].has_delay_slot)
+	      qty[q].phys_reg = find_free_reg (qty[q].alternate_class,
+					       qty[q].mode, q, 0, 0,
+					       fake_birth2, qty[q].death);
+	    else
+#endif
 	  if (qty[q].alternate_class != NO_REGS)
 	    qty[q].phys_reg = find_free_reg (qty[q].alternate_class,
 					     qty[q].mode, q, 0, 0,

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]