This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [patch]: Enhance rtl-dse to remove unaligned read after writes.


> Paolo Bonzini wrote:
> >
> >> you are right,
> >> That is certainly going to be a hard path to test, it is there in case
> >> lshr is expensive and ashr is cheap.  i do not know if such a machine
> >> exists.
> >
> > you can modify the costs artificially.
> >
> > Paolo
> Ian and I decided that the code was overkill anyway.  I now just look
> for the lshr.
> 
> I am going to resubmit the patch after the summit after pinski has a
> chance to try it on the spu.  Apparently the gen_lowpart may fail on the
> spu because of the oddball modes available.
> 
> Kenny

Ian, 


2007-07-24  Eric Christopher  <echristo@apple.com>
	    Kenneth Zadeck <zadeck@naturalbridge.com>

	* dse.c (find_shift_sequence): New function.
	(replace_read): Add case to remove read if it requires shift.
	* config/i386/i386.c (ix86_expand_prologue): Fixed typo in comment.
	

This patch replaces the original patch.  There is one change: in the
original patch, we only look for the lshr insn and if that fails we
fall back and try to find an ashr.  This was done wrong in the
original patch and seems to be mostly a waste of time.  In this patch
we only look for the lshr and if that fails we give up for that mode.

Since the original submission, pinskia has tested the original patch
on the spu and found that it does good things and does not break
anything.  He has raised some question about wanting this extended for
vectors, but that can be done later.

Ok for trunk?

Kenny

Index: dse.c
===================================================================
--- dse.c	(revision 126416)
+++ dse.c	(working copy)
@@ -43,6 +43,7 @@ Software Foundation, 51 Franklin Street,
 #include "expr.h"
 #include "recog.h"
 #include "dse.h"
+#include "optabs.h"
 #include "dbgcnt.h"
 
 /* This file contains three techniques for performing Dead Store
@@ -1381,6 +1382,105 @@ dump_insn_info (const char * start, insn
 }
 
 
+/* The modes are different and the values source and target do not
+   line up, we need to extract the value in the right from the rhs of
+   the store, shift it, and then put it into a form that can be shoved
+   into the read_insn.  This function generates a right SHIFT of a
+   value that is at least ACCESS_SIZE bytes wide of READ_MODE.  The
+   shift sequence is returned or NULL if we failed to find a
+   shift.  */
+
+
+static rtx
+find_shift_sequence (rtx read_reg,
+		     int access_size,
+		     store_info_t store_info,
+		     read_info_t read_info,
+		     int shift)
+{
+  enum machine_mode store_mode = GET_MODE (store_info->mem);
+  enum machine_mode read_mode = GET_MODE (read_info->mem);
+
+  /* Some machines like the x86 have shift insns for each size of
+     operand.  Other machines like the ppc or the ia-64 may only have
+     shift insns that shift values within 32 or 64 bit registers.
+     This loop tries to find the smallest shift insn that will right
+     justify the value we want to read but is available in one insn on
+     the machine.  */
+
+  while (access_size < UNITS_PER_WORD)
+    {
+      rtx target;
+      enum machine_mode new_mode
+	= smallest_mode_for_size (access_size * BITS_PER_UNIT,
+				  GET_MODE_CLASS (read_mode));
+      rtx new_reg = gen_reg_rtx (new_mode);
+
+      start_sequence ();
+
+      /* In theory we could also check for an ashr.  Ian Taylor knows
+	 of one dsp where the cost of these two was not the same.  But
+	 this really is a rare case anyway.  */
+      target = expand_binop (new_mode, lshr_optab, new_reg,
+			     GEN_INT (shift), new_reg, 1, OPTAB_DIRECT);
+
+      if (target == new_reg)
+	{
+	  rtx shift_seq = get_insns ();
+	  end_sequence ();
+
+	  /* If cost is too great, set target to NULL and
+	     let the iteration happen. */
+	  if (shift_seq != NULL)
+	    {
+	      int cost = 0;
+	      rtx insn;
+
+	      for (insn = shift_seq; insn != NULL_RTX; insn = NEXT_INSN (insn))
+		cost += insn_rtx_cost (insn);
+
+	      if (cost <= COSTS_N_INSNS (1))
+		{
+		  /* We found an acceptable shift.  Generate a move to
+		     take the value from the store and put it into the
+		     shift pseudo, then shift it, then generate another
+		     move to put in into the target of the read.  */
+		  start_sequence ();
+		  emit_move_insn (new_reg, gen_lowpart (new_mode, store_info->rhs));
+		  emit_insn (shift_seq);
+		  emit_move_insn (read_reg,  gen_lowpart (read_mode, new_reg));
+		  
+		  if (dump_file)
+		    {
+		      fprintf (dump_file, " -- adding extract insn r%d:%s = r%d:%s\n",
+			       REGNO (new_reg), GET_MODE_NAME (new_mode),
+			       REGNO (store_info->rhs), GET_MODE_NAME (store_mode));
+		      
+		      fprintf (dump_file, " -- with shift of r%d by %d\n",
+			       REGNO(new_reg), shift);
+		      fprintf (dump_file, " -- and second extract insn r%d:%s = r%d:%s\n",
+			       REGNO (read_reg), GET_MODE_NAME (read_mode),
+			       REGNO (new_reg), GET_MODE_NAME (new_mode));
+		    }
+		  
+		  /* Get the three insn sequence and return it.  */
+		  shift_seq = get_insns ();
+		  end_sequence ();
+		  return shift_seq;
+		}
+	    }
+	}
+      else
+	/* End the sequence.  */
+	end_sequence ();
+
+      access_size = access_size * 2;
+    }
+
+  return NULL;
+}
+
+
 /* Take a sequence of:
      A <- r1
      ...
@@ -1392,7 +1492,23 @@ dump_insn_info (const char * start, insn
    ...
    ... <- r2
 
-   The STORE_INFO and STORE_INFO are for the store and the READ_INFO
+   or
+
+   r3 <- extract (r1)
+   r3 <- r3 >> shift
+   r2 <- extract (r3)
+   ... <- r2
+
+   or
+
+   r2 <- extract (r1)
+   ... <- r2
+
+   Depending on the alignement and the mode of the store and
+   subsequent load.
+
+
+   The STORE_INFO and STORE_INSN are for the store and READ_INFO
    and READ_INSN are for the read.  Return true if the replacement
    went ok.  */
 
@@ -1400,82 +1516,135 @@ static bool
 replace_read (store_info_t store_info, insn_info_t store_insn, 
 	      read_info_t read_info, insn_info_t read_insn, rtx *loc)
 {
+  enum machine_mode store_mode = GET_MODE (store_info->mem);
+  enum machine_mode read_mode = GET_MODE (read_info->mem);
+  int shift;
+  int access_size; /* In bytes.  */
+  rtx read_reg = gen_reg_rtx (read_mode);
+  rtx shift_seq = NULL;
+
   if (!dbg_cnt (dse))
     return false;
 
+  if (GET_MODE_CLASS (read_mode) != GET_MODE_CLASS (store_mode))
+    return false;
+
+  /* To get here the read is within the boundaries of the write so
+     shift will never be negative.  Start out with the shift being in
+     bytes.  */
+  if (BYTES_BIG_ENDIAN)
+    shift = store_info->end - read_info->end;
+  else
+    shift = read_info->begin - store_info->begin;
+
+  access_size = shift + GET_MODE_SIZE (read_mode);
+
+  /* From now on it is bits.  */
+  shift *= BITS_PER_UNIT;
+
+  /* We need to keep this in perspective.  We are replacing a read
+     with a sequence of insns, but the read will almost certainly be
+     in cache, so it is not going to be an expensive one.  Thus, we
+     are not willing to do a multi insn shift or worse a subroutine
+     call to get rid of the read.  */
+  if (shift)
+    {
+      if (access_size > UNITS_PER_WORD || FLOAT_MODE_P (store_mode))
+	return false;
+
+      shift_seq = find_shift_sequence (read_reg, access_size, store_info,
+				       read_info, shift);
+      if (!shift_seq)
+	return false;
+    }
+
   if (dump_file)
-    fprintf (dump_file, "generating move to replace load at %d from store at %d\n", 
+    fprintf (dump_file, "replacing load at %d from store at %d\n",
 	     INSN_UID (read_insn->insn), INSN_UID (store_insn->insn)); 
-  if (GET_MODE (store_info->mem) == GET_MODE (read_info->mem))
+
+  if (validate_change (read_insn->insn, loc, read_reg, 0))
     {
-      rtx new_reg = gen_reg_rtx (GET_MODE (store_info->mem));
-      if (validate_change (read_insn->insn, loc, new_reg, 0))
+      rtx insns;
+      deferred_change_t deferred_change = pool_alloc (deferred_change_pool);
+      
+      if (read_mode == store_mode)
 	{
-	  rtx insns;
-	  deferred_change_t deferred_change = pool_alloc (deferred_change_pool);
-
 	  start_sequence ();
-	  emit_move_insn (new_reg, store_info->rhs);
+	  
+	  /* The modes are the same and everything lines up.  Just
+	     generate a simple move.  */
+	  emit_move_insn (read_reg, store_info->rhs);
+	  if (dump_file)
+	    fprintf (dump_file, " -- adding move insn r%d = r%d\n",
+		     REGNO (read_reg), REGNO (store_info->rhs));
 	  insns = get_insns ();
 	  end_sequence ();
-	  emit_insn_before (insns, store_insn->insn);
-
-	  if (dump_file)
-	    fprintf (dump_file, " -- adding move insn %d: r%d = r%d\n", 
-		     INSN_UID (insns), REGNO (new_reg), REGNO (store_info->rhs)); 
-
-	  /* And now for the cludge part: cselib croaks if you just
-	     return at this point.  There are two reasons for this:
-
-	     1) Cselib has an idea of how many pseudos there are and
-	     that does not include the new one we just added.  
-
-	     2) Cselib does not know about the move insn we added
-	     above the store_info, and there is no way to tell it
-	     about it, because it has "moved on".
-
-	     So we are just going to have to lie.  The move insn is
-	     not really an issue, cselib did not see it.  But the use
-	     of the new pseudo read_insn is a real problem.  The way
-	     that we solve this problem is that we are just going to
-	     put the mem back keep a table of mems to get rid of.  At
-	     the end of the basic block we can put it back.  */
-
-	  *loc = read_info->mem;
-	  deferred_change->next = deferred_change_list;
-	  deferred_change_list = deferred_change;
-	  deferred_change->loc = loc;
-	  deferred_change->reg = new_reg;
-
-	  /* Get rid of the read_info, from the point of view of the
-	     rest of dse, play like this read never happened.  */
-	  read_insn->read_rec = read_info->next;
-	  pool_free (read_info_pool, read_info);
-	  return true;
 	}
-      else 
+      else if (shift)
+	insns = shift_seq;
+      else
 	{
+	  /* The modes are different but the lsb are in the same
+	     place, we need to extract the value in the right from the
+	     rhs of the store.  */
+	  start_sequence ();
+	  emit_move_insn (read_reg, gen_lowpart (read_mode, store_info->rhs));
+	  
 	  if (dump_file)
-	    fprintf (dump_file, " -- validation failure\n"); 
-	  return false;
+	    fprintf (dump_file, " -- adding extract insn r%d:%s = r%d:%s\n",
+		     REGNO (read_reg), GET_MODE_NAME (read_mode),
+		     REGNO (store_info->rhs), GET_MODE_NAME (store_mode));
+	  insns = get_insns ();
+	  end_sequence ();
 	}
+
+      /* Insert this right before the store insn where it will be safe
+	 from later insns that might change it before the read.  */
+      emit_insn_before (insns, store_insn->insn);
+      
+      /* And now for the kludge part: cselib croaks if you just
+	 return at this point.  There are two reasons for this:
+	 
+	 1) Cselib has an idea of how many pseudos there are and
+	 that does not include the new ones we just added.
+	 
+	 2) Cselib does not know about the move insn we added
+	 above the store_info, and there is no way to tell it
+	 about it, because it has "moved on".
+	 
+	 Problem (1) is fixable with a certain amount of engineering.
+	 Problem (2) is requires starting the bb from scratch.  This
+	 could be expensive.
+	 
+	 So we are just going to have to lie.  The move/extraction
+	 insns are not really an issue, cselib did not see them.  But
+	 the use of the new pseudo read_insn is a real problem because
+	 cselib has not scanned this insn.  The way that we solve this
+	 problem is that we are just going to put the mem back for now
+	 and when we are finished with the block, we undo this.  We
+	 keep a table of mems to get rid of.  At the end of the basic
+	 block we can put them back.  */
+      
+      *loc = read_info->mem;
+      deferred_change->next = deferred_change_list;
+      deferred_change_list = deferred_change;
+      deferred_change->loc = loc;
+      deferred_change->reg = read_reg;
+      
+      /* Get rid of the read_info, from the point of view of the
+	 rest of dse, play like this read never happened.  */
+      read_insn->read_rec = read_info->next;
+      pool_free (read_info_pool, read_info);
+      return true;
     }
-  else
+  else 
     {
-      /* Someone with excellent rtl skills needs to fill this in.  You
-	 are guaranteed that the read is of the same size or smaller
-	 than the store, and that the read does not hang off one of
-	 the ends of the store.  But the offsets of each must be
-	 checked because the read does not have to line up on either
-	 end of the store so the begin fields need to be examined in
-	 both the store_info and read_info.  */
       if (dump_file)
-	fprintf (dump_file, " -- complex load, currently unsupported.\n"); 
+	fprintf (dump_file, " -- validation failure\n"); 
       return false;
     }
 }
 
-
 /* A for_each_rtx callback in which DATA is the bb_info.  Check to see
    if LOC is a mem and if it is look at the address and kill any
    appropriate stores that may be active.  */
@@ -3105,4 +3274,3 @@ struct tree_opt_pass pass_rtl_dse2 =
   TODO_ggc_collect,                     /* todo_flags_finish */
   'w'                                   /* letter */
 };
-
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 126416)
+++ config/i386/i386.c	(working copy)
@@ -6150,7 +6150,7 @@ ix86_expand_prologue (void)
         insn = emit_insn (gen_set_got (pic_offset_table_rtx));
     }
 
-  /* Prevent function calls from be scheduled before the call to mcount.
+  /* Prevent function calls from being scheduled before the call to mcount.
      In the pic_reg_used case, make sure that the got load isn't deleted.  */
   if (current_function_profile)
     {


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]