This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

PATCH to hoist loads/stores out of loops



Consider, for example, the following code:

  void f(int* k, int j)
  {
    int i;

    for (i = 0; i < j; ++i)
      *k = *k + 2 * ((*k) - 1);
  }

With the current EGCS, we get code like:

  .L5:
	  movl (%ecx),%eax
	  leal -2(%eax,%eax,2),%eax
	  movl %eax,(%ecx)
	  incl %edx
	  cmpl %ebx,%edx
	  jl .L5

Note that `*k' is updated on every loop iteration.  With the attached
patch, this becomes:

	  movl (%ebx),%eax
	  .p2align 4,,7
  .L5:
	  leal -2(%eax,%eax,2),%eax
	  incl %edx
	  cmpl %ecx,%edx
	  jl .L5
	  movl %eax,(%ebx)

Note that the load and store have been hoisted out of the loop,
resulting, in this case, in reducing the instruction count for the
loop by 33%.  In addition, this code motion reduces the amount of
cache activityy.  Putting the memory contents into a register for the
duration of the loop may also allow the compiler to find additional
bivs/givs.

The original motivation for this work was a C++ test-case where, in
the presence of inlining, some data ended up on the stack that should
probably have been in a register.  We need to fix that problem, too,
but the attached patch repairs the damage by putting the memory back
into a register, and is a more general solution.

The compiler bootstraps, and there are no changes in the testsuite
behavior, with the exception of loop-2f.c.  That test fails because
the optimizations in the attached patch trigger a latent bug I
reported earlier in the week.  I will be working on that bug in the
near future.

I look forward to your comments, criticisms, bug reports, and
performance tests.  Jeff, I look forward to your patch approval, so
that I can check this in. :-) :-)

-- 
Mark Mitchell 			mark@markmitchell.com
Mark Mitchell Consulting	http://www.markmitchell.com

Fri Jul 17 00:27:14 1998  Mark Mitchell  <mark@markmitchell.com>

	* rtl.h (rtx_function): New type.
	(for_each_rtx): New function.
	* rtlanal.c (for_each_rtx): Define it.
	
	* recog.c (change_t): New type.
	(change_objects, change_old_codes, change_locs, change_olds):
	Replace with ...
	(changes): New variable.
	(validate_change): Dynamically allocate room for more changes, if
	necessary.  Use changes array instead of change_objects, etc.
	(apply_change_group):  Use changes array instead of
	change_objects, etc.
	
	* loop.c (loop_mem_info): New type.
	(loop_mems): New variable.
	(loop_mems_idx): Likewise.
	(looop_mems_allocated): Likewise.
	(scan_loop): Remove nregs parameter.
	(next_insn_in_loop): New function.
	(load_mems): Likewise.
	(insert_loop_mem): Likewise.
	(replace_loop_mem): Likewise.
	(replace_label): Likewise.
	(INSN_IN_RANGE_P): New macro.
	(loop_optimize): Don't pass max_reg_num() to scan_loop.
	(scan_loop): Remove nregs parameter, compute it after any new
	registers are created by load_mems.  Use INSN_IN_RANGE_P and
	next_insn_in_loop rather than expanding them inline.  Call
	load_mems to load memory into pseudos, if appropriate.
	(prescan_loop): Figure out whether or not there are jumps from the
	loop to targets other than the label immediately following the
	loop.  Call insert_loop_mem to notice all the MEMs used in the
	loop, if it could be safe to pull MEMs into REGs for the duration
	of the loop.
	(strength_reduce): Use next_insn_in_loop.  Tweak comments.
	
Index: rtl.h
===================================================================
RCS file: /egcs/carton/cvsfiles/egcs/gcc/rtl.h,v
retrieving revision 1.45
diff -c -p -r1.45 rtl.h
*** rtl.h	1998/07/13 03:34:12	1.45
--- rtl.h	1998/07/17 06:44:44
*************** extern int inequality_comparison_p	PROTO
*** 1000,1005 ****
--- 1001,1008 ----
  extern rtx replace_rtx			PROTO((rtx, rtx, rtx));
  extern rtx replace_regs			PROTO((rtx, rtx *, int, int));
  extern int computed_jump_p		PROTO((rtx));
+ typedef int (*rtx_function)             PROTO((rtx*, void*));
+ extern int for_each_rtx                 PROTO((rtx*, rtx_function, void*));
  
  /* Maximum number of parallel sets and clobbers in any insn in this fn.
     Always at least 3, since the combiner could put that many togetherm
Index: rtlanal.c
===================================================================
RCS file: /egcs/carton/cvsfiles/egcs/gcc/rtlanal.c,v
retrieving revision 1.15
diff -c -p -r1.15 rtlanal.c
*** rtlanal.c	1998/07/08 01:58:57	1.15
--- rtlanal.c	1998/07/17 06:44:48
*************** computed_jump_p (insn)
*** 2009,2011 ****
--- 2009,2084 ----
      }
    return 0;
  }
+ 
+ /* Traverse X via depth-first search, calling F for each
+    sub-expression (including X itself).  F is also passed the DATA.
+    If F returns -1, do not traverse sub-expressions, but continue
+    traversing the rest of the tree.  If F ever returns any other
+    non-zero value, stop the traversal, and return the value returned
+    by F.  Otherwise, return 0.  This function does not traverse inside
+    tree structure that contains RTX_EXPRs, or into sub-expressions
+    whose format code is `0' since it is not known whether or not those
+    codes are actually RTL.
+ 
+    This routine is very general, and could (should?) be used to
+    implement many of the other routines in this file.  */
+ 
+ int for_each_rtx (x, f, data)
+      rtx* x;
+      rtx_function f;
+      void* data;
+ {
+   int result;
+   int length;
+   char* format;
+   int i;
+ 
+   /* Call F on X.  */
+   result = (*f)(x, data);
+   if (result == -1)
+     /* Do not traverse sub-expressions.  */
+     return 0;
+   else if (result != 0)
+     /* Stop the traversal.  */
+     return result;
+ 
+   if (*x == NULL_RTX)
+     /* There are no sub-expressions.  */
+     return 0;
+ 
+   length = GET_RTX_LENGTH (GET_CODE (*x));
+   format = GET_RTX_FORMAT (GET_CODE (*x));
+ 
+   for (i = 0; i < length; ++i) 
+     {
+       switch (format[i]) 
+ 	{
+ 	case 'e':
+ 	  result = for_each_rtx (&XEXP (*x, i), f, data);
+ 	  if (result != 0)
+ 	    return result;
+ 	  break;
+ 
+ 	case 'V':
+ 	case 'E':
+ 	  if (XVEC (*x, i) != 0) 
+ 	    {
+ 	      int j;
+ 	      for (j = 0; j < XVECLEN (*x, i); ++j)
+ 		{
+ 		  result = for_each_rtx (&XVECEXP (*x, i, j), f, data);
+ 		  if (result != 0)
+ 		    return result;
+ 		}
+ 	    }
+ 	  break; 
+ 
+ 	default:
+ 	  /* Nothing to do.  */
+ 	  break;
+ 	}
+ 
+     }
+ 
+   return 0;
+ }
Index: recog.c
===================================================================
RCS file: /egcs/carton/cvsfiles/egcs/gcc/recog.c,v
retrieving revision 1.9
diff -c -p -r1.9 recog.c
*** recog.c	1998/05/20 00:24:25	1.9
--- recog.c	1998/07/17 06:44:38
*************** check_asm_operands (x)
*** 128,146 ****
    return 1;
  }
  
! /* Static data for the next two routines.
  
!    The maximum number of changes supported is defined as the maximum
!    number of operands times 5.  This allows for repeated substitutions
!    inside complex indexed address, or, alternatively, changes in up
!    to 5 insns.  */
! 
! #define MAX_CHANGE_LOCS	(MAX_RECOG_OPERANDS * 5)
! 
! static rtx change_objects[MAX_CHANGE_LOCS];
! static int change_old_codes[MAX_CHANGE_LOCS];
! static rtx *change_locs[MAX_CHANGE_LOCS];
! static rtx change_olds[MAX_CHANGE_LOCS];
  
  static int num_changes = 0;
  
--- 128,145 ----
    return 1;
  }
  
! /* Static data for the next two routines.  */
  
! typedef struct change_t
! {
!   rtx  object;
!   int  old_code;
!   rtx* loc;
!   rtx  old;
! } change_t;
! 
! static change_t* changes;
! static int changes_allocated;
  
  static int num_changes = 0;
  
*************** validate_change (object, loc, new, in_gr
*** 174,195 ****
    if (old == new || rtx_equal_p (old, new))
      return 1;
  
!   if (num_changes >= MAX_CHANGE_LOCS
!       || (in_group == 0 && num_changes != 0))
      abort ();
  
    *loc = new;
  
    /* Save the information describing this change.  */
!   change_objects[num_changes] = object;
!   change_locs[num_changes] = loc;
!   change_olds[num_changes] = old;
  
    if (object && GET_CODE (object) != MEM)
      {
        /* Set INSN_CODE to force rerecognition of insn.  Save old code in
  	 case invalid.  */
!       change_old_codes[num_changes] = INSN_CODE (object);
        INSN_CODE (object) = -1;
      }
  
--- 173,207 ----
    if (old == new || rtx_equal_p (old, new))
      return 1;
  
!   if (in_group == 0 && num_changes != 0)
      abort ();
  
    *loc = new;
  
    /* Save the information describing this change.  */
!   if (num_changes >= changes_allocated)
!     {
!       if (changes_allocated == 0)
! 	/* This value allows for repeated substitutions inside complex
! 	   indexed addresses, or changes in up to 5 insns.  */
! 	changes_allocated = MAX_RECOG_OPERANDS * 5;
!       else
! 	changes_allocated *= 2;
! 
!       changes = 
! 	(change_t*) xrealloc (changes, 
! 			      sizeof (change_t) * changes_allocated); 
!     }
!   
!   changes[num_changes].object = object;
!   changes[num_changes].loc = loc;
!   changes[num_changes].old = old;
  
    if (object && GET_CODE (object) != MEM)
      {
        /* Set INSN_CODE to force rerecognition of insn.  Save old code in
  	 case invalid.  */
!       changes[num_changes].old_code = INSN_CODE (object);
        INSN_CODE (object) = -1;
      }
  
*************** apply_change_group ()
*** 224,230 ****
  
    for (i = 0; i < num_changes; i++)
      {
!       rtx object = change_objects[i];
  
        if (object == 0)
  	continue;
--- 236,242 ----
  
    for (i = 0; i < num_changes; i++)
      {
!       rtx object = changes[i].object;
  
        if (object == 0)
  	continue;
*************** cancel_changes (num)
*** 319,327 ****
       they were made.  */
    for (i = num_changes - 1; i >= num; i--)
      {
!       *change_locs[i] = change_olds[i];
!       if (change_objects[i] && GET_CODE (change_objects[i]) != MEM)
! 	INSN_CODE (change_objects[i]) = change_old_codes[i];
      }
    num_changes = num;
  }
--- 331,339 ----
       they were made.  */
    for (i = num_changes - 1; i >= num; i--)
      {
!       *changes[i].loc = changes[i].old;
!       if (changes[i].object && GET_CODE (changes[i].object) != MEM)
! 	INSN_CODE (changes[i].object) = changes[i].old_code;
      }
    num_changes = num;
  }
Index: loop.c
===================================================================
RCS file: /egcs/carton/cvsfiles/egcs/gcc/loop.c,v
retrieving revision 1.57
diff -c -p -r1.57 loop.c
*** loop.c	1998/07/16 00:22:41	1.57
--- loop.c	1998/07/20 06:09:29
*************** static rtx loop_store_mems[NUM_STORES];
*** 195,200 ****
--- 195,221 ----
  /* Index of first available slot in above array.  */
  static int loop_store_mems_idx;
  
+ typedef struct loop_mem_info {
+   rtx mem;      /* The MEM itself.  */
+   rtx reg;      /* Corresponding pseudo, if any.  */
+   int optimize; /* Nonzero if we can optimize access to this MEM.  */
+ } loop_mem_info;
+ 
+ /* Array of MEMs that are used (read or written) in this loop, but
+    cannot be aliased by anything in this loop, except perhaps
+    themselves.  In other words, if loop_mems[i] is altered during the
+    loop, it is altered by an expression that is rtx_equal_p to it.  */
+ 
+ static loop_mem_info* loop_mems;
+ 
+ /* The index of the next available slot in LOOP_MEMS.  */
+ 
+ static int loop_mems_idx;
+ 
+ /* The number of elements allocated in LOOP_MEMs.  */
+ 
+ static int loop_mems_allocated;
+ 
  /* Nonzero if we don't know what MEMs were changed in the current loop.
     This happens if the loop contains a call (in which case `loop_has_call'
     will also be set) or if we store into more than NUM_STORES MEMs.  */
*************** static int labels_in_range_p PROTO((rtx,
*** 286,292 ****
  static void count_loop_regs_set PROTO((rtx, rtx, char *, rtx *, int *, int));
  static void note_addr_stored PROTO((rtx, rtx));
  static int loop_reg_used_before_p PROTO((rtx, rtx, rtx, rtx, rtx));
! static void scan_loop PROTO((rtx, rtx, int, int));
  #if 0
  static void replace_call_address PROTO((rtx, rtx, rtx));
  #endif
--- 307,313 ----
  static void count_loop_regs_set PROTO((rtx, rtx, char *, rtx *, int *, int));
  static void note_addr_stored PROTO((rtx, rtx));
  static int loop_reg_used_before_p PROTO((rtx, rtx, rtx, rtx, rtx));
! static void scan_loop PROTO((rtx, rtx, int));
  #if 0
  static void replace_call_address PROTO((rtx, rtx, rtx));
  #endif
*************** static int maybe_eliminate_biv_1 PROTO((
*** 327,333 ****
--- 348,375 ----
  static int last_use_this_basic_block PROTO((rtx, rtx));
  static void record_initial PROTO((rtx, rtx));
  static void update_reg_last_use PROTO((rtx, rtx));
+ static rtx next_insn_in_loop PROTO((rtx, rtx, rtx, rtx));
+ static void load_mems PROTO((rtx, rtx, rtx, rtx));
+ static int insert_loop_mem PROTO((rtx*, void*));
+ static int replace_loop_mem PROTO((rtx*, void*));
+ static int replace_label PROTO((rtx*, void*));
  
+ typedef struct rtx_and_int {
+   rtx r;
+   int i;
+ } rtx_and_int;
+ 
+ typedef struct rtx_pair {
+   rtx r1;
+   rtx r2;
+ } rtx_pair;
+ 
+ /* Nonzero iff INSN is between START and END, inclusive.  */
+ #define INSN_IN_RANGE_P(INSN, START, END) 	\
+   (INSN_UID (INSN) < max_uid_for_loop 		\
+    && INSN_LUID (INSN) >= INSN_LUID (START)	\
+    && INSN_LUID (INSN) <= INSN_LUID (END))
+ 
  #ifdef HAIFA
  /* This is extern from unroll.c */
  extern void iteration_info PROTO((rtx, rtx *, rtx *, rtx, rtx));
*************** loop_optimize (f, dumpfile, unroll_p)
*** 535,541 ****
    for (i = max_loop_num-1; i >= 0; i--)
      if (! loop_invalid[i] && loop_number_loop_ends[i])
        scan_loop (loop_number_loop_starts[i], loop_number_loop_ends[i],
! 		 max_reg_num (), unroll_p);
  
    /* If debugging and unrolling loops, we must replicate the tree nodes
       corresponding to the blocks inside the loop, so that the original one
--- 577,583 ----
    for (i = max_loop_num-1; i >= 0; i--)
      if (! loop_invalid[i] && loop_number_loop_ends[i])
        scan_loop (loop_number_loop_starts[i], loop_number_loop_ends[i],
! 		 unroll_p);
  
    /* If debugging and unrolling loops, we must replicate the tree nodes
       corresponding to the blocks inside the loop, so that the original one
*************** loop_optimize (f, dumpfile, unroll_p)
*** 544,549 ****
--- 586,623 ----
      unroll_block_trees ();
  }
  
+ /* Returns the next insn, in execution order, after INSN.  START and
+    END are the NOTE_INSN_LOOP_BEG and NOTE_INSN_LOOP_END for the loop,
+    respectively.  LOOP_TOP, if non-NULL, is the top of the loop in the
+    insn-stream; it is used with loops that are entered near the
+    bottom.  */
+ 
+ static rtx
+ next_insn_in_loop (insn, start, end, loop_top)
+      rtx insn;
+      rtx start;
+      rtx end;
+      rtx loop_top;
+ {
+   insn = NEXT_INSN (insn);
+ 
+   if (insn == end)
+     {
+       if (loop_top)
+ 	/* Go to the top of the loop, and continue there.  */
+ 	insn = loop_top;
+       else
+ 	/* We're done.  */
+ 	insn = NULL_RTX;
+     }
+ 
+   if (insn == start)
+     /* We're done.  */
+     insn = NULL_RTX;
+ 
+   return insn;
+ }
+ 
  /* Optimize one loop whose start is LOOP_START and end is END.
     LOOP_START is the NOTE_INSN_LOOP_BEG and END is the matching
     NOTE_INSN_LOOP_END.  */
*************** loop_optimize (f, dumpfile, unroll_p)
*** 555,567 ****
     write, then we can also mark the memory read as invariant.  */
  
  static void
! scan_loop (loop_start, end, nregs, unroll_p)
       rtx loop_start, end;
-      int nregs;
       int unroll_p;
  {
    register int i;
!   register rtx p;
    /* 1 if we are scanning insns that could be executed zero times.  */
    int maybe_never = 0;
    /* 1 if we are scanning insns that might never be executed
--- 629,640 ----
     write, then we can also mark the memory read as invariant.  */
  
  static void
! scan_loop (loop_start, end, unroll_p)
       rtx loop_start, end;
       int unroll_p;
  {
    register int i;
!   rtx p;
    /* 1 if we are scanning insns that could be executed zero times.  */
    int maybe_never = 0;
    /* 1 if we are scanning insns that might never be executed
*************** scan_loop (loop_start, end, nregs, unrol
*** 596,606 ****
    rtx *reg_single_usage = 0;
    /* Nonzero if we are scanning instructions in a sub-loop.  */
    int loop_depth = 0;
  
-   n_times_set = (int *) alloca (nregs * sizeof (int));
-   n_times_used = (int *) alloca (nregs * sizeof (int));
-   may_not_optimize = (char *) alloca (nregs);
- 
    /* Determine whether this loop starts with a jump down to a test at
       the end.  This will occur for a small number of loops with a test
       that is too complex to duplicate in front of the loop.
--- 669,676 ----
    rtx *reg_single_usage = 0;
    /* Nonzero if we are scanning instructions in a sub-loop.  */
    int loop_depth = 0;
+   int nregs;
  
    /* Determine whether this loop starts with a jump down to a test at
       the end.  This will occur for a small number of loops with a test
       that is too complex to duplicate in front of the loop.
*************** scan_loop (loop_start, end, nregs, unrol
*** 650,658 ****
  	     do {..} while (0).  If this label was generated previously
  	     by loop, we can't tell anything about it and have to reject
  	     the loop.  */
! 	  && INSN_UID (JUMP_LABEL (p)) < max_uid_for_loop
! 	  && INSN_LUID (JUMP_LABEL (p)) >= INSN_LUID (loop_start)
! 	  && INSN_LUID (JUMP_LABEL (p)) < INSN_LUID (end))
  	{
  	  loop_top = next_label (scan_start);
  	  scan_start = JUMP_LABEL (p);
--- 720,726 ----
  	     do {..} while (0).  If this label was generated previously
  	     by loop, we can't tell anything about it and have to reject
  	     the loop.  */
! 	  && INSN_IN_RANGE_P (JUMP_LABEL (p), loop_start, end))
  	{
  	  loop_top = next_label (scan_start);
  	  scan_start = JUMP_LABEL (p);
*************** scan_loop (loop_start, end, nregs, unrol
*** 680,686 ****
       Set may_not_optimize[I] if it is not safe to move out
       the setting of register I.  If this loop has calls, set
       reg_single_usage[I].  */
! 
    bzero ((char *) n_times_set, nregs * sizeof (int));
    bzero (may_not_optimize, nregs);
  
--- 748,760 ----
       Set may_not_optimize[I] if it is not safe to move out
       the setting of register I.  If this loop has calls, set
       reg_single_usage[I].  */
!   
!   /* Allocate extra space for REGS that might be created by
!      load_mems.  */
!   nregs = max_reg_num () + loop_mems_idx;
!   n_times_set = (int *) alloca (nregs * sizeof (int));
!   n_times_used = (int *) alloca (nregs * sizeof (int));
!   may_not_optimize = (char *) alloca (nregs);
    bzero ((char *) n_times_set, nregs * sizeof (int));
    bzero (may_not_optimize, nregs);
  
*************** scan_loop (loop_start, end, nregs, unrol
*** 697,702 ****
--- 771,805 ----
      may_not_optimize[i] = 1, n_times_set[i] = 1;
    bcopy ((char *) n_times_set, (char *) n_times_used, nregs * sizeof (int));
  
+   /* Load MEMs into REGs for the duration of the loop, if
+      appropriate.  Note that calculate n_times_set, etc., before this
+      point so that we can tell what things are loop-invariant in
+      load_mems.  */
+   load_mems (scan_start, end, loop_top, loop_start);
+ 
+   /* Recalculate n_times_set and friends since load_mems may have
+      created new registers.  */
+   if (max_reg_num () > nregs - loop_mems_idx)
+     {
+       bzero ((char *) n_times_set, nregs * sizeof (int));
+       bzero (may_not_optimize, nregs);
+       if (loop_has_call)
+ 	bzero ((char *) reg_single_usage, nregs * sizeof (rtx));
+ 
+       count_loop_regs_set (loop_top ? loop_top : loop_start, end,
+ 			   may_not_optimize, reg_single_usage,
+ 			   &insn_count, nregs); 
+ 
+       for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ 	may_not_optimize[i] = 1, n_times_set[i] = 1;
+       bcopy ((char *) n_times_set, (char *) n_times_used, 
+ 	     nregs * sizeof (int)); 
+ 
+       /* Set nregs to reflect the real number of registers.  There's no
+ 	 need to include any extra slop.  */
+       nregs = max_reg_num ();
+     }
+ 
    if (loop_dump_stream)
      {
        fprintf (loop_dump_stream, "\nLoop from %d to %d: %d real insns.\n",
*************** scan_loop (loop_start, end, nregs, unrol
*** 719,742 ****
       When MAYBE_NEVER is 0, all insns will be executed at least once
       so that is not a problem.  */
  
!   p = scan_start;
!   while (1)
      {
-       p = NEXT_INSN (p);
-       /* At end of a straight-in loop, we are done.
- 	 At end of a loop entered at the bottom, scan the top.  */
-       if (p == scan_start)
- 	break;
-       if (p == end)
- 	{
- 	  if (loop_top != 0)
- 	    p = loop_top;
- 	  else
- 	    break;
- 	  if (p == scan_start)
- 	    break;
- 	}
- 
        if (GET_RTX_CLASS (GET_CODE (p)) == 'i'
  	  && find_reg_note (p, REG_LIBCALL, NULL_RTX))
  	in_libcall = 1;
--- 822,831 ----
       When MAYBE_NEVER is 0, all insns will be executed at least once
       so that is not a problem.  */
  
!   for (p = next_insn_in_loop (scan_start, scan_start, end, loop_top); 
!        p != NULL_RTX;
!        p = next_insn_in_loop (p, scan_start, end, loop_top))
      {
        if (GET_RTX_CLASS (GET_CODE (p)) == 'i'
  	  && find_reg_note (p, REG_LIBCALL, NULL_RTX))
  	in_libcall = 1;
*************** constant_high_bytes (p, loop_start)
*** 2287,2306 ****
  
  /* Scan a loop setting the variables `unknown_address_altered',
     `num_mem_sets', `loop_continue', loops_enclosed', `loop_has_call',
!    and `loop_has_volatile'.
!    Also, fill in the array `loop_store_mems'.  */
  
  static void
  prescan_loop (start, end)
       rtx start, end;
  {
    register int level = 1;
!   register rtx insn;
  
    unknown_address_altered = 0;
    loop_has_call = 0;
    loop_has_volatile = 0;
    loop_store_mems_idx = 0;
  
    num_mem_sets = 0;
    loops_enclosed = 1;
--- 2376,2404 ----
  
  /* Scan a loop setting the variables `unknown_address_altered',
     `num_mem_sets', `loop_continue', loops_enclosed', `loop_has_call',
!    and `loop_has_volatile'.  Also, fill in the arrays `loop_mems' and
!    `loop_store_mems'.  */
  
  static void
  prescan_loop (start, end)
       rtx start, end;
  {
    register int level = 1;
!   rtx insn;
!   int loop_has_multiple_exit_targets = 0;
!   /* The label after END.  Jumping here is just like falling off the
!      end of the loop.  We use next_nonnote_insn instead of next_label
!      as a hedge against the (pathological) case where some actual insn
!      might end up between the two.  */
!   rtx exit_target = next_nonnote_insn (end);
!   if (exit_target == NULL_RTX || GET_CODE (exit_target) != CODE_LABEL)
!     loop_has_multiple_exit_targets = 1;
  
    unknown_address_altered = 0;
    loop_has_call = 0;
    loop_has_volatile = 0;
    loop_store_mems_idx = 0;
+   loop_mems_idx = 0;
  
    num_mem_sets = 0;
    loops_enclosed = 1;
*************** prescan_loop (start, end)
*** 2338,2354 ****
  	    unknown_address_altered = 1;
  	  loop_has_call = 1;
  	}
!       else
  	{
! 	  if (GET_CODE (insn) == INSN || GET_CODE (insn) == JUMP_INSN)
  	    {
! 	      if (volatile_refs_p (PATTERN (insn)))
! 		loop_has_volatile = 1;
  
! 	      note_stores (PATTERN (insn), note_addr_stored);
! 	    }
! 	}
!     }
  }
  
  /* Scan the function looking for loops.  Record the start and end of each loop.
--- 2436,2510 ----
  	    unknown_address_altered = 1;
  	  loop_has_call = 1;
  	}
!       else if (GET_CODE (insn) == INSN || GET_CODE (insn) == JUMP_INSN)
  	{
! 	  rtx label1 = NULL_RTX;
! 	  rtx label2 = NULL_RTX;
! 
! 	  if (volatile_refs_p (PATTERN (insn)))
! 	    loop_has_volatile = 1;
! 	  
! 	  note_stores (PATTERN (insn), note_addr_stored);
! 
! 	  if (!loop_has_multiple_exit_targets
! 	      && GET_CODE (insn) == JUMP_INSN
! 	      && GET_CODE (PATTERN (insn)) == SET
! 	      && SET_DEST (PATTERN (insn)) == pc_rtx)
  	    {
! 	      if (GET_CODE (SET_SRC (PATTERN (insn))) == IF_THEN_ELSE)
! 		{
! 		  label1 = XEXP (SET_SRC (PATTERN (insn)), 1);
! 		  label2 = XEXP (SET_SRC (PATTERN (insn)), 2);
! 		}
! 	      else
! 		{
! 		  label1 = SET_SRC (PATTERN (insn));
! 		}
  
! 	      do {
! 		if (label1 && label1 != pc_rtx)
! 		  {
! 		    if (GET_CODE (label1) != LABEL_REF)
! 		      {
! 			/* Something tricky.  */
! 			loop_has_multiple_exit_targets = 1;
! 			break;
! 		      }
! 		    else if (XEXP (label1, 0) != exit_target
! 			     && LABEL_OUTSIDE_LOOP_P (label1))
! 		      {
! 			/* A jump outside the current loop.  */
! 			loop_has_multiple_exit_targets = 1;
! 			break;
! 		      }
! 		  }
! 
! 		label1 = label2;
! 		label2 = NULL_RTX;
! 	      } while (label1);
! 	    }
! 	}
!       else if (GET_CODE (insn) == RETURN)
! 	loop_has_multiple_exit_targets = 1;
!     }
! 
!   /* Now, rescan the loop, setting up the LOOP_MEMS array.  */
!   if (/* We can't tell what MEMs are aliased by what.  */
!       !unknown_address_altered 
!       /* An exception thrown by a called function might land us
! 	 anywhere.  */
!       && !loop_has_call
!       /* We don't want loads for MEMs moved to a location before the
! 	 one at which their stack memory becomes allocated.  (Note
! 	 that this is not a problem for malloc, etc., since those
! 	 require actual function calls.  */
!       && !current_function_calls_alloca
!       /* There are ways to leave the loop other than falling off the
! 	 end.  */
!       && !loop_has_multiple_exit_targets)
!     for (insn = NEXT_INSN (start); insn != NEXT_INSN (end);
! 	 insn = NEXT_INSN (insn))
!       for_each_rtx (&insn, insert_loop_mem, 0);
  }
  
  /* Scan the function looking for loops.  Record the start and end of each loop.
*************** static rtx addr_placeholder;
*** 3354,3371 ****
  /* ??? Unfinished optimizations, and possible future optimizations,
     for the strength reduction code.  */
  
- /* ??? There is one more optimization you might be interested in doing: to
-    allocate pseudo registers for frequently-accessed memory locations.
-    If the same memory location is referenced each time around, it might
-    be possible to copy it into a register before and out after.
-    This is especially useful when the memory location is a variable which
-    is in a stack slot because somewhere its address is taken.  If the
-    loop doesn't contain a function call and the variable isn't volatile,
-    it is safe to keep the value in a register for the duration of the
-    loop. One tricky thing is that the copying of the value back from the
-    register has to be done on all exits from the loop.  You need to check that
-    all the exits from the loop go to the same place.  */
- 
  /* ??? The interaction of biv elimination, and recognition of 'constant'
     bivs, may cause problems.  */
  
--- 3510,3515 ----
*************** static rtx addr_placeholder;
*** 3390,3402 ****
     was rerun in loop_optimize whenever a register was added or moved.
     Also, some of the optimizations could be a little less conservative.  */
  
! /* Perform strength reduction and induction variable elimination.  */
  
! /* Pseudo registers created during this function will be beyond the last
     valid index in several tables including n_times_set and regno_last_uid.
     This does not cause a problem here, because the added registers cannot be
     givs outside of their loop, and hence will never be reconsidered.
!    But scan_loop must check regnos to make sure they are in bounds.  */
  
  static void
  strength_reduce (scan_start, end, loop_top, insn_count,
--- 3534,3551 ----
     was rerun in loop_optimize whenever a register was added or moved.
     Also, some of the optimizations could be a little less conservative.  */
  
! /* Perform strength reduction and induction variable elimination.  
  
!    Pseudo registers created during this function will be beyond the last
     valid index in several tables including n_times_set and regno_last_uid.
     This does not cause a problem here, because the added registers cannot be
     givs outside of their loop, and hence will never be reconsidered.
!    But scan_loop must check regnos to make sure they are in bounds. 
!    
!    SCAN_START is the first instruction in the loop, as the loop would
!    actually be executed.  END is the NOTE_INSN_LOOP_END.  LOOP_TOP is
!    the first instruction in the loop, as it is layed out in the
!    instruction stream.  LOOP_START is the NOTE_INSN_LOOP_BEG.  */
  
  static void
  strength_reduce (scan_start, end, loop_top, insn_count,
*************** strength_reduce (scan_start, end, loop_t
*** 3464,3487 ****
  
    /* Scan through loop to find all possible bivs.  */
  
!   p = scan_start;
!   while (1)
      {
-       p = NEXT_INSN (p);
-       /* At end of a straight-in loop, we are done.
- 	 At end of a loop entered at the bottom, scan the top.  */
-       if (p == scan_start)
- 	break;
-       if (p == end)
- 	{
- 	  if (loop_top != 0)
- 	    p = loop_top;
- 	  else
- 	    break;
- 	  if (p == scan_start)
- 	    break;
- 	}
- 
        if (GET_CODE (p) == INSN
  	  && (set = single_set (p))
  	  && GET_CODE (SET_DEST (set)) == REG)
--- 3613,3622 ----
  
    /* Scan through loop to find all possible bivs.  */
  
!   for (p = next_insn_in_loop (scan_start, scan_start, end, loop_top);
!        p != NULL_RTX;
!        p = next_insn_in_loop (p, scan_start, end, loop_top))
      {
        if (GET_CODE (p) == INSN
  	  && (set = single_set (p))
  	  && GET_CODE (SET_DEST (set)) == REG)
*************** indirect_jump_in_function_p (start)
*** 7797,7799 ****
--- 7932,8258 ----
  
    return 0;
  }
+ 
+ /* Add MEM to the LOOP_MEMS array, if appropriate.  See the
+    documentation for LOOP_MEMS for the definition of `appropriate'.
+    This function is called from prescan_loop via for_each_rtx.  */
+ 
+ static int
+ insert_loop_mem (mem, data)
+      rtx* mem;
+      void* data;
+ {
+   int i;
+   rtx m = *mem;
+ 
+   if (m == NULL_RTX)
+     return 0;
+ 
+   switch (GET_CODE (m))
+     {
+     case MEM:
+       break;
+ 
+     case CONST_DOUBLE:
+       /* We're not interested in the MEM associated with a
+ 	 CONST_DOUBLE, so there's no need to traverse into this.  */
+       return -1;
+ 
+     default:
+       /* This is not a MEM.  */
+       return 0;
+     }
+ 
+   /* See if we've already seen this MEM.  */
+   for (i = 0; i < loop_mems_idx; ++i)
+     if (rtx_equal_p (m, loop_mems[i].mem)) 
+       {
+ 	if (GET_MODE (m) != GET_MODE (loop_mems[i].mem))
+ 	  /* The modes of the two memory accesses are different.  If
+ 	     this happens, something tricky is going on, and we just
+ 	     don't optimize accesses to this MEM.  */
+ 	  loop_mems[i].optimize = 0;
+ 
+ 	return 0;
+       }
+ 
+   /* Resize the array, if necessary.  */
+   if (loop_mems_idx == loop_mems_allocated) 
+     {
+       if (loop_mems_allocated != 0)
+ 	loop_mems_allocated *= 2;
+       else
+ 	loop_mems_allocated = 32;
+ 
+       loop_mems = (loop_mem_info*) 
+ 	xrealloc (loop_mems,
+ 		  loop_mems_allocated * sizeof (loop_mem_info)); 
+     }
+ 
+   /* Actually insert the MEM.  */
+   loop_mems[loop_mems_idx].mem = m;
+   /* We can't hoist this MEM out of the loop if it's a BLKmode MEM
+      because we can't put it in a register.  We still store it in the
+      table, though, so that if we see the same address later, but in a
+      non-BLK mode, we'll not think we can optimize it at that point.  */
+   loop_mems[loop_mems_idx].optimize = (GET_MODE (m) != BLKmode);
+   loop_mems[loop_mems_idx].reg = NULL_RTX;
+   ++loop_mems_idx;
+ }
+ 
+ /* Move MEMs into registers for the duration of the loop.  SCAN_START
+    is the first instruction in the loop (as it is executed).  The
+    other parameters are as for next_insn_in_loop.  */
+ 
+ static void
+ load_mems (scan_start, end, loop_top, start)
+      rtx scan_start;
+      rtx end;
+      rtx loop_top;
+      rtx start;
+ {
+   int maybe_never = 0;
+   int i;
+   rtx p;
+   rtx label = NULL_RTX;
+   rtx end_label;
+ 
+   if (loop_mems_idx > 0) 
+     {
+       /* Nonzero if the next instruction may never be executed.  */
+       int next_maybe_never = 0;
+ 
+       /* Check to see if it's possible that some instructions in the
+ 	 loop are never executed.  */
+       for (p = next_insn_in_loop (scan_start, scan_start, end, loop_top); 
+ 	   p != NULL_RTX && !maybe_never; 
+ 	   p = next_insn_in_loop (p, scan_start, end, loop_top))
+ 	{
+ 	  if (GET_CODE (p) == CODE_LABEL)
+ 	    maybe_never = 1;
+ 	  else if (GET_CODE (p) == JUMP_INSN
+ 		   /* If we enter the loop in the middle, and scan
+ 		      around to the beginning, don't set maybe_never
+ 		      for that.  This must be an unconditional jump,
+ 		      otherwise the code at the top of the loop might
+ 		      never be executed.  Unconditional jumps are
+ 		      followed a by barrier then loop end.  */
+ 		   && ! (GET_CODE (p) == JUMP_INSN 
+ 			 && JUMP_LABEL (p) == loop_top
+ 			 && NEXT_INSN (NEXT_INSN (p)) == end
+ 			 && simplejump_p (p)))
+ 	    {
+ 	      if (!condjump_p (p))
+ 		/* Something complicated.  */
+ 		maybe_never = 1;
+ 	      else
+ 		/* If there are any more instructions in the loop, they
+ 		   might not be reached.  */
+ 		next_maybe_never = 1; 
+ 	    } 
+ 	  else if (next_maybe_never)
+ 	    maybe_never = 1;
+ 	}
+ 
+       /* Actually move the MEMs.  */
+       for (i = 0; i < loop_mems_idx; ++i) 
+ 	{
+ 	  int j;
+ 	  int written = 0;
+ 	  rtx reg;
+ 	  rtx mem = loop_mems[i].mem;
+ 
+ 	  if (MEM_VOLATILE_P (mem) 
+ 	      || invariant_p (XEXP (mem, 0)) != 1)
+ 	    /* There's no telling whether or not MEM is modified.  */
+ 	    loop_mems[i].optimize = 0;
+ 
+ 	  /* Go through the MEMs written to in the loop to see if this
+ 	     one is aliased by one of them.  */
+ 	  for (j = 0; j < loop_store_mems_idx; ++j) 
+ 	    {
+ 	      if (rtx_equal_p (mem, loop_store_mems[j]))
+ 		written = 1;
+ 	      else if (true_dependence (loop_store_mems[j], VOIDmode,
+ 					mem, rtx_varies_p))
+ 		{
+ 		  /* MEM is indeed aliased by this store.  */
+ 		  loop_mems[i].optimize = 0;
+ 		  break;
+ 		}
+ 	    }
+ 	  
+ 	  /* If this MEM is written to, we must be sure that there
+ 	     are no reads from another MEM that aliases this one.  */ 
+ 	  if (loop_mems[i].optimize && written)
+ 	    {
+ 	      int j;
+ 
+ 	      for (j = 0; j < loop_mems_idx; ++j)
+ 		{
+ 		  if (j == i)
+ 		    continue;
+ 		  else if (true_dependence (mem,
+ 					    VOIDmode,
+ 					    loop_mems[j].mem,
+ 					    rtx_varies_p))
+ 		    {
+ 		      /* It's not safe to hoist loop_mems[i] out of
+ 			 the loop because writes to it might not be
+ 			 seen by reads from loop_mems[j].  */
+ 		      loop_mems[i].optimize = 0;
+ 		      break;
+ 		    }
+ 		}
+ 	    }
+ 
+ 	  if (maybe_never && may_trap_p (mem))
+ 	    /* We can't access the MEM outside the loop; it might
+ 	       cause a trap that wouldn't have happened otherwise.  */
+ 	    loop_mems[i].optimize = 0;
+ 	  
+ 	  if (!loop_mems[i].optimize)
+ 	    /* We thought we were going to lift this MEM out of the
+ 	       loop, but later discovered that we could not.  */
+ 	    continue;
+ 
+ 	  /* Allocate a pseudo for this MEM.  We set REG_USERVAR_P in
+ 	     order to keep scan_loop from moving stores to this MEM
+ 	     out of the loop just because this REG is neither a
+ 	     user-variable nor used in the loop test.  */
+ 	  reg = gen_reg_rtx (GET_MODE (mem));
+ 	  REG_USERVAR_P (reg) = 1;
+ 	  loop_mems[i].reg = reg;
+ 
+ 	  /* Now, replace all references to the MEM with the
+ 	     corresponding pesudos.  */
+ 	  for (p = next_insn_in_loop (scan_start, scan_start, end, loop_top);
+ 	       p != NULL_RTX;
+ 	       p = next_insn_in_loop (p, scan_start, end, loop_top))
+ 	    {
+ 	      rtx_and_int ri = { p, i };
+ 	      for_each_rtx (&p, replace_loop_mem, &ri);
+ 	    }
+ 
+ 	  if (!apply_change_group ())
+ 	    /* We couldn't replace all occurrences of the MEM.  */
+ 	    loop_mems[i].optimize = 0;
+ 	  else
+ 	    {
+ 	      rtx set;
+ 
+ 	      /* Load the memory immediately before START, which is
+ 		 the NOTE_LOOP_BEG.  */
+ 	      set = gen_rtx_SET (GET_MODE (reg), reg, mem);
+ 	      emit_insn_before (set, start);
+ 
+ 	      if (written)
+ 		{
+ 		  if (label == NULL_RTX)
+ 		    {
+ 		      /* We must compute the former
+ 			 right-after-the-end label before we insert
+ 			 the new one.  */
+ 		      end_label = next_label (end);
+ 		      label = gen_label_rtx ();
+ 		      emit_label_after (label, end);
+ 		    }
+ 
+ 		  /* Store the memory immediately after END, which is
+ 		   the NOTE_LOOP_END.  */
+ 		  set = gen_rtx_SET (GET_MODE (reg), mem, reg); 
+ 		  emit_insn_after (set, label);
+ 		}
+ 	    }
+ 	}
+     }
+ 
+   if (label != NULL_RTX)
+     {
+       /* Now, we need to replace all references to the previous exit
+ 	 label with the new one.  */
+       rtx_pair rr = { end_label, label };
+ 
+       for (p = start; p != end; p = NEXT_INSN (p))
+ 	for_each_rtx (&p, replace_label, &rr);
+     }
+ }
+ 
+ /* Replace MEM with its associated pseudo register.  This function is
+    called from load_mems via for_each_rtx.  DATA is actually an
+    rtx_and_int* describing the instruction currently being scanned
+    and the MEM we are currently replacing.  */
+ 
+ static int
+ replace_loop_mem (mem, data)
+      rtx* mem;
+      void* data;
+ {
+   rtx_and_int* ri; 
+   rtx insn;
+   int i;
+   rtx m = *mem;
+ 
+   if (m == NULL_RTX)
+     return 0;
+ 
+   switch (GET_CODE (m))
+     {
+     case MEM:
+       break;
+ 
+     case CONST_DOUBLE:
+       /* We're not interested in the MEM associated with a
+ 	 CONST_DOUBLE, so there's no need to traverse into one.  */
+       return -1;
+ 
+     default:
+       /* This is not a MEM.  */
+       return 0;
+     }
+ 
+   ri = (rtx_and_int*) data;
+   i = ri->i;
+ 
+   if (!rtx_equal_p (loop_mems[i].mem, m))
+     /* This is not the MEM we are currently replacing.  */
+     return 0;
+ 
+   insn = ri->r;
+ 
+   /* Actually replace the MEM.  */
+   validate_change (insn, mem, loop_mems[i].reg, 1);
+ 
+   return 0;
+ }
+ 
+ /* Replace occurrences of the old exit label for the loop with the new
+    one.  DATA is an rtx_pair containing the old and new labels,
+    respectively.  */
+ 
+ static int
+ replace_label (x, data)
+      rtx* x;
+      void* data;
+ {
+   rtx l = *x;
+   rtx old_label = ((rtx_pair*) data)->r1;
+   rtx new_label = ((rtx_pair*) data)->r2;
+ 
+   if (l == NULL_RTX)
+     return 0;
+ 
+   if (GET_CODE (l) != LABEL_REF)
+     return 0;
+ 
+   if (XEXP (l, 0) != old_label)
+     return 0;
+   
+   XEXP (l, 0) = new_label;
+   ++LABEL_NUSES (new_label);
+   --LABEL_NUSES (old_label);
+ 
+   return 0;
+ }
+ 
+      


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]