
Array prefetch patch take 4


Here comes the updated patch.  Regarding the concerns about
rtx_equal_for_prefetch_p: I tried replacing it with rtx_equal_p, and that
appears to work reasonably well (i.e. you need to write relatively
obfuscated source to get it confused, but I am not sure how common this is
in practice.  XaoS does it).
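
To illustrate, here is a sketch (mine, not part of the patch) of the kind
of loop the pass targets: a dense, constant-stride array walk.

    /* Illustrative example only -- not part of the patch.  */
    #include <stddef.h>

    void
    axpy (double *a, const double *b, double c, size_t n)
    {
      size_t i;

      /* Dense, constant-stride accesses: a[i] and b[i] become DEST_ADDR
         givs with stride sizeof (double), so the pass can emit prefetches
         a few cache blocks ahead of the accesses.  a[i] is also stored
         to, so a read/write prefetch can be used on targets that support
         write prefetching.  */
      for (i = 0; i < n; i++)
        a[i] += c * b[i];
    }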

2001-11-06  Jan Hubicka  <jh@suse.cz>
	    Janis Johnson  <janis@us.ibm.com>

	* flags.h (flag_prefetch_loop_arrays): Declare.
	* loop.h (LOOP_PREFETCH): Define new constant.
	* loop.c (strength_reduce): Call emit_prefetch_instructions.
	(MAX_PREFETCHES, PREFETCH_BLOCKS_BEFORE_LOOP_MAX,
	PREFETCH_BLOCKS_BEFORE_LOOP_MIN, PREFETCH_BLOCKS_IN_LOOP_MIN): New
	constants.
	(check_store_data): New structure.
	(check_store, emit_prefetch_instructions, rtx_equal_for_prefetch_p):
	New functions.
	* toplev.c: Include insn-flags.h.
	(flag_prefetch_loop_arrays): New global variable.
	(lang_independent_option): Add -fprefetch-loop-arrays.
	(rest_of_compilation): Pass LOOP_PREFETCH when flag_prefetch_loop_arrays
	is set.
	(process_options): Warn if -fprefetch-loop-arrays is requested for a
	target without prefetch support.
	* Makefile.in (toplev.o): Depend on insn-flags.h.
	* doc/invoke.texi (-fprefetch-loop-arrays): Document.
Index: Makefile.in
===================================================================
RCS file: /cvs/gcc/egcs/gcc/Makefile.in,v
retrieving revision 1.805
diff -c -3 -p -r1.805 Makefile.in
*** Makefile.in	2001/12/11 13:13:01	1.805
--- Makefile.in	2001/12/12 08:59:14
*************** toplev.o : toplev.c $(CONFIG_H) $(SYSTEM
*** 1321,1327 ****
     dwarf2out.h sdbout.h dbxout.h $(EXPR_H) hard-reg-set.h $(BASIC_BLOCK_H) \
     graph.h $(LOOP_H) except.h $(REGS_H) $(TIMEVAR_H) $(lang_options_files) \
     ssa.h $(PARAMS_H) $(TM_P_H) reload.h dwarf2asm.h $(TARGET_H) halfpic.h \
!    langhooks.h
  	$(CC) $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
  	  -DTARGET_NAME=\"$(target_alias)\" \
  	  -c $(srcdir)/toplev.c $(OUTPUT_OPTION)
--- 1321,1327 ----
     dwarf2out.h sdbout.h dbxout.h $(EXPR_H) hard-reg-set.h $(BASIC_BLOCK_H) \
     graph.h $(LOOP_H) except.h $(REGS_H) $(TIMEVAR_H) $(lang_options_files) \
     ssa.h $(PARAMS_H) $(TM_P_H) reload.h dwarf2asm.h $(TARGET_H) halfpic.h \
!    langhooks.h insn-flags.h
  	$(CC) $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
  	  -DTARGET_NAME=\"$(target_alias)\" \
  	  -c $(srcdir)/toplev.c $(OUTPUT_OPTION)
Index: flags.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/flags.h,v
retrieving revision 1.73
diff -c -3 -p -r1.73 flags.h
*** flags.h	2001/12/06 11:49:09	1.73
--- flags.h	2001/12/12 08:59:14
*************** extern int flag_strength_reduce;
*** 259,264 ****
--- 259,268 ----
  
  extern int flag_unroll_loops;
  
+ /* Nonzero enables prefetch optimizations for arrays in loops.  */
+ 
+ extern int flag_prefetch_loop_arrays;
+ 
  /* Nonzero enables loop unrolling in unroll.c.  All loops are unrolled.
     This is generally not a win.  */
  
Index: loop.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/loop.c,v
retrieving revision 1.369
diff -c -3 -p -r1.369 loop.c
*** loop.c	2001/11/15 23:44:56	1.369
--- loop.c	2001/12/12 08:59:17
*************** Software Foundation, 59 Temple Place - S
*** 53,59 ****
--- 53,143 ----
  #include "except.h"
  #include "toplev.h"
  #include "predict.h"
+ #include "insn-flags.h"
  
+ /* Not really meaningful values, but at least something.  */
+ #ifndef SIMULTANEOUS_PREFETCHES
+ #define SIMULTANEOUS_PREFETCHES 3
+ #endif
+ #ifndef PREFETCH_BLOCK
+ #define PREFETCH_BLOCK 32
+ #endif
+ #ifndef HAVE_prefetch
+ #define HAVE_prefetch 0
+ #define gen_prefetch(a,b,c) (abort (), NULL_RTX)
+ #endif
+ 
+ /* Give up the prefetch optimizations once we exceed a given threshold.
+    It is unlikely that we would be able to optimize something in a loop
+    with so many detected prefetches.  */
+ #define MAX_PREFETCHES 100
+ /* The number of prefetch blocks that are beneficial to fetch at once before
+    a loop with a known (and low) iteration count.  */
+ #define PREFETCH_BLOCKS_BEFORE_LOOP_MAX  6
+ /* For very tiny loops it is not worthwhile to prefetch even before the loop,
+    since it is likely that the data are already in the cache.  */
+ #define PREFETCH_BLOCKS_BEFORE_LOOP_MIN  2
+ /* The minimum number of prefetch blocks that a loop must consume to make
+    emitting prefetch instructions in the loop body worthwhile.  */
+ #define PREFETCH_BLOCKS_IN_LOOP_MIN  6
+ 
+ /* Parameterize some prefetch heuristics so they can be turned on and off
+    easily for performance testing on new architectures.  These can be
+    defined in target-dependent files.  */
+ 
+ /* Prefetch is worthwhile only when loads/stores are dense.  */
+ #ifndef PREFETCH_ONLY_DENSE_MEM
+ #define PREFETCH_ONLY_DENSE_MEM 1
+ #endif
+ 
+ /* Define what we mean by "dense" loads and stores; This value divided by 256
+    is the minimum percentage of memory references that worth prefetching.  */
+ #ifndef PREFETCH_DENSE_MEM
+ #define PREFETCH_DENSE_MEM 220
+ #endif
+ 
+ /* Do not prefetch for a loop whose iteration count is known to be low.  */
+ #ifndef PREFETCH_NO_LOW_LOOPCNT
+ #define PREFETCH_NO_LOW_LOOPCNT 1
+ #endif
+ 
+ /* Define what we mean by a "low" iteration count.  */
+ #ifndef PREFETCH_LOW_LOOPCNT
+ #define PREFETCH_LOW_LOOPCNT 32
+ #endif
+ 
+ /* Do not prefetch for a loop that contains a function call; such a loop is
+    probably not an internal loop.  */
+ #ifndef PREFETCH_NO_CALL
+ #define PREFETCH_NO_CALL 1
+ #endif
+ 
+ /* Do not prefetch accesses with an extreme stride.  */
+ #ifndef PREFETCH_NO_EXTREME_STRIDE
+ #define PREFETCH_NO_EXTREME_STRIDE 1
+ #endif
+ 
+ /* Define what we mean by an "extreme" stride.  */
+ #ifndef PREFETCH_EXTREME_STRIDE
+ #define PREFETCH_EXTREME_STRIDE 4096
+ #endif
+ 
+ /* Do not handle reversed order prefetches (negative stride).  */
+ #ifndef PREFETCH_NO_REVERSE_ORDER
+ #define PREFETCH_NO_REVERSE_ORDER 1
+ #endif
+ 
+ /* Prefetch even if the GIV is not always executed.  */
+ #ifndef PREFETCH_NOT_ALWAYS
+ #define PREFETCH_NOT_ALWAYS 0
+ #endif
+ 
+ /* If the loop requires more prefetches than the target can process in
+    parallel then don't prefetch anything in that loop.  */
+ #ifndef PREFETCH_LIMIT_TO_SIMULTANEOUS
+ #define PREFETCH_LIMIT_TO_SIMULTANEOUS 1
+ #endif
+ 
  #define LOOP_REG_LIFETIME(LOOP, REGNO) \
  ((REGNO_LAST_LUID (REGNO) - REGNO_FIRST_LUID (REGNO)))
  
*************** static rtx loop_insn_sink_or_swim PARAMS
*** 262,267 ****
--- 346,352 ----
  
  static void loop_dump_aux PARAMS ((const struct loop *, FILE *, int));
  static void loop_delete_insns PARAMS ((rtx, rtx));
+ static int remove_constant_addition PARAMS ((rtx *));
  void debug_ivs PARAMS ((const struct loop *));
  void debug_iv_class PARAMS ((const struct iv_class *));
  void debug_biv PARAMS ((const struct induction *));
*************** loop_reg_used_before_p (loop, set, insn)
*** 3412,3417 ****
--- 3497,4005 ----
    return 0;
  }
  
+ 
+ /* Information we collect about arrays that we might want to prefetch.  */
+ struct prefetch_info
+ {
+   struct iv_class *class;	/* Class this prefetch is based on.  */
+   struct induction *giv;	/* GIV this prefetch is based on.  */
+   rtx base_address;		/* Start prefetching from this address plus
+ 				   index.  */
+   HOST_WIDE_INT index;
+   HOST_WIDE_INT stride;		/* Prefetch stride in bytes in each
+ 				   iteration.  */
+   unsigned int bytes_accessed;	/* Sum of sizes of all accesses to this
+ 				   prefetch area in one iteration.  */
+   unsigned int total_bytes;	/* Total bytes loop will access in this block.
+ 				   This is set only for loops with known
+ 				   iteration counts and is 0xffffffff
+ 				   otherwise.  */
+   unsigned int write : 1;	/* 1 for read/write prefetches.  */
+   unsigned int prefetch_in_loop : 1;
+   				/* 1 for those chosen for prefetching.  */
+   unsigned int prefetch_before_loop : 1;
+   				/* 1 for those chosen for prefetching.  */
+ };
+ 
+ /* Data used by check_store function.  */
+ struct check_store_data
+ {
+   rtx mem_address;
+   int mem_write;
+ };
+ 
+ static void check_store PARAMS ((rtx, rtx, void *));
+ static void emit_prefetch_instructions PARAMS ((struct loop *));
+ static int rtx_equal_for_prefetch_p PARAMS ((rtx, rtx));
+ 
+ /* Set mem_write when a store to mem_address is found.  Used as a callback
+    to note_stores.  */
+ static void
+ check_store (x, pat, data)
+      rtx x, pat ATTRIBUTE_UNUSED;
+      void *data;
+ {
+   struct check_store_data *d = (struct check_store_data *)data;
+ 
+   if ((GET_CODE (x) == MEM) && rtx_equal_p (d->mem_address, XEXP (x, 0)))
+     d->mem_write = 1;
+ }
+ 
+ /* Like rtx_equal_p, but attempts to swap commutative operands.  This is
+    important to get some addresses combined.  Later more sophisticated
+    transformations can be added when necessary.
+ 
+    ??? The same trick of swapping operands is done at several other places.
+    It would be nice to develop some common way to handle this.  */
+ 
+ static int
+ rtx_equal_for_prefetch_p (x, y)
+      rtx x, y;
+ {
+   int i;
+   int j;
+   enum rtx_code code = GET_CODE (x);
+   const char *fmt;
+ 
+   if (x == y)
+     return 1;
+   if (code != GET_CODE (y))
+     return 0;
+ 
+   if (GET_RTX_CLASS (code) == 'c')
+     {
+       return ((rtx_equal_for_prefetch_p (XEXP (x, 0), XEXP (y, 0))
+ 	       && rtx_equal_for_prefetch_p (XEXP (x, 1), XEXP (y, 1)))
+ 	      || (rtx_equal_for_prefetch_p (XEXP (x, 0), XEXP (y, 1))
+ 	          && rtx_equal_for_prefetch_p (XEXP (x, 1), XEXP (y, 0))));
+     }
+   /* Compare the elements.  If any pair of corresponding elements fails to
+      match, return 0 for the whole thing.  */
+ 
+   fmt = GET_RTX_FORMAT (code);
+   for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+     {
+       switch (fmt[i])
+ 	{
+ 	case 'w':
+ 	  if (XWINT (x, i) != XWINT (y, i))
+ 	    return 0;
+ 	  break;
+ 
+ 	case 'i':
+ 	  if (XINT (x, i) != XINT (y, i))
+ 	    return 0;
+ 	  break;
+ 
+ 	case 'E':
+ 	  /* Two vectors must have the same length.  */
+ 	  if (XVECLEN (x, i) != XVECLEN (y, i))
+ 	    return 0;
+ 
+ 	  /* And the corresponding elements must match.  */
+ 	  for (j = 0; j < XVECLEN (x, i); j++)
+ 	    if (rtx_equal_for_prefetch_p (XVECEXP (x, i, j),
+ 					  XVECEXP (y, i, j)) == 0)
+ 	      return 0;
+ 	  break;
+ 
+ 	case 'e':
+ 	  if (rtx_equal_for_prefetch_p (XEXP (x, i), XEXP (y, i)) == 0)
+ 	    return 0;
+ 	  break;
+ 
+ 	case 's':
+ 	  if (strcmp (XSTR (x, i), XSTR (y, i)))
+ 	    return 0;
+ 	  break;
+ 
+ 	case 'u':
+ 	  /* These are just backpointers, so they don't matter.  */
+ 	  break;
+ 
+ 	case '0':
+ 	  break;
+ 
+ 	  /* It is believed that rtx's at this level will never
+ 	     contain anything but integers and other rtx's,
+ 	     except for within LABEL_REFs and SYMBOL_REFs.  */
+ 	default:
+ 	  abort ();
+ 	}
+     }
+   return 1;
+ }
+ 
+ /* Remove constant addition value from the expression X (when present)
+    and return it.  */
+ static HOST_WIDE_INT
+ remove_constant_addition (x)
+    rtx *x;
+ {
+   HOST_WIDE_INT addval = 0;
+   rtx exp = *x;
+ 
+   if (GET_CODE (exp) == CONST)
+     exp = XEXP (exp, 0);
+   if (GET_CODE (exp) == CONST_INT)
+     {
+       addval = INTVAL (exp);
+       *x = const0_rtx;
+     }
+   /* For a PLUS expression, recurse on both operands.  */
+   else if (GET_CODE (exp) == PLUS)
+     {
+       addval += remove_constant_addition (&XEXP (exp, 0));
+       addval += remove_constant_addition (&XEXP (exp, 1));
+       /* If one of the operands was constant, remove the resulting zero
+          from the expression.  */
+       if (XEXP (exp, 0) == const0_rtx)
+         *x = XEXP (exp, 1);
+       else if (XEXP (exp, 1) == const0_rtx)
+         *x = XEXP (exp, 0);
+     }
+   return addval;
+ }
+ 
+ /* Attempt to identify accesses to arrays that are most likely to cause cache
+    misses, and emit prefetch instructions a few prefetch blocks forward.
+ 
+    To detect the arrays we use the GIV information that was collected by the
+    strength reduction pass.
+ 
+    The prefetch instructions are generated after the GIV information is
+    collected and before the strength reduction process.  The new GIVs are
+    injected into the strength reduction tables, so the prefetch addresses
+    are optimized as well.
+ 
+    GIVs are split into base address, stride, and constant addition values.
+    GIVs with the same address, stride, and close addition values are
+    combined into a single prefetch.  Writes to GIVs are also detected, so
+    that prefetch for write instructions can be used for the blocks we write
+    to, on machines that support write prefetches.
+ 
+    Several heuristics are used to determine when to prefetch.  They are
+    controlled by defined symbols that can be overridden for each target.  */
+ static void
+ emit_prefetch_instructions (loop)
+      struct loop *loop;
+ {
+   int num_prefetches = 0;
+   int num_real_prefetches = 0;
+   int num_real_write_prefetches = 0;
+   int ahead;
+   int i;
+   struct iv_class *bl;
+   struct induction *iv;
+   struct prefetch_info info[MAX_PREFETCHES];
+   struct loop_ivs *ivs = LOOP_IVS (loop);
+ 
+   if (!HAVE_prefetch)
+     return;
+ 
+   /* Consider only loops without calls.  When a call is made, the loop is
+      probably slow enough to read the memory without prefetching.  */
+   if (PREFETCH_NO_CALL && LOOP_INFO (loop)->has_call)
+     {
+       if (loop_dump_stream)
+ 	fprintf (loop_dump_stream, "Prefetch: ignoring loop - has call.\n");
+       return;
+     }
+ 
+   if (PREFETCH_NO_LOW_LOOPCNT
+       && LOOP_INFO (loop)->n_iterations
+       && LOOP_INFO (loop)->n_iterations <= PREFETCH_LOW_LOOPCNT)
+     {
+       if (loop_dump_stream)
+ 	fprintf (loop_dump_stream,
+ 		 "Prefetch: ignoring loop - not enought iterations.\n");
+       return;
+     }
+ 
+   /* Search all induction variables and pick those that are interesting
+      for the prefetch machinery.  */
+   for (bl = ivs->list; bl; bl = bl->next)
+     {
+       struct induction *biv = bl->biv, *biv1;
+       int basestride = 0;
+ 
+       biv1 = biv;
+       /* Expect all BIVs to be executed in each iteration.  This makes our
+ 	 analysis more conservative.  */
+       while (biv1)
+ 	{
+ 	  /* Discard non-constant additions that we can't handle well yet, and
+ 	     BIVs that are executed multiple times; such BIVs ought to be
+ 	     handled in the nested loop.  We accept not_every_iteration BIVs,
+ 	     since these only result in larger strides and make our
+ 	     heuristics more conservative.
+ 	     ??? What does the last sentence mean?  */
+ 
+ 	  if (GET_CODE (biv->add_val) != CONST_INT)
+ 	    {
+ 	      if (loop_dump_stream)
+ 		{
+ 		  fprintf (loop_dump_stream, "Prefetch: biv %i ignored: non-constant addition at insn %i:",
+ 			   REGNO (biv->src_reg), INSN_UID (biv->insn));
+ 		  print_rtl (loop_dump_stream, biv->add_val);
+ 		  fprintf (loop_dump_stream, "\n");
+ 		}
+ 	      break;
+ 	    }
+ 	  if (biv->maybe_multiple)
+ 	    {
+ 	      if (loop_dump_stream)
+ 		{
+ 		  fprintf (loop_dump_stream, "Prefetch: biv %i ignored: maybe_multiple at insn %i:",
+ 			   REGNO (biv->src_reg), INSN_UID (biv->insn));
+ 		  print_rtl (loop_dump_stream, biv->add_val);
+ 		  fprintf (loop_dump_stream, "\n");
+ 		}
+ 	      break;
+ 	    }
+ 	  basestride += INTVAL (biv1->add_val);
+ 	  biv1 = biv1->next_iv;
+ 	}
+       if (biv1 || !basestride)
+ 	continue;
+       for (iv = bl->giv; iv; iv = iv->next_iv)
+ 	{
+ 	  rtx address;
+ 	  rtx temp;
+ 	  HOST_WIDE_INT index = 0;
+ 	  int add = 1;
+ 	  HOST_WIDE_INT stride;
+ 	  struct check_store_data d;
+ 	  int size = GET_MODE_SIZE (GET_MODE (iv));
+ 
+ 	  /* There are several reasons why an induction variable is not
+ 	     interesting to us.  */
+ 	  if (iv->giv_type != DEST_ADDR
+ 	  /* We are interested only in constant stride memory references
+ 	     in order to be able to compute density easily.  */
+ 	      || GET_CODE (iv->mult_val) != CONST_INT
+ 	  /* Don't handle reversed order prefetches, since they are usually
+ 	     ineffective.  Later we may be able to reverse such BIVs.  */
+ 	      || (PREFETCH_NO_REVERSE_ORDER 
+ 		  && (stride = INTVAL (iv->mult_val) * basestride) < 0)
+ 	  /* Prefetching of accesses with such an extreme stride is probably
+ 	     not worthwhile, either.  */
+ 	      || (PREFETCH_NO_EXTREME_STRIDE
+ 		  && stride > PREFETCH_EXTREME_STRIDE)
+ 	  /* Ignore GIVs with varying add values; we can't predict the value
+ 	     for the next iteration.  */
+ 	      || !loop_invariant_p (loop, iv->add_val)
+ 	  /* Ignore GIVs in the nested loops; they ought to have been handled
+ 	     already.  */
+ 	      || iv->maybe_multiple)
+ 	    {
+ 	      if (loop_dump_stream)
+ 		{
+ 		  fprintf (loop_dump_stream, "Prefetch: Ignoring giv at %i\n",
+ 			   INSN_UID (iv->insn));
+ 		}
+ 	      continue;
+ 	    }
+ 
+ 	  /* Determine the pointer to the basic array we are examining.  It is
+ 	     the sum of the BIV's initial value and the GIV's add_val.  */
+ 	  index = 0;
+ 
+ 	  address = copy_rtx (iv->add_val);
+ 	  temp = copy_rtx (bl->initial_value);
+ 
+ 	  address = simplify_gen_binary (PLUS, Pmode, temp, address);
+ 	  index = remove_constant_addition (&address);
+ 
+ 	  index += size;
+ 	  d.mem_write = 0;
+ 	  d.mem_address = *iv->location;
+ 	  /* When the GIV is not always executed, we might be better off by
+ 	     not dirtying the cache pages.  */
+ 	  if (PREFETCH_NOT_ALWAYS || iv->always_executed)
+ 	    note_stores (PATTERN (iv->insn), check_store, &d);
+ 
+ 	  /* Attempt to find another prefetch to the same array and see if we
+ 	     can merge this one.  */
+ 	  for (i = 0; i < num_prefetches; i++)
+ 	    if (rtx_equal_for_prefetch_p (address, info[i].base_address)
+ 		&& stride == info[i].stride)
+ 	      {
+ 		/* If both access the same array (the same location, just
+ 		   with a small difference in the constant indexes), merge
+ 		   the prefetches.  Just issue the later one; the earlier
+ 		   one will get prefetched from the previous iteration.
+ 		   4096 is an artificial threshold: it should not be too
+ 		   small, but also not bigger than the small portion of
+ 		   memory usually traversed by a single loop.  */
+ 
+ 		if (index >= info[i].index && index - info[i].index < 4096)
+ 		  {
+ 		    info[i].write |= d.mem_write;
+ 		    info[i].bytes_accessed += size;
+ 		    info[i].index = index;
+ 		    info[i].giv = iv;
+ 		    info[i].class = bl;
+ 		    info[i].base_address = address;
+ 		    add = 0;
+ 		    break;
+ 		  }
+ 		if (index < info[i].index && info[i].index - index < 4096)
+ 		  {
+ 		    info[i].write |= d.mem_write;
+ 		    info[i].bytes_accessed += size;
+ 		    add = 0;
+ 		    break;
+ 		  }
+ 	      }
+ 	  /* Merging failed.  */
+ 	  if (add)
+ 	    {
+ 	      info[num_prefetches].giv = iv;
+ 	      info[num_prefetches].class = bl;
+ 	      info[num_prefetches].index = index;
+ 	      info[num_prefetches].stride = stride;
+ 	      info[num_prefetches].base_address = address;
+ 	      info[num_prefetches].write = d.mem_write;
+ 	      info[num_prefetches].bytes_accessed = size;
+ 	      num_prefetches++;
+ 	      if (num_prefetches >= MAX_PREFETCHES)
+ 		{
+ 		  if (loop_dump_stream)
+ 		    fprintf (loop_dump_stream,
+ 			     "Maximum number of prefetches exceeded.\n");
+ 		  return;
+ 		}
+ 	    }
+ 	}
+     }
+   for (i = 0; i < num_prefetches; i++)
+     {
+       /* Attempt to calculate the number of bytes fetched by the loop.
+ 	 Avoid overflow.  */
+       if (LOOP_INFO (loop)->n_iterations
+           && (0xffffffff / info[i].stride) >= LOOP_INFO (loop)->n_iterations)
+ 	info[i].total_bytes = info[i].stride * LOOP_INFO (loop)->n_iterations;
+       else
+ 	info[i].total_bytes = 0xffffffff;
+ 
+       /* Prefetch is worthwhile only when the loads/stores are dense.  */
+       if (PREFETCH_ONLY_DENSE_MEM
+ 	  && (info[i].bytes_accesed * 256 / info[i].stride > PREFETCH_DENSE_MEM)
+ 	  && (info[i].total_bytes / PREFETCH_BLOCK >=
+ 	      PREFETCH_BLOCKS_BEFORE_LOOP_MIN))
+ 	{
+ 	  info[i].prefetch_before_loop = 1;
+ 	  if (info[i].total_bytes / PREFETCH_BLOCK <=
+ 	      PREFETCH_BLOCKS_BEFORE_LOOP_MAX)
+ 	    info[i].prefetch_in_loop = 0;
+ 	  else
+ 	    info[i].prefetch_in_loop = 1;
+ 	}
+       else
+         info[i].prefetch_in_loop = 0, info[i].prefetch_before_loop = 0;
+ 
+       if (info[i].prefetch_in_loop)
+ 	{
+ 	  num_real_prefetches += ((info[i].stride + PREFETCH_BLOCK - 1)
+ 				  / PREFETCH_BLOCK);
+ 	  if (info[i].write)
+ 	    num_real_write_prefetches +=
+ 		((info[i].stride + PREFETCH_BLOCK - 1) / PREFETCH_BLOCK);
+ 	}
+     }
+   if (loop_dump_stream)
+     {
+       for (i = 0; i < num_prefetches; i++)
+ 	{
+ 	  fprintf (loop_dump_stream, "Prefetch insn %i address: ",
+ 		   INSN_UID (info[i].giv->insn));
+ 	  print_rtl (loop_dump_stream, info[i].base_address);
+ 	  fprintf (loop_dump_stream, " Index:%i stride:%i density:%i%% total_bytes: %u %s in loop:%s before:%s\n",
+ 		   info[i].index, info[i].stride,
+ 		   info[i].bytes_accessed * 100 / info[i].stride,
+ 		   info[i].total_bytes,
+ 		   info[i].write ? "read/write" : "read only",
+ 		   info[i].prefetch_in_loop ? "yes" : "no",
+ 		   info[i].prefetch_before_loop ? "yes" : "no");
+ 	}
+       fprintf (loop_dump_stream, "Real prefetches needed:%i (write:%i)\n",
+ 	       num_real_prefetches, num_real_write_prefetches);
+     }
+ 
+   if (!num_real_prefetches)
+     return;
+ 
+   ahead = SIMULTANEOUS_PREFETCHES / num_real_prefetches;
+ 
+   if (!ahead)
+     return;
+   for (i = 0; i < num_prefetches; i++)
+     {
+       if (info[i].prefetch_in_loop)
+ 	{
+ 	  int y;
+ 	  for (y = 0; y < ((info[i].stride + PREFETCH_BLOCK - 1)
+ 			   / PREFETCH_BLOCK); y++)
+ 	    {
+ 	      rtx loc = copy_rtx (*info[i].giv->location);
+ 	      rtx insn;
+ 	      int bytes_ahead = PREFETCH_BLOCK * (ahead + y);
+ 	      rtx before_insn = info[i].giv->insn;
+ 	      rtx prev_insn = PREV_INSN (info[i].giv->insn);
+ 
+ 	      /* We can save some effort by offsetting the address on
+ 		 architectures with offsettable memory references.  */
+ 	      if (offsettable_address_p (0, VOIDmode, loc))
+ 		loc = plus_constant (loc, bytes_ahead);
+ 	      else
+ 		{
+ 		  rtx reg = gen_reg_rtx (Pmode);
+ 		  loop_iv_add_mult_emit_before (loop, loc, const1_rtx,
+ 		      				GEN_INT (bytes_ahead), reg,
+ 				  		0, before_insn);
+ 		  loc = reg;
+ 		}
+ 
+ 	      emit_insn_before (gen_prefetch (loc, GEN_INT (info[i].write),
+ 		                              GEN_INT (3)), before_insn);
+ 
+ 	      /* Check all insns emitted and record the new GIV information.  */
+ 	      insn = NEXT_INSN (prev_insn);
+ 	      while (insn != before_insn)
+ 		{
+ 		  insn = check_insn_for_givs (loop, insn,
+ 					      info[i].giv->always_executed,
+ 					      info[i].giv->maybe_multiple);
+ 		  insn = NEXT_INSN (insn);
+ 		}
+ 	    }
+ 	}
+       if (info[i].prefetch_before_loop)
+ 	{
+ 	  int y;
+ 	  /* Emit INSNs before the loop to fetch the first cache lines.  */
+ 	  for (y = 0; ((!info[i].prefetch_in_loop || y < ahead)
+ 		       && y * PREFETCH_BLOCK < (int) info[i].total_bytes); y++)
+ 	    {
+ 	      rtx reg = gen_reg_rtx (Pmode);
+ 	      rtx loop_start = loop->start;
+ 	      rtx add_val = simplify_gen_binary (PLUS, Pmode,
+ 						 info[i].giv->add_val,
+ 						 GEN_INT (y * PREFETCH_BLOCK));
+ 	      loop_iv_add_mult_emit_before (loop, info[i].class->initial_value,
+ 					    info[i].giv->mult_val,
+ 				            add_val, reg, 0, loop_start);
+ 	      emit_insn_before (gen_prefetch (reg, GEN_INT (info[i].write),
+ 					      GEN_INT (3)), loop_start);
+ 	    }
+ 	}
+     }
+ }
+ 
  /* A "basic induction variable" or biv is a pseudo reg that is set
     (within this loop) only by incrementing or decrementing it.  */
  /* A "general induction variable" or giv is a pseudo reg whose
*************** strength_reduce (loop, flags)
*** 4297,4302 ****
--- 4885,4895 ----
       be called after all giv's have been identified, since otherwise it may
       fail if the iteration variable is a giv.  */
    loop_iterations (loop);
+ 
+ #ifdef HAVE_prefetch
+   if (flags & LOOP_PREFETCH)
+     emit_prefetch_instructions (loop);
+ #endif
  
    /* Now for each giv for which we still don't know whether or not it is
       replaceable, check to see if it is replaceable because its final value
Index: loop.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/loop.h,v
retrieving revision 1.56
diff -c -3 -p -r1.56 loop.h
*** loop.h	2001/10/29 22:13:40	1.56
--- loop.h	2001/12/12 08:59:17
*************** Software Foundation, 59 Temple Place - S
*** 27,32 ****
--- 27,33 ----
  /* Flags passed to loop_optimize.  */
  #define LOOP_UNROLL 1
  #define LOOP_BCT 2
+ #define LOOP_PREFETCH 4
  
  /* Get the loop info pointer of a loop.  */
  #define LOOP_INFO(LOOP) ((struct loop_info *) (LOOP)->aux)
Index: toplev.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/toplev.c,v
retrieving revision 1.554
diff -c -3 -p -r1.554 toplev.c
*** toplev.c	2001/12/06 11:49:10	1.554
--- toplev.c	2001/12/12 08:59:18
*************** Software Foundation, 59 Temple Place - S
*** 46,51 ****
--- 46,52 ----
  #include "flags.h"
  #include "insn-attr.h"
  #include "insn-config.h"
+ #include "insn-flags.h"
  #include "hard-reg-set.h"
  #include "recog.h"
  #include "output.h"
*************** int flag_unroll_loops;
*** 543,548 ****
--- 544,553 ----
  
  int flag_unroll_all_loops;
  
+ /* Nonzero enables prefetch optimizations for arrays in loops.  */
+ 
+ int flag_prefetch_loop_arrays;
+ 
  /* Nonzero forces all invariant computations in loops to be moved
     outside the loop.  */
  
*************** lang_independent_options f_options[] =
*** 1000,1005 ****
--- 1005,1012 ----
     N_("Perform loop unrolling when iteration count is known") },
    {"unroll-all-loops", &flag_unroll_all_loops, 1,
     N_("Perform loop unrolling for all loops") },
+   {"prefetch-loop-arrays", &flag_prefetch_loop_arrays, 1,
+    N_("Generate prefetch instructions, if available, for arrays in loops") },
    {"move-all-movables", &flag_move_all_movables, 1,
     N_("Force all loop invariant computations out of loops") },
    {"reduce-all-givs", &flag_reduce_all_givs, 1,
*************** rest_of_compilation (decl)
*** 2860,2866 ****
  	}
        cleanup_barriers ();
        loop_optimize (insns, rtl_dump_file,
! 		     (flag_unroll_loops ? LOOP_UNROLL : 0) | LOOP_BCT);
  
        close_dump_file (DFI_loop, print_rtl, insns);
        timevar_pop (TV_LOOP);
--- 2867,2874 ----
  	}
        cleanup_barriers ();
        loop_optimize (insns, rtl_dump_file,
! 		     (flag_unroll_loops ? LOOP_UNROLL : 0) | LOOP_BCT
! 		     | (flag_prefetch_loop_arrays ? LOOP_PREFETCH : 0));
  
        close_dump_file (DFI_loop, print_rtl, insns);
        timevar_pop (TV_LOOP);
*************** process_options ()
*** 4920,4925 ****
--- 4928,4947 ----
        warning ("-ffunction-sections disabled; it makes profiling impossible");
        flag_function_sections = 0;
      }
+ 
+ #ifndef HAVE_prefetch
+   if (flag_prefetch_loop_arrays)
+     {
+       warning ("-fprefetch-loop-arrays not supported for this target");
+       flag_prefetch_loop_arrays = 0;
+     }
+ #else
+   if (flag_prefetch_loop_arrays && !HAVE_prefetch)
+     {
+       warning ("-fprefetch-loop-arrays not supported for this target (try -march switches)");
+       flag_prefetch_loop_arrays = 0;
+     }
+ #endif
  
  #ifndef OBJECT_FORMAT_ELF
    if (flag_function_sections && write_symbols != NO_DEBUG)
Index: doc/invoke.texi
===================================================================
RCS file: /cvs/gcc/egcs/gcc/doc/invoke.texi,v
retrieving revision 1.88
diff -c -3 -p -r1.88 invoke.texi
*** invoke.texi	2001/12/10 03:27:49	1.88
--- invoke.texi	2001/12/12 08:59:29
*************** in the following sections.
*** 272,278 ****
  -fno-inline  -fno-math-errno  -fno-peephole  -fno-peephole2 @gol
  -funsafe-math-optimizations -fno-trapping-math @gol
  -fomit-frame-pointer  -foptimize-register-move @gol
! -foptimize-sibling-calls  -freduce-all-givs @gol
  -fregmove  -frename-registers @gol
  -frerun-cse-after-loop  -frerun-loop-opt @gol
  -fschedule-insns  -fschedule-insns2 @gol
--- 272,278 ----
  -fno-inline  -fno-math-errno  -fno-peephole  -fno-peephole2 @gol
  -funsafe-math-optimizations -fno-trapping-math @gol
  -fomit-frame-pointer  -foptimize-register-move @gol
! -foptimize-sibling-calls  -fprefetch-loop-arrays  -freduce-all-givs @gol
  -fregmove  -frename-registers @gol
  -frerun-cse-after-loop  -frerun-loop-opt @gol
  -fschedule-insns  -fschedule-insns2 @gol
*************** the loop is entered.  This usually makes
*** 3570,3575 ****
--- 3570,3579 ----
  @option{-funroll-all-loops} implies the same options as
  @option{-funroll-loops},
  
+ @item -fprefetch-loop-arrays
+ @opindex fprefetch-loop-arrays
+ If supported by the target machine, generate instructions to prefetch
+ memory to improve the performance of loops that access large arrays.
  
  @item -fmove-all-movables
  @opindex fmove-all-movables
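
To try it out, the option is enabled roughly as below (an example
invocation of mine; it assumes a target whose backend provides a prefetch
pattern, otherwise the process_options check above warns and clears the
flag, as its message suggests, possibly via an appropriate -march switch):

    gcc -O2 -fprefetch-loop-arrays -S loop.c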

