This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.



Prefetch infrastructure patch


Hi
Here are the bare bones of a prefetch implementation.  I've tried hard to keep the patch as
short as possible, so the heuristics are quite naive and there are some problems with the
strength reduction pass that I plan to fix in a separate patch (it needs a lot of fixing to
behave sanely, though).  Strength reduction is excellent at confusing itself in the second
loop pass: it creates multiple bivs for a single array and is unable to see this, so the
prefetch pass then emits too many prefetches.
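
To illustrate with a made-up example (not from the patch): in a loop like the one below,
both a[i] and a[i + 1] walk the same array with the same stride.  If the second loop pass
ends up with separate bivs for the two accesses, the prefetch pass sees two streams and
prefetches the same cache lines twice.

    /* Hypothetical test case; assumes strength reduction creates two
       independent bivs instead of recognizing a single array walk.  */
    double
    sum_pairs (const double *a, int n)
    {
      double s = 0.0;
      int i;

      for (i = 0; i < n - 1; i++)
        s += a[i] + a[i + 1];
      return s;
    }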

The patch also doesn't contain the unroller hinting code and some other controversial parts.
The prefetch insn is represented as an unspec; I would like to add a real opcode for it later
if that proves profitable.

It bootstraps with -fprefetch and generates sane code on trivial examples.  I've also
benchmarked the change in unrolling (to unroll in the first loop pass) and it seems to work
well: there are some speedups and some slowdowns, but overall it appears to be a small win.

Prefetch support for IA-32 is included.  It works for K6/Athlon using the 3DNow! prefetch
instruction and for PIII using the SSE prefetch instruction (which is a NOP on earlier PII
machines).  The 3DNow! support has the advantage of providing a prefetchw instruction for
blocks we write to.
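
As a rough sketch of the intended behaviour (a made-up test case, not compiler output): a
dense copy loop like the one below is the kind of candidate the pass looks for.  The src
stream should get an ordinary prefetch (prefetchnta on SSE targets) and the dst stream is
eligible for prefetchw on 3DNow! targets, a few prefetch blocks ahead of the current
iteration.

    /* Hypothetical test case; compile with -O2 -fprefetch on a 3DNow!- or
       SSE-capable target.  Both streams have constant stride and full
       density, so both should pass the heuristics.  */
    void
    copy_ints (int *dst, const int *src, int n)
    {
      int i;

      for (i = 0; i < n; i++)
        dst[i] = src[i];
    }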

Honza

Thu Apr 13 10:46:44 CEST 2000  Jan Hubicka  <jh@suse.cz>
	Add basic prefetch infrastructure:
	* loop.c (scan_loop, recombine_givs): Merge unroll_p and bct_p parameters
	into flags.
	(strength_reduce): Likewise; call emit_prefetch_instructions.
	(loop_optimize): Likewise.
	(MAX_PREFETCHES): New macro.
	(prefetch_info, check_store_data): New structures.
	(check_store, record_all_givs, emit_prefetch_instructions): New functions.
	* loop.h (LOOP_UNROLL, LOOP_PREFETCH, LOOP_BCT): New constants.
	* toplev.c (flag_prefetch): New global variable.
	(lang_independent_options): Add -fprefetch.
	(rest_of_compilation): Update calls to loop_optimize; do loop unrolling
	in the first pass and prefetch optimizations in the second pass.
	* flags.h (flag_prefetch): Declare.
	* rtl.h (loop_optimize): Update prototype.
	* invoke.texi (-fprefetch): Document.
	* md.texi (prefetch, prefetchw): Document.
	* tm.texi (PREFETCH_BLOCK, SIMULTATENOUS_PREFETCHES): Document.
	* i386.c (*_cost): Initialize prefetch_block and simultatenous_prefetches.
	(x86_3dNOW, x86_SSE): New global variables.
	(ix86_attr_length_default): Handle TYPE_PREFETCH.
	* i386.h (struct processor_costs): Add simultatenous_prefetches
	and prefetch_block.
	(ARCHMASK): New macro.
	(x86_3dNOW, x86_SSE): Declare.
	(TARGET_CMOVE): Use ARCHMASK.
	(TARGET_3DNOW, TARGET_SSE): New macros.
	(PREFETCH_BLOCK, SIMULTATENOUS_PREFETCHES): New macros.
	* i386.md (attr type): Add "prefetch".
	(attr length_opcode): Support prefetch type.
	(attr memory, k6_alu, athlon_ieu): Likewise.
	(prefetch expander): New.
	(prefetch and prefetchw patterns): New.


*** loop.c.old	Thu Apr 13 08:14:40 2000
--- loop.c	Thu Apr 13 11:06:10 2000
*************** static void count_loop_regs_set PARAMS (
*** 246,252 ****
  static void note_addr_stored PARAMS ((rtx, rtx, void *));
  static void note_set_pseudo_multiple_uses PARAMS ((rtx, rtx, void *));
  static int loop_reg_used_before_p PARAMS ((const struct loop *, rtx, rtx));
! static void scan_loop PARAMS ((struct loop*, int, int));
  #if 0
  static void replace_call_address PARAMS ((rtx, rtx, rtx));
  #endif
--- 246,252 ----
  static void note_addr_stored PARAMS ((rtx, rtx, void *));
  static void note_set_pseudo_multiple_uses PARAMS ((rtx, rtx, void *));
  static int loop_reg_used_before_p PARAMS ((const struct loop *, rtx, rtx));
! static void scan_loop PARAMS ((struct loop*, int));
  #if 0
  static void replace_call_address PARAMS ((rtx, rtx, rtx));
  #endif
*************** static void add_label_notes PARAMS ((rtx
*** 261,267 ****
  static void move_movables PARAMS ((struct loop *loop, struct movable *, 
  				   int, int, int));
  static int count_nonfixed_reads PARAMS ((const struct loop *, rtx));
! static void strength_reduce PARAMS ((struct loop *, int, int, int));
  static void find_single_use_in_loop PARAMS ((rtx, rtx, varray_type));
  static int valid_initial_value_p PARAMS ((rtx, rtx, int, rtx));
  static void find_mem_givs PARAMS ((const struct loop *, rtx, rtx, int, int));
--- 261,267 ----
  static void move_movables PARAMS ((struct loop *loop, struct movable *, 
  				   int, int, int));
  static int count_nonfixed_reads PARAMS ((const struct loop *, rtx));
! static void strength_reduce PARAMS ((struct loop *, int, int));
  static void find_single_use_in_loop PARAMS ((rtx, rtx, varray_type));
  static int valid_initial_value_p PARAMS ((rtx, rtx, int, rtx));
  static void find_mem_givs PARAMS ((const struct loop *, rtx, rtx, int, int));
*************** compute_luids (start, end, prev_luid)
*** 419,429 ****
     (or 0 if none should be output).  */
  
  void
! loop_optimize (f, dumpfile, unroll_p, bct_p)
       /* f is the first instruction of a chain of insns for one function */
       rtx f;
       FILE *dumpfile;
!      int unroll_p, bct_p;
  {
    register rtx insn;
    register int i;
--- 419,429 ----
     (or 0 if none should be output).  */
  
  void
! loop_optimize (f, dumpfile, flags)
       /* f is the first instruction of a chain of insns for one function */
       rtx f;
       FILE *dumpfile;
!      int flags;
  {
    register rtx insn;
    register int i;
*************** loop_optimize (f, dumpfile, unroll_p, bc
*** 529,535 ****
        struct loop *loop = &loops->array[i];
  
        if (! loop->invalid && loop->end)
! 	scan_loop (loop, unroll_p, bct_p);
      }
  
    /* If there were lexical blocks inside the loop, they have been
--- 529,535 ----
        struct loop *loop = &loops->array[i];
  
        if (! loop->invalid && loop->end)
! 	scan_loop (loop, flags);
      }
  
    /* If there were lexical blocks inside the loop, they have been
*************** next_insn_in_loop (loop, insn)
*** 588,596 ****
     write, then we can also mark the memory read as invariant.  */
  
  static void
! scan_loop (loop, unroll_p, bct_p)
       struct loop *loop;
!      int unroll_p, bct_p;
  {
    register int i;
    rtx loop_start = loop->start;
--- 588,596 ----
     write, then we can also mark the memory read as invariant.  */
  
  static void
! scan_loop (loop, flags)
       struct loop *loop;
!      int flags;
  {
    register int i;
    rtx loop_start = loop->start;
*************** scan_loop (loop, unroll_p, bct_p)
*** 1142,1148 ****
    if (flag_strength_reduce)
      {
        the_movables = movables;
!       strength_reduce (loop, insn_count, unroll_p, bct_p);
  
        reg_scan_update (update_start, update_end, loop_max_reg);
        loop_max_reg = max_reg_num ();
--- 1142,1148 ----
    if (flag_strength_reduce)
      {
        the_movables = movables;
!       strength_reduce (loop, insn_count, flags);
  
        reg_scan_update (update_start, update_end, loop_max_reg);
        loop_max_reg = max_reg_num ();
*************** static rtx addr_placeholder;
*** 3694,3699 ****
--- 3694,4079 ----
     was rerun in loop_optimize whenever a register was added or moved.
     Also, some of the optimizations could be a little less conservative.  */
  
+ #ifdef HAVE_prefetch
+ /* Give up on the prefetch optimizations once we exceed the given threshold.
+    It is unlikely we would be able to optimize anything in a loop with so many
+    detected prefetches.  */
+ 
+ #define MAX_PREFETCHES 10
+ 
+ /* Information we collect about arrays we may want to prefetch.  */
+ struct prefetch_info
+ {
+   struct iv_class *class;	/* Class this prefetch is based on.  */
+   struct induction *giv;	/* GIV this prefetch is based on.  */
+   rtx base_address;		/* Start prefetching from this address plus index.  */
+   int index;
+   int straddle;			/* Prefetch straddle in bytes in each iteration.  */
+   int write;			/* 1 for read/write prefetches.  */
+   int bytes_accesed;		/* Sum of sizes of all accesses to this prefetch area.  */
+   int do_prefetch;		/* 1 for those chosen for prefetching.  */
+ };
+ 
+ /* Data used by check_store function.  */
+ struct check_store_data
+ {
+   rtx mem_address;
+   int mem_write;
+ };
+ 
+ static void check_store PARAMS ((rtx, rtx, void *));
+ static void record_all_givs PARAMS ((struct loop *, rtx, rtx, rtx,
+ 				     struct prefetch_info *, rtx));
+ static void emit_prefetch_instructions PARAMS ((struct loop *));
+ 
+ /* Set mem_write when mem_address is found.  Used as callback to note_stores.  */
+ static void
+ check_store (x, pat, data)
+      rtx x, pat ATTRIBUTE_UNUSED;
+      void *data;
+ {
+   struct check_store_data *d = (struct check_store_data *)data;
+   /* If X is a MEM at the address we are tracking, record that it is
+      written to.  */
+ 
+ 
+   if ((GET_CODE (x) == MEM) && rtx_equal_p (d->mem_address, XEXP (x, 0)))
+     d->mem_write = 1;
+ }
+ 
+ /* Record all givs created by the prefetch expander: walk through the insn and
+    record each occurrence of the address in question.  */
+ static void
+ record_all_givs (loop, insn, pat, loc, info, add_val)
+      struct loop *loop;
+      struct prefetch_info *info;
+      rtx insn, pat, loc, add_val;
+ {
+   int i;
+   int j;
+   enum rtx_code code = GET_CODE (pat);
+   const char *fmt = GET_RTX_FORMAT (code);
+ 
+   for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+     {
+       if (fmt[i] == 'e' && XEXP (pat, i))
+ 	{
+ 	  if (rtx_equal_p (XEXP (pat, i), loc))
+ 	    {
+ 	      struct induction *v = oballoc (sizeof (struct induction));
+ 	      record_giv (loop, v, insn,
+ 			  info->giv->src_reg,
+ 			  addr_placeholder,
+ 			  info->giv->mult_val,
+ 			  add_val,
+ 			  1,
+ 			  DEST_ADDR,
+ 			  !info->giv->always_computable,
+ 			  info->giv->maybe_multiple,
+ 			  &XEXP (pat, 0));
+ 	      v->mem_mode = SImode;
+ 	    }
+ 	  else
+ 	    record_all_givs (loop, insn, XEXP (pat, i), loc, info, add_val);
+ 	}
+       else if (fmt[i] == 'E')
+ 	for (j = 0; j < XVECLEN (pat, i); j++)
+ 	  {
+ 	    if (rtx_equal_p (XVECEXP (pat, i, j), loc))
+ 	      {
+ 		struct induction *v = oballoc (sizeof (struct induction));
+ 		record_giv (loop, v, insn,
+ 			    info->giv->src_reg,
+ 			    addr_placeholder,
+ 			    info->giv->mult_val,
+ 			    add_val,
+ 			    1,
+ 			    DEST_ADDR,
+ 			    !info->giv->always_computable,
+ 			    info->giv->maybe_multiple,
+ 			    &XVECEXP (pat, i, j));
+ 		v->mem_mode = SImode;
+ 	      }
+ 	    else
+ 	      record_all_givs (loop, insn, XVECEXP (pat, i, j), loc, info, add_val);
+ 	  }
+     }
+ }
+ 
+ /* Attempt to identify accesses to arrays that are likely to cause cache
+    misses, and emit prefetch instructions a few prefetch blocks forward.
+ 
+    To detect the arrays we use the GIV information collected by the strength
+    reduction pass.
+ 
+    The prefetch instructions are generated after the GIV information has been
+    collected and before the strength reduction process.  The new GIVs are
+    injected into the strength reduction tables, so the prefetch addresses are
+    optimized as well.
+ 
+    GIVs are split into base address, straddle (stride) and constant addition
+    value.  GIVs with the same address, straddle and close addition values are
+    combined into a single prefetch.  Writes to GIVs are also detected, so
+    Athlon's prefetchw instruction can be used for blocks we write to.
+ 
+    Later we may want to implement smarter heuristics to detect cache misses.
+    Currently we do not prefetch when one of the following conditions holds:
+ 
+     1) The loop has a known (and low) iteration count.
+     2) The loop has more than MAX_PREFETCHES detected prefetches.
+     3) The loop contains a function call (such a loop is probably not an
+        inner loop).
+     4) The density of the accesses is less than 80%.
+     5) The straddle is larger than 4096 bytes, or negative.
+ */
+ static void
+ emit_prefetch_instructions (struct loop *loop)
+ {
+   int num_prefetches = 0;
+   int num_real_prefetches = 0;
+   int num_real_write_prefetches = 0;
+   int ahead;
+   int i;
+   struct iv_class *bl;
+   struct induction *iv;
+   struct prefetch_info info[MAX_PREFETCHES];
+ 
+   if (!HAVE_prefetch)
+     return;
+ 
+   /* Loops with few iterations can be handled by emitting prefetches before
+      the loop.  Handle this later.  */
+   if ((LOOP_INFO (loop)->n_iterations && LOOP_INFO (loop)->n_iterations < 1000)
+   /* Consider only loops without calls.  When a call is done, the loop is
+      probably slow enough to read the memory anyway.  */
+       || LOOP_INFO (loop)->has_call)
+     return;
+ 
+   /* Search all induction variables and pick those interesting to the prefetch machinery.  */
+   for (bl = loop_iv_list; bl; bl = bl->next)
+     {
+       struct induction *biv = bl->biv, *biv1;
+       int basestraddle = 0;
+ 
+       biv1 = biv;
+       /* Expect all bivs to be executed in each iteration.  This makes our
+ 	 analysis more conservative.  */
+       while (biv1)
+ 	{
+ 	  if (GET_CODE (biv1->add_val) != CONST_INT)
+ 	    break;
+ 	  basestraddle += INTVAL (biv1->add_val);
+ 	  biv1 = biv1->next_iv;
+ 	}
+       if (biv1)
+ 	continue;
+       for (iv = bl->giv; iv; iv = iv->next_iv)
+ 	{
+ 	  rtx address;
+ 	  rtx temp;
+ 	  int index = 0;
+ 	  int add = 1;
+ 	  int straddle;
+ 	  struct check_store_data d;
+ 
+ 	  /* Half a dozen reasons why this induction variable is not
+ 	     interesting to us.  */
+ 	  if (iv->giv_type != DEST_ADDR
+ 	  /* We are interested only in constant straddle memory references
+ 	     in order to be able to compute the density easily.  */
+ 	      || GET_CODE (iv->mult_val) != CONST_INT
+ 	  /* Don't handle reversed order prefetches, since they are usually
+ 	     ineffective.  Later we may be able to reverse such bivs.  */
+ 	      || (straddle = INTVAL (iv->mult_val) * basestraddle) < 0
+ 	  /* Prefetching of accesses with such an extreme straddle is probably
+ 	     not worthwhile either.  */
+ 	      || straddle > 4096)
+ 	    continue;
+ 
+ 	  /* Determine a pointer to the basic array we are examining.  Usually we will
+ 	     be faced with constructs like (plus:SI (array_pointer) (const_int disp)).  */
+ 	  address = iv->add_val;
+ 	  address = simplify_gen_binary (PLUS, Pmode, bl->initial_value, address);
+ 	  /* Try hard to convert the address to something we understand.
+ 	     The real base address may be well hidden.  */
+ 	  address = canon_rtx (address);
+ 	  temp = simplify_rtx (address);
+ 	  if (temp)
+ 	    address = temp;
+ 
+ 	  if (GET_CODE (address) == CONST)
+ 	    address = XEXP (address, 0);
+ 	  if (GET_CODE (address) == PLUS
+ 	      && GET_CODE (XEXP (address, 1)) == CONST_INT)
+ 	    {
+ 	      index = INTVAL (XEXP (address, 1));
+ 	      address = XEXP (address, 0);
+ 	    }
+ 	  if (GET_CODE (address) == CONST_INT)
+ 	    {
+ 	      index = INTVAL (address);
+ 	      address = const0_rtx;
+ 	    }
+ 	  index += GET_MODE_SIZE (iv->mem_mode);
+ 	  d.mem_write = 0;
+ 	  d.mem_address = *iv->location;
+ 	  /* When the giv is not always executed, we may be better off not
+ 	     dirtying the cache pages.  */
+ 	  if (iv->always_executed)
+ 	    note_stores (PATTERN (iv->insn), check_store, &d);
+ 
+ 	  /* Attempt to find a prefetch to the same array and see if we can merge this one into it.  */
+ 	  for (i = 0; i < num_prefetches; i++)
+ 	    if (rtx_equal_p (address, info[i].base_address)
+ 		&& straddle == info[i].straddle)
+ 	      {
+ 		if (index >= info[i].index && index - info[i].index < 4096)
+ 		  {
+ 		    info[i].write |= d.mem_write;
+ 		    info[i].bytes_accesed += GET_MODE_SIZE (iv->mem_mode);
+ 		    info[i].index = index;
+ 		    info[i].giv = iv;
+ 		    info[i].class = bl;
+ 		    info[num_prefetches].base_address = address;
+ 		    add = 0;
+ 		    break;
+ 		  }
+ 		if (index < info[i].index && info[i].index - index < 4096)
+ 		  {
+ 		    info[i].write |= d.mem_write;
+ 		    info[i].bytes_accesed += GET_MODE_SIZE (iv->mem_mode);
+ 		    add = 0;
+ 		    break;
+ 		  }
+ 	      }
+ 	  /* Merging failed.  */
+ 	  if (add)
+ 	    {
+ 	      info[num_prefetches].giv = iv;
+ 	      info[num_prefetches].class = bl;
+ 	      info[num_prefetches].index = index;
+ 	      info[num_prefetches].straddle = straddle;
+ 	      info[num_prefetches].base_address = address;
+ 	      info[num_prefetches].write = d.mem_write;
+ 	      info[num_prefetches].bytes_accesed = GET_MODE_SIZE (iv->mem_mode);
+ 	      num_prefetches++;
+ 	      if (num_prefetches >= MAX_PREFETCHES)
+ 		{
+ 		  if (loop_dump_stream)
+ 		    fprintf (loop_dump_stream, "Maximum number of prefetches exceeded.\n");
+ 		  return;
+ 		}
+ 	    }
+ 	}
+     }
+   for (i = 0; i < num_prefetches; i++)
+     {
+       /* Prefetch is worthwhile only when the reads/writes are dense.  */
+       if (info[i].bytes_accesed * 256 / info[i].straddle > 220)
+ 	info[i].do_prefetch = 1;
+       else
+ 	info[i].do_prefetch = 0;
+     }
+   for (i = 0; i < num_prefetches; i++)
+     {
+       if (info[i].do_prefetch)
+       {
+ 	num_real_prefetches += ((info[i].straddle + PREFETCH_BLOCK - 1)
+ 				/ PREFETCH_BLOCK);
+ 	if (info[i].write)
+ 	  num_real_write_prefetches += ((info[i].straddle + PREFETCH_BLOCK - 1)
+ 					/ PREFETCH_BLOCK);
+       }
+     }
+   if (loop_dump_stream)
+     {
+       for (i = 0; i < num_prefetches; i++)
+ 	{
+ 	  fprintf (loop_dump_stream, "Prefetch insn %i address: ",
+ 		   INSN_UID (info[i].giv->insn));
+ 	  print_rtl (loop_dump_stream, info[i].base_address);
+ 	  fprintf (loop_dump_stream, " Index:%i straddle:%i density:%i%% %s %s\n",
+ 		   info[i].index, info[i].straddle,
+ 		   info[i].bytes_accesed * 100 / info[i].straddle,
+ 		   info[i].write ? "read/write" : "read only",
+ 		   info[i].do_prefetch ? "prefetch" : "ignore");
+ 	}
+       fprintf (loop_dump_stream, "Real prefetches needed:%i (write:%i)\n",
+ 	       num_real_prefetches, num_real_write_prefetches);
+     }
+ 
+   if (!num_real_prefetches)
+     return;
+ 
+   ahead = (SIMULTATENOUS_PREFETCHES / (num_real_prefetches));
+ 
+   if (!ahead)
+     return;
+   for (i = 0; i < num_prefetches; i++)
+     {
+       if (info[i].do_prefetch)
+ 	{
+ 	  int y;
+ 	  for (y = 0; y < ((info[i].straddle + PREFETCH_BLOCK - 1)
+ 			   / PREFETCH_BLOCK); y++)
+ 	    {
+ 	      rtx loc = copy_rtx (*info[i].giv->location);
+ 	      rtx insn;
+ 	      int bytes_ahead = PREFETCH_BLOCK * (ahead + y);
+ 	      rtx add_val = simplify_gen_binary (PLUS, Pmode, info[i].giv->add_val,
+ 						 GEN_INT (bytes_ahead));
+ 	      rtx before_insn = info[i].giv->insn;
+ 
+ 	      /* We can save some effort by offsetting the address on architectures with
+ 		 offsettable memory references.  */
+ 	      if (offsettable_address_p (0, SImode, loc))
+ 	 	{
+ 		  loc = gen_rtx_MEM (SImode, loc);
+ 		  loc = adj_offsettable_operand (loc, bytes_ahead);
+ 		  loc = XEXP (loc, 0);
+ 		}
+ 	      else
+ 		{
+ 		  rtx reg = gen_reg_rtx (Pmode);
+ 		  emit_iv_add_mult (loc, const1_rtx, GEN_INT (bytes_ahead), reg,
+ 				    before_insn);
+ 		  loc = reg;
+ 		}
+ 
+ #ifdef HAVE_prefetchw
+ 	      if (info[i].write && HAVE_prefetchw)
+ 		insn = emit_insn_before (gen_prefetchw (loc), before_insn);
+ 	      else
+ #endif
+ 		insn = emit_insn_before (gen_prefetch (loc), before_insn);
+ 
+ 	      record_all_givs (loop, insn, PATTERN(insn), loc, &info[i], add_val);
+ 
+ 	    }
+ 	  /* Emit insns before the loop to fetch the first cache lines.  */
+ 	  for (y = 0; y < ahead ; y ++)
+ 	    {
+ 	      rtx reg = gen_reg_rtx (Pmode);
+ 	      rtx insn;
+ 	      rtx loop_start = loop->start;
+ 	      rtx add_val = simplify_gen_binary (PLUS, Pmode, info[i].giv->add_val,
+ 						 GEN_INT (y * PREFETCH_BLOCK));
+ 	      emit_iv_add_mult (info[i].class->initial_value, info[i].giv->mult_val,
+ 				info[i].giv->add_val, reg, loop_start);
+ #ifdef HAVE_prefetchw
+ 	      if (info[i].write && HAVE_prefetchw)
+ 		insn = emit_insn_before (gen_prefetchw (reg), loop_start);
+ 	      else
+ #endif
+ 		insn = emit_insn_before (gen_prefetch (reg), loop_start);
+ 	      record_all_givs (loop, insn, PATTERN(insn), reg, &info[i], add_val);
+ 	    }
+ 	}
+     }
+   return;
+ }
+ #endif
+ 
  /* Perform strength reduction and induction variable elimination.  
  
     Pseudo registers created during this function will be beyond the last
*************** static rtx addr_placeholder;
*** 3703,3712 ****
     But scan_loop must check regnos to make sure they are in bounds.   */
  
  static void
! strength_reduce (loop, insn_count, unroll_p, bct_p)
       struct loop *loop;
       int insn_count;
!      int unroll_p, bct_p ATTRIBUTE_UNUSED;
  {
    rtx p;
    rtx set;
--- 4083,4092 ----
     But scan_loop must check regnos to make sure they are in bounds.   */
  
  static void
! strength_reduce (loop, insn_count, flags)
       struct loop *loop;
       int insn_count;
!      int flags ATTRIBUTE_UNUSED;
  {
    rtx p;
    rtx set;
*************** strength_reduce (loop, insn_count, unrol
*** 3979,3985 ****
      {
        /* Can still unroll the loop anyways, but indicate that there is no
  	 strength reduction info available.  */
!       if (unroll_p)
  	unroll_loop (loop, insn_count, end_insert_before, 0);
  
        goto egress;
--- 4359,4365 ----
      {
        /* Can still unroll the loop anyways, but indicate that there is no
  	 strength reduction info available.  */
!       if (flags & LOOP_UNROLL)
  	unroll_loop (loop, insn_count, end_insert_before, 0);
  
        goto egress;
*************** strength_reduce (loop, insn_count, unrol
*** 4635,4640 ****
--- 5023,5033 ----
  
    loop_iterations (loop);
  
+ #ifdef HAVE_prefetch
+   if (flags & LOOP_PREFETCH)
+     emit_prefetch_instructions (loop);
+ #endif
+ 
    /* Now for each giv for which we still don't know whether or not it is
       replaceable, check to see if it is replaceable because its final value
       can be calculated.  This must be done after loop_iterations is called,
*************** strength_reduce (loop, insn_count, unrol
*** 4865,4871 ****
  	  VARRAY_GROW (reg_iv_type, nregs);
  	  VARRAY_GROW (reg_iv_info, nregs);
  	}
!       recombine_givs (loop, bl, unroll_p);
  
        /* Reduce each giv that we decided to reduce.  */
  
--- 5258,5264 ----
  	  VARRAY_GROW (reg_iv_type, nregs);
  	  VARRAY_GROW (reg_iv_info, nregs);
  	}
!       recombine_givs (loop, bl, flags);
  
        /* Reduce each giv that we decided to reduce.  */
  
*************** strength_reduce (loop, insn_count, unrol
*** 5259,5272 ****
       induction variable information that strength_reduce has already
       collected.  Always unroll loops that would be as small or smaller
       unrolled than when rolled.  */
!   if (unroll_p
        || (loop_info->n_iterations > 0
  	  && unrolled_insn_copies <= insn_count))
      unroll_loop (loop, insn_count, end_insert_before, 1);
  
  #ifdef HAVE_decrement_and_branch_on_count
    /* Instrument the loop with BCT insn.  */
!   if (HAVE_decrement_and_branch_on_count && bct_p
        && flag_branch_on_count_reg)
      insert_bct (loop);
  #endif  /* HAVE_decrement_and_branch_on_count */
--- 5652,5665 ----
       induction variable information that strength_reduce has already
       collected.  Always unroll loops that would be as small or smaller
       unrolled than when rolled.  */
!   if ((flags & LOOP_UNROLL)
        || (loop_info->n_iterations > 0
  	  && unrolled_insn_copies <= insn_count))
      unroll_loop (loop, insn_count, end_insert_before, 1);
  
  #ifdef HAVE_decrement_and_branch_on_count
    /* Instrument the loop with BCT insn.  */
!   if (HAVE_decrement_and_branch_on_count && (flags & LOOP_BCT)
        && flag_branch_on_count_reg)
      insert_bct (loop);
  #endif  /* HAVE_decrement_and_branch_on_count */
*************** find_life_end (x, stats, insn, biv)
*** 7395,7404 ****
     This tends to shorten giv lifetimes, and helps the next step:
     try to derive givs from other givs.  */
  static void
! recombine_givs (loop, bl, unroll_p)
       const struct loop *loop;
       struct iv_class *bl;
!      int unroll_p;
  {
    struct induction *v, **giv_array, *last_giv;
    struct recombine_givs_stats *stats;
--- 7800,7809 ----
     This tends to shorten giv lifetimes, and helps the next step:
     try to derive givs from other givs.  */
  static void
! recombine_givs (loop, bl, flags)
       const struct loop *loop;
       struct iv_class *bl;
!      int flags;
  {
    struct induction *v, **giv_array, *last_giv;
    struct recombine_givs_stats *stats;
*************** recombine_givs (loop, bl, unroll_p)
*** 7684,7690 ****
  	      && ((GET_CODE (sum) == PLUS
  		   && GET_CODE (XEXP (sum, 0)) == REG
  		   && GET_CODE (XEXP (sum, 1)) == CONST_INT)
! 		  || ! unroll_p)
  	      && validate_change (v->insn, &PATTERN (v->insn),
  				  gen_rtx_SET (VOIDmode, v->dest_reg, sum), 0))
  	    {
--- 8089,8095 ----
  	      && ((GET_CODE (sum) == PLUS
  		   && GET_CODE (XEXP (sum, 0)) == REG
  		   && GET_CODE (XEXP (sum, 1)) == CONST_INT)
! 		  || ! (flags & LOOP_UNROLL))
  	      && validate_change (v->insn, &PATTERN (v->insn),
  				  gen_rtx_SET (VOIDmode, v->dest_reg, sum), 0))
  	    {
*** ./loop.h.old	Thu Apr 13 08:14:43 2000
--- loop.h	Thu Apr 13 08:47:40 2000
*************** Boston, MA 02111-1307, USA.  */
*** 20,25 ****
--- 20,29 ----
  
  #include "varray.h"
  #include "basic-block.h"
+ /* Flags passed to loop_optimize.  */
+ #define LOOP_UNROLL 1
+ #define LOOP_PREFETCH 2
+ #define LOOP_BCT 4
  
  /* Get the loop info pointer of a loop.  */
  #define LOOP_INFO(LOOP) ((struct loop_info *) (LOOP)->aux) 
*** ./toplev.c.old	Thu Apr 13 08:46:15 2000
--- toplev.c	Thu Apr 13 10:33:27 2000
*************** int flag_unroll_loops;
*** 531,536 ****
--- 531,540 ----
  
  int flag_unroll_all_loops;
  
+ /* Nonzero enables prefetch optimizations.  */
+ 
+ int flag_prefetch;
+ 
  /* Nonzero forces all invariant computations in loops to be moved
     outside the loop. */
  
*************** lang_independent_options f_options[] =
*** 946,951 ****
--- 950,957 ----
     "Perform loop unrolling when iteration count is known" },
    {"unroll-all-loops", &flag_unroll_all_loops, 1,
     "Perform loop unrolling for all loops" },
+   {"prefetch", &flag_prefetch, 1,
+    "Output prefetch instructions to increase memory bandwidth when available" },
    {"move-all-movables", &flag_move_all_movables, 1,
     "Force all loop invariant computations out of loops" },
    {"reduce-all-givs", &flag_reduce_all_givs, 1,
*************** rest_of_compilation (decl)
*** 3120,3126 ****
  	     {
  	       /* We only want to perform unrolling once.  */
  	       
! 	       loop_optimize (insns, rtl_dump_file, 0, 0);
  
  	       /* The first call to loop_optimize makes some instructions
  		  trivially dead.  We delete those instructions now in the
--- 3126,3133 ----
  	     {
  	       /* We only want to perform unrolling once.  */
  	       
! 	       loop_optimize (insns, rtl_dump_file,
! 			      (flag_unroll_loops ? LOOP_UNROLL : 0));
  
  	       /* The first call to loop_optimize makes some instructions
  		  trivially dead.  We delete those instructions now in the
*************** rest_of_compilation (decl)
*** 3132,3138 ****
  		  analysis code depends on this information.  */
  	       reg_scan (insns, max_reg_num (), 1);
  	     }
! 	   loop_optimize (insns, rtl_dump_file, flag_unroll_loops, 1);
  	 });
  
        close_dump_file (DFI_loop, print_rtl, insns);
--- 3139,3149 ----
  		  analysis code depends on this information.  */
  	       reg_scan (insns, max_reg_num (), 1);
  	     }
! 	   loop_optimize (insns, rtl_dump_file,
! 			  (LOOP_BCT
! 			   | (flag_unroll_loops && ! flag_rerun_loop_opt
! 			      ? LOOP_UNROLL : 0)
! 			   | (flag_prefetch ? LOOP_PREFETCH : 0)));
  	 });
  
        close_dump_file (DFI_loop, print_rtl, insns);
*** ./flags.h.old	Thu Apr 13 08:47:01 2000
--- flags.h	Thu Apr 13 08:47:24 2000
*************** extern int flag_strength_reduce;
*** 226,231 ****
--- 226,234 ----
  
  extern int flag_unroll_loops;
  
+ /* Nonzero enables prefetch optimizations.  */
+ extern int flag_prefetch;
+ 
  /* Nonzero enables loop unrolling in unroll.c.  All loops are unrolled.
     This is generally not a win.  */
  
*** ./rtl.h.old	Thu Apr 13 09:07:50 2000
--- rtl.h	Thu Apr 13 09:07:58 2000
*************** extern void print_inline_rtx		PARAMS ((F
*** 1582,1588 ****
  extern void init_loop			PARAMS ((void));
  extern rtx libcall_other_reg		PARAMS ((rtx, rtx));
  #ifdef BUFSIZ
! extern void loop_optimize		PARAMS ((rtx, FILE *, int, int));
  #endif
  extern void record_excess_regs		PARAMS ((rtx, rtx, rtx *));
  
--- 1582,1588 ----
  extern void init_loop			PARAMS ((void));
  extern rtx libcall_other_reg		PARAMS ((rtx, rtx));
  #ifdef BUFSIZ
! extern void loop_optimize		PARAMS ((rtx, FILE *, int));
  #endif
  extern void record_excess_regs		PARAMS ((rtx, rtx, rtx *));
  
*** ./invoke.texi.old	Thu Apr 13 09:11:33 2000
--- invoke.texi	Thu Apr 13 09:13:33 2000
*************** in the following sections.
*** 168,174 ****
  -fregmove -frerun-cse-after-loop  -frerun-loop-opt  -freduce-all-givs
  -fschedule-insns  -fschedule-insns2  -fstrength-reduce
  -fstrict-aliasing  -fthread-jumps  -funroll-all-loops
! -funroll-loops
  -O  -O0  -O1  -O2  -O3 -Os
  @end smallexample
  
--- 168,174 ----
  -fregmove -frerun-cse-after-loop  -frerun-loop-opt  -freduce-all-givs
  -fschedule-insns  -fschedule-insns2  -fstrength-reduce
  -fstrict-aliasing  -fthread-jumps  -funroll-all-loops
! -funroll-loops -fprefetch
  -O  -O0  -O1  -O2  -O3 -Os
  @end smallexample
  
*************** whose number of iterations can be determ
*** 2681,2686 ****
--- 2681,2690 ----
  Perform the optimization of loop unrolling.  This is done for all loops
  and usually makes programs run more slowly.  @samp{-funroll-all-loops}
  implies @samp{-fstrength-reduce} as well as @samp{-frerun-cse-after-loop}.
+ 
+ @item -fprefetch
+ Output prefetch instructions.  This may improve the performance of loops
+ accessing large arrays.  Only some architectures support such instructions.
  
  @item -fmove-all-movables
  Forces all invariant computations in loops to be moved
*** ./md.texi.old	Thu Apr 13 09:13:44 2000
--- md.texi	Thu Apr 13 09:19:24 2000
*************** into consecutive memory locations.  Oper
*** 1947,1952 ****
--- 1947,1963 ----
  consecutive memory locations, operand 1 is the first register, and
  operand 2 is a constant: the number of consecutive registers.
  
+ @cindex @samp{prefetch} instruction pattern
+ @item @samp{prefetch}
+ Operand 0 is the address to prefetch.  This pattern should emit an instruction
+ that reads the addressed memory into the cache.  The instruction must not trap
+ when the address is invalid.  When this pattern is defined, the @code{PREFETCH_BLOCK}
+ and @code{SIMULTATENOUS_PREFETCHES} target macros must be defined too.
+ 
+ @cindex @samp{prefetchw} instruction pattern
+ @item @samp{prefetchw}
+ Similar to @samp{prefetch}, but gives the CPU a hint that the cache line will be modified.
+ 
  @cindex @code{add@var{m}3} instruction pattern
  @item @samp{add@var{m}3}
  Add operand 2 and operand 1, storing the result in operand 0.  All operands
*** ./tm.texi.old	Thu Apr 13 09:19:43 2000
--- tm.texi	Thu Apr 13 09:22:48 2000
*************** compatible with code compiled using the 
*** 1124,1129 ****
--- 1124,1139 ----
  If you are writing a new port, define @code{DEFAULT_VTABLE_THUNKS} to 1.
  
  If you do not define this macro, the default for @samp{-fvtable-thunk} is 0.
+ 
+ @findex PREFETCH_BLOCK
+ @item PREFETCH_BLOCK
+ Size in bytes of the block brought into the cache by a single prefetch
+ operation.  Define this macro if and only if the @samp{prefetch} pattern exists.
+ 
+ @findex SIMULTATENOUS_PREFETCHES
+ @item SIMULTATENOUS_PREFETCHES
+ Number of prefetch operations the CPU can issue in parallel.
+ Define this macro if and only if the @samp{prefetch} pattern exists.
  @end table
  
  @node Type Layout
*** config/i386/i386.c.old	Thu Apr 13 08:15:06 2000
--- config/i386/i386.c	Thu Apr 13 09:15:42 2000
*************** struct processor_costs i386_cost = {	/* 
*** 74,80 ****
    2,					/* cost of reg,reg fld/fst */
    {8, 8, 8},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {8, 8, 8}				/* cost of loading integer registers */
  };
  
  struct processor_costs i486_cost = {	/* 486 specific costs */
--- 74,82 ----
    2,					/* cost of reg,reg fld/fst */
    {8, 8, 8},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {8, 8, 8},				/* cost of loading integer registers */
!   0,					/* size of prefetch block */
!   0,					/* number of prefetches doable in parallel */
  };
  
  struct processor_costs i486_cost = {	/* 486 specific costs */
*************** struct processor_costs i486_cost = {	/* 
*** 95,101 ****
    2,					/* cost of reg,reg fld/fst */
    {8, 8, 8},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {8, 8, 8}				/* cost of loading integer registers */
  };
  
  struct processor_costs pentium_cost = {
--- 97,105 ----
    2,					/* cost of reg,reg fld/fst */
    {8, 8, 8},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {8, 8, 8},				/* cost of loading integer registers */
!   0,					/* size of prefetch block */
!   0,					/* number of prefetches doable in parallel */
  };
  
  struct processor_costs pentium_cost = {
*************** struct processor_costs pentium_cost = {
*** 116,122 ****
    2,					/* cost of reg,reg fld/fst */
    {2, 2, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 6}				/* cost of loading integer registers */
  };
  
  struct processor_costs pentiumpro_cost = {
--- 120,128 ----
    2,					/* cost of reg,reg fld/fst */
    {2, 2, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 6},				/* cost of loading integer registers */
!   0,					/* size of prefetch block */
!   0,					/* number of prefetches doable in parallel */
  };
  
  struct processor_costs pentiumpro_cost = {
*************** struct processor_costs pentiumpro_cost =
*** 137,143 ****
    2,					/* cost of reg,reg fld/fst */
    {2, 2, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 6}				/* cost of loading integer registers */
  };
  
  struct processor_costs k6_cost = {
--- 143,154 ----
    2,					/* cost of reg,reg fld/fst */
    {2, 2, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 6},				/* cost of loading integer registers */
!   32,					/* size of prefetch block */
!   6					/* number of prefetches doable in
! 					   parallel */
! 					/* ??? Guess, only most recent PPRO
! 					   family CPUs do non-NOP prefetch.  */
  };
  
  struct processor_costs k6_cost = {
*************** struct processor_costs k6_cost = {
*** 158,164 ****
    4,					/* cost of reg,reg fld/fst */
    {6, 6, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 4}				/* cost of loading integer registers */
  };
  
  struct processor_costs athlon_cost = {
--- 169,178 ----
    4,					/* cost of reg,reg fld/fst */
    {6, 6, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 4},				/* cost of loading integer registers */
!   32,					/* size of prefetch block */
!   1					/* number of prefetches doable in
! 					   parallel */
  };
  
  struct processor_costs athlon_cost = {
*************** struct processor_costs athlon_cost = {
*** 179,185 ****
    4,					/* cost of reg,reg fld/fst */
    {6, 6, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 4}				/* cost of loading integer registers */
  };
  
  struct processor_costs *ix86_cost = &pentium_cost;
--- 193,202 ----
    4,					/* cost of reg,reg fld/fst */
    {6, 6, 6},				/* cost of loading fp registers
  					   in SFmode, DFmode and XFmode */
!   {4, 4, 4},				/* cost of loading integer registers */
!   64,					/* size of prefetch block */
!   6					/* number of prefetches doable in
! 					   parallel */
  };
  
  struct processor_costs *ix86_cost = &pentium_cost;
*************** const int x86_sub_esp_4 = m_ATHLON | m_P
*** 222,227 ****
--- 239,246 ----
  const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486;
  const int x86_add_esp_4 = m_ATHLON | m_K6;
  const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486;
+ const int x86_3dNOW = m_ATHLON | m_K6;
+ const int x86_SSE = m_ATHLON | m_PPRO;
  
  #define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
  
*************** ix86_attr_length_default (insn)
*** 6100,6105 ****
--- 6144,6150 ----
      case TYPE_IDIV:
      case TYPE_PUSH:
      case TYPE_POP:
+     case TYPE_PREFETCH:
        for (i = recog_data.n_operands - 1; i >= 0; --i)
          if (CONSTANT_P (recog_data.operand[i]))
  	  {
*** config/i386/i386.h.old	Thu Apr 13 08:15:03 2000
--- config/i386/i386.h	Thu Apr 13 09:30:13 2000
*************** struct processor_costs {
*** 72,77 ****
--- 72,81 ----
  				   in SFmode, DFmode and XFmode */
    int fp_store[3];		/* cost of storing FP register
  				   in SFmode, DFmode and XFmode */
+   int prefetch_block;		/* Size of block read by single
+ 				   prefetch operation. */
+   int simultatenous_prefetches; /* Number of prefetch operations
+ 				   doable in parallel.  */
  };
  
  extern struct processor_costs *ix86_cost;
*************** extern int target_flags;
*** 164,169 ****
--- 168,174 ----
  #define TARGET_ATHLON (ix86_cpu == PROCESSOR_ATHLON)
  
  #define CPUMASK (1 << ix86_cpu)
+ #define ARCHMASK (1 << ix86_arch)
  extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
  extern const int x86_use_bit_test, x86_cmove, x86_deep_branch;
  extern const int x86_unroll_strlen, x86_use_q_reg, x86_use_any_reg;
*************** extern const int x86_promote_QImode, x86
*** 175,180 ****
--- 180,186 ----
  extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
  extern const int x86_promote_hi_regs;
  extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
+ extern const int x86_3dNOW, x86_SSE;
  
  #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
  #define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
*************** extern const int x86_add_esp_4, x86_add_
*** 183,189 ****
  #define TARGET_UNROLL_STRLEN (x86_unroll_strlen & CPUMASK)
  #define TARGET_USE_Q_REG (x86_use_q_reg & CPUMASK)
  #define TARGET_USE_ANY_REG (x86_use_any_reg & CPUMASK)
! #define TARGET_CMOVE (x86_cmove & (1 << ix86_arch))
  #define TARGET_DEEP_BRANCH_PREDICTION (x86_deep_branch & CPUMASK)
  #define TARGET_DOUBLE_WITH_ADD (x86_double_with_add & CPUMASK)
  #define TARGET_USE_SAHF (x86_use_sahf & CPUMASK)
--- 189,195 ----
  #define TARGET_UNROLL_STRLEN (x86_unroll_strlen & CPUMASK)
  #define TARGET_USE_Q_REG (x86_use_q_reg & CPUMASK)
  #define TARGET_USE_ANY_REG (x86_use_any_reg & CPUMASK)
! #define TARGET_CMOVE (x86_cmove & ARCHMASK)
  #define TARGET_DEEP_BRANCH_PREDICTION (x86_deep_branch & CPUMASK)
  #define TARGET_DOUBLE_WITH_ADD (x86_double_with_add & CPUMASK)
  #define TARGET_USE_SAHF (x86_use_sahf & CPUMASK)
*************** extern const int x86_add_esp_4, x86_add_
*** 206,211 ****
--- 212,219 ----
  #define TARGET_ADD_ESP_8 (x86_add_esp_8 & CPUMASK)
  #define TARGET_SUB_ESP_4 (x86_sub_esp_4 & CPUMASK)
  #define TARGET_SUB_ESP_8 (x86_sub_esp_8 & CPUMASK)
+ #define TARGET_3DNOW (x86_3dNOW & ARCHMASK)
+ #define TARGET_SSE (x86_SSE & ARCHMASK)
  
  #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
  
*************** while (0)
*** 1756,1761 ****
--- 1764,1775 ----
     in one reasonably fast instruction.  */
  #define MOVE_MAX 4
  
+ /* Size of block read by single prefetch operation.  */
+ #define PREFETCH_BLOCK ix86_cost->prefetch_block
+ 
+ /* Number of prefetch operations doable in parallel.  */
+ #define SIMULTATENOUS_PREFETCHES ix86_cost->simultatenous_prefetches
+ 
  /* If a memory-to-memory move would take MOVE_RATIO or more simple
     move-instruction pairs, we will do a movstr or libcall instead.
     Increasing the value will always make code faster, but eventually
*** config/i386/i386.md.old	Thu Apr 13 08:14:56 2000
--- config/i386/i386.md	Thu Apr 13 09:24:10 2000
***************
*** 71,76 ****
--- 71,78 ----
  ;; 9  This is an `fnstsw' operation.
  ;; 10 This is a `sahf' operation.
  ;; 11 This is a `fstcw' operation
+ ;; 12 This is a prefetch operation
+ ;; 13 This is a prefetchw operation
  ;;
  ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
  ;; from i386.c.
***************
*** 84,90 ****
  ;; A basic instruction type.  Refinements due to arguments to be
  ;; provided in other attributes.
  (define_attr "type"
!   "other,multi,alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,fmov,fop,fop1,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,str,cld"
    (const_string "other"))
  
  ;; The (bounding maximum) length of an instruction in bytes.
--- 86,92 ----
  ;; A basic instruction type.  Refinements due to arguments to be
  ;; provided in other attributes.
  (define_attr "type"
!   "other,multi,alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,fmov,fop,fop1,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,str,cld,prefetch"
    (const_string "other"))
  
  ;; The (bounding maximum) length of an instruction in bytes.
***************
*** 104,110 ****
  
  ;; Supporting: bytes in the opcode+modrm.
  (define_attr "length_opcode" ""
!   (cond [(eq_attr "type" "imovx,setcc,icmov")
  	   (const_int 3)
  	 (eq_attr "type" "str,cld")
  	   (const_int 1)
--- 106,112 ----
  
  ;; Supporting: bytes in the opcode+modrm.
  (define_attr "length_opcode" ""
!   (cond [(eq_attr "type" "imovx,setcc,icmov,prefetch")
  	   (const_int 3)
  	 (eq_attr "type" "str,cld")
  	   (const_int 1)
***************
*** 147,152 ****
--- 149,156 ----
  			      (match_operand 1 "memory_operand" ""))
  	     (const_string "load")
  	     (const_string "none"))
+ 	 (eq_attr "type" "prefetch")
+ 	   (const_string "load")
  	 (eq_attr "type" "ibr")
  	   (if_then_else (match_operand 0 "memory_operand" "")
  	     (const_string "load")
***************
*** 637,643 ****
  
  (define_function_unit "k6_alu" 2 0
    (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "ishift,alu1,negnot,alu,icmp,imovx,incdec,setcc,lea"))
    1 1)
  
  (define_function_unit "k6_alu" 2 0
--- 641,647 ----
  
  (define_function_unit "k6_alu" 2 0
    (and (eq_attr "cpu" "k6")
!        (eq_attr "type" "ishift,alu1,negnot,alu,icmp,imovx,incdec,setcc,lea,prefetch"))
    1 1)
  
  (define_function_unit "k6_alu" 2 0
***************
*** 766,772 ****
  
  (define_function_unit "athlon_ieu" 3 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,str,cld"))
    1 1)
  
  (define_function_unit "athlon_ieu" 3 0
--- 770,776 ----
  
  (define_function_unit "athlon_ieu" 3 0
    (and (eq_attr "cpu" "athlon")
!        (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,str,cld,prefetch"))
    1 1)
  
  (define_function_unit "athlon_ieu" 3 0
***************
*** 9191,9196 ****
--- 9195,9225 ----
     fcmov%F1\\t{%2, %0|%0, %2}
     fcmov%f1\\t{%3, %0|%0, %3}"
    [(set_attr "type" "fcmov")])
+ 
+ ;; Prefetch patterns
+ 
+ (define_expand "prefetch"
+   [(unspec [(match_operand:SI 0 "address_operand" "")] 12)]
+   "TARGET_3DNOW || TARGET_SSE"
+   "")
+ 
+ (define_insn ""
+   [(unspec [(match_operand:SI 0 "address_operand" "p")] 12)]
+   "TARGET_3DNOW"
+   "prefetch\\t%a0"
+   [(set_attr "type" "prefetch")])
+ 
+ (define_insn ""
+   [(unspec [(match_operand:SI 0 "address_operand" "p")] 12)]
+   "TARGET_SSE"
+   "prefetchnta\\t%a0"
+   [(set_attr "type" "prefetch")])
+ 
+ (define_insn "prefetchw"
+   [(unspec [(match_operand:SI 0 "address_operand" "p")] 13)]
+   "TARGET_3DNOW"
+   "prefetchw\\t%a0"
+   [(set_attr "type" "prefetch")])
  
  ;; Misc patterns (?)
  
