This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.



[PATCH] x86-64: Add support for non-temporal prefetches.


Hi,

The patch adds generation of PREFETCHNTA and MOVNTx (streaming stores) for non-temporal data accesses on the AMD 64-bit target. Data is considered non-temporal if its size is larger than the L2 cache, or if it is larger than 64KB and not accessed again any time soon.
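
As a rough illustration (a sketch only, not actual compiler output; the buffer size and the function name are made up), the following loop streams through far more data than the 1MB L2 assumed by the patch and never re-reads what it stores, so with -fprefetch-loop-arrays the patched compiler should prefer PREFETCHNTA for the loads and MOVNTI for the stores:

    #define N (4 * 1024 * 1024)    /* 16MB of ints, well above the 1MB L2.  */

    void
    copy_stream (int *dst, const int *src)
    {
      int i;

      /* Each element of dst and src is touched exactly once.  */
      for (i = 0; i < N; i++)
        dst[i] = src[i];
    }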

Also, at present loop unrolling is done in the second loop optimizer, whereas prefetching is done in the first. Prefetch instructions are therefore generated for each iteration before the loop is unrolled, which leaves quite a lot of prefetches per unrolled iteration for the same GIV, i.e. the same cache line is prefetched many times.
The patch also removes this redundant fetching of the same cache line on the AMD 64-bit target, as sketched below.
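
A conceptual sketch of the problem (written at the source level only to make the duplication visible; the pass itself works on the RTL PREFETCH insns that emit_prefetch_instructions emits, and the prefetch distance of 80 elements and the function name are made up for the example). After a 4x unroll, the body effectively contains four prefetches with the same address expression, all hitting one 64-byte cache line; the new remove_redundant_prefetches in cfgloop.c keeps only the first of each such group:

    void
    unrolled_body (int *a, int n)
    {
      int i;

      for (i = 0; i + 3 < n; i += 4)
        {
          __builtin_prefetch (&a[i + 80]);    /* kept */
          a[i] = 0;
          __builtin_prefetch (&a[i + 80]);    /* same address: removed */
          a[i + 1] = 0;
          __builtin_prefetch (&a[i + 80]);    /* same address: removed */
          a[i + 2] = 0;
          __builtin_prefetch (&a[i + 80]);    /* same address: removed */
          a[i + 3] = 0;
        }
    }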

SPEC CPU2000 shows some gain with this compiler over the previous one, especially for 168.wupwise, 173.applu, 183.equake and 200.sixtrack among the FP benchmarks, and 164.gzip, 254.gap, 255.vortex, 256.bzip2 and 300.twolf among the INT benchmarks. The attached file "C2000.asc" shows these values; please compare the base (with the -fprefetch-loop-arrays option) and peak (without the -fprefetch-loop-arrays option) numbers.
Comparing the new compiler with the old one, 173.applu, 183.equake, 200.sixtrack, 164.gzip and 254.gap show a performance gain.

When the loop iteration count is not known at the time of prefetching, the existing code assumes a very high value (0xffffffff) for it, which makes the estimated data size larger than the L2 cache and therefore forces generation of PREFETCHNTA. For some functions PREFETCHx is more beneficial than PREFETCHNTA in this case, because the iteration count actually turns out to be small. We therefore need a way to specify a small assumed iteration count so that PREFETCHx is generated; the new assumed-loop-iteration --param does this.
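
A hedged usage sketch (the parameter name and its default of 200 come from the params.def hunk below; the value 64 and the file name are only examples):

    gcc -O2 -fprefetch-loop-arrays --param assumed-loop-iteration=64 foo.c

With a small assumed count the estimated total_bytes stays below the L2 thresholds, so emit_prefetch_instructions generates the ordinary PREFETCHx form instead of PREFETCHNTA.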

Regression tested on x86-64.

Ok for 3.4 branch?

Nutan
------------------------------------------------------------
2004-07-05  Nutan Singh  <nutans@noida.hcltech.com>

	* loop.c (emit_prefetch_instructions): Emit non-temporal prefetches
	where givs are not reused.
	(non_temporal_store): New function to generate non-temporal stores.

	* loop.h (reg_in_basic_block_p): Declare.

	* common.opt (fguess-loop-iteration): New option to enable guessing
	of the loop iteration count.

	* opts.c (common_handle_option): Handle it by setting the
	assumed-loop-iteration parameter.

	* params.def (PARAM_ASSUMED_LOOP_ITERATION): New parameter
	assumed-loop-iteration.

	* params.h (ASSUMED_LOOP_ITERATION): New macro to access its value.

	* config/i386/i386.c: For ATHLON or K8 targets initialize ix86_cost
	to k8_cost.

	* config/i386/i386.h (L2_CACHE_SIZE, SIZE_L2, SIZE_LESS_THAN_L2):
	New macros for AMD related cache sizes.
	(PREFETCH_BLOCKS_BEFORE_LOOP_MAX, PREFETCH_BLOCKS_BEFORE_LOOP_MIN):
	Define with different values for the AMD target.
	(NON_TEMPORAL_STORE): New macro to generate movnti instructions.

	* cfgloop.c (remove_redundant_prefetches): New function to remove
	redundant prefetches in a loop and avoid fetching the same cache
	line repeatedly.

	* cfgloop.h (remove_redundant_prefetches): Declare.

	* toplev.c (rest_of_handle_loop2): Call it.

Index: gcc/cfgloop.c
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/cfgloop.c,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 cfgloop.c
--- gcc/cfgloop.c	2004/05/13 06:12:48	3.4.1.1
+++ gcc/cfgloop.c	2004/08/04 11:59:19
@@ -1286,3 +1286,56 @@ loop_preheader_edge (const struct loop *
 
   return e;
 }
+
+#ifdef HAVE_prefetch
+void
+remove_redundant_prefetches (struct loops *loops, FILE * rtl_dump_file)
+{
+  basic_block *bbs;
+  rtx plist = 0, insn;
+  int i, j;
+  int loop_num = loops->num;
+
+  for (i = 1; i < loop_num; i++)
+    {
+      struct loop *loop = loops->parray[i];
+
+      if (!loop)
+	continue;
+
+      bbs = get_loop_body (loop);
+      if (rtl_dump_file)
+	{
+	  int i;
+	  fprintf (rtl_dump_file, ";;\n;; Analyzing Loop:%d (For PREFETCH)\n",
+		   loop->num);
+	  fprintf (rtl_dump_file, ";;  nodes:");
+	  for (i = 0; i < (int) loop->num_nodes; i++)
+	    fprintf (rtl_dump_file, " %d", bbs[i]->index);
+	  fprintf (rtl_dump_file,
+		   "\n--------------------------------------\n");
+	}
+      for (j = 0; j < (int) loop->num_nodes; j++)
+	{
+	  for (insn = BB_HEAD (bbs[j]); insn != BB_END (bbs[j]);
+	       insn = NEXT_INSN (insn))
+	    {
+	      if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH)
+		{
+		  rtx addr = XEXP (PATTERN (insn), 0);
+		  rtx next;
+		  if (plist)
+		    for (next = plist; next; next = XEXP (next, 1))
+		      if (rtx_equal_p (addr, XEXP (next, 0)))
+		        {
+			  insn = PREV_INSN (delete_insn (insn));
+			  break;
+			}
+		  plist = gen_rtx_EXPR_LIST (VOIDmode, addr, plist);
+		}
+	    }
+	}
+      free (bbs);
+    }
+}
+#endif
Index: gcc/cfgloop.h
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/cfgloop.h,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 cfgloop.h
--- gcc/cfgloop.h	2004/05/13 06:12:48	3.4.1.1
+++ gcc/cfgloop.h	2004/08/04 11:59:19
@@ -325,6 +325,7 @@ extern edge split_loop_bb (basic_block, 
 /* Loop optimizer initialization.  */
 extern struct loops *loop_optimizer_init (FILE *);
 extern void loop_optimizer_finalize (struct loops *, FILE *);
+extern void remove_redundant_prefetches(struct loops *, FILE *);
 
 /* Optimization passes.  */
 extern void unswitch_loops (struct loops *);
Index: gcc/common.opt
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/common.opt,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 common.opt
--- gcc/common.opt	2004/05/13 06:12:49	3.4.1.1
+++ gcc/common.opt	2004/08/04 11:59:19
@@ -373,6 +373,10 @@ fguess-branch-probability
 Common
 Enable guessing of branch probabilities
 
+fguess-loop-iteration
+Common
+Enable guessing of loop iteration count
+
 fident
 Common
 Process #ident directives
Index: gcc/loop.c
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/loop.c,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 loop.c
--- gcc/loop.c	2004/05/13 06:12:52	3.4.1.1
+++ gcc/loop.c	2004/08/04 11:59:19
@@ -66,13 +66,18 @@ Software Foundation, 59 Temple Place - S
 #include "optabs.h"
 #include "cfgloop.h"
 #include "ggc.h"
+#include "params.h"
 
 /* Not really meaningful values, but at least something.  */
 #ifndef SIMULTANEOUS_PREFETCHES
 #define SIMULTANEOUS_PREFETCHES 3
 #endif
+/* Number of cache lines ahead to prefetch.  */
+#ifndef PREFETCH_DISTANCE
+#define PREFETCH_DISTANCE 5
+#endif
 #ifndef PREFETCH_BLOCK
-#define PREFETCH_BLOCK 32
+#define PREFETCH_BLOCK 32 
 #endif
 #ifndef HAVE_prefetch
 #define HAVE_prefetch 0
@@ -86,10 +91,14 @@ Software Foundation, 59 Temple Place - S
 #define MAX_PREFETCHES 100
 /* The number of prefetch blocks that are beneficial to fetch at once before
    a loop with a known (and low) iteration count.  */
-#define PREFETCH_BLOCKS_BEFORE_LOOP_MAX  6
+#ifndef PREFETCH_BLOCKS_BEFORE_LOOP_MAX
+#define PREFETCH_BLOCKS_BEFORE_LOOP_MAX 6
+#endif
 /* For very tiny loops it is not worthwhile to prefetch even before the loop,
    since it is likely that the data are already in the cache.  */
-#define PREFETCH_BLOCKS_BEFORE_LOOP_MIN  2
+#ifndef PREFETCH_BLOCKS_BEFORE_LOOP_MIN
+#define PREFETCH_BLOCKS_BEFORE_LOOP_MIN 2
+#endif
 
 /* Parameterize some prefetch heuristics so they can be turned on and off
    easily for performance testing on new architectures.  These can be
@@ -146,7 +155,7 @@ Software Foundation, 59 Temple Place - S
 
 /* Do not handle reversed order prefetches (negative stride).  */
 #ifndef PREFETCH_NO_REVERSE_ORDER
-#define PREFETCH_NO_REVERSE_ORDER 1
+#define PREFETCH_NO_REVERSE_ORDER 0
 #endif
 
 /* Prefetch even if the GIV is in conditional code.  */
@@ -3665,6 +3674,7 @@ struct prefetch_info
   int prefetch_in_loop;		/* Number of prefetch insns in loop.  */
   int prefetch_before_loop;	/* Number of prefetch insns before loop.  */
   unsigned int write : 1;	/* 1 for read/write prefetches.  */
+  int reused;			/* 1 if this giv is reused in the loop */
 };
 
 /* Data used by check_store function.  */
@@ -3852,7 +3862,7 @@ emit_prefetch_instructions (struct loop 
   int i;
   struct iv_class *bl;
   struct induction *iv;
-  struct prefetch_info info[MAX_PREFETCHES];
+  struct prefetch_info info[MAX_PREFETCHES] = {0};
   struct loop_ivs *ivs = LOOP_IVS (loop);
 
   if (!HAVE_prefetch)
@@ -4038,6 +4048,7 @@ emit_prefetch_instructions (struct loop 
 		    info[i].class = bl;
 		    info[num_prefetches].base_address = address;
 		    add = 0;
+		    info[i].reused = 1;
 		    break;
 		  }
 
@@ -4047,6 +4058,7 @@ emit_prefetch_instructions (struct loop 
 		    info[i].write |= d.mem_write;
 		    info[i].bytes_accessed += size;
 		    add = 0;
+		    info[i].reused = 1;
 		    break;
 		  }
 	      }
@@ -4084,7 +4096,7 @@ emit_prefetch_instructions (struct loop 
 	      >= LOOP_INFO (loop)->n_iterations))
 	info[i].total_bytes = info[i].stride * LOOP_INFO (loop)->n_iterations;
       else
-	info[i].total_bytes = 0xffffffff;
+	info[i].total_bytes = info[i].stride * ASSUMED_LOOP_ITERATION;
 
       density = info[i].bytes_accessed * 100 / info[i].stride;
 
@@ -4136,9 +4148,9 @@ emit_prefetch_instructions (struct loop 
     }
   /* We'll also use AHEAD to determine how many prefetch instructions to
      emit before a loop, so don't leave it zero.  */
-  if (ahead == 0)
-    ahead = PREFETCH_BLOCKS_BEFORE_LOOP_MAX;
 
+  ahead = PREFETCH_BLOCKS_BEFORE_LOOP_MAX;
+
   for (i = 0; i < num_prefetches; i++)
     {
       /* Update if we've decided not to prefetch anything within the loop.  */
@@ -4203,7 +4215,7 @@ emit_prefetch_instructions (struct loop 
 	{
 	  rtx loc = copy_rtx (*info[i].giv->location);
 	  rtx insn;
-	  int bytes_ahead = PREFETCH_BLOCK * (ahead + y);
+	  int bytes_ahead = PREFETCH_BLOCK * (ahead + y + PREFETCH_DISTANCE);
 	  rtx before_insn = info[i].giv->insn;
 	  rtx prev_insn = PREV_INSN (info[i].giv->insn);
 	  rtx seq;
@@ -4226,8 +4238,23 @@ emit_prefetch_instructions (struct loop 
 	  if (! (*insn_data[(int)CODE_FOR_prefetch].operand[0].predicate)
 		  (loc, insn_data[(int)CODE_FOR_prefetch].operand[0].mode))
 	    loc = force_reg (Pmode, loc);
-	  emit_insn (gen_prefetch (loc, GEN_INT (info[i].write),
-				   GEN_INT (3)));
+
+#ifdef L2_CACHE_SIZE
+	  if (info[i].total_bytes > SIZE_L2
+	      || ((info[i].total_bytes > SIZE_LESS_THAN_L2)
+		   && (!info[i].reused)))
+	  {
+#ifdef NON_TEMPORAL_STORE
+	  if (!info[i].write || !non_temporal_store (info[i].giv->insn))
+#endif
+	    emit_insn (gen_prefetch (loc, GEN_INT (0), GEN_INT(0)));
+	  }
+	  else
+	    emit_insn (gen_prefetch (loc, GEN_INT (info[i].write), GEN_INT(3)));
+#else
+	  emit_insn (gen_prefetch (loc, GEN_INT (info[i].write), GEN_INT(3)));
+#endif
+
 	  seq = get_insns ();
 	  end_sequence ();
 	  emit_insn_before (seq, before_insn);
@@ -4256,7 +4283,7 @@ emit_prefetch_instructions (struct loop 
 	      rtx init_val = info[i].class->initial_value;
 	      rtx add_val = simplify_gen_binary (PLUS, Pmode,
 						 info[i].giv->add_val,
-						 GEN_INT (y * PREFETCH_BLOCK));
+						 GEN_INT ((y + PREFETCH_DISTANCE) * PREFETCH_BLOCK));
 
 	      /* Functions called by LOOP_IV_ADD_EMIT_BEFORE expect a
 		 non-constant INIT_VAL to have the same mode as REG, which
@@ -4274,15 +4301,42 @@ emit_prefetch_instructions (struct loop 
 	      loop_iv_add_mult_emit_before (loop, init_val,
 					    info[i].giv->mult_val,
 					    add_val, reg, 0, loop_start);
-	      emit_insn_before (gen_prefetch (reg, GEN_INT (info[i].write),
-					      GEN_INT (3)),
-				loop_start);
+#ifdef L2_CACHE_SIZE 
+	      if (info[i].total_bytes > SIZE_L2
+		  || ((info[i].total_bytes >= SIZE_LESS_THAN_L2) && (!info[i].reused)))
+         	 emit_insn_before (gen_prefetch (reg, GEN_INT (0), GEN_INT(0)), loop_start);
+	      else
+	      	 emit_insn_before (gen_prefetch (reg, GEN_INT (info[i].write), GEN_INT(3)), loop_start);
+#else
+	      emit_insn_before (gen_prefetch (reg, GEN_INT (info[i].write), GEN_INT(3)), loop_start);
+#endif 
+
 	    }
 	}
     }
 
   return;
 }
+
+#ifdef NON_TEMPORAL_STORE
+/* See if a non-temporal store can be used
+   for INSN.  If valid, make the change and return nonzero.  */
+static int
+non_temporal_store (rtx insn)
+{
+  rtx set = single_set (insn);
+  rtx store;
+  int rval=0;
+  if (set && GET_CODE (SET_DEST (set)) == MEM
+      && GET_CODE (SET_SRC (set)) == REG)
+    {
+      store = NON_TEMPORAL_STORE (SET_DEST (set), SET_SRC (set));
+      rval = validate_change (insn, &PATTERN(insn), store, 0);
+    }
+  return rval;
+}
+#endif
+
 
 /* Communication with routines called via `note_stores'.  */
 
Index: gcc/loop.h
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/loop.h,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 loop.h
--- gcc/loop.h	2004/05/13 06:12:52	3.4.1.1
+++ gcc/loop.h	2004/08/04 11:59:19
@@ -402,6 +402,7 @@ extern FILE *loop_dump_stream;
 /* Forward declarations for non-static functions declared in loop.c and
    unroll.c.  */
 extern int loop_invariant_p (const struct loop *, rtx);
+extern int reg_in_basic_block_p (rtx, rtx);
 extern rtx get_condition_for_loop (const struct loop *, rtx);
 extern void loop_iv_add_mult_hoist (const struct loop *, rtx, rtx, rtx, rtx);
 extern void loop_iv_add_mult_sink (const struct loop *, rtx, rtx, rtx, rtx);
Index: gcc/opts.c
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/opts.c,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 opts.c
--- gcc/opts.c	2004/05/13 06:12:52	3.4.1.1
+++ gcc/opts.c	2004/08/04 11:59:19
@@ -1071,6 +1071,10 @@ common_handle_option (size_t scode, cons
       set_param_value ("max-inline-insns-rtl", value);
       break;
 
+    case OPT_fguess_loop_iteration:
+      set_param_value ("assumed-loop-iteration", value);
+      break;	
+    
     case OPT_finstrument_functions:
       flag_instrument_function_entry_exit = value;
       break;
Index: gcc/params.def
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/params.def,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 params.def
--- gcc/params.def	2004/05/13 06:12:52	3.4.1.1
+++ gcc/params.def	2004/08/04 11:59:19
@@ -215,6 +215,11 @@ DEFPARAM(TRACER_MAX_CODE_GROWTH,
 	 "tracer-max-code-growth",
 	 "Maximal code growth caused by tail duplication (in percent)",
 	 100)
+DEFPARAM(PARAM_ASSUMED_LOOP_ITERATION,
+	 "assumed-loop-iteration",
+	 "Loop iteration count to assume when it is not known at the time of loop \
+optimization",
+	 200)
 DEFPARAM(TRACER_MIN_BRANCH_RATIO,
 	 "tracer-min-branch-ratio",
 	 "Stop reverse growth if the reverse probability of best edge is less \
Index: gcc/params.h
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/params.h,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 params.h
--- gcc/params.h	2004/05/13 06:12:52	3.4.1.1
+++ gcc/params.h	2004/08/04 11:59:19
@@ -82,6 +82,8 @@ typedef enum compiler_param
   (compiler_params[(int) ENUM].value)
 
 /* Macros for the various parameters.  */
+#define ASSUMED_LOOP_ITERATION \
+  PARAM_VALUE (PARAM_ASSUMED_LOOP_ITERATION)
 #define MAX_INLINE_INSNS_SINGLE \
   PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SINGLE)
 #define MAX_INLINE_INSNS \
Index: gcc/toplev.c
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/toplev.c,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 toplev.c
--- gcc/toplev.c	2004/05/13 06:12:53	3.4.1.1
+++ gcc/toplev.c	2004/08/04 11:59:20
@@ -3083,7 +3083,12 @@ rest_of_handle_loop2 (tree decl, rtx ins
 			       (flag_peel_loops ? UAP_PEEL : 0) |
 			       (flag_unroll_loops ? UAP_UNROLL : 0) |
 			       (flag_unroll_all_loops ? UAP_UNROLL_ALL : 0));
-
+#ifdef HAVE_prefetch
+      /* Remove redundant copies of prefetch, generated during loop
+        unrolling. */
+      if (flag_prefetch_loop_arrays)
+        remove_redundant_prefetches (loops, rtl_dump_file);
+#endif
       loop_optimizer_finalize (loops, rtl_dump_file);
     }
 
Index: gcc/config/i386/i386.c
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/config/i386/i386.c,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 i386.c
--- gcc/config/i386/i386.c	2004/05/13 06:13:22	3.4.1.1
+++ gcc/config/i386/i386.c	2004/08/04 11:59:21
@@ -457,7 +457,7 @@ struct processor_costs pentium4_cost = {
   43,					/* cost of FSQRT instruction.  */
 };
 
-const struct processor_costs *ix86_cost = &pentium_cost;
+
 
 /* Processor feature/optimization bitmasks.  */
 #define m_386 (1<<PROCESSOR_I386)
@@ -469,6 +469,12 @@ const struct processor_costs *ix86_cost 
 #define m_PENT4  (1<<PROCESSOR_PENTIUM4)
 #define m_K8  (1<<PROCESSOR_K8)
 #define m_ATHLON_K8  (m_K8 | m_ATHLON)
+
+#ifdef m_ATHLON_K8
+const struct processor_costs *ix86_cost = &k8_cost;
+#else
+const struct processor_costs *ix86_cost = &pentium_cost;
+#endif
 
 const int x86_use_leave = m_386 | m_K6 | m_ATHLON_K8;
 const int x86_push_memory = m_386 | m_K6 | m_ATHLON_K8 | m_PENT4;
Index: gcc/config/i386/i386.h
===================================================================
RCS file: /home/gnu/cvs/gcc-3.4/gcc/gcc/config/i386/i386.h,v
retrieving revision 3.4.1.1
diff -u -p -r3.4.1.1 i386.h
--- gcc/config/i386/i386.h	2004/05/13 06:13:22	3.4.1.1
+++ gcc/config/i386/i386.h	2004/08/04 11:59:21
@@ -2569,6 +2569,25 @@ enum ix86_builtins
 /* Number of prefetch operations that can be done in parallel.  */
 #define SIMULTANEOUS_PREFETCHES ix86_cost->simultaneous_prefetches
 
+/* Define the  L2 data cache size. */
+#define SIZE_L2 1048576
+#define L2_CACHE_SIZE SIZE_L2
+
+#define NON_TEMPORAL_STORE(OP1, OP2)	gen_sse2_movntsi ((OP1), (OP2))
+
+/* Minimum threshold for generating temporal prefetches.  E.g. on AMD64,
+   64KB is much less than the L2 size and a temporal prefetch will always be
+   generated for sizes less than this size.  */
+#define SIZE_LESS_THAN_L2 65536
+
+#ifdef TARGET_ATHLON_K8
+#define PREFETCH_BLOCKS_BEFORE_LOOP_MAX 2
+#endif
+
+#ifdef TARGET_ATHLON_K8
+#define PREFETCH_BLOCKS_BEFORE_LOOP_MIN 0
+#endif
+
 /* Max number of bytes we can move from memory to memory
    in one reasonably fast instruction.  */
 #define MOVE_MAX 16 

