This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[lno] New array prefetching pass
- From: Zdenek Dvorak <rakdver at atrey dot karlin dot mff dot cuni dot cz>
- To: gcc-patches at gcc dot gnu dot org
- Date: Thu, 1 Jul 2004 06:23:49 +0200
- Subject: [lno] New array prefetching pass
Hello,
this patch reimplements the loop array prefetching on tree level. It is
quite incomplete for now, and even after all TODOs are done, it won't be
quite the best currently known solution, but it should suffice for
purposes of replacing the old loop optimizer.
Zdenek
Index: ChangeLog.lno
===================================================================
RCS file: /cvs/gcc/gcc/gcc/Attic/ChangeLog.lno,v
retrieving revision 1.1.2.207
diff -c -3 -p -r1.1.2.207 ChangeLog.lno
*** ChangeLog.lno 28 Jun 2004 19:59:58 -0000 1.1.2.207
--- ChangeLog.lno 1 Jul 2004 04:19:14 -0000
***************
*** 1,3 ****
--- 1,27 ----
+ 2004-06-30 Zdenek Dvorak <rakdver@atrey.karlin.mff.cuni.cz>
+
+ * tree-ssa-loop-prefetch.c: New file.
+ * Makefile.in (tree-ssa-loop-prefetch.o): Add.
+ * timevar.def (TV_TREE_PREFETCH): New.
+ * tree-flow.h (tree_ssa_prefetch_arrays,
+ standard_iv_increment_position): Declare.
+ * tree-optimize.c (init_tree_optimization_passes): Add
+ pass_loop_prefetch.
+ * tree-pass.h (pass_loop_prefetch): Declare.
+ * tree-ssa-loop-ivcanon.c (estimate_loop_size): Force the size to be
+ nonzero.
+ * tree-ssa-loop-ivopts.c (cst_and_fits_in_hwi): Moved to tree.c.
+ (standard_iv_increment_position): New function.
+ (zero_p): Export.
+ * tree-ssa-loop-niter.c (zero_p): Remove duplicate function.
+ * tree-ssa-loop.c (tree_ssa_loop_prefetch,
+ gate_tree_ssa_loop_prefetch): New functions.
+ (pass_loop_prefetch): New pass structure.
+ * tree-ssa-operands.c (function_ignores_memory_p): New function.
+ (get_expr_operands): Use it.
+ * tree.c (cst_and_fits_in_hwi): Moved from tree-ssa-loop-ivopts.c.
+ * tree.h (zero_p, cst_and_fits_in_hwi): Declare.
+
2004-06-28 Daniel Berlin <dberlin@dberlin.org>
* lambda-code.c (print_linear_expression): Rename a few variables,
Index: Makefile.in
===================================================================
RCS file: /cvs/gcc/gcc/gcc/Makefile.in,v
retrieving revision 1.903.2.158.2.35
diff -c -3 -p -r1.903.2.158.2.35 Makefile.in
*** Makefile.in 22 Jun 2004 16:03:41 -0000 1.903.2.158.2.35
--- Makefile.in 1 Jul 2004 04:19:14 -0000
*************** OBJS-common = \
*** 902,908 ****
cfg.o cfganal.o cfgbuild.o cfgcleanup.o cfglayout.o cfgloop.o \
cfgloopanal.o cfgloopmanip.o loop-init.o loop-unswitch.o loop-unroll.o \
cfgrtl.o combine.o conflict.o convert.o coverage.o cse.o cselib.o \
! dbxout.o ddg.o \
debug.o df.o diagnostic.o dojump.o dominance.o loop-doloop.o \
dwarf2asm.o dwarf2out.o emit-rtl.o except.o explow.o loop-iv.o \
expmed.o expr.o final.o flow.o fold-const.o function.o gcse.o \
--- 902,908 ----
cfg.o cfganal.o cfgbuild.o cfgcleanup.o cfglayout.o cfgloop.o \
cfgloopanal.o cfgloopmanip.o loop-init.o loop-unswitch.o loop-unroll.o \
cfgrtl.o combine.o conflict.o convert.o coverage.o cse.o cselib.o \
! dbxout.o ddg.o tree-ssa-loop-prefetch.o \
debug.o df.o diagnostic.o dojump.o dominance.o loop-doloop.o \
dwarf2asm.o dwarf2out.o emit-rtl.o except.o explow.o loop-iv.o \
expmed.o expr.o final.o flow.o fold-const.o function.o gcse.o \
*************** tree-ssa-loop-ivcanon.o : tree-ssa-loop-
*** 1683,1688 ****
--- 1683,1692 ----
$(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) $(PARAMS_H) tree-inline.h \
output.h diagnostic.h $(TM_H) coretypes.h $(TREE_DUMP_H) flags.h \
tree-pass.h $(SCEV_H)
+ tree-ssa-loop-prefetch.o : tree-ssa-loop-prefetch.c $(TREE_FLOW_H) $(CONFIG_H) \
+ $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) varray.h $(EXPR_H) \
+ output.h diagnostic.h $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
+ tree-pass.h $(GGC_H) $(RECOG_H) insn-config.h $(HASHTAB_H) $(SCEV_H)
tree-ssa-loop-ivopts.o : tree-ssa-loop-ivopts.c $(TREE_FLOW_H) $(CONFIG_H) \
$(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(CFGLOOP_H) varray.h $(EXPR_H) \
output.h diagnostic.h $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
Index: timevar.def
===================================================================
RCS file: /cvs/gcc/gcc/gcc/timevar.def,v
retrieving revision 1.14.2.28.2.14
diff -c -3 -p -r1.14.2.28.2.14 timevar.def
*** timevar.def 14 Jun 2004 21:41:34 -0000 1.14.2.28.2.14
--- timevar.def 1 Jul 2004 04:19:14 -0000
*************** DEFTIMEVAR (TV_TREE_DSE , "tree DS
*** 83,88 ****
--- 83,89 ----
DEFTIMEVAR (TV_TREE_LOOP , "tree loop optimization")
DEFTIMEVAR (TV_TREE_LINEAR_TRANSFORM , "tree loop linear transforms")
DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization")
+ DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching")
DEFTIMEVAR (TV_TREE_LOOP_IVCANON , "tree canonical iv creation")
DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree loop vectorization")
DEFTIMEVAR (TV_TREE_ELIM_CHECKS , "chrec eliminate checks")
Index: tree-flow.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/tree-flow.h,v
retrieving revision 1.1.4.177.2.35
diff -c -3 -p -r1.1.4.177.2.35 tree-flow.h
*** tree-flow.h 17 Jun 2004 18:25:11 -0000 1.1.4.177.2.35
--- tree-flow.h 1 Jul 2004 04:19:14 -0000
*************** void tree_ssa_dce_no_cfg_changes (void);
*** 608,613 ****
--- 608,614 ----
struct loops *tree_loop_optimizer_init (FILE *, bool);
void tree_ssa_lim (struct loops *loops);
void tree_ssa_iv_optimize (struct loops *);
+ void tree_ssa_prefetch_arrays (struct loops *);
void canonicalize_induction_variables (struct loops *loops);
bool tree_duplicate_loop_to_header_edge (struct loop *, edge, struct loops *,
unsigned int, sbitmap,
*************** void tree_unroll_loops_completely (struc
*** 617,622 ****
--- 618,625 ----
bool tree_duplicate_loop_to_exit (struct loop *loop, struct loops *loops);
void create_iv (tree, tree, tree, struct loop *, block_stmt_iterator *, bool,
tree *, tree *);
+ void standard_iv_increment_position (struct loop *, block_stmt_iterator *,
+ bool *);
struct loop *tree_ssa_loop_version (struct loops *, struct loop *, tree,
basic_block *);
void update_lv_condition (basic_block *, tree);
Index: tree-optimize.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/tree-optimize.c,v
retrieving revision 1.1.4.98.2.23
diff -c -3 -p -r1.1.4.98.2.23 tree-optimize.c
*** tree-optimize.c 14 Jun 2004 21:41:34 -0000 1.1.4.98.2.23
--- tree-optimize.c 1 Jul 2004 04:19:14 -0000
*************** init_tree_optimization_passes (void)
*** 334,339 ****
--- 334,340 ----
NEXT_PASS (pass_vectorize);
NEXT_PASS (pass_complete_unroll);
NEXT_PASS (pass_linear_transform);
+ NEXT_PASS (pass_loop_prefetch);
NEXT_PASS (pass_iv_optimize);
NEXT_PASS (pass_loop_done);
*p = NULL;
Index: tree-pass.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/tree-pass.h,v
retrieving revision 1.1.4.12
diff -c -3 -p -r1.1.4.12 tree-pass.h
*** tree-pass.h 14 Jun 2004 21:41:34 -0000 1.1.4.12
--- tree-pass.h 1 Jul 2004 04:19:14 -0000
*************** extern struct tree_opt_pass pass_vectori
*** 114,119 ****
--- 114,120 ----
extern struct tree_opt_pass pass_complete_unroll;
extern struct tree_opt_pass pass_linear_transform;
extern struct tree_opt_pass pass_iv_optimize;
+ extern struct tree_opt_pass pass_loop_prefetch;
extern struct tree_opt_pass pass_loop_done;
extern struct tree_opt_pass pass_ch;
extern struct tree_opt_pass pass_ccp;
Index: tree-ssa-loop-ivcanon.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/Attic/tree-ssa-loop-ivcanon.c,v
retrieving revision 1.1.2.14
diff -c -3 -p -r1.1.2.14 tree-ssa-loop-ivcanon.c
*** tree-ssa-loop-ivcanon.c 22 Jun 2004 16:03:44 -0000 1.1.2.14
--- tree-ssa-loop-ivcanon.c 1 Jul 2004 04:19:14 -0000
*************** estimate_loop_size (struct loop *loop)
*** 98,104 ****
{
basic_block *body = get_loop_body (loop);
block_stmt_iterator bsi;
! unsigned size = 0, i;
for (i = 0; i < loop->num_nodes; i++)
for (bsi = bsi_start (body[i]); !bsi_end_p (bsi); bsi_next (&bsi))
--- 98,104 ----
{
basic_block *body = get_loop_body (loop);
block_stmt_iterator bsi;
! unsigned size = 1, i;
for (i = 0; i < loop->num_nodes; i++)
for (bsi = bsi_start (body[i]); !bsi_end_p (bsi); bsi_next (&bsi))
Index: tree-ssa-loop-ivopts.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/Attic/tree-ssa-loop-ivopts.c,v
retrieving revision 1.1.2.41
diff -c -3 -p -r1.1.2.41 tree-ssa-loop-ivopts.c
*** tree-ssa-loop-ivopts.c 22 Jun 2004 16:03:44 -0000 1.1.2.41
--- tree-ssa-loop-ivopts.c 1 Jul 2004 04:19:14 -0000
*************** name_info (struct ivopts_data *data, tre
*** 461,467 ****
/* Checks whether ARG is either NULL_TREE or constant zero. */
! static bool
zero_p (tree arg)
{
if (arg == NULL_TREE)
--- 461,467 ----
/* Checks whether ARG is either NULL_TREE or constant zero. */
! bool
zero_p (tree arg)
{
if (arg == NULL_TREE)
*************** zero_p (tree arg)
*** 470,489 ****
return integer_zerop (arg);
}
- /* Checks that X is integer constant that fits in unsigned HOST_WIDE_INT.
- Similar to host_integerp (x, 1), but does not fail if the value is
- negative. */
-
- static bool
- cst_and_fits_in_hwi (tree x)
- {
- if (TREE_CODE (x) != INTEGER_CST)
- return false;
-
- return (TREE_INT_CST_HIGH (x) == 0
- || TREE_INT_CST_HIGH (x) == -1);
- }
-
/* Checks whether there exists number X such that X * B = A, counting modulo
2^BITS. */
--- 470,475 ----
*************** ip_normal_pos (struct loop *loop)
*** 840,845 ****
--- 826,856 ----
return bb;
}
+ /* Returns the standard position for induction variable increment in LOOP
+ (ip_normal_pos if it is available and latch is empty, ip_end_pos
+ otherwise) in BSI. INSERT_AFTER is set to true if the increment should
+ be inserted after *BSI. */
+
+ void
+ standard_iv_increment_position (struct loop *loop, block_stmt_iterator *bsi,
+ bool *insert_after)
+ {
+ basic_block bb = ip_normal_pos (loop), latch = ip_end_pos (loop);
+ tree last = last_stmt (latch);
+
+ if (!bb
+ || (last && TREE_CODE (last) != LABEL_EXPR))
+ {
+ *bsi = bsi_last (latch);
+ *insert_after = true;
+ }
+ else
+ {
+ *bsi = bsi_last (bb);
+ *insert_after = false;
+ }
+ }
+
/* Returns true if STMT is after the place where the IP_NORMAL ivs will be
emitted in LOOP. */
Index: tree-ssa-loop-niter.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/Attic/tree-ssa-loop-niter.c,v
retrieving revision 1.1.2.8
diff -c -3 -p -r1.1.2.8 tree-ssa-loop-niter.c
*** tree-ssa-loop-niter.c 22 Jun 2004 16:03:44 -0000 1.1.2.8
--- tree-ssa-loop-niter.c 1 Jul 2004 04:19:14 -0000
*************** Software Foundation, 59 Temple Place - S
*** 52,68 ****
*/
- /* Checks whether ARG is either NULL_TREE or constant zero. */
-
- static bool
- zero_p (tree arg)
- {
- if (!arg)
- return true;
-
- return integer_zerop (arg);
- }
-
/* Computes inverse of X modulo 2^s, where MASK = 2^s-1. */
static tree
--- 52,57 ----
Index: tree-ssa-loop-prefetch.c
===================================================================
RCS file: tree-ssa-loop-prefetch.c
diff -N tree-ssa-loop-prefetch.c
*** /dev/null 1 Jan 1970 00:00:00 -0000
--- tree-ssa-loop-prefetch.c 1 Jul 2004 04:19:14 -0000
***************
*** 0 ****
--- 1,665 ----
+ /* Array prefetching.
+ Copyright (C) 2004 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING. If not, write to the Free
+ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+ 02111-1307, USA. */
+
+ #include "config.h"
+ #include "system.h"
+ #include "coretypes.h"
+ #include "tm.h"
+ #include "tree.h"
+ #include "rtl.h"
+ #include "tm_p.h"
+ #include "hard-reg-set.h"
+ #include "basic-block.h"
+ #include "output.h"
+ #include "diagnostic.h"
+ #include "tree-flow.h"
+ #include "tree-dump.h"
+ #include "timevar.h"
+ #include "cfgloop.h"
+ #include "varray.h"
+ #include "expr.h"
+ #include "tree-pass.h"
+ #include "ggc.h"
+ #include "insn-config.h"
+ #include "recog.h"
+ #include "hashtab.h"
+ #include "tree-chrec.h"
+ #include "tree-scalar-evolution.h"
+
+ /* This pass inserts prefetch instructions to optimize cache usage during
+ accesses to arrays in loops. It processes loops sequentially and:
+
+ 1) Gathers all memory references in the single loop.
+ 2) For each of the references it decides when it is profitable to prefetch
+ it. To do it, we evaluate the reuse among the accesses, and determines
+ two values: PREFETCH_BEFORE (meaning that it only makes sense to do
+ prefetching in the first PREFETCH_BEFORE iterations of the loop) and
+ PREFETCH_MOD (meaning that it only makes sense to prefetch in the
+ iterations of the loop that are zero modulo PREFETCH_MOD). For example
+ (assuming cache line size is 64 bytes, char has size 1 byte and there
+ is no hardware sequential prefetch):
+
+ char *a;
+ for (i = 0; i < max; i++)
+ {
+ a[255] = ...; (0)
+ a[i] = ...; (1)
+ a[i + 64] = ...; (2)
+ a[16*i] = ...; (3)
+ a[187*i] = ...; (4)
+ a[187*i + 50] = ...; (5)
+ }
+
+ (0) obviously has PREFETCH_BEFORE 1
+ (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
+ location 64 iterations befor it, and PREFETCH_MOD 64 (since
+ it hits the same cache line otherwise).
+ (2) has PREFETCH_MOD 64
+ (3) has PREFETCH_MOD 4
+ (5) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
+ the cache line accessed by (5) is always different from the one
+ we need to access.
+ (6) has PREFETCH_MOD 1 as well. We do not set PREFETCH_BEFORE here,
+ since (4) accesses the same cache line with probability only
+ 7/32.
+
+ TODO: actually implement this.
+
+ 3) We determine how much ahead we need to prefetch. The number of
+ iterations needed is time to fetch / time spent in one iteration of
+ the loop. The problem is that we do not know either of these values,
+ so we just make a heuristic guess based on a magic (possibly)
+ target-specific constant and size of the loop.
+
+ 4) Determine which of the references we prefetch. We take into account
+ that there is a maximum number of simultaneous prefetches (provided
+ by machine description). We prefetch as many prefetches as possible
+ while still within this bound (starting with those with lowest
+ prefetch_mod, since they are responsible for most of the cache
+ misses).
+
+ 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
+ and PREFETCH_BEFORE requirements (within some bounds), and to avoid
+ prefetching nonaccessed memory.
+ TODO -- actually implement this.
+
+ 6) We actually emit the prefetch instructions. ??? Perhaps emit the
+ prefetch instructions with guards in cases where 5) was not sufficient
+ to satisfy the constraints?
+
+ Some other TODO:
+ -- write and use more general reuse analysis (that could be also used
+ in other cache aimed loop optimizations)
+ -- make it behave sanely together with the prefetches given by user
+ (now we just ignore them; at the very least we should avoid
+ optimizing loops in that user put his own prefetches) */
+
+ /* Magic constants follow. These should be replaced by machine specific
+ numbers. */
+
+ /* A number that should rouhgly correspond to the number of instructions
+ executed before the prefetch is completed. */
+
+ #ifndef PREFETCH_LATENCY
+ #define PREFETCH_LATENCY 50
+ #endif
+
+ /* Number of prefetches that can run at the same time. */
+
+ #ifndef SIMULTANEOUS_PREFETCHES
+ #define SIMULTANEOUS_PREFETCHES 3
+ #endif
+
+ #ifndef HAVE_prefetch
+ #define HAVE_prefetch 0
+ #endif
+
+ /* The group of references between that reuse may occur. */
+
+ struct mem_ref_group
+ {
+ tree base; /* Base of the reference. */
+ unsigned HOST_WIDE_INT step; /* Step of the reference. */
+ tree group_iv; /* Induction variable for the group. */
+ bool issue_prefetch_p; /* Is there any prefetch issued in the
+ group? */
+ struct mem_ref *refs; /* References in the group. */
+ struct mem_ref_group *next; /* Next group of references. */
+ };
+
+ /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
+
+ #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
+
+ /* The memory reference. */
+
+ struct mem_ref
+ {
+ unsigned HOST_WIDE_INT delta; /* Constant offset of the reference. */
+ bool write_p; /* Is it a write? */
+ struct mem_ref_group *group; /* The group of references it belongs to. */
+ unsigned HOST_WIDE_INT prefetch_mod;
+ /* Prefetch only each PREFETCH_MOD-th
+ iteration. */
+ unsigned HOST_WIDE_INT prefetch_before;
+ /* Prefetch only first PREFETCH_BEFORE
+ iterations. */
+ bool issue_prefetch_p; /* Should we really issue the prefetch? */
+ struct mem_ref *next; /* The next reference in the group. */
+ };
+
+ /* Dumps information obout reference REF to FILE. */
+
+ static void
+ dump_mem_ref (FILE *file, struct mem_ref *ref)
+ {
+ fprintf (file, "Reference %p:\n", (void *) ref);
+
+ fprintf (file, " group %p (base ", (void *) ref->group);
+ print_generic_expr (file, ref->group->base, TDF_SLIM);
+ fprintf (file, ", step ");
+ fprintf (file, HOST_WIDE_INT_PRINT_UNSIGNED, ref->group->step);
+ fprintf (file, ")\n");
+
+ fprintf (dump_file, " delta ");
+ fprintf (file, HOST_WIDE_INT_PRINT_UNSIGNED, ref->delta);
+ fprintf (file, ")\n");
+
+ fprintf (file, " %s\n", ref->write_p ? "write" : "read");
+
+ fprintf (file, "\n");
+ }
+
+ /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
+ exist. */
+
+ static struct mem_ref_group *
+ find_or_create_group (struct mem_ref_group **groups, tree base,
+ unsigned HOST_WIDE_INT step)
+ {
+ for (; *groups; groups = &(*groups)->next)
+ {
+ if ((*groups)->step == step
+ && operand_equal_p ((*groups)->base, base, 0))
+ return *groups;
+ }
+
+ (*groups) = xcalloc (1, sizeof (struct mem_ref_group));
+ (*groups)->base = base;
+ (*groups)->step = step;
+ (*groups)->group_iv = NULL_TREE;
+ (*groups)->refs = NULL;
+ (*groups)->next = NULL;
+
+ return *groups;
+ }
+
+ /* Records a memory reference in GROUP with offset DELTA and write status
+ WRITE_P. */
+
+ static void
+ record_ref (struct mem_ref_group *group, unsigned HOST_WIDE_INT delta,
+ bool write_p)
+ {
+ struct mem_ref **aref;
+
+ for (aref = &group->refs; *aref; aref = &(*aref)->next)
+ {
+ /* It does not have to be possible for write reference to reuse the read
+ prefetch. */
+ if (write_p && !(*aref)->write_p)
+ continue;
+
+ if ((*aref)->delta == delta)
+ return;
+ }
+
+ (*aref) = xcalloc (1, sizeof (struct mem_ref));
+ (*aref)->delta = delta;
+ (*aref)->write_p = write_p;
+ (*aref)->prefetch_before = PREFETCH_ALL;
+ (*aref)->prefetch_mod = 1;
+ (*aref)->issue_prefetch_p = false;
+ (*aref)->group = group;
+ (*aref)->next = NULL;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ dump_mem_ref (dump_file, *aref);
+ }
+
+ /* Release memory references in GROUPS. */
+
+ static void
+ release_mem_refs (struct mem_ref_group *groups)
+ {
+ struct mem_ref_group *next_g;
+ struct mem_ref *ref, *next_r;
+
+ for (; groups; groups = next_g)
+ {
+ next_g = groups->next;
+ for (ref = groups->refs; ref; ref = next_r)
+ {
+ next_r = ref->next;
+ free (ref);
+ }
+ free (groups);
+ }
+ }
+
+ /* A structure used to pass arguments to idx_analyze_ref. */
+
+ struct ar_data
+ {
+ struct loop *loop; /* Loop of the reference. */
+ tree stmt; /* Statement of the reference. */
+ unsigned HOST_WIDE_INT *step; /* Step of the memory reference. */
+ unsigned HOST_WIDE_INT *delta; /* Offset of the memory reference. */
+ };
+
+ /* Analyzes a single INDEX of a memory reference to obtain information
+ described at analyze_ref. Callback for for_each_index. */
+
+ static bool
+ idx_analyze_ref (tree base, tree *index, void *data)
+ {
+ struct ar_data *ar_data = data;
+ tree ibase, step, stepsize;
+ unsigned HOST_WIDE_INT istep, idelta = 0, imult = 1;
+
+ if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &ibase, &step))
+ return false;
+
+ if (zero_p (step))
+ istep = 0;
+ else
+ {
+ if (!cst_and_fits_in_hwi (step))
+ return false;
+ istep = int_cst_value (step);
+ }
+
+ if (base)
+ {
+ if (TREE_CODE (ibase) == PLUS_EXPR
+ && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
+ {
+ idelta = int_cst_value (TREE_OPERAND (ibase, 1));
+ ibase = TREE_OPERAND (ibase, 0);
+ }
+
+ stepsize = TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (base)));
+ if (!cst_and_fits_in_hwi (stepsize))
+ return false;
+ imult = int_cst_value (stepsize);
+
+ istep *= imult;
+ idelta *= imult;
+ }
+
+ *ar_data->step += istep;
+ *ar_data->delta += idelta;
+ *index = ibase;
+
+ return true;
+ }
+
+ /* Tries to express REF in shape &BASE + STEP * iter + DELTA, where DELTA and
+ STEP are integer constants and iter is number of iterations of LOOP. The
+ reference occurs in statement STMT. */
+
+ static bool
+ analyze_ref (struct loop *loop, tree ref, tree *base,
+ unsigned HOST_WIDE_INT *step, unsigned HOST_WIDE_INT *delta,
+ tree stmt)
+ {
+ struct ar_data ar_data;
+ tree off;
+ unsigned HOST_WIDE_INT bit_offset;
+
+ *step = 0;
+ *delta = 0;
+
+ /* First strip off the component references. Ignore bitfields. */
+ if (TREE_CODE (ref) == COMPONENT_REF
+ && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
+ ref = TREE_OPERAND (ref, 0);
+
+ for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
+ {
+ off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
+ bit_offset = TREE_INT_CST_LOW (off);
+
+ if (bit_offset % BITS_PER_UNIT)
+ abort ();
+
+ *delta += bit_offset / BITS_PER_UNIT;
+ }
+
+ *base = unshare_expr (ref);
+ ar_data.loop = loop;
+ ar_data.stmt = stmt;
+ ar_data.step = step;
+ ar_data.delta = delta;
+ return for_each_index (base, idx_analyze_ref, &ar_data);
+ }
+
+ /* Record a memory reference REF to the list REFS. The reference occurs in
+ LOOP in statement STMT and it is write if WRITE_P. */
+
+ static void
+ gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
+ tree ref, bool write_p, tree stmt)
+ {
+ tree base;
+ unsigned HOST_WIDE_INT step, delta;
+ struct mem_ref_group *agrp;
+
+ if (!analyze_ref (loop, ref, &base, &step, &delta, stmt))
+ return;
+
+ /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
+ are integer constants. */
+ agrp = find_or_create_group (refs, base, step);
+ record_ref (agrp, delta, write_p);
+ }
+
+ /* Record the suitable memory references in LOOP. */
+
+ static struct mem_ref_group *
+ gather_memory_references (struct loop *loop)
+ {
+ basic_block *body = get_loop_body_in_dom_order (loop);
+ basic_block bb;
+ unsigned i;
+ block_stmt_iterator bsi;
+ tree stmt, lhs, rhs;
+ struct mem_ref_group *refs = NULL;
+
+ /* Scan the loop body in order, so that the former references precede the
+ later ones. */
+ for (i = 0; i < loop->num_nodes; i++)
+ {
+ bb = body[i];
+ if (bb->loop_father != loop)
+ continue;
+
+ for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
+ {
+ stmt = bsi_stmt (bsi);
+ if (TREE_CODE (stmt) != MODIFY_EXPR)
+ continue;
+
+ lhs = TREE_OPERAND (stmt, 0);
+ rhs = TREE_OPERAND (stmt, 1);
+
+ if (TREE_CODE_CLASS (TREE_CODE (rhs)) == 'r')
+ gather_memory_references_ref (loop, &refs, rhs, false, stmt);
+ if (TREE_CODE_CLASS (TREE_CODE (lhs)) == 'r')
+ gather_memory_references_ref (loop, &refs, lhs, true, stmt);
+ }
+ }
+ free (body);
+
+ return refs;
+ }
+
+ /* Prune the prefetch candidate REF using the self-reuse. */
+
+ static void
+ prune_ref_by_self_reuse (struct mem_ref *ref)
+ {
+ if (ref->group->step == 0)
+ {
+ /* Prefetch references to invariant address just once. */
+ ref->prefetch_before = 1;
+ return;
+ }
+
+ /* TODO. */
+ }
+
+ /* Prune the prefetch candidate REF using the reuse with BY.
+ If BY_IS_BEFORE is true, BY is before REF in the loop. */
+
+ static void
+ prune_ref_by_group_reuse (struct mem_ref *ref ATTRIBUTE_UNUSED,
+ struct mem_ref *by ATTRIBUTE_UNUSED,
+ bool by_is_before ATTRIBUTE_UNUSED)
+ {
+ /* TODO. */
+ }
+
+ /* Prune the prefetch candidate REF using the reuses with other references
+ in REFS. */
+
+ static void
+ prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
+ {
+ struct mem_ref *prune_by;
+ bool before = true;
+
+ prune_ref_by_self_reuse (ref);
+
+ for (prune_by = refs; prune_by; prune_by = prune_by->next)
+ {
+ if (prune_by == ref)
+ {
+ before = false;
+ continue;
+ }
+
+ if (ref->write_p && !prune_by->write_p)
+ continue;
+
+ prune_ref_by_group_reuse (ref, prune_by, before);
+ }
+ }
+
+ /* Prune the prefetch candidates in GROUP using the reuse analysis. */
+
+ static void
+ prune_group_by_reuse (struct mem_ref_group *group)
+ {
+ struct mem_ref *ref_pruned;
+
+ for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
+ prune_ref_by_reuse (ref_pruned, group->refs);
+ }
+
+ /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
+
+ static void
+ prune_by_reuse (struct mem_ref_group *groups)
+ {
+ for (; groups; groups = groups->next)
+ prune_group_by_reuse (groups);
+ }
+
+ /* Decide which of the prefetch candidates in GROUPS to prefetch.
+ AHEAD is the number of iterations to prefetch ahead (which corresponds
+ to the number of simultaneous instances of one prefetch running at a
+ time). */
+
+ static void
+ schedule_prefetches (struct mem_ref_group *groups, unsigned ahead)
+ {
+ unsigned max_prefetches = SIMULTANEOUS_PREFETCHES / ahead;
+ struct mem_ref *ref;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);
+
+ /* For now we just take memory references one by one and issue
+ prefetches for as many as possible. TODO -- select the most
+ profitable prefetches. */
+
+ if (!max_prefetches)
+ return;
+
+ for (; groups; groups = groups->next)
+ for (ref = groups->refs; ref; ref = ref->next)
+ {
+ /* For now do not issue prefetches for only first few of the
+ iterations. */
+ if (ref->prefetch_before != PREFETCH_ALL)
+ continue;
+
+ ref->issue_prefetch_p = true;
+ groups->issue_prefetch_p = true;
+ max_prefetches--;
+
+ if (!max_prefetches)
+ return;
+ }
+ }
+
+ /* Issue prefetches for the referenc REF into LOOP as decided before.
+ HEAD is the number of iterations to prefetch ahead. */
+
+ static void
+ issue_prefetch_ref (struct loop *loop, struct mem_ref *ref, unsigned ahead)
+ {
+ unsigned HOST_WIDE_INT delta;
+ tree addr, stmts, prefetch, params, write_p;
+ block_stmt_iterator bsi;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);
+
+ /* Determine the address to prefetch. */
+ delta = ahead * ref->group->step + ref->delta;
+ addr = ref->group->group_iv;
+ if (delta)
+ addr = build (PLUS_EXPR, ptr_type_node,
+ addr, build_int_cst (ptr_type_node, delta));
+
+ addr = force_gimple_operand (addr, &stmts, false,
+ SSA_NAME_VAR (ref->group->group_iv));
+
+ /* Create the prefetch instruction. */
+ write_p = ref->write_p ? integer_one_node : integer_zero_node;
+ params = tree_cons (NULL_TREE, addr,
+ tree_cons (NULL_TREE, write_p, NULL_TREE));
+
+ prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
+ params);
+
+ /* And emit all the stuff. We put the prefetch to the loop header, so
+ that it runs early. */
+ bsi = bsi_after_labels (loop->header);
+ if (stmts)
+ bsi_insert_after (&bsi, stmts, BSI_CONTINUE_LINKING);
+ bsi_insert_after (&bsi, prefetch, BSI_NEW_STMT);
+ }
+
+ /* Issue prefetches for the references in GROUPS into LOOP as decided before.
+ HEAD is the number of iterations to prefetch ahead. */
+
+ static void
+ issue_prefetches (struct loop *loop, struct mem_ref_group *groups,
+ unsigned ahead)
+ {
+ struct mem_ref *ref;
+ tree iv_var, base, step;
+ block_stmt_iterator bsi;
+ bool after;
+
+ for (; groups; groups = groups->next)
+ {
+ if (!groups->issue_prefetch_p)
+ continue;
+
+ /* Create the induction variable for the group. */
+ iv_var = create_tmp_var (ptr_type_node, "prefetchtmp");
+ add_referenced_tmp_var (iv_var);
+ if (TREE_CODE (groups->base) == INDIRECT_REF)
+ base = fold_convert (ptr_type_node, TREE_OPERAND (groups->base, 0));
+ else
+ base = build (ADDR_EXPR, ptr_type_node, groups->base);
+ step = build_int_cst (ptr_type_node, groups->step);
+
+ standard_iv_increment_position (loop, &bsi, &after);
+ create_iv (base, step, iv_var, loop, &bsi, after, &groups->group_iv,
+ NULL);
+
+ for (ref = groups->refs; ref; ref = ref->next)
+ if (ref->issue_prefetch_p)
+ issue_prefetch_ref (loop, ref, ahead);
+ }
+ }
+
+ /* Issue prefetch instructions for array references in LOOP. */
+
+ static void
+ loop_prefetch_arrays (struct loop *loop)
+ {
+ struct mem_ref_group *refs;
+ unsigned ahead, ninsns;
+
+ /* Step 1: gather the memory references. */
+ refs = gather_memory_references (loop);
+
+ /* Step 2: estimate the reuse effects. */
+ prune_by_reuse (refs);
+
+ /* Step 3: determine the ahead. */
+
+ /* FIXME: We should use not size of the loop, but the average number of
+ instructions executed per iteration of the loop. */
+ ninsns = estimate_loop_size (loop);
+ ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
+
+ /* Step 4: what to prefetch? */
+ schedule_prefetches (refs, ahead);
+
+ /* Step 5: unroll and peel the loops. TODO. */
+
+ /* Step 6: issue the prefetches. */
+ issue_prefetches (loop, refs, ahead);
+
+ release_mem_refs (refs);
+ }
+
+ /* Issue prefetch instructions for array references in LOOPS. */
+
+ void
+ tree_ssa_prefetch_arrays (struct loops *loops)
+ {
+ unsigned i;
+ struct loop *loop;
+
+ if (!HAVE_prefetch)
+ return;
+
+ for (i = 1; i < loops->num; i++)
+ {
+ loop = loops->parray[i];
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Processing loop %d:\n", loop->num);
+
+ if (loop)
+ loop_prefetch_arrays (loop);
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "\n\n");
+ }
+ }
Index: tree-ssa-loop.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/tree-ssa-loop.c,v
retrieving revision 1.1.2.3.2.24
diff -c -3 -p -r1.1.2.3.2.24 tree-ssa-loop.c
*** tree-ssa-loop.c 18 Jun 2004 20:52:32 -0000 1.1.2.3.2.24
--- tree-ssa-loop.c 1 Jul 2004 04:19:14 -0000
*************** struct tree_opt_pass pass_linear_transfo
*** 438,443 ****
--- 438,476 ----
TODO_dump_func /* todo_flags_finish */
};
+ /* Prefetching. */
+
+ static void
+ tree_ssa_loop_prefetch (void)
+ {
+ if (!current_loops)
+ return;
+
+ tree_ssa_prefetch_arrays (current_loops);
+ }
+
+ static bool
+ gate_tree_ssa_loop_prefetch (void)
+ {
+ return flag_prefetch_loop_arrays != 0;
+ }
+
+ struct tree_opt_pass pass_loop_prefetch =
+ {
+ "prefetch", /* name */
+ gate_tree_ssa_loop_prefetch, /* gate */
+ tree_ssa_loop_prefetch, /* execute */
+ NULL, /* sub */
+ NULL, /* next */
+ 0, /* static_pass_number */
+ TV_TREE_PREFETCH, /* tv_id */
+ PROP_cfg | PROP_ssa, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_dump_func /* todo_flags_finish */
+ };
+
/* Induction variable optimizations. */
static void
Index: tree-ssa-operands.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/tree-ssa-operands.c,v
retrieving revision 1.1.2.3.2.9
diff -c -3 -p -r1.1.2.3.2.9 tree-ssa-operands.c
*** tree-ssa-operands.c 17 Jun 2004 18:25:13 -0000 1.1.2.3.2.9
--- tree-ssa-operands.c 1 Jul 2004 04:19:14 -0000
*************** get_stmt_operands (tree stmt)
*** 880,885 ****
--- 880,908 ----
}
+ /* Returns true if the function call EXPR does not access memory. */
+
+ static bool
+ function_ignores_memory_p (tree expr)
+ {
+ tree fndecl = get_callee_fndecl (expr);
+ enum built_in_function fcode;
+
+ if (!fndecl || !DECL_BUILT_IN (fndecl))
+ return false;
+
+ fcode = DECL_FUNCTION_CODE (fndecl);
+
+ switch (fcode)
+ {
+ case BUILT_IN_PREFETCH:
+ return true;
+
+ default:
+ return false;
+ }
+ }
+
/* Recursively scan the expression pointed by EXPR_P in statement STMT.
FLAGS is one of the OPF_* constants modifying how to interpret the
operands found. PREV_VOPS is as in append_v_may_def and append_vuse. */
*************** get_expr_operands (tree stmt, tree *expr
*** 1114,1120 ****
/* A 'pure' or a 'const' functions never call clobber anything.
A 'noreturn' function might, but since we don't return anyway
there is no point in recording that. */
! if (!(call_flags
& (ECF_PURE | ECF_CONST | ECF_NORETURN)))
add_call_clobber_ops (stmt, prev_vops);
else if (!(call_flags & (ECF_CONST | ECF_NORETURN)))
--- 1137,1145 ----
/* A 'pure' or a 'const' functions never call clobber anything.
A 'noreturn' function might, but since we don't return anyway
there is no point in recording that. */
! if (function_ignores_memory_p (expr))
! ;
! else if (!(call_flags
& (ECF_PURE | ECF_CONST | ECF_NORETURN)))
add_call_clobber_ops (stmt, prev_vops);
else if (!(call_flags & (ECF_CONST | ECF_NORETURN)))
Index: tree.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/tree.c,v
retrieving revision 1.263.2.75.2.9
diff -c -3 -p -r1.263.2.75.2.9 tree.c
*** tree.c 14 Jun 2004 01:58:16 -0000 1.263.2.75.2.9
--- tree.c 1 Jul 2004 04:19:14 -0000
*************** needs_to_live_in_memory (tree t)
*** 5609,5614 ****
--- 5609,5628 ----
|| decl_function_context (t) != current_function_decl);
}
+ /* Checks that X is integer constant that fits in unsigned HOST_WIDE_INT.
+ Similar to host_integerp (x, 1), but does not fail if the value is
+ negative. */
+
+ bool
+ cst_and_fits_in_hwi (tree x)
+ {
+ if (TREE_CODE (x) != INTEGER_CST)
+ return false;
+
+ return (TREE_INT_CST_HIGH (x) == 0
+ || TREE_INT_CST_HIGH (x) == -1);
+ }
+
/* Return value of a constant X. */
HOST_WIDE_INT
Index: tree.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/tree.h,v
retrieving revision 1.342.2.154.2.17
diff -c -3 -p -r1.342.2.154.2.17 tree.h
*** tree.h 17 Jun 2004 18:25:13 -0000 1.342.2.154.2.17
--- tree.h 1 Jul 2004 04:19:14 -0000
*************** extern int integer_pow2p (tree);
*** 3218,3223 ****
--- 3218,3227 ----
extern int integer_nonzerop (tree);
+ /* Checks whether the argument is either NULL_TREE or constant zero. */
+
+ extern bool zero_p (tree);
+
/* staticp (tree x) is nonzero if X is a reference to data allocated
at a fixed address in memory. */
*************** extern void init_ttree (void);
*** 3582,3587 ****
--- 3586,3592 ----
extern void build_common_tree_nodes (int);
extern void build_common_tree_nodes_2 (int);
extern tree build_range_type (tree, tree, tree);
+ extern bool cst_and_fits_in_hwi (tree);
extern HOST_WIDE_INT int_cst_value (tree);
extern tree build_int_cst (tree, unsigned HOST_WIDE_INT);