Updated prefetch patch
Jan Hubicka
hubicka@atrey.karlin.mff.cuni.cz
Fri Apr 14 06:23:00 GMT 2000
Hi
Here is an updated patch for prefetching. I've found an important problem in the code
emitting prefetches before the loop (it was using uninitialized bivs), so I've dropped
that part for now; it is not all that important.
I've also cleaned up how the new givs are recorded - check_insn_for_givs is now
used instead of doing this by hand.
This results in slightly worse code, since the memory giv in the prefetch instruction
is not recognized, but that is a general problem with address operands.
I would like to teach check_insn_for_givs to handle this later, and perhaps also
teach it something about converting sets to leas later today.
The prefetch instructions are now optimized no worse than any other insn
in the loop.
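To illustrate what the pass is aiming at (this example is not part of the patch),
the heuristics look for dense, constant-stride array walks like the one below and
insert a prefetch a few prefetch blocks ahead of each such access:

/* Illustrative only: a sketch of the kind of loop -fprefetch targets.  The
   function and the lookahead distance are made up, not taken from the patch.  */
void
scale_array (float *a, int n)
{
  int i;

  for (i = 0; i < n; i++)
    {
      /* For a dense, constant-stride access like a[i], the pass emits a
         prefetch insn for an address a few prefetch blocks ahead of &a[i],
         reusing the same giv, roughly as if the source prefetched
         &a[i + 16] here.  */
      a[i] = a[i] * 2.0f;
    }
}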
Honza
Thu Apr 13 10:46:44 CEST 2000 Jan Hubicka <jh@suse.cz>
Add basic prefetch infrastructure:
* loop.c (scan_loop, recombine_givs): Merge unroll_p and bct_p parameters
into flags.
(strength_reduce): Likewise; call emit_prefetch_instructions.
(loop_optimize): Likewise.
(MAX_PREFETCHES): New macro.
(prefetch_info, check_store_data): New structures.
(check_store, emit_prefetch_instructions): New functions.
* loop.h (LOOP_UNROLL, LOOP_PREFETCH, LOOP_BCT): New constants.
* toplev.c (flag_prefetch): New global variable.
(lang_independent_options): Add -fprefetch.
(rest_of_compilation): Update calls to loop_optimize; do loop unrolling
in the first pass and prefetch optimizations in the second pass.
* flags.h (flag_strength_reduce): Declare.
* rtl.h (loop_optimize): Update prototype.
* invoke.texi (-fprefetch): Document.
* md.texi (prefetch, prefetchw): Document.
* tm.texi (PREFETCH_BLOCK_SIZE, SIMULTATENOUS_PREFETCHES): Document.
* i386.c (*_cost): Initialize prefetch_block and simultatenous_prefetches.
(x86_3dNOW, x86_SSE): New global variables.
(ix86_attr_length_default): Handle TYPE_PREFETCH.
* i386.h (struct processor_costs): Add simultatenous_prefetches
and prefetch_block.
(ARCHMASK): New macro.
(x86_3dNOW, x86_SSE): Declare.
(TARGET_CMOVE): Use ARCHMASK.
(TARGET_3DNOW, TARGET_SSE): New macros.
(PREFETCH_BLOCK, SIMULTATENOUS_PREFETCHES): New macros.
* i386.md (attr type): Add "prefetch".
(attr length_opcode): Support prefetch type.
(attr memory_operand, k6_alu, athlon_ieu): Likewise.
(prefetch expander): New.
(prefetch and prefetchw patterns): New.
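For reference, the density test that decides whether a detected array is actually
prefetched reduces to a check like the sketch below (my reading of the patch; the
function and parameter names are made up for illustration):

/* The 220/256 ratio (about 86%) is taken from the patch; everything else
   here is illustrative.  */
static int
dense_enough_p (int bytes_accessed, int straddle)
{
  /* Scale by 256 to stay in integer arithmetic, as the patch does.  */
  return bytes_accessed * 256 / straddle > 220;
}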
Index: egcs/gcc/loop.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/loop.c,v
retrieving revision 1.233
diff -c -3 -p -r1.233 loop.c
*** loop.c 2000/03/25 18:34:03 1.233
--- loop.c 2000/04/14 13:18:25
*************** static void count_loop_regs_set PARAMS (
*** 246,252 ****
static void note_addr_stored PARAMS ((rtx, rtx, void *));
static void note_set_pseudo_multiple_uses PARAMS ((rtx, rtx, void *));
static int loop_reg_used_before_p PARAMS ((const struct loop *, rtx, rtx));
! static void scan_loop PARAMS ((struct loop*, int, int));
#if 0
static void replace_call_address PARAMS ((rtx, rtx, rtx));
#endif
--- 246,252 ----
static void note_addr_stored PARAMS ((rtx, rtx, void *));
static void note_set_pseudo_multiple_uses PARAMS ((rtx, rtx, void *));
static int loop_reg_used_before_p PARAMS ((const struct loop *, rtx, rtx));
! static void scan_loop PARAMS ((struct loop*, int));
#if 0
static void replace_call_address PARAMS ((rtx, rtx, rtx));
#endif
*************** static void add_label_notes PARAMS ((rtx
*** 261,267 ****
static void move_movables PARAMS ((struct loop *loop, struct movable *,
int, int, int));
static int count_nonfixed_reads PARAMS ((const struct loop *, rtx));
! static void strength_reduce PARAMS ((struct loop *, int, int, int));
static void find_single_use_in_loop PARAMS ((rtx, rtx, varray_type));
static int valid_initial_value_p PARAMS ((rtx, rtx, int, rtx));
static void find_mem_givs PARAMS ((const struct loop *, rtx, rtx, int, int));
--- 261,267 ----
static void move_movables PARAMS ((struct loop *loop, struct movable *,
int, int, int));
static int count_nonfixed_reads PARAMS ((const struct loop *, rtx));
! static void strength_reduce PARAMS ((struct loop *, int, int));
static void find_single_use_in_loop PARAMS ((rtx, rtx, varray_type));
static int valid_initial_value_p PARAMS ((rtx, rtx, int, rtx));
static void find_mem_givs PARAMS ((const struct loop *, rtx, rtx, int, int));
*************** static int replace_loop_reg PARAMS ((rtx
*** 308,313 ****
--- 308,314 ----
static void note_reg_stored PARAMS ((rtx, rtx, void *));
static void try_copy_prop PARAMS ((const struct loop *, rtx, unsigned int));
static int replace_label PARAMS ((rtx *, void *));
+ static void check_insn_for_givs PARAMS((struct loop *, rtx, int, int));
typedef struct rtx_and_int {
rtx r;
*************** compute_luids (start, end, prev_luid)
*** 419,429 ****
(or 0 if none should be output). */
void
! loop_optimize (f, dumpfile, unroll_p, bct_p)
/* f is the first instruction of a chain of insns for one function */
rtx f;
FILE *dumpfile;
! int unroll_p, bct_p;
{
register rtx insn;
register int i;
--- 420,430 ----
(or 0 if none should be output). */
void
! loop_optimize (f, dumpfile, flags)
/* f is the first instruction of a chain of insns for one function */
rtx f;
FILE *dumpfile;
! int flags;
{
register rtx insn;
register int i;
*************** loop_optimize (f, dumpfile, unroll_p, bc
*** 529,535 ****
struct loop *loop = &loops->array[i];
if (! loop->invalid && loop->end)
! scan_loop (loop, unroll_p, bct_p);
}
/* If there were lexical blocks inside the loop, they have been
--- 530,536 ----
struct loop *loop = &loops->array[i];
if (! loop->invalid && loop->end)
! scan_loop (loop, flags);
}
/* If there were lexical blocks inside the loop, they have been
*************** next_insn_in_loop (loop, insn)
*** 588,596 ****
write, then we can also mark the memory read as invariant. */
static void
! scan_loop (loop, unroll_p, bct_p)
struct loop *loop;
! int unroll_p, bct_p;
{
register int i;
rtx loop_start = loop->start;
--- 589,597 ----
write, then we can also mark the memory read as invariant. */
static void
! scan_loop (loop, flags)
struct loop *loop;
! int flags;
{
register int i;
rtx loop_start = loop->start;
*************** scan_loop (loop, unroll_p, bct_p)
*** 1142,1148 ****
if (flag_strength_reduce)
{
the_movables = movables;
! strength_reduce (loop, insn_count, unroll_p, bct_p);
reg_scan_update (update_start, update_end, loop_max_reg);
loop_max_reg = max_reg_num ();
--- 1143,1149 ----
if (flag_strength_reduce)
{
the_movables = movables;
! strength_reduce (loop, insn_count, flags);
reg_scan_update (update_start, update_end, loop_max_reg);
loop_max_reg = max_reg_num ();
*************** static rtx addr_placeholder;
*** 3694,3699 ****
--- 3695,4007 ----
was rerun in loop_optimize whenever a register was added or moved.
Also, some of the optimizations could be a little less conservative. */
+ #ifdef HAVE_prefetch
+ /* Give up the prefetch optimizations once we exceed a given threshold.  It is
+ unlikely we would be able to optimize anything in a loop with so many
+ detected prefetches. */
+
+ #define MAX_PREFETCHES 10
+
+ /* Information we collect about arrays we may want to prefetch. */
+ struct prefetch_info
+ {
+ struct iv_class *class; /* Class this prefetch is based on. */
+ struct induction *giv; /* GIV this prefetch is based on. */
+ rtx base_address; /* Start prefetching from this address plus index. */
+ int index; /* Constant offset added to the base address. */
+ int straddle; /* Prefetch straddle in bytes in each iteration. */
+ int write; /* 1 for read/write prefetches. */
+ int bytes_accesed; /* Sum of sizes of all accesses to this prefetch area. */
+ int do_prefetch; /* 1 for those chosen for prefetching. */
+ };
+
+ /* Data used by check_store function. */
+ struct check_store_data
+ {
+ rtx mem_address;
+ int mem_write;
+ };
+
+ static void check_store PARAMS ((rtx, rtx, void *));
+ static void emit_prefetch_instructions PARAMS ((struct loop *));
+
+ /* Set mem_write when mem_address is found. Used as callback to note_stores. */
+ static void
+ check_store (x, pat, data)
+ rtx x, pat ATTRIBUTE_UNUSED;
+ void *data;
+ {
+ struct check_store_data *d = (struct check_store_data *)data;
+ /* If X is the MEM whose address we are watching, record that it
+ is written to. */
+
+
+ if ((GET_CODE (x) == MEM) && rtx_equal_p (d->mem_address, XEXP (x, 0)))
+ d->mem_write = 1;
+ }
+
+ /* Attempt to identify accesses to arrays that are likely to cause cache misses
+ and emit prefetch instructions a few prefetch blocks ahead of them.
+
+ To detect the arrays we use the GIV information collected by the strength
+ reduction pass.
+
+ The prefetch instructions are generated after GIV information is collected and
+ before the strength reduction process. The new GIVs are injected into the
+ strength reduction tables, so the prefetch addresses are optimized as well.
+
+ Givs are split into a base address, a straddle, and a constant addition value.
+ Givs with the same base address and straddle and close addition values are
+ combined into a single prefetch. Writes to givs are also detected, so Athlon's
+ prefetchw instruction can be used for blocks we write to.
+
+ Later we may want to implement smarter heuristics to detect cache misses.
+ Currently we don't prefetch when one of the following conditions holds:
+
+
+ 1) The loop has a known (and low) iteration count.
+ 2) The loop has more than 10 prefetch blocks.
+ 3) The loop contains a function call (such a loop is probably not an inner loop).
+ 4) The density of the prefetch is less than 80%.
+ 5) The straddle is larger than 4096 bytes, or negative.
+
+ */
+ static void
+ emit_prefetch_instructions (struct loop *loop)
+ {
+ int num_prefetches = 0;
+ int num_real_prefetches = 0;
+ int num_real_write_prefetches = 0;
+ int ahead;
+ int i;
+ struct iv_class *bl;
+ struct induction *iv;
+ struct prefetch_info info[MAX_PREFETCHES];
+
+ if (!HAVE_prefetch)
+ return;
+
+ /* Loops with few iterations can be handled by emitting prefetches before the loop.
+ Handle this later. */
+ if ((LOOP_INFO (loop)->n_iterations && LOOP_INFO (loop)->n_iterations < 1000)
+ /* Consider only loops without calls.  When a call is made, the loop is probably slow enough
+ to read the memory. */
+ || LOOP_INFO (loop)->has_call)
+ return;
+
+ /* Search all induction variables and pick those interesting for the prefetch machinery. */
+ for (bl = loop_iv_list; bl; bl = bl->next)
+ {
+ struct induction *biv = bl->biv, *biv1;
+ int basestraddle = 0;
+
+ biv1 = biv;
+ /* Expect all bivs to be executed in each iteration. This makes our
+ analysis more conservative. */
+ while (biv1)
+ {
+ if (GET_CODE (biv1->add_val) != CONST_INT)
+ break;
+ basestraddle += INTVAL (biv1->add_val);
+ biv1 = biv1->next_iv;
+ }
+ if (biv1 || !basestraddle)
+ continue;
+ for (iv = bl->giv; iv; iv = iv->next_iv)
+ {
+ rtx address;
+ rtx temp;
+ int index = 0;
+ int add = 1;
+ int straddle;
+ struct check_store_data d;
+
+ /* Half a dozen reasons why this induction variable is not
+ interesting for us. */
+ if (iv->giv_type != DEST_ADDR
+ /* We are interested only in constant straddle memory references
+ in order to be able to compute density easily. */
+ || GET_CODE (iv->mult_val) != CONST_INT
+ /* Don't handle reversed order prefetches, since they are usually
+ ineffective. Later we may be able to reverse such bivs. */
+ || (straddle = INTVAL (iv->mult_val) * basestraddle) < 0
+ /* Prefetching of accesses with such an extreme straddle is probably not
+ worthwhile either. */
+ || straddle > 4096)
+ continue;
+
+ /* Determine the pointer to the array we are examining.  Usually we will
+ be faced with constructs like (plus:SI (array_pointer) (const_int disp)) */
+ address = iv->add_val;
+ address = simplify_gen_binary (PLUS, Pmode, bl->initial_value, address);
+ /* Try hard to convert the address to something we understand.
+ The real base addresses can be well hidden. */
+ address = canon_rtx (address);
+ temp = simplify_rtx (address);
+ if (temp)
+ address = temp;
+
+ if (GET_CODE (address) == CONST)
+ address = XEXP (address, 0);
+ if (GET_CODE (address) == PLUS
+ && GET_CODE (XEXP (address, 1)) == CONST_INT)
+ {
+ index = INTVAL (XEXP (address, 1));
+ address = XEXP (address, 0);
+ }
+ if (GET_CODE (address) == CONST_INT)
+ {
+ index = INTVAL (address);
+ address = const0_rtx;
+ }
+ index += GET_MODE_SIZE (iv->mem_mode);
+ d.mem_write = 0;
+ d.mem_address = *iv->location;
+ /* When the giv is not always executed, we may be better off not dirtying
+ the cache pages. */
+ if (iv->always_executed)
+ note_stores (PATTERN (iv->insn), check_store, &d);
+
+ /* Attempt to find a prefetch to the same array and see if we can merge this one. */
+ for (i = 0; i < num_prefetches; i++)
+ if (rtx_equal_p (address, info[i].base_address)
+ && straddle == info[i].straddle)
+ {
+ if (index >= info[i].index && index - info[i].index < 4096)
+ {
+ info[i].write |= d.mem_write;
+ info[i].bytes_accesed += GET_MODE_SIZE (iv->mem_mode);
+ info[i].index = index;
+ info[i].giv = iv;
+ info[i].class = bl;
+ info[num_prefetches].base_address = address;
+ add = 0;
+ break;
+ }
+ if (index < info[i].index && info[i].index - index < 4096)
+ {
+ info[i].write |= d.mem_write;
+ info[i].bytes_accesed += GET_MODE_SIZE (iv->mem_mode);
+ add = 0;
+ break;
+ }
+ }
+ /* Merging failed. */
+ if (add)
+ {
+ info[num_prefetches].giv = iv;
+ info[num_prefetches].class = bl;
+ info[num_prefetches].index = index;
+ info[num_prefetches].straddle = straddle;
+ info[num_prefetches].base_address = address;
+ info[num_prefetches].write = d.mem_write;
+ info[num_prefetches].bytes_accesed = GET_MODE_SIZE (iv->mem_mode);
+ num_prefetches++;
+ if (num_prefetches >= MAX_PREFETCHES)
+ {
+ if (loop_dump_stream)
+ fprintf (loop_dump_stream, "Maximum number of prefetches exceeded.\n");
+ return;
+ }
+ }
+ }
+ }
+ for (i = 0; i < num_prefetches; i++)
+ {
+ /* Prefetch is worthwhile only when the reads/writes are dense. */
+ if (info[i].bytes_accesed * 256 / info[i].straddle > 220)
+ info[i].do_prefetch = 1;
+ else
+ info[i].do_prefetch = 0;
+ }
+ for (i = 0; i < num_prefetches; i++)
+ {
+ if (info[i].do_prefetch)
+ {
+ num_real_prefetches += ((info[i].straddle + PREFETCH_BLOCK - 1)
+ / PREFETCH_BLOCK);
+ if (info[i].write)
+ num_real_write_prefetches += ((info[i].straddle + PREFETCH_BLOCK - 1)
+ / PREFETCH_BLOCK);
+ }
+ }
+ if (loop_dump_stream)
+ {
+ for (i = 0; i < num_prefetches; i++)
+ {
+ fprintf (loop_dump_stream, "Prefetch insn %i address: ",
+ INSN_UID (info[i].giv->insn));
+ print_rtl (loop_dump_stream, info[i].base_address);
+ fprintf (loop_dump_stream, " Index:%i straddle:%i density:%i%% %s %s\n",
+ info[i].index, info[i].straddle,
+ info[i].bytes_accesed * 100 / info[i].straddle,
+ info[i].write ? "read/write" : "read only",
+ info[i].do_prefetch ? "prefetch" : "ignore");
+ }
+ fprintf (loop_dump_stream, "Real prefetches needed:%i (write:%i)\n",
+ num_real_prefetches, num_real_write_prefetches);
+ }
+
+ if (!num_real_prefetches)
+ return;
+
+ ahead = (SIMULTATENOUS_PREFETCHES / (num_real_prefetches));
+
+ if (!ahead)
+ return;
+ for (i = 0; i < num_prefetches; i++)
+ {
+ if (info[i].do_prefetch)
+ {
+ int y;
+ for (y = 0; y < ((info[i].straddle + PREFETCH_BLOCK - 1)
+ / PREFETCH_BLOCK); y++)
+ {
+ rtx loc = copy_rtx (*info[i].giv->location);
+ rtx insn;
+ int bytes_ahead = PREFETCH_BLOCK * (ahead + y);
+ rtx before_insn = info[i].giv->insn;
+ rtx prev_insn = PREV_INSN (info[i].giv->insn);
+
+ /* We can save some effort by offsetting the address on architectures with
+ offsettable memory references. */
+ if (offsettable_address_p (0, SImode, loc))
+ {
+ loc = gen_rtx_MEM (SImode, loc);
+ loc = adj_offsettable_operand (loc, bytes_ahead);
+ loc = XEXP (loc, 0);
+ }
+ else
+ {
+ rtx reg = gen_reg_rtx (Pmode);
+ emit_iv_add_mult (loc, const1_rtx, GEN_INT (bytes_ahead), reg,
+ before_insn);
+ loc = reg;
+ }
+
+ #ifdef HAVE_prefetchw
+ if (info[i].write && HAVE_prefetchw)
+ emit_insn_before (gen_prefetchw (loc), before_insn);
+ else
+ #endif
+ emit_insn_before (gen_prefetch (loc), before_insn);
+
+ /* Check all insns emitted and record new giv information. */
+ insn = NEXT_INSN (prev_insn);
+ while (insn != before_insn)
+ {
+ check_insn_for_givs (loop, insn,
+ info[i].giv->always_executed,
+ info[i].giv->maybe_multiple);
+ insn = NEXT_INSN (insn);
+ }
+ }
+ }
+ }
+ return;
+ }
+ #endif
+
/* Perform strength reduction and induction variable elimination.
Pseudo registers created during this function will be beyond the last
*************** static rtx addr_placeholder;
*** 3703,3712 ****
But scan_loop must check regnos to make sure they are in bounds. */
static void
! strength_reduce (loop, insn_count, unroll_p, bct_p)
struct loop *loop;
int insn_count;
! int unroll_p, bct_p ATTRIBUTE_UNUSED;
{
rtx p;
rtx set;
--- 4011,4020 ----
But scan_loop must check regnos to make sure they are in bounds. */
static void
! strength_reduce (loop, insn_count, flags)
struct loop *loop;
int insn_count;
! int flags ATTRIBUTE_UNUSED;
{
rtx p;
rtx set;
*************** strength_reduce (loop, insn_count, unrol
*** 3979,3985 ****
{
/* Can still unroll the loop anyways, but indicate that there is no
strength reduction info available. */
! if (unroll_p)
unroll_loop (loop, insn_count, end_insert_before, 0);
goto egress;
--- 4287,4293 ----
{
/* Can still unroll the loop anyways, but indicate that there is no
strength reduction info available. */
! if (flags & LOOP_UNROLL)
unroll_loop (loop, insn_count, end_insert_before, 0);
goto egress;
*************** strength_reduce (loop, insn_count, unrol
*** 4635,4640 ****
--- 4871,4881 ----
loop_iterations (loop);
+ #ifdef HAVE_prefetch
+ if (flags & LOOP_PREFETCH)
+ emit_prefetch_instructions (loop);
+ #endif
+
/* Now for each giv for which we still don't know whether or not it is
replaceable, check to see if it is replaceable because its final value
can be calculated. This must be done after loop_iterations is called,
*************** strength_reduce (loop, insn_count, unrol
*** 4865,4871 ****
VARRAY_GROW (reg_iv_type, nregs);
VARRAY_GROW (reg_iv_info, nregs);
}
! recombine_givs (loop, bl, unroll_p);
/* Reduce each giv that we decided to reduce. */
--- 5106,5112 ----
VARRAY_GROW (reg_iv_type, nregs);
VARRAY_GROW (reg_iv_info, nregs);
}
! recombine_givs (loop, bl, flags);
/* Reduce each giv that we decided to reduce. */
*************** strength_reduce (loop, insn_count, unrol
*** 5259,5272 ****
induction variable information that strength_reduce has already
collected. Always unroll loops that would be as small or smaller
unrolled than when rolled. */
! if (unroll_p
|| (loop_info->n_iterations > 0
&& unrolled_insn_copies <= insn_count))
unroll_loop (loop, insn_count, end_insert_before, 1);
#ifdef HAVE_decrement_and_branch_on_count
/* Instrument the loop with BCT insn. */
! if (HAVE_decrement_and_branch_on_count && bct_p
&& flag_branch_on_count_reg)
insert_bct (loop);
#endif /* HAVE_decrement_and_branch_on_count */
--- 5500,5513 ----
induction variable information that strength_reduce has already
collected. Always unroll loops that would be as small or smaller
unrolled than when rolled. */
! if ((flags & LOOP_UNROLL)
|| (loop_info->n_iterations > 0
&& unrolled_insn_copies <= insn_count))
unroll_loop (loop, insn_count, end_insert_before, 1);
#ifdef HAVE_decrement_and_branch_on_count
/* Instrument the loop with BCT insn. */
! if (HAVE_decrement_and_branch_on_count && (flags & LOOP_BCT)
&& flag_branch_on_count_reg)
insert_bct (loop);
#endif /* HAVE_decrement_and_branch_on_count */
*************** find_life_end (x, stats, insn, biv)
*** 7395,7404 ****
This tends to shorten giv lifetimes, and helps the next step:
try to derive givs from other givs. */
static void
! recombine_givs (loop, bl, unroll_p)
const struct loop *loop;
struct iv_class *bl;
! int unroll_p;
{
struct induction *v, **giv_array, *last_giv;
struct recombine_givs_stats *stats;
--- 7719,7728 ----
This tends to shorten giv lifetimes, and helps the next step:
try to derive givs from other givs. */
static void
! recombine_givs (loop, bl, flags)
const struct loop *loop;
struct iv_class *bl;
! int flags;
{
struct induction *v, **giv_array, *last_giv;
struct recombine_givs_stats *stats;
*************** recombine_givs (loop, bl, unroll_p)
*** 7684,7690 ****
&& ((GET_CODE (sum) == PLUS
&& GET_CODE (XEXP (sum, 0)) == REG
&& GET_CODE (XEXP (sum, 1)) == CONST_INT)
! || ! unroll_p)
&& validate_change (v->insn, &PATTERN (v->insn),
gen_rtx_SET (VOIDmode, v->dest_reg, sum), 0))
{
--- 8008,8014 ----
&& ((GET_CODE (sum) == PLUS
&& GET_CODE (XEXP (sum, 0)) == REG
&& GET_CODE (XEXP (sum, 1)) == CONST_INT)
! || ! (flags & LOOP_UNROLL))
&& validate_change (v->insn, &PATTERN (v->insn),
gen_rtx_SET (VOIDmode, v->dest_reg, sum), 0))
{
*** ./loop.h.old Thu Apr 13 08:14:43 2000
--- loop.h Thu Apr 13 08:47:40 2000
*************** Boston, MA 02111-1307, USA. */
*** 20,25 ****
--- 20,29 ----
#include "varray.h"
#include "basic-block.h"
+ /* Flags passed to loop_optimize. */
+ #define LOOP_UNROLL 1
+ #define LOOP_PREFETCH 2
+ #define LOOP_BCT 4
/* Get the loop info pointer of a loop. */
#define LOOP_INFO(LOOP) ((struct loop_info *) (LOOP)->aux)
*** ./toplev.c.old Thu Apr 13 08:46:15 2000
--- toplev.c Thu Apr 13 10:33:27 2000
*************** int flag_unroll_loops;
*** 531,536 ****
--- 531,540 ----
int flag_unroll_all_loops;
+ /* Nonzero enables prefetch optimizations. */
+
+ int flag_prefetch;
+
/* Nonzero forces all invariant computations in loops to be moved
outside the loop. */
*************** lang_independent_options f_options[] =
*** 946,951 ****
--- 950,957 ----
"Perform loop unrolling when iteration count is known" },
{"unroll-all-loops", &flag_unroll_all_loops, 1,
"Perform loop unrolling for all loops" },
+ {"prefetch", &flag_prefetch, 1,
+ "Output prefetch instructions to increase memory banwidth when available" },
{"move-all-movables", &flag_move_all_movables, 1,
"Force all loop invariant computations out of loops" },
{"reduce-all-givs", &flag_reduce_all_givs, 1,
*************** rest_of_compilation (decl)
*** 3120,3126 ****
{
/* We only want to perform unrolling once. */
! loop_optimize (insns, rtl_dump_file, 0, 0);
/* The first call to loop_optimize makes some instructions
trivially dead. We delete those instructions now in the
--- 3126,3133 ----
{
/* We only want to perform unrolling once. */
! loop_optimize (insns, rtl_dump_file,
! (flag_unroll_loops ? LOOP_UNROLL : 0));
/* The first call to loop_optimize makes some instructions
trivially dead. We delete those instructions now in the
*************** rest_of_compilation (decl)
*** 3132,3138 ****
analysis code depends on this information. */
reg_scan (insns, max_reg_num (), 1);
}
! loop_optimize (insns, rtl_dump_file, flag_unroll_loops, 1);
});
close_dump_file (DFI_loop, print_rtl, insns);
--- 3139,3149 ----
analysis code depends on this information. */
reg_scan (insns, max_reg_num (), 1);
}
! loop_optimize (insns, rtl_dump_file,
! (LOOP_BCT
! | (flag_unroll_loops && ! flag_rerun_loop_opt
! ? LOOP_UNROLL : 0)
! | (flag_prefetch ? LOOP_PREFETCH : 0)));
});
close_dump_file (DFI_loop, print_rtl, insns);
*** ./flags.h.old Thu Apr 13 08:47:01 2000
--- flags.h Thu Apr 13 08:47:24 2000
*************** extern int flag_strength_reduce;
*** 226,231 ****
--- 226,234 ----
extern int flag_unroll_loops;
+ /* Nonzero enables prefetch optimizations. */
+ extern int flag_prefetch;
+
/* Nonzero enables loop unrolling in unroll.c. All loops are unrolled.
This is generally not a win. */
*** ./rtl.h.old Thu Apr 13 09:07:50 2000
--- rtl.h Thu Apr 13 09:07:58 2000
*************** extern void print_inline_rtx PARAMS ((F
*** 1582,1588 ****
extern void init_loop PARAMS ((void));
extern rtx libcall_other_reg PARAMS ((rtx, rtx));
#ifdef BUFSIZ
! extern void loop_optimize PARAMS ((rtx, FILE *, int, int));
#endif
extern void record_excess_regs PARAMS ((rtx, rtx, rtx *));
--- 1582,1588 ----
extern void init_loop PARAMS ((void));
extern rtx libcall_other_reg PARAMS ((rtx, rtx));
#ifdef BUFSIZ
! extern void loop_optimize PARAMS ((rtx, FILE *, int));
#endif
extern void record_excess_regs PARAMS ((rtx, rtx, rtx *));
*** ./invoke.texi.old Thu Apr 13 09:11:33 2000
--- invoke.texi Thu Apr 13 09:13:33 2000
*************** in the following sections.
*** 168,174 ****
-fregmove -frerun-cse-after-loop -frerun-loop-opt -freduce-all-givs
-fschedule-insns -fschedule-insns2 -fstrength-reduce
-fstrict-aliasing -fthread-jumps -funroll-all-loops
! -funroll-loops
-O -O0 -O1 -O2 -O3 -Os
@end smallexample
--- 168,174 ----
-fregmove -frerun-cse-after-loop -frerun-loop-opt -freduce-all-givs
-fschedule-insns -fschedule-insns2 -fstrength-reduce
-fstrict-aliasing -fthread-jumps -funroll-all-loops
! -funroll-loops -fprefetch
-O -O0 -O1 -O2 -O3 -Os
@end smallexample
*************** whose number of iterations can be determ
*** 2681,2686 ****
--- 2681,2690 ----
Perform the optimization of loop unrolling. This is done for all loops
and usually makes programs run more slowly. @samp{-funroll-all-loops}
implies @samp{-fstrength-reduce} as well as @samp{-frerun-cse-after-loop}.
+
+ @item -fprefetch
+ Output prefetch instructions.  This may improve the performance of loops accessing
+ large arrays.  Only some architectures support such instructions.
@item -fmove-all-movables
Forces all invariant computations in loops to be moved
*** ./md.texi.old Thu Apr 13 09:13:44 2000
--- md.texi Thu Apr 13 09:19:24 2000
*************** into consecutive memory locations. Oper
*** 1947,1952 ****
--- 1947,1963 ----
consecutive memory locations, operand 1 is the first register, and
operand 2 is a constant: the number of consecutive registers.
+ @cindex @samp{prefetch} instruction pattern
+ @item @samp{prefetch}
+ Operand 0 is the address to prefetch.  The pattern is expected to emit an instruction
+ that reads the address into the cache.  The instruction must not trap when the address
+ is invalid.  When this pattern is defined, the PREFETCH_BLOCK_SIZE and
+ SIMULTATENOUS_PREFETCHES target macros must be defined too.
+
+ @cindex @samp{prefetchw} instruction pattern
+ @item @samp{prefetchw}
+ Similar to @samp{prefetch}, but gives a hint to the CPU that the cache line will be modified.
+
@cindex @code{add@var{m}3} instruction pattern
@item @samp{add@var{m}3}
Add operand 2 and operand 1, storing the result in operand 0. All operands
*** ./tm.texi.old Thu Apr 13 09:19:43 2000
--- tm.texi Thu Apr 13 09:22:48 2000
*************** compatible with code compiled using the
*** 1124,1129 ****
--- 1124,1139 ----
If you are writing a new port, define @code{DEFAULT_VTABLE_THUNKS} to 1.
If you do not define this macro, the default for @samp{-fvtable-thunk} is 0.
+
+ @findex PREFETCH_BLOCK_SIZE
+ @item PREFETCH_BLOCK_SIZE
+ Size in bytes of the block brought into the cache by a single prefetch operation.
+ Define this macro if and only if the prefetch pattern exists.
+
+ @findex SIMULTATENOUS_PREFETCHES
+ @item SIMULTATENOUS_PREFETCHES
+ Number of prefetch operations the CPU can issue in parallel.
+ Define this macro if and only if the prefetch pattern exists.
@end table
@node Type Layout
*** config/i386/i386.c.old Thu Apr 13 08:15:06 2000
--- config/i386/i386.c Thu Apr 13 09:15:42 2000
*************** struct processor_costs i386_cost = { /*
*** 74,80 ****
2, /* cost of reg,reg fld/fst */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {8, 8, 8} /* cost of loading integer registers */
};
struct processor_costs i486_cost = { /* 486 specific costs */
--- 74,82 ----
2, /* cost of reg,reg fld/fst */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {8, 8, 8}, /* cost of loading integer registers */
! 0, /* size of prefetch block */
! 0, /* number of prefetches doable in parallel */
};
struct processor_costs i486_cost = { /* 486 specific costs */
*************** struct processor_costs i486_cost = { /*
*** 95,101 ****
2, /* cost of reg,reg fld/fst */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {8, 8, 8} /* cost of loading integer registers */
};
struct processor_costs pentium_cost = {
--- 97,105 ----
2, /* cost of reg,reg fld/fst */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {8, 8, 8}, /* cost of loading integer registers */
! 0, /* size of prefetch block */
! 0, /* number of prefetches doable in parallel */
};
struct processor_costs pentium_cost = {
*************** struct processor_costs pentium_cost = {
*** 116,122 ****
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {4, 4, 6} /* cost of loading integer registers */
};
struct processor_costs pentiumpro_cost = {
--- 120,128 ----
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {4, 4, 6}, /* cost of loading integer registers */
! 0, /* size of prefetch block */
! 0, /* number of prefetches doable in parallel */
};
struct processor_costs pentiumpro_cost = {
*************** struct processor_costs pentiumpro_cost =
*** 137,143 ****
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {4, 4, 6} /* cost of loading integer registers */
};
struct processor_costs k6_cost = {
--- 143,154 ----
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {4, 4, 6}, /* cost of loading integer registers */
! 32, /* size of prefetch block */
! 6 /* number of prefetches doable in
! parallel */
! /* ??? Guess, only most recent PPRO
+ family CPUs do non-NOP prefetch. */
};
struct processor_costs k6_cost = {
*************** struct processor_costs k6_cost = {
*** 158,164 ****
4, /* cost of reg,reg fld/fst */
{6, 6, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {4, 4, 4} /* cost of loading integer registers */
};
struct processor_costs athlon_cost = {
--- 169,178 ----
4, /* cost of reg,reg fld/fst */
{6, 6, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {4, 4, 4}, /* cost of loading integer registers */
! 32, /* size of prefetch block */
! 1 /* number of prefetches doable in
! parallel */
};
struct processor_costs athlon_cost = {
*************** struct processor_costs athlon_cost = {
*** 179,185 ****
4, /* cost of reg,reg fld/fst */
{6, 6, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {4, 4, 4} /* cost of loading integer registers */
};
struct processor_costs *ix86_cost = &pentium_cost;
--- 193,202 ----
4, /* cost of reg,reg fld/fst */
{6, 6, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
! {4, 4, 4}, /* cost of loading integer registers */
! 64, /* size of prefetch block */
! 6 /* number of prefetches doable in
! parallel */
};
struct processor_costs *ix86_cost = &pentium_cost;
*************** const int x86_sub_esp_4 = m_ATHLON | m_P
*** 222,227 ****
--- 239,246 ----
const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486;
const int x86_add_esp_4 = m_ATHLON | m_K6;
const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486;
+ const int x86_3dNOW = m_ATHLON | m_K6;
+ const int x86_SSE = m_ATHLON | m_PPRO;
#define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
*************** ix86_attr_length_default (insn)
*** 6100,6105 ****
--- 6144,6150 ----
case TYPE_IDIV:
case TYPE_PUSH:
case TYPE_POP:
+ case TYPE_PREFETCH:
for (i = recog_data.n_operands - 1; i >= 0; --i)
if (CONSTANT_P (recog_data.operand[i]))
{
*** config/i386/i386.h.old Thu Apr 13 08:15:03 2000
--- config/i386/i386.h Thu Apr 13 09:30:13 2000
*************** struct processor_costs {
*** 72,77 ****
--- 72,81 ----
in SFmode, DFmode and XFmode */
int fp_store[3]; /* cost of storing FP register
in SFmode, DFmode and XFmode */
+ int prefetch_block; /* Size of block read by single
+ prefetch operation. */
+ int simultatenous_prefetches; /* Number of prefetch operations
+ doable in parallel. */
};
extern struct processor_costs *ix86_cost;
*************** extern int target_flags;
*** 164,169 ****
--- 168,174 ----
#define TARGET_ATHLON (ix86_cpu == PROCESSOR_ATHLON)
#define CPUMASK (1 << ix86_cpu)
+ #define ARCHMASK (1 << ix86_arch)
extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
extern const int x86_use_bit_test, x86_cmove, x86_deep_branch;
extern const int x86_unroll_strlen, x86_use_q_reg, x86_use_any_reg;
*************** extern const int x86_promote_QImode, x86
*** 175,180 ****
--- 180,186 ----
extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
extern const int x86_promote_hi_regs;
extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
+ extern const int x86_3dNOW, x86_SSE;
#define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
#define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
*************** extern const int x86_add_esp_4, x86_add_
*** 183,189 ****
#define TARGET_UNROLL_STRLEN (x86_unroll_strlen & CPUMASK)
#define TARGET_USE_Q_REG (x86_use_q_reg & CPUMASK)
#define TARGET_USE_ANY_REG (x86_use_any_reg & CPUMASK)
! #define TARGET_CMOVE (x86_cmove & (1 << ix86_arch))
#define TARGET_DEEP_BRANCH_PREDICTION (x86_deep_branch & CPUMASK)
#define TARGET_DOUBLE_WITH_ADD (x86_double_with_add & CPUMASK)
#define TARGET_USE_SAHF (x86_use_sahf & CPUMASK)
--- 189,195 ----
#define TARGET_UNROLL_STRLEN (x86_unroll_strlen & CPUMASK)
#define TARGET_USE_Q_REG (x86_use_q_reg & CPUMASK)
#define TARGET_USE_ANY_REG (x86_use_any_reg & CPUMASK)
! #define TARGET_CMOVE (x86_cmove & ARCHMASK)
#define TARGET_DEEP_BRANCH_PREDICTION (x86_deep_branch & CPUMASK)
#define TARGET_DOUBLE_WITH_ADD (x86_double_with_add & CPUMASK)
#define TARGET_USE_SAHF (x86_use_sahf & CPUMASK)
*************** extern const int x86_add_esp_4, x86_add_
*** 206,211 ****
--- 212,219 ----
#define TARGET_ADD_ESP_8 (x86_add_esp_8 & CPUMASK)
#define TARGET_SUB_ESP_4 (x86_sub_esp_4 & CPUMASK)
#define TARGET_SUB_ESP_8 (x86_sub_esp_8 & CPUMASK)
+ #define TARGET_3DNOW (x86_3dNOW & ARCHMASK)
+ #define TARGET_SSE (x86_SSE & ARCHMASK)
#define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
*************** while (0)
*** 1756,1761 ****
--- 1764,1775 ----
in one reasonably fast instruction. */
#define MOVE_MAX 4
+ /* Size of block read by single prefetch operation. */
+ #define PREFETCH_BLOCK ix86_cost->prefetch_block
+
+ /* Number of prefetch operations doable in parallel. */
+ #define SIMULTATENOUS_PREFETCHES ix86_cost->simultatenous_prefetches
+
/* If a memory-to-memory move would take MOVE_RATIO or more simple
move-instruction pairs, we will do a movstr or libcall instead.
Increasing the value will always make code faster, but eventually
*** config/i386/i386.md.old Thu Apr 13 08:14:56 2000
--- config/i386/i386.md Thu Apr 13 09:24:10 2000
***************
*** 71,76 ****
--- 71,78 ----
;; 9 This is an `fnstsw' operation.
;; 10 This is a `sahf' operation.
;; 11 This is a `fstcw' operation
+ ;; 12 This is a prefetch operation
+ ;; 13 This is a prefetchw operation
;;
;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
;; from i386.c.
***************
*** 84,90 ****
;; A basic instruction type. Refinements due to arguments to be
;; provided in other attributes.
(define_attr "type"
! "other,multi,alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,fmov,fop,fop1,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,str,cld"
(const_string "other"))
;; The (bounding maximum) length of an instruction in bytes.
--- 86,92 ----
;; A basic instruction type. Refinements due to arguments to be
;; provided in other attributes.
(define_attr "type"
! "other,multi,alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,fmov,fop,fop1,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,str,cld,prefetch"
(const_string "other"))
;; The (bounding maximum) length of an instruction in bytes.
***************
*** 104,110 ****
;; Supporting: bytes in the opcode+modrm.
(define_attr "length_opcode" ""
! (cond [(eq_attr "type" "imovx,setcc,icmov")
(const_int 3)
(eq_attr "type" "str,cld")
(const_int 1)
--- 106,112 ----
;; Supporting: bytes in the opcode+modrm.
(define_attr "length_opcode" ""
! (cond [(eq_attr "type" "imovx,setcc,icmov,prefetch")
(const_int 3)
(eq_attr "type" "str,cld")
(const_int 1)
***************
*** 147,152 ****
--- 149,156 ----
(match_operand 1 "memory_operand" ""))
(const_string "load")
(const_string "none"))
+ (eq_attr "type" "prefetch")
+ (const_string "load")
(eq_attr "type" "ibr")
(if_then_else (match_operand 0 "memory_operand" "")
(const_string "load")
***************
*** 637,643 ****
(define_function_unit "k6_alu" 2 0
(and (eq_attr "cpu" "k6")
! (eq_attr "type" "ishift,alu1,negnot,alu,icmp,imovx,incdec,setcc,lea"))
1 1)
(define_function_unit "k6_alu" 2 0
--- 641,647 ----
(define_function_unit "k6_alu" 2 0
(and (eq_attr "cpu" "k6")
! (eq_attr "type" "ishift,alu1,negnot,alu,icmp,imovx,incdec,setcc,lea,prefetch"))
1 1)
(define_function_unit "k6_alu" 2 0
***************
*** 766,772 ****
(define_function_unit "athlon_ieu" 3 0
(and (eq_attr "cpu" "athlon")
! (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,str,cld"))
1 1)
(define_function_unit "athlon_ieu" 3 0
--- 770,776 ----
(define_function_unit "athlon_ieu" 3 0
(and (eq_attr "cpu" "athlon")
! (eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,str,cld,prefetch"))
1 1)
(define_function_unit "athlon_ieu" 3 0
***************
*** 9191,9196 ****
--- 9195,9225 ----
fcmov%F1\\t{%2, %0|%0, %2}
fcmov%f1\\t{%3, %0|%0, %3}"
[(set_attr "type" "fcmov")])
+
+ ;; Prefetch patterns
+
+ (define_expand "prefetch"
+ [(unspec [(match_operand:SI 0 "address_operand" "")] 12)]
+ "TARGET_3DNOW || TARGET_SSE"
+ "")
+
+ (define_insn ""
+ [(unspec [(match_operand:SI 0 "address_operand" "p")] 12)]
+ "TARGET_3DNOW"
+ "prefetch\\t%a0"
+ [(set_attr "type" "prefetch")])
+
+ (define_insn ""
+ [(unspec [(match_operand:SI 0 "address_operand" "p")] 12)]
+ "TARGET_SSE"
+ "prefetchnta\\t%a0"
+ [(set_attr "type" "prefetch")])
+
+ (define_insn "prefetchw"
+ [(unspec [(match_operand:SI 0 "address_operand" "p")] 13)]
+ "TARGET_3DNOW"
+ "prefetchw\\t%a0"
+ [(set_attr "type" "prefetch")])
;; Misc patterns (?)