Index: Makefile.in =================================================================== RCS file: /cvsroot/gcc/gcc/gcc/Makefile.in,v retrieving revision 1.1338 diff -c -3 -p -r1.1338 Makefile.in *** Makefile.in 5 Aug 2004 21:33:19 -0000 1.1338 --- Makefile.in 22 Aug 2004 15:06:30 -0000 *************** loop-init.o : loop-init.c $(CONFIG_H) $( *** 1995,2001 **** coretypes.h $(TM_H) loop-unswitch.o : loop-unswitch.c $(CONFIG_H) $(SYSTEM_H) $(RTL_H) $(TM_H) \ $(BASIC_BLOCK_H) hard-reg-set.h $(CFGLOOP_H) $(CFGLAYOUT_H) $(PARAMS_H) \ ! output.h $(EXPR_H) coretypes.h $(TM_H) loop-unroll.o: loop-unroll.c $(CONFIG_H) $(SYSTEM_H) $(RTL_H) $(TM_H) \ $(BASIC_BLOCK_H) hard-reg-set.h $(CFGLOOP_H) $(CFGLAYOUT_H) $(PARAMS_H) \ output.h $(EXPR_H) coretypes.h $(TM_H) --- 1995,2001 ---- coretypes.h $(TM_H) loop-unswitch.o : loop-unswitch.c $(CONFIG_H) $(SYSTEM_H) $(RTL_H) $(TM_H) \ $(BASIC_BLOCK_H) hard-reg-set.h $(CFGLOOP_H) $(CFGLAYOUT_H) $(PARAMS_H) \ ! output.h $(EXPR_H) coretypes.h $(TM_H) $(HASHTAB_H) $(RECOG_H) loop-unroll.o: loop-unroll.c $(CONFIG_H) $(SYSTEM_H) $(RTL_H) $(TM_H) \ $(BASIC_BLOCK_H) hard-reg-set.h $(CFGLOOP_H) $(CFGLAYOUT_H) $(PARAMS_H) \ output.h $(EXPR_H) coretypes.h $(TM_H) Index: basic-block.h =================================================================== RCS file: /cvsroot/gcc/gcc/gcc/basic-block.h,v retrieving revision 1.205 diff -c -3 -p -r1.205 basic-block.h *** basic-block.h 4 Aug 2004 21:37:03 -0000 1.205 --- basic-block.h 22 Aug 2004 15:06:31 -0000 *************** typedef struct reorder_block_def *** 297,302 **** --- 297,303 ---- /* Used by loop copying. */ basic_block copy; int duplicated; + int copy_number; /* These fields are used by bb-reorder pass. */ int visited; Index: cfgloop.h =================================================================== RCS file: /cvsroot/gcc/gcc/gcc/cfgloop.h,v retrieving revision 1.23 diff -c -3 -p -r1.23 cfgloop.h *** cfgloop.h 12 Jul 2004 19:31:16 -0000 1.23 --- cfgloop.h 22 Aug 2004 15:06:38 -0000 *************** struct niter_desc *** 411,416 **** --- 411,417 ---- extern void iv_analysis_loop_init (struct loop *); extern rtx iv_get_reaching_def (rtx, rtx); extern bool iv_analyze (rtx, rtx, struct rtx_iv *); + extern bool biv_p (rtx, rtx); extern rtx get_iv_value (struct rtx_iv *, rtx); extern void find_simple_exit (struct loop *, struct niter_desc *); extern void iv_number_of_iterations (struct loop *, rtx, rtx, Index: cfgloopmanip.c =================================================================== RCS file: /cvsroot/gcc/gcc/gcc/cfgloopmanip.c,v retrieving revision 1.27 diff -c -3 -p -r1.27 cfgloopmanip.c *** cfgloopmanip.c 9 Jul 2004 03:29:26 -0000 1.27 --- cfgloopmanip.c 22 Aug 2004 15:06:38 -0000 *************** duplicate_loop_to_header_edge (struct lo *** 979,984 **** --- 979,987 ---- /* Copy bbs. */ copy_bbs (bbs, n, new_bbs, spec_edges, 2, new_spec_edges, loop); + for (i = 0; i < n; i++) + new_bbs[i]->rbi->copy_number = j + 1; + /* Note whether the blocks and edges belong to an irreducible loop. */ if (add_irreducible_flag) { *************** duplicate_loop_to_header_edge (struct lo *** 1057,1062 **** --- 1060,1066 ---- int n_dom_bbs,j; bb = bbs[i]; + bb->rbi->copy_number = 0; n_dom_bbs = get_dominated_by (CDI_DOMINATORS, bb, &dom_bbs); for (j = 0; j < n_dom_bbs; j++) { Index: common.opt =================================================================== RCS file: /cvsroot/gcc/gcc/gcc/common.opt,v retrieving revision 1.42 diff -c -3 -p -r1.42 common.opt *** common.opt 25 Jul 2004 22:52:11 -0000 1.42 --- common.opt 22 Aug 2004 15:06:41 -0000 *************** fsingle-precision-constant *** 744,749 **** --- 744,757 ---- Common Report Var(flag_single_precision_constant) Convert floating point constants to single precision constants + fsplit-ivs-in-unroller + Common Report Var(flag_split_ivs_in_unroller) Init(1) + Split lifetimes of induction variables when loops are unrolled. + + fvariable-expansion-in-unroller + Common Report Var(flag_variable_expansion_in_unroller) + Apply variable expansion when loops are unrolled. + ; Emit code to probe the stack, to help detect stack overflow; also ; may cause large objects to be allocated dynamically. fstack-check Index: loop-iv.c =================================================================== RCS file: /cvsroot/gcc/gcc/gcc/loop-iv.c,v retrieving revision 2.15 diff -c -3 -p -r2.15 loop-iv.c *** loop-iv.c 6 Aug 2004 09:40:39 -0000 2.15 --- loop-iv.c 22 Aug 2004 15:07:04 -0000 *************** iv_analyze (rtx insn, rtx def, struct rt *** 1183,1188 **** --- 1183,1206 ---- return iv->base != NULL_RTX; } + /* Checks whether definition of register REG in INSN a basic induction + variable. */ + + bool + biv_p (rtx insn, rtx reg) + { + struct rtx_iv iv; + + if (!REG_P (reg)) + return false; + + if (last_def[REGNO (reg)] != insn) + return false; + + return iv_analyze_biv (reg, &iv); + } + + /* Calculates value of IV at ITERATION-th iteration. */ rtx *************** free_simple_loop_desc (struct loop *loop *** 2624,2626 **** --- 2642,2647 ---- free (desc); loop->aux = NULL; } + + + Index: loop-unroll.c =================================================================== RCS file: /cvsroot/gcc/gcc/gcc/loop-unroll.c,v retrieving revision 1.16 diff -c -3 -p -r1.16 loop-unroll.c *** loop-unroll.c 24 Feb 2004 23:39:55 -0000 1.16 --- loop-unroll.c 22 Aug 2004 15:07:04 -0000 *************** GCC is free software; you can redistribu *** 7,13 **** the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. - GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --- 7,12 ---- *************** Software Foundation, 59 Temple Place - S *** 30,35 **** --- 29,39 ---- #include "params.h" #include "output.h" #include "expr.h" + #include "hashtab.h" + #include "recog.h" + #include "varray.h" + #include "toplev.h" + /* This pass performs loop unrolling and peeling. We only perform these optimizations on innermost loops (with single exception) because *************** Software Foundation, 59 Temple Place - S *** 65,70 **** --- 69,110 ---- how many times we should unroll the loop; the experiments I have made showed that this choice may affect performance in order of several %. */ + /* Information about induction variables to split. */ + + struct iv_to_split + { + rtx insn; /* The insn in that the induction variable occurs. */ + rtx base_var; /* The variable on that the values in the further + iterations are based. */ + rtx step; /* Step of the induction variable. */ + unsigned n_loc; + unsigned loc[3]; /* Location where the definition of the induction + variable occurs in the insn. For example if + N_LOC is 2, the expression is located at + XEXP (XEXP (single_set, loc[0]), loc[1]). */ + }; + + /* Information about variable to expand. */ + struct var_to_expand + { + rtx insn; /* The insn in that the variable expansion occurs. */ + rtx reg; /* The accumolator var which is expanded. */ + varray_type var_expansions; /* The copies of the var was expanded. */ + enum rtx_code op; /* The type of the accumolation - addition, subsraction + or multiplication. */ + }; + + + struct opt_info + { + htab_t insns_to_split; /* A hashtable of insns to split. */ + htab_t insns_with_var_to_expand; /* A hashtable of insns to apply variable + expansion. */ + unsigned first_new_block; /* The first basic block that was + duplicated. */ + basic_block *loop_exit; /* The loop exit basic block. */ + basic_block *loop_preheader; /* The loop preheader basic block. */ + }; static void decide_unrolling_and_peeling (struct loops *, int); static void peel_loops_completely (struct loops *, int); *************** static void peel_loop_completely (struct *** 79,84 **** --- 119,136 ---- static void unroll_loop_stupid (struct loops *, struct loop *); static void unroll_loop_constant_iterations (struct loops *, struct loop *); static void unroll_loop_runtime_iterations (struct loops *, struct loop *); + static struct opt_info *analyze_insns_in_loop (struct loop *); + static void opt_info_start_duplication (struct opt_info *); + static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool); + static void free_opt_info (struct opt_info *); + static struct iv_to_split *analyze_memory_ref_containing_iv (rtx); + static struct var_to_expand *analyze_insn_to_exapnd_var (struct loop*, rtx); + static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx); + static struct iv_to_split *analyze_iv_to_split_insn (rtx); + static void expand_var_during_unrolling (struct var_to_expand *, rtx); + static int insert_var_expansion_initialization (void **, void *); + static int combine_var_copies_in_loop_exit (void **, void *); + static int release_var_copies (void **, void *); /* Unroll and/or peel (depending on FLAGS) LOOPS. */ void *************** peel_loop_completely (struct loops *loop *** 428,433 **** --- 480,486 ---- unsigned n_remove_edges, i; edge *remove_edges, ei; struct niter_desc *desc = get_simple_loop_desc (loop); + struct opt_info *opt_info = NULL; npeel = desc->niter; *************** peel_loop_completely (struct loops *loop *** 442,454 **** remove_edges = xcalloc (npeel, sizeof (edge)); n_remove_edges = 0; if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), ! loops, npeel, wont_exit, desc->out_edge, remove_edges, &n_remove_edges, DLTHE_FLAG_UPDATE_FREQ)) abort (); free (wont_exit); /* Remove the exit edges. */ for (i = 0; i < n_remove_edges; i++) --- 495,518 ---- remove_edges = xcalloc (npeel, sizeof (edge)); n_remove_edges = 0; + if (flag_split_ivs_in_unroller) + opt_info = analyze_insns_in_loop (loop); + + opt_info_start_duplication (opt_info); + if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), ! loops, npeel, wont_exit, desc->out_edge, remove_edges, &n_remove_edges, DLTHE_FLAG_UPDATE_FREQ)) abort (); free (wont_exit); + + if (opt_info) + { + apply_opt_in_copies (opt_info, npeel, false, true); + free_opt_info (opt_info); + } /* Remove the exit edges. */ for (i = 0; i < n_remove_edges; i++) *************** unroll_loop_constant_iterations (struct *** 597,603 **** unsigned max_unroll = loop->lpt_decision.times; struct niter_desc *desc = get_simple_loop_desc (loop); bool exit_at_end = loop_exit_at_end_p (loop); ! niter = desc->niter; if (niter <= max_unroll + 1) --- 661,669 ---- unsigned max_unroll = loop->lpt_decision.times; struct niter_desc *desc = get_simple_loop_desc (loop); bool exit_at_end = loop_exit_at_end_p (loop); ! struct opt_info *opt_info = NULL; ! ! niter = desc->niter; if (niter <= max_unroll + 1) *************** unroll_loop_constant_iterations (struct *** 610,616 **** remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge)); n_remove_edges = 0; ! if (!exit_at_end) { /* The exit is not at the end of the loop; leave exit test --- 676,685 ---- remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge)); n_remove_edges = 0; ! if (flag_split_ivs_in_unroller ! || flag_variable_expansion_in_unroller) ! opt_info = analyze_insns_in_loop (loop); ! if (!exit_at_end) { /* The exit is not at the end of the loop; leave exit test *************** unroll_loop_constant_iterations (struct *** 627,638 **** if (exit_mod) { ! if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), loops, exit_mod, wont_exit, desc->out_edge, remove_edges, &n_remove_edges, DLTHE_FLAG_UPDATE_FREQ)) abort (); desc->noloop_assumptions = NULL_RTX; desc->niter -= exit_mod; --- 696,712 ---- if (exit_mod) { ! opt_info_start_duplication (opt_info); ! ! if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), loops, exit_mod, wont_exit, desc->out_edge, remove_edges, &n_remove_edges, DLTHE_FLAG_UPDATE_FREQ)) abort (); + if (opt_info && exit_mod > 1) + apply_opt_in_copies (opt_info, exit_mod, false, false); + desc->noloop_assumptions = NULL_RTX; desc->niter -= exit_mod; *************** unroll_loop_constant_iterations (struct *** 658,669 **** --- 732,749 ---- RESET_BIT (wont_exit, 0); if (desc->noloop_assumptions) RESET_BIT (wont_exit, 1); + + opt_info_start_duplication (opt_info); if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), loops, exit_mod + 1, wont_exit, desc->out_edge, remove_edges, &n_remove_edges, DLTHE_FLAG_UPDATE_FREQ)) abort (); + + if (opt_info && exit_mod > 0) + apply_opt_in_copies (opt_info, exit_mod + 1, false, false); + desc->niter -= exit_mod + 1; desc->niter_max -= exit_mod + 1; *************** unroll_loop_constant_iterations (struct *** 677,688 **** --- 757,778 ---- } /* Now unroll the loop. */ + + opt_info_start_duplication (opt_info); + if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), loops, max_unroll, wont_exit, desc->out_edge, remove_edges, &n_remove_edges, DLTHE_FLAG_UPDATE_FREQ)) abort (); + if (opt_info) + { + apply_opt_in_copies (opt_info, max_unroll, true, true); + free_opt_info (opt_info); + } + + free (wont_exit); if (exit_at_end) *************** unroll_loop_runtime_iterations (struct l *** 842,848 **** unsigned max_unroll = loop->lpt_decision.times; struct niter_desc *desc = get_simple_loop_desc (loop); bool exit_at_end = loop_exit_at_end_p (loop); ! /* Remember blocks whose dominators will have to be updated. */ dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block)); n_dom_bbs = 0; --- 932,943 ---- unsigned max_unroll = loop->lpt_decision.times; struct niter_desc *desc = get_simple_loop_desc (loop); bool exit_at_end = loop_exit_at_end_p (loop); ! struct opt_info *opt_info = NULL; ! ! if (flag_split_ivs_in_unroller ! || flag_variable_expansion_in_unroller) ! opt_info = analyze_insns_in_loop (loop); ! /* Remember blocks whose dominators will have to be updated. */ dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block)); n_dom_bbs = 0; *************** unroll_loop_runtime_iterations (struct l *** 978,990 **** sbitmap_ones (wont_exit); RESET_BIT (wont_exit, may_exit_copy); ! if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), loops, max_unroll, wont_exit, desc->out_edge, remove_edges, &n_remove_edges, DLTHE_FLAG_UPDATE_FREQ)) abort (); ! free (wont_exit); if (exit_at_end) --- 1073,1093 ---- sbitmap_ones (wont_exit); RESET_BIT (wont_exit, may_exit_copy); ! opt_info_start_duplication (opt_info); ! if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), loops, max_unroll, wont_exit, desc->out_edge, remove_edges, &n_remove_edges, DLTHE_FLAG_UPDATE_FREQ)) abort (); ! ! if (opt_info) ! { ! apply_opt_in_copies (opt_info, max_unroll, true, true); ! free_opt_info (opt_info); ! } ! ! free (wont_exit); if (exit_at_end) *************** peel_loop_simple (struct loops *loops, s *** 1138,1154 **** sbitmap wont_exit; unsigned npeel = loop->lpt_decision.times; struct niter_desc *desc = get_simple_loop_desc (loop); ! wont_exit = sbitmap_alloc (npeel + 1); sbitmap_zero (wont_exit); ! if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), loops, npeel, wont_exit, NULL, NULL, NULL, DLTHE_FLAG_UPDATE_FREQ)) abort (); free (wont_exit); ! if (desc->simple_p) { if (desc->const_iter) --- 1241,1269 ---- sbitmap wont_exit; unsigned npeel = loop->lpt_decision.times; struct niter_desc *desc = get_simple_loop_desc (loop); ! struct opt_info *opt_info = NULL; ! ! if (flag_split_ivs_in_unroller && npeel > 1) ! opt_info = analyze_insns_in_loop (loop); ! wont_exit = sbitmap_alloc (npeel + 1); sbitmap_zero (wont_exit); ! ! opt_info_start_duplication (opt_info); ! if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), loops, npeel, wont_exit, NULL, NULL, NULL, DLTHE_FLAG_UPDATE_FREQ)) abort (); free (wont_exit); ! ! if (opt_info) ! { ! apply_opt_in_copies (opt_info, npeel, false, false); ! free_opt_info (opt_info); ! } ! if (desc->simple_p) { if (desc->const_iter) *************** unroll_loop_stupid (struct loops *loops, *** 1271,1287 **** sbitmap wont_exit; unsigned nunroll = loop->lpt_decision.times; struct niter_desc *desc = get_simple_loop_desc (loop); ! wont_exit = sbitmap_alloc (nunroll + 1); sbitmap_zero (wont_exit); ! if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), loops, nunroll, wont_exit, NULL, NULL, NULL, DLTHE_FLAG_UPDATE_FREQ)) abort (); ! free (wont_exit); ! if (desc->simple_p) { /* We indeed may get here provided that there are nontrivial assumptions --- 1386,1415 ---- sbitmap wont_exit; unsigned nunroll = loop->lpt_decision.times; struct niter_desc *desc = get_simple_loop_desc (loop); ! struct opt_info *opt_info = NULL; ! ! if (flag_split_ivs_in_unroller ! || flag_variable_expansion_in_unroller) ! opt_info = analyze_insns_in_loop (loop); ! ! wont_exit = sbitmap_alloc (nunroll + 1); sbitmap_zero (wont_exit); ! opt_info_start_duplication (opt_info); ! if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), loops, nunroll, wont_exit, NULL, NULL, NULL, DLTHE_FLAG_UPDATE_FREQ)) abort (); ! ! if (opt_info) ! { ! apply_opt_in_copies (opt_info, nunroll, true, true); ! free_opt_info (opt_info); ! } ! free (wont_exit); ! if (desc->simple_p) { /* We indeed may get here provided that there are nontrivial assumptions *************** unroll_loop_stupid (struct loops *loops, *** 1297,1299 **** --- 1425,2233 ---- fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n", nunroll, num_loop_insns (loop)); } + + + + + /* A hash function for information about insns to split. */ + + static hashval_t + si_info_hash (const void *ivts) + { + return htab_hash_pointer (((struct iv_to_split *) ivts)->insn); + } + + /* An equality functions for information about insns to split. */ + + static int + si_info_eq (const void *ivts1, const void *ivts2) + { + const struct iv_to_split *i1 = ivts1; + const struct iv_to_split *i2 = ivts2; + + return i1->insn == i2->insn; + } + + + /* A hash function for information about insns which contains + variable to expand. */ + + static hashval_t + ve_info_hash (const void *ves) + { + return htab_hash_pointer (((struct var_to_expand *) ves)->insn); + } + + /* An equality functions for information about insns which contains + variable to expand. */ + + static int + ve_info_eq (const void *ivts1, const void *ivts2) + { + const struct var_to_expand *i1 = ivts1; + const struct var_to_expand *i2 = ivts2; + + return i1->insn == i2->insn; + } + + + /* Determine whether INSN contains memory reference + which uses an induction variable. If there is a memory + reference a[i] in the loop body it can be transformed + into the following in the unrolled loop, where base = &a+i: + + load 4(base) + load 8(base) + load 12(base) + . + . + */ + static struct iv_to_split * + analyze_memory_ref_containing_iv (rtx insn) + { + struct iv_to_split *ivts; + rtx lop, rop, expr; + int n_loc, loc_0, loc_1, loc_2 = -1; + rtx set, dest, src; + rtx insn1 = NULL_RTX, insn2 = NULL_RTX; + struct rtx_iv iv1, iv2, iv; + + set = single_set (insn); + if (!set) + return NULL; + + dest = SET_DEST (set); + src = SET_SRC (set); + + /* reg = mem [ invariant + iv ] */ + if ((REG_P (dest) + || (GET_CODE (dest) == SUBREG + && REG_P (SUBREG_REG (dest)))) + && MEM_P (src)) + { + n_loc = 2; + loc_0 = 1; + loc_1 = 0; + expr = XEXP (src, 0); + } + /* mem [ invariant + iv ] = reg */ + else if ((REG_P (src) + || (GET_CODE (src) == SUBREG + && REG_P (SUBREG_REG (src)))) + && MEM_P (dest)) + { + n_loc = 2; + loc_0 = 0; + loc_1 = 0; + expr = XEXP (dest, 0); + } + /* reg = zero_extend (mem [ invariant + iv ]) */ + else if (GET_CODE (dest) == REG + && GET_CODE (src) == ZERO_EXTEND + && MEM_P (XEXP (src, 0))) + { + n_loc = 3; + loc_0 = 1; + loc_1 = 0; + loc_2 = 0; + expr = XEXP (XEXP (src, 0), 0); + } + else + return NULL; + + if (GET_CODE (expr) != PLUS) + return NULL; + + lop = XEXP (expr, 0); + rop = XEXP (expr, 1); + + if (!REG_P (lop) || !REG_P (rop)) + return NULL; + + insn1 = iv_get_reaching_def (insn, lop); + insn2 = iv_get_reaching_def (insn, rop); + + /* Memory reference is of the form: + invariant + induction variable + or induction variable + invariant. */ + if (iv_analyze (insn1, lop, &iv1) + && !iv1.first_special + && iv_analyze (insn2, rop, &iv2) + && !iv2.first_special + && ((iv1.step == const0_rtx + && iv2.mode == iv2.extend_mode) + || (iv2.step == const0_rtx + && iv1.mode == iv1.extend_mode))) + { + if (iv1.step == const0_rtx) + iv = iv2; + else + iv = iv1; + if (iv.step == NULL_RTX + || GET_CODE (iv.step) != CONST_INT) + return NULL; + if (INTVAL (iv.step) < 0) + return NULL; + /* Record the insn to split. */ + ivts = xmalloc (sizeof (struct iv_to_split)); + ivts->insn = insn; + ivts->base_var = NULL_RTX; + ivts->step = iv.step; + ivts->n_loc = n_loc; + ivts->loc[0] = loc_0; + ivts->loc[1] = loc_1; + ivts->loc[2] = loc_2; + return ivts; + } + return NULL; + } + + /* Returns true if reg is referenced in one insn in loops. */ + + bool + referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg) + { + basic_block *body, bb; + unsigned i; + int count_ref = 0; + rtx insn; + + body = get_loop_body (loop); + for (i = 0; i < loop->num_nodes; i++) + { + bb = body[i]; + + FOR_BB_INSNS (bb, insn) + { + if (rtx_referenced_p (reg, insn)) + { + count_ref++; + } + } + } + return (count_ref == 1); + } + + /* Determine whether INSN contains an accumolator + which can be expanded into separate copies, + one for each copy of the loop body. + + for (i = 0 ; i < n; i++) + sum += a[i]; + + ==> + + sum += a[i] + .... + i = i+1; + sum1 += a[i] + .... + i = i+1 + sum2 += a[i]; + .... + */ + + static struct var_to_expand * + analyze_insn_to_exapnd_var (struct loop *loop, rtx insn) + { + rtx set, dest, src, op1; + struct var_to_expand *ves; + + set = single_set (insn); + if (!set) + return NULL; + + dest = SET_DEST (set); + src = SET_SRC (set); + + if (GET_CODE (src) != PLUS + && GET_CODE (src) != MINUS + && GET_CODE (src) != MULT) + return NULL; + + if (!XEXP (src, 0)) + return NULL; + + op1 = XEXP (src, 0); + + if (!REG_P (dest) + && !(GET_CODE (dest) == SUBREG + && REG_P (SUBREG_REG (dest)))) + return NULL; + + if (!rtx_equal_p (dest, op1)) + return NULL; + + if (!referenced_in_one_insn_in_loop_p (loop, dest)) + return NULL; + + if (rtx_referenced_p (dest, XEXP (src, 1))) + return NULL; + + /* Record the insn to split. */ + ves = xmalloc (sizeof (struct var_to_expand)); + ves->insn = insn; + VARRAY_RTX_INIT (ves->var_expansions, 1, "var_expansions"); + ves->reg = copy_rtx (dest); + ves->op = GET_CODE (src); + return ves; + } + + /* Determine whether there is an induction variable in INSN that + we would like to split during unrolling. + + I.e. replace + + i = i + 1; + ... + i = i + 1; + ... + i = i + 1; + ... + + type chains by + + i0 = i + 1 + ... + i = i0 + 1 + ... + i = i0 + 2 + ... */ + + + static struct iv_to_split * + analyze_iv_to_split_insn (rtx insn) + { + rtx set, dest; + struct rtx_iv iv; + struct iv_to_split *ivts; + + set = single_set (insn); + if (!set) + return NULL; + + dest = SET_DEST (set); + if (REG_P (dest) + && biv_p (insn, dest) + && !iv_analyze (insn, dest, &iv)) + abort (); + + /* Split basic induction variables if we can. */ + if (REG_P (dest) + && biv_p (insn, dest) + && iv.step != const0_rtx + && iv.mode == iv.extend_mode) + { + /* Record the insn to split. */ + ivts = xmalloc (sizeof (struct iv_to_split)); + ivts->insn = insn; + ivts->base_var = NULL_RTX; + ivts->step = iv.step; + ivts->n_loc = 1; + ivts->loc[0] = 1; + return ivts; + } + /* Check if we can split memory references. */ + ivts = analyze_memory_ref_containing_iv (insn); + return ivts; + } + + /* Determines which of the insn in the loop can be optimized. */ + static struct opt_info * + analyze_insns_in_loop (struct loop *loop) + { + basic_block *body, bb; + unsigned i, n_edges = 0; + struct opt_info *opt_info = xcalloc (1, sizeof (struct opt_info)); + rtx insn; + struct iv_to_split *ivts = NULL; + struct var_to_expand *ves = NULL; + PTR *slot1; + PTR *slot2; + edge *edges = get_loop_exit_edges (loop, &n_edges); + basic_block preheader; + + if (flag_split_ivs_in_unroller) + opt_info->insns_to_split = htab_create (5 * loop->num_nodes, + si_info_hash, si_info_eq, free); + if (flag_variable_expansion_in_unroller + && n_edges == 1 + && fast_math_flags_set_p ()) + opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes, + ve_info_hash, ve_info_eq, free); + iv_analysis_loop_init (loop); + + body = get_loop_body (loop); + + /* Record the loop exit bb and loop preheader before the unrolling. */ + opt_info->loop_exit = &edges[0]->dest; + if (!loop_preheader_edge (loop)->src) + { + preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX); + opt_info->loop_preheader = &preheader; + } + else + opt_info->loop_preheader = &loop_preheader_edge (loop)->src; + + for (i = 0; i < loop->num_nodes; i++) + { + bb = body[i]; + if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) + continue; + + FOR_BB_INSNS (bb, insn) + { + if (!INSN_P (insn)) + continue; + + if (opt_info->insns_to_split) + ivts = analyze_iv_to_split_insn (insn); + + if (ivts) + { + slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT); + *slot1 = ivts; + continue; + } + + if (opt_info->insns_with_var_to_expand) + ves = analyze_insn_to_exapnd_var (loop, insn); + + if (ves) + { + slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT); + *slot2 = ves; + } + } + } + + free (body); + return opt_info; + } + + /* Called just before loop duplication. Records start of duplicated area + to OPT_INFO. */ + + static void + opt_info_start_duplication (struct opt_info *opt_info) + { + if (opt_info) + opt_info->first_new_block = last_basic_block; + } + + /* Determine the number of iterations between initialization of the base + variable and the current copy (N_COPY). N_COPIES is the total number + of newly created copies. UNROLLING is true if we are unrolling + (not peeling) the loop. */ + + static unsigned + determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling) + { + if (unrolling) + { + /* If we are unrolling, initialization is done in the original loop + body (number 0). */ + return n_copy; + } + else + { + /* If we are peeling, the copy in that the initialization occurs has + number 1. The original loop (number 0) is the last. */ + if (n_copy) + return n_copy - 1; + else + return n_copies; + } + } + + /* Locates expression corresponding to the location recorded in IVTS + in EXPR. */ + + static rtx * + get_ivts_expr (rtx expr, struct iv_to_split *ivts) + { + unsigned i; + rtx *ret = &expr; + + for (i = 0; i < ivts->n_loc; i++) + ret = &XEXP (*ret, ivts->loc[i]); + + return ret; + } + + /* Allocate basic variable for the induction variable chain. Callback for + htab_traverse. */ + + static int + allocate_basic_variable (void **slot, void *data ATTRIBUTE_UNUSED) + { + struct iv_to_split *ivts = *slot; + rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts); + + ivts->base_var = gen_reg_rtx (GET_MODE (expr)); + + return 1; + } + + /* Insert initialization of basic variable of IVTS before INSN, taking + the initial value from it. */ + + static void + insert_base_initialization (struct iv_to_split *ivts, rtx insn) + { + rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts)); + rtx seq; + + start_sequence (); + expr = force_operand (expr, ivts->base_var); + if (expr != ivts->base_var) + emit_move_insn (ivts->base_var, expr); + seq = get_insns (); + end_sequence (); + + emit_insn_before (seq, insn); + } + + /* Replace the use of induction variable described in IVTS in INSN + by base variable + DELTA * step. */ + + static void + split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta) + { + rtx expr, *loc, seq, incr, var; + enum machine_mode mode = GET_MODE (ivts->base_var); + rtx src, dest, set; + + if (!delta) + expr = ivts->base_var; + else + { + incr = simplify_gen_binary (MULT, mode, + ivts->step, gen_int_mode (delta, mode)); + expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var), + ivts->base_var, incr); + } + + loc = get_ivts_expr (single_set (insn), ivts); + + if (validate_change (insn, loc, expr, 0)) + return; + + start_sequence (); + if (REG_P (expr)) + var = expr; + else + { + var = gen_reg_rtx (mode); + + expr = force_operand (expr, var); + if (expr != var) + emit_move_insn (var, expr); + } + seq = get_insns (); + end_sequence (); + + emit_insn_before (seq, insn); + + if (validate_change (insn, loc, var, 0)) + return; + + /* The last chance. Try recreating the assignment in insn completely from + scratch. */ + set = single_set (insn); + if (!set) + abort (); + + start_sequence (); + *loc = var; + src = copy_rtx (SET_SRC (set)); + dest = copy_rtx (SET_DEST (set)); + src = force_operand (src, dest); + if (src != dest) + emit_move_insn (dest, src); + seq = get_insns (); + end_sequence (); + + emit_insn_before (seq, insn); + delete_insn (insn); + } + + + /* Replace the use of var recorded in VE with a new + register. */ + + static void + expand_var_during_unrolling (struct var_to_expand *ve, rtx insn) + { + rtx new_reg, set; + + set = single_set (insn); + if (!set) + abort (); + + new_reg = gen_reg_rtx (GET_MODE (ve->reg)); + VARRAY_PUSH_RTX (ve->var_expansions, new_reg); + + if (!validate_change (insn, &SET_DEST (set), new_reg, 0)) + return; + + if (!XEXP (SET_SRC (set), 0)) + abort (); + + if (validate_change (insn, &XEXP (SET_SRC (set), 0), new_reg, 0)) + return; + else + { + /* Restore the original state. */ + if (!validate_change (insn, &SET_DEST (set), ve->reg, 0)) + abort (); + } + return; + } + + + /* Initialize the variable expansions in the loop preheader. */ + + static int + insert_var_expansion_initialization (void **slot, void *place_p) + { + struct var_to_expand *ve = *slot; + basic_block *place = (basic_block *)place_p; + rtx seq, var, zero_init; + unsigned i; + + start_sequence (); + if (ve->op == PLUS || ve->op == MINUS) + { + for (i = 0; i < VARRAY_ACTIVE_SIZE (ve->var_expansions); i++) + { + var = VARRAY_RTX (ve->var_expansions, i); + zero_init = CONST0_RTX (GET_MODE (var)); + emit_move_insn (var, zero_init); + } + } + else if (ve->op == MULT) + { + for (i = 0; i < VARRAY_ACTIVE_SIZE (ve->var_expansions); i++) + { + var = VARRAY_RTX (ve->var_expansions, i); + zero_init = CONST1_RTX (GET_MODE (var)); + emit_move_insn (var, zero_init); + } + } + seq = get_insns (); + end_sequence (); + + emit_insn_after (seq, BB_END (*place)); + + /* Continue traversing the hash table. */ + return 1; + } + + /* Combine the variable expansions at the loop exit. */ + + static int + combine_var_copies_in_loop_exit (void **slot, void *place_p) + { + struct var_to_expand *ve = *slot; + basic_block *place = (basic_block *)place_p; + rtx sum = ve->reg; + rtx expr, seq, var, insn; + unsigned i; + + if (ve->op == PLUS || ve->op == MINUS) + { + for (i = 0; i < VARRAY_ACTIVE_SIZE (ve->var_expansions); i++) + { + var = VARRAY_RTX (ve->var_expansions, i); + sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), + var, sum); + } + } + else if (ve->op == MULT) + { + for (i = 0; i < VARRAY_ACTIVE_SIZE (ve->var_expansions); i++) + { + var = VARRAY_RTX (ve->var_expansions, i); + sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), + var, sum); + } + } + start_sequence (); + expr = force_operand (sum, ve->reg); + if (expr != ve->reg) + emit_move_insn (ve->reg, expr); + seq = get_insns (); + end_sequence (); + + insn = BB_HEAD (*place); + while (!INSN_P (insn)) + insn = NEXT_INSN (insn); + emit_insn_before (seq, insn); + + /* Continue traversing the hash table. */ + return 1; + } + + + /* Apply loop optimizations in loop copies using the + data which gathered during the unrolling. + + UNROLLING is true if we unrolled (not peeled) the loop. + REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of + the loop (as it should happen in complete unrolling, but not in ordinary + peeling of the loop). */ + + static void + apply_opt_in_copies (struct opt_info *opt_info, + unsigned n_copies, bool unrolling, + bool rewrite_original_loop) + { + unsigned i, delta; + basic_block bb, orig_bb; + rtx insn, orig_insn, next; + struct iv_to_split ivts_templ, *ivts; + struct var_to_expand ve_templ, *ves; + + /* Sanity check -- we need to put initialization in the original loop + body. */ + if (unrolling && !rewrite_original_loop) + abort (); + + /* Allocate the basic variables (i0). */ + if (opt_info->insns_to_split) + htab_traverse (opt_info->insns_to_split, allocate_basic_variable, NULL); + + for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++) + { + bb = BASIC_BLOCK (i); + orig_bb = bb->rbi->original; + + delta = determine_split_iv_delta (bb->rbi->copy_number, n_copies, + unrolling); + orig_insn = BB_HEAD (orig_bb); + for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next) + { + next = NEXT_INSN (insn); + if (!INSN_P (insn)) + continue; + + while (!INSN_P (orig_insn)) + orig_insn = NEXT_INSN (orig_insn); + + ivts_templ.insn = orig_insn; + ve_templ.insn = orig_insn; + + /* Apply splitting iv optimization. */ + if (opt_info->insns_to_split) + { + ivts = htab_find (opt_info->insns_to_split, &ivts_templ); + + if (ivts) + { + + #ifdef ENABLE_CHECKING + if (!rtx_equal_p (PATTERN (insn), PATTERN (orig_insn))) + abort (); + #endif + + if (!delta) + insert_base_initialization (ivts, insn); + split_iv (ivts, insn, delta); + } + } + /* Apply variable expansion optimization. */ + if (unrolling && opt_info->insns_with_var_to_expand) + { + ves = htab_find (opt_info->insns_with_var_to_expand, &ve_templ); + if (ves) + { + #ifdef ENABLE_CHECKING + if (!rtx_equal_p (PATTERN (insn), PATTERN (orig_insn))) + abort (); + #endif + expand_var_during_unrolling (ves, insn); + } + } + orig_insn = NEXT_INSN (orig_insn); + } + } + + if (!rewrite_original_loop) + return; + + /* Initialize the variable expansions in the loop preheader, + and take care of combining them at the end of the unrolled loop. */ + if (opt_info->insns_with_var_to_expand) + { + htab_traverse (opt_info->insns_with_var_to_expand, + insert_var_expansion_initialization, + opt_info->loop_preheader); + htab_traverse (opt_info->insns_with_var_to_expand, + combine_var_copies_in_loop_exit, + opt_info->loop_exit); + } + + /* Rewrite also the original loop body. Find them as originals of the blocks + in the last copied iteration, i.e. those that have + bb->rbi->original->copy == bb. */ + for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++) + { + bb = BASIC_BLOCK (i); + orig_bb = bb->rbi->original; + if (orig_bb->rbi->copy != bb) + continue; + + delta = determine_split_iv_delta (0, n_copies, unrolling); + for (orig_insn = BB_HEAD (orig_bb); + orig_insn != NEXT_INSN (BB_END (bb)); + orig_insn = next) + { + next = NEXT_INSN (orig_insn); + + if (!INSN_P (orig_insn)) + continue; + + ivts_templ.insn = orig_insn; + if (opt_info->insns_to_split) + { + ivts = htab_find (opt_info->insns_to_split, &ivts_templ); + if (ivts) + { + if (!delta) + insert_base_initialization (ivts, orig_insn); + split_iv (ivts, orig_insn, delta); + continue; + } + } + + } + } + } + + static int + release_var_copies (void **slot, void *data ATTRIBUTE_UNUSED) + { + struct var_to_expand *ve = *slot; + + VARRAY_CLEAR (ve->var_expansions); + /* Continue traversing the hash table. */ + return 1; + } + + /* Release OPT_INFO. */ + + static void + free_opt_info (struct opt_info *opt_info) + { + if (opt_info->insns_to_split) + htab_delete (opt_info->insns_to_split); + if (opt_info->insns_with_var_to_expand) + { + htab_traverse (opt_info->insns_with_var_to_expand, + release_var_copies, NULL); + htab_delete (opt_info->insns_with_var_to_expand); + } + free (opt_info); + } + Index: doc/invoke.texi =================================================================== RCS file: /cvsroot/gcc/gcc/gcc/doc/invoke.texi,v retrieving revision 1.500 diff -c -3 -p -r1.500 invoke.texi *** doc/invoke.texi 6 Aug 2004 17:20:53 -0000 1.500 --- doc/invoke.texi 22 Aug 2004 15:10:22 -0000 *************** in the following sections. *** 312,317 **** --- 312,319 ---- -fsingle-precision-constant @gol -fstrength-reduce -fstrict-aliasing -ftracer -fthread-jumps @gol -funroll-all-loops -funroll-loops -fpeel-loops @gol + -fsplit-ivs-in-unroller @gol + -fvariable-expansion-in-unroller @gol -funswitch-loops -fold-unroll-loops -fold-unroll-all-loops @gol -ftree-pre -ftree-ccp -ftree-dce -ftree-loop-optimize @gol -ftree-lim @gol *************** the loop is entered. This usually makes *** 4489,4494 **** --- 4491,4519 ---- @option{-funroll-all-loops} implies the same options as @option{-funroll-loops}, + @item -fsplit-ivs-in-unroller + @opindex -fsplit-ivs-in-unroller + Enables expressing of values of induction variables in later iterations + of the unrolled loop using the value in the first iteration. This breaks + long dependency chains, thus improving efficiency of the scheduling passes + (for best results, @option{-fweb} should be used as well). + + Combination of @option{-fweb} and CSE is often sufficient to obtain the + same effect. However in cases the loop body is more complicated than + a single basic block, this is not reliable. It also does not work at all + on some of the architectures due to restrictions in the CSE pass. + + This optimization is enabled by default. + + @item -fvariable-expansion-in-unroller + @opindex -fvariable-expansion-in-unroller + Enables the expansion of accumolators in loop + into separate copies, one for each copy of the loop body + and combine them at the loop exit. + + This optimization is enabled only if ffast-math option is enabled. + + @item -fprefetch-loop-arrays @opindex fprefetch-loop-arrays If supported by the target machine, generate instructions to prefetch