This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: Move loop peeling from RTL to gimple
- From: Richard Biener <rguenther at suse dot de>
- To: Jan Hubicka <hubicka at ucw dot cz>
- Cc: gcc-patches at gcc dot gnu dot org
- Date: Tue, 14 Oct 2014 09:36:47 +0200 (CEST)
- Subject: Re: Move loop peeling from RTL to gimple
- Authentication-results: sourceware.org; auth=none
- References: <20141014071951 dot GB13803 at kam dot mff dot cuni dot cz>
On Tue, 14 Oct 2014, Jan Hubicka wrote:
> Hi,
> this is update of my 2013 update to 2012 patch to move rtl loop peeling
> to tree level. This is to expose optimization oppurtunities earlier.
> Incrementally I think I can also improve profiling to provide a histogram
> on loop iterations and get more sensible peeling decisions.
>
> profiled-bootstrapped/regtested x86_64-linux, OK?
Ok.
Thanks,
Richard.
> Honza
>
> * loop-unroll.c: (decide_unrolling_and_peeling): Rename to
> (decide_unrolling): ... this one.
> (peel_loops_completely): Remove.
> (decide_peel_simple): Remove.
> (decide_peel_once_rolling): Remove.
> (decide_peel_completely): Remove.
> (peel_loop_simple): Remove.
> (peel_loop_completely): Remove.
> (unroll_and_peel_loops): Rename to ...
> (unroll_loops): ... this one; handle only unrolling.
> * cfgloop.h (lpt_dec): Remove LPT_PEEL_COMPLETELY and
> LPT_PEEL_SIMPLE.
> (UAP_PEEL): Remove.
> (unroll_and_peel_loops): Remove.
> (unroll_loops): New.
> * passes.def: Replace
> pass_rtl_unroll_and_peel_loops by pass_rtl_unroll_loops.
> * loop-init.c (gate_rtl_unroll_and_peel_loops,
> rtl_unroll_and_peel_loops): Rename to ...
> (gate_rtl_unroll_loops, rtl_unroll_loops): ... these; update.
> (pass_rtl_unroll_and_peel_loops): Rename to ...
> (pass_rtl_unroll_loops): ... this one.
> * tree-pass.h (make_pass_rtl_unroll_and_peel_loops): Remove.
> (make_pass_rtl_unroll_loops): New.
> * tree-ssa-loop-ivcanon.c: (estimated_peeled_sequence_size, try_peel_loop): New.
> (canonicalize_loop_induction_variables): Update.
>
> * gcc.dg/tree-prof/peel-1.c: Update.
> * gcc.dg/tree-prof/unroll-1.c: Update.
> * gcc.dg/gcc.dg/unroll_1.c: Update.
> * gcc.dg/gcc.dg/unroll_2.c: Update.
> * gcc.dg/gcc.dg/unroll_3.c: Update.
> * gcc.dg/gcc.dg/unroll_4.c: Update.
> Index: tree-pass.h
> ===================================================================
> --- tree-pass.h (revision 216145)
> +++ tree-pass.h (working copy)
> @@ -504,7 +504,7 @@ extern rtl_opt_pass *make_pass_outof_cfg
> extern rtl_opt_pass *make_pass_loop2 (gcc::context *ctxt);
> extern rtl_opt_pass *make_pass_rtl_loop_init (gcc::context *ctxt);
> extern rtl_opt_pass *make_pass_rtl_move_loop_invariants (gcc::context *ctxt);
> -extern rtl_opt_pass *make_pass_rtl_unroll_and_peel_loops (gcc::context *ctxt);
> +extern rtl_opt_pass *make_pass_rtl_unroll_loops (gcc::context *ctxt);
> extern rtl_opt_pass *make_pass_rtl_doloop (gcc::context *ctxt);
> extern rtl_opt_pass *make_pass_rtl_loop_done (gcc::context *ctxt);
>
> Index: tree-ssa-loop-ivcanon.c
> ===================================================================
> --- tree-ssa-loop-ivcanon.c (revision 216145)
> +++ tree-ssa-loop-ivcanon.c (working copy)
> @@ -28,9 +28,12 @@ along with GCC; see the file COPYING3.
> variables. In that case the created optimization possibilities are likely
> to pay up.
>
> - Additionally in case we detect that it is beneficial to unroll the
> - loop completely, we do it right here to expose the optimization
> - possibilities to the following passes. */
> + We also perform
> + - complette unrolling (or peeling) when the loops is rolling few enough
> + times
> + - simple peeling (i.e. copying few initial iterations prior the loop)
> + when number of iteration estimate is known (typically by the profile
> + info). */
>
> #include "config.h"
> #include "system.h"
> @@ -657,11 +660,12 @@ try_unroll_loop_completely (struct loop
> HOST_WIDE_INT maxiter,
> location_t locus)
> {
> - unsigned HOST_WIDE_INT n_unroll, ninsns, max_unroll, unr_insns;
> + unsigned HOST_WIDE_INT n_unroll = 0, ninsns, max_unroll, unr_insns;
> gimple cond;
> struct loop_size size;
> bool n_unroll_found = false;
> edge edge_to_cancel = NULL;
> + int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
>
> /* See if we proved number of iterations to be low constant.
>
> @@ -821,6 +825,8 @@ try_unroll_loop_completely (struct loop
> loop->num);
> return false;
> }
> + dump_printf_loc (report_flags, locus,
> + "loop turned into non-loop; it never loops.\n");
>
> initialize_original_copy_tables ();
> wont_exit = sbitmap_alloc (n_unroll + 1);
> @@ -902,6 +908,133 @@ try_unroll_loop_completely (struct loop
> return true;
> }
>
> +/* Return number of instructions after peeling. */
> +static unsigned HOST_WIDE_INT
> +estimated_peeled_sequence_size (struct loop_size *size,
> + unsigned HOST_WIDE_INT npeel)
> +{
> + return MAX (npeel * (HOST_WIDE_INT) (size->overall
> + - size->eliminated_by_peeling), 1);
> +}
> +
> +/* If the loop is expected to iterate N times and is
> + small enough, duplicate the loop body N+1 times before
> + the loop itself. This way the hot path will never
> + enter the loop.
> + Parameters are the same as for try_unroll_loops_completely */
> +
> +static bool
> +try_peel_loop (struct loop *loop,
> + edge exit, tree niter,
> + HOST_WIDE_INT maxiter)
> +{
> + int npeel;
> + struct loop_size size;
> + int peeled_size;
> + sbitmap wont_exit;
> + unsigned i;
> + vec<edge> to_remove = vNULL;
> + edge e;
> +
> + /* If the iteration bound is known and large, then we can safely eliminate
> + the check in peeled copies. */
> + if (TREE_CODE (niter) != INTEGER_CST)
> + exit = NULL;
> +
> + if (!flag_peel_loops || PARAM_VALUE (PARAM_MAX_PEEL_TIMES) <= 0)
> + return false;
> +
> + /* Peel only innermost loops. */
> + if (loop->inner)
> + {
> + if (dump_file)
> + fprintf (dump_file, "Not peeling: outer loop\n");
> + return false;
> + }
> +
> + if (!optimize_loop_for_speed_p (loop))
> + {
> + if (dump_file)
> + fprintf (dump_file, "Not peeling: cold loop\n");
> + return false;
> + }
> +
> + /* Check if there is an estimate on the number of iterations. */
> + npeel = estimated_loop_iterations_int (loop);
> + if (npeel < 0)
> + {
> + if (dump_file)
> + fprintf (dump_file, "Not peeling: number of iterations is not "
> + "estimated\n");
> + return false;
> + }
> + if (maxiter >= 0 && maxiter <= npeel)
> + {
> + if (dump_file)
> + fprintf (dump_file, "Not peeling: upper bound is known so can "
> + "unroll complettely\n");
> + return false;
> + }
> +
> + /* We want to peel estimated number of iterations + 1 (so we never
> + enter the loop on quick path). Check against PARAM_MAX_PEEL_TIMES
> + and be sure to avoid overflows. */
> + if (npeel > PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1)
> + {
> + if (dump_file)
> + fprintf (dump_file, "Not peeling: rolls too much "
> + "(%i + 1 > --param max-peel-times)\n", npeel);
> + return false;
> + }
> + npeel++;
> +
> + /* Check peeled loops size. */
> + tree_estimate_loop_size (loop, exit, NULL, &size,
> + PARAM_VALUE (PARAM_MAX_PEELED_INSNS));
> + if ((peeled_size = estimated_peeled_sequence_size (&size, npeel))
> + > PARAM_VALUE (PARAM_MAX_PEELED_INSNS))
> + {
> + if (dump_file)
> + fprintf (dump_file, "Not peeling: peeled sequence size is too large "
> + "(%i insns > --param max-peel-insns)", peeled_size);
> + return false;
> + }
> +
> + /* Duplicate possibly eliminating the exits. */
> + initialize_original_copy_tables ();
> + wont_exit = sbitmap_alloc (npeel + 1);
> + bitmap_ones (wont_exit);
> + bitmap_clear_bit (wont_exit, 0);
> + if (!gimple_duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
> + npeel, wont_exit,
> + exit, &to_remove,
> + DLTHE_FLAG_UPDATE_FREQ
> + | DLTHE_FLAG_COMPLETTE_PEEL))
> + {
> + free_original_copy_tables ();
> + free (wont_exit);
> + return false;
> + }
> + FOR_EACH_VEC_ELT (to_remove, i, e)
> + {
> + bool ok = remove_path (e);
> + gcc_assert (ok);
> + }
> + free (wont_exit);
> + free_original_copy_tables ();
> + if (dump_file && (dump_flags & TDF_DETAILS))
> + {
> + fprintf (dump_file, "Peeled loop %d, %i times.\n",
> + loop->num, npeel);
> + }
> + if (loop->any_upper_bound)
> + loop->nb_iterations_upper_bound -= npeel;
> + loop->nb_iterations_estimate = 0;
> + /* Make sure to mark loop cold so we do not try to peel it more. */
> + scale_loop_profile (loop, 1, 0);
> + loop->header->count = 0;
> + return true;
> +}
> /* Adds a canonical induction variable to LOOP if suitable.
> CREATE_IV is true if we may create a new iv. UL determines
> which loops we are allowed to completely unroll. If TRY_EVAL is true, we try
> @@ -981,6 +1114,9 @@ canonicalize_loop_induction_variables (s
> && exit && just_once_each_iteration_p (loop, exit->src))
> create_canonical_iv (loop, exit, niter);
>
> + if (ul == UL_ALL)
> + modified |= try_peel_loop (loop, exit, niter, maxiter);
> +
> return modified;
> }
>
> Index: loop-init.c
> ===================================================================
> --- loop-init.c (revision 216145)
> +++ loop-init.c (working copy)
> @@ -357,7 +357,6 @@ pass_loop2::gate (function *fun)
> if (optimize > 0
> && (flag_move_loop_invariants
> || flag_unswitch_loops
> - || flag_peel_loops
> || flag_unroll_loops
> #ifdef HAVE_doloop_end
> || (flag_branch_on_count_reg && HAVE_doloop_end)
> @@ -537,7 +536,7 @@ make_pass_rtl_move_loop_invariants (gcc:
>
> namespace {
>
> -const pass_data pass_data_rtl_unroll_and_peel_loops =
> +const pass_data pass_data_rtl_unroll_loops =
> {
> RTL_PASS, /* type */
> "loop2_unroll", /* name */
> @@ -550,11 +549,11 @@ const pass_data pass_data_rtl_unroll_and
> 0, /* todo_flags_finish */
> };
>
> -class pass_rtl_unroll_and_peel_loops : public rtl_opt_pass
> +class pass_rtl_unroll_loops : public rtl_opt_pass
> {
> public:
> - pass_rtl_unroll_and_peel_loops (gcc::context *ctxt)
> - : rtl_opt_pass (pass_data_rtl_unroll_and_peel_loops, ctxt)
> + pass_rtl_unroll_loops (gcc::context *ctxt)
> + : rtl_opt_pass (pass_data_rtl_unroll_loops, ctxt)
> {}
>
> /* opt_pass methods: */
> @@ -565,10 +564,10 @@ public:
>
> virtual unsigned int execute (function *);
>
> -}; // class pass_rtl_unroll_and_peel_loops
> +}; // class pass_rtl_unroll_loops
>
> unsigned int
> -pass_rtl_unroll_and_peel_loops::execute (function *fun)
> +pass_rtl_unroll_loops::execute (function *fun)
> {
> if (number_of_loops (fun) > 1)
> {
> @@ -576,14 +575,12 @@ pass_rtl_unroll_and_peel_loops::execute
> if (dump_file)
> df_dump (dump_file);
>
> - if (flag_peel_loops)
> - flags |= UAP_PEEL;
> if (flag_unroll_loops)
> flags |= UAP_UNROLL;
> if (flag_unroll_all_loops)
> flags |= UAP_UNROLL_ALL;
>
> - unroll_and_peel_loops (flags);
> + unroll_loops (flags);
> }
> return 0;
> }
> @@ -591,9 +588,9 @@ pass_rtl_unroll_and_peel_loops::execute
> } // anon namespace
>
> rtl_opt_pass *
> -make_pass_rtl_unroll_and_peel_loops (gcc::context *ctxt)
> +make_pass_rtl_unroll_loops (gcc::context *ctxt)
> {
> - return new pass_rtl_unroll_and_peel_loops (ctxt);
> + return new pass_rtl_unroll_loops (ctxt);
> }
>
>
> Index: cfgloop.h
> ===================================================================
> --- cfgloop.h (revision 216145)
> +++ cfgloop.h (working copy)
> @@ -30,8 +30,6 @@ along with GCC; see the file COPYING3.
> enum lpt_dec
> {
> LPT_NONE,
> - LPT_PEEL_COMPLETELY,
> - LPT_PEEL_SIMPLE,
> LPT_UNROLL_CONSTANT,
> LPT_UNROLL_RUNTIME,
> LPT_UNROLL_STUPID
> @@ -731,12 +729,11 @@ extern void loop_optimizer_finalize (voi
> /* Optimization passes. */
> enum
> {
> - UAP_PEEL = 1, /* Enables loop peeling. */
> - UAP_UNROLL = 2, /* Enables unrolling of loops if it seems profitable. */
> - UAP_UNROLL_ALL = 4 /* Enables unrolling of all loops. */
> + UAP_UNROLL = 1, /* Enables unrolling of loops if it seems profitable. */
> + UAP_UNROLL_ALL = 2 /* Enables unrolling of all loops. */
> };
>
> -extern void unroll_and_peel_loops (int);
> +extern void unroll_loops (int);
> extern void doloop_optimize_loops (void);
> extern void move_loop_invariants (void);
> extern void scale_loop_profile (struct loop *loop, int scale, gcov_type iteration_bound);
> Index: passes.def
> ===================================================================
> --- passes.def (revision 216145)
> +++ passes.def (working copy)
> @@ -359,7 +359,7 @@ along with GCC; see the file COPYING3.
> PUSH_INSERT_PASSES_WITHIN (pass_loop2)
> NEXT_PASS (pass_rtl_loop_init);
> NEXT_PASS (pass_rtl_move_loop_invariants);
> - NEXT_PASS (pass_rtl_unroll_and_peel_loops);
> + NEXT_PASS (pass_rtl_unroll_loops);
> NEXT_PASS (pass_rtl_doloop);
> NEXT_PASS (pass_rtl_loop_done);
> TERMINATE_PASS_LIST ()
> Index: loop-unroll.c
> ===================================================================
> --- loop-unroll.c (revision 216145)
> +++ loop-unroll.c (working copy)
> @@ -1,4 +1,4 @@
> -/* Loop unrolling and peeling.
> +/* Loop unrolling.
> Copyright (C) 2002-2014 Free Software Foundation, Inc.
>
> This file is part of GCC.
> @@ -34,8 +34,8 @@ along with GCC; see the file COPYING3.
> #include "target.h"
> #include "dumpfile.h"
>
> -/* This pass performs loop unrolling and peeling. We only perform these
> - optimizations on innermost loops (with single exception) because
> +/* This pass performs loop unrolling. We only perform this
> + optimization on innermost loops (with single exception) because
> the impact on performance is greatest here, and we want to avoid
> unnecessary code size growth. The gain is caused by greater sequentiality
> of code, better code to optimize for further passes and in some cases
> @@ -44,12 +44,6 @@ along with GCC; see the file COPYING3.
>
> What we do:
>
> - -- complete peeling of once-rolling loops; this is the above mentioned
> - exception, as this causes loop to be cancelled completely and
> - does not cause code growth
> - -- complete peeling of loops that roll (small) constant times.
> - -- simple peeling of first iterations of loops that do not roll much
> - (according to profile feedback)
> -- unrolling of loops that roll constant times; this is almost always
> win, as we get rid of exit condition tests.
> -- unrolling of loops that roll number of times that we can compute
> @@ -62,7 +56,7 @@ along with GCC; see the file COPYING3.
> appropriate function below.
>
> There is a lot of parameters (defined and described in params.def) that
> - control how much we unroll/peel.
> + control how much we unroll.
>
> ??? A great problem is that we don't have a good way how to determine
> how many times we should unroll the loop; the experiments I have made
> @@ -170,17 +164,11 @@ struct opt_info
> basic_block loop_preheader; /* The loop preheader basic block. */
> };
>
> -static void decide_unrolling_and_peeling (int);
> -static void peel_loops_completely (int);
> -static void decide_peel_simple (struct loop *, int);
> -static void decide_peel_once_rolling (struct loop *, int);
> -static void decide_peel_completely (struct loop *, int);
> static void decide_unroll_stupid (struct loop *, int);
> static void decide_unroll_constant_iterations (struct loop *, int);
> static void decide_unroll_runtime_iterations (struct loop *, int);
> -static void peel_loop_simple (struct loop *);
> -static void peel_loop_completely (struct loop *);
> static void unroll_loop_stupid (struct loop *);
> +static void decide_unrolling (int);
> static void unroll_loop_constant_iterations (struct loop *);
> static void unroll_loop_runtime_iterations (struct loop *);
> static struct opt_info *analyze_insns_in_loop (struct loop *);
> @@ -197,15 +185,13 @@ static void combine_var_copies_in_loop_e
> basic_block);
> static rtx get_expansion (struct var_to_expand *);
>
> -/* Emit a message summarizing the unroll or peel that will be
> +/* Emit a message summarizing the unroll that will be
> performed for LOOP, along with the loop's location LOCUS, if
> appropriate given the dump or -fopt-info settings. */
>
> static void
> -report_unroll_peel (struct loop *loop, location_t locus)
> +report_unroll (struct loop *loop, location_t locus)
> {
> - struct niter_desc *desc;
> - int niters = 0;
> int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
>
> if (loop->lpt_decision.decision == LPT_NONE)
> @@ -214,169 +200,20 @@ report_unroll_peel (struct loop *loop, l
> if (!dump_enabled_p ())
> return;
>
> - /* In the special case where the loop never iterated, emit
> - a different message so that we don't report an unroll by 0.
> - This matches the equivalent message emitted during tree unrolling. */
> - if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
> - && !loop->lpt_decision.times)
> - {
> - dump_printf_loc (report_flags, locus,
> - "loop turned into non-loop; it never loops.\n");
> - return;
> - }
> -
> - desc = get_simple_loop_desc (loop);
> -
> - if (desc->const_iter)
> - niters = desc->niter;
> - else if (loop->header->count)
> - niters = expected_loop_iterations (loop);
> -
> - if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
> - dump_printf_loc (report_flags, locus,
> - "loop with %d iterations completely unrolled",
> - loop->lpt_decision.times + 1);
> - else
> - dump_printf_loc (report_flags, locus,
> - "loop %s %d times",
> - (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
> - ? "peeled" : "unrolled"),
> - loop->lpt_decision.times);
> + dump_printf_loc (report_flags, locus,
> + "loop unrolled %d times",
> + loop->lpt_decision.times);
> if (profile_info)
> dump_printf (report_flags,
> - " (header execution count %d",
> + " (header execution count %d)",
> (int)loop->header->count);
> - if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
> - dump_printf (report_flags,
> - "%s%s iterations %d)",
> - profile_info ? ", " : " (",
> - desc->const_iter ? "const" : "average",
> - niters);
> - else if (profile_info)
> - dump_printf (report_flags, ")");
>
> dump_printf (report_flags, "\n");
> }
>
> -/* Unroll and/or peel (depending on FLAGS) LOOPS. */
> -void
> -unroll_and_peel_loops (int flags)
> -{
> - struct loop *loop;
> - bool changed = false;
> -
> - /* First perform complete loop peeling (it is almost surely a win,
> - and affects parameters for further decision a lot). */
> - peel_loops_completely (flags);
> -
> - /* Now decide rest of unrolling and peeling. */
> - decide_unrolling_and_peeling (flags);
> -
> - /* Scan the loops, inner ones first. */
> - FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
> - {
> - /* And perform the appropriate transformations. */
> - switch (loop->lpt_decision.decision)
> - {
> - case LPT_PEEL_COMPLETELY:
> - /* Already done. */
> - gcc_unreachable ();
> - case LPT_PEEL_SIMPLE:
> - peel_loop_simple (loop);
> - changed = true;
> - break;
> - case LPT_UNROLL_CONSTANT:
> - unroll_loop_constant_iterations (loop);
> - changed = true;
> - break;
> - case LPT_UNROLL_RUNTIME:
> - unroll_loop_runtime_iterations (loop);
> - changed = true;
> - break;
> - case LPT_UNROLL_STUPID:
> - unroll_loop_stupid (loop);
> - changed = true;
> - break;
> - case LPT_NONE:
> - break;
> - default:
> - gcc_unreachable ();
> - }
> - }
> -
> - if (changed)
> - {
> - calculate_dominance_info (CDI_DOMINATORS);
> - fix_loop_structure (NULL);
> - }
> -
> - iv_analysis_done ();
> -}
> -
> -/* Check whether exit of the LOOP is at the end of loop body. */
> -
> -static bool
> -loop_exit_at_end_p (struct loop *loop)
> -{
> - struct niter_desc *desc = get_simple_loop_desc (loop);
> - rtx_insn *insn;
> -
> - if (desc->in_edge->dest != loop->latch)
> - return false;
> -
> - /* Check that the latch is empty. */
> - FOR_BB_INSNS (loop->latch, insn)
> - {
> - if (NONDEBUG_INSN_P (insn))
> - return false;
> - }
> -
> - return true;
> -}
> -
> -/* Depending on FLAGS, check whether to peel loops completely and do so. */
> -static void
> -peel_loops_completely (int flags)
> -{
> - struct loop *loop;
> - bool changed = false;
> -
> - /* Scan the loops, the inner ones first. */
> - FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
> - {
> - loop->lpt_decision.decision = LPT_NONE;
> - location_t locus = get_loop_location (loop);
> -
> - if (dump_enabled_p ())
> - dump_printf_loc (TDF_RTL, locus,
> - ";; *** Considering loop %d at BB %d for "
> - "complete peeling ***\n",
> - loop->num, loop->header->index);
> -
> - loop->ninsns = num_loop_insns (loop);
> -
> - decide_peel_once_rolling (loop, flags);
> - if (loop->lpt_decision.decision == LPT_NONE)
> - decide_peel_completely (loop, flags);
> -
> - if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
> - {
> - report_unroll_peel (loop, locus);
> - peel_loop_completely (loop);
> - changed = true;
> - }
> - }
> -
> - if (changed)
> - {
> - calculate_dominance_info (CDI_DOMINATORS);
> - fix_loop_structure (NULL);
> - }
> -}
> -
> -/* Decide whether unroll or peel loops (depending on FLAGS) and how much. */
> +/* Decide whether unroll loops and how much. */
> static void
> -decide_unrolling_and_peeling (int flags)
> +decide_unrolling (int flags)
> {
> struct loop *loop;
>
> @@ -389,7 +226,7 @@ decide_unrolling_and_peeling (int flags)
> if (dump_enabled_p ())
> dump_printf_loc (TDF_RTL, locus,
> ";; *** Considering loop %d at BB %d for "
> - "unrolling and peeling ***\n",
> + "unrolling ***\n",
> loop->num, loop->header->index);
>
> /* Do not peel cold areas. */
> @@ -428,204 +265,77 @@ decide_unrolling_and_peeling (int flags)
> decide_unroll_runtime_iterations (loop, flags);
> if (loop->lpt_decision.decision == LPT_NONE)
> decide_unroll_stupid (loop, flags);
> - if (loop->lpt_decision.decision == LPT_NONE)
> - decide_peel_simple (loop, flags);
> -
> - report_unroll_peel (loop, locus);
> - }
> -}
> -
> -/* Decide whether the LOOP is once rolling and suitable for complete
> - peeling. */
> -static void
> -decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
> -{
> - struct niter_desc *desc;
> -
> - if (dump_file)
> - fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
> -
> - /* Is the loop small enough? */
> - if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
> - {
> - if (dump_file)
> - fprintf (dump_file, ";; Not considering loop, is too big\n");
> - return;
> - }
>
> - /* Check for simple loops. */
> - desc = get_simple_loop_desc (loop);
> -
> - /* Check number of iterations. */
> - if (!desc->simple_p
> - || desc->assumptions
> - || desc->infinite
> - || !desc->const_iter
> - || (desc->niter != 0
> - && get_max_loop_iterations_int (loop) != 0))
> - {
> - if (dump_file)
> - fprintf (dump_file,
> - ";; Unable to prove that the loop rolls exactly once\n");
> - return;
> + report_unroll (loop, locus);
> }
> -
> - /* Success. */
> - loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
> }
>
> -/* Decide whether the LOOP is suitable for complete peeling. */
> -static void
> -decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
> +/* Unroll LOOPS. */
> +void
> +unroll_loops (int flags)
> {
> - unsigned npeel;
> - struct niter_desc *desc;
> -
> - if (dump_file)
> - fprintf (dump_file, "\n;; Considering peeling completely\n");
> -
> - /* Skip non-innermost loops. */
> - if (loop->inner)
> - {
> - if (dump_file)
> - fprintf (dump_file, ";; Not considering loop, is not innermost\n");
> - return;
> - }
> -
> - /* Do not peel cold areas. */
> - if (optimize_loop_for_size_p (loop))
> - {
> - if (dump_file)
> - fprintf (dump_file, ";; Not considering loop, cold area\n");
> - return;
> - }
> -
> - /* Can the loop be manipulated? */
> - if (!can_duplicate_loop_p (loop))
> - {
> - if (dump_file)
> - fprintf (dump_file,
> - ";; Not considering loop, cannot duplicate\n");
> - return;
> - }
> -
> - /* npeel = number of iterations to peel. */
> - npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
> - if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
> - npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
> -
> - /* Is the loop small enough? */
> - if (!npeel)
> - {
> - if (dump_file)
> - fprintf (dump_file, ";; Not considering loop, is too big\n");
> - return;
> - }
> -
> - /* Check for simple loops. */
> - desc = get_simple_loop_desc (loop);
> + struct loop *loop;
> + bool changed = false;
>
> - /* Check number of iterations. */
> - if (!desc->simple_p
> - || desc->assumptions
> - || !desc->const_iter
> - || desc->infinite)
> - {
> - if (dump_file)
> - fprintf (dump_file,
> - ";; Unable to prove that the loop iterates constant times\n");
> - return;
> - }
> + /* Now decide rest of unrolling. */
> + decide_unrolling (flags);
>
> - if (desc->niter > npeel - 1)
> + /* Scan the loops, inner ones first. */
> + FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
> {
> - if (dump_file)
> + /* And perform the appropriate transformations. */
> + switch (loop->lpt_decision.decision)
> {
> - fprintf (dump_file,
> - ";; Not peeling loop completely, rolls too much (");
> - fprintf (dump_file, "%"PRId64, desc->niter);
> - fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
> + case LPT_UNROLL_CONSTANT:
> + unroll_loop_constant_iterations (loop);
> + changed = true;
> + break;
> + case LPT_UNROLL_RUNTIME:
> + unroll_loop_runtime_iterations (loop);
> + changed = true;
> + break;
> + case LPT_UNROLL_STUPID:
> + unroll_loop_stupid (loop);
> + changed = true;
> + break;
> + case LPT_NONE:
> + break;
> + default:
> + gcc_unreachable ();
> }
> - return;
> }
>
> - /* Success. */
> - loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
> -}
> -
> -/* Peel all iterations of LOOP, remove exit edges and cancel the loop
> - completely. The transformation done:
> + if (changed)
> + {
> + calculate_dominance_info (CDI_DOMINATORS);
> + fix_loop_structure (NULL);
> + }
>
> - for (i = 0; i < 4; i++)
> - body;
> + iv_analysis_done ();
> +}
>
> - ==>
> +/* Check whether exit of the LOOP is at the end of loop body. */
>
> - i = 0;
> - body; i++;
> - body; i++;
> - body; i++;
> - body; i++;
> - */
> -static void
> -peel_loop_completely (struct loop *loop)
> +static bool
> +loop_exit_at_end_p (struct loop *loop)
> {
> - sbitmap wont_exit;
> - unsigned HOST_WIDE_INT npeel;
> - unsigned i;
> - edge ein;
> struct niter_desc *desc = get_simple_loop_desc (loop);
> - struct opt_info *opt_info = NULL;
> -
> - npeel = desc->niter;
> -
> - if (npeel)
> - {
> - bool ok;
> -
> - wont_exit = sbitmap_alloc (npeel + 1);
> - bitmap_ones (wont_exit);
> - bitmap_clear_bit (wont_exit, 0);
> - if (desc->noloop_assumptions)
> - bitmap_clear_bit (wont_exit, 1);
> -
> - auto_vec<edge> remove_edges;
> - if (flag_split_ivs_in_unroller)
> - opt_info = analyze_insns_in_loop (loop);
> -
> - opt_info_start_duplication (opt_info);
> - ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
> - npeel,
> - wont_exit, desc->out_edge,
> - &remove_edges,
> - DLTHE_FLAG_UPDATE_FREQ
> - | DLTHE_FLAG_COMPLETTE_PEEL
> - | (opt_info
> - ? DLTHE_RECORD_COPY_NUMBER : 0));
> - gcc_assert (ok);
> + rtx_insn *insn;
>
> - free (wont_exit);
> + /* We should never have conditional in latch block. */
> + gcc_assert (desc->in_edge->dest != loop->header);
>
> - if (opt_info)
> - {
> - apply_opt_in_copies (opt_info, npeel, false, true);
> - free_opt_info (opt_info);
> - }
> + if (desc->in_edge->dest != loop->latch)
> + return false;
>
> - /* Remove the exit edges. */
> - FOR_EACH_VEC_ELT (remove_edges, i, ein)
> - remove_path (ein);
> + /* Check that the latch is empty. */
> + FOR_BB_INSNS (loop->latch, insn)
> + {
> + if (INSN_P (insn) && active_insn_p (insn))
> + return false;
> }
>
> - ein = desc->in_edge;
> - free_simple_loop_desc (loop);
> -
> - /* Now remove the unreachable part of the last iteration and cancel
> - the loop. */
> - remove_path (ein);
> -
> - if (dump_file)
> - fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
> + return true;
> }
>
> /* Decide whether to unroll LOOP iterating constant number of times
> @@ -1372,160 +1082,6 @@ unroll_loop_runtime_iterations (struct l
> max_unroll, num_loop_insns (loop));
> }
>
> -/* Decide whether to simply peel LOOP and how much. */
> -static void
> -decide_peel_simple (struct loop *loop, int flags)
> -{
> - unsigned npeel;
> - widest_int iterations;
> -
> - if (!(flags & UAP_PEEL))
> - {
> - /* We were not asked to, just return back silently. */
> - return;
> - }
> -
> - if (dump_file)
> - fprintf (dump_file, "\n;; Considering simply peeling loop\n");
> -
> - /* npeel = number of iterations to peel. */
> - npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
> - if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
> - npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
> -
> - /* Skip big loops. */
> - if (!npeel)
> - {
> - if (dump_file)
> - fprintf (dump_file, ";; Not considering loop, is too big\n");
> - return;
> - }
> -
> - /* Do not simply peel loops with branches inside -- it increases number
> - of mispredicts.
> - Exception is when we do have profile and we however have good chance
> - to peel proper number of iterations loop will iterate in practice.
> - TODO: this heuristic needs tunning; while for complette unrolling
> - the branch inside loop mostly eliminates any improvements, for
> - peeling it is not the case. Also a function call inside loop is
> - also branch from branch prediction POV (and probably better reason
> - to not unroll/peel). */
> - if (num_loop_branches (loop) > 1
> - && profile_status_for_fn (cfun) != PROFILE_READ)
> - {
> - if (dump_file)
> - fprintf (dump_file, ";; Not peeling, contains branches\n");
> - return;
> - }
> -
> - /* If we have realistic estimate on number of iterations, use it. */
> - if (get_estimated_loop_iterations (loop, &iterations))
> - {
> - if (wi::leu_p (npeel, iterations))
> - {
> - if (dump_file)
> - {
> - fprintf (dump_file, ";; Not peeling loop, rolls too much (");
> - fprintf (dump_file, "%"PRId64,
> - (int64_t) (iterations.to_shwi () + 1));
> - fprintf (dump_file, " iterations > %d [maximum peelings])\n",
> - npeel);
> - }
> - return;
> - }
> - npeel = iterations.to_shwi () + 1;
> - }
> - /* If we have small enough bound on iterations, we can still peel (completely
> - unroll). */
> - else if (get_max_loop_iterations (loop, &iterations)
> - && wi::ltu_p (iterations, npeel))
> - npeel = iterations.to_shwi () + 1;
> - else
> - {
> - /* For now we have no good heuristics to decide whether loop peeling
> - will be effective, so disable it. */
> - if (dump_file)
> - fprintf (dump_file,
> - ";; Not peeling loop, no evidence it will be profitable\n");
> - return;
> - }
> -
> - /* Success. */
> - loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
> - loop->lpt_decision.times = npeel;
> -}
> -
> -/* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
> -
> - while (cond)
> - body;
> -
> - ==> (LOOP->LPT_DECISION.TIMES == 3)
> -
> - if (!cond) goto end;
> - body;
> - if (!cond) goto end;
> - body;
> - if (!cond) goto end;
> - body;
> - while (cond)
> - body;
> - end: ;
> - */
> -static void
> -peel_loop_simple (struct loop *loop)
> -{
> - sbitmap wont_exit;
> - unsigned npeel = loop->lpt_decision.times;
> - struct niter_desc *desc = get_simple_loop_desc (loop);
> - struct opt_info *opt_info = NULL;
> - bool ok;
> -
> - if (flag_split_ivs_in_unroller && npeel > 1)
> - opt_info = analyze_insns_in_loop (loop);
> -
> - wont_exit = sbitmap_alloc (npeel + 1);
> - bitmap_clear (wont_exit);
> -
> - opt_info_start_duplication (opt_info);
> -
> - ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
> - npeel, wont_exit, NULL,
> - NULL, DLTHE_FLAG_UPDATE_FREQ
> - | (opt_info
> - ? DLTHE_RECORD_COPY_NUMBER
> - : 0));
> - gcc_assert (ok);
> -
> - free (wont_exit);
> -
> - if (opt_info)
> - {
> - apply_opt_in_copies (opt_info, npeel, false, false);
> - free_opt_info (opt_info);
> - }
> -
> - if (desc->simple_p)
> - {
> - if (desc->const_iter)
> - {
> - desc->niter -= npeel;
> - desc->niter_expr = GEN_INT (desc->niter);
> - desc->noloop_assumptions = NULL_RTX;
> - }
> - else
> - {
> - /* We cannot just update niter_expr, as its value might be clobbered
> - inside loop. We could handle this by counting the number into
> - temporary just like we do in runtime unrolling, but it does not
> - seem worthwhile. */
> - free_simple_loop_desc (loop);
> - }
> - }
> - if (dump_file)
> - fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
> -}
> -
> /* Decide whether to unroll LOOP stupidly and how much. */
> static void
> decide_unroll_stupid (struct loop *loop, int flags)
> Index: testsuite/gcc.dg/unroll_3.c
> ===================================================================
> --- testsuite/gcc.dg/unroll_3.c (revision 216145)
> +++ testsuite/gcc.dg/unroll_3.c (working copy)
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fno-tree-vrp -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo" } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details -fno-peel-loops -fno-tree-vrp -fdisable-tree-cunroll -fenable-tree-cunrolli=foo -fdisable-tree-cunrolli=foo2" } */
>
> unsigned a[100], b[100];
> inline void bar()
> @@ -28,5 +28,5 @@ int foo2(void)
> return 1;
> }
>
> -/* { dg-final { scan-rtl-dump-times "loop turned into non-loop; it never loops" 1 "loop2_unroll" } } */
> -/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */
> +/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely unrolled" 1 "cunrolli" } } */
> +/* { dg-final { cleanup-tree-dump "cunrolli" } } */
> Index: testsuite/gcc.dg/tree-prof/peel-1.c
> ===================================================================
> --- testsuite/gcc.dg/tree-prof/peel-1.c (revision 216145)
> +++ testsuite/gcc.dg/tree-prof/peel-1.c (working copy)
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O3 -fdump-rtl-loop2_unroll -fno-unroll-loops -fpeel-loops" } */
> +/* { dg-options "-O3 -fdump-tree-cunroll-details -fno-unroll-loops -fpeel-loops" } */
> void abort();
>
> int a[1000];
> Index: testsuite/gcc.dg/unroll_4.c
> ===================================================================
> --- testsuite/gcc.dg/unroll_4.c (revision 216145)
> +++ testsuite/gcc.dg/unroll_4.c (working copy)
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fno-tree-vrp -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo2" } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details -fno-peel-loops -fno-tree-vrp -fdisable-tree-cunroll -fenable-tree-cunrolli=foo2 -fdisable-tree-cunrolli=foo" } */
>
> unsigned a[100], b[100];
> inline void bar()
> @@ -28,5 +28,5 @@ int foo2(void)
> return 1;
> }
>
> -/* { dg-final { scan-rtl-dump-times "loop turned into non-loop; it never loops" 1 "loop2_unroll" } } */
> -/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */
> +/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely unrolled" 1 "cunrolli" } } */
> +/* { dg-final { cleanup-tree-dump "cunrolli" } } */
> Index: testsuite/gcc.dg/unroll_1.c
> ===================================================================
> --- testsuite/gcc.dg/unroll_1.c (revision 216145)
> +++ testsuite/gcc.dg/unroll_1.c (working copy)
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-rtl-loop2_unroll=stderr -fno-peel-loops -fno-tree-vrp -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2 -fenable-rtl-loop2_unroll" } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details=stderr -fno-peel-loops -fno-tree-vrp -fdisable-tree-cunroll -fenable-tree-cunrolli" } */
>
> unsigned a[100], b[100];
> inline void bar()
> @@ -11,7 +11,7 @@ int foo(void)
> {
> int i;
> bar();
> - for (i = 0; i < 2; i++) /* { dg-message "note: loop turned into non-loop; it never loops" } */
> + for (i = 0; i < 2; i++) /* { dg-message "note: loop with 3 iterations completely unrolled" } */
> {
> a[i]= b[i] + 1;
> }
> @@ -21,7 +21,7 @@ int foo(void)
> int foo2(void)
> {
> int i;
> - for (i = 0; i < 2; i++) /* { dg-message "note: loop turned into non-loop; it never loops" } */
> + for (i = 0; i < 2; i++) /* { dg-message "note: loop with 3 iterations completely unrolled" } */
> {
> a[i]= b[i] + 1;
> }
> Index: testsuite/gcc.dg/unroll_2.c
> ===================================================================
> --- testsuite/gcc.dg/unroll_2.c (revision 216145)
> +++ testsuite/gcc.dg/unroll_2.c (working copy)
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fno-tree-vrp -fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo -fenable-rtl-loop2_unroll" } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details -fno-peel-loops -fno-tree-vrp -fdisable-tree-cunrolli=foo -fenable-tree-cunrolli=foo" } */
>
> unsigned a[100], b[100];
> inline void bar()
> @@ -28,5 +28,5 @@ int foo2(void)
> return 1;
> }
>
> -/* { dg-final { scan-rtl-dump-times "loop turned into non-loop; it never loops" 1 "loop2_unroll" } } */
> -/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */
> +/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely unrolled" 1 "cunrolli" } } */
> +/* { dg-final { cleanup-tree-dump "cunrolli" } } */
>
>
--
Richard Biener <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer