This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Take branch misprediction effects into account when RTL loop unrolling (issue6099055)


On Tue, Apr 24, 2012 at 2:26 PM, Teresa Johnson <tejohnson@google.com> wrote:
> This patch adds heuristics to limit unrolling in loops with branches that may increase
> branch mispredictions. It affects loops that are not frequently iterated, and that are
> nested within a hot region of code that already contains many branch instructions.
>
> Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety
> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems.
> This improves performance of an internal search indexing benchmark by close to 2% on
> all the tested Intel platforms. ÂIt also consistently improves 445.gobmk (with FDO feedback
> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are
> neutral.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu. ÂIs this ok for trunk?
>
> Thanks,
> Teresa
>
> 2012-04-24 Â Teresa Johnson Â<tejohnson@google.com>
>
> Â Â Â Â* loop-unroll.c (loop_has_call): New function.
> Â Â Â Â(loop_has_FP_comp): Ditto.
> Â Â Â Â(compute_weighted_branches): Ditto.
> Â Â Â Â(max_unroll_with_branches): Ditto.
> Â Â Â Â(decide_unroll_constant_iterations): Add heuristic to avoid
> Â Â Â Âincreasing branch mispredicts when unrolling.
> Â Â Â Â(decide_unroll_runtime_iterations): Ditto.
> Â Â Â Â* params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
> Â Â Â Â(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>
> Index: loop-unroll.c
> ===================================================================
> --- loop-unroll.c    (revision 186783)
> +++ loop-unroll.c    (working copy)
> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â basic_block);
> Âstatic rtx get_expansion (struct var_to_expand *);
>
> +/* Determine whether LOOP contains call. Â*/
> +static bool
> +loop_has_call(struct loop *loop)
> +{
> + Âbasic_block *body, bb;
> + Âunsigned i;
> + Ârtx insn;
> +
> + Âbody = get_loop_body (loop);
> + Âfor (i = 0; i < loop->num_nodes; i++)
> + Â Â{
> + Â Â Âbb = body[i];
> +
> + Â Â ÂFOR_BB_INSNS (bb, insn)
> + Â Â Â Â{
> + Â Â Â Â Âif (CALL_P (insn))
> + Â Â Â Â Â Â{
> + Â Â Â Â Â Â Âfree (body);
> + Â Â Â Â Â Â Âreturn true;
> + Â Â Â Â Â Â}
> + Â Â Â Â}
> + Â Â}
> + Âfree (body);
> + Âreturn false;
> +}
> +
> +/* Determine whether LOOP contains floating-point computation. Â*/
> +static bool
> +loop_has_FP_comp(struct loop *loop)
> +{
> + Ârtx set, dest;
> + Âbasic_block *body, bb;
> + Âunsigned i;
> + Ârtx insn;
> +
> + Âbody = get_loop_body (loop);
> + Âfor (i = 0; i < loop->num_nodes; i++)
> + Â Â{
> + Â Â Âbb = body[i];
> +
> + Â Â ÂFOR_BB_INSNS (bb, insn)
> + Â Â Â Â{
> + Â Â Â Â Âset = single_set (insn);
> + Â Â Â Â Âif (!set)
> + Â Â Â Â Â Âcontinue;
> +
> + Â Â Â Â Âdest = SET_DEST (set);
> + Â Â Â Â Âif (FLOAT_MODE_P (GET_MODE (dest)))
> + Â Â Â Â Â Â{
> + Â Â Â Â Â Â Âfree (body);
> + Â Â Â Â Â Â Âreturn true;
> + Â Â Â Â Â Â}
> + Â Â Â Â}
> + Â Â}
> + Âfree (body);
> + Âreturn false;
> +}
> +
> +/* Compute the number of branches in LOOP, weighted by execution counts. Â*/
> +static float
> +compute_weighted_branches(struct loop *loop)
> +{
> + Âint header_count = loop->header->count;
> + Âunsigned i;
> + Âfloat n;
> + Âbasic_block * body;
> +
> + Â/* If no profile feedback data exists, don't limit unrolling Â*/
> + Âif (header_count == 0)
> + Â Âreturn 0.0;
> +
> + Âgcc_assert (loop->latch != EXIT_BLOCK_PTR);
> +
> + Âbody = get_loop_body (loop);
> + Ân = 0.0;
> + Âfor (i = 0; i < loop->num_nodes; i++)
> + Â Â{
> + Â Â Âif (EDGE_COUNT (body[i]->succs) >= 2)
> + Â Â Â Â{
> + Â Â Â Â Â/* If this block is executed less frequently than the header (loop
> + Â Â Â Â Â Â entry), then it is weighted based on the ratio of times it is
> + Â Â Â Â Â Â executed compared to the header. Â*/
> + Â Â Â Â Âif (body[i]->count < header_count)
> + Â Â Â Â Â Ân += ((float)body[i]->count)/header_count;

Please don't introduce more floating point usage into the compiler
since it could change between different hosts (sse vs x87 for an
example).
Maybe use a fixed point multiply of 1000 (note use a macro for this
special value though)  like what is used in the rest of the predict
code.


Thanks,
Andrew Pinski


> +
> + Â Â Â Â Â/* When it is executed more frequently than the header (i.e. it is
> + Â Â Â Â Â Â in a nested inner loop), simply weight the branch at 1.0. Â*/
> + Â Â Â Â Âelse
> + Â Â Â Â Â Ân += 1.0;
> + Â Â Â Â}
> + Â Â}
> + Âfree (body);
> +
> + Âreturn n;
> +}
> +
> +/* Compute the maximum number of times LOOP can be unrolled without exceeding
> + Â a branch budget, which can increase branch mispredictions. The number of
> + Â branches is computed by weighting each branch with its expected execution
> + Â probability through the loop based on profile data. If no profile feedback
> + Â data exists, simply return the current NUNROLL factor. Â*/
> +static unsigned
> +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
> +{
> + Âstruct loop *outer;
> + Âstruct niter_desc *outer_desc;
> + Âint outer_niters = 1;
> + Âfloat weighted_outer_branches = 0.0;
> + Âfloat weighted_num_branches = compute_weighted_branches (loop);
> +
> + Â/* If there was no profile feedback data, weighted_num_branches will be 0.0
> + Â Â and we won't limit unrolling. If the weighted_num_branches is at most 1.0,
> + Â Â also don't limit unrolling as the back-edge branch will not be duplicated. Â*/
> + Âif (weighted_num_branches <= 1.0)
> + Â Âreturn nunroll;
> +
> + Â/* Walk up the loop tree until we find a hot outer loop in which the current
> + Â Â loop is nested. At that point we will compute the number of times the
> + Â Â current loop can be unrolled based on the number of branches in the hot
> + Â Â outer loop. Â*/
> + Âouter = loop_outer(loop);
> + Â/* The loop structure contains a fake outermost loop, so this should always
> + Â Â be non-NULL for our current loop. Â*/
> + Âgcc_assert (outer);
> + Â/* Detect if this is the fake outermost loop (at which point we are done)
> + Â Â by checking its outer loop. Â*/
> + Âwhile (loop_outer(outer))
> + Â Â{
> + Â Â Âouter_desc = get_simple_loop_desc (outer);
> +
> + Â Â Âif (outer_desc->const_iter)
> + Â Â Â Âouter_niters *= outer_desc->niter;
> + Â Â Âelse if (outer->header->count)
> + Â Â Â Âouter_niters *= expected_loop_iterations (outer);
> +
> + Â Â Âweighted_outer_branches = compute_weighted_branches (outer);
> +
> + Â Â Â/* Should have been checked by caller. Â*/
> + Â Â Âgcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
> +
> + Â Â Â/* If the outer loop has enough iterations to be considered hot, then
> + Â Â Â Â we can stop our upwards loop tree traversal and examine the current
> + Â Â Â Â outer loop. Â*/
> + Â Â Âif (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> + Â Â Â Â{
> + Â Â Â Â Â/* Assume that any call will cause the branch budget to be exceeded,
> + Â Â Â Â Â Â and that we can't unroll the current loop without increasing
> + Â Â Â Â Â Â mispredicts. Â*/
> + Â Â Â Â Âif (loop_has_call(outer))
> + Â Â Â Â Â Âreturn 0;
> +
> + Â Â Â Â Â/* Otherwise, compute the maximum number of times current loop can be
> + Â Â Â Â Â Â unrolled without exceeding our branch budget. First we subtract
> + Â Â Â Â Â Â off the outer loop's weighted branch count from the budget. Note
> + Â Â Â Â Â Â that this includes the branches in the current loop. This yields
> + Â Â Â Â Â Â the number of branches left in the budget for the unrolled copies.
> + Â Â Â Â Â Â We divide this by the number of branches in the current loop that
> + Â Â Â Â Â Â must be duplicated when we unroll, which is the total weighted
> + Â Â Â Â Â Â number of branches minus the back-edge branch. This yields the
> + Â Â Â Â Â Â number of new loop body copies that can be created by unrolling
> + Â Â Â Â Â Â without exceeding the budget, to which we add 1 to get the unroll
> + Â Â Â Â Â Â factor. Â*/
> + Â Â Â Â Âreturn (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
> + Â Â Â Â Â Â Â Â Âweighted_outer_branches)/(weighted_num_branches - 1) + 1;
> + Â Â Â Â}
> + Â Â Âouter = loop_outer(outer);
> + Â Â}
> +
> + Â/* The current loop is not enclosed by a hot enough outer loop in this
> + Â Â procedure, since the hot outer loop is inter-procedural, assume that
> + Â Â it already contains a significant number of branches, so don't unroll. Â*/
> + Âreturn 0;
> +}
> +
> Â/* Unroll and/or peel (depending on FLAGS) LOOPS. Â*/
> Âvoid
> Âunroll_and_peel_loops (int flags)
> @@ -522,6 +696,7 @@ static void
> Âdecide_unroll_constant_iterations (struct loop *loop, int flags)
> Â{
> Â unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
> + Âunsigned nunroll_branches;
> Â struct niter_desc *desc;
>
> Â if (!(flags & UAP_UNROLL))
> @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
> Â Â Â return;
> Â Â }
>
> + Â/* Be careful when unrolling loops with branches inside -- it can increase
> + Â Â the number of mispredicts. Ignore loops with FP computation as these
> + Â Â tend to benefit much more consistently from unrolling. Â*/
> + Âif (num_loop_branches (loop) > 1
> + Â Â Â&& loop_has_FP_comp(loop)
> + Â Â Â&& PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
> + Â Â Â&& desc->niter < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> + Â Â{
> + Â Â Ânunroll_branches = max_unroll_with_branches(loop, nunroll);
> + Â Â Âif (nunroll > nunroll_branches)
> + Â Â Â Ânunroll = nunroll_branches;
> + Â Â Âif (nunroll <= 1)
> + Â Â Â Â{
> + Â Â Â Â Âif (dump_file)
> + Â Â Â Â Â fprintf (dump_file, ";; Not unrolling, contains branches\n");
> + Â Â Â Â Âreturn;
> + Â Â Â Â}
> + Â Â}
> +
> Â /* Check whether the loop rolls enough to consider. Â*/
> Â if (desc->niter < 2 * nunroll)
> Â Â {
> @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
> Âstatic void
> Âdecide_unroll_runtime_iterations (struct loop *loop, int flags)
> Â{
> - Âunsigned nunroll, nunroll_by_av, i;
> + Âunsigned nunroll, nunroll_by_av, nunroll_branches, i;
> Â struct niter_desc *desc;
>
> Â if (!(flags & UAP_UNROLL))
> @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
> Â Â Â return;
> Â Â }
>
> + Â/* Be careful when unrolling loops with branches inside -- it can increase
> + Â Â the number of mispredicts. Ignore loops with FP computation as these
> + Â Â tend to benefit much more consistently from unrolling. Â*/
> + Âif (num_loop_branches (loop) > 1
> + Â Â Â&& loop_has_FP_comp(loop)
> + Â Â Â&& PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
> + Â Â Â&& expected_loop_iterations (loop) < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> + Â Â{
> + Â Â Ânunroll_branches = max_unroll_with_branches(loop, nunroll);
> + Â Â Âif (nunroll > nunroll_branches)
> + Â Â Â Ânunroll = nunroll_branches;
> + Â Â Âif (nunroll <= 1)
> + Â Â Â Â{
> + Â Â Â Â Âif (dump_file)
> + Â Â Â Â Â Âfprintf (dump_file, ";; Not unrolling, contains branches\n");
> + Â Â Â Â Âreturn;
> + Â Â Â Â}
> + Â Â}
> +
> Â /* If we have profile feedback, check whether the loop rolls. Â*/
> Â if ((loop->header->count
> Â Â Â Â&& expected_loop_iterations (loop) < 2 * nunroll)
> Index: params.def
> ===================================================================
> --- params.def Â(revision 186783)
> +++ params.def Â(working copy)
> @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
> Â Â Â Â "The maximum depth of a loop nest we completely peel",
> Â Â Â Â 8, 0, 0)
>
> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
> + Â Â Â "min-iter-unroll-with-branches",
> + Â Â Â "Minimum iteration count to ignore branch effects when unrolling",
> + Â Â Â 50, 0, 0)
> +
> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
> + Â Â Â "unroll-outer-loop-branch-budget",
> + Â Â Â "Maximum number of branches allowed in hot outer loop region after unroll",
> + Â Â Â 25, 0, 0)
> +
> Â/* The maximum number of insns of an unswitched loop. Â*/
> ÂDEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
> Â Â Â Â"max-unswitch-insns",
>
> --
> This patch is available for review at http://codereview.appspot.com/6099055


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]