This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Take branch misprediction effects into account when RTL loop unrolling (issue6099055)


Resending my response in plain text so it will go through to gcc-patches...

On Tue, Apr 24, 2012 at 2:36 PM, Teresa Johnson <tejohnson@google.com> wrote:
>
>
>
> On Tue, Apr 24, 2012 at 2:30 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>>
>> On Tue, Apr 24, 2012 at 2:26 PM, Teresa Johnson <tejohnson@google.com> wrote:
>> > This patch adds heuristics to limit unrolling in loops with branches that may increase
>> > branch mispredictions. It affects loops that are not frequently iterated, and that are
>> > nested within a hot region of code that already contains many branch instructions.
>> >
>> > Performance tested with both internal benchmarks and with SPEC 2000/2006 on a variety
>> > of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD Opteron systems.
>> > This improves performance of an internal search indexing benchmark by close to 2% on
>> > all the tested Intel platforms. ?It also consistently improves 445.gobmk (with FDO feedback
>> > where unrolling kicks in) by close to 1% on AMD Opteron. Other performance effects are
>> > neutral.
>> >
>> > Bootstrapped and tested on x86_64-unknown-linux-gnu. ?Is this ok for trunk?
>> >
>> > Thanks,
>> > Teresa
>> >
>> > 2012-04-24 ? Teresa Johnson ?<tejohnson@google.com>
>> >
>> > ? ? ? ?* loop-unroll.c (loop_has_call): New function.
>> > ? ? ? ?(loop_has_FP_comp): Ditto.
>> > ? ? ? ?(compute_weighted_branches): Ditto.
>> > ? ? ? ?(max_unroll_with_branches): Ditto.
>> > ? ? ? ?(decide_unroll_constant_iterations): Add heuristic to avoid
>> > ? ? ? ?increasing branch mispredicts when unrolling.
>> > ? ? ? ?(decide_unroll_runtime_iterations): Ditto.
>> > ? ? ? ?* params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>> > ? ? ? ?(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>> >
>> > Index: loop-unroll.c
>> > ===================================================================
>> > --- loop-unroll.c ? ? ? (revision 186783)
>> > +++ loop-unroll.c ? ? ? (working copy)
>> > @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
>> > ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? basic_block);
>> > ?static rtx get_expansion (struct var_to_expand *);
>> >
>> > +/* Determine whether LOOP contains call. ?*/
>> > +static bool
>> > +loop_has_call(struct loop *loop)
>> > +{
>> > + ?basic_block *body, bb;
>> > + ?unsigned i;
>> > + ?rtx insn;
>> > +
>> > + ?body = get_loop_body (loop);
>> > + ?for (i = 0; i < loop->num_nodes; i++)
>> > + ? ?{
>> > + ? ? ?bb = body[i];
>> > +
>> > + ? ? ?FOR_BB_INSNS (bb, insn)
>> > + ? ? ? ?{
>> > + ? ? ? ? ?if (CALL_P (insn))
>> > + ? ? ? ? ? ?{
>> > + ? ? ? ? ? ? ?free (body);
>> > + ? ? ? ? ? ? ?return true;
>> > + ? ? ? ? ? ?}
>> > + ? ? ? ?}
>> > + ? ?}
>> > + ?free (body);
>> > + ?return false;
>> > +}
>> > +
>> > +/* Determine whether LOOP contains floating-point computation. ?*/
>> > +static bool
>> > +loop_has_FP_comp(struct loop *loop)
>> > +{
>> > + ?rtx set, dest;
>> > + ?basic_block *body, bb;
>> > + ?unsigned i;
>> > + ?rtx insn;
>> > +
>> > + ?body = get_loop_body (loop);
>> > + ?for (i = 0; i < loop->num_nodes; i++)
>> > + ? ?{
>> > + ? ? ?bb = body[i];
>> > +
>> > + ? ? ?FOR_BB_INSNS (bb, insn)
>> > + ? ? ? ?{
>> > + ? ? ? ? ?set = single_set (insn);
>> > + ? ? ? ? ?if (!set)
>> > + ? ? ? ? ? ?continue;
>> > +
>> > + ? ? ? ? ?dest = SET_DEST (set);
>> > + ? ? ? ? ?if (FLOAT_MODE_P (GET_MODE (dest)))
>> > + ? ? ? ? ? ?{
>> > + ? ? ? ? ? ? ?free (body);
>> > + ? ? ? ? ? ? ?return true;
>> > + ? ? ? ? ? ?}
>> > + ? ? ? ?}
>> > + ? ?}
>> > + ?free (body);
>> > + ?return false;
>> > +}
>> > +
>> > +/* Compute the number of branches in LOOP, weighted by execution counts. ?*/
>> > +static float
>> > +compute_weighted_branches(struct loop *loop)
>> > +{
>> > + ?int header_count = loop->header->count;
>> > + ?unsigned i;
>> > + ?float n;
>> > + ?basic_block * body;
>> > +
>> > + ?/* If no profile feedback data exists, don't limit unrolling ?*/
>> > + ?if (header_count == 0)
>> > + ? ?return 0.0;
>> > +
>> > + ?gcc_assert (loop->latch != EXIT_BLOCK_PTR);
>> > +
>> > + ?body = get_loop_body (loop);
>> > + ?n = 0.0;
>> > + ?for (i = 0; i < loop->num_nodes; i++)
>> > + ? ?{
>> > + ? ? ?if (EDGE_COUNT (body[i]->succs) >= 2)
>> > + ? ? ? ?{
>> > + ? ? ? ? ?/* If this block is executed less frequently than the header (loop
>> > + ? ? ? ? ? ? entry), then it is weighted based on the ratio of times it is
>> > + ? ? ? ? ? ? executed compared to the header. ?*/
>> > + ? ? ? ? ?if (body[i]->count < header_count)
>> > + ? ? ? ? ? ?n += ((float)body[i]->count)/header_count;
>>
>> Please don't introduce more floating point usage into the compiler
>> since it could change between different hosts (sse vs x87 for an
>> example).
>> Maybe use a fixed point multiply of 1000 (note use a macro for this
>> special value though) ?like what is used in the rest of the predict
>> code.
>
>
> Ok, got it. I will address this in the next version of the patch.
>
> Thanks,
> Teresa
>
>>
>>
>>
>> Thanks,
>> Andrew Pinski
>>
>>
>> > +
>> > + ? ? ? ? ?/* When it is executed more frequently than the header (i.e. it is
>> > + ? ? ? ? ? ? in a nested inner loop), simply weight the branch at 1.0. ?*/
>> > + ? ? ? ? ?else
>> > + ? ? ? ? ? ?n += 1.0;
>> > + ? ? ? ?}
>> > + ? ?}
>> > + ?free (body);
>> > +
>> > + ?return n;
>> > +}
>> > +
>> > +/* Compute the maximum number of times LOOP can be unrolled without exceeding
>> > + ? a branch budget, which can increase branch mispredictions. The number of
>> > + ? branches is computed by weighting each branch with its expected execution
>> > + ? probability through the loop based on profile data. If no profile feedback
>> > + ? data exists, simply return the current NUNROLL factor. ?*/
>> > +static unsigned
>> > +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
>> > +{
>> > + ?struct loop *outer;
>> > + ?struct niter_desc *outer_desc;
>> > + ?int outer_niters = 1;
>> > + ?float weighted_outer_branches = 0.0;
>> > + ?float weighted_num_branches = compute_weighted_branches (loop);
>> > +
>> > + ?/* If there was no profile feedback data, weighted_num_branches will be 0.0
>> > + ? ? and we won't limit unrolling. If the weighted_num_branches is at most 1.0,
>> > + ? ? also don't limit unrolling as the back-edge branch will not be duplicated. ?*/
>> > + ?if (weighted_num_branches <= 1.0)
>> > + ? ?return nunroll;
>> > +
>> > + ?/* Walk up the loop tree until we find a hot outer loop in which the current
>> > + ? ? loop is nested. At that point we will compute the number of times the
>> > + ? ? current loop can be unrolled based on the number of branches in the hot
>> > + ? ? outer loop. ?*/
>> > + ?outer = loop_outer(loop);
>> > + ?/* The loop structure contains a fake outermost loop, so this should always
>> > + ? ? be non-NULL for our current loop. ?*/
>> > + ?gcc_assert (outer);
>> > + ?/* Detect if this is the fake outermost loop (at which point we are done)
>> > + ? ? by checking its outer loop. ?*/
>> > + ?while (loop_outer(outer))
>> > + ? ?{
>> > + ? ? ?outer_desc = get_simple_loop_desc (outer);
>> > +
>> > + ? ? ?if (outer_desc->const_iter)
>> > + ? ? ? ?outer_niters *= outer_desc->niter;
>> > + ? ? ?else if (outer->header->count)
>> > + ? ? ? ?outer_niters *= expected_loop_iterations (outer);
>> > +
>> > + ? ? ?weighted_outer_branches = compute_weighted_branches (outer);
>> > +
>> > + ? ? ?/* Should have been checked by caller. ?*/
>> > + ? ? ?gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
>> > +
>> > + ? ? ?/* If the outer loop has enough iterations to be considered hot, then
>> > + ? ? ? ? we can stop our upwards loop tree traversal and examine the current
>> > + ? ? ? ? outer loop. ?*/
>> > + ? ? ?if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> > + ? ? ? ?{
>> > + ? ? ? ? ?/* Assume that any call will cause the branch budget to be exceeded,
>> > + ? ? ? ? ? ? and that we can't unroll the current loop without increasing
>> > + ? ? ? ? ? ? mispredicts. ?*/
>> > + ? ? ? ? ?if (loop_has_call(outer))
>> > + ? ? ? ? ? ?return 0;
>> > +
>> > + ? ? ? ? ?/* Otherwise, compute the maximum number of times current loop can be
>> > + ? ? ? ? ? ? unrolled without exceeding our branch budget. First we subtract
>> > + ? ? ? ? ? ? off the outer loop's weighted branch count from the budget. Note
>> > + ? ? ? ? ? ? that this includes the branches in the current loop. This yields
>> > + ? ? ? ? ? ? the number of branches left in the budget for the unrolled copies.
>> > + ? ? ? ? ? ? We divide this by the number of branches in the current loop that
>> > + ? ? ? ? ? ? must be duplicated when we unroll, which is the total weighted
>> > + ? ? ? ? ? ? number of branches minus the back-edge branch. This yields the
>> > + ? ? ? ? ? ? number of new loop body copies that can be created by unrolling
>> > + ? ? ? ? ? ? without exceeding the budget, to which we add 1 to get the unroll
>> > + ? ? ? ? ? ? factor. ?*/
>> > + ? ? ? ? ?return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
>> > + ? ? ? ? ? ? ? ? ?weighted_outer_branches)/(weighted_num_branches - 1) + 1;
>> > + ? ? ? ?}
>> > + ? ? ?outer = loop_outer(outer);
>> > + ? ?}
>> > +
>> > + ?/* The current loop is not enclosed by a hot enough outer loop in this
>> > + ? ? procedure, since the hot outer loop is inter-procedural, assume that
>> > + ? ? it already contains a significant number of branches, so don't unroll. ?*/
>> > + ?return 0;
>> > +}
>> > +
>> > ?/* Unroll and/or peel (depending on FLAGS) LOOPS. ?*/
>> > ?void
>> > ?unroll_and_peel_loops (int flags)
>> > @@ -522,6 +696,7 @@ static void
>> > ?decide_unroll_constant_iterations (struct loop *loop, int flags)
>> > ?{
>> > ? unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
>> > + ?unsigned nunroll_branches;
>> > ? struct niter_desc *desc;
>> >
>> > ? if (!(flags & UAP_UNROLL))
>> > @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
>> > ? ? ? return;
>> > ? ? }
>> >
>> > + ?/* Be careful when unrolling loops with branches inside -- it can increase
>> > + ? ? the number of mispredicts. Ignore loops with FP computation as these
>> > + ? ? tend to benefit much more consistently from unrolling. ?*/
>> > + ?if (num_loop_branches (loop) > 1
>> > + ? ? ?&& loop_has_FP_comp(loop)
>> > + ? ? ?&& PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
>> > + ? ? ?&& desc->niter < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> > + ? ?{
>> > + ? ? ?nunroll_branches = max_unroll_with_branches(loop, nunroll);
>> > + ? ? ?if (nunroll > nunroll_branches)
>> > + ? ? ? ?nunroll = nunroll_branches;
>> > + ? ? ?if (nunroll <= 1)
>> > + ? ? ? ?{
>> > + ? ? ? ? ?if (dump_file)
>> > + ? ? ? ? ? fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> > + ? ? ? ? ?return;
>> > + ? ? ? ?}
>> > + ? ?}
>> > +
>> > ? /* Check whether the loop rolls enough to consider. ?*/
>> > ? if (desc->niter < 2 * nunroll)
>> > ? ? {
>> > @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
>> > ?static void
>> > ?decide_unroll_runtime_iterations (struct loop *loop, int flags)
>> > ?{
>> > - ?unsigned nunroll, nunroll_by_av, i;
>> > + ?unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>> > ? struct niter_desc *desc;
>> >
>> > ? if (!(flags & UAP_UNROLL))
>> > @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
>> > ? ? ? return;
>> > ? ? }
>> >
>> > + ?/* Be careful when unrolling loops with branches inside -- it can increase
>> > + ? ? the number of mispredicts. Ignore loops with FP computation as these
>> > + ? ? tend to benefit much more consistently from unrolling. ?*/
>> > + ?if (num_loop_branches (loop) > 1
>> > + ? ? ?&& loop_has_FP_comp(loop)
>> > + ? ? ?&& PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
>> > + ? ? ?&& expected_loop_iterations (loop) < (unsigned) PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> > + ? ?{
>> > + ? ? ?nunroll_branches = max_unroll_with_branches(loop, nunroll);
>> > + ? ? ?if (nunroll > nunroll_branches)
>> > + ? ? ? ?nunroll = nunroll_branches;
>> > + ? ? ?if (nunroll <= 1)
>> > + ? ? ? ?{
>> > + ? ? ? ? ?if (dump_file)
>> > + ? ? ? ? ? ?fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> > + ? ? ? ? ?return;
>> > + ? ? ? ?}
>> > + ? ?}
>> > +
>> > ? /* If we have profile feedback, check whether the loop rolls. ?*/
>> > ? if ((loop->header->count
>> > ? ? ? ?&& expected_loop_iterations (loop) < 2 * nunroll)
>> > Index: params.def
>> > ===================================================================
>> > --- params.def ?(revision 186783)
>> > +++ params.def ?(working copy)
>> > @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>> > ? ? ? ? "The maximum depth of a loop nest we completely peel",
>> > ? ? ? ? 8, 0, 0)
>> >
>> > +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
>> > + ? ? ? "min-iter-unroll-with-branches",
>> > + ? ? ? "Minimum iteration count to ignore branch effects when unrolling",
>> > + ? ? ? 50, 0, 0)
>> > +
>> > +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
>> > + ? ? ? "unroll-outer-loop-branch-budget",
>> > + ? ? ? "Maximum number of branches allowed in hot outer loop region after unroll",
>> > + ? ? ? 25, 0, 0)
>> > +
>> > ?/* The maximum number of insns of an unswitched loop. ?*/
>> > ?DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>> > ? ? ? ?"max-unswitch-insns",
>> >
>> > --
>> > This patch is available for review at http://codereview.appspot.com/6099055
>
>
>
>
> --
> Teresa Johnson?|?Software Engineer?| ?tejohnson@google.com?| ?408-460-2413
>



--
Teresa Johnson?|?Software Engineer?|?tejohnson@google.com?|?408-460-2413


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]