This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [4/6] Optionally pick the cheapest loop_vec_info
- From: Richard Biener <richard dot guenther at gmail dot com>
- To: Richard Sandiford <Richard dot Sandiford at arm dot com>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Wed, 6 Nov 2019 13:09:31 +0100
- Subject: Re: [4/6] Optionally pick the cheapest loop_vec_info
- References: <mptbltqtn9b.fsf@arm.com> <mpttv7is8fp.fsf@arm.com>
On Tue, Nov 5, 2019 at 3:29 PM Richard Sandiford
<Richard.Sandiford@arm.com> wrote:
>
> This patch adds a mode in which the vectoriser tries each available
> base vector mode and picks the one with the lowest cost. For now
> the behaviour is behind a default-off --param, but a later patch
> enables it by default for SVE.
>
> The patch keeps the current behaviour of preferring a VF of
> loop->simdlen over any larger or smaller VF, regardless of costs
> or target preferences.
Can you avoid using a --param for this? Instead I'd suggest to
amend the vectorize_modes target hook to return some
flags like VECT_FIRST_MODE_WINS. We'd eventually want
to make the target able to say do-not-vectorize-epiloges-of-MODE
(I think we may not want to vectorize SSE vectorized loop
epilogues with MMX-with-SSE or GPRs for example). I guess
for the latter we'd use a new target hook.
Otherwise looks reasonable.
Richard.
>
> 2019-11-05 Richard Sandiford <richard.sandiford@arm.com>
>
> gcc/
> * params.def (vect-compare-loop-costs): New param.
> * doc/invoke.texi: Document it.
> * tree-vectorizer.h (_loop_vec_info::vec_outside_cost)
> (_loop_vec_info::vec_inside_cost): New member variables.
> * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them.
> (vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions.
> (vect_analyze_loop): When the new parameter allows, try vectorizing
> the loop with each available vector mode and picking the one with
> the lowest cost.
> (vect_estimate_min_profitable_iters): Record the computed costs
> in the loop_vec_info.
>
> Index: gcc/params.def
> ===================================================================
> --- gcc/params.def 2019-10-31 17:15:25.470517368 +0000
> +++ gcc/params.def 2019-11-05 14:19:58.781197820 +0000
> @@ -661,6 +661,13 @@ DEFPARAM(PARAM_VECT_MAX_PEELING_FOR_ALIG
> "Maximum number of loop peels to enhance alignment of data references in a loop.",
> -1, -1, 64)
>
> +DEFPARAM(PARAM_VECT_COMPARE_LOOP_COSTS,
> + "vect-compare-loop-costs",
> + "Whether to try vectorizing a loop using each supported"
> + " combination of vector types and picking the version with the"
> + " lowest cost.",
> + 0, 0, 1)
> +
> DEFPARAM(PARAM_MAX_CSELIB_MEMORY_LOCATIONS,
> "max-cselib-memory-locations",
> "The maximum memory locations recorded by cselib.",
> Index: gcc/doc/invoke.texi
> ===================================================================
> --- gcc/doc/invoke.texi 2019-11-04 21:13:57.611756365 +0000
> +++ gcc/doc/invoke.texi 2019-11-05 14:19:58.777197850 +0000
> @@ -11563,6 +11563,12 @@ doing loop versioning for alias in the v
> The maximum number of loop peels to enhance access alignment
> for vectorizer. Value -1 means no limit.
>
> +@item vect-compare-loop-costs
> +Whether to try vectorizing a loop using each supported combination of
> +vector types and picking the version with the lowest cost. This parameter
> +has no effect when @option{-fno-vect-cost-model} or
> +@option{-fvect-cost-model=unlimited} are used.
> +
> @item max-iterations-to-track
> The maximum number of iterations of a loop the brute-force algorithm
> for analysis of the number of iterations of the loop tries to evaluate.
> Index: gcc/tree-vectorizer.h
> ===================================================================
> --- gcc/tree-vectorizer.h 2019-11-05 14:19:33.829371745 +0000
> +++ gcc/tree-vectorizer.h 2019-11-05 14:19:58.781197820 +0000
> @@ -601,6 +601,13 @@ typedef class _loop_vec_info : public ve
> /* Cost of a single scalar iteration. */
> int single_scalar_iteration_cost;
>
> + /* The cost of the vector prologue and epilogue, including peeled
> + iterations and set-up code. */
> + int vec_outside_cost;
> +
> + /* The cost of the vector loop body. */
> + int vec_inside_cost;
> +
> /* Is the loop vectorizable? */
> bool vectorizable;
>
> Index: gcc/tree-vect-loop.c
> ===================================================================
> --- gcc/tree-vect-loop.c 2019-11-05 14:19:33.829371745 +0000
> +++ gcc/tree-vect-loop.c 2019-11-05 14:19:58.781197820 +0000
> @@ -830,6 +830,8 @@ _loop_vec_info::_loop_vec_info (class lo
> scan_map (NULL),
> slp_unrolling_factor (1),
> single_scalar_iteration_cost (0),
> + vec_outside_cost (0),
> + vec_inside_cost (0),
> vectorizable (false),
> can_fully_mask_p (true),
> fully_masked_p (false),
> @@ -2373,6 +2375,80 @@ vect_analyze_loop_2 (loop_vec_info loop_
> goto start_over;
> }
>
> +/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
> + to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
> + OLD_LOOP_VINFO is better unless something specifically indicates
> + otherwise.
> +
> + Note that this deliberately isn't a partial order. */
> +
> +static bool
> +vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
> + loop_vec_info old_loop_vinfo)
> +{
> + struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
> + gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
> +
> + poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
> + poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
> +
> + /* Always prefer a VF of loop->simdlen over any other VF. */
> + if (loop->simdlen)
> + {
> + bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
> + bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
> + if (new_simdlen_p != old_simdlen_p)
> + return new_simdlen_p;
> + }
> +
> + /* Limit the VFs to what is likely to be the maximum number of iterations,
> + to handle cases in which at least one loop_vinfo is fully-masked. */
> + HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
> + if (estimated_max_niter != -1)
> + {
> + if (known_le (estimated_max_niter, new_vf))
> + new_vf = estimated_max_niter;
> + if (known_le (estimated_max_niter, old_vf))
> + old_vf = estimated_max_niter;
> + }
> +
> + /* Check whether the (fractional) cost per scalar iteration is lower
> + or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
> + poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
> + * poly_widest_int (old_vf));
> + poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
> + * poly_widest_int (new_vf));
> + if (maybe_lt (rel_old, rel_new))
> + return false;
> + if (known_lt (rel_new, rel_old))
> + return true;
> +
> + /* If there's nothing to choose between the loop bodies, see whether
> + there's a difference in the prologue and epilogue costs. */
> + if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
> + return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
> +
> + return false;
> +}
> +
> +/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
> + true if we should. */
> +
> +static bool
> +vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
> + loop_vec_info old_loop_vinfo)
> +{
> + if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
> + return false;
> +
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "***** Preferring vector mode %s to vector mode %s\n",
> + GET_MODE_NAME (new_loop_vinfo->vector_mode),
> + GET_MODE_NAME (old_loop_vinfo->vector_mode));
> + return true;
> +}
> +
> /* Function vect_analyze_loop.
>
> Apply a set of analyses on LOOP, and create a loop_vec_info struct
> @@ -2408,6 +2484,8 @@ vect_analyze_loop (class loop *loop, vec
> machine_mode next_vector_mode = VOIDmode;
> poly_uint64 lowest_th = 0;
> unsigned vectorized_loops = 0;
> + bool pick_lowest_cost_p = (PARAM_VALUE (PARAM_VECT_COMPARE_LOOP_COSTS)
> + && !unlimited_cost_model (loop));
>
> bool vect_epilogues = false;
> opt_result res = opt_result::success ();
> @@ -2428,6 +2506,34 @@ vect_analyze_loop (class loop *loop, vec
>
> bool fatal = false;
>
> + /* When pick_lowest_cost_p is true, we should in principle iterate
> + over all the loop_vec_infos that LOOP_VINFO could replace and
> + try to vectorize LOOP_VINFO under the same conditions.
> + E.g. when trying to replace an epilogue loop, we should vectorize
> + LOOP_VINFO as an epilogue loop with the same VF limit. When trying
> + to replace the main loop, we should vectorize LOOP_VINFO as a main
> + loop too.
> +
> + However, autovectorize_vector_modes is usually sorted as follows:
> +
> + - Modes that naturally produce lower VFs usually follow modes that
> + naturally produce higher VFs.
> +
> + - When modes naturally produce the same VF, maskable modes
> + usually follow unmaskable ones, so that the maskable mode
> + can be used to vectorize the epilogue of the unmaskable mode.
> +
> + This order is preferred because it leads to the maximum
> + epilogue vectorization opportunities. Targets should only use
> + a different order if they want to make wide modes available while
> + disparaging them relative to earlier, smaller modes. The assumption
> + in that case is that the wider modes are more expensive in some
> + way that isn't reflected directly in the costs.
> +
> + There should therefore be few interesting cases in which
> + LOOP_VINFO fails when treated as an epilogue loop, succeeds when
> + treated as a standalone loop, and ends up being genuinely cheaper
> + than FIRST_LOOP_VINFO. */
> if (vect_epilogues)
> LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
>
> @@ -2475,13 +2581,34 @@ vect_analyze_loop (class loop *loop, vec
> LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
> simdlen = 0;
> }
> + else if (pick_lowest_cost_p && first_loop_vinfo)
> + {
> + /* Keep trying to roll back vectorization attempts while the
> + loop_vec_infos they produced were worse than this one. */
> + vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
> + while (!vinfos.is_empty ()
> + && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
> + {
> + gcc_assert (vect_epilogues);
> + delete vinfos.pop ();
> + }
> + if (vinfos.is_empty ()
> + && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
> + {
> + delete first_loop_vinfo;
> + first_loop_vinfo = opt_loop_vec_info::success (NULL);
> + LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
> + }
> + }
>
> if (first_loop_vinfo == NULL)
> {
> first_loop_vinfo = loop_vinfo;
> lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
> }
> - else if (vect_epilogues)
> + else if (vect_epilogues
> + /* For now only allow one epilogue loop. */
> + && first_loop_vinfo->epilogue_vinfos.is_empty ())
> {
> first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
> poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
> @@ -2501,12 +2628,14 @@ vect_analyze_loop (class loop *loop, vec
> && loop->inner == NULL
> && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK)
> && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
> - /* For now only allow one epilogue loop. */
> - && first_loop_vinfo->epilogue_vinfos.is_empty ());
> + /* For now only allow one epilogue loop, but allow
> + pick_lowest_cost_p to replace it. */
> + && (first_loop_vinfo->epilogue_vinfos.is_empty ()
> + || pick_lowest_cost_p));
>
> /* Commit to first_loop_vinfo if we have no reason to try
> alternatives. */
> - if (!simdlen && !vect_epilogues)
> + if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
> break;
> }
> else
> @@ -3454,7 +3583,11 @@ vect_estimate_min_profitable_iters (loop
> &vec_inside_cost, &vec_epilogue_cost);
>
> vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
> -
> +
> + /* Stash the costs so that we can compare two loop_vec_infos. */
> + loop_vinfo->vec_inside_cost = vec_inside_cost;
> + loop_vinfo->vec_outside_cost = vec_outside_cost;
> +
> if (dump_enabled_p ())
> {
> dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");