This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: Add an alternative vector loop iv mechanism
Richard Biener <richard.guenther@gmail.com> writes:
> On Fri, Oct 13, 2017 at 4:10 PM, Richard Sandiford
> <richard.sandiford@linaro.org> wrote:
>> Normally we adjust the vector loop so that it iterates:
>>
>> (original number of scalar iterations - number of peels) / VF
>>
>> times, enforcing this using an IV that starts at zero and increments
>> by one each iteration. However, dividing by VF would be expensive
>> for variable VF, so this patch adds an alternative in which the IV
>> increments by VF each iteration instead. We then need to take care
>> to handle possible overflow in the IV.
>
> Hmm, why do you need to handle possible overflow? Doesn't the
> original loop have a natural IV that evolves like this? After all we
> can compute an expression for niters of the scalar loop.
The problem comes with loops like:
unsigned char i = 0;
do
{
...
i--;
}
while (i != 0);
The loop statements execute 256 times and the latch executes 255 times.
LOOP_VINFO_NITERSM1 is then 255 but LOOP_VINFO_NITERS (stored as an
unsigned char) is 0.
This leads to things like:
/* Constant case. */
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
return true;
}
in loop_niters_no_overflow.
>> The new mechanism isn't used yet; a later patch replaces the
>> "if (1)" with a check for variable VF. If the patch is OK, I'll
>> hold off applying it until the follow-on is ready to go in.
>
> I indeed don't like code that isn't exercised. Otherwise looks reasonable.
Thanks.
Richard
> Thanks,
> Richard.
>
>> Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64-linux-gnu.
>> OK to install when the time comes?
>>
>> Richard
>>
>>
>> 2017-10-13 Richard Sandiford <richard.sandiford@linaro.org>
>>
>> gcc/
>> * tree-vect-loop-manip.c: Include gimple-fold.h.
>> (slpeel_make_loop_iterate_ntimes): Add step, final_iv and
>> niters_maybe_zero parameters. Handle other cases besides a step of 1.
>> (vect_gen_vector_loop_niters): Add a step_vector_ptr parameter.
>> Add a path that uses a step of VF instead of 1, but disable it
>> for now.
>> (vect_do_peeling): Add step_vector, niters_vector_mult_vf_var
>> and niters_no_overflow parameters. Update calls to
>> slpeel_make_loop_iterate_ntimes and vect_gen_vector_loop_niters.
>> Create a new SSA name if the latter choses to use a ste other
>> than zero, and return it via niters_vector_mult_vf_var.
>> * tree-vect-loop.c (vect_transform_loop): Update calls to
>> vect_do_peeling, vect_gen_vector_loop_niters and
>> slpeel_make_loop_iterate_ntimes.
>> * tree-vectorizer.h (slpeel_make_loop_iterate_ntimes, vect_do_peeling)
>> (vect_gen_vector_loop_niters): Update declarations after above
> changes.
>>
>> Index: gcc/tree-vect-loop-manip.c
>> ===================================================================
>> --- gcc/tree-vect-loop-manip.c 2017-10-13 15:01:40.144777367 +0100
>> +++ gcc/tree-vect-loop-manip.c 2017-10-13 15:01:40.296014347 +0100
>> @@ -41,6 +41,7 @@ Software Foundation; either version 3, o
>> #include "tree-scalar-evolution.h"
>> #include "tree-vectorizer.h"
>> #include "tree-ssa-loop-ivopts.h"
>> +#include "gimple-fold.h"
>>
>> /*************************************************************************
>> Simple Loop Peeling Utilities
>> @@ -247,30 +248,115 @@ adjust_phi_and_debug_stmts (gimple *upda
>> gimple_bb (update_phi));
>> }
>>
>> -/* Make the LOOP iterate NITERS times. This is done by adding a new IV
>> - that starts at zero, increases by one and its limit is NITERS.
>> +/* Make LOOP iterate N == (NITERS - STEP) / STEP + 1 times,
>> + where NITERS is known to be outside the range [1, STEP - 1].
>> + This is equivalent to making the loop execute NITERS / STEP
>> + times when NITERS is nonzero and (1 << M) / STEP times otherwise,
>> + where M is the precision of NITERS.
>> +
>> + NITERS_MAYBE_ZERO is true if NITERS can be zero, false it is known
>> + to be >= STEP. In the latter case N is always NITERS / STEP.
>> +
>> + If FINAL_IV is nonnull, it is an SSA name that should be set to
>> + N * STEP on exit from the loop.
>>
>> Assumption: the exit-condition of LOOP is the last stmt in the loop. */
>>
>> void
>> -slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
>> +slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters, tree step,
>> + tree final_iv, bool niters_maybe_zero)
>> {
>> tree indx_before_incr, indx_after_incr;
>> gcond *cond_stmt;
>> gcond *orig_cond;
>> + edge pe = loop_preheader_edge (loop);
>> edge exit_edge = single_exit (loop);
>> gimple_stmt_iterator loop_cond_gsi;
>> gimple_stmt_iterator incr_gsi;
>> bool insert_after;
>> - tree init = build_int_cst (TREE_TYPE (niters), 0);
>> - tree step = build_int_cst (TREE_TYPE (niters), 1);
>> source_location loop_loc;
>> enum tree_code code;
>> + tree niters_type = TREE_TYPE (niters);
>>
>> orig_cond = get_loop_exit_condition (loop);
>> gcc_assert (orig_cond);
>> loop_cond_gsi = gsi_for_stmt (orig_cond);
>>
>> + tree init, limit;
>> + if (!niters_maybe_zero && integer_onep (step))
>> + {
>> + /* In this case we can use a simple 0-based IV:
>> +
>> + A:
>> + x = 0;
>> + do
>> + {
>> + ...
>> + x += 1;
>> + }
>> + while (x < NITERS); */
>> + code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
>> + init = build_zero_cst (niters_type);
>> + limit = niters;
>> + }
>> + else
>> + {
>> + /* The following works for all values of NITERS except 0:
>> +
>> + B:
>> + x = 0;
>> + do
>> + {
>> + ...
>> + x += STEP;
>> + }
>> + while (x <= NITERS - STEP);
>> +
>> + so that the loop continues to iterate if x + STEP - 1 < NITERS
>> + but stops if x + STEP - 1 >= NITERS.
>> +
>> + However, if NITERS is zero, x never hits a value above NITERS - STEP
>> + before wrapping around. There are two obvious ways of dealing with
>> + this:
>> +
>> + - start at STEP - 1 and compare x before incrementing it
>> + - start at -1 and compare x after incrementing it
>> +
>> + The latter is simpler and is what we use. The loop in this case
>> + looks like:
>> +
>> + C:
>> + x = -1;
>> + do
>> + {
>> + ...
>> + x += STEP;
>> + }
>> + while (x < NITERS - STEP);
>> +
>> + In both cases the loop limit is NITERS - STEP. */
>> + gimple_seq seq = NULL;
>> + limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
>> + limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit,
> step);
>> + if (seq)
>> + {
>> + basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
>> + gcc_assert (!new_bb);
>> + }
>> + if (niters_maybe_zero)
>> + {
>> + /* Case C. */
>> + code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
>> + init = build_all_ones_cst (niters_type);
>> + }
>> + else
>> + {
>> + /* Case B. */
>> + code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
>> + init = build_zero_cst (niters_type);
>> + }
>> + }
>> +
>> standard_iv_increment_position (loop, &incr_gsi, &insert_after);
>> create_iv (init, step, NULL_TREE, loop,
>> &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
>> @@ -278,11 +364,10 @@ slpeel_make_loop_iterate_ntimes (struct
>> indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi,
> indx_after_incr,
>> true, NULL_TREE, true,
>> GSI_SAME_STMT);
>> - niters = force_gimple_operand_gsi (&loop_cond_gsi, niters, true, NULL_TREE,
>> + limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
>> true, GSI_SAME_STMT);
>>
>> - code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
>> - cond_stmt = gimple_build_cond (code, indx_after_incr, niters, NULL_TREE,
>> + cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
>> NULL_TREE);
>>
>> gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
>> @@ -301,8 +386,23 @@ slpeel_make_loop_iterate_ntimes (struct
>> }
>>
>> /* Record the number of latch iterations. */
>> - loop->nb_iterations = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), niters,
>> - build_int_cst (TREE_TYPE (niters), 1));
>> + if (limit == niters)
>> + /* Case A: the loop iterates NITERS times. Subtract one to get the
>> + latch count. */
>> + loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
>> + build_int_cst (niters_type, 1));
>> + else
>> + /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
>> + Subtract one from this to get the latch count. */
>> + loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
>> + limit, step);
>> +
>> + if (final_iv)
>> + {
>> + gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
>> + indx_after_incr, init);
>> + gsi_insert_on_edge_immediate (single_exit (loop), assign);
>> + }
>> }
>>
>> /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
>> @@ -1170,23 +1270,32 @@ vect_gen_scalar_loop_niters (tree niters
>> return niters;
>> }
>>
>> -/* This function generates the following statements:
>> +/* NITERS is the number of times that the original scalar loop executes
>> + after peeling. Work out the maximum number of iterations N that can
>> + be handled by the vectorized form of the loop and then either:
>> +
>> + a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
>> +
>> + niters_vector = N
>> +
>> + b) set *STEP_VECTOR_PTR to one and generate:
>>
>> - niters = number of iterations loop executes (after peeling)
>> - niters_vector = niters / vf
>> + niters_vector = N / vf
>>
>> - and places them on the loop preheader edge. NITERS_NO_OVERFLOW is
>> - true if NITERS doesn't overflow. */
>> + In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
>> + any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
>> + is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
>>
>> void
>> vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
>> - tree *niters_vector_ptr, bool niters_no_overflow)
>> + tree *niters_vector_ptr, tree *step_vector_ptr,
>> + bool niters_no_overflow)
>> {
>> tree ni_minus_gap, var;
>> - tree niters_vector, type = TREE_TYPE (niters);
>> + tree niters_vector, step_vector, type = TREE_TYPE (niters);
>> int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>> edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
>> - tree log_vf = build_int_cst (type, exact_log2 (vf));
>> + tree log_vf = NULL_TREE;
>>
>> /* If epilogue loop is required because of data accesses with gaps, we
>> subtract one iteration from the total number of iterations here for
>> @@ -1207,21 +1316,32 @@ vect_gen_vector_loop_niters (loop_vec_in
>> else
>> ni_minus_gap = niters;
>>
>> - /* Create: niters >> log2(vf) */
>> - /* If it's known that niters == number of latch executions + 1 doesn't
>> - overflow, we can generate niters >> log2(vf); otherwise we generate
>> - (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
>> - will be at least one. */
>> - if (niters_no_overflow)
>> - niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
>> + if (1)
>> + {
>> + /* Create: niters >> log2(vf) */
>> + /* If it's known that niters == number of latch executions + 1 doesn't
>> + overflow, we can generate niters >> log2(vf); otherwise we generate
>> + (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
>> + will be at least one. */
>> + log_vf = build_int_cst (type, exact_log2 (vf));
>> + if (niters_no_overflow)
>> + niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
>> + else
>> + niters_vector
>> + = fold_build2 (PLUS_EXPR, type,
>> + fold_build2 (RSHIFT_EXPR, type,
>> + fold_build2 (MINUS_EXPR, type,
>> + ni_minus_gap,
>> + build_int_cst (type, vf)),
>> + log_vf),
>> + build_int_cst (type, 1));
>> + step_vector = build_one_cst (type);
>> + }
>> else
>> - niters_vector
>> - = fold_build2 (PLUS_EXPR, type,
>> - fold_build2 (RSHIFT_EXPR, type,
>> - fold_build2 (MINUS_EXPR, type, ni_minus_gap,
>> - build_int_cst (type, vf)),
>> - log_vf),
>> - build_int_cst (type, 1));
>> + {
>> + niters_vector = ni_minus_gap;
>> + step_vector = build_int_cst (type, vf);
>> + }
>>
>> if (!is_gimple_val (niters_vector))
>> {
>> @@ -1231,7 +1351,7 @@ vect_gen_vector_loop_niters (loop_vec_in
>> gsi_insert_seq_on_edge_immediate (pe, stmts);
>> /* Peeling algorithm guarantees that vector loop bound is at least ONE,
>> we set range information to make niters analyzer's life easier. */
>> - if (stmts != NULL)
>> + if (stmts != NULL && log_vf)
>> set_range_info (niters_vector, VR_RANGE,
>> wi::to_wide (build_int_cst (type, 1)),
>> wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
>> @@ -1239,6 +1359,7 @@ vect_gen_vector_loop_niters (loop_vec_in
>> log_vf)));
>> }
>> *niters_vector_ptr = niters_vector;
>> + *step_vector_ptr = step_vector;
>>
>> return;
>> }
>> @@ -1600,7 +1721,12 @@ slpeel_update_phi_nodes_for_lcssa (struc
>> - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
>> CHECK_PROFITABILITY is true.
>> Output:
>> - - NITERS_VECTOR: The number of iterations of loop after vectorization.
>> + - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
>> + iterate after vectorization; see slpeel_make_loop_iterate_ntimes
>> + for details.
>> + - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
>> + should be set to the number of scalar iterations handled by the
>> + vector loop. The SSA name is only used on exit from the loop.
>>
>> This function peels prolog and epilog from the loop, adds guards skipping
>> PROLOG and EPILOG for various conditions. As a result, the changed CFG
>> @@ -1657,8 +1783,9 @@ slpeel_update_phi_nodes_for_lcssa (struc
>>
>> struct loop *
>> vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>> - tree *niters_vector, int th, bool check_profitability,
>> - bool niters_no_overflow)
>> + tree *niters_vector, tree *step_vector,
>> + tree *niters_vector_mult_vf_var, int th,
>> + bool check_profitability, bool niters_no_overflow)
>> {
>> edge e, guard_e;
>> tree type = TREE_TYPE (niters), guard_cond;
>> @@ -1754,7 +1881,9 @@ vect_do_peeling (loop_vec_info loop_vinf
>> /* Generate and update the number of iterations for prolog loop. */
>> niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
>> &bound_prolog);
>> - slpeel_make_loop_iterate_ntimes (prolog, niters_prolog);
>> + tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
>> + slpeel_make_loop_iterate_ntimes (prolog, niters_prolog, step_prolog,
>> + NULL_TREE, false);
>>
>> /* Skip the prolog loop. */
>> if (skip_prolog)
>> @@ -1867,9 +1996,20 @@ vect_do_peeling (loop_vec_info loop_vinf
>> overflows. */
>> niters_no_overflow |= (prolog_peeling > 0);
>> vect_gen_vector_loop_niters (loop_vinfo, niters,
>> - niters_vector, niters_no_overflow);
>> - vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
>> - &niters_vector_mult_vf);
>> + niters_vector, step_vector,
>> + niters_no_overflow);
>> + if (!integer_onep (*step_vector))
>> + {
>> + /* On exit from the loop we will have an easy way of calcalating
>> + NITERS_VECTOR / STEP * STEP. Install a dummy definition
>> + until then. */
>> + niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
>> + SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
>> + *niters_vector_mult_vf_var = niters_vector_mult_vf;
>> + }
>> + else
>> + vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
>> + &niters_vector_mult_vf);
>> /* Update IVs of original loop as if they were advanced by
>> niters_vector_mult_vf steps. */
>> gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
>> Index: gcc/tree-vect-loop.c
>> ===================================================================
>> --- gcc/tree-vect-loop.c 2017-10-13 15:01:40.144777367 +0100
>> +++ gcc/tree-vect-loop.c 2017-10-13 15:01:40.296014347 +0100
>> @@ -7273,7 +7273,9 @@ vect_transform_loop (loop_vec_info loop_
>> basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
>> int nbbs = loop->num_nodes;
>> int i;
>> - tree niters_vector = NULL;
>> + tree niters_vector = NULL_TREE;
>> + tree step_vector = NULL_TREE;
>> + tree niters_vector_mult_vf = NULL_TREE;
>> int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>> bool grouped_store;
>> bool slp_scheduled = false;
>> @@ -7342,17 +7344,21 @@ vect_transform_loop (loop_vec_info loop_
>> LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
>> tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
>> bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
>> - epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1,
> &niters_vector, th,
>> + epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
>> + &step_vector, &niters_vector_mult_vf, th,
>> check_profitability, niters_no_overflow);
>> if (niters_vector == NULL_TREE)
>> {
>> if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
>> - niters_vector
>> - = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
>> - LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
>> + {
>> + niters_vector
>> + = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
>> + LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
>> + step_vector = build_one_cst (TREE_TYPE (niters));
>> + }
>> else
>> vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
>> - niters_no_overflow);
>> + &step_vector, niters_no_overflow);
>> }
>>
>> /* 1) Make sure the loop header has exactly two entries
>> @@ -7603,7 +7609,13 @@ vect_transform_loop (loop_vec_info loop_
>> } /* stmts in BB */
>> } /* BBs in loop */
>>
>> - slpeel_make_loop_iterate_ntimes (loop, niters_vector);
>> + /* The vectorization factor is always > 1, so if we use an IV
> increment of 1.
>> + a zero NITERS becomes a nonzero NITERS_VECTOR. */
>> + if (integer_onep (step_vector))
>> + niters_no_overflow = true;
>> + slpeel_make_loop_iterate_ntimes (loop, niters_vector, step_vector,
>> + niters_vector_mult_vf,
>> + !niters_no_overflow);
>>
>> scale_profile_for_vect_loop (loop, vf);
>>
>> Index: gcc/tree-vectorizer.h
>> ===================================================================
>> --- gcc/tree-vectorizer.h 2017-10-13 15:01:40.144777367 +0100
>> +++ gcc/tree-vectorizer.h 2017-10-13 15:01:40.296014347 +0100
>> @@ -1138,13 +1138,14 @@ vect_get_scalar_dr_size (struct data_ref
>>
>> /* Simple loop peeling and versioning utilities for vectorizer's purposes -
>> in tree-vect-loop-manip.c. */
>> -extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree);
>> +extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree, tree,
>> + tree, bool);
>> extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
>> struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *,
>> struct loop *, edge);
>> extern void vect_loop_versioning (loop_vec_info, unsigned int, bool);
>> extern struct loop *vect_do_peeling (loop_vec_info, tree, tree,
>> - tree *, int, bool, bool);
>> + tree *, tree *, tree *, int, bool, bool);
>> extern source_location find_loop_location (struct loop *);
>> extern bool vect_can_advance_ivs_p (loop_vec_info);
>>
>> @@ -1258,7 +1259,8 @@ extern gimple *vect_force_simple_reducti
>> /* Drive for loop analysis stage. */
>> extern loop_vec_info vect_analyze_loop (struct loop *, loop_vec_info);
>> extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
>> -extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *, bool);
>> +extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
>> + tree *, bool);
>> /* Drive for loop transformation stage. */
>> extern struct loop *vect_transform_loop (loop_vec_info);
>> extern loop_vec_info vect_analyze_loop_form (struct loop *);