Re: Add support for in-order addition reduction using SVE FADDA
Richard Biener <richard.guenther@gmail.com> writes:
> On Mon, Nov 20, 2017 at 1:54 PM, Richard Sandiford
> <richard.sandiford@linaro.org> wrote:
>> Richard Biener <richard.guenther@gmail.com> writes:
>>> On Fri, Nov 17, 2017 at 5:53 PM, Richard Sandiford
>>> <richard.sandiford@linaro.org> wrote:
>>>> This patch adds support for in-order floating-point addition reductions,
>>>> which are suitable even in strict IEEE mode.
>>>>
>>>> Previously vect_is_simple_reduction would reject any cases that forbid
>>>> reassociation. The idea is instead to tentatively accept them as
>>>> "FOLD_LEFT_REDUCTIONs" and only fail later if there is no target
>>>> support for them. Although this patch only handles the particular
>>>> case of plus and minus on floating-point types, there's no reason in
>>>> principle why targets couldn't handle other cases.
>>>>
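For concreteness, here is the kind of loop this is about (an illustrative
example, not taken from the patch).  Under strict IEEE semantics the
additions cannot be reassociated, so the usual tree-shaped reduction is not
allowed; an in-order (fold-left) reduction keeps the original evaluation
order:

    /* Illustrative in-order reduction: without -ffast-math, the additions
       must happen in source order, because floating-point addition is not
       associative.  */
    double
    sum (const double *a, int n)
    {
      double res = 0.0;
      for (int i = 0; i < n; ++i)
        res += a[i];
      return res;
    }
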
>>>> The vect_force_simple_reduction change makes it simpler for parloops
>>>> to read the type of reduction.
>>>>
>>>> Tested on aarch64-linux-gnu (with and without SVE), x86_64-linux-gnu
>>>> and powerpc64le-linux-gnu. OK to install?
>>>
>>> I don't like that you add a new tree code for this. A new IFN looks more
>>> suitable to me.
>>
>> OK.
>
> Thanks. I'd like to eventually get rid of other vectorizer tree codes as well,
> like the REDUC_*_EXPR, DOT_PROD_EXPR and SAD_EXPR. IFNs
> are now really the way to go for "target instructions on GIMPLE".
>
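As a sketch of what that looks like (my illustration, not code from this
series): the operation appears in the IL as a call to a dot-prefixed
internal function, built with the usual call-building API.  Here acc, vec,
scalar_type and gsi are placeholder names, and IFN_FOLD_LEFT_PLUS is the
internal function that a later patch in the series is expected to add:

    /* Sketch only: emit "res = .FOLD_LEFT_PLUS (acc, vec)" on GIMPLE.  */
    gcall *call = gimple_build_call_internal (IFN_FOLD_LEFT_PLUS, 2, acc, vec);
    tree res = make_ssa_name (scalar_type);
    gimple_call_set_lhs (call, res);
    gsi_insert_before (gsi, call, GSI_SAME_STMT);
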
>>> Also, I think that if there's a way to handle this correctly with target
>>> support, you could also implement a fallback for when there is no such
>>> support, increasing test coverage.  It would basically boil down to
>>> extracting all the scalars from the non-reduction operand vector and
>>> performing a series of reduction ops, keeping the reduction PHI scalar.
>>> This would also support any reduction operator.
>>
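To spell out the suggested fallback (illustrative C only, not the GIMPLE the
vectorizer would actually emit): for, say, a 4-element vector, the epilogue
would extract each element and apply the reduction operation in order,
keeping the accumulator scalar throughout.  Any reduction operator could
replace the addition here:

    /* Hypothetical scalar expansion of acc = FOLD_LEFT_PLUS (acc, v)
       for a 4-element vector.  */
    static double
    fold_left_plus_4 (double acc, const double v[4])
    {
      acc = acc + v[0];
      acc = acc + v[1];
      acc = acc + v[2];
      acc = acc + v[3];
      return acc;
    }
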
>> Yeah, but without target support, that's probably going to be expensive.
>> It's a bit like how we can implement element-by-element loads and stores
>> for cases that don't have target support, but have had to disable that
>> explicitly in many cases because the cost model was too optimistic.
>
> I expect that for V2DF or even V4DF it might be profitable in quite a number
> of cases. V2DF definitely.
>
>> I can give it a go anyway if you think it's worth it.
>
> I think it is.
OK, here's 2/3. It just splits out some code for reuse in 3/3.
Tested as before.
Thanks,
Richard
2017-11-21  Richard Sandiford  <richard.sandiford@linaro.org>

gcc/
	* tree-vect-loop.c (vect_extract_elements, vect_expand_fold_left): New
	functions, split out from...
	(vect_create_epilog_for_reduction): ...here.
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c 2017-11-21 16:31:49.728927972 +0000
+++ gcc/tree-vect-loop.c 2017-11-21 16:43:13.061221251 +0000
@@ -4566,6 +4566,65 @@ get_initial_defs_for_reduction (slp_tree
     }
 }

+/* Extract all the elements of VECTOR into SCALAR_RESULTS, inserting
+   the extraction statements before GSI.  Associate the new scalar
+   SSA names with variable SCALAR_DEST.  */
+
+static void
+vect_extract_elements (gimple_stmt_iterator *gsi, vec<tree> *scalar_results,
+                       tree scalar_dest, tree vector)
+{
+  tree vectype = TREE_TYPE (vector);
+  tree scalar_type = TREE_TYPE (vectype);
+  tree bitsize = TYPE_SIZE (scalar_type);
+  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
+
+  for (unsigned HOST_WIDE_INT bit_offset = 0;
+       bit_offset < vec_size_in_bits;
+       bit_offset += element_bitsize)
+    {
+      tree bitpos = bitsize_int (bit_offset);
+      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector, bitsize, bitpos);
+
+      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
+      tree new_name = make_ssa_name (scalar_dest, stmt);
+      gimple_assign_set_lhs (stmt, new_name);
+      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+
+      scalar_results->safe_push (new_name);
+    }
+}
+
+/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
+   order.  Start with LHS if LHS is nonnull, otherwise start with the first
+   element of VECTOR_RHS.  Insert the extraction statements before GSI and
+   associate the new scalar SSA names with variable SCALAR_DEST.
+   Return the SSA name for the result.  */
+
+static tree
+vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
+                       tree_code code, tree lhs, tree vector_rhs)
+{
+  auto_vec<tree, 64> scalar_results;
+  vect_extract_elements (gsi, &scalar_results, scalar_dest, vector_rhs);
+  tree rhs;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (scalar_results, i, rhs)
+    {
+      if (lhs == NULL_TREE)
+        lhs = rhs;
+      else
+        {
+          gassign *stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
+          tree new_name = make_ssa_name (scalar_dest, stmt);
+          gimple_assign_set_lhs (stmt, new_name);
+          gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+          lhs = new_name;
+        }
+    }
+  return lhs;
+}

 /* Function vect_create_epilog_for_reduction

@@ -5499,52 +5558,16 @@ vect_create_epilog_for_reduction (vec<tr
           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
             {
-              int bit_offset;
               if (gimple_code (new_phi) == GIMPLE_PHI)
                 vec_temp = PHI_RESULT (new_phi);
               else
                 vec_temp = gimple_assign_lhs (new_phi);
-              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
-                                 bitsize_zero_node);
-              epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
-              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
-              gimple_assign_set_lhs (epilog_stmt, new_temp);
-              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-
-              /* In SLP we don't need to apply reduction operation, so we just
-                 collect s' values in SCALAR_RESULTS.  */
-              if (slp_reduc)
-                scalar_results.safe_push (new_temp);
-
-              for (bit_offset = element_bitsize;
-                   bit_offset < vec_size_in_bits;
-                   bit_offset += element_bitsize)
-                {
-                  tree bitpos = bitsize_int (bit_offset);
-                  tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
-                                     bitsize, bitpos);
-
-                  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
-                  new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
-                  gimple_assign_set_lhs (epilog_stmt, new_name);
-                  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-
-                  if (slp_reduc)
-                    {
-                      /* In SLP we don't need to apply reduction operation, so
-                         we just collect s' values in SCALAR_RESULTS.  */
-                      new_temp = new_name;
-                      scalar_results.safe_push (new_name);
-                    }
-                  else
-                    {
-                      epilog_stmt = gimple_build_assign (new_scalar_dest, code,
-                                                         new_name, new_temp);
-                      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
-                      gimple_assign_set_lhs (epilog_stmt, new_temp);
-                      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-                    }
-                }
+              if (slp_reduc)
+                vect_extract_elements (&exit_gsi, &scalar_results,
+                                       new_scalar_dest, vec_temp);
+              else
+                new_temp = vect_expand_fold_left (&exit_gsi, new_scalar_dest,
+                                                  code, NULL_TREE, vec_temp);
             }

       /* The only case where we need to reduce scalar results in SLP, is