Extend fold_vec_perm to fold VEC_PERM_EXPR in VLA manner
Prathamesh Kulkarni
prathamesh.kulkarni@linaro.org
Wed Aug 17 12:39:01 GMT 2022
Hi,
The attached prototype patch extends fold_vec_perm to fold VEC_PERM_EXPR
in VLA manner, and currently handles the following cases:
(a) fixed len arg0, arg1 and fixed len sel.
(b) fixed len arg0, arg1 and vla sel
(c) vla arg0, arg1 and vla sel with arg0, arg1 being VECTOR_CST.
It seems to work for the VLA tests written in
test_vec_perm_vla_folding (), and am working thru the fallout observed in
regression testing.
Does the approach taken in the patch look in the right direction ?
I am not sure if I have got the conversion from "sel_index"
to index of either arg0, or arg1 entirely correct.
I would be grateful for suggestions on the patch.
Thanks,
Prathamesh
-------------- next part --------------
diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 4f4ec81c8d4..5e12260211e 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -85,6 +85,9 @@ along with GCC; see the file COPYING3. If not see
#include "vec-perm-indices.h"
#include "asan.h"
#include "gimple-range.h"
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "print-tree.h"
/* Nonzero if we are folding constants inside an initializer or a C++
manifestly-constant-evaluated context; zero otherwise.
@@ -10496,40 +10499,6 @@ fold_mult_zconjz (location_t loc, tree type, tree expr)
build_zero_cst (itype));
}
-
-/* Helper function for fold_vec_perm. Store elements of VECTOR_CST or
- CONSTRUCTOR ARG into array ELTS, which has NELTS elements, and return
- true if successful. */
-
-static bool
-vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts)
-{
- unsigned HOST_WIDE_INT i, nunits;
-
- if (TREE_CODE (arg) == VECTOR_CST
- && VECTOR_CST_NELTS (arg).is_constant (&nunits))
- {
- for (i = 0; i < nunits; ++i)
- elts[i] = VECTOR_CST_ELT (arg, i);
- }
- else if (TREE_CODE (arg) == CONSTRUCTOR)
- {
- constructor_elt *elt;
-
- FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (arg), i, elt)
- if (i >= nelts || TREE_CODE (TREE_TYPE (elt->value)) == VECTOR_TYPE)
- return false;
- else
- elts[i] = elt->value;
- }
- else
- return false;
- for (; i < nelts; i++)
- elts[i]
- = fold_convert (TREE_TYPE (TREE_TYPE (arg)), integer_zero_node);
- return true;
-}
-
/* Attempt to fold vector permutation of ARG0 and ARG1 vectors using SEL
selector. Return the folded VECTOR_CST or CONSTRUCTOR if successful,
NULL_TREE otherwise. */
@@ -10537,45 +10506,149 @@ vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts)
tree
fold_vec_perm (tree type, tree arg0, tree arg1, const vec_perm_indices &sel)
{
- unsigned int i;
- unsigned HOST_WIDE_INT nelts;
- bool need_ctor = false;
+ poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+ poly_uint64 arg1_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1));
+
+ gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type),
+ sel.length ()));
+ gcc_assert (known_eq (arg0_len, arg1_len));
- if (!sel.length ().is_constant (&nelts))
- return NULL_TREE;
- gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type), nelts)
- && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)), nelts)
- && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)), nelts));
if (TREE_TYPE (TREE_TYPE (arg0)) != TREE_TYPE (type)
|| TREE_TYPE (TREE_TYPE (arg1)) != TREE_TYPE (type))
return NULL_TREE;
- tree *in_elts = XALLOCAVEC (tree, nelts * 2);
- if (!vec_cst_ctor_to_array (arg0, nelts, in_elts)
- || !vec_cst_ctor_to_array (arg1, nelts, in_elts + nelts))
+ unsigned input_npatterns = 0;
+ unsigned out_npatterns = sel.encoding ().npatterns ();
+ unsigned out_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
+
+ /* FIXME: How to reshape fixed length vector_cst, so that
+ npatterns == vector.length () and nelts_per_pattern == 1 ?
+ It seems the vector is canonicalized to minimize npatterns. */
+
+ if (arg0_len.is_constant ())
+ {
+ /* If arg0, arg1 are fixed width vectors, and sel is VLA,
+ ensure that it is a dup sequence and has same period
+ as input vector. */
+
+ if (!sel.length ().is_constant ()
+ && (sel.encoding ().nelts_per_pattern () > 2
+ || !known_eq (arg0_len, sel.encoding ().npatterns ())))
+ return NULL_TREE;
+
+ input_npatterns = arg0_len.to_constant ();
+
+ if (sel.length ().is_constant ())
+ {
+ out_npatterns = sel.length ().to_constant ();
+ out_nelts_per_pattern = 1;
+ }
+ }
+ else if (TREE_CODE (arg0) == VECTOR_CST
+ && TREE_CODE (arg1) == VECTOR_CST)
+ {
+ unsigned npatterns = VECTOR_CST_NPATTERNS (arg0);
+ unsigned input_nelts_per_pattern = VECTOR_CST_NELTS_PER_PATTERN (arg0);
+
+ /* If arg0, arg1 are VLA, then ensure that,
+ (a) sel also has same length as input vectors.
+ (b) arg0 and arg1 have same encoding.
+ (c) sel has same number of patterns as input vectors.
+ (d) if sel is a stepped sequence, then it has same
+ encoding as input vectors. */
+
+ if (!known_eq (arg0_len, sel.length ())
+ || npatterns != VECTOR_CST_NPATTERNS (arg1)
+ || input_nelts_per_pattern != VECTOR_CST_NELTS_PER_PATTERN (arg1)
+ || npatterns != sel.encoding ().npatterns ()
+ || (sel.encoding ().nelts_per_pattern () > 2
+ && sel.encoding ().nelts_per_pattern () != input_nelts_per_pattern))
+ return NULL_TREE;
+
+ input_npatterns = npatterns;
+ }
+ else
return NULL_TREE;
- tree_vector_builder out_elts (type, nelts, 1);
- for (i = 0; i < nelts; i++)
+ tree_vector_builder out_elts_builder (type, out_npatterns,
+ out_nelts_per_pattern);
+ bool need_ctor = false;
+ unsigned out_encoded_nelts = out_npatterns * out_nelts_per_pattern;
+
+ for (unsigned i = 0; i < out_encoded_nelts; i++)
{
- HOST_WIDE_INT index;
- if (!sel[i].is_constant (&index))
+ HOST_WIDE_INT sel_index;
+ if (!sel[i].is_constant (&sel_index))
return NULL_TREE;
- if (!CONSTANT_CLASS_P (in_elts[index]))
- need_ctor = true;
- out_elts.quick_push (unshare_expr (in_elts[index]));
+
+ /* Convert sel_index to index of either arg0 or arg1.
+ For eg:
+ arg0: {a0, b0, a1, b1, a1 + S, b1 + S, ...}
+ arg1: {c0, d0, c1, d1, c1 + S, d1 + S, ...}
+ Both have npatterns == 2, nelts_per_pattern == 3.
+ Then the combined vector would be:
+ {a0, b0, c0, d0, a1, b1, c1, d1, a1 + S, b1 + S, c1 + S, d1 + S, ... }
+ This combined vector will have,
+ npatterns = 2 * input_npatterns == 4.
+ sel_index is used to index this above combined vector.
+
+ Since we don't explicitly build the combined vector, we convert
+ sel_index to corresponding index for either arg0 or arg1.
+ For eg, if sel_index == 7,
+ pattern = 7 % 4 == 3.
+ Since pattern > input_npatterns, the elem will come from:
+ pattern = 3 - input_npatterns ie, pattern 1 from arg1.
+ elem_index_in_pattern = 7 / 4 == 1.
+ So the actual index of the element in arg1 would be: 1 + (1 * 2) == 3.
+ So, sel_index == 7 corresponds to arg1[3], ie, d1. */
+
+ unsigned pattern = sel_index % (2 * input_npatterns);
+ unsigned elem_index_in_pattern = sel_index / (2 * input_npatterns);
+ tree arg;
+ if (pattern < input_npatterns)
+ arg = arg0;
+ else
+ {
+ arg = arg1;
+ pattern -= input_npatterns;
+ }
+
+ unsigned elem_index = (elem_index_in_pattern * input_npatterns) + pattern;
+ tree elem;
+ if (TREE_CODE (arg) == VECTOR_CST)
+ {
+ /* If arg is fixed width vector, and elem_index goes out of range,
+ then return NULL_TREE. */
+ if (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg)).is_constant ()
+ && elem_index > vector_cst_encoded_nelts (arg))
+ return NULL_TREE;
+ elem = vector_cst_elt (arg, elem_index);
+ }
+ else
+ {
+ gcc_assert (TREE_CODE (arg) == CONSTRUCTOR);
+ if (elem_index >= CONSTRUCTOR_NELTS (arg))
+ return NULL_TREE;
+ elem = CONSTRUCTOR_ELT (arg, elem_index)->value;
+ if (VECTOR_TYPE_P (TREE_TYPE (elem)))
+ return NULL_TREE;
+ need_ctor = true;
+ }
+
+ out_elts_builder.quick_push (unshare_expr (elem));
}
if (need_ctor)
{
vec<constructor_elt, va_gc> *v;
- vec_alloc (v, nelts);
- for (i = 0; i < nelts; i++)
- CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, out_elts[i]);
+ vec_alloc (v, out_encoded_nelts);
+
+ for (unsigned i = 0; i < out_encoded_nelts; i++)
+ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, out_elts_builder[i]);
return build_constructor (type, v);
}
- else
- return out_elts.build ();
+
+ return out_elts_builder.build ();
}
/* Try to fold a pointer difference of type TYPE two address expressions of
@@ -16912,6 +16985,91 @@ test_vec_duplicate_folding ()
ASSERT_TRUE (operand_equal_p (dup5_expr, dup5_cst, 0));
}
+static tree
+build_vec_int_cst (unsigned npatterns, unsigned nelts_per_pattern,
+ int *encoded_elems)
+{
+ scalar_int_mode int_mode = SCALAR_INT_TYPE_MODE (integer_type_node);
+ machine_mode vmode = targetm.vectorize.preferred_simd_mode (int_mode);
+ poly_uint64 nunits = GET_MODE_NUNITS (vmode);
+ tree vectype = build_vector_type (integer_type_node, nunits);
+
+ tree_vector_builder builder (vectype, npatterns, nelts_per_pattern);
+ for (unsigned i = 0; i < npatterns * nelts_per_pattern; i++)
+ builder.quick_push (build_int_cst (integer_type_node, encoded_elems[i]));
+ return builder.build ();
+}
+
+static void
+vpe_verify_res (tree res, unsigned npatterns, unsigned nelts_per_pattern,
+ int *encoded_elems)
+{
+ ASSERT_TRUE (res != NULL_TREE);
+ ASSERT_TRUE (VECTOR_CST_NPATTERNS (res) == npatterns);
+ ASSERT_TRUE (VECTOR_CST_NELTS_PER_PATTERN (res) == nelts_per_pattern);
+
+ for (unsigned i = 0; i < npatterns * nelts_per_pattern; i++)
+ ASSERT_TRUE (wi::to_wide (VECTOR_CST_ELT (res, i))
+ == encoded_elems[i]);
+}
+
+static void
+test_vec_perm_vla_folding ()
+{
+ /* For all cases
+ arg0: {1, 11, 21, 31, 2, 12, 22, 32, 3, 13, 23, 33, ...}, npatterns == 4, nelts_per_pattern == 3.
+ arg1: {41, 51, 61, 71, 42, 52, 62, 72, 43, 53, 63, 73 ...}, npatterns == 4, nelts_per_pattern == 3. */
+
+ int arg0_elems[] = { 1, 11, 21, 31, 2, 12, 22, 32, 3, 13, 23, 33 };
+ tree arg0 = build_vec_int_cst (4, 3, arg0_elems);
+
+ int arg1_elems[] = { 41, 51, 61, 71, 42, 52, 62, 72, 43, 53, 63, 73 };
+ tree arg1 = build_vec_int_cst (4, 3, arg1_elems);
+
+ if (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)).is_constant ()
+ || TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)).is_constant ())
+ return;
+
+ /* Case 1: Dup mask sequence.
+ mask = {0, 9, 3, 11, ...}, npatterns == 4, nelts_per_pattern == 1.
+ expected result: {1, 21, 31, 32, ...}, npatterns == 4, nelts_per_pattern == 1. */
+ {
+ int mask_elems[] = {0, 9, 3, 12};
+ tree mask = build_vec_int_cst (4, 1, mask_elems);
+ if (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)).is_constant ())
+ return;
+ tree res = fold_ternary (VEC_PERM_EXPR, TREE_TYPE (arg0), arg0, arg1, mask);
+ int res_encoded_elems[] = {1, 12, 31, 42};
+ vpe_verify_res (res, 4, 1, res_encoded_elems);
+ }
+
+ /* Case 2:
+ mask = {0, 4, 1, 5, 8, 12, 9, 13 ...}, npatterns == 4, nelts_per_pattern == 2.
+ expected result: {1, 41, 11, 51, 2, 12, 42, 52, ...}, npatterns == 4, nelts_per_pattern == 2. */
+ {
+ int mask_elems[] = {0, 4, 1, 5, 8, 12, 9, 13};
+ tree mask = build_vec_int_cst (4, 2, mask_elems);
+ if (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)).is_constant ())
+ return;
+ tree res = fold_ternary (VEC_PERM_EXPR, TREE_TYPE (arg0), arg0, arg1, mask);
+ int res_encoded_elems[] = {1, 41, 11, 51, 2, 42, 12, 52};
+ vpe_verify_res (res, 4, 2, res_encoded_elems);
+ }
+
+ /* Case 3: Stepped mask sequence.
+ mask = {0, 4, 1, 5, 8, 12, 9, 13, 16, 20, 17, 21}, npatterns == 4, nelts_per_pattern == 3.
+ expected result = {1, 41, 11, 51, 2, 42, 12, 52, 3, 43, 13, 53 ...}, npatterns == 4, nelts_per_pattern == 3. */
+ {
+ int mask_elems[] = {0, 4, 1, 5, 8, 12, 9, 13, 16, 20, 17, 21};
+ tree mask = build_vec_int_cst (4, 3, mask_elems);
+ if (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)).is_constant ())
+ return;
+ tree res = fold_ternary (VEC_PERM_EXPR, TREE_TYPE (arg0), arg0, arg1, mask);
+ int res_encoded_elems[] = {1, 41, 11, 51, 2, 42, 12, 52, 3, 43, 13, 53};
+ vpe_verify_res (res, 4, 3, res_encoded_elems);
+ }
+}
+
/* Run all of the selftests within this file. */
void
@@ -16920,6 +17078,7 @@ fold_const_cc_tests ()
test_arithmetic_folding ();
test_vector_folding ();
test_vec_duplicate_folding ();
+ test_vec_perm_vla_folding ();
}
} // namespace selftest
More information about the Gcc-patches
mailing list