[09/13] Use explicit encodings for simple permutes
Richard Biener
richard.guenther@gmail.com
Tue Jan 2 13:07:00 GMT 2018
On Sun, Dec 10, 2017 at 12:21 AM, Richard Sandiford
<richard.sandiford@linaro.org> wrote:
> This patch makes users of vec_perm_builders use the compressed encoding
> where possible. This means that they work with variable-length vectors.
>
Ok.
Richard.
> 2017-12-09 Richard Sandiford <richard.sandiford@linaro.org>
>
> gcc/
> * optabs.c (expand_vec_perm_var): Use an explicit encoding for
> the broadcast of the low byte.
> (expand_mult_highpart): Use an explicit encoding for the permutes.
> * optabs-query.c (can_mult_highpart_p): Likewise.
> * tree-vect-loop.c (calc_vec_perm_mask_for_shift): Likewise.
> * tree-vect-stmts.c (perm_mask_for_reverse): Likewise.
> (vectorizable_bswap): Likewise.
> * tree-vect-data-refs.c (vect_grouped_store_supported): Use an
> explicit encoding for the power-of-2 permutes.
> (vect_permute_store_chain): Likewise.
> (vect_grouped_load_supported): Likewise.
> (vect_permute_load_chain): Likewise.
>
> Index: gcc/optabs.c
> ===================================================================
> --- gcc/optabs.c 2017-12-09 22:48:47.546825312 +0000
> +++ gcc/optabs.c 2017-12-09 22:48:52.266015836 +0000
> @@ -5625,15 +5625,14 @@ expand_vec_perm_var (machine_mode mode,
> NULL, 0, OPTAB_DIRECT);
> gcc_assert (sel != NULL);
>
> - /* Broadcast the low byte each element into each of its bytes. */
> - vec_perm_builder const_sel (w, w, 1);
> - for (i = 0; i < w; ++i)
> - {
> - int this_e = i / u * u;
> - if (BYTES_BIG_ENDIAN)
> - this_e += u - 1;
> - const_sel.quick_push (this_e);
> - }
> + /* Broadcast the low byte each element into each of its bytes.
> + The encoding has U interleaved stepped patterns, one for each
> + byte of an element. */
> + vec_perm_builder const_sel (w, u, 3);
> + unsigned int low_byte_in_u = BYTES_BIG_ENDIAN ? u - 1 : 0;
> + for (i = 0; i < 3; ++i)
> + for (unsigned int j = 0; j < u; ++j)
> + const_sel.quick_push (i * u + low_byte_in_u);
> sel = gen_lowpart (qimode, sel);
> sel = expand_vec_perm_const (qimode, sel, sel, const_sel, qimode, NULL);
> gcc_assert (sel != NULL);
> @@ -5853,16 +5852,20 @@ expand_mult_highpart (machine_mode mode,
> expand_insn (optab_handler (tab2, mode), 3, eops);
> m2 = gen_lowpart (mode, eops[0].value);
>
> - vec_perm_builder sel (nunits, nunits, 1);
> + vec_perm_builder sel;
> if (method == 2)
> {
> - for (i = 0; i < nunits; ++i)
> + /* The encoding has 2 interleaved stepped patterns. */
> + sel.new_vector (nunits, 2, 3);
> + for (i = 0; i < 6; ++i)
> sel.quick_push (!BYTES_BIG_ENDIAN + (i & ~1)
> + ((i & 1) ? nunits : 0));
> }
> else
> {
> - for (i = 0; i < nunits; ++i)
> + /* The encoding has a single interleaved stepped pattern. */
> + sel.new_vector (nunits, 1, 3);
> + for (i = 0; i < 3; ++i)
> sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1));
> }
>
> Index: gcc/optabs-query.c
> ===================================================================
> --- gcc/optabs-query.c 2017-12-09 22:48:47.545825268 +0000
> +++ gcc/optabs-query.c 2017-12-09 22:48:52.265015799 +0000
> @@ -501,8 +501,9 @@ can_mult_highpart_p (machine_mode mode,
> op = uns_p ? vec_widen_umult_odd_optab : vec_widen_smult_odd_optab;
> if (optab_handler (op, mode) != CODE_FOR_nothing)
> {
> - vec_perm_builder sel (nunits, nunits, 1);
> - for (i = 0; i < nunits; ++i)
> + /* The encoding has 2 interleaved stepped patterns. */
> + vec_perm_builder sel (nunits, 2, 3);
> + for (i = 0; i < 6; ++i)
> sel.quick_push (!BYTES_BIG_ENDIAN
> + (i & ~1)
> + ((i & 1) ? nunits : 0));
> @@ -518,8 +519,9 @@ can_mult_highpart_p (machine_mode mode,
> op = uns_p ? vec_widen_umult_lo_optab : vec_widen_smult_lo_optab;
> if (optab_handler (op, mode) != CODE_FOR_nothing)
> {
> - vec_perm_builder sel (nunits, nunits, 1);
> - for (i = 0; i < nunits; ++i)
> + /* The encoding has a single stepped pattern. */
> + vec_perm_builder sel (nunits, 1, 3);
> + for (int i = 0; i < 3; ++i)
> sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1));
> vec_perm_indices indices (sel, 2, nunits);
> if (can_vec_perm_const_p (mode, indices))
> Index: gcc/tree-vect-loop.c
> ===================================================================
> --- gcc/tree-vect-loop.c 2017-12-09 22:48:47.547825355 +0000
> +++ gcc/tree-vect-loop.c 2017-12-09 22:48:52.267015873 +0000
> @@ -3716,8 +3716,10 @@ vect_estimate_min_profitable_iters (loop
> calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
> vec_perm_builder *sel)
> {
> - sel->new_vector (nelt, nelt, 1);
> - for (unsigned int i = 0; i < nelt; i++)
> + /* The encoding is a single stepped pattern. Any wrap-around is handled
> + by vec_perm_indices. */
> + sel->new_vector (nelt, 1, 3);
> + for (unsigned int i = 0; i < 3; i++)
> sel->quick_push (i + offset);
> }
>
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c 2017-12-09 22:48:50.360942531 +0000
> +++ gcc/tree-vect-stmts.c 2017-12-09 22:48:52.268015910 +0000
> @@ -1717,8 +1717,9 @@ perm_mask_for_reverse (tree vectype)
>
> nunits = TYPE_VECTOR_SUBPARTS (vectype);
>
> - vec_perm_builder sel (nunits, nunits, 1);
> - for (i = 0; i < nunits; ++i)
> + /* The encoding has a single stepped pattern. */
> + vec_perm_builder sel (nunits, 1, 3);
> + for (i = 0; i < 3; ++i)
> sel.quick_push (nunits - 1 - i);
>
> vec_perm_indices indices (sel, 1, nunits);
> @@ -2504,8 +2505,9 @@ vectorizable_bswap (gimple *stmt, gimple
> unsigned int num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype);
> unsigned word_bytes = num_bytes / nunits;
>
> - vec_perm_builder elts (num_bytes, num_bytes, 1);
> - for (unsigned i = 0; i < nunits; ++i)
> + /* The encoding uses one stepped pattern for each byte in the word. */
> + vec_perm_builder elts (num_bytes, word_bytes, 3);
> + for (unsigned i = 0; i < 3; ++i)
> for (unsigned j = 0; j < word_bytes; ++j)
> elts.quick_push ((i + 1) * word_bytes - j - 1);
>
> Index: gcc/tree-vect-data-refs.c
> ===================================================================
> --- gcc/tree-vect-data-refs.c 2017-12-09 22:48:47.546825312 +0000
> +++ gcc/tree-vect-data-refs.c 2017-12-09 22:48:52.267015873 +0000
> @@ -4566,14 +4566,13 @@ vect_grouped_store_supported (tree vecty
> if (VECTOR_MODE_P (mode))
> {
> unsigned int i, nelt = GET_MODE_NUNITS (mode);
> - vec_perm_builder sel (nelt, nelt, 1);
> - sel.quick_grow (nelt);
> -
> if (count == 3)
> {
> unsigned int j0 = 0, j1 = 0, j2 = 0;
> unsigned int i, j;
>
> + vec_perm_builder sel (nelt, nelt, 1);
> + sel.quick_grow (nelt);
> vec_perm_indices indices;
> for (j = 0; j < 3; j++)
> {
> @@ -4623,7 +4622,10 @@ vect_grouped_store_supported (tree vecty
> /* If length is not equal to 3 then only power of 2 is supported. */
> gcc_assert (pow2p_hwi (count));
>
> - for (i = 0; i < nelt / 2; i++)
> + /* The encoding has 2 interleaved stepped patterns. */
> + vec_perm_builder sel (nelt, 2, 3);
> + sel.quick_grow (6);
> + for (i = 0; i < 3; i++)
> {
> sel[i * 2] = i;
> sel[i * 2 + 1] = i + nelt;
> @@ -4631,7 +4633,7 @@ vect_grouped_store_supported (tree vecty
> vec_perm_indices indices (sel, 2, nelt);
> if (can_vec_perm_const_p (mode, indices))
> {
> - for (i = 0; i < nelt; i++)
> + for (i = 0; i < 6; i++)
> sel[i] += nelt / 2;
> indices.new_vector (sel, 2, nelt);
> if (can_vec_perm_const_p (mode, indices))
> @@ -4736,9 +4738,6 @@ vect_permute_store_chain (vec<tree> dr_c
> unsigned int i, n, log_length = exact_log2 (length);
> unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
>
> - vec_perm_builder sel (nelt, nelt, 1);
> - sel.quick_grow (nelt);
> -
> result_chain->quick_grow (length);
> memcpy (result_chain->address (), dr_chain.address (),
> length * sizeof (tree));
> @@ -4747,6 +4746,8 @@ vect_permute_store_chain (vec<tree> dr_c
> {
> unsigned int j0 = 0, j1 = 0, j2 = 0;
>
> + vec_perm_builder sel (nelt, nelt, 1);
> + sel.quick_grow (nelt);
> vec_perm_indices indices;
> for (j = 0; j < 3; j++)
> {
> @@ -4808,7 +4809,10 @@ vect_permute_store_chain (vec<tree> dr_c
> /* If length is not equal to 3 then only power of 2 is supported. */
> gcc_assert (pow2p_hwi (length));
>
> - for (i = 0, n = nelt / 2; i < n; i++)
> + /* The encoding has 2 interleaved stepped patterns. */
> + vec_perm_builder sel (nelt, 2, 3);
> + sel.quick_grow (6);
> + for (i = 0; i < 3; i++)
> {
> sel[i * 2] = i;
> sel[i * 2 + 1] = i + nelt;
> @@ -4816,7 +4820,7 @@ vect_permute_store_chain (vec<tree> dr_c
> vec_perm_indices indices (sel, 2, nelt);
> perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
>
> - for (i = 0; i < nelt; i++)
> + for (i = 0; i < 6; i++)
> sel[i] += nelt / 2;
> indices.new_vector (sel, 2, nelt);
> perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
> @@ -5164,11 +5168,11 @@ vect_grouped_load_supported (tree vectyp
> if (VECTOR_MODE_P (mode))
> {
> unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
> - vec_perm_builder sel (nelt, nelt, 1);
> - sel.quick_grow (nelt);
>
> if (count == 3)
> {
> + vec_perm_builder sel (nelt, nelt, 1);
> + sel.quick_grow (nelt);
> vec_perm_indices indices;
> unsigned int k;
> for (k = 0; k < 3; k++)
> @@ -5209,12 +5213,15 @@ vect_grouped_load_supported (tree vectyp
> /* If length is not equal to 3 then only power of 2 is supported. */
> gcc_assert (pow2p_hwi (count));
>
> - for (i = 0; i < nelt; i++)
> + /* The encoding has a single stepped pattern. */
> + vec_perm_builder sel (nelt, 1, 3);
> + sel.quick_grow (3);
> + for (i = 0; i < 3; i++)
> sel[i] = i * 2;
> vec_perm_indices indices (sel, 2, nelt);
> if (can_vec_perm_const_p (mode, indices))
> {
> - for (i = 0; i < nelt; i++)
> + for (i = 0; i < 3; i++)
> sel[i] = i * 2 + 1;
> indices.new_vector (sel, 2, nelt);
> if (can_vec_perm_const_p (mode, indices))
> @@ -5332,9 +5339,6 @@ vect_permute_load_chain (vec<tree> dr_ch
> unsigned int i, j, log_length = exact_log2 (length);
> unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
>
> - vec_perm_builder sel (nelt, nelt, 1);
> - sel.quick_grow (nelt);
> -
> result_chain->quick_grow (length);
> memcpy (result_chain->address (), dr_chain.address (),
> length * sizeof (tree));
> @@ -5343,6 +5347,8 @@ vect_permute_load_chain (vec<tree> dr_ch
> {
> unsigned int k;
>
> + vec_perm_builder sel (nelt, nelt, 1);
> + sel.quick_grow (nelt);
> vec_perm_indices indices;
> for (k = 0; k < 3; k++)
> {
> @@ -5390,12 +5396,15 @@ vect_permute_load_chain (vec<tree> dr_ch
> /* If length is not equal to 3 then only power of 2 is supported. */
> gcc_assert (pow2p_hwi (length));
>
> - for (i = 0; i < nelt; ++i)
> + /* The encoding has a single stepped pattern. */
> + vec_perm_builder sel (nelt, 1, 3);
> + sel.quick_grow (3);
> + for (i = 0; i < 3; ++i)
> sel[i] = i * 2;
> vec_perm_indices indices (sel, 2, nelt);
> perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
>
> - for (i = 0; i < nelt; ++i)
> + for (i = 0; i < 3; ++i)
> sel[i] = i * 2 + 1;
> indices.new_vector (sel, 2, nelt);
> perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
More information about the Gcc-patches
mailing list