This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH2/2, PR52252] Vectorization for load/store groups of size 3.
- From: Evgeny Stupachenko <evstupac at gmail dot com>
- To: GCC Patches <gcc-patches at gcc dot gnu dot org>, Richard Biener <rguenther at suse dot de>, Jakub Jelinek <jakub at redhat dot com>, Uros Bizjak <ubizjak at gmail dot com>
- Date: Wed, 28 May 2014 17:18:15 +0400
- Subject: Re: [PATCH2/2, PR52252] Vectorization for load/store groups of size 3.
- Authentication-results: sourceware.org; auth=none
- References: <CAOvf_xxiWyFOLmJgZKF2TO=V41nBFScHkRZVuYPaj6mZ4h6aHA at mail dot gmail dot com>
Ping.
Test is modified according to the fix in the test for loads.
diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
new file mode 100644
index 0000000..e7161f7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */
+
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+ int i;
+ for (i = 0; i < size; i++)
+ {
+ out[0] = in[0] + in[1] + in[3];
+ out[1] = in[0] + in[2] + in[4];
+ out[2] = in[1] + in[2] + in[4];
+ in += 4;
+ out += 3;
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {
target { i?86-*-* x86_64-*-* } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
On Tue, May 6, 2014 at 6:39 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> 2nd part of patch is on stores group.
> Bootstrap and make check passed on x86.
>
> Is it ok?
>
> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com>
>
> * tree-vect-data-refs.c (vect_grouped_store_supported): New
> check for storess group of length 3.
> (vect_permute_store_chain): New permutations for storess group of
> length 3.
> * tree-vect-stmts.c (vect_model_store_cost): Change cost
> of vec_perm_shuffle for the new permutations.
>
> ChangeLog for testsuite:
>
> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com>
>
> PR tree-optimization/52252
> * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3.
>
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index ef710cf..fb0e30d 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype,
> unsigned HOST_WIDE_INT count)
> {
> enum machine_mode mode = TYPE_MODE (vectype);
>
> - /* vect_permute_store_chain requires the group size to be a power of two. */
> - if (exact_log2 (count) == -1)
> + /* vect_permute_store_chain requires the group size to be equal to 3 or
> + be a power of two. */
> + if (count != 3 && exact_log2 (count) == -1)
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "the size of the group of accesses"
> - " is not a power of 2\n");
> + "the size of the group of accesses"
> + " is not a power of 2 or not eqaul to 3\n");
> return false;
> }
>
> @@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype,
> unsigned HOST_WIDE_INT count)
> {
> unsigned int i, nelt = GET_MODE_NUNITS (mode);
> unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
> - for (i = 0; i < nelt / 2; i++)
> +
> + if (count == 3)
> {
> - sel[i * 2] = i;
> - sel[i * 2 + 1] = i + nelt;
> + unsigned int j0 = 0, j1 = 0, j2 = 0;
> + unsigned int i, j;
> +
> + for (j = 0; j < 3; j++)
> + {
> + int nelt0 = ((3 - j) * nelt) % 3;
> + int nelt1 = ((3 - j) * nelt + 1) % 3;
> + int nelt2 = ((3 - j) * nelt + 2) % 3;
> + for (i = 0; i < nelt; i++)
> + {
> + if (3 * i + nelt0 < nelt)
> + sel[3 * i + nelt0] = j0++;
> + if (3 * i + nelt1 < nelt)
> + sel[3 * i + nelt1] = nelt + j1++;
> + if (3 * i + nelt2 < nelt)
> + sel[3 * i + nelt2] = 0;
> + }
> + if (!can_vec_perm_p (mode, false, sel))
> + {
> + if (dump_enabled_p ())
> + dump_printf (MSG_MISSED_OPTIMIZATION,
> + "permutaion op not supported by target.\n");
> + return false;
> + }
> +
> + for (i = 0; i < nelt; i++)
> + {
> + if (3 * i + nelt0 < nelt)
> + sel[3 * i + nelt0] = 3 * i + nelt0;
> + if (3 * i + nelt1 < nelt)
> + sel[3 * i + nelt1] = 3 * i + nelt1;
> + if (3 * i + nelt2 < nelt)
> + sel[3 * i + nelt2] = nelt + j2++;
> + }
> + if (!can_vec_perm_p (mode, false, sel))
> + {
> + if (dump_enabled_p ())
> + dump_printf (MSG_MISSED_OPTIMIZATION,
> + "permutaion op not supported by target.\n");
> + return false;
> + }
> + }
> + return true;
> }
> - if (can_vec_perm_p (mode, false, sel))
> + else
> {
> - for (i = 0; i < nelt; i++)
> - sel[i] += nelt / 2;
> - if (can_vec_perm_p (mode, false, sel))
> - return true;
> + /* If length is not equal to 3 then only power of 2 is supported. */
> + gcc_assert (exact_log2 (count) != -1);
> +
> + for (i = 0; i < nelt / 2; i++)
> + {
> + sel[i * 2] = i;
> + sel[i * 2 + 1] = i + nelt;
> + }
> + if (can_vec_perm_p (mode, false, sel))
> + {
> + for (i = 0; i < nelt; i++)
> + sel[i] += nelt / 2;
> + if (can_vec_perm_p (mode, false, sel))
> + return true;
> + }
> }
> }
>
> if (dump_enabled_p ())
> dump_printf (MSG_MISSED_OPTIMIZATION,
> - "interleave op not supported by target.\n");
> + "permutaion op not supported by target.\n");
> return false;
> }
>
> @@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype,
> unsigned HOST_WIDE_INT count)
> /* Function vect_permute_store_chain.
>
> Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
> - a power of 2, generate interleave_high/low stmts to reorder the data
> - correctly for the stores. Return the final references for stores in
> - RESULT_CHAIN.
> + a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
> + the data correctly for the stores. Return the final references for stores
> + in RESULT_CHAIN.
>
> E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
> The input is 4 vectors each containing 8 elements. We assign a number to
> @@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
> gimple perm_stmt;
> tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
> tree perm_mask_low, perm_mask_high;
> - unsigned int i, n;
> + tree data_ref;
> + tree perm3_mask_low, perm3_mask_high;
> + unsigned int i, n, log_length = exact_log2 (length);
> unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
> unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
>
> @@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec<tree> dr_chain,
> memcpy (result_chain->address (), dr_chain.address (),
> length * sizeof (tree));
>
> - for (i = 0, n = nelt / 2; i < n; i++)
> + if (length == 3)
> {
> - sel[i * 2] = i;
> - sel[i * 2 + 1] = i + nelt;
> - }
> - perm_mask_high = vect_gen_perm_mask (vectype, sel);
> - gcc_assert (perm_mask_high != NULL);
> + unsigned int j0 = 0, j1 = 0, j2 = 0;
>
> - for (i = 0; i < nelt; i++)
> - sel[i] += nelt / 2;
> - perm_mask_low = vect_gen_perm_mask (vectype, sel);
> - gcc_assert (perm_mask_low != NULL);
> + for (j = 0; j < 3; j++)
> + {
> + int nelt0 = ((3 - j) * nelt) % 3;
> + int nelt1 = ((3 - j) * nelt + 1) % 3;
> + int nelt2 = ((3 - j) * nelt + 2) % 3;
>
> - for (i = 0, n = exact_log2 (length); i < n; i++)
> - {
> - for (j = 0; j < length/2; j++)
> - {
> - vect1 = dr_chain[j];
> - vect2 = dr_chain[j+length/2];
> + for (i = 0; i < nelt; i++)
> + {
> + if (3 * i + nelt0 < nelt)
> + sel[3 * i + nelt0] = j0++;
> + if (3 * i + nelt1 < nelt)
> + sel[3 * i + nelt1] = nelt + j1++;
> + if (3 * i + nelt2 < nelt)
> + sel[3 * i + nelt2] = 0;
> + }
> + perm3_mask_low = vect_gen_perm_mask (vectype, sel);
> + gcc_assert (perm3_mask_low != NULL);
> +
> + for (i = 0; i < nelt; i++)
> + {
> + if (3 * i + nelt0 < nelt)
> + sel[3 * i + nelt0] = 3 * i + nelt0;
> + if (3 * i + nelt1 < nelt)
> + sel[3 * i + nelt1] = 3 * i + nelt1;
> + if (3 * i + nelt2 < nelt)
> + sel[3 * i + nelt2] = nelt + j2++;
> + }
> + perm3_mask_high = vect_gen_perm_mask (vectype, sel);
> + gcc_assert (perm3_mask_high != NULL);
> +
> + vect1 = dr_chain[0];
> + vect2 = dr_chain[1];
>
> /* Create interleaving stmt:
> - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
> - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
> - perm_stmt
> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
> - vect1, vect2, perm_mask_high);
> + low = VEC_PERM_EXPR <vect1, vect2,
> + {j, nelt, *, j + 1, nelt + j + 1, *,
> + j + 2, nelt + j + 2, *, ...}> */
> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> + vect1, vect2,
> + perm3_mask_low);
> vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> - (*result_chain)[2*j] = high;
>
> + vect1 = data_ref;
> + vect2 = dr_chain[2];
> /* Create interleaving stmt:
> - low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
> - nelt*3/2+1, ...}> */
> - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
> - perm_stmt
> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
> - vect1, vect2, perm_mask_low);
> + low = VEC_PERM_EXPR <vect1, vect2,
> + {0, 1, nelt + j, 3, 4, nelt + j + 1,
> + 6, 7, nelt + j + 2, ...}> */
> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> + vect1, vect2,
> + perm3_mask_high);
> vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> - (*result_chain)[2*j+1] = low;
> + (*result_chain)[j] = data_ref;
> + }
> + }
> + else
> + {
> + /* If length is not equal to 3 then only power of 2 is supported. */
> + gcc_assert (exact_log2 (length) != -1);
> +
> + for (i = 0, n = nelt / 2; i < n; i++)
> + {
> + sel[i * 2] = i;
> + sel[i * 2 + 1] = i + nelt;
> }
> - memcpy (dr_chain.address (), result_chain->address (),
> - length * sizeof (tree));
> + perm_mask_high = vect_gen_perm_mask (vectype, sel);
> + gcc_assert (perm_mask_high != NULL);
> +
> + for (i = 0; i < nelt; i++)
> + sel[i] += nelt / 2;
> + perm_mask_low = vect_gen_perm_mask (vectype, sel);
> + gcc_assert (perm_mask_low != NULL);
> +
> + for (i = 0, n = log_length; i < n; i++)
> + {
> + for (j = 0; j < length/2; j++)
> + {
> + vect1 = dr_chain[j];
> + vect2 = dr_chain[j+length/2];
> +
> + /* Create interleaving stmt:
> + high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
> + ...}> */
> + high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
> + perm_stmt
> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
> + vect1, vect2, perm_mask_high);
> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> + (*result_chain)[2*j] = high;
> +
> + /* Create interleaving stmt:
> + low = VEC_PERM_EXPR <vect1, vect2,
> + {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
> + ...}> */
> + low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
> + perm_stmt
> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
> + vect1, vect2, perm_mask_low);
> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> + (*result_chain)[2*j+1] = low;
> + }
> + memcpy (dr_chain.address (), result_chain->address (),
> + length * sizeof (tree));
> + }
> }
> }
>
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index b87c143..24d0b94 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
> include the cost of the permutes. */
> if (!store_lanes_p && group_size > 1)
> {
> - /* Uses a high and low interleave operation for each needed permute. */
> -
> - int nstmts = ncopies * exact_log2 (group_size) * group_size;
> + /* Uses a high and low interleave or shuffle operations for each
> + needed permute. */
> + int nstmts = ncopies * ceil_log2 (group_size) * group_size;
> inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
> stmt_info, 0, vect_body);
>
>
> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> new file mode 100644
> index 0000000..cc1e72e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3
> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
> +
> +#define byte unsigned char
> +
> +void
> +matrix_mul (byte *in, byte *out, int size)
> +{
> + int i;
> + for (i = 0; i < size; i++)
> + {
> + out[0] = in[0] + in[1] + in[3];
> + out[1] = in[0] + in[2] + in[4];
> + out[2] = in[1] + in[2] + in[4];
> + in += 4;
> + out += 3;
> + }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */