This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH2/2, PR52252] Vectorization for load/store groups of size 3.


2nd part of patch is on stores group.
Bootstrap and make check passed on x86.

Is it ok?

2014-05-06  Evgeny Stupachenko  <evstupac@gmail.com>

        * tree-vect-data-refs.c (vect_grouped_store_supported): New
        check for storess group of length 3.
        (vect_permute_store_chain): New permutations for storess group of
        length 3.
        * tree-vect-stmts.c (vect_model_store_cost): Change cost
        of vec_perm_shuffle for the new permutations.

ChangeLog for testsuite:

2014-05-06  Evgeny Stupachenko  <evstupac@gmail.com>

       PR tree-optimization/52252
       * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3.

diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index ef710cf..fb0e30d 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype,
unsigned HOST_WIDE_INT count)
 {
   enum machine_mode mode = TYPE_MODE (vectype);

-  /* vect_permute_store_chain requires the group size to be a power of two.  */
-  if (exact_log2 (count) == -1)
+  /* vect_permute_store_chain requires the group size to be equal to 3 or
+     be a power of two.  */
+  if (count != 3 && exact_log2 (count) == -1)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "the size of the group of accesses"
-                         " is not a power of 2\n");
+                        "the size of the group of accesses"
+                        " is not a power of 2 or not eqaul to 3\n");
       return false;
     }

@@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype,
unsigned HOST_WIDE_INT count)
     {
       unsigned int i, nelt = GET_MODE_NUNITS (mode);
       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
-      for (i = 0; i < nelt / 2; i++)
+
+      if (count == 3)
        {
-         sel[i * 2] = i;
-         sel[i * 2 + 1] = i + nelt;
+         unsigned int j0 = 0, j1 = 0, j2 = 0;
+         unsigned int i, j;
+
+         for (j = 0; j < 3; j++)
+           {
+             int nelt0 = ((3 - j) * nelt) % 3;
+             int nelt1 = ((3 - j) * nelt + 1) % 3;
+             int nelt2 = ((3 - j) * nelt + 2) % 3;
+             for (i = 0; i < nelt; i++)
+               {
+                 if (3 * i + nelt0 < nelt)
+                   sel[3 * i + nelt0] = j0++;
+                 if (3 * i + nelt1 < nelt)
+                   sel[3 * i + nelt1] = nelt + j1++;
+                 if (3 * i + nelt2 < nelt)
+                   sel[3 * i + nelt2] = 0;
+               }
+             if (!can_vec_perm_p (mode, false, sel))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf (MSG_MISSED_OPTIMIZATION,
+                                "permutaion op not supported by target.\n");
+                 return false;
+               }
+
+             for (i = 0; i < nelt; i++)
+               {
+                 if (3 * i + nelt0 < nelt)
+                   sel[3 * i + nelt0] = 3 * i + nelt0;
+                 if (3 * i + nelt1 < nelt)
+                   sel[3 * i + nelt1] = 3 * i + nelt1;
+                 if (3 * i + nelt2 < nelt)
+                   sel[3 * i + nelt2] = nelt + j2++;
+               }
+             if (!can_vec_perm_p (mode, false, sel))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf (MSG_MISSED_OPTIMIZATION,
+                                "permutaion op not supported by target.\n");
+                 return false;
+               }
+           }
+         return true;
        }
-      if (can_vec_perm_p (mode, false, sel))
+      else
        {
-         for (i = 0; i < nelt; i++)
-           sel[i] += nelt / 2;
-         if (can_vec_perm_p (mode, false, sel))
-           return true;
+         /* If length is not equal to 3 then only power of 2 is supported.  */
+         gcc_assert (exact_log2 (count) != -1);
+
+         for (i = 0; i < nelt / 2; i++)
+           {
+             sel[i * 2] = i;
+             sel[i * 2 + 1] = i + nelt;
+           }
+           if (can_vec_perm_p (mode, false, sel))
+             {
+               for (i = 0; i < nelt; i++)
+                 sel[i] += nelt / 2;
+               if (can_vec_perm_p (mode, false, sel))
+                 return true;
+             }
        }
     }

   if (dump_enabled_p ())
     dump_printf (MSG_MISSED_OPTIMIZATION,
-                 "interleave op not supported by target.\n");
+                "permutaion op not supported by target.\n");
   return false;
 }

@@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype,
unsigned HOST_WIDE_INT count)
 /* Function vect_permute_store_chain.

    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
-   a power of 2, generate interleave_high/low stmts to reorder the data
-   correctly for the stores.  Return the final references for stores in
-   RESULT_CHAIN.
+   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
+   the data correctly for the stores.  Return the final references for stores
+   in RESULT_CHAIN.

    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
    The input is 4 vectors each containing 8 elements.  We assign a number to
@@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
   gimple perm_stmt;
   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
   tree perm_mask_low, perm_mask_high;
-  unsigned int i, n;
+  tree data_ref;
+  tree perm3_mask_low, perm3_mask_high;
+  unsigned int i, n, log_length = exact_log2 (length);
   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);

@@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec<tree> dr_chain,
   memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));

-  for (i = 0, n = nelt / 2; i < n; i++)
+  if (length == 3)
     {
-      sel[i * 2] = i;
-      sel[i * 2 + 1] = i + nelt;
-    }
-  perm_mask_high = vect_gen_perm_mask (vectype, sel);
-  gcc_assert (perm_mask_high != NULL);
+      unsigned int j0 = 0, j1 = 0, j2 = 0;

-  for (i = 0; i < nelt; i++)
-    sel[i] += nelt / 2;
-  perm_mask_low = vect_gen_perm_mask (vectype, sel);
-  gcc_assert (perm_mask_low != NULL);
+      for (j = 0; j < 3; j++)
+        {
+         int nelt0 = ((3 - j) * nelt) % 3;
+         int nelt1 = ((3 - j) * nelt + 1) % 3;
+         int nelt2 = ((3 - j) * nelt + 2) % 3;

-  for (i = 0, n = exact_log2 (length); i < n; i++)
-    {
-      for (j = 0; j < length/2; j++)
-       {
-         vect1 = dr_chain[j];
-         vect2 = dr_chain[j+length/2];
+         for (i = 0; i < nelt; i++)
+           {
+             if (3 * i + nelt0 < nelt)
+               sel[3 * i + nelt0] = j0++;
+             if (3 * i + nelt1 < nelt)
+               sel[3 * i + nelt1] = nelt + j1++;
+             if (3 * i + nelt2 < nelt)
+               sel[3 * i + nelt2] = 0;
+           }
+         perm3_mask_low = vect_gen_perm_mask (vectype, sel);
+         gcc_assert (perm3_mask_low != NULL);
+
+         for (i = 0; i < nelt; i++)
+           {
+             if (3 * i + nelt0 < nelt)
+               sel[3 * i + nelt0] = 3 * i + nelt0;
+             if (3 * i + nelt1 < nelt)
+               sel[3 * i + nelt1] = 3 * i + nelt1;
+             if (3 * i + nelt2 < nelt)
+               sel[3 * i + nelt2] = nelt + j2++;
+           }
+         perm3_mask_high = vect_gen_perm_mask (vectype, sel);
+         gcc_assert (perm3_mask_high != NULL);
+
+         vect1 = dr_chain[0];
+         vect2 = dr_chain[1];

          /* Create interleaving stmt:
-            high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}>  */
-         high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
-         perm_stmt
-           = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
-                                           vect1, vect2, perm_mask_high);
+            low = VEC_PERM_EXPR <vect1, vect2,
+                                 {j, nelt, *, j + 1, nelt + j + 1, *,
+                                  j + 2, nelt + j + 2, *, ...}>  */
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
+         perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+                                                   vect1, vect2,
+                                                   perm3_mask_low);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-         (*result_chain)[2*j] = high;

+         vect1 = data_ref;
+         vect2 = dr_chain[2];
          /* Create interleaving stmt:
-            low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
-                                                nelt*3/2+1, ...}>  */
-         low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
-         perm_stmt
-           = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
-                                           vect1, vect2, perm_mask_low);
+            low = VEC_PERM_EXPR <vect1, vect2,
+                                 {0, 1, nelt + j, 3, 4, nelt + j + 1,
+                                  6, 7, nelt + j + 2, ...}>  */
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
+         perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+                                                   vect1, vect2,
+                                                   perm3_mask_high);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-         (*result_chain)[2*j+1] = low;
+         (*result_chain)[j] = data_ref;
+       }
+    }
+  else
+    {
+      /* If length is not equal to 3 then only power of 2 is supported.  */
+      gcc_assert (exact_log2 (length) != -1);
+
+      for (i = 0, n = nelt / 2; i < n; i++)
+       {
+         sel[i * 2] = i;
+         sel[i * 2 + 1] = i + nelt;
        }
-      memcpy (dr_chain.address (), result_chain->address (),
-             length * sizeof (tree));
+       perm_mask_high = vect_gen_perm_mask (vectype, sel);
+       gcc_assert (perm_mask_high != NULL);
+
+       for (i = 0; i < nelt; i++)
+         sel[i] += nelt / 2;
+       perm_mask_low = vect_gen_perm_mask (vectype, sel);
+       gcc_assert (perm_mask_low != NULL);
+
+       for (i = 0, n = log_length; i < n; i++)
+         {
+           for (j = 0; j < length/2; j++)
+             {
+               vect1 = dr_chain[j];
+               vect2 = dr_chain[j+length/2];
+
+               /* Create interleaving stmt:
+                  high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
+                                                       ...}>  */
+               high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
+               perm_stmt
+                 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
+                                                 vect1, vect2, perm_mask_high);
+               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+               (*result_chain)[2*j] = high;
+
+               /* Create interleaving stmt:
+                  low = VEC_PERM_EXPR <vect1, vect2,
+                                       {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
+                                        ...}>  */
+               low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
+               perm_stmt
+                 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
+                                                 vect1, vect2, perm_mask_low);
+               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+               (*result_chain)[2*j+1] = low;
+             }
+           memcpy (dr_chain.address (), result_chain->address (),
+                   length * sizeof (tree));
+         }
     }
 }

diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index b87c143..24d0b94 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
      include the cost of the permutes.  */
   if (!store_lanes_p && group_size > 1)
     {
-      /* Uses a high and low interleave operation for each needed permute.  */
-
-      int nstmts = ncopies * exact_log2 (group_size) * group_size;
+      /* Uses a high and low interleave or shuffle operations for each
+        needed permute.  */
+      int nstmts = ncopies * ceil_log2 (group_size) * group_size;
       inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
                                      stmt_info, 0, vect_body);


diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
new file mode 100644
index 0000000..cc1e72e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -g -ftree-vectorize -mssse3
-fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
+
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+  int i;
+  for (i = 0; i < size; i++)
+    {
+      out[0] = in[0] + in[1] + in[3];
+      out[1] = in[0] + in[2] + in[4];
+      out[2] = in[1] + in[2] + in[4];
+      in += 4;
+      out += 3;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]