This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Handle unpropagated assignments in SLP


On Thu, Jun 1, 2017 at 8:45 AM, Richard Sandiford
<richard.sandiford@linaro.org> wrote:
> Some of the SVE patches extend SLP to predicated operations created by
> ifcvt.  However, ifcvt currently forces the mask into a temporary:
>
>                 mask = ifc_temp_var (TREE_TYPE (mask), mask, &gsi);
>
> and at the moment SLP doesn't handle simple assignments like:
>
>    SSA_NAME = SSA_NAME
>    SSA_NAME = <constant>
>
> (It does of course handle:
>
>    SSA_NAME = SSA_NAME op SSA_NAME
>    SSA_NAME = SSA_NAME op <constant>)
>
> I realise copy propagation should usually ensure that these simple
> assignments don't occur, but normal loop vectorisation handles them
> just fine, and SLP does too once we get over the initial validity check.
> I thought this patch might be useful even if we decide that we don't want
> ifcvt to create a temporary mask in such cases.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>
> Thanks,
> Richard
>
>
> 2017-06-01  Richard Sandiford  <richard.sandiford@linaro.org>
>
> gcc/
>         * tree-vect-slp.c (vect_build_slp_tree_1): Allow mixtures of SSA
>         names and constants, without treating them as separate operations.
>         Explicitly reject stores.
>
> gcc/testsuite/
>         * gcc.dg/vect/slp-temp-1.c: New test.
>         * gcc.dg/vect/slp-temp-2.c: Likewise.
>         * gcc.dg/vect/slp-temp-3.c: Likewise.
>
> Index: gcc/tree-vect-slp.c
> ===================================================================
> --- gcc/tree-vect-slp.c 2017-05-18 07:51:12.387750673 +0100
> +++ gcc/tree-vect-slp.c 2017-06-01 07:21:44.094320070 +0100
> @@ -671,6 +671,13 @@ vect_build_slp_tree_1 (vec_info *vinfo,
>                first_op1 = gimple_assign_rhs2 (stmt);
>              }
>         }
> +      else if ((TREE_CODE_CLASS (rhs_code) == tcc_constant
> +               || rhs_code == SSA_NAME)
> +              && (TREE_CODE_CLASS (first_stmt_code) == tcc_constant
> +                  || first_stmt_code == SSA_NAME))
> +       /* Merging two simple rvalues is OK and doesn't count as two
> +          operations.  */
> +       ;


But this doesn't help in the case one stmt has the copy/constant
propagated and one not ...

>        else
>         {
>           if (first_stmt_code != rhs_code
> @@ -800,11 +807,14 @@ vect_build_slp_tree_1 (vec_info *vinfo,
>             }
>
>           /* Not memory operation.  */
> -         if (TREE_CODE_CLASS (rhs_code) != tcc_binary
> -             && TREE_CODE_CLASS (rhs_code) != tcc_unary
> -             && TREE_CODE_CLASS (rhs_code) != tcc_expression
> -             && TREE_CODE_CLASS (rhs_code) != tcc_comparison
> -             && rhs_code != CALL_EXPR)
> +         if (REFERENCE_CLASS_P (lhs)
> +             || (TREE_CODE_CLASS (rhs_code) != tcc_binary
> +                 && TREE_CODE_CLASS (rhs_code) != tcc_unary
> +                 && TREE_CODE_CLASS (rhs_code) != tcc_expression
> +                 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
> +                 && TREE_CODE_CLASS (rhs_code) != tcc_constant
> +                 && rhs_code != CALL_EXPR
> +                 && rhs_code != SSA_NAME))

I think this whole block is dead code ... we can't ever visit stores and the
only case the rhs_code == tcc_reference check above would miss is
plain decls (but non-indexed decls should be rejected by dataref analysis
already).

I think a better fix is to ensure we do not have non-copy/constant propagated
IL fed into the vectorizer.

Richard.

>             {
>               if (dump_enabled_p ())
>                 {
> Index: gcc/testsuite/gcc.dg/vect/slp-temp-1.c
> ===================================================================
> --- /dev/null   2017-06-01 07:09:35.344016119 +0100
> +++ gcc/testsuite/gcc.dg/vect/slp-temp-1.c      2017-06-01 07:39:24.406603119 +0100
> @@ -0,0 +1,71 @@
> +/* { dg-do run { target { lp64 || ilp32 } } } */
> +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */
> +
> +void __GIMPLE (startwith ("loop"))
> +f (int *x, int n)
> +{
> +  int i_in;
> +  int i_out;
> +  int double_i;
> +
> +  long unsigned int index_0;
> +  long unsigned int offset_0;
> +  int *addr_0;
> +  int temp_0;
> +
> +  long unsigned int index_1;
> +  long unsigned int offset_1;
> +  int *addr_1;
> +  int temp_1;
> +
> + entry:
> +  goto loop;
> +
> + loop:
> +  i_in = __PHI (entry: 0, latch: i_out);
> +  double_i = i_in * 2;
> +
> +  index_0 = (long unsigned int) double_i;
> +  offset_0 = index_0 * 4ul;
> +  addr_0 = x_1(D) + offset_0;
> +  temp_0 = 1;
> +  *addr_0 = temp_0;
> +
> +  index_1 = index_0 + 1ul;
> +  offset_1 = index_1 * 4ul;
> +  addr_1 = x_1(D) + offset_1;
> +  temp_1 = 3;
> +  *addr_1 = temp_1;
> +
> +  i_out = i_in + 1;
> +  if (n_2(D) > i_out)
> +    goto latch;
> +  else
> +    goto exit;
> +
> + latch:
> +  goto loop;
> +
> + exit:
> +  return;
> +}
> +
> +#define N 1024
> +
> +int
> +main (void)
> +{
> +  int a[N * 2];
> +  f (a, N);
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (a[i * 2] != 1
> +         || a[i * 2 + 1] != 3)
> +       __builtin_abort ();
> +      asm volatile ("" ::: "memory");
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */
> Index: gcc/testsuite/gcc.dg/vect/slp-temp-2.c
> ===================================================================
> --- /dev/null   2017-06-01 07:09:35.344016119 +0100
> +++ gcc/testsuite/gcc.dg/vect/slp-temp-2.c      2017-06-01 07:39:30.914219838 +0100
> @@ -0,0 +1,71 @@
> +/* { dg-do run { target { lp64 || ilp32 } } } */
> +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */
> +
> +void __GIMPLE (startwith ("loop"))
> +f (int *x, int n, int foo)
> +{
> +  int i_in;
> +  int i_out;
> +  int double_i;
> +
> +  long unsigned int index_0;
> +  long unsigned int offset_0;
> +  int *addr_0;
> +  int temp_0;
> +
> +  long unsigned int index_1;
> +  long unsigned int offset_1;
> +  int *addr_1;
> +  int temp_1;
> +
> + entry:
> +  goto loop;
> +
> + loop:
> +  i_in = __PHI (entry: 0, latch: i_out);
> +  double_i = i_in * 2;
> +
> +  index_0 = (long unsigned int) double_i;
> +  offset_0 = index_0 * 4ul;
> +  addr_0 = x_1(D) + offset_0;
> +  temp_0 = 1;
> +  *addr_0 = temp_0;
> +
> +  index_1 = index_0 + 1ul;
> +  offset_1 = index_1 * 4ul;
> +  addr_1 = x_1(D) + offset_1;
> +  temp_1 = foo_2(D);
> +  *addr_1 = temp_1;
> +
> +  i_out = i_in + 1;
> +  if (n_3(D) > i_out)
> +    goto latch;
> +  else
> +    goto exit;
> +
> + latch:
> +  goto loop;
> +
> + exit:
> +  return;
> +}
> +
> +#define N 1024
> +
> +int
> +main (void)
> +{
> +  int a[N * 2];
> +  f (a, N, 11);
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (a[i * 2] != 1
> +         || a[i * 2 + 1] != 11)
> +       __builtin_abort ();
> +      asm volatile ("" ::: "memory");
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */
> Index: gcc/testsuite/gcc.dg/vect/slp-temp-3.c
> ===================================================================
> --- /dev/null   2017-06-01 07:09:35.344016119 +0100
> +++ gcc/testsuite/gcc.dg/vect/slp-temp-3.c      2017-06-01 07:39:40.925589561 +0100
> @@ -0,0 +1,84 @@
> +/* { dg-do run { target { lp64 || ilp32 } } } */
> +/* { dg-additional-options "-fgimple -fno-tree-copy-prop" } */
> +
> +void __GIMPLE (startwith ("loop")) __attribute__ ((noinline, noclone))
> +f (int *x, int n)
> +{
> +  int i_in;
> +  int i_out;
> +  int double_i;
> +
> +  long unsigned int index_0;
> +  long unsigned int offset_0;
> +  int *addr_0;
> +  int old_0;
> +  int new_0;
> +  int temp_0;
> +
> +  long unsigned int index_1;
> +  long unsigned int offset_1;
> +  int *addr_1;
> +  int old_1;
> +  int new_1;
> +  int temp_1;
> +
> + entry:
> +  goto loop;
> +
> + loop:
> +  i_in = __PHI (entry: 0, latch: i_out);
> +  double_i = i_in * 2;
> +
> +  index_0 = (long unsigned int) double_i;
> +  offset_0 = index_0 * 4ul;
> +  addr_0 = x_1(D) + offset_0;
> +  old_0 = *addr_0;
> +  new_0 = old_0 + 1;
> +  temp_0 = new_0;
> +  *addr_0 = temp_0;
> +
> +  index_1 = index_0 + 1ul;
> +  offset_1 = index_1 * 4ul;
> +  addr_1 = x_1(D) + offset_1;
> +  old_1 = *addr_1;
> +  new_1 = old_1 + 3;
> +  temp_1 = new_1;
> +  *addr_1 = temp_1;
> +
> +  i_out = i_in + 1;
> +  if (n_2(D) > i_out)
> +    goto latch;
> +  else
> +    goto exit;
> +
> + latch:
> +  goto loop;
> +
> + exit:
> +  return;
> +}
> +
> +#define N 1024
> +
> +int
> +main (void)
> +{
> +  int a[N * 2];
> +  for (int i = 0; i < N * 2; ++i)
> +    {
> +      a[i] = i * 4;
> +      asm volatile ("" ::: "memory");
> +    }
> +  f (a, N);
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (a[i * 2] != i * 8 + 1
> +         || a[i * 2 + 1] != i * 8 + 7)
> +       __builtin_abort ();
> +      asm volatile ("" ::: "memory");
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int } } } */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]