[PATCH] Fix PR66623

Fri Jun 9 15:48:00 GMT 2017

On June 9, 2017 5:32:10 PM GMT+02:00, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>On 8 June 2017 at 15:49, Richard Biener <rguenther@suse.de> wrote:
>> On Thu, 8 Jun 2017, Richard Biener wrote:
>>
>>>
>>> The following fixes unsafe vectorization of reductions in outer loop
>>> vectorization.  It combines it with a bit of cleanup in the affected
>>> function.
>>>
>>> Bootstrap and regtest running on x86_64-unknown-linux-gnu.
>>
>> Re-testing the following variant -- multiple loop-closed PHI nodes
>> for the same SSA name happen.
>>
>> Richard.
>>
>> 2017-06-08  Richard Biener  <rguenther@suse.de>
>>
>>         PR tree-optimization/66623
>>         * tree-vect-loop.c (vect_is_simple_reduction): Cleanup,
>>         refactor check_reduction into two parts, properly computing
>>         whether we have to check reduction validity for outer loop
>>         vectorization.
>>
>>         * gcc.dg/vect/pr66623.c: New testcase.
>>
>
>Hi Richard,
>
>The new testcase fails on armeb* targets:
>FAIL:    gcc.dg/vect/pr66623.c -flto -ffat-lto-objects execution test
>FAIL:    gcc.dg/vect/pr66623.c execution test

Ah, the xfail probably isn't enough.  Can you try a dg-skip or so?

>Christophe
>
>
>> Index: gcc/tree-vect-loop.c
>> ===================================================================
>> --- gcc/tree-vect-loop.c        (revision 249007)
>> +++ gcc/tree-vect-loop.c        (working copy)
>> @@ -2727,8 +2727,6 @@ vect_is_simple_reduction (loop_vec_info
>>  {
>>    struct loop *loop = (gimple_bb (phi))->loop_father;
>>    struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
>> -  edge latch_e = loop_latch_edge (loop);
>> -  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
>>    gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt =
>NULL;
>>    enum tree_code orig_code, code;
>>    tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
>> @@ -2742,11 +2740,6 @@ vect_is_simple_reduction (loop_vec_info
>>    *double_reduc = false;
>>    *v_reduc_type = TREE_CODE_REDUCTION;
>>
>> -  /* Check validity of the reduction only for the innermost loop. 
>*/
>> -  bool check_reduction = ! flow_loop_nested_p (vect_loop, loop);
>> -  gcc_assert ((check_reduction && loop == vect_loop)
>> -              || (!check_reduction && flow_loop_nested_p (vect_loop,
>loop)));
>> -
>>    name = PHI_RESULT (phi);
>>    /* ???  If there are no uses of the PHI result the inner loop
>reduction
>>       won't be detected as possibly double-reduction by
>vectorizable_reduction
>> @@ -2775,13 +2768,15 @@ vect_is_simple_reduction (loop_vec_info
>>          {
>>            if (dump_enabled_p ())
>>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> -                            "reduction used in loop.\n");
>> +                            "reduction value used in loop.\n");
>>            return NULL;
>>          }
>>
>>        phi_use_stmt = use_stmt;
>>      }
>>
>> +  edge latch_e = loop_latch_edge (loop);
>> +  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
>>    if (TREE_CODE (loop_arg) != SSA_NAME)
>>      {
>>        if (dump_enabled_p ())
>> @@ -2795,18 +2790,22 @@ vect_is_simple_reduction (loop_vec_info
>>      }
>>
>>    def_stmt = SSA_NAME_DEF_STMT (loop_arg);
>> -  if (!def_stmt)
>> +  if (gimple_nop_p (def_stmt))
>>      {
>>        if (dump_enabled_p ())
>>         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> -                        "reduction: no def_stmt.\n");
>> +                        "reduction: no def_stmt\n");
>>        return NULL;
>>      }
>>
>>    if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) !=
>GIMPLE_PHI)
>>      {
>>        if (dump_enabled_p ())
>> -       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
>> +       {
>> +         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                          "reduction: unhandled reduction operation:
>");
>> +         dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
>def_stmt, 0);
>> +       }
>>        return NULL;
>>      }
>>
>> @@ -2822,6 +2821,7 @@ vect_is_simple_reduction (loop_vec_info
>>      }
>>
>>    nloop_uses = 0;
>> +  auto_vec<gphi *, 3> lcphis;
>>    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
>>      {
>>        gimple *use_stmt = USE_STMT (use_p);
>> @@ -2829,6 +2829,9 @@ vect_is_simple_reduction (loop_vec_info
>>         continue;
>>        if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
>>         nloop_uses++;
>> +      else
>> +       /* We can have more than one loop-closed PHI.  */
>> +       lcphis.safe_push (as_a <gphi *> (use_stmt));
>>        if (nloop_uses > 1)
>>         {
>>           if (dump_enabled_p ())
>> @@ -2873,6 +2876,27 @@ vect_is_simple_reduction (loop_vec_info
>>        return NULL;
>>      }
>>
>> +  /* If we are vectorizing an inner reduction we are executing that
>> +     in the original order only in case we are not dealing with a
>> +     double reduction.  */
>> +  bool check_reduction = true;
>> +  if (flow_loop_nested_p (vect_loop, loop))
>> +    {
>> +      gphi *lcphi;
>> +      unsigned i;
>> +      check_reduction = false;
>> +      FOR_EACH_VEC_ELT (lcphis, i, lcphi)
>> +       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result
>(lcphi))
>> +         {
>> +           gimple *use_stmt = USE_STMT (use_p);
>> +           if (is_gimple_debug (use_stmt))
>> +             continue;
>> +           if (! flow_bb_inside_loop_p (vect_loop, gimple_bb
>(use_stmt)))
>> +             check_reduction = true;
>> +         }
>> +    }
>> +
>> +  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
>>    code = orig_code = gimple_assign_rhs_code (def_stmt);
>>
>>    /* We can handle "res -= x[i]", which is non-associative by
>> @@ -2887,27 +2911,8 @@ vect_is_simple_reduction (loop_vec_info
>>
>>    if (code == COND_EXPR)
>>      {
>> -      if (check_reduction)
>> +      if (! nested_in_vect_loop)
>>         *v_reduc_type = COND_REDUCTION;
>> -    }
>> -  else if (!commutative_tree_code (code) || !associative_tree_code
>(code))
>> -    {
>> -      if (dump_enabled_p ())
>> -       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
>> -                       "reduction: not commutative/associative: ");
>> -      return NULL;
>> -    }
>> -
>> -  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
>> -    {
>> -      if (code != COND_EXPR)
>> -        {
>> -         if (dump_enabled_p ())
>> -           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
>> -                           "reduction: not binary operation: ");
>> -
>> -          return NULL;
>> -        }
>>
>>        op3 = gimple_assign_rhs1 (def_stmt);
>>        if (COMPARISON_CLASS_P (op3))
>> @@ -2918,30 +2923,35 @@ vect_is_simple_reduction (loop_vec_info
>>
>>        op1 = gimple_assign_rhs2 (def_stmt);
>>        op2 = gimple_assign_rhs3 (def_stmt);
>> -
>> -      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) !=
>SSA_NAME)
>> -        {
>> -          if (dump_enabled_p ())
>> -            report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
>> -                           "reduction: uses not ssa_names: ");
>> -
>> -          return NULL;
>> -        }
>>      }
>> -  else
>> +  else if (!commutative_tree_code (code) || !associative_tree_code
>(code))
>> +    {
>> +      if (dump_enabled_p ())
>> +       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
>> +                       "reduction: not commutative/associative: ");
>> +      return NULL;
>> +    }
>> +  else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
>>      {
>>        op1 = gimple_assign_rhs1 (def_stmt);
>>        op2 = gimple_assign_rhs2 (def_stmt);
>> +    }
>> +  else
>> +    {
>> +      if (dump_enabled_p ())
>> +       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
>> +                       "reduction: not handled operation: ");
>> +      return NULL;
>> +    }
>>
>> -      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) !=
>SSA_NAME)
>> -        {
>> -          if (dump_enabled_p ())
>> -           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
>> -                           "reduction: uses not ssa_names: ");
>> +  if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
>> +    {
>> +      if (dump_enabled_p ())
>> +       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
>> +                       "reduction: both uses not ssa_names: ");
>>
>> -          return NULL;
>> -        }
>> -   }
>> +      return NULL;
>> +    }
>>
>>    type = TREE_TYPE (gimple_assign_lhs (def_stmt));
>>    if ((TREE_CODE (op1) == SSA_NAME
>> @@ -3091,7 +3101,7 @@ vect_is_simple_reduction (loop_vec_info
>>                            == vect_internal_def
>>                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
>>      {
>> -      if (check_reduction && orig_code != MINUS_EXPR)
>> +      if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
>>         {
>>           /* Check if we can swap operands (just for simplicity - so
>that
>>              the rest of the code can assume that the reduction
>variable
>> @@ -3145,7 +3155,8 @@ vect_is_simple_reduction (loop_vec_info
>>      }
>>
>>    /* Try to find SLP reduction chain.  */
>> -  if (check_reduction && code != COND_EXPR
>> +  if (! nested_in_vect_loop
>> +      && code != COND_EXPR
>>        && vect_is_slp_reduction (loop_info, phi, def_stmt))
>>      {
>>        if (dump_enabled_p ())
>> Index: gcc/testsuite/gcc.dg/vect/pr66623.c
>> ===================================================================
>> --- gcc/testsuite/gcc.dg/vect/pr66623.c (revision 0)
>> +++ gcc/testsuite/gcc.dg/vect/pr66623.c (working copy)
>> @@ -0,0 +1,86 @@
>> +/* { dg-require-effective-target vect_float } */
>> +
>> +#include "tree-vect.h"
>> +
>> +extern void abort (void);
>> +
>> +#define OP *
>> +#define INIT 1.0
>> +
>> +float __attribute__((noinline,noclone))
>> +foo (float *__restrict__ i)
>> +{
>> +  float l = INIT;
>> +  int a;
>> +  int b;
>> +
>> +  for (a = 0; a < 4; a++)
>> +    for (b = 0; b < 4; b++)
>> +      l = l OP i[b];
>> +
>> +  return l;
>> +}
>> +
>> +float __attribute__((noinline,noclone))
>> +foo_ref (float *__restrict__ i)
>> +{
>> +  float l = INIT;
>> +
>> +  l = l OP i[0];
>> +  l = l OP i[1];
>> +  l = l OP i[2];
>> +  l = l OP i[3];
>> +
>> +  l = l OP i[0];
>> +  l = l OP i[1];
>> +  l = l OP i[2];
>> +  l = l OP i[3];
>> +
>> +  l = l OP i[0];
>> +  l = l OP i[1];
>> +  l = l OP i[2];
>> +  l = l OP i[3];
>> +
>> +  l = l OP i[0];
>> +  l = l OP i[1];
>> +  l = l OP i[2];
>> +  l = l OP i[3];
>> +
>> +  return l;
>> +}
>> +
>> +union u
>> +{
>> +  float f;
>> +  unsigned int u;
>> +};
>> +
>> +int
>> +main (void)
>> +{
>> +  union u res, res2;
>> +  float a[4];
>> +
>> +  if (sizeof (float) != sizeof (unsigned int))
>> +    return 0;
>> +
>> +  check_vect ();
>> +
>> +  a[0] = 0.01;
>> +  a[1] = 0.01;
>> +  a[2] = 0.01;
>> +  a[3] = 1.0;
>> +
>> +  res.f = foo_ref (a);
>> +
>> +  res2.f = foo (a);
>> +
>> +  if (res.u != res2.u)
>> +    abort ();
>> +
>> +  return 0;
>> +}
>> +
>> +/* need -ffast-math to vectorize this loop.  */
>> +/* ARM NEON passes -ffast-math to these tests, so expect this to
>fail.  */
>> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {
>xfail arm_neon_ok } } } */