This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [SVE] PR86753


On Tue, 27 Aug 2019 at 17:29, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Tue, Aug 27, 2019 at 11:58 AM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >> ifcvt produces:
> >>
> >>   <bb 3> [local count: 1063004407]:
> >>   # i_34 = PHI <i_30(10), 0(21)>
> >>   # ivtmp_5 = PHI <ivtmp_6(10), 100(21)>
> >>   _1 = (long unsigned int) i_34;
> >>   _2 = _1 * 2;
> >>   _3 = a_23(D) + _2;
> >>   _4 = *_3;
> >>   _7 = b_24(D) + _2;
> >>   _49 = _4 > 0;
> >>   _8 = .MASK_LOAD (_7, 16B, _49);
> >>   _12 = _4 > 0;
> >>   _13 = _8 > 0;
> >>   _9 = _12 & _13;
> >>   _10 = _4 > 0;
> >>   _11 = _8 > 0;
> >>   _27 = ~_11;
> >>   _15 = _10 & _27;
> >>   _14 = c_25(D) + _2;
> >>   iftmp.0_26 = .MASK_LOAD (_14, 16B, _15);
> >>   iftmp.0_19 = _9 ? _4 : iftmp.0_26;
> >>   _17 = x_28(D) + _2;
> >>   _50 = _4 > 0;
> >>   .MASK_STORE (_17, 16B, _50, iftmp.0_19);
> >>   i_30 = i_34 + 1;
> >>   ivtmp_6 = ivtmp_5 - 1;
> >>   if (ivtmp_6 != 0)
> >>     goto <bb 10>; [98.99%]
> >>   else
> >>     goto <bb 9>; [1.01%]
> >>
> >>   <bb 10> [local count: 1052266994]:
> >>   goto <bb 3>; [100.00%]
> >>
> >> which has 4 copies of _4 > 0 (a[i] > 0) and 2 copies of _8 > 0 (b[i] > 0).
> >
> > Huh.  if-conversion does
> >
> >   /* Now all statements are if-convertible.  Combine all the basic
> >      blocks into one huge basic block doing the if-conversion
> >      on-the-fly.  */
> >   combine_blocks (loop);
> >
> >   /* Delete dead predicate computations.  */
> >   ifcvt_local_dce (loop->header);
> >
> >   /* Perform local CSE, this esp. helps the vectorizer analysis if loads
> >      and stores are involved.  CSE only the loop body, not the entry
> >      PHIs, those are to be kept in sync with the non-if-converted copy.
> >      ???  We'll still keep dead stores though.  */
> >   exit_bbs = BITMAP_ALLOC (NULL);
> >   bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
> >   bitmap_set_bit (exit_bbs, loop->latch->index);
> >   todo |= do_rpo_vn (cfun, loop_preheader_edge (loop), exit_bbs);
> >
> > which should remove those redundant _4 > 0 checks.  In fact when I
> > run this on x86_64 with -mavx512bw I see
> >
> >   <bb 3> [local count: 1063004407]:
> >   # i_25 = PHI <i_20(9), 0(21)>
> >   # ivtmp_24 = PHI <ivtmp_12(9), 100(21)>
> >   _1 = (long unsigned int) i_25;
> >   _2 = _1 * 2;
> >   _3 = a_14(D) + _2;
> >   _4 = *_3;
> >   _5 = b_15(D) + _2;
> >   _49 = _4 > 0;
> >   _6 = .MASK_LOAD (_5, 16B, _49);
> >   _22 = _6 > 0;
> >   _28 = ~_22;
> >   _29 = _28 & _49;
> >   _7 = c_16(D) + _2;
> >   iftmp.0_17 = .MASK_LOAD (_7, 16B, _29);
> >   iftmp.0_10 = _29 ? iftmp.0_17 : _4;
> >   _8 = x_18(D) + _2;
> >   .MASK_STORE (_8, 16B, _49, iftmp.0_10);
> >   i_20 = i_25 + 1;
> >   ivtmp_12 = ivtmp_24 - 1;
> >   if (ivtmp_12 != 0)
> >
> > after if-conversion (that should be the case already on the GCC 9 branch).
>
> Gah, sorry for the noise.  Turns out I still had a local change that was
> trying to poke the patch into doing something wrong.  Will try to check
> my facts more carefully next time.
>
> The redundant pattern statements I was thinking of come from
> vect_recog_mask_conversion_pattern, but I guess that isn't so
> interesting here.
>
> So yeah, let's drop this whole vn thing for now...
The attached version drops VN, and uses operand_equal_p for comparison.
Does it look OK?

Thanks,
Prathamesh
>
> Thanks,
> Richard
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
index 5c04bcdb3f5..a1b0667dab5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
@@ -15,5 +15,9 @@ f (double *restrict a, double *restrict b, double *restrict c,
     }
 }
 
-/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+/* See https://gcc.gnu.org/ml/gcc-patches/2019-08/msg01644.html
+   for XFAILing the below test.  */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tfmad\t} } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b0cbbac0cb5..0fc7171d7ea 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -8609,6 +8609,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
       basic_block bb = bbs[i];
       stmt_vec_info stmt_info;
 
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+	loop_vinfo->cond_to_vec_mask = new cond_vmask_map_type (8);
+
       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 	   gsi_next (&si))
         {
@@ -8717,6 +8720,12 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 		}
 	    }
 	}
+
+      if (loop_vinfo->cond_to_vec_mask)
+	{
+	  delete loop_vinfo->cond_to_vec_mask;
+	  loop_vinfo->cond_to_vec_mask = 0;
+	}
     }				/* BBs in loop */
 
   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 1e2dfe5d22d..862206b3256 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1989,17 +1989,31 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 
 static tree
 prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
-			 gimple_stmt_iterator *gsi)
+			 gimple_stmt_iterator *gsi, tree mask,
+			 cond_vmask_map_type *cond_to_vec_mask)
 {
   gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
   if (!loop_mask)
     return vec_mask;
 
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
+
+  tree *slot = 0;
+  if (cond_to_vec_mask)
+    {
+      cond_vmask_key cond (mask, loop_mask);
+      slot = &cond_to_vec_mask->get_or_insert (cond);
+      if (*slot)
+	return *slot;
+    }
+
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
 					  vec_mask, loop_mask);
   gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
+
+  if (slot)
+    *slot = and_res;
   return and_res;
 }
 
@@ -3514,8 +3528,10 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 			  gcc_assert (ncopies == 1);
 			  tree mask = vect_get_loop_mask (gsi, masks, vec_num,
 							  vectype_out, i);
+			  tree scalar_mask = gimple_call_arg (gsi_stmt (*gsi), mask_opno);
 			  vargs[mask_opno] = prepare_load_store_mask
-			    (TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
+			    (TREE_TYPE (mask), mask, vargs[mask_opno], gsi,
+			     scalar_mask, vinfo->cond_to_vec_mask);
 			}
 
 		      gcall *call;
@@ -3564,9 +3580,11 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	    {
 	      tree mask = vect_get_loop_mask (gsi, masks, ncopies,
 					      vectype_out, j);
+	      tree scalar_mask = gimple_call_arg (gsi_stmt (*gsi), mask_opno);
 	      vargs[mask_opno]
 		= prepare_load_store_mask (TREE_TYPE (mask), mask,
-					   vargs[mask_opno], gsi);
+					   vargs[mask_opno], gsi,
+					   scalar_mask, vinfo->cond_to_vec_mask);
 	    }
 
 	  if (cfn == CFN_GOMP_SIMD_LANE)
@@ -8109,7 +8127,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 					     vectype, j);
 	  if (vec_mask)
 	    final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						  vec_mask, gsi);
+						  vec_mask, gsi, mask,
+						  vinfo->cond_to_vec_mask);
 
 	  gcall *call;
 	  if (final_mask)
@@ -8163,7 +8182,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 						 vectype, vec_num * j + i);
 	      if (vec_mask)
 		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						      vec_mask, gsi);
+						      vec_mask, gsi, mask,
+						      vinfo->cond_to_vec_mask);
 
 	      if (memory_access_type == VMAT_GATHER_SCATTER)
 		{
@@ -9304,7 +9324,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 					     vectype, j);
 	  if (vec_mask)
 	    final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						  vec_mask, gsi);
+						  vec_mask, gsi, mask,
+						  vinfo->cond_to_vec_mask);
 
 	  gcall *call;
 	  if (final_mask)
@@ -9355,7 +9376,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 						 vectype, vec_num * j + i);
 	      if (vec_mask)
 		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						      vec_mask, gsi);
+						      vec_mask, gsi, mask,
+						      vinfo->cond_to_vec_mask);
 
 	      if (i > 0)
 		dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
@@ -9975,6 +9997,38 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   /* Handle cond expr.  */
   for (j = 0; j < ncopies; j++)
     {
+      tree vec_mask = NULL_TREE;
+
+      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  && TREE_CODE_CLASS (TREE_CODE (cond_expr)) == tcc_comparison
+	  && loop_vinfo->cond_to_vec_mask)
+	{
+	  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+	  if (masks)
+	    {
+	      tree loop_mask = vect_get_loop_mask (gsi, masks,
+						   ncopies, vectype, j);
+
+	      cond_vmask_key cond (cond_expr, loop_mask);
+	      tree *slot = loop_vinfo->cond_to_vec_mask->get (cond);
+	      if (slot && *slot)
+		vec_mask = *slot;
+	      else
+		{
+		  cond.cond_ops.code
+		    = invert_tree_comparison (cond.cond_ops.code, true);
+		  slot = loop_vinfo->cond_to_vec_mask->get (cond);
+		  if (slot && *slot)
+		    {
+		      vec_mask = *slot;
+		      tree tmp = then_clause;
+		      then_clause = else_clause;
+		      else_clause = tmp;
+		    }
+		}
+	    }
+	}
+
       stmt_vec_info new_stmt_info = NULL;
       if (j == 0)
 	{
@@ -10054,6 +10108,8 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 
 	  if (masked)
 	    vec_compare = vec_cond_lhs;
+	  else if (vec_mask)
+	    vec_compare = vec_mask;
 	  else
 	    {
 	      vec_cond_rhs = vec_oprnds1[i];
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index dc181524744..fb404d4b91e 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -461,7 +461,8 @@ vec_info::vec_info (vec_info::vec_kind kind_in, void *target_cost_data_in,
 		    vec_info_shared *shared_)
   : kind (kind_in),
     shared (shared_),
-    target_cost_data (target_cost_data_in)
+    target_cost_data (target_cost_data_in),
+    cond_to_vec_mask (0)
 {
   stmt_vec_infos.create (50);
 }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1456cde4c2c..81629361f14 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -26,6 +26,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
 #include "tree-data-ref.h"
 #include "tree-hash-traits.h"
 #include "target.h"
+#include "hash-map.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -193,6 +194,81 @@ public:
   poly_uint64 min_value;
 };
 
+struct cond_vmask_key
+{
+  cond_vmask_key (tree t, tree loop_mask_)
+    : cond_ops (t), loop_mask (loop_mask_)
+  {}
+
+  hashval_t hash () const
+  {
+    inchash::hash h;
+    h.add_int (cond_ops.code);
+    h.add_int (TREE_HASH (cond_ops.op0));
+    h.add_int (TREE_HASH (cond_ops.op1));
+    h.add_int (TREE_HASH (loop_mask));
+    return h.end ();
+  }
+
+  void mark_empty ()
+  {
+    loop_mask = NULL_TREE;
+  }
+
+  bool is_empty ()
+  {
+    return loop_mask == NULL_TREE;
+  }
+
+  tree_cond_ops cond_ops;
+  tree loop_mask;
+};
+
+inline bool operator== (const cond_vmask_key& c1, const cond_vmask_key& c2)
+{
+  return c1.loop_mask == c2.loop_mask
+	 && c1.cond_ops == c2.cond_ops;
+}
+
+struct cond_vmask_key_traits
+{
+  typedef cond_vmask_key value_type;
+  typedef cond_vmask_key compare_type;
+
+  static inline hashval_t hash (value_type v)
+  {
+    return v.hash ();
+  }
+
+  static inline bool equal (value_type existing, value_type candidate)
+  {
+    return existing == candidate;
+  }
+
+  static inline void mark_empty (value_type& v)
+  {
+    v.mark_empty ();
+  }
+
+  static inline bool is_empty (value_type v)
+  {
+    return v.is_empty ();
+  }
+
+  static void mark_deleted (value_type&) {}
+
+  static inline bool is_deleted (value_type)
+  {
+    return false;
+  }
+
+  static inline void remove (value_type &) {}
+};
+
+typedef hash_map<cond_vmask_key, tree,
+		 simple_hashmap_traits <cond_vmask_key_traits, tree> >
+  cond_vmask_map_type;
+
 /* Vectorizer state shared between different analyses like vector sizes
    of the same CFG region.  */
 class vec_info_shared {
@@ -255,6 +331,8 @@ public:
   /* Cost data used by the target cost model.  */
   void *target_cost_data;
 
+  cond_vmask_map_type *cond_to_vec_mask;
+
 private:
   stmt_vec_info new_stmt_vec_info (gimple *stmt);
   void set_vinfo_for_stmt (gimple *, stmt_vec_info);
diff --git a/gcc/tree.c b/gcc/tree.c
index 8f80012c6e8..32a8fcf1eb8 100644
--- a/gcc/tree.c
+++ b/gcc/tree.c
@@ -15204,6 +15204,44 @@ max_object_size (void)
   return TYPE_MAX_VALUE (ptrdiff_type_node);
 }
 
+/* If code(T) is comparison op or def of comparison stmt,
+   extract its operands.
+   Else return <NE_EXPR, T, 0>.  */
+
+tree_cond_ops::tree_cond_ops (tree t)
+{
+  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
+    {
+      this->code = TREE_CODE (t);
+      this->op0 = TREE_OPERAND (t, 0);
+      this->op1 = TREE_OPERAND (t, 1);
+      return;
+    }
+
+  else if (TREE_CODE (t) == SSA_NAME)
+    {
+      gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t));
+      if (stmt)
+	{
+	  tree_code code = gimple_assign_rhs_code (stmt);
+	  if (TREE_CODE_CLASS (code) == tcc_comparison)
+	    {
+	      this->code = code;
+	      this->op0 = gimple_assign_rhs1 (stmt);
+	      this->op1 = gimple_assign_rhs2 (stmt);
+	      return;
+	    }
+	}
+
+      this->code = NE_EXPR;
+      this->op0 = t;
+      this->op1 = build_zero_cst (TREE_TYPE (t));
+    }
+
+  else
+    gcc_unreachable ();
+}
+
 #if CHECKING_P
 
 namespace selftest {
diff --git a/gcc/tree.h b/gcc/tree.h
index 94dbb95a78a..e6d6e9541c3 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -6141,4 +6141,25 @@ public:
   operator location_t () const { return m_combined_loc; }
 };
 
+struct tree_cond_ops
+{
+  tree_code code;
+  tree op0;
+  tree op1;
+
+  tree_cond_ops (tree);
+};
+
+/* ??? Is it a good idea to include fold-const.h
+       only for operand_equal_p?  */
+extern bool operand_equal_p (const_tree, const_tree, unsigned int);
+
+inline bool
+operator== (const tree_cond_ops& o1, const tree_cond_ops &o2)
+{
+  return o1.code == o2.code
+	 && operand_equal_p (o1.op0, o2.op0, 0)
+	 && operand_equal_p (o1.op1, o2.op1, 0);
+}
+
 #endif  /* GCC_TREE_H  */

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]