[PATCH][RFC] tree-optimization/97832 - handle associatable chains in SLP discovery

Richard Biener rguenther@suse.de
Mon Nov 23 16:00:56 GMT 2020


This makes SLP discovery handle associatable (including mixed
plus/minus) chains better by swapping operands across the whole
chain.
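
To illustrate with the pr97832-1.c testcase added below (the loop body
is copied from the testcase; the lane annotations are just for
illustration), the two lanes mix plus and minus in their chains:

  double a[1024], b[1024], c[1024];

  void foo()
  {
    for (int i = 0; i < 256; ++i)
      {
        a[2*i] = a[2*i] + b[2*i] - c[2*i];          /* lane 0: +a +b -c */
        a[2*i+1] = a[2*i+1] - b[2*i+1] - c[2*i+1];  /* lane 1: +a -b -c */
      }
  }

Swapping the two operands of any single statement cannot make these
lanes isomorphic.  Roughly, discovery now linearizes each lane's whole
chain, sorts and matches the chain operands across lanes and, where the
operation codes still differ for a chain element, builds a plus node
and a minus node combined by a VEC_PERM_EXPR selecting per lane
(see vect_slp_build_two_operator_nodes in the patch).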

While it works, it is still only a prototype.  I need to consider
the compile-time complexity by reworking how many "permutes" we
allow during SLP discovery (as a comment in the patch suggests,
probably by capping the actual work, i.e. the number of
vect_build_slp_tree calls, rather than the number of permutes).
There's also some cleanup to be done.

And the biggest question is what to do for BB vectorization
(and for permute handling in general) - we don't have a good
SLP discovery "failure mode" since there's always the possibility
to build a node from scalars.  That means if we're unlucky and
do not end up with all loads or constants in the SLP leaves we
will end up needing to cost one SLP build variant against another,
which means we can't really "stop" once we find a match.  We could
still treat matching as better than not matching (we can still have
some fatal mismatches) but, as said, I don't have a good idea here.

Bootstrapped and tested on x86_64-unknown-linux-gnu; I've also
built SPEC CPU 2017 (but have not benchmarked anything yet).

So at this point I'm looking for conceptual comments, and maybe
also great implementation ideas I missed ;)

Even though this fixes a recently reported bug, it does feel
like stage1 material.

Thanks,
Richard.

2020-11-18  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/97832
	* tree-vectorizer.h (_slp_tree::failed): New.
	* tree-vect-slp.c (_slp_tree::_slp_tree): Initialize
	failed member.
	(_slp_tree::~_slp_tree): Free failed.
	(vect_build_slp_tree): Retain failed nodes and record
	matches in them, copying that back out when running
	into a cached fail.  Dump start and end of discovery.
	(dt_sort_cmp): New.
	(vect_build_slp_tree_2): Handle associatable chains
	together doing more aggressive operand swapping.

	* gcc.dg/vect/pr97832-1.c: New testcase.
	* gcc.dg/vect/pr97832-2.c: Likewise.
	* gcc.dg/vect/pr97832-3.c: Likewise.
	* gcc.dg/vect/slp-50.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/pr97832-1.c |  17 +
 gcc/testsuite/gcc.dg/vect/pr97832-2.c |  29 ++
 gcc/testsuite/gcc.dg/vect/pr97832-3.c |  50 +++
 gcc/testsuite/gcc.dg/vect/slp-50.c    |  20 ++
 gcc/tree-vect-slp.c                   | 472 +++++++++++++++++++++++++-
 gcc/tree-vectorizer.h                 |   5 +
 6 files changed, 587 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr97832-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr97832-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr97832-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-50.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr97832-1.c b/gcc/testsuite/gcc.dg/vect/pr97832-1.c
new file mode 100644
index 00000000000..063fc7bd717
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr97832-1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast" } */
+/* { dg-require-effective-target vect_double } */
+
+double a[1024], b[1024], c[1024];
+
+void foo()
+{
+  for (int i = 0; i < 256; ++i)
+    {
+      a[2*i] = a[2*i] + b[2*i] - c[2*i];
+      a[2*i+1] = a[2*i+1] - b[2*i+1] - c[2*i+1];
+    }
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr97832-2.c b/gcc/testsuite/gcc.dg/vect/pr97832-2.c
new file mode 100644
index 00000000000..4f0578120ee
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr97832-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast" } */
+/* { dg-require-effective-target vect_double } */
+
+void foo1x1(double* restrict y, const double* restrict x, int clen)
+{
+  int xi = clen & 2;
+  double f_re = x[0+xi+0];
+  double f_im = x[4+xi+0];
+  int clen2 = (clen+xi) * 2;
+#pragma GCC unroll 0
+  for (int c = 0; c < clen2; c += 8) {
+    // y[c] = y[c] - x[c]*conj(f);
+#pragma GCC unroll 4
+    for (int k = 0; k < 4; ++k) {
+      double x_re = x[c+0+k];
+      double x_im = x[c+4+k];
+      double y_re = y[c+0+k];
+      double y_im = y[c+4+k];
+      y_re = y_re - x_re * f_re - x_im * f_im;;
+      y_im = y_im + x_re * f_im - x_im * f_re;
+      y[c+0+k] = y_re;
+      y[c+4+k] = y_im;
+    }
+  }
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr97832-3.c b/gcc/testsuite/gcc.dg/vect/pr97832-3.c
new file mode 100644
index 00000000000..ad1225ddbaa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr97832-3.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast" } */
+/* { dg-require-effective-target vect_double } */
+
+void foo(double* restrict y, const double* restrict x0, const double* restrict x1, int clen)
+{
+  int xi = clen & 2;
+  double f00_re = x0[0+xi+0];
+  double f10_re = x1[0+xi+0];
+  double f01_re = x0[0+xi+1];
+  double f11_re = x1[0+xi+1];
+  double f00_im = x0[4+xi+0];
+  double f10_im = x1[4+xi+0];
+  double f01_im = x0[4+xi+1];
+  double f11_im = x1[4+xi+1];
+  int clen2 = (clen+xi) * 2;
+  double* y0 = &y[0];
+  double* y1 = &y[clen2];
+  #pragma GCC unroll 0
+  for (int c = 0; c < clen2; c += 8) {
+    // y0[c] = y0[c] - x0[c]*conj(f00) - x1[c]*conj(f10);
+    // y1[c] = y1[c] - x0[c]*conj(f01) - x1[c]*conj(f11);
+    #pragma GCC unroll 4
+    for (int k = 0; k < 4; ++k) {
+      double x0_re = x0[c+0+k];
+      double x0_im = x0[c+4+k];
+      double y0_re = y0[c+0+k];
+      double y0_im = y0[c+4+k];
+      double y1_re = y1[c+0+k];
+      double y1_im = y1[c+4+k];
+      y0_re = y0_re - x0_re * f00_re - x0_im * f00_im;
+      y0_im = y0_im + x0_re * f00_im - x0_im * f00_re;
+      y1_re = y1_re - x0_re * f01_re - x0_im * f01_im;
+      y1_im = y1_im + x0_re * f01_im - x0_im * f01_re;
+      double x1_re = x1[c+0+k];
+      double x1_im = x1[c+4+k];
+      y0_re = y0_re - x1_re * f10_re - x1_im * f10_im;
+      y0_im = y0_im + x1_re * f10_im - x1_im * f10_re;
+      y1_re = y1_re - x1_re * f11_re - x1_im * f11_im;
+      y1_im = y1_im + x1_re * f11_im - x1_im * f11_re;
+      y0[c+0+k] = y0_re;
+      y0[c+4+k] = y0_im;
+      y1[c+0+k] = y1_re;
+      y1[c+4+k] = y1_im;
+    }
+  }
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-50.c b/gcc/testsuite/gcc.dg/vect/slp-50.c
new file mode 100644
index 00000000000..17509e622a5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-50.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-ffast-math" } */
+
+typedef int Quantum;
+typedef struct {
+  Quantum blue, green;
+} PixelPacket;
+PixelPacket *EnhanceImage_image_q;
+int EnhanceImage_image_x;
+float EnhanceImage_image_distance_squared_total_weight;
+void EnhanceImage_image_distance_squared()
+{
+  float zero_1;
+  for (; EnhanceImage_image_x; EnhanceImage_image_x++) {
+      EnhanceImage_image_distance_squared_total_weight += 5.0;
+      EnhanceImage_image_q->green = EnhanceImage_image_q->blue =
+	  zero_1 + EnhanceImage_image_distance_squared_total_weight / 2 - 1;
+      EnhanceImage_image_q++;
+  }
+}
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index c534e51c7fc..55e0a23253d 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.  If not see
 
 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
 					  slp_tree, stmt_vector_for_cost *);
+static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
 
 object_allocator<_slp_tree> *slp_tree_pool;
 
@@ -86,6 +87,7 @@ _slp_tree::_slp_tree ()
   SLP_TREE_VECTYPE (this) = NULL_TREE;
   SLP_TREE_REPRESENTATIVE (this) = NULL;
   SLP_TREE_REF_COUNT (this) = 1;
+  this->failed = NULL;
   this->max_nunits = 1;
   this->lanes = 0;
 }
@@ -101,6 +103,8 @@ _slp_tree::~_slp_tree ()
   SLP_TREE_VEC_DEFS (this).release ();
   SLP_TREE_LOAD_PERMUTATION (this).release ();
   SLP_TREE_LANE_PERMUTATION (this).release ();
+  if (this->failed)
+    free (failed);
 }
 
 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
@@ -1330,6 +1334,30 @@ bst_traits::equal (value_type existing, value_type candidate)
   return true;
 }
 
+/* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
+   but then vec::insert does memmove and that's not compatible with
+   std::pair.  */
+struct chain_op_t
+{
+  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
+      : code (code_), dt (dt_), op (op_) {}
+  tree_code code;
+  vect_def_type dt;
+  tree op;
+};
+
+/* Comparator for sorting associatable chains.  */
+
+static int
+dt_sort_cmp (const void *op1_, const void *op2_, void *)
+{
+  auto *op1 = (const chain_op_t *) op1_;
+  auto *op2 = (const chain_op_t *) op2_;
+  if (op1->dt != op2->dt)
+    return (int)op1->dt - (int)op2->dt;
+  return (int)op1->code - (int)op2->code;
+}
+
 typedef hash_map <vec <gimple *>, slp_tree,
 		  simple_hashmap_traits <bst_traits, slp_tree> >
   scalar_stmts_to_slp_tree_map_t;
@@ -1352,13 +1380,15 @@ vect_build_slp_tree (vec_info *vinfo,
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
-			 *leader ? "" : "failed ", *leader);
-      if (*leader)
+			 !(*leader)->failed ? "" : "failed ", *leader);
+      if (!(*leader)->failed)
 	{
 	  SLP_TREE_REF_COUNT (*leader)++;
 	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
+	  return *leader;
 	}
-      return *leader;
+      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
+      return NULL;
     }
 
   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
@@ -1368,22 +1398,31 @@ vect_build_slp_tree (vec_info *vinfo,
   SLP_TREE_SCALAR_STMTS (res) = stmts;
   bst_map->put (stmts.copy (), res);
 
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "starting SLP discovery for node %p\n", res);
+
   poly_uint64 this_max_nunits = 1;
   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
 					&this_max_nunits,
 					matches, npermutes, tree_size, bst_map);
   if (!res_)
     {
-      bool existed_p = bst_map->put (stmts, NULL);
-      gcc_assert (existed_p);
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "SLP discovery for node %p failed\n", res);
       /* Mark the node invalid so we can detect those when still in use
 	 as backedge destinations.  */
       SLP_TREE_SCALAR_STMTS (res) = vNULL;
       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
-      vect_free_slp_tree (res);
+      res->failed = XNEWVEC (bool, group_size);
+      memcpy (res->failed, matches, sizeof (bool) * group_size);
     }
   else
     {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "SLP discovery for node %p succeeded\n", res);
       gcc_assert (res_ == res);
       res->max_nunits = this_max_nunits;
       vect_update_max_nunits (max_nunits, this_max_nunits);
@@ -1393,6 +1432,47 @@ vect_build_slp_tree (vec_info *vinfo,
   return res_;
 }
 
+static void
+vect_slp_build_two_operator_nodes (slp_tree perm,
+				   slp_tree op0, slp_tree op1,
+				   stmt_vec_info oper1, stmt_vec_info oper2,
+				   vec<std::pair<unsigned, unsigned> > lperm)
+{
+  unsigned group_size = SLP_TREE_LANES (op1);
+  tree vectype = SLP_TREE_VECTYPE (op1);
+
+  slp_tree child1 = new _slp_tree;
+  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
+  SLP_TREE_VECTYPE (child1) = vectype;
+  SLP_TREE_LANES (child1) = group_size;
+  SLP_TREE_CHILDREN (child1).create (2);
+  SLP_TREE_CHILDREN (child1).quick_push (op0);
+  SLP_TREE_CHILDREN (child1).quick_push (op1);
+  SLP_TREE_REPRESENTATIVE (child1) = oper1;
+
+  slp_tree child2 = new _slp_tree;
+  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
+  SLP_TREE_VECTYPE (child2) = vectype;
+  SLP_TREE_LANES (child2) = group_size;
+  SLP_TREE_CHILDREN (child2).create (2);
+  SLP_TREE_CHILDREN (child2).quick_push (op0);
+  SLP_TREE_REF_COUNT (op0)++;
+  SLP_TREE_CHILDREN (child2).quick_push (op1);
+  SLP_TREE_REF_COUNT (op1)++;
+  SLP_TREE_REPRESENTATIVE (child2) = oper2;
+
+  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
+  SLP_TREE_CHILDREN (perm).create (2);
+  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
+  SLP_TREE_VECTYPE (perm) = vectype;
+  SLP_TREE_LANES (perm) = group_size;
+  /* ???  We should leave this NULL but that's not expected.  */
+  SLP_TREE_REPRESENTATIVE (perm) = oper1;
+  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
+  SLP_TREE_CHILDREN (perm).quick_push (child1);
+  SLP_TREE_CHILDREN (perm).quick_push (child2);
+}
+
 /* Recursively build an SLP tree starting from NODE.
    Fail (and return a value not equal to zero) if def-stmts are not
    isomorphic, require data permutation or are of unsupported types of
@@ -1566,6 +1646,386 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
       SLP_TREE_CHILDREN (node).quick_push (vnode);
       return node;
     }
+  else if (flag_associative_math
+	   /* ???  For BB vectorization we cannot do the brute-force search
+	      for matching as we can succeed by means of builds from scalars
+	      and have no good way to "cost" one build against another.  */
+	   && is_a <loop_vec_info> (vinfo)
+	   /* ???  We don't handle !vect_internal_def defs below.  */
+	   && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
+	   && is_gimple_assign (stmt_info->stmt)
+	   && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
+	       || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
+	   && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
+	       || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
+		   && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
+    {
+      /* See if we have a chain of (mixed) adds or subtracts or other
+	 associatable ops.  */
+      enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
+      if (code == MINUS_EXPR)
+	code = PLUS_EXPR;
+      stmt_vec_info other_op_stmt_info = NULL;
+      stmt_vec_info op_stmt_info = NULL;
+      unsigned chain_len = 0;
+      auto_vec<chain_op_t> chain;
+      auto_vec<std::pair<tree_code, gimple *> > worklist;
+      auto_vec<vec<chain_op_t> > chains (group_size);
+      auto_vec<slp_tree, 4> children;
+      bool hard_fail = true;
+      for (unsigned lane = 0; lane < group_size; ++lane)
+	{
+	  /* For each lane linearize the addition/subtraction (or other
+	     uniform associatable operation) expression tree.  */
+	  worklist.safe_push (std::make_pair (code, stmts[lane]->stmt));
+	  while (!worklist.is_empty ())
+	    {
+	      auto entry = worklist.pop ();
+	      gassign *stmt = as_a <gassign *> (entry.second);
+	      enum tree_code in_code = entry.first;
+	      enum tree_code this_code = gimple_assign_rhs_code (stmt);
+	      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
+	      if (!op_stmt_info
+		  && gimple_assign_rhs_code (stmt) == code)
+		op_stmt_info = vinfo->lookup_stmt (stmt);
+	      else if (!other_op_stmt_info
+		       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
+		other_op_stmt_info = vinfo->lookup_stmt (stmt);
+	      for (unsigned opnum = 1; opnum <= 2; ++opnum)
+		{
+		  tree op = gimple_op (stmt, opnum);
+		  vect_def_type dt;
+		  stmt_vec_info def_stmt_info;
+		  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
+		  gcc_assert (res);
+		  gimple *use_stmt;
+		  use_operand_p use_p;
+		  if (dt == vect_internal_def
+		      && single_imm_use (op, &use_p, &use_stmt)
+		      && is_gimple_assign (def_stmt_info->stmt)
+		      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
+			  || (code == PLUS_EXPR
+			      && gimple_assign_rhs_code (def_stmt_info->stmt) == MINUS_EXPR)))
+		    {
+		      tree_code op_def_code = this_code;
+		      if (op_def_code == MINUS_EXPR && opnum == 1)
+			op_def_code = PLUS_EXPR;
+		      if (in_code == MINUS_EXPR)
+			op_def_code
+			  = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
+		      worklist.safe_push (std::make_pair (op_def_code,
+							  def_stmt_info->stmt));
+		    }
+		  else
+		    {
+		      tree_code op_def_code = this_code;
+		      if (op_def_code == MINUS_EXPR && opnum == 1)
+			op_def_code = PLUS_EXPR;
+		      if (in_code == MINUS_EXPR)
+			op_def_code
+			  = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
+		      chain.safe_push (chain_op_t (op_def_code, dt, op));
+		    }
+		}
+	    }
+	  if (chain.length () == 2)
+	    {
+	      /* In a chain of just two elements resort to the regular
+		 operand swapping scheme.  If we run into a length
+		 mismatch still hard-FAIL.  */
+	      if (chain_len == 0)
+		hard_fail = false;
+	      break;
+	    }
+	  else if (chain_len == 0)
+	    chain_len = chain.length ();
+	  else if (chain.length () != chain_len)
+	    /* ???  Here we could slip in magic to compensate with
+	       neutral operands.  */
+	    break;
+	  chains.quick_push (chain.copy ());
+	  chain.truncate (0);
+	}
+      if (chains.length () == group_size)
+	{
+	  /* Now we have a set of chains with the same length.  */
+	  /* 1. pre-sort according to def_type and operation.  */
+	  for (unsigned lane = 0; lane < group_size; ++lane)
+	    chains[lane].sort (dt_sort_cmp, vinfo);
+	  if (dump_enabled_p ())
+	    {
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "pre-sorted chains of %s\n",
+			       get_tree_code_name (code));
+	      for (unsigned lane = 0; lane < group_size; ++lane)
+		{
+		  for (unsigned opnum = 0; opnum < chain_len; ++opnum)
+		    dump_printf (MSG_NOTE, "%s %T ",
+				 get_tree_code_name (chains[lane][opnum].code),
+				 chains[lane][opnum].op);
+		  dump_printf (MSG_NOTE, "\n");
+		}
+	    }
+	  /* 2. try to build children nodes, associating as necessary.  */
+	  for (unsigned n = 0; n < chain_len; ++n)
+	    {
+	      vect_def_type dt = chains[0][n].dt;
+	      unsigned lane;
+	      for (lane = 0; lane < group_size; ++lane)
+		if (chains[lane][n].dt != dt)
+		  {
+		    if (dt == vect_constant_def
+			&& chains[lane][n].dt == vect_external_def)
+		      dt = vect_external_def;
+		    else if (dt == vect_external_def
+			     && chains[lane][n].dt == vect_constant_def)
+		      ;
+		    else
+		      break;
+		  }
+	      if (lane != group_size)
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				     "giving up on chain due to mismatched "
+				     "def types\n");
+		  goto out;
+		}
+	      if (dt == vect_constant_def
+		  || dt == vect_external_def)
+		{
+		/* We can always build those.  Might want to sort last
+		   or defer building.  */
+		   vec<tree> ops;
+		   ops.create (group_size);
+		   for (lane = 0; lane < group_size; ++lane)
+		     ops.quick_push (chains[lane][n].op);
+		   slp_tree child = vect_create_new_slp_node (ops);
+		   SLP_TREE_DEF_TYPE (child) = dt;
+		   children.safe_push (child);
+		}
+	      else if (dt != vect_internal_def)
+		{
+		  /* Not sure, we might need sth special.
+		     gcc.dg/vect/pr96854.c,
+		     gfortran.dg/vect/fast-math-pr37021.f90
+		     and gfortran.dg/vect/pr61171.f trigger.  */
+		  /* Soft-fail for now.  */
+		  hard_fail = false;
+		  goto out;
+		}
+	      else
+		{
+		  vec<stmt_vec_info> op_stmts;
+		  op_stmts.create (group_size);
+		  slp_tree child = NULL;
+		  /* Brute-force our way.  We have to consider a lane
+		     failing after fixing an earlier fail up in the
+		     SLP discovery recursion.  So track the current
+		     permute per lane.  */
+		  unsigned *perms = XALLOCAVEC (unsigned, group_size);
+		  memset (perms, 0, sizeof (unsigned) * group_size);
+		  do
+		    {
+		      op_stmts.truncate (0);
+		      for (lane = 0; lane < group_size; ++lane)
+			op_stmts.quick_push (vinfo->lookup_def (chains[lane][n].op));
+		      child = vect_build_slp_tree (vinfo, op_stmts,
+						   group_size, &this_max_nunits,
+						   matches, npermutes,
+						   &this_tree_size, bst_map);
+		      /* ???  We're likely getting too many fatal mismatches
+			 here so maybe we want to ignore them (but then we
+			 have no idea which lanes fatally mismatched).  */
+		      if (child || !matches[0])
+			break;
+		      /* Swap another lane we have not yet matched up into
+			 lanes that did not match.  If we run out of
+			 permute possibilities for a lane terminate the
+			 search.  */
+		      bool term = false;
+		      for (lane = 1; lane < group_size; ++lane)
+			if (!matches[lane])
+			  {
+			    if (n + perms[lane] + 1 == chain_len)
+			      {
+				term = true;
+				break;
+			      }
+			    std::swap (chains[lane][n],
+				       chains[lane][n + perms[lane] + 1]);
+			    perms[lane]++;
+			  }
+		      if (term)
+			break;
+		      /* We need a better way than to statically assign 4
+			 allowed permutes for each SLP graph entry analysis.
+			 Maybe limit the actual overall work (number of
+			 vect_build_slp_tree calls that did not hit the
+			 cache?) instead and limit that based on the number
+			 of scalar stmts in the loop/region?  */
+		      //(*npermutes)++;
+		    }
+		  while (1);
+		  if (!child)
+		    {
+		      if (dump_enabled_p ())
+			dump_printf_loc (MSG_NOTE, vect_location,
+					 "failed to match up op %d\n", n);
+		      op_stmts.release ();
+		      goto out;
+		    }
+		  if (dump_enabled_p ())
+		    {
+		      dump_printf_loc (MSG_NOTE, vect_location,
+				       "matched up op %d to\n", n);
+		      vect_print_slp_tree (MSG_NOTE, vect_location, child);
+		    }
+		  children.safe_push (child);
+		}
+	    }
+	  /* 3. build SLP nodes to combine the chain.  */
+	  for (unsigned lane = 0; lane < group_size; ++lane)
+	    if (chains[lane][0].code != code)
+	      {
+		/* See if there's any alternate all-PLUS entry.  */
+		unsigned n;
+		for (n = 1; n < chain_len; ++n)
+		  {
+		    for (lane = 0; lane < group_size; ++lane)
+		      if (chains[lane][n].code != code)
+			break;
+		    if (lane == group_size)
+		      break;
+		  }
+		if (n != chain_len)
+		  {
+		    /* Swap that in at first position.  */
+		    std::swap (children[0], children[n]);
+		    for (lane = 0; lane < group_size; ++lane)
+		      std::swap (chains[lane][0], chains[lane][n]);
+		  }
+		else
+		  {
+		    /* ???  When this triggers and we end up with two
+		       vect_constant/external_def up-front things break (ICE)
+		       spectacularly finding an insertion place for the
+		       all-constant op.  We should have a fully
+		       vect_internal_def operand though(?) so we can swap
+		       that into first place and then prepend the all-zero
+		       constant.  */
+		    if (dump_enabled_p ())
+		      dump_printf_loc (MSG_NOTE, vect_location,
+				       "inserting constant zero to compensate "
+				       "for (partially) negated first "
+				       "operand\n");
+		    chain_len++;
+		    for (lane = 0; lane < group_size; ++lane)
+		      chains[lane].safe_insert
+			(0, chain_op_t (code, vect_constant_def, NULL_TREE));
+		    vec<tree> zero_ops;
+		    zero_ops.create (group_size);
+		    zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
+		    for (lane = 1; lane < group_size; ++lane)
+		      zero_ops.quick_push (zero_ops[0]);
+		    slp_tree zero = vect_create_new_slp_node (zero_ops);
+		    SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
+		    children.safe_insert (0, zero);
+		  }
+		break;
+	      }
+	  for (unsigned i = 1; i < children.length () - 1; ++i)
+	    {
+	      slp_tree op0 = children[i - 1];
+	      slp_tree op1 = children[i];
+	      bool this_two_op = false;
+	      for (unsigned lane = 0; lane < group_size; ++lane)
+		if (chains[lane][i].code != chains[0][i].code)
+		  {
+		    this_two_op = true;
+		    break;
+		  }
+	      if (this_two_op)
+		{
+		  vec<std::pair<unsigned, unsigned> > lperm;
+		  lperm.create (group_size);
+		  for (unsigned lane = 0; lane < group_size; ++lane)
+		    lperm.quick_push (std::make_pair (chains[lane][i].code != chains[0][i].code, lane));
+		  slp_tree perm = new _slp_tree;
+		  vect_slp_build_two_operator_nodes (perm, op0, op1,
+						     (chains[0][i].code == code
+						      ? op_stmt_info : other_op_stmt_info),
+						     (chains[0][i].code == code
+						      ? other_op_stmt_info : op_stmt_info),
+						     lperm);
+		  children[i] = perm;
+		}
+	      else
+		{
+		  slp_tree child = new _slp_tree;
+		  SLP_TREE_DEF_TYPE (child) = vect_internal_def;
+		  SLP_TREE_VECTYPE (child) = vectype;
+		  SLP_TREE_LANES (child) = group_size;
+		  SLP_TREE_CHILDREN (child).create (2);
+		  SLP_TREE_CHILDREN (child).quick_push (op0);
+		  SLP_TREE_CHILDREN (child).quick_push (op1);
+		  /* ???  Now we have to deal with mixed operands.  */
+		  SLP_TREE_REPRESENTATIVE (child)
+		    = (chains[0][i].code == code
+		       ? op_stmt_info : other_op_stmt_info);
+		  children[i] = child;
+		}
+	    }
+	  /* ???  We can merge below into the above loop by conditionally
+	     allocating a new node.  */
+	  /* The last op is already us.  */
+	  bool this_two_op = false;
+	  for (unsigned lane = 0; lane < group_size; ++lane)
+	    if (chains[lane].last ().code != chains[0].last ().code)
+	      {
+		this_two_op = true;
+		break;
+	      }
+	  if (this_two_op)
+	    {
+	      vec<std::pair<unsigned, unsigned> > lperm;
+	      lperm.create (group_size);
+	      for (unsigned lane = 0; lane < group_size; ++lane)
+		lperm.quick_push (std::make_pair (chains[lane].last ().code != chains[0].last ().code, lane));
+	      vect_slp_build_two_operator_nodes (node, children[children.length () - 2],
+						 children[children.length () - 1],
+						 chains[0].last ().code == code
+						 ? op_stmt_info : other_op_stmt_info,
+						 chains[0].last ().code == code
+						 ? other_op_stmt_info : op_stmt_info,
+						 lperm);
+	    }
+	  else
+	    {
+	      node = vect_create_new_slp_node (node, stmts, 2);
+	      SLP_TREE_REPRESENTATIVE (node)
+		  = (chains[0].last ().code == code
+		     ? op_stmt_info : other_op_stmt_info);
+	      SLP_TREE_VECTYPE (node) = vectype;
+	      SLP_TREE_CHILDREN (node).quick_push (children[children.length () - 2]);
+	      SLP_TREE_CHILDREN (node).quick_push (children[children.length () - 1]);
+	      SLP_TREE_LANES (node) = group_size;
+	    }
+	  *tree_size += this_tree_size + 1;
+	  *max_nunits = this_max_nunits;
+	  return node;
+	}
+out:
+      while (!children.is_empty ())
+	vect_free_slp_tree (children.pop ());
+      while (!chains.is_empty ())
+	chains.pop ().release ();
+      /* Hard-fail, otherwise we might run into quadratic processing of the
+	 chains starting one stmt into the chain again.  */
+      if (hard_fail)
+	return NULL;
+      /* Fall thru to normal processing.  */
+    }
 
   /* Get at the operands, verifying they are compatible.  */
   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 0ee4ef32eb2..c9f95bc6ffe 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -167,6 +167,11 @@ struct _slp_tree {
 
   int vertex;
 
+  /* If not NULL this is a cached failed SLP discovery attempt with
+     the lanes that failed during SLP discovery as 'false'.  This is
+     a copy of the matches array.  */
+  bool *failed;
+
   /* Allocate from slp_tree_pool.  */
   static void *operator new (size_t);
 
-- 
2.26.2

