[PATCH] tree-optimization/101264 - rework SLP "any" permute forward prop

Wed Jun 30 09:29:29 GMT 2021

This integrates the forward propagation of SLP "any" permutes into
the main propagation stage as a separate single-pass propagation
didn't work out.

It does make the propagation iterate more - propagation in both
directions doesn't tend to behave nicely.  I've checked on CPU 2017
fprate and it isn't too bad:

#iters  #count
2 30810
3 7386
4 1250
5 140
6 6
7 3

before and

2 30812
3 7303
4 1218
5 122
6 33
7 56
8 26
9 12
10 4
11 1
12 2
13 2
14 4

after.  So yes, the peak number of required iterations grows
significantly but the majority is still covered with three iterations.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2021-06-30  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/101264
	* tree-vect-slp.c (vect_optimize_slp): Propagate the
	computed perm_in to all "any" permute successors
	we cannot de-duplicate immediately.

	* gfortran.dg/pr101264.f90: New testcase.
---
 gcc/testsuite/gfortran.dg/pr101264.f90 | 94 ++++++++++++++++++++++++++
 gcc/tree-vect-slp.c                    | 79 ++++++----------------
 2 files changed, 115 insertions(+), 58 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/pr101264.f90

diff --git a/gcc/testsuite/gfortran.dg/pr101264.f90 b/gcc/testsuite/gfortran.dg/pr101264.f90
new file mode 100644
index 00000000000..5602a709a36
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr101264.f90
@@ -0,0 +1,94 @@
+! { dg-do compile }
+! { dg-options "-Ofast" }
+  SUBROUTINE foo (a,b,c,d,trigs,inc1,inc2,inc3,inc4,lot,n,la)
+    IMPLICIT NONE (type, external)
+    INTEGER, PARAMETER ::   wp = 8
+    INTEGER, PARAMETER ::  iwp = 4
+    INTEGER(iwp) ::  inc1
+    INTEGER(iwp) ::  inc2
+    INTEGER(iwp) ::  inc3
+    INTEGER(iwp) ::  inc4
+    INTEGER(iwp) ::  la
+    INTEGER(iwp) ::  lot
+    INTEGER(iwp) ::  n
+
+    REAL(wp) ::  a(*)
+    REAL(wp) ::  b(*)
+    REAL(wp) ::  c(*)
+    REAL(wp) ::  d(*)
+    REAL(wp) ::  trigs(*)
+
+    REAL(wp) ::  c1
+    REAL(wp) ::  c2
+    REAL(wp) ::  s1
+    REAL(wp) ::  s2
+    REAL(wp) ::  sin60
+
+    INTEGER(iwp) ::  i
+    INTEGER(iwp) ::  ia
+    INTEGER(iwp) ::  ib
+    INTEGER(iwp) ::  ibase
+    INTEGER(iwp) ::  ic
+    INTEGER(iwp) ::  iink
+    INTEGER(iwp) ::  ijk
+    INTEGER(iwp) ::  j
+    INTEGER(iwp) ::  ja
+    INTEGER(iwp) ::  jb
+    INTEGER(iwp) ::  jbase
+    INTEGER(iwp) ::  jc
+    INTEGER(iwp) ::  jink
+    INTEGER(iwp) ::  jump
+    INTEGER(iwp) ::  k
+    INTEGER(iwp) ::  kb
+    INTEGER(iwp) ::  kc
+    INTEGER(iwp) ::  kstop
+    INTEGER(iwp) ::  l
+    INTEGER(iwp) ::  m
+
+    sin60=0.866025403784437_wp
+
+    ia = 1
+    ib = ia + (2*m-la)*inc1
+    ic = ib
+    ja = 1
+    jb = ja + jink
+    jc = jb + jink
+
+    DO k = la, kstop, la
+       kb = k + k
+       kc = kb + kb
+       c1 = trigs(kb+1)
+       s1 = trigs(kb+2)
+       c2 = trigs(kc+1)
+       s2 = trigs(kc+2)
+       ibase = 0
+       DO l = 1, la
+          i = ibase
+          j = jbase
+          DO ijk = 1, lot
+             c(ja+j) = a(ia+i) + (a(ib+i)+a(ic+i))
+             d(ja+j) = b(ia+i) + (b(ib+i)-b(ic+i))
+             c(jb+j) = c1*((a(ia+i)-0.5_wp*(a(ib+i)+a(ic+i)))-(sin60*(b(ib+i)+ &
+                  &            b(ic+i))))                                      &
+                  &    - s1*((b(ia+i)-0.5_wp*(b(ib+i)-b(ic+i)))+(sin60*(a(ib+i)- &
+                  &            a(ic+i))))
+             d(jb+j) = s1*((a(ia+i)-0.5_wp*(a(ib+i)+a(ic+i)))-(sin60*(b(ib+i)+ &
+                  &            b(ic+i))))                                      &
+                  &    + c1*((b(ia+i)-0.5_wp*(b(ib+i)-b(ic+i)))+(sin60*(a(ib+i)- &
+                  &            a(ic+i))))
+             c(jc+j) = c2*((a(ia+i)-0.5_wp*(a(ib+i)+a(ic+i)))+(sin60*(b(ib+i)+ &
+                  &            b(ic+i))))                                      &
+                  &    - s2*((b(ia+i)-0.5_wp*(b(ib+i)-b(ic+i)))-(sin60*(a(ib+i)- &
+                  &            a(ic+i))))
+             i = i + inc3
+             j = j + inc4
+          END DO
+          ibase = ibase + inc1
+          jbase = jbase + inc2
+       END DO
+       ia = ia + iink
+       ib = ib + iink
+       ic = ic - iink
+       jbase = jbase + jump
+    END DO
+  END
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 9155af499b3..10195d3629f 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -3729,6 +3729,7 @@ vect_optimize_slp (vec_info *vinfo)
 	    perm = vertices[idx].perm_out;
 	  else
 	    {
+	      bool any_succ_perm_out_m1 = false;
 	      perm = vertices[idx].get_perm_in ();
 	      for (graph_edge *succ = slpg->vertices[idx].succ;
 		   succ; succ = succ->succ_next)
@@ -3742,7 +3743,15 @@ vect_optimize_slp (vec_info *vinfo)
 		     For example see gcc.dg/vect/bb-slp-14.c for a case
 		     that would break.  */
 		  if (succ_perm == -1)
-		    continue;
+		    {
+		      /* When we handled a non-leaf optimistically, note
+			 that so we can adjust its outgoing permute below.  */
+		      slp_tree succ_node = vertices[succ_idx].node;
+		      if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
+			  && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
+			any_succ_perm_out_m1 = true;
+		      continue;
+		    }
 		  if (perm == -1)
 		    perm = succ_perm;
 		  else if (succ_perm == 0
@@ -3753,25 +3762,19 @@ vect_optimize_slp (vec_info *vinfo)
 		    }
 		}
 
-	      /* If this is a node we do not want to eventually unshare
-		 but it can be permuted at will, verify all users have
-		 the same permutations registered and otherwise drop to
-		 zero.  */
-	      if (perm == -1
-		  && SLP_TREE_DEF_TYPE (node) != vect_external_def
-		  && SLP_TREE_DEF_TYPE (node) != vect_constant_def)
+	      /* Adjust any incoming permutes we treated optimistically.  */
+	      if (perm != -1 && any_succ_perm_out_m1)
 		{
-		  int preds_perm = -1;
-		  for (graph_edge *pred = slpg->vertices[idx].pred;
-		       pred; pred = pred->pred_next)
+		  for (graph_edge *succ = slpg->vertices[idx].succ;
+		       succ; succ = succ->succ_next)
 		    {
-		      int pred_perm = vertices[pred->src].get_perm_in ();
-		      if (preds_perm == -1)
-			preds_perm = pred_perm;
-		      else if (!vect_slp_perms_eq (perms,
-						   pred_perm, preds_perm))
-			perm = 0;
+		      slp_tree succ_node = vertices[succ->dest].node;
+		      if (vertices[succ->dest].perm_out == -1
+			  && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
+			  && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
+			vertices[succ->dest].perm_out = perm;
 		    }
+		  changed = true;
 		}
 
 	      if (!vect_slp_perms_eq (perms, perm,
@@ -3840,47 +3843,7 @@ vect_optimize_slp (vec_info *vinfo)
 	}
     }
   while (changed);
-  statistics_counter_event (cfun, "SLP optimize perm iterations", iteration);
-
-  /* Compute pre-order.  */
-  auto_vec<int> heads;
-  heads.reserve (vinfo->slp_instances.length ());
-  for (slp_instance inst : vinfo->slp_instances)
-    heads.quick_push (SLP_INSTANCE_TREE (inst)->vertex);
-  auto_vec<int> po;
-  graphds_dfs (slpg, &heads[0], heads.length (), &po, true, NULL, NULL);
-
-  /* Propagate materialized permutes to "any" permute nodes.  For heads
-     ending up as "any" (reductions with just invariants), set them to
-     no permute.  */
-  for (int idx : heads)
-    if (vertices[idx].perm_out == -1)
-      vertices[idx].perm_out = 0;
-  for (i = po.length (); i > 0; --i)
-    {
-      int idx = po[i-1];
-      int perm_in = vertices[idx].get_perm_in ();
-      slp_tree node = vertices[idx].node;
-      if (SLP_TREE_DEF_TYPE (node) == vect_external_def
-	  || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
-	continue;
-      gcc_assert (perm_in != -1);
-      for (graph_edge *succ = slpg->vertices[idx].succ;
-	   succ; succ = succ->succ_next)
-	{
-	  slp_tree succ_node = vertices[succ->dest].node;
-	  if (SLP_TREE_DEF_TYPE (succ_node) == vect_external_def
-	      || SLP_TREE_DEF_TYPE (succ_node) == vect_constant_def)
-	    continue;
-	  if (vertices[succ->dest].perm_out == -1)
-	    vertices[succ->dest].perm_out = perm_in;
-	  else
-	    /* Propagation should have ensured that all preds have the same
-	       permutation.  */
-	    gcc_assert (vect_slp_perms_eq (perms, perm_in,
-					   vertices[succ->dest].perm_out));
-	}
-    }
+  statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
 
   /* Materialize.  */
   for (i = 0; i < vertices.length (); ++i)
-- 
2.26.2