[PATCH]middle-end Add an RPO pass after successful vectorization

Richard Biener rguenther@suse.de
Wed Nov 10 07:17:57 GMT 2021


On Tue, 9 Nov 2021, Tamar Christina wrote:

> > > +      bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
> > > +      bitmap_set_bit (exit_bbs, loop->latch->index);
> > 
> > treating the latch as exit is probably premature optimization (yes, it's empty).
> > 
> > > +
> > > +      do_rpo_vn (cfun, loop_preheader_edge (loop), exit_bbs);
> > > +
> > > +      BITMAP_FREE (exit_bbs);
> > 
> > ... deallocation can go.  Note I wonder whether, if we are already spinning up
> > VN, we should include the preheader in the operation?
> > We regularly end up emitting redundant vector initializers that could be
> > cleaned up earlier this way.
> 
> I've change it to include the preheader but it looks like this breaks bootstrap on both
> x86 and AArch64.
> 
> On x86 the following testcase
> 
> double matmul_c8_vanilla_bbase_0;
> double *matmul_c8_vanilla_dest;
> matmul_c8_vanilla_x;
> matmul_c8_vanilla() {
>   for (; matmul_c8_vanilla_x; matmul_c8_vanilla_x++)
>     matmul_c8_vanilla_dest[matmul_c8_vanilla_x] += matmul_c8_vanilla_bbase_0;
> }
> 
> ICEs with -std=gnu11 -ffast-math -ftree-vectorize -O2 with:
> 
> internal compiler error: tree check: expected ssa_name, have var_decl in SSA_VAL, at tree-ssa-sccvn.c:535
> 0x80731c tree_check_failed(tree_node const*, char const*, int, char const*, ...)
>         ../gcc-dsg/gcc/tree.c:8689
> 0x7ebda2 tree_check(tree_node*, char const*, int, char const*, tree_code)
>         ../gcc-dsg/gcc/tree.h:3433
> 0x7ebda2 SSA_VAL(tree_node*, bool*)
>         ../gcc-dsg/gcc/tree-ssa-sccvn.c:535
> 0x7ebda2 vuse_ssa_val
>         ../gcc-dsg/gcc/tree-ssa-sccvn.c:553
> 0x7ebda2 vn_reference_lookup(tree_node*, tree_node*, vn_lookup_kind, vn_reference_s**, bool, tree_node**, tree_node*)
>         ../gcc-dsg/gcc/tree-ssa-sccvn.c:3664
> 0x10d8ca5 visit_reference_op_load
>         ../gcc-dsg/gcc/tree-ssa-sccvn.c:5166
> 0x10d8ca5 visit_stmt
>         ../gcc-dsg/gcc/tree-ssa-sccvn.c:5615
> 0x10d976c process_bb
>         ../gcc-dsg/gcc/tree-ssa-sccvn.c:7344
> 0x10dafe5 do_rpo_vn
>         ../gcc-dsg/gcc/tree-ssa-sccvn.c:7942
> 0x10dc828 do_rpo_vn(function*, edge_def*, bitmap_head*)
>         ../gcc-dsg/gcc/tree-ssa-sccvn.c:8039
> 0x119c39c vectorize_loops()
>         ../gcc-dsg/gcc/tree-vectorizer.c:1304

OK, as I thought this is .MEMs not in SSA form.  We're later
fixing that up so maybe try the attached which re-orders the
postprocessing after vectorization to do that earlier.

Richard.

> on AArch64 this one ICEs with -ffast-math -ftree-vectorize -O2
> 
> _Complex *a;
> _Complex b;
> c, d;
> fn1() {
>   _Complex e;
>   for (; c; ++c)
>     e = d * a[c];
>   b = e;
> }
> 
> With the message 
> 
> internal compiler error: tree check: expected ssa_name, have var_decl in VN_INFO, at tree-ssa-sccvn.c:451
> 0x734073 tree_check_failed(tree_node const*, char const*, int, char const*, ...)
>         ../../gcc-fsf/gcc/tree.c:8691
> 0x10e2e2f tree_check(tree_node*, char const*, int, char const*, tree_code)
>         ../../gcc-fsf/gcc/tree.h:3433
> 0x10e2e2f VN_INFO(tree_node*)
>         ../../gcc-fsf/gcc/tree-ssa-sccvn.c:451
> 0x10ed223 process_bb
>         ../../gcc-fsf/gcc/tree-ssa-sccvn.c:7331
> 0x10eea43 do_rpo_vn
>         ../../gcc-fsf/gcc/tree-ssa-sccvn.c:7944
> 0x10efe2b do_rpo_vn(function*, edge_def*, bitmap_head*)
>         ../../gcc-fsf/gcc/tree-ssa-sccvn.c:8039
> 0x11c436b vectorize_loops()
>         ../../gcc-fsf/gcc/tree-vectorizer.c:1304
> 
> Any ideas?
> 
> Thanks,
> Tamar
> 
> > 
> > Otherwise the change looks OK.
> > 
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index edb7538a67f00cd80a608ee82510cf437fe88083..029d59016c9652f87d80fc5500f89532c79a66d0 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -81,7 +81,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "gimple-pretty-print.h"
>  #include "opt-problem.h"
>  #include "internal-fn.h"
> -
> +#include "tree-ssa-sccvn.h"
>  
>  /* Loop or bb location, with hotness information.  */
>  dump_user_location_t vect_location;
> @@ -1298,6 +1298,17 @@ vectorize_loops (void)
>        if (has_mask_store
>  	  && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE))
>  	optimize_mask_stores (loop);
> +
> +      auto_bitmap exit_bbs;
> +      /* Perform local CSE, this esp. helps because we emit code for
> +	 predicates that need to be shared for optimal predicate usage.
> +	 However reassoc will re-order them and prevent CSE from working
> +	 as it should.  CSE only the loop body, not the entry.  */
> +      bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
> +
> +      edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0);
> +      do_rpo_vn (cfun, entry, exit_bbs);
> +
>        loop->aux = NULL;
>      }
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imend
-------------- next part --------------
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index a2e13acb6d2..78883b053b1 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -81,7 +81,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-pretty-print.h"
 #include "opt-problem.h"
 #include "internal-fn.h"
-
+#include "tree-ssa-sccvn.h"
 
 /* Loop or bb location, with hotness information.  */
 dump_user_location_t vect_location;
@@ -1278,23 +1278,6 @@ vectorize_loops (void)
 	  }
       }
 
-  for (i = 1; i < number_of_loops (cfun); i++)
-    {
-      loop_vec_info loop_vinfo;
-      bool has_mask_store;
-
-      loop = get_loop (cfun, i);
-      if (!loop || !loop->aux)
-	continue;
-      loop_vinfo = (loop_vec_info) loop->aux;
-      has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
-      delete loop_vinfo;
-      if (has_mask_store
-	  && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE))
-	optimize_mask_stores (loop);
-      loop->aux = NULL;
-    }
-
   /* Fold IFN_GOMP_SIMD_{VF,LANE,LAST_LANE,ORDERED_{START,END}} builtins.  */
   if (cfun->has_simduid_loops)
     {
@@ -1302,14 +1285,12 @@ vectorize_loops (void)
       /* Avoid stale SCEV cache entries for the SIMD_LANE defs.  */
       scev_reset ();
     }
-
   /* Shrink any "omp array simd" temporary arrays to the
      actual vectorization factors.  */
   if (simd_array_to_simduid_htab)
     shrink_simd_arrays (simd_array_to_simduid_htab, simduid_to_vf_htab);
   delete simduid_to_vf_htab;
   cfun->has_simduid_loops = false;
-  vect_slp_fini ();
 
   if (num_vectorized_loops > 0)
     {
@@ -1317,9 +1298,39 @@ vectorize_loops (void)
 	 ???  Also while we try hard to update loop-closed SSA form we fail
 	 to properly do this in some corner-cases (see PR56286).  */
       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_only_virtuals);
-      return TODO_cleanup_cfg;
+      ret |= TODO_cleanup_cfg;
     }
 
+  for (i = 1; i < number_of_loops (cfun); i++)
+    {
+      loop_vec_info loop_vinfo;
+      bool has_mask_store;
+
+      loop = get_loop (cfun, i);
+      if (!loop || !loop->aux)
+	continue;
+      loop_vinfo = (loop_vec_info) loop->aux;
+      has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
+      delete loop_vinfo;
+      if (has_mask_store
+	  && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE))
+	optimize_mask_stores (loop);
+
+      auto_bitmap exit_bbs;
+      /* Perform local CSE, this esp. helps because we emit code for
+	 predicates that need to be shared for optimal predicate usage.
+	 However reassoc will re-order them and prevent CSE from working
+	 as it should.  CSE only the loop body, not the entry.  */
+      bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
+
+      edge entry = EDGE_PRED (loop_preheader_edge (loop)->src, 0);
+      do_rpo_vn (cfun, entry, exit_bbs);
+
+      loop->aux = NULL;
+    }
+
+  vect_slp_fini ();
+
   return ret;
 }
 


More information about the Gcc-patches mailing list