[RFC] Make vectorizer to skip loops with small iteration estimate

Richard Guenther richard.guenther@gmail.com
Mon Oct 8 10:02:00 GMT 2012


On Sat, Oct 6, 2012 at 11:34 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
> Hi,
> I benchmarked the patch moving loop header copying and it is quite noticeable win.
>
> Some testsuite updating is needed. In many cases it is just because the
> optimizations are now happening earlier.
> There are however few testusite failures I have torubles to deal with:
> ./testsuite/gcc/gcc.sum:FAIL: gcc.dg/tree-ssa/pr21559.c scan-tree-dump-times vrp1 "Threaded jump" 3
> ./testsuite/gcc/gcc.sum:FAIL: gcc.dg/tree-ssa/ssa-dom-thread-2.c scan-tree-dump-times vrp1 "Jumps threaded: 1" 1
> ./testsuite/gcc/gcc.sum:FAIL: gcc.dg/vect/O3-slp-reduc-10.c scan-tree-dump-times vect "vectorized 1 loops" 2
> ./testsuite/g++/g++.sum:FAIL: g++.dg/tree-ssa/pr18178.C -std=gnu++98  scan-tree-dump-times vrp1 "if " 1
> ./testsuite/g++/g++.sum:FAIL: g++.dg/tree-ssa/pr18178.C -std=gnu++11  scan-tree-dump-times vrp1 "if " 1
>
> This is mostly about VRP losing its ability to thread some jumps from the
> duplicated loop header out of the loop across the loopback edge.  This seems to
> be due to loop updating logic.  Do we care about these?

Yes, I think so.  At least we care that the optimized result is the same.

Can you elaborate on "due to loop updating logic"?

Can you elaborate on the def_split_header_continue_p change?  Which probably
should be tested and installed separately?

Thanks,
Richard.

> Honza
>
> Index: tree-ssa-threadupdate.c
> ===================================================================
> *** tree-ssa-threadupdate.c     (revision 192123)
> --- tree-ssa-threadupdate.c     (working copy)
> *************** static bool
> *** 846,854 ****
>   def_split_header_continue_p (const_basic_block bb, const void *data)
>   {
>     const_basic_block new_header = (const_basic_block) data;
> !   return (bb != new_header
> !         && (loop_depth (bb->loop_father)
> !             >= loop_depth (new_header->loop_father)));
>   }
>
>   /* Thread jumps through the header of LOOP.  Returns true if cfg changes.
> --- 846,860 ----
>   def_split_header_continue_p (const_basic_block bb, const void *data)
>   {
>     const_basic_block new_header = (const_basic_block) data;
> !   const struct loop *l;
> !
> !   if (bb == new_header
> !       || loop_depth (bb->loop_father) < loop_depth (new_header->loop_father))
> !     return false;
> !   for (l = bb->loop_father; l; l = loop_outer (l))
> !     if (l == new_header->loop_father)
> !       return true;
> !   return false;
>   }
>
>   /* Thread jumps through the header of LOOP.  Returns true if cfg changes.
> Index: testsuite/gcc.dg/unroll_2.c
> ===================================================================
> *** testsuite/gcc.dg/unroll_2.c (revision 192123)
> --- testsuite/gcc.dg/unroll_2.c (working copy)
> ***************
> *** 1,5 ****
>   /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo -fenable-rtl-loop2_unroll" } */
>
>   unsigned a[100], b[100];
>   inline void bar()
> --- 1,5 ----
>   /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo -fenable-rtl-loop2_unroll -fno-tree-dominator-opts" } */
>
>   unsigned a[100], b[100];
>   inline void bar()
> Index: testsuite/gcc.dg/unroll_3.c
> ===================================================================
> *** testsuite/gcc.dg/unroll_3.c (revision 192123)
> --- testsuite/gcc.dg/unroll_3.c (working copy)
> ***************
> *** 1,5 ****
>   /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo" } */
>
>   unsigned a[100], b[100];
>   inline void bar()
> --- 1,5 ----
>   /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo -fno-tree-dominator-opts" } */
>
>   unsigned a[100], b[100];
>   inline void bar()
> Index: testsuite/gcc.dg/torture/pr23821.c
> ===================================================================
> *** testsuite/gcc.dg/torture/pr23821.c  (revision 192123)
> --- testsuite/gcc.dg/torture/pr23821.c  (working copy)
> ***************
> *** 1,9 ****
>   /* { dg-do compile } */
>   /* { dg-skip-if "" { *-*-* } { "-O0" "-fno-fat-lto-objects" } { "" } } */
> ! /* At -O1 DOM threads a jump in a non-optimal way which leads to
>      the bogus propagation.  */
> ! /* { dg-skip-if "" { *-*-* } { "-O1" } { "" } } */
> ! /* { dg-options "-fdump-tree-ivcanon-details" } */
>
>   int a[199];
>
> --- 1,8 ----
>   /* { dg-do compile } */
>   /* { dg-skip-if "" { *-*-* } { "-O0" "-fno-fat-lto-objects" } { "" } } */
> ! /* DOM threads a jump in a non-optimal way which leads to
>      the bogus propagation.  */
> ! /* { dg-options "-fdump-tree-ivcanon-details -fno-tree-dominator-opts" } */
>
>   int a[199];
>
> Index: testsuite/gcc.dg/tree-ssa/ivopt_1.c
> ===================================================================
> *** testsuite/gcc.dg/tree-ssa/ivopt_1.c (revision 192123)
> --- testsuite/gcc.dg/tree-ssa/ivopt_1.c (working copy)
> *************** void foo (int i_width, TYPE dst, TYPE sr
> *** 14,18 ****
>   }
>
>
> ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
>   /* { dg-final { cleanup-tree-dump "ivopts" } } */
> --- 14,18 ----
>   }
>
>
> ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
>   /* { dg-final { cleanup-tree-dump "ivopts" } } */
> Index: testsuite/gcc.dg/tree-ssa/ivopt_2.c
> ===================================================================
> *** testsuite/gcc.dg/tree-ssa/ivopt_2.c (revision 192123)
> --- testsuite/gcc.dg/tree-ssa/ivopt_2.c (working copy)
> *************** void foo (int i_width, TYPE dst, TYPE sr
> *** 13,17 ****
>          }
>   }
>
> ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
>   /* { dg-final { cleanup-tree-dump "ivopts" } } */
> --- 13,17 ----
>          }
>   }
>
> ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
>   /* { dg-final { cleanup-tree-dump "ivopts" } } */
> Index: testsuite/gcc.dg/tree-ssa/ivopt_3.c
> ===================================================================
> *** testsuite/gcc.dg/tree-ssa/ivopt_3.c (revision 192123)
> --- testsuite/gcc.dg/tree-ssa/ivopt_3.c (working copy)
> *************** void foo (int i_width, char* dst, char*
> *** 16,20 ****
>          }
>   }
>
> ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
>   /* { dg-final { cleanup-tree-dump "ivopts" } } */
> --- 16,20 ----
>          }
>   }
>
> ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
>   /* { dg-final { cleanup-tree-dump "ivopts" } } */
> Index: testsuite/gcc.dg/tree-ssa/ivopt_4.c
> ===================================================================
> *** testsuite/gcc.dg/tree-ssa/ivopt_4.c (revision 192123)
> --- testsuite/gcc.dg/tree-ssa/ivopt_4.c (working copy)
> *************** void foo (int i_width, TYPE dst, TYPE sr
> *** 15,19 ****
>          }
>   }
>
> ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
>   /* { dg-final { cleanup-tree-dump "ivopts" } } */
> --- 15,19 ----
>          }
>   }
>
> ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
>   /* { dg-final { cleanup-tree-dump "ivopts" } } */
> Index: testsuite/gcc.dg/unroll_4.c
> ===================================================================
> *** testsuite/gcc.dg/unroll_4.c (revision 192123)
> --- testsuite/gcc.dg/unroll_4.c (working copy)
> ***************
> *** 1,5 ****
>   /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo2" } */
>
>   unsigned a[100], b[100];
>   inline void bar()
> --- 1,5 ----
>   /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo2 -fno-tree-dominator-opts" } */
>
>   unsigned a[100], b[100];
>   inline void bar()
> Index: testsuite/gcc.dg/tree-prof/update-loopch.c
> ===================================================================
> *** testsuite/gcc.dg/tree-prof/update-loopch.c  (revision 192123)
> --- testsuite/gcc.dg/tree-prof/update-loopch.c  (working copy)
> *************** main ()
> *** 11,20 ****
>       }
>     return 0;
>   }
> ! /* Loop header copying will peel away the initial conditional, so the loop body
> !    is once reached directly from entry point of function, rest via loopback
> !    edge.  */
> ! /* { dg-final-use { scan-ipa-dump "loop depth 0, count 33334" "profile"} } */
>   /* { dg-final-use { scan-tree-dump "loop depth 1, count 33332" "optimized"} } */
>   /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */
>   /* { dg-final-use { cleanup-ipa-dump "profile" } } */
> --- 11,20 ----
>       }
>     return 0;
>   }
> ! /* Loop header copying, now happening before profiling, will peel away the
> !    initial conditional, so the loop body is once reached directly from entry
> !    point of function, rest via loopback edge.  */
> ! /* { dg-final-use { scan-ipa-dump "loop depth 0, count 33332" "profile"} } */
>   /* { dg-final-use { scan-tree-dump "loop depth 1, count 33332" "optimized"} } */
>   /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */
>   /* { dg-final-use { cleanup-ipa-dump "profile" } } */
> Index: testsuite/gcc.dg/unroll_1.c
> ===================================================================
> *** testsuite/gcc.dg/unroll_1.c (revision 192123)
> --- testsuite/gcc.dg/unroll_1.c (working copy)
> ***************
> *** 1,5 ****
>   /* { dg-do compile } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll" } */
>
>   unsigned a[100], b[100];
>   inline void bar()
> --- 1,5 ----
>   /* { dg-do compile } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll -fno-tree-dominator-opts" } */
>
>   unsigned a[100], b[100];
>   inline void bar()
> Index: passes.c
> ===================================================================
> *** passes.c    (revision 192123)
> --- passes.c    (working copy)
> *************** init_optimization_passes (void)
> *** 1330,1335 ****
> --- 1330,1340 ----
>           NEXT_PASS (pass_convert_switch);
>             NEXT_PASS (pass_cleanup_eh);
>             NEXT_PASS (pass_profile);
> +         /* Scheduling header copying before pass_ipa_tree_profile is important
> +            to get loop iteration counts estimated right.
> +            Scheduling it after pass_profile prevents it to copy loop headers
> +            in cold functions declares by the user.  */
> +           NEXT_PASS (pass_ch);
>             NEXT_PASS (pass_local_pure_const);
>           /* Split functions creates parts that are not run through
>              early optimizations again.  It is thus good idea to do this
> *************** init_optimization_passes (void)
> *** 1406,1412 ****
>         NEXT_PASS (pass_tree_ifcombine);
>         NEXT_PASS (pass_phiopt);
>         NEXT_PASS (pass_tail_recursion);
> -       NEXT_PASS (pass_ch);
>         NEXT_PASS (pass_stdarg);
>         NEXT_PASS (pass_lower_complex);
>         NEXT_PASS (pass_sra);
> --- 1411,1416 ----



More information about the Gcc-patches mailing list