[RFC] Make vectorizer to skip loops with small iteration estimate
Richard Guenther
richard.guenther@gmail.com
Mon Oct 8 10:02:00 GMT 2012
On Sat, Oct 6, 2012 at 11:34 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
> Hi,
> I benchmarked the patch moving loop header copying and it is quite a noticeable win.
>
> Some testsuite updating is needed. In many cases it is just because the
> optimizations are now happening earlier.
> There are, however, a few testsuite failures I have trouble dealing with:
> ./testsuite/gcc/gcc.sum:FAIL: gcc.dg/tree-ssa/pr21559.c scan-tree-dump-times vrp1 "Threaded jump" 3
> ./testsuite/gcc/gcc.sum:FAIL: gcc.dg/tree-ssa/ssa-dom-thread-2.c scan-tree-dump-times vrp1 "Jumps threaded: 1" 1
> ./testsuite/gcc/gcc.sum:FAIL: gcc.dg/vect/O3-slp-reduc-10.c scan-tree-dump-times vect "vectorized 1 loops" 2
> ./testsuite/g++/g++.sum:FAIL: g++.dg/tree-ssa/pr18178.C -std=gnu++98 scan-tree-dump-times vrp1 "if " 1
> ./testsuite/g++/g++.sum:FAIL: g++.dg/tree-ssa/pr18178.C -std=gnu++11 scan-tree-dump-times vrp1 "if " 1
>
> This is mostly about VRP losing its ability to thread some jumps from the
> duplicated loop header out of the loop across the loopback edge. This seems to
> be due to loop updating logic. Do we care about these?
Yes, I think so. At least we care that the optimized result is the same.
Can you elaborate on "due to loop updating logic"?
Can you elaborate on the def_split_header_continue_p change? Which probably
should be tested and installed separately?
Thanks,
Richard.
> Honza
>
> Index: tree-ssa-threadupdate.c
> ===================================================================
> *** tree-ssa-threadupdate.c (revision 192123)
> --- tree-ssa-threadupdate.c (working copy)
> *************** static bool
> *** 846,854 ****
> def_split_header_continue_p (const_basic_block bb, const void *data)
> {
> const_basic_block new_header = (const_basic_block) data;
> ! return (bb != new_header
> ! && (loop_depth (bb->loop_father)
> ! >= loop_depth (new_header->loop_father)));
> }
>
> /* Thread jumps through the header of LOOP. Returns true if cfg changes.
> --- 846,860 ----
> def_split_header_continue_p (const_basic_block bb, const void *data)
> {
> const_basic_block new_header = (const_basic_block) data;
> ! const struct loop *l;
> !
> ! if (bb == new_header
> ! || loop_depth (bb->loop_father) < loop_depth (new_header->loop_father))
> ! return false;
> ! for (l = bb->loop_father; l; l = loop_outer (l))
> ! if (l == new_header->loop_father)
> ! return true;
> ! return false;
> }
>
> /* Thread jumps through the header of LOOP. Returns true if cfg changes.
> Index: testsuite/gcc.dg/unroll_2.c
> ===================================================================
> *** testsuite/gcc.dg/unroll_2.c (revision 192123)
> --- testsuite/gcc.dg/unroll_2.c (working copy)
> ***************
> *** 1,5 ****
> /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo -fenable-rtl-loop2_unroll" } */
>
> unsigned a[100], b[100];
> inline void bar()
> --- 1,5 ----
> /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo -fenable-rtl-loop2_unroll -fno-tree-dominator-opts" } */
>
> unsigned a[100], b[100];
> inline void bar()
> Index: testsuite/gcc.dg/unroll_3.c
> ===================================================================
> *** testsuite/gcc.dg/unroll_3.c (revision 192123)
> --- testsuite/gcc.dg/unroll_3.c (working copy)
> ***************
> *** 1,5 ****
> /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo" } */
>
> unsigned a[100], b[100];
> inline void bar()
> --- 1,5 ----
> /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo -fno-tree-dominator-opts" } */
>
> unsigned a[100], b[100];
> inline void bar()
> Index: testsuite/gcc.dg/torture/pr23821.c
> ===================================================================
> *** testsuite/gcc.dg/torture/pr23821.c (revision 192123)
> --- testsuite/gcc.dg/torture/pr23821.c (working copy)
> ***************
> *** 1,9 ****
> /* { dg-do compile } */
> /* { dg-skip-if "" { *-*-* } { "-O0" "-fno-fat-lto-objects" } { "" } } */
> ! /* At -O1 DOM threads a jump in a non-optimal way which leads to
> the bogus propagation. */
> ! /* { dg-skip-if "" { *-*-* } { "-O1" } { "" } } */
> ! /* { dg-options "-fdump-tree-ivcanon-details" } */
>
> int a[199];
>
> --- 1,8 ----
> /* { dg-do compile } */
> /* { dg-skip-if "" { *-*-* } { "-O0" "-fno-fat-lto-objects" } { "" } } */
> ! /* DOM threads a jump in a non-optimal way which leads to
> the bogus propagation. */
> ! /* { dg-options "-fdump-tree-ivcanon-details -fno-tree-dominator-opts" } */
>
> int a[199];
>
> Index: testsuite/gcc.dg/tree-ssa/ivopt_1.c
> ===================================================================
> *** testsuite/gcc.dg/tree-ssa/ivopt_1.c (revision 192123)
> --- testsuite/gcc.dg/tree-ssa/ivopt_1.c (working copy)
> *************** void foo (int i_width, TYPE dst, TYPE sr
> *** 14,18 ****
> }
>
>
> ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
> /* { dg-final { cleanup-tree-dump "ivopts" } } */
> --- 14,18 ----
> }
>
>
> ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
> /* { dg-final { cleanup-tree-dump "ivopts" } } */
> Index: testsuite/gcc.dg/tree-ssa/ivopt_2.c
> ===================================================================
> *** testsuite/gcc.dg/tree-ssa/ivopt_2.c (revision 192123)
> --- testsuite/gcc.dg/tree-ssa/ivopt_2.c (working copy)
> *************** void foo (int i_width, TYPE dst, TYPE sr
> *** 13,17 ****
> }
> }
>
> ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
> /* { dg-final { cleanup-tree-dump "ivopts" } } */
> --- 13,17 ----
> }
> }
>
> ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
> /* { dg-final { cleanup-tree-dump "ivopts" } } */
> Index: testsuite/gcc.dg/tree-ssa/ivopt_3.c
> ===================================================================
> *** testsuite/gcc.dg/tree-ssa/ivopt_3.c (revision 192123)
> --- testsuite/gcc.dg/tree-ssa/ivopt_3.c (working copy)
> *************** void foo (int i_width, char* dst, char*
> *** 16,20 ****
> }
> }
>
> ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
> /* { dg-final { cleanup-tree-dump "ivopts" } } */
> --- 16,20 ----
> }
> }
>
> ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
> /* { dg-final { cleanup-tree-dump "ivopts" } } */
> Index: testsuite/gcc.dg/tree-ssa/ivopt_4.c
> ===================================================================
> *** testsuite/gcc.dg/tree-ssa/ivopt_4.c (revision 192123)
> --- testsuite/gcc.dg/tree-ssa/ivopt_4.c (working copy)
> *************** void foo (int i_width, TYPE dst, TYPE sr
> *** 15,19 ****
> }
> }
>
> ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
> /* { dg-final { cleanup-tree-dump "ivopts" } } */
> --- 15,19 ----
> }
> }
>
> ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
> /* { dg-final { cleanup-tree-dump "ivopts" } } */
> Index: testsuite/gcc.dg/unroll_4.c
> ===================================================================
> *** testsuite/gcc.dg/unroll_4.c (revision 192123)
> --- testsuite/gcc.dg/unroll_4.c (working copy)
> ***************
> *** 1,5 ****
> /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo2" } */
>
> unsigned a[100], b[100];
> inline void bar()
> --- 1,5 ----
> /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo2 -fno-tree-dominator-opts" } */
>
> unsigned a[100], b[100];
> inline void bar()
> Index: testsuite/gcc.dg/tree-prof/update-loopch.c
> ===================================================================
> *** testsuite/gcc.dg/tree-prof/update-loopch.c (revision 192123)
> --- testsuite/gcc.dg/tree-prof/update-loopch.c (working copy)
> *************** main ()
> *** 11,20 ****
> }
> return 0;
> }
> ! /* Loop header copying will peel away the initial conditional, so the loop body
> ! is once reached directly from entry point of function, rest via loopback
> ! edge. */
> ! /* { dg-final-use { scan-ipa-dump "loop depth 0, count 33334" "profile"} } */
> /* { dg-final-use { scan-tree-dump "loop depth 1, count 33332" "optimized"} } */
> /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */
> /* { dg-final-use { cleanup-ipa-dump "profile" } } */
> --- 11,20 ----
> }
> return 0;
> }
> ! /* Loop header copying, now happening before profiling, will peel away the
> ! initial conditional, so the loop body is once reached directly from entry
> ! point of function, rest via loopback edge. */
> ! /* { dg-final-use { scan-ipa-dump "loop depth 0, count 33332" "profile"} } */
> /* { dg-final-use { scan-tree-dump "loop depth 1, count 33332" "optimized"} } */
> /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */
> /* { dg-final-use { cleanup-ipa-dump "profile" } } */
> Index: testsuite/gcc.dg/unroll_1.c
> ===================================================================
> *** testsuite/gcc.dg/unroll_1.c (revision 192123)
> --- testsuite/gcc.dg/unroll_1.c (working copy)
> ***************
> *** 1,5 ****
> /* { dg-do compile } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll" } */
>
> unsigned a[100], b[100];
> inline void bar()
> --- 1,5 ----
> /* { dg-do compile } */
> ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll -fno-tree-dominator-opts" } */
>
> unsigned a[100], b[100];
> inline void bar()
> Index: passes.c
> ===================================================================
> *** passes.c (revision 192123)
> --- passes.c (working copy)
> *************** init_optimization_passes (void)
> *** 1330,1335 ****
> --- 1330,1340 ----
> NEXT_PASS (pass_convert_switch);
> NEXT_PASS (pass_cleanup_eh);
> NEXT_PASS (pass_profile);
> + /* Scheduling header copying before pass_ipa_tree_profile is important
> + to get loop iteration counts estimated right.
> + Scheduling it after pass_profile prevents it from copying loop headers
> + in cold functions declared by the user. */
> + NEXT_PASS (pass_ch);
> NEXT_PASS (pass_local_pure_const);
> /* Split functions creates parts that are not run through
> early optimizations again. It is thus good idea to do this
> *************** init_optimization_passes (void)
> *** 1406,1412 ****
> NEXT_PASS (pass_tree_ifcombine);
> NEXT_PASS (pass_phiopt);
> NEXT_PASS (pass_tail_recursion);
> - NEXT_PASS (pass_ch);
> NEXT_PASS (pass_stdarg);
> NEXT_PASS (pass_lower_complex);
> NEXT_PASS (pass_sra);
> --- 1411,1416 ----
More information about the Gcc-patches
mailing list