This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [PATCH][0/n] tree LIM TLC - series part for backporting, limit LIM
- From: Richard Biener <rguenther at suse dot de>
- To: gcc-patches at gcc dot gnu dot org
- Date: Fri, 15 Mar 2013 11:36:31 +0100 (CET)
- Subject: Re: [PATCH][0/n] tree LIM TLC - series part for backporting, limit LIM
- References: <alpine.LNX.2.00.1303141428080.3543@zhemvz.fhfr.qr>
On Thu, 14 Mar 2013, Richard Biener wrote:
>
> This extracts pieces from the already posted patch series that are
> most worthwhile and applicable for backporting to both 4.8 and 4.7.
> It also re-implements the limiting of the maximum number of memory
> references to consider for LIMs dependence analysis. This limiting
> is now done per loop-nest and disables optimizing outer loops
> only. The limiting requires backporting introduction of the
> shared unanalyzable mem-ref - it works by marking that as stored
> in loops we do not want to compute dependences for - which makes
> dependence computation for mems in those loops linear, as that
> mem-ref, which conveniently has ID 0, is tested first.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> The current limit of 1000 datarefs is quite low (well, for LIMs
> purposes, that is), and I only bothered to care about -O1 for
> backports (no caching of the affine combination). With the
> limit in place and at -O1 LIM now takes
>
> tree loop invariant motion: 0.55 ( 1%) usr
>
> for the testcase in PR39326. Four patches in total, we might
> consider not backporting the limiting, without it this
> insane testcase has, at ~2GB memory usage (peak determined by IRA)
>
> tree loop invariant motion: 533.30 (77%) usr
>
> but avoids running into the DSE / combine issue (and thus stays
> manageable overall at -O1). With limiting it requires -fno-dse
> to not blow up (>5GB of memory use).
Note that the limiting patch (below) causes code-generation differences
because it collects memory-references in a different order and
store-motion applies its transform in order of mem-ref IDs
(different order of loads / stores and different decl UIDs). The
different ordering results in quite a big speedup because bitmaps
have a more regular form (maybe only for this testcase though).
Richard.
> 2013-03-14 Richard Biener <rguenther@suse.de>
>
> PR tree-optimization/39326
> * tree-ssa-loop-im.c: Include diagnostic-core.h.
> (mark_ref_stored): Optimize.
> (gather_mem_refs_stmt): Also set all_refs_stored_bit if stored.
> (create_vop_ref_mapping_loop, create_vop_ref_mapping): Remove
> and fold functionality into ...
> (gather_mem_refs_in_loops): ... this. Iterate over loops,
> counting memory references and punting when more than
> --param loop-max-datarefs-for-datadeps.
> (analyze_memory_references): Adjust.
>
> Index: trunk/gcc/tree-ssa-loop-im.c
> ===================================================================
> *** trunk.orig/gcc/tree-ssa-loop-im.c 2013-03-14 12:52:37.000000000 +0100
> --- trunk/gcc/tree-ssa-loop-im.c 2013-03-14 14:23:47.533164359 +0100
> *************** along with GCC; see the file COPYING3.
> *** 20,25 ****
> --- 20,26 ----
> #include "config.h"
> #include "system.h"
> #include "coretypes.h"
> + #include "diagnostic-core.h"
> #include "tm.h"
> #include "tree.h"
> #include "tm_p.h"
> *************** record_mem_ref_loc (mem_ref_p ref, struc
> *** 1551,1561 ****
> static void
> mark_ref_stored (mem_ref_p ref, struct loop *loop)
> {
> ! for (;
> ! loop != current_loops->tree_root
> ! && !bitmap_bit_p (ref->stored, loop->num);
> ! loop = loop_outer (loop))
> ! bitmap_set_bit (ref->stored, loop->num);
> }
>
> /* Gathers memory references in statement STMT in LOOP, storing the
> --- 1552,1560 ----
> static void
> mark_ref_stored (mem_ref_p ref, struct loop *loop)
> {
> ! while (loop != current_loops->tree_root
> ! && bitmap_set_bit (ref->stored, loop->num))
> ! loop = loop_outer (loop);
> }
>
> /* Gathers memory references in statement STMT in LOOP, storing the
> *************** gather_mem_refs_stmt (struct loop *loop,
> *** 1618,1624 ****
> }
> bitmap_set_bit (memory_accesses.refs_in_loop[loop->num], ref->id);
> if (is_stored)
> ! mark_ref_stored (ref, loop);
> return;
> }
>
> --- 1617,1627 ----
> }
> bitmap_set_bit (memory_accesses.refs_in_loop[loop->num], ref->id);
> if (is_stored)
> ! {
> ! bitmap_set_bit (memory_accesses.all_refs_stored_in_loop[loop->num],
> ! ref->id);
> ! mark_ref_stored (ref, loop);
> ! }
> return;
> }
>
> *************** gather_mem_refs_stmt (struct loop *loop,
> *** 1627,1704 ****
> static void
> gather_mem_refs_in_loops (void)
> {
> - gimple_stmt_iterator bsi;
> - basic_block bb;
> struct loop *loop;
> loop_iterator li;
> - bitmap lrefs, alrefs, alrefso;
> -
> - FOR_EACH_BB (bb)
> - {
> - loop = bb->loop_father;
> - if (loop == current_loops->tree_root)
> - continue;
>
> ! for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
> ! gather_mem_refs_stmt (loop, gsi_stmt (bsi));
> ! }
>
> /* Propagate the information about accessed memory references up
> the loop hierarchy. */
> FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
> {
> ! lrefs = memory_accesses.refs_in_loop[loop->num];
> ! alrefs = memory_accesses.all_refs_in_loop[loop->num];
> ! bitmap_ior_into (alrefs, lrefs);
>
> ! if (loop_outer (loop) == current_loops->tree_root)
> continue;
>
> ! alrefso = memory_accesses.all_refs_in_loop[loop_outer (loop)->num];
> ! bitmap_ior_into (alrefso, alrefs);
> }
> - }
> -
> - /* Create a mapping from virtual operands to references that touch them
> - in LOOP. */
> -
> - static void
> - create_vop_ref_mapping_loop (struct loop *loop)
> - {
> - bitmap refs = memory_accesses.refs_in_loop[loop->num];
> - struct loop *sloop;
> - bitmap_iterator bi;
> - unsigned i;
> - mem_ref_p ref;
>
> ! EXECUTE_IF_SET_IN_BITMAP (refs, 0, i, bi)
> ! {
> ! ref = memory_accesses.refs_list[i];
> ! for (sloop = loop; sloop != current_loops->tree_root;
> ! sloop = loop_outer (sloop))
> ! if (bitmap_bit_p (ref->stored, loop->num))
> ! {
> ! bitmap refs_stored
> ! = memory_accesses.all_refs_stored_in_loop[sloop->num];
> ! bitmap_set_bit (refs_stored, ref->id);
> ! }
> ! }
> }
>
> - /* For each non-clobbered virtual operand and each loop, record the memory
> - references in this loop that touch the operand. */
> -
> - static void
> - create_vop_ref_mapping (void)
> - {
> - loop_iterator li;
> - struct loop *loop;
> -
> - FOR_EACH_LOOP (li, loop, 0)
> - {
> - create_vop_ref_mapping_loop (loop);
> - }
> - }
>
> /* Gathers information about memory accesses in the loops. */
>
> --- 1630,1707 ----
> static void
> gather_mem_refs_in_loops (void)
> {
> struct loop *loop;
> loop_iterator li;
>
> ! unsigned *count = XCNEWVEC (unsigned, number_of_loops ());
>
> /* Propagate the information about accessed memory references up
> the loop hierarchy. */
> FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
> {
> ! basic_block *bbs = get_loop_body (loop);
> ! struct loop *outer;
> ! unsigned i;
> !
> ! for (i = 0; i < loop->num_nodes; ++i)
> ! {
> ! basic_block bb = bbs[i];
> ! gimple_stmt_iterator gsi;
> ! if (bb->loop_father != loop)
> ! continue;
> ! for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> ! gather_mem_refs_stmt (loop, gsi_stmt (gsi));
> ! }
> ! free (bbs);
>
> ! /* Finalize the overall numbers (including subloops) for this loop. */
> ! count[loop->num]
> ! += bitmap_count_bits (memory_accesses.refs_in_loop[loop->num]);
> !
> ! /* If there are too many memory references in this loop and its
> ! sub-loops such that dependence testing would blow up compile-time
> ! drop a unanalyzed store into the list of memory references to
> ! get to the early-out. */
> ! if ((count[loop->num]
> ! > (unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
> ! && bitmap_set_bit
> ! (memory_accesses.all_refs_stored_in_loop[loop->num],
> ! UNANALYZABLE_MEM_ID))
> ! {
> ! bitmap_set_bit (memory_accesses.refs_in_loop[loop->num],
> ! UNANALYZABLE_MEM_ID);
> ! mark_ref_stored (memory_accesses.refs_list[UNANALYZABLE_MEM_ID],
> ! loop);
> ! if (dump_file && (dump_flags & TDF_DETAILS))
> ! fprintf (dump_file, "Too many memory references in loop %d and its "
> ! "super-loops, giving up\n", loop->num);
> ! warning_at (gimple_location (gsi_stmt (gsi_start_bb (loop->header))),
> ! OPT_Wdisabled_optimization,
> ! "-ftree-loop-im: number of memory references in loop "
> ! "exceeds the --param loops-max-datarefs-for-datadeps "
> ! "threshold");
> ! }
> !
> ! /* Finalize the overall touched references (including subloops). */
> ! bitmap_ior_into (memory_accesses.all_refs_in_loop[loop->num],
> ! memory_accesses.refs_in_loop[loop->num]);
> !
> ! /* Propagate the information about accessed memory references up
> ! the loop hierarchy. */
> ! outer = loop_outer (loop);
> ! if (outer == current_loops->tree_root)
> continue;
>
> ! bitmap_ior_into (memory_accesses.all_refs_in_loop[outer->num],
> ! memory_accesses.all_refs_in_loop[loop->num]);
> ! bitmap_ior_into (memory_accesses.all_refs_stored_in_loop[outer->num],
> ! memory_accesses.all_refs_stored_in_loop[loop->num]);
> ! count[outer->num] += count[loop->num];
> }
>
> ! free (count);
> }
>
>
> /* Gathers information about memory accesses in the loops. */
>
> *************** analyze_memory_references (void)
> *** 1731,1737 ****
> memory_accesses.ttae_cache = NULL;
>
> gather_mem_refs_in_loops ();
> - create_vop_ref_mapping ();
> }
>
> /* Returns true if MEM1 and MEM2 may alias. TTAE_CACHE is used as a cache in
> --- 1734,1739 ----
>
--
Richard Biener <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer