This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Implement ggc_trim
- From: Jan Hubicka <hubicka at ucw dot cz>
- To: gcc-patches at gcc dot gnu dot org, rguenther at suse dot de, mliska at suse dot cz, mjambor at suse dot cz
- Date: Fri, 11 Oct 2019 09:03:53 +0200
- Subject: Implement ggc_trim
Hi,
this patch adds ggc_trim, which releases free pages held by GGC memory
back to the system. This is useful to reduce the memory footprint of WPA streaming:
WPA streaming ought to not use any more GGC memory (patches in testing
for that) and trimming the memory makes it available to fork&malloc used
by stream out machinery.
I collected some stats for cc1 for both GGC and heap (using mallinfo).
Memory footprints are as follows:
After streaming in global stream: 123MB GGC; 25MB of heap.
After streaming in callgraph : 228MB GGC; 45MB of heap.
After streaming in summaries : 373MB GGC; 126MB of heap.
After symbol merging : 348MB GGC; 130MB of heap.
After IPA-ICF : 501MB GGC; 160MB of heap. (this is all ICF)
After IPA-CP : 528MB GGC; 163MB of heap.
After IPA-SRA : 532MB GGC; 163MB of heap.
After Inline : 644MB GGC; 173MB of heap
This is after collecting 118MB of
garbage and returning 740k to system
by madvise_dontneed
After ipa-reference : 644MB GGC; 370MB of heap
I checked this all goes into the
bitmaps; I have WIP patch for that
After releasing summaries : 431MB GGC; 383MB of heap
Trim releases 43MB by unmap
and 321MB by madvise_dontneed
At least I learnt a new fact about ipa-reference consuming 200MB of
memory, which was not obvious from our detailed mem stats.
I think the lowest hanging fruit after this patch is to add
malloc_madvise, which further reduces the footprint, and to fix ipa-reference.
Hopefully Martin will do a bit about ipa-icf.
I will dig into what inliner does but it produces a lot of clones so I
think it is mostly clone and summary duplication. Perhaps we can avoid
copying some of summaries for inline clones.
In TOP I see about 900MB instead of 1.4GB before WPA streaming starts
with both ggc_trim and madvise.
Note that I also tried to hack ggc_free to recognize free pages, but at
least in a simple implementation it is a loss since it makes ggc_alloc
more expensive (it needs to bring pages back and add them into freelists),
which hurts stream-in performance.
I think sweeping once per WPA is no problem; it is definitely less than
1% of WPA time.
Bootstrapped/regtested x86_64-linux, OK?
* ggc-page.c (release_pages): Output statistics when !quiet_flag.
(ggc_collect): Dump later to not interfere with release_pages dump.
(ggc_trim): New function.
* ggc-none.c (ggc_trim): New.
* lto.c (lto_wpa_write_files): Call ggc_trim.
Index: ggc-page.c
===================================================================
--- ggc-page.c (revision 276707)
+++ ggc-page.c (working copy)
@@ -529,7 +529,6 @@ static void clear_page_group_in_use (pag
#endif
static struct page_entry * alloc_page (unsigned);
static void free_page (struct page_entry *);
-static void release_pages (void);
static void clear_marks (void);
static void sweep_pages (void);
static void ggc_recalculate_in_use_p (page_entry *);
@@ -1016,6 +1015,8 @@ free_page (page_entry *entry)
static void
release_pages (void)
{
+ size_t n1 = 0;
+ size_t n2 = 0;
#ifdef USING_MADVISE
page_entry *p, *start_p;
char *start;
@@ -1061,6 +1062,7 @@ release_pages (void)
else
G.free_pages = p;
G.bytes_mapped -= mapped_len;
+ n1 += len;
continue;
}
prev = newprev;
@@ -1092,6 +1094,7 @@ release_pages (void)
/* Don't count those pages as mapped to not touch the garbage collector
unnecessarily. */
G.bytes_mapped -= len;
+ n2 += len;
while (start_p != p)
{
start_p->discarded = true;
@@ -1124,6 +1127,7 @@ release_pages (void)
}
munmap (start, len);
+ n1 += len;
G.bytes_mapped -= len;
}
@@ -1152,10 +1156,20 @@ release_pages (void)
*gp = g->next;
G.bytes_mapped -= g->alloc_size;
free (g->allocation);
+ n1 += g->alloc_size;
}
else
gp = &g->next;
#endif
+ if (!quiet_flag && (n1 || n2))
+ {
+ fprintf (stderr, " {GC");
+ if (n1)
+ fprintf (stderr, " released %luk", (unsigned long)(n1 / 1024));
+ if (n2)
+ fprintf (stderr, " madv_dontneed %luk", (unsigned long)(n2 / 1024));
+ fprintf (stderr, "}");
+ }
}
/* This table provides a fast way to determine ceil(log_2(size)) for
@@ -2178,19 +2192,22 @@ ggc_collect (void)
return;
timevar_push (TV_GC);
- if (!quiet_flag)
- fprintf (stderr, " {GC %luk -> ", (unsigned long) G.allocated / 1024);
if (GGC_DEBUG_LEVEL >= 2)
fprintf (G.debug_file, "BEGIN COLLECTING\n");
/* Zero the total allocated bytes. This will be recalculated in the
sweep phase. */
+ size_t allocated = G.allocated;
G.allocated = 0;
/* Release the pages we freed the last time we collected, but didn't
reuse in the interim. */
release_pages ();
+ /* Output this later so we do not interfere with release_pages. */
+ if (!quiet_flag)
+ fprintf (stderr, " {GC %luk -> ", (unsigned long) allocated / 1024);
+
/* Indicate that we've seen collections at this context depth. */
G.context_depth_collections = ((unsigned long)1 << (G.context_depth + 1)) - 1;
@@ -2221,9 +2238,25 @@ ggc_collect (void)
fprintf (G.debug_file, "END COLLECTING\n");
}
-/* Assume that all GGC memory is reachable and grow the limits for next collection.
- With checking, trigger GGC so -Q compilation outputs how much of memory really is
- reachable. */
+/* Return free pages to the system. */
+
+void
+ggc_trim ()
+{
+ timevar_push (TV_GC);
+ G.allocated = 0;
+ sweep_pages ();
+ release_pages ();
+ if (!quiet_flag)
+ fprintf (stderr, " {GC trimmed to %luk, %luk mapped}",
+ (unsigned long) G.allocated / 1024,
+ (unsigned long) G.bytes_mapped / 1024);
+ timevar_pop (TV_GC);
+}
+
+/* Assume that all GGC memory is reachable and grow the limits for next
+ collection. With checking, trigger GGC so -Q compilation outputs how much
+ of memory really is reachable. */
void
ggc_grow (void)
Index: ggc-none.c
===================================================================
--- ggc-none.c (revision 276707)
+++ ggc-none.c (working copy)
@@ -72,3 +72,8 @@ void
ggc_grow (void)
{
}
+
+void
+ggc_trim (void)
+{
+}
Index: lto/lto.c
===================================================================
--- lto/lto.c (revision 276707)
+++ lto/lto.c (working copy)
@@ -304,6 +306,7 @@ lto_wpa_write_files (void)
timevar_push (TV_WHOPR_WPA_IO);
+ ggc_trim ();
/* Generate a prefix for the LTRANS unit files. */
blen = strlen (ltrans_output_list);
temp_filename = (char *) xmalloc (blen + sizeof ("2147483648.o"));