This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Improve partitioning decisions
- From: Jan Hubicka <hubicka at ucw dot cz>
- To: gcc-patches at gcc dot gnu dot org
- Date: Sat, 28 Apr 2018 20:55:51 +0200
- Subject: Improve partitioning decisions
Hi,
with number of cores increasing our partitioning algorithm needs some love,
because it works quite bad.
This patch fixes few issues with ballanced partitioning algorithm:
1) I have switched all size/boundary summaries to 64bit arthmetics as they
may overflow for very large units
2) internal calls within partition was misaccounted
3) I no longer account external references (as those will never be local
no matter how we partition)
4) the range where we search for best split is not +- 1/8 of partition.
5) we no longer look for worst possible split, but for best one.
6) I have added bit extra debug info.
I have checked this with Firefox build. Previously we did 31 instead of 32 partitions
and the difference between largest and smallest was 170%. Now we do 32 partitions and get
the difference is "only" about 70%. The resulting code quality improves a bit and
we get 260s for largest partition build time instead of 305s.
I will commit it into the mainline and backport to release branches after some soaking
(and 8.1 release), since without the patch partitioning decisions with larger # of partitions
are really pretty bad.
Situation is still not perfect - I do not get any speedup for increasing partitions to 64
becuase we still get too much of extra streaming. Something to track this stage1.
Honza
* lto-partition.c: Include sreal.h
(add_symbol_to_partition_1): Use size instead of self_size
for size estimate.
(account_reference_p): New.
(lto_balanced_map): Use 64bit arithmetics for size calculatoins; cleanup;
fix accounting errors in boundary size; add debug output; combine cost
as cost/size instead of cost/internal; reduce the partitioning error to
+- 1/8 of the parttion size.
Index: lto-partition.c
===================================================================
--- lto-partition.c (revision 259742)
+++ lto-partition.c (working copy)
@@ -35,6 +35,7 @@
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "lto-partition.h"
+#include "sreal.h"
vec<ltrans_partition> ltrans_partitions;
@@ -152,8 +153,8 @@
if (cgraph_node *cnode = dyn_cast <cgraph_node *> (node))
{
struct cgraph_edge *e;
- if (!node->alias)
- part->insns += ipa_fn_summaries->get (cnode)->self_size;
+ if (!node->alias && c == SYMBOL_PARTITION)
+ part->insns += ipa_fn_summaries->get (cnode)->size;
/* Add all inline clones and callees that are duplicated. */
for (e = cnode->callees; e; e = e->next_callee)
@@ -276,8 +277,9 @@
delete partition->initializers_visited;
partition->initializers_visited = NULL;
- if (!node->alias && (cnode = dyn_cast <cgraph_node *> (node)))
- partition->insns -= ipa_fn_summaries->get (cnode)->self_size;
+ if (!node->alias && (cnode = dyn_cast <cgraph_node *> (node))
+ && node->get_partitioning_class () == SYMBOL_PARTITION)
+ partition->insns -= ipa_fn_summaries->get (cnode)->size;
lto_symtab_encoder_delete_node (partition->encoder, node);
node->aux = (void *)((size_t)node->aux - 1);
}
@@ -408,7 +410,25 @@
add_symbol_to_partition (partition, node);
}
+/* Return true if we should account reference from N1 to N2 in cost
+ of partition boundary. */
+bool
+account_reference_p (symtab_node *n1, symtab_node *n2)
+{
+ if (cgraph_node *cnode = dyn_cast <cgraph_node *> (n1))
+ n1 = cnode;
+ /* Do not account recursion - the code below will handle it incorrectly
+ otherwise. Also do not account references to external symbols.
+ They will never become local. */
+ if (n1 == n2
+ || DECL_EXTERNAL (n2->decl)
+ || !n2->definition)
+ return false;
+ return true;
+}
+
+
/* Group cgraph nodes into equally-sized partitions.
The partitioning algorithm is simple: nodes are taken in predefined order.
@@ -457,14 +477,14 @@
auto_vec<varpool_node *> varpool_order;
int i;
struct cgraph_node *node;
- int original_total_size, total_size = 0, best_total_size = 0;
- int partition_size;
+ int64_t original_total_size, total_size = 0;
+ int64_t partition_size;
ltrans_partition partition;
int last_visited_node = 0;
varpool_node *vnode;
- int cost = 0, internal = 0;
- int best_n_nodes = 0, best_i = 0, best_cost =
- INT_MAX, best_internal = 0;
+ int64_t cost = 0, internal = 0;
+ int best_n_nodes = 0, best_i = 0;
+ int64_t best_cost = -1, best_internal = 0, best_size = 0;
int npartitions;
int current_order = -1;
int noreorder_pos = 0;
@@ -513,7 +533,8 @@
/* Compute partition size and create the first partition. */
if (PARAM_VALUE (MIN_PARTITION_SIZE) > max_partition_size)
- fatal_error (input_location, "min partition size cannot be greater than max partition size");
+ fatal_error (input_location, "min partition size cannot be greater "
+ "than max partition size");
partition_size = total_size / n_lto_partitions;
if (partition_size < PARAM_VALUE (MIN_PARTITION_SIZE))
@@ -521,7 +542,7 @@
npartitions = 1;
partition = new_partition ("");
if (symtab->dump_file)
- fprintf (symtab->dump_file, "Total unit size: %i, partition size: %i\n",
+ fprintf (symtab->dump_file, "Total unit size: %" PRId64 ", partition size: %" PRId64 "\n",
total_size, partition_size);
auto_vec<symtab_node *> next_nodes;
@@ -540,17 +561,11 @@
next_nodes.safe_push (varpool_order[varpool_pos++]);
while (noreorder_pos < (int)noreorder.length ()
&& noreorder[noreorder_pos]->order < current_order)
- {
- if (!noreorder[noreorder_pos]->alias)
- total_size -= ipa_fn_summaries->get (noreorder[noreorder_pos])->size;
- next_nodes.safe_push (noreorder[noreorder_pos++]);
- }
+ next_nodes.safe_push (noreorder[noreorder_pos++]);
add_sorted_nodes (next_nodes, partition);
if (!symbol_partitioned_p (order[i]))
add_symbol_to_partition (partition, order[i]);
- if (!order[i]->alias)
- total_size -= ipa_fn_summaries->get (order[i])->size;
/* Once we added a new node to the partition, we also want to add
@@ -567,7 +582,6 @@
it and thus we need to subtract it from COST. */
while (last_visited_node < lto_symtab_encoder_size (partition->encoder))
{
- symtab_node *refs_node;
int j;
struct ipa_ref *ref = NULL;
symtab_node *snode = lto_symtab_encoder_deref (partition->encoder,
@@ -577,7 +591,6 @@
{
struct cgraph_edge *edge;
- refs_node = node;
last_visited_node++;
@@ -585,7 +598,9 @@
/* Compute boundary cost of callgraph edges. */
for (edge = node->callees; edge; edge = edge->next_callee)
- if (edge->callee->definition)
+ /* Inline edges will always end up local. */
+ if (edge->inline_failed
+ && account_reference_p (node, edge->callee))
{
int edge_cost = edge->frequency ();
int index;
@@ -602,6 +617,8 @@
cost += edge_cost;
}
for (edge = node->callers; edge; edge = edge->next_caller)
+ if (edge->inline_failed
+ && account_reference_p (edge->caller, node))
{
int edge_cost = edge->frequency ();
int index;
@@ -614,27 +631,24 @@
edge->caller);
if (index != LCC_NOT_FOUND
&& index < last_visited_node - 1)
- cost -= edge_cost;
+ cost -= edge_cost, internal += edge_cost;
else
cost += edge_cost;
}
}
else
- {
- refs_node = snode;
- last_visited_node++;
- }
+ last_visited_node++;
/* Compute boundary cost of IPA REF edges and at the same time look into
variables referenced from current partition and try to add them. */
- for (j = 0; refs_node->iterate_reference (j, ref); j++)
- if (is_a <varpool_node *> (ref->referred))
+ for (j = 0; snode->iterate_reference (j, ref); j++)
+ if (!account_reference_p (snode, ref->referred))
+ ;
+ else if (is_a <varpool_node *> (ref->referred))
{
int index;
vnode = dyn_cast <varpool_node *> (ref->referred);
- if (!vnode->definition)
- continue;
if (!symbol_partitioned_p (vnode)
&& !vnode->no_reorder
&& vnode->get_partitioning_class () == SYMBOL_PARTITION)
@@ -652,8 +666,6 @@
int index;
node = dyn_cast <cgraph_node *> (ref->referred);
- if (!node->definition)
- continue;
index = lto_symtab_encoder_lookup (partition->encoder,
node);
if (index != LCC_NOT_FOUND
@@ -662,8 +674,10 @@
else
cost++;
}
- for (j = 0; refs_node->iterate_referring (j, ref); j++)
- if (is_a <varpool_node *> (ref->referring))
+ for (j = 0; snode->iterate_referring (j, ref); j++)
+ if (!account_reference_p (ref->referring, snode))
+ ;
+ else if (is_a <varpool_node *> (ref->referring))
{
int index;
@@ -682,7 +696,7 @@
vnode);
if (index != LCC_NOT_FOUND
&& index < last_visited_node - 1)
- cost--;
+ cost--, internal++;
else
cost++;
}
@@ -696,36 +710,41 @@
node);
if (index != LCC_NOT_FOUND
&& index < last_visited_node - 1)
- cost--;
+ cost--, internal++;
else
cost++;
}
}
- /* If the partition is large enough, start looking for smallest boundary cost. */
- if (partition->insns < partition_size * 3 / 4
- || best_cost == INT_MAX
- || ((!cost
- || (best_internal * (HOST_WIDE_INT) cost
- > (internal * (HOST_WIDE_INT)best_cost)))
- && partition->insns < partition_size * 5 / 4))
+ gcc_assert (cost >= 0 && internal >= 0);
+
+ /* If the partition is large enough, start looking for smallest boundary cost.
+ If partition still seems too small (less than 7/8 of target weight) accept
+ any cost. If partition has right size, optimize for highest internal/cost.
+ Later we stop building partition if its size is 9/8 of the target wight. */
+ if (partition->insns < partition_size * 7 / 8
+ || best_cost == -1
+ || (!cost
+ || ((sreal)best_internal * (sreal) cost
+ < ((sreal) internal * (sreal)best_cost))))
{
best_cost = cost;
best_internal = internal;
+ best_size = partition->insns;
best_i = i;
best_n_nodes = lto_symtab_encoder_size (partition->encoder);
- best_total_size = total_size;
best_varpool_pos = varpool_pos;
}
if (symtab->dump_file)
- fprintf (symtab->dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
- "best %i/%i, step %i\n", i,
+ fprintf (symtab->dump_file, "Step %i: added %s/%i, size %i, "
+ "cost %" PRId64 "/%" PRId64 " "
+ "best %" PRId64 "/%" PRId64", step %i\n", i,
order[i]->name (), order[i]->order,
partition->insns, cost, internal,
best_cost, best_internal, best_i);
/* Partition is too large, unwind into step when best cost was reached and
start new partition. */
- if (partition->insns > 2 * partition_size
+ if (partition->insns > 9 * partition_size / 8
|| partition->insns > max_partition_size)
{
if (best_i != i)
@@ -736,21 +755,26 @@
undo_partition (partition, best_n_nodes);
varpool_pos = best_varpool_pos;
}
+ gcc_assert (best_size == partition->insns);
i = best_i;
+ if (symtab->dump_file)
+ fprintf (symtab->dump_file,
+ "Partition insns: %i (want %" PRId64 ")\n",
+ partition->insns, partition_size);
/* When we are finished, avoid creating empty partition. */
while (i < n_nodes - 1 && symbol_partitioned_p (order[i + 1]))
i++;
if (i == n_nodes - 1)
break;
+ total_size -= partition->insns;
partition = new_partition ("");
last_visited_node = 0;
- total_size = best_total_size;
cost = 0;
if (symtab->dump_file)
fprintf (symtab->dump_file, "New partition\n");
best_n_nodes = 0;
- best_cost = INT_MAX;
+ best_cost = -1;
/* Since the size of partitions is just approximate, update the size after
we finished current one. */
@@ -760,6 +784,10 @@
/* Watch for overflow. */
partition_size = INT_MAX / 16;
+ if (symtab->dump_file)
+ fprintf (symtab->dump_file,
+ "Total size: %" PRId64 " partition_size: %" PRId64 "\n",
+ total_size, partition_size);
if (partition_size < PARAM_VALUE (MIN_PARTITION_SIZE))
partition_size = PARAM_VALUE (MIN_PARTITION_SIZE);
npartitions ++;
@@ -779,6 +807,9 @@
next_nodes.safe_push (varpool_order[varpool_pos++]);
while (noreorder_pos < (int)noreorder.length ())
next_nodes.safe_push (noreorder[noreorder_pos++]);
+ /* For one partition the cost of boundary should be 0 unless we added final
+ symbols here (these are not accounted) or we have accounting bug. */
+ gcc_assert (next_nodes.length () || i != 1 || !best_cost);
add_sorted_nodes (next_nodes, partition);
free (order);