[PATCH] Time profiler - phase 2

Jan Hubicka hubicka@ucw.cz
Sat Nov 16 12:59:00 GMT 2013


> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index c566a85..1562098 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,15 @@
> +2013-11-13	Martin Liska	<marxin.liska@gmail.com>
> +						Jan Hubicka  <jh@suse.cz>
> +
> +	* cgraphunit.c (node_cmp): New function.
> +	(expand_all_functions): Function ordering added.
> +	* common.opt: New profile based function reordering flag introduced.
> +	* coverage.c (get_coverage_counts): Wrong profile handled.
> +	* ipa.c (cgraph_externally_visible_p): New late flag introduced.
> +	* lto-partition.c: Support for time profile added.
> +	* lto.c: Likewise.
> +	* value-prof.c: Histogram instrumentation switch added.
> +
>  2013-11-13  Vladimir Makarov  <vmakarov@redhat.com>
>  
>  	PR rtl-optimization/59036
> diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
> index 4765e6a..7cdd9a4 100644
> --- a/gcc/cgraphunit.c
> +++ b/gcc/cgraphunit.c
> @@ -1821,6 +1821,17 @@ expand_function (struct cgraph_node *node)
>    ipa_remove_all_references (&node->ref_list);
>  }
>  
> +/* Node comparer that is responsible for the order that corresponds
> +   to time when a function was launched for the first time.  */
> +
> +static int
> +node_cmp (const void *pa, const void *pb)
> +{
> +  const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
> +  const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
> +
> +  return b->tp_first_run - a->tp_first_run;

Please stabilize this by using node->order when tp_first_run is equivalent.
Later we ought to use better heuristic here, but order may be good enough to
start with.
> diff --git a/gcc/ipa.c b/gcc/ipa.c
> index a11b1c7..d92a332 100644
> --- a/gcc/ipa.c
> +++ b/gcc/ipa.c
> @@ -761,10 +761,14 @@ cgraph_externally_visible_p (struct cgraph_node *node,
>       This improves code quality and we know we will duplicate them at most twice
>       (in the case that we are not using plugin and link with object file
>        implementing same COMDAT)  */
> -  if ((in_lto_p || whole_program)
> -      && DECL_COMDAT (node->decl)
> -      && comdat_can_be_unshared_p (node))
> -    return false;
> +  if ((in_lto_p || whole_program || profile_arc_flag)
> +     && DECL_COMDAT (node->decl)
> +     && comdat_can_be_unshared_p (node))
> +    {
> +      gcc_checking_assert (cgraph_function_body_availability (node)
> +			   > AVAIL_OVERWRITABLE);
> +      return false;
> +    }
>  
>    /* When doing link time optimizations, hidden symbols become local.  */
>    if (in_lto_p
> @@ -932,7 +936,7 @@ function_and_variable_visibility (bool whole_program)
>  	}
>        gcc_assert ((!DECL_WEAK (node->decl)
>  		  && !DECL_COMDAT (node->decl))
> -      	          || TREE_PUBLIC (node->decl)
> +		  || TREE_PUBLIC (node->decl)
>  		  || node->weakref
>  		  || DECL_EXTERNAL (node->decl));
>        if (cgraph_externally_visible_p (node, whole_program))
> @@ -949,7 +953,7 @@ function_and_variable_visibility (bool whole_program)
>  	  && node->definition && !node->weakref
>  	  && !DECL_EXTERNAL (node->decl))
>  	{
> -	  gcc_assert (whole_program || in_lto_p
> +	  gcc_assert (whole_program || in_lto_p || profile_arc_flag
>  		      || !TREE_PUBLIC (node->decl));
>  	  node->unique_name = ((node->resolution == LDPR_PREVAILING_DEF_IRONLY
>  				      || node->resolution == LDPR_PREVAILING_DEF_IRONLY_EXP)

These changes are unrelated, please remove them.
> @@ -395,6 +397,20 @@ node_cmp (const void *pa, const void *pb)
>  {
>    const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
>    const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
> +
> +  /* Profile reorder flag enables function reordering based on first execution
> +     of a function. All functions with profile are placed in ascending
> +     order at the beginning.  */
> +
> +  if (flag_profile_reorder_functions)
				       && a->tp_first_run != b->tp_first_run
> +  {
> +    if (a->tp_first_run && b->tp_first_run)
> +      return a->tp_first_run - b->tp_first_run;
> +
> +    if (a->tp_first_run || b->tp_first_run)
> +      return b->tp_first_run - a->tp_first_run;

Drop a comment explaining the logic here ;)
> @@ -449,7 +465,7 @@ void
>  lto_balanced_map (void)
>  {
>    int n_nodes = 0;
> -  int n_varpool_nodes = 0, varpool_pos = 0, best_varpool_pos = 0;
> +  int n_varpool_nodes = 0, varpool_pos = 0;
>    struct cgraph_node **order = XNEWVEC (struct cgraph_node *, cgraph_max_uid);
>    struct varpool_node **varpool_order = NULL;
>    int i;
> @@ -481,10 +497,13 @@ lto_balanced_map (void)
>       get better about minimizing the function bounday, but until that
>       things works smoother if we order in source order.  */
>    qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
> +
> +  if (cgraph_dump_file)
> +    for(i = 0; i < n_nodes; i++)
> +      fprintf (cgraph_dump_file, "Balanced map symbol order:%s:%u\n", cgraph_node_asm_name (order[i]), order[i]->tp_first_run);
> +
>    if (!flag_toplevel_reorder)
>      {
> -      qsort (order, n_nodes, sizeof (struct cgraph_node *), node_cmp);
> -
>        FOR_EACH_VARIABLE (vnode)
>  	if (get_symbol_class (vnode) == SYMBOL_PARTITION)
>  	  n_varpool_nodes++;

Good catch (the redundant qsort)

> @@ -683,7 +702,6 @@ lto_balanced_map (void)
>  	  best_i = i;
>  	  best_n_nodes = lto_symtab_encoder_size (partition->encoder);
>  	  best_total_size = total_size;
> -	  best_varpool_pos = varpool_pos;
>  	}
>        if (cgraph_dump_file)
>  	fprintf (cgraph_dump_file, "Step %i: added %s/%i, size %i, cost %i/%i "
> @@ -701,7 +719,6 @@ lto_balanced_map (void)
>  		fprintf (cgraph_dump_file, "Unwinding %i insertions to step %i\n",
>  			 i - best_i, best_i);
>  	      undo_partition (partition, best_n_nodes);
> -	      varpool_pos = best_varpool_pos;
>  	    }
>  	  i = best_i;
>   	  /* When we are finished, avoid creating empty partition.  */

This actually reverts a bugfix, so please remove the two changes above.
> diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
> index 62856d0..2401c76 100644
> --- a/gcc/lto/lto.c
> +++ b/gcc/lto/lto.c
> @@ -2453,9 +2453,12 @@ lto_wpa_write_files (void)
>    /* Sort partitions by size so small ones are compiled last.
>       FIXME: Even when not reordering we may want to output one list for parallel make
>       and other for final link command.  */
> -  ltrans_partitions.qsort (flag_toplevel_reorder
> +
> +  if (!flag_profile_reorder_functions || !flag_profile_use)
> +    ltrans_partitions.qsort (flag_toplevel_reorder
>  			   ? cmp_partitions_size
>  			   : cmp_partitions_order);
> +

Add a FIXME explaining that we ought to produce two list - one for Make parallelizm and one for final link time.
lets work on this incrementally.
> diff --git a/gcc/varasm.c b/gcc/varasm.c
> index 2226912..758367f 100644
> --- a/gcc/varasm.c
> +++ b/gcc/varasm.c
> @@ -548,7 +548,14 @@ default_function_section (tree decl, enum node_frequency freq,
>       unlikely executed (this happens especially with function splitting
>       where we can split away unnecessary parts of static constructors.  */
>    if (startup && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
> -    return get_named_text_section (decl, ".text.startup", NULL);
> +    {
> +      /* If we do have a profile or(and) LTO phase is executed, we do not need
> +         these ELF section.  */
> +      if (!in_lto_p || !flag_profile_values)
> +        return get_named_text_section (decl, ".text.startup", NULL);
> +      else
> +        return NULL;
> +    }
>  
>    /* Similarly for exit.  */
>    if (exit && freq != NODE_FREQUENCY_UNLIKELY_EXECUTED)
> @@ -560,7 +567,10 @@ default_function_section (tree decl, enum node_frequency freq,
>        case NODE_FREQUENCY_UNLIKELY_EXECUTED:
>  	return get_named_text_section (decl, ".text.unlikely", NULL);
>        case NODE_FREQUENCY_HOT:
> -	return get_named_text_section (decl, ".text.hot", NULL);
> +        /* If we do have a profile or(and) LTO phase is executed, we do not need
> +           these ELF section.  */
> +        if (!in_lto_p || !flag_profile_values)
> +          return get_named_text_section (decl, ".text.hot", NULL);
>        default:
>  	return NULL;
>      }
Lets handle this in separate patch.

The patch is missing documentation in doc/invoke.texi.  You need to document
the new command line option and update documentation of -fprofile-use.  Please
add it and send an update patch.

Also Teresa recently sent patch for resetting profiles of functions where we clearly missed the profile.
Please add there a code filling in tp_first_run from largest first_run of the callers with non-0 count.

Thanks,
Honza



More information about the Gcc-patches mailing list