This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

More realistic alignment decisions


Hi,
Kenny run into problem where internal loop wasn't aligned as expected.
The current algorithm for alignment is based on profile and it is a bit dificult
to analyze what is going on, since debug info is not dumped and algorithm
can be misguided by mismatched profile.

This patch implements dumping facility that also builds loop structure so
one can see if profile is still matching a reality (sort of) and while
looking at the dumps I noticed problems that blocks get aligned only if they
are executed not less than 10 times less often than most hot block in function
and loop was detected as blocks reached twice as often via loopback then via
fallthru.

This was done so since initially we was predicting very flat profiles with loop
containing more than one exit predicted to exectue twice or three times.  We
now do more realistic estimate so this can be made more sharp - I changed first
constant to 100 and second to 4.  Benchmarking it on Itanium and x86-64 there
is no difference on Itanium and about 10 point gain on x86-64 SPEC that is
pretty consistent.

Patch also turns those two constants into parameters.

OK?

Honza
	* invoke.texi (align-threshold, align-loop-iterations): Document.
	* final.c: Include cfgloop.h, params.h
	(compute_alignments): Dump decisions and compare them with loop
	structure; honor given parameters.
	(pass_compute_alignments): New dump file.
	* params.def (PARAM_ALIGN_THRESHOLD, PARAM_ALIGN_LOOP_ITERATIONS): New.
Index: doc/invoke.texi
===================================================================
*** doc/invoke.texi	(revision 129057)
--- doc/invoke.texi	(working copy)
*************** with unknown.  We predict the known numb
*** 6873,6878 ****
--- 6873,6888 ----
  the unknown number of iterations average to roughly 10.  This means that the
  loop without bounds would appear artificially cold relative to the other one.
  
+ @item align-threshold
+ 
+ Select fraction of the maximal frequency of executions of basic block in
+ function given basic block will get aligned.
+ 
+ @item align-loop-iterations
+ 
+ A loop expected to iterate at lest the selected number of iterations will get
+ aligned.
+ 
  @item tracer-dynamic-coverage
  @itemx tracer-dynamic-coverage-feedback
  
Index: final.c
===================================================================
*** final.c	(revision 129057)
--- final.c	(working copy)
*************** along with GCC; see the file COPYING3.  
*** 78,83 ****
--- 78,85 ----
  #include "df.h"
  #include "vecprim.h"
  #include "ggc.h"
+ #include "cfgloop.h"
+ #include "params.h"
  
  #ifdef XCOFF_DEBUGGING_INFO
  #include "xcoffout.h"		/* Needed for external data
*************** compute_alignments (void)
*** 673,678 ****
--- 675,682 ----
  {
    int log, max_skip, max_log;
    basic_block bb;
+   int freq_max = 0;
+   int freq_threshold = 0;
  
    if (label_align)
      {
*************** compute_alignments (void)
*** 688,693 ****
--- 692,710 ----
    if (! optimize || optimize_size)
      return 0;
  
+   if (dump_file)
+     {
+       dump_flow_info (dump_file, TDF_DETAILS);
+       flow_loops_dump (dump_file, NULL, 1);
+       loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+     }
+   FOR_EACH_BB (bb)
+     if (bb->frequency > freq_max)
+       freq_max = bb->frequency;
+   freq_threshold = freq_max / PARAM_VALUE (PARAM_ALIGN_THRESHOLD);
+ 
+   if (dump_file)
+     fprintf(dump_file, "freq_max: %i\n",freq_max);
    FOR_EACH_BB (bb)
      {
        rtx label = BB_HEAD (bb);
*************** compute_alignments (void)
*** 697,703 ****
  
        if (!LABEL_P (label)
  	  || probably_never_executed_bb_p (bb))
! 	continue;
        max_log = LABEL_ALIGN (label);
        max_skip = LABEL_ALIGN_MAX_SKIP;
  
--- 714,725 ----
  
        if (!LABEL_P (label)
  	  || probably_never_executed_bb_p (bb))
! 	{
! 	  if (dump_file)
! 	    fprintf(dump_file, "BB %4i freq %4i loop %2i loop_depth %2i skipped.\n",
! 		    bb->index, bb->frequency, bb->loop_father->num, bb->loop_depth);
! 	  continue;
! 	}
        max_log = LABEL_ALIGN (label);
        max_skip = LABEL_ALIGN_MAX_SKIP;
  
*************** compute_alignments (void)
*** 708,713 ****
--- 730,747 ----
  	  else
  	    branch_frequency += EDGE_FREQUENCY (e);
  	}
+       if (dump_file)
+ 	{
+ 	  fprintf(dump_file, "BB %4i freq %4i loop %2i loop_depth %2i fall %4i branch %4i",
+ 		  bb->index, bb->frequency, bb->loop_father->num,
+ 		  bb->loop_depth,
+ 		  fallthru_frequency, branch_frequency);
+ 	  if (!bb->loop_father->inner && bb->loop_father->num)
+ 	    fprintf (dump_file, " inner_loop");
+ 	  if (bb->loop_father->header == bb)
+ 	    fprintf (dump_file, " loop_header");
+ 	  fprintf (dump_file, "\n");
+ 	}
  
        /* There are two purposes to align block with no fallthru incoming edge:
  	 1) to avoid fetch stalls when branch destination is near cache boundary
*************** compute_alignments (void)
*** 720,731 ****
  	 when function is called.  */
  
        if (!has_fallthru
! 	  && (branch_frequency > BB_FREQ_MAX / 10
  	      || (bb->frequency > bb->prev_bb->frequency * 10
  		  && (bb->prev_bb->frequency
  		      <= ENTRY_BLOCK_PTR->frequency / 2))))
  	{
  	  log = JUMP_ALIGN (label);
  	  if (max_log < log)
  	    {
  	      max_log = log;
--- 754,767 ----
  	 when function is called.  */
  
        if (!has_fallthru
! 	  && (branch_frequency > freq_threshold
  	      || (bb->frequency > bb->prev_bb->frequency * 10
  		  && (bb->prev_bb->frequency
  		      <= ENTRY_BLOCK_PTR->frequency / 2))))
  	{
  	  log = JUMP_ALIGN (label);
+ 	  if (dump_file)
+ 	    fprintf(dump_file, "  jump alignment added.\n");
  	  if (max_log < log)
  	    {
  	      max_log = log;
*************** compute_alignments (void)
*** 736,745 ****
  	 align it.  It is most likely a first block of loop.  */
        if (has_fallthru
  	  && maybe_hot_bb_p (bb)
! 	  && branch_frequency + fallthru_frequency > BB_FREQ_MAX / 10
! 	  && branch_frequency > fallthru_frequency * 2)
  	{
  	  log = LOOP_ALIGN (label);
  	  if (max_log < log)
  	    {
  	      max_log = log;
--- 772,784 ----
  	 align it.  It is most likely a first block of loop.  */
        if (has_fallthru
  	  && maybe_hot_bb_p (bb)
! 	  && branch_frequency + fallthru_frequency > freq_threshold
! 	  && (branch_frequency
! 	      > fallthru_frequency * PARAM_VALUE (PARAM_ALIGN_LOOP_ITERATIONS)))
  	{
  	  log = LOOP_ALIGN (label);
+ 	  if (dump_file)
+ 	    fprintf(dump_file, "  internal loop alignment added.\n");
  	  if (max_log < log)
  	    {
  	      max_log = log;
*************** compute_alignments (void)
*** 749,760 ****
        LABEL_TO_ALIGNMENT (label) = max_log;
        LABEL_TO_MAX_SKIP (label) = max_skip;
      }
    return 0;
  }
  
  struct tree_opt_pass pass_compute_alignments =
  {
!   NULL,                                 /* name */
    NULL,                                 /* gate */
    compute_alignments,                   /* execute */
    NULL,                                 /* sub */
--- 788,802 ----
        LABEL_TO_ALIGNMENT (label) = max_log;
        LABEL_TO_MAX_SKIP (label) = max_skip;
      }
+ 
+   if (dump_file)
+     loop_optimizer_finalize ();
    return 0;
  }
  
  struct tree_opt_pass pass_compute_alignments =
  {
!   "alignments",                         /* name */
    NULL,                                 /* gate */
    compute_alignments,                   /* execute */
    NULL,                                 /* sub */
*************** struct tree_opt_pass pass_compute_alignm
*** 765,771 ****
    0,                                    /* properties_provided */
    0,                                    /* properties_destroyed */
    0,                                    /* todo_flags_start */
!   0,                                    /* todo_flags_finish */
    0                                     /* letter */
  };
  
--- 807,814 ----
    0,                                    /* properties_provided */
    0,                                    /* properties_destroyed */
    0,                                    /* todo_flags_start */
!   TODO_dump_func | TODO_verify_rtl_sharing
!   | TODO_ggc_collect,                   /* todo_flags_finish */
    0                                     /* letter */
  };
  
Index: params.def
===================================================================
*** params.def	(revision 129057)
--- params.def	(working copy)
*************** DEFPARAM(HOT_BB_FREQUENCY_FRACTION,
*** 334,339 ****
--- 334,349 ----
  	 "Select fraction of the maximal frequency of executions of basic block in function given basic block needs to have to be considered hot",
  	 1000, 0, 0)
  
+ DEFPARAM (PARAM_ALIGN_THRESHOLD,
+ 	  "align-threshold",
+ 	  "Select fraction of the maximal frequency of executions of basic block in function given basic block get alignment",
+ 	  100, 0, 0)
+ 
+ DEFPARAM (PARAM_ALIGN_LOOP_ITERATIONS,
+ 	  "align-loop-iterations",
+ 	  "Loops iterating at least selected number of iterations will get loop alignement.",
+ 	  4, 0, 0)
+ 
  /* For guessed profiles, the loops having unknown number of iterations
     are predicted to iterate relatively few (10) times at average.
     For functions containing one loop with large known number of iterations


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]