This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: RFC: LRA for x86/x86-64 [0/9]


Hi Vlad,

Thanks for the testing and the logs. You must have good hardware: your
timings are all ~3 times faster than mine :-)

On Sat, Sep 29, 2012 at 3:01 AM, Vladimir Makarov <vmakarov@redhat.com> wrote:
> ----------------------------------32-bit------------------------------------
> Reload:
> 581.85user 29.91system 27:15.18elapsed 37%CPU (0avgtext+0avgdata
> LRA:
> 629.67user 24.16system 24:31.08elapsed 44%CPU (0avgtext+0avgdata

This is a ~8% slowdown.


> ----------------------------------64-bit:-----------------------------------
> Reload:
> 503.26user 36.54system 30:16.62elapsed 29%CPU (0avgtext+0avgdata
> LRA:
> 598.70user 30.90system 27:26.92elapsed 38%CPU (0avgtext+0avgdata

This is a ~19% slowdown.


> Here is the numbers for PR54146 on the same machine with -O1 only for
> 64-bit (compiler reports error for -m32).

Right, the test case is 64-bit only; I think it's preprocessed
code for AMD64.

> Reload:
> 350.40user 21.59system 17:09.75elapsed 36%CPU (0avgtext+0avgdata
> LRA:
> 468.29user 21.35system 15:47.76elapsed 51%CPU (0avgtext+0avgdata

This is a ~34% slowdown.

To put it in another perspective, here are my timings of trunk vs lra
(both checkouts done today):

trunk:
 integrated RA           : 181.68 (24%) usr   1.68 (11%) sys 183.43 (24%) wall  643564 kB (20%) ggc
 reload                  :  11.00 ( 1%) usr   0.18 ( 1%) sys  11.17 ( 1%) wall   32394 kB ( 1%) ggc
 TOTAL                 : 741.64            14.76           756.41       3216164 kB

lra branch:
 integrated RA           : 174.65 (16%) usr   1.33 ( 8%) sys 176.33 (16%) wall  643560 kB (20%) ggc
 reload                  : 399.69 (36%) usr   2.48 (15%) sys 402.69 (36%) wall   41852 kB ( 1%) ggc
 TOTAL                 :1102.06            16.05          1120.83       3231738 kB

That's a 49% slowdown. The difference is completely accounted for by
the timing difference between reload and LRA.
(Timings were done on gcc17, which has an AMD Opteron(tm) Processor 8354
and 15GB of RAM, so swapping is not an issue.)

It looks like the reload timevar is used for LRA. Why not have
multiple timevars, one per phase of LRA? Something like the patch below
would be nice. This gives me the following timings:

 integrated RA           : 189.34 (16%) usr   1.84 (11%) sys 191.18 (16%) wall  643560 kB (20%) ggc
 LRA non-specific        :  59.82 ( 5%) usr   0.22 ( 1%) sys  60.12 ( 5%) wall   18202 kB ( 1%) ggc
 LRA virtuals eliminatenon:  56.79 ( 5%) usr   0.03 ( 0%) sys  56.80 ( 5%) wall   19223 kB ( 1%) ggc
 LRA reload inheritance  :   6.41 ( 1%) usr   0.01 ( 0%) sys   6.42 ( 1%) wall    1665 kB ( 0%) ggc
 LRA create live ranges  : 175.30 (15%) usr   2.14 (13%) sys 177.44 (15%) wall    2761 kB ( 0%) ggc
 LRA hard reg assignment : 130.85 (11%) usr   0.20 ( 1%) sys 131.17 (11%) wall       0 kB ( 0%) ggc
 LRA coalesce pseudo regs:   2.54 ( 0%) usr   0.00 ( 0%) sys   2.55 ( 0%) wall       0 kB ( 0%) ggc
 reload                  :   6.73 ( 1%) usr   0.20 ( 1%) sys   6.92 ( 1%) wall       0 kB ( 0%) ggc

so the LRA "slowness" (for lack of a better word) appears to be due to
scalability problems in all sub-passes.

The code size changes are impressive, but I think that this kind of
slowdown should be addressed before making LRA the default for any
target.

Ciao!
Steven




Index: lra-assigns.c
===================================================================
--- lra-assigns.c       (revision 191858)
+++ lra-assigns.c       (working copy)
@@ -1261,6 +1261,8 @@ lra_assign (void)
   bitmap_head insns_to_process;
   bool no_spills_p;

+  timevar_push (TV_LRA_ASSIGN);
+
   init_lives ();
   sorted_pseudos = (int *) xmalloc (sizeof (int) * max_reg_num ());
   sorted_reload_pseudos = (int *) xmalloc (sizeof (int) * max_reg_num ());
@@ -1312,5 +1314,6 @@ lra_assign (void)
   free (sorted_pseudos);
   free (sorted_reload_pseudos);
   finish_lives ();
+  timevar_pop (TV_LRA_ASSIGN);
   return no_spills_p;
 }
Index: lra.c
===================================================================
--- lra.c       (revision 191858)
+++ lra.c       (working copy)
@@ -2193,6 +2193,7 @@ lra (FILE *f)

   lra_dump_file = f;

+  timevar_push (TV_LRA);

   init_insn_recog_data ();

@@ -2271,6 +2272,7 @@ lra (FILE *f)
             to use a constant pool.  */
          lra_eliminate (false);
          lra_inheritance ();
+
          /* We need live ranges for lra_assign -- so build them.  */
          lra_create_live_ranges (true);
          live_p = true;
@@ -2343,6 +2345,8 @@ lra (FILE *f)
 #ifdef ENABLE_CHECKING
   check_rtl (true);
 #endif
+
+  timevar_pop (TV_LRA);
 }

 /* Called once per compiler to initialize LRA data once.  */
Index: lra-eliminations.c
===================================================================
--- lra-eliminations.c  (revision 191858)
+++ lra-eliminations.c  (working copy)
@@ -1297,6 +1297,8 @@ lra_eliminate (bool final_p)
   struct elim_table *ep;
   int regs_num = max_reg_num ();

+  timevar_push (TV_LRA_ELIMINATE);
+
   bitmap_initialize (&insns_with_changed_offsets, &reg_obstack);
   if (final_p)
     {
@@ -1317,7 +1319,7 @@ lra_eliminate (bool final_p)
     {
       update_reg_eliminate (&insns_with_changed_offsets);
       if (bitmap_empty_p (&insns_with_changed_offsets))
-       return;
+       goto lra_eliminate_done;
     }
   if (lra_dump_file != NULL)
     {
@@ -1349,4 +1351,7 @@ lra_eliminate (bool final_p)
          process_insn_for_elimination (insn, final_p);
       }
   bitmap_clear (&insns_with_changed_offsets);
+
+lra_eliminate_done:
+  timevar_pop (TV_LRA_ELIMINATE);
 }
Index: lra-lives.c
===================================================================
--- lra-lives.c (revision 191858)
+++ lra-lives.c (working copy)
@@ -962,6 +962,8 @@ lra_create_live_ranges (bool all_p)
   basic_block bb;
   int i, hard_regno, max_regno = max_reg_num ();

+  timevar_push (TV_LRA_CREATE_LIVE_RANGES);
+
   complete_info_p = all_p;
   if (lra_dump_file != NULL)
     fprintf (lra_dump_file,
@@ -1016,6 +1018,7 @@ lra_create_live_ranges (bool all_p)
   sparseset_free (pseudos_live_through_setjumps);
   sparseset_free (pseudos_live);
   compress_live_ranges ();
+  timevar_pop (TV_LRA_CREATE_LIVE_RANGES);
 }

 /* Finish all live ranges.  */
Index: timevar.def
===================================================================
--- timevar.def (revision 191858)
+++ timevar.def (working copy)
@@ -223,10 +223,16 @@ DEFTIMEVAR (TV_REGMOVE               , "
 DEFTIMEVAR (TV_MODE_SWITCH           , "mode switching")
 DEFTIMEVAR (TV_SMS                  , "sms modulo scheduling")
 DEFTIMEVAR (TV_SCHED                 , "scheduling")
-DEFTIMEVAR (TV_IRA                  , "integrated RA")
-DEFTIMEVAR (TV_RELOAD               , "reload")
+DEFTIMEVAR (TV_IRA                  , "integrated RA")
+DEFTIMEVAR (TV_LRA                  , "LRA non-specific")
+DEFTIMEVAR (TV_LRA_ELIMINATE        , "LRA virtuals eliminatenon")
+DEFTIMEVAR (TV_LRA_INHERITANCE      , "LRA reload inheritance")
+DEFTIMEVAR (TV_LRA_CREATE_LIVE_RANGES, "LRA create live ranges")
+DEFTIMEVAR (TV_LRA_ASSIGN           , "LRA hard reg assignment")
+DEFTIMEVAR (TV_LRA_COALESCE         , "LRA coalesce pseudo regs")
+DEFTIMEVAR (TV_RELOAD               , "reload")
 DEFTIMEVAR (TV_RELOAD_CSE_REGS       , "reload CSE regs")
-DEFTIMEVAR (TV_GCSE_AFTER_RELOAD      , "load CSE after reload")
+DEFTIMEVAR (TV_GCSE_AFTER_RELOAD     , "load CSE after reload")
 DEFTIMEVAR (TV_REE                  , "ree")
 DEFTIMEVAR (TV_THREAD_PROLOGUE_AND_EPILOGUE, "thread pro- & epilogue")
 DEFTIMEVAR (TV_IFCVT2               , "if-conversion 2")
Index: lra-coalesce.c
===================================================================
--- lra-coalesce.c      (revision 191858)
+++ lra-coalesce.c      (working copy)
@@ -221,6 +221,8 @@ lra_coalesce (void)
   bitmap_head involved_insns_bitmap, split_origin_bitmap;
   bitmap_iterator bi;

+  timevar_push (TV_LRA_COALESCE);
+
   if (lra_dump_file != NULL)
     fprintf (lra_dump_file,
             "\n********** Pseudos coalescing #%d: **********\n\n",
@@ -371,5 +373,6 @@ lra_coalesce (void)
   free (sorted_moves);
   free (next_coalesced_pseudo);
   free (first_coalesced_pseudo);
+  timevar_pop (TV_LRA_COALESCE);
   return coalesced_moves != 0;
 }
Index: lra-constraints.c
===================================================================
--- lra-constraints.c   (revision 191858)
+++ lra-constraints.c   (working copy)
@@ -4859,6 +4859,8 @@ lra_inheritance (void)
   basic_block bb, start_bb;
   edge e;

+  timevar_push (TV_LRA_INHERITANCE);
+
   lra_inheritance_iter++;
   if (lra_dump_file != NULL)
     fprintf (lra_dump_file, "\n********** Inheritance #%d: **********\n\n",
@@ -4907,6 +4909,8 @@ lra_inheritance (void)
   bitmap_clear (&live_regs);
   bitmap_clear (&check_only_regs);
   free (usage_insns);
+
+  timevar_pop (TV_LRA_INHERITANCE);
 }

 ^L


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]