This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Improve x86-64 prologues and epilogues


Hi,
The attached patch changes the x86-64 prologues so that registers can be saved
into the red zone.  Also, the stack frame is now allocated after saving the
registers, which allows the saving to begin one cycle earlier.

Honza

Tue Mar 18 19:06:18 CET 2003  Jan Hubicka  <jh at suse dot cz>
	* i386.c (use_fast_prologue_epilogue): Remove.
	(machine_function): New fields use_fast_prologue_epilogue;
	use_fast_prologue_epilogue_initialized.
	(ix86_frame): New field save_regs_using_mov.
	(ix86_compute_frame_layout):  Decide on fast prologues;
	allocate saved registers in red zone.
	(ix86_expand_epilogue, ix86_expand_prologue): Obey new parameters.
Index: i386.c
===================================================================
RCS file: /cvsroot/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.490.2.31
diff -c -3 -p -r1.490.2.31 i386.c
*** i386.c	10 Mar 2003 14:44:43 -0000	1.490.2.31
--- i386.c	17 Mar 2003 15:36:29 -0000
*************** const int x86_inter_unit_moves = ~(m_ATH
*** 518,527 ****
     epilogue code.  */
  #define FAST_PROLOGUE_INSN_COUNT 20
  
- /* Set by prologue expander and used by epilogue expander to determine
-    the style used.  */
- static int use_fast_prologue_epilogue;
- 
  /* Names for 8 (low), 8 (high), and 16-bit registers, respectively.  */
  static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
  static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
--- 518,523 ----
*************** struct machine_function GTY(())
*** 675,680 ****
--- 671,682 ----
    const char *some_ld_name;
    int save_varrargs_registers;
    int accesses_prev_frame;
+   /* Set by ix86_compute_frame_layout and used by prologue/epilogue expander to
+      determine the style used.  */
+   bool use_fast_prologue_epilogue;
+   /* Do not change the decision in between register elimination and
+      prologue/epilogue expansion.  */
+   bool use_fast_prologue_epilogue_initialized;
  };
  
  #define ix86_stack_locals (cfun->machine->stack_locals)
*************** struct ix86_frame
*** 714,719 ****
--- 716,725 ----
    HOST_WIDE_INT frame_pointer_offset;
    HOST_WIDE_INT hard_frame_pointer_offset;
    HOST_WIDE_INT stack_pointer_offset;
+ 
+   /* When save_regs_using_mov is set, emit prologue using
+      move instead of push instructions.  */
+   bool save_regs_using_mov;
  };
  
  /* Used to enable/disable debugging features.  */
*************** ix86_compute_frame_layout (frame)
*** 4605,4610 ****
--- 4611,4649 ----
    frame->nregs = ix86_nsaved_regs ();
    total_size = size;
  
+   if (!optimize_size && !cfun->machine->use_fast_prologue_epilogue_initialized)
+     {
+       int count = frame->nregs;
+ 
+       cfun->machine->use_fast_prologue_epilogue_initialized = 1;
+ 
+       /* The fast prologue uses move instead of push to save registers.  This
+          is significantly longer, but also executes faster as modern hardware
+          can execute the moves in parallel, but can't do that for push/pop.
+ 	 
+ 	 Be careful about choosing what prologue to emit:  When function takes
+ 	 many instructions to execute we may use slow version as well as in
+ 	 case function is known to be outside hot spot (this is known with
+ 	 feedback only).  Weight the size of function by number of registers
+ 	 to save as it is cheap to use one or two push instructions but very
+ 	 slow to use many of them.  */
+       if (count)
+ 	count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
+       if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL
+ 	  || (flag_branch_probabilities
+ 	      && cfun->function_frequency < FUNCTION_FREQUENCY_HOT))
+         cfun->machine->use_fast_prologue_epilogue = false;
+       else
+         cfun->machine->use_fast_prologue_epilogue
+ 	   = !expensive_function_p (count);
+     }
+   if (TARGET_PROLOGUE_USING_MOVE
+       && cfun->machine->use_fast_prologue_epilogue)
+     frame->save_regs_using_mov = true;
+   else
+     frame->save_regs_using_mov = false;
+ 
+ 
    /* Skip return address and saved base pointer.  */
    offset = frame_pointer_needed ? UNITS_PER_WORD * 2 : UNITS_PER_WORD;
  
*************** ix86_compute_frame_layout (frame)
*** 4677,4686 ****
--- 4716,4730 ----
      (size + frame->padding1 + frame->padding2
       + frame->outgoing_arguments_size + frame->va_arg_size);
  
+   if (!frame->to_allocate && frame->nregs <= 1)
+     frame->save_regs_using_mov = false;
+ 
    if (TARGET_64BIT && TARGET_RED_ZONE && current_function_sp_is_unchanging
        && current_function_is_leaf)
      {
        frame->red_zone_size = frame->to_allocate;
+       if (frame->save_regs_using_mov)
+ 	frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
        if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
  	frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
      }
*************** ix86_expand_prologue ()
*** 4749,4783 ****
    rtx insn;
    bool pic_reg_used;
    struct ix86_frame frame;
-   int use_mov = 0;
    HOST_WIDE_INT allocate;
  
    ix86_compute_frame_layout (&frame);
-   if (!optimize_size)
-     {
-       int count = frame.nregs;
- 
-       /* The fast prologue uses move instead of push to save registers.  This
-          is significantly longer, but also executes faster as modern hardware
-          can execute the moves in parallel, but can't do that for push/pop.
- 	 
- 	 Be careful about choosing what prologue to emit:  When function takes
- 	 many instructions to execute we may use slow version as well as in
- 	 case function is known to be outside hot spot (this is known with
- 	 feedback only).  Weight the size of function by number of registers
- 	 to save as it is cheap to use one or two push instructions but very
- 	 slow to use many of them.  */
-       if (count)
- 	count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
-       if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL
- 	  || (flag_branch_probabilities
- 	      && cfun->function_frequency < FUNCTION_FREQUENCY_HOT))
- 	use_fast_prologue_epilogue = 0;
-       else
-         use_fast_prologue_epilogue = !expensive_function_p (count);
-       if (TARGET_PROLOGUE_USING_MOVE)
-         use_mov = use_fast_prologue_epilogue;
-     }
  
    /* Note: AT&T enter does NOT have reversed args.  Enter is probably
       slower on all targets.  Also sdb doesn't like it.  */
--- 4793,4801 ----
*************** ix86_expand_prologue ()
*** 4792,4807 ****
      }
  
    allocate = frame.to_allocate;
-   /* In case we are dealing only with single register and empty frame,
-      push is equivalent of the mov+add sequence.  */
-   if (allocate == 0 && frame.nregs <= 1)
-     use_mov = 0;
  
!   if (!use_mov)
      ix86_emit_save_regs ();
    else
      allocate += frame.nregs * UNITS_PER_WORD;
  
    if (allocate == 0)
      ;
    else if (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)
--- 4810,4828 ----
      }
  
    allocate = frame.to_allocate;
  
!   if (!frame.save_regs_using_mov)
      ix86_emit_save_regs ();
    else
      allocate += frame.nregs * UNITS_PER_WORD;
  
+   /* When using the red zone we may start saving registers before
+      allocating the stack frame, saving one cycle of the prologue.  */
+   if (TARGET_RED_ZONE && frame.save_regs_using_mov)
+     ix86_emit_save_regs_using_mov (frame_pointer_needed ? hard_frame_pointer_rtx
+ 				   : stack_pointer_rtx,
+ 				   -frame.nregs * UNITS_PER_WORD);
+ 
    if (allocate == 0)
      ;
    else if (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)
*************** ix86_expand_prologue ()
*** 4835,4841 ****
           call.  */
        emit_insn (gen_blockage (const0_rtx));
      }
!   if (use_mov)
      {
        if (!frame_pointer_needed || !frame.to_allocate)
          ix86_emit_save_regs_using_mov (stack_pointer_rtx, frame.to_allocate);
--- 4856,4862 ----
           call.  */
        emit_insn (gen_blockage (const0_rtx));
      }
!   if (frame.save_regs_using_mov && !TARGET_RED_ZONE)
      {
        if (!frame_pointer_needed || !frame.to_allocate)
          ix86_emit_save_regs_using_mov (stack_pointer_rtx, frame.to_allocate);
*************** ix86_expand_epilogue (style)
*** 4934,4944 ****
       tuning in future.  */
    if ((!sp_valid && frame.nregs <= 1)
        || (TARGET_EPILOGUE_USING_MOVE
! 	  && use_fast_prologue_epilogue
  	  && (frame.nregs > 1 || frame.to_allocate))
        || (frame_pointer_needed && !frame.nregs && frame.to_allocate)
        || (frame_pointer_needed && TARGET_USE_LEAVE
! 	  && use_fast_prologue_epilogue && frame.nregs == 1)
        || current_function_calls_eh_return)
      {
        /* Restore registers.  We can use ebp or esp to address the memory
--- 4955,4966 ----
       tuning in future.  */
    if ((!sp_valid && frame.nregs <= 1)
        || (TARGET_EPILOGUE_USING_MOVE
! 	  && cfun->machine->use_fast_prologue_epilogue
  	  && (frame.nregs > 1 || frame.to_allocate))
        || (frame_pointer_needed && !frame.nregs && frame.to_allocate)
        || (frame_pointer_needed && TARGET_USE_LEAVE
! 	  && cfun->machine->use_fast_prologue_epilogue
! 	  && frame.nregs == 1)
        || current_function_calls_eh_return)
      {
        /* Restore registers.  We can use ebp or esp to address the memory
*************** ix86_expand_epilogue (style)
*** 4985,4991 ****
  		    GEN_INT (frame.to_allocate
  			     + frame.nregs * UNITS_PER_WORD)));
        /* If not an i386, mov & pop is faster than "leave".  */
!       else if (TARGET_USE_LEAVE || optimize_size || !use_fast_prologue_epilogue)
  	emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ());
        else
  	{
--- 5007,5014 ----
  		    GEN_INT (frame.to_allocate
  			     + frame.nregs * UNITS_PER_WORD)));
        /* If not an i386, mov & pop is faster than "leave".  */
!       else if (TARGET_USE_LEAVE || optimize_size
! 	       || !cfun->machine->use_fast_prologue_epilogue)
  	emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ());
        else
  	{


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]