This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

SSE regs prologues/epilogues


Hi,
this patch adds the SSE saving/restoring code.  It is a bit tricky, as the save
area needs to be 16-byte aligned and there is no push/pop instruction for SSE
registers, so we need to handle them separately from the general purpose registers
(now the general purpose regs can be saved/restored by push/pop while SSE regs use moves).

I will commit it tomorrow if there are no complaints and Kai is successful with the mingw rebuild.
I've tested it on x86_64-linux and i686-linux.

Honza

	* i386.h (CONDITIONAL_CALL_USAGE): SSE regs are not used for w64 ABI.
	* i386.c (struct ix86_frame): Add padding0 and nsseregs.
	(ix86_nsaved_regs): Count only general purpose regs.
	(ix86_nsaved_sseregs): New.
	(ix86_compute_frame_layout): Update nsseregs; set preferred alignment
	to 16 for w64; compute padding and size of sse reg save area.
	(ix86_emit_save_regs, ix86_emit_save_regs_using_mov): Save only general
	purpose regs.
	(ix86_emit_save_sse_regs_using_mov): New.
	(ix86_expand_prologue): Save SSE regs if needed.
	(ix86_emit_restore_regs_using_mov): Use only general purpose regs.
	(ix86_emit_restore_sse_regs_using_mov): New.
	(ix86_expand_epilogue): Restore SSE regs if needed.
Index: config/i386/i386.h
===================================================================
*** config/i386/i386.h	(revision 142998)
--- config/i386/i386.h	(working copy)
*************** do {									\
*** 968,975 ****
--- 968,979 ----
          && ((cfun && cfun->machine->call_abi == MS_ABI)			\
              || (!cfun && DEFAULT_ABI == MS_ABI)))			\
        {									\
+         int i;								\
          call_used_regs[4 /*RSI*/] = 0;                                  \
          call_used_regs[5 /*RDI*/] = 0;                                  \
+ 	for (i = 0; i < 8; i++)						\
+ 	  call_used_regs[45+i] = 0;					\
+ 	call_used_regs[27] = call_used_regs[28] = 0;			\
        }									\
    } while (0)
  
Index: config/i386/i386.c
===================================================================
*** config/i386/i386.c	(revision 142998)
--- config/i386/i386.c	(working copy)
*************** struct stack_local_entry GTY(())
*** 1655,1660 ****
--- 1655,1664 ----
  					      <- HARD_FRAME_POINTER
     [saved regs]
  
+    [padding0]
+ 
+    [saved SSE regs]
+ 
     [padding1]          \
  		        )
     [va_arg registers]  (
*************** struct stack_local_entry GTY(())
*** 1665,1670 ****
--- 1669,1676 ----
    */
  struct ix86_frame
  {
+   int padding0;
+   int nsseregs;
    int nregs;
    int padding1;
    int va_arg_size;
*************** ix86_save_reg (unsigned int regno, int m
*** 7414,7420 ****
  	  && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
  }
  
! /* Return number of registers to be saved on the stack.  */
  
  static int
  ix86_nsaved_regs (void)
--- 7423,7429 ----
  	  && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
  }
  
! /* Return number of saved general purpose registers.  */
  
  static int
  ix86_nsaved_regs (void)
*************** ix86_nsaved_regs (void)
*** 7422,7430 ****
    int nregs = 0;
    int regno;
  
!   for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
!     if (ix86_save_reg (regno, true))
!       nregs++;
    return nregs;
  }
  
--- 7431,7455 ----
    int nregs = 0;
    int regno;
  
!   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
!     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
!       nregs ++;
!   return nregs;
! }
! 
! /* Return number of saved SSE registers.  */
! 
! static int
! ix86_nsaved_sseregs (void)
! {
!   int nregs = 0;
!   int regno;
! 
!   if (ix86_cfun_abi () != MS_ABI)
!     return 0;
!   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
!     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
!       nregs ++;
    return nregs;
  }
  
*************** ix86_compute_frame_layout (struct ix86_f
*** 7484,7494 ****
--- 7509,7530 ----
    HOST_WIDE_INT size = get_frame_size ();
  
    frame->nregs = ix86_nsaved_regs ();
+   frame->nsseregs = ix86_nsaved_sseregs ();
    total_size = size;
  
    stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
    preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
  
+   /* MS ABI seems to require the stack alignment to always be 16 except in
+      function prologues.  */
+   if (ix86_cfun_abi () == MS_ABI && preferred_alignment < 16)
+     {
+       preferred_alignment = 16;
+       stack_alignment_needed = 16;
+       crtl->preferred_stack_boundary = 128;
+       crtl->stack_alignment_needed = 128;
+     }
+ 
    gcc_assert (!size || stack_alignment_needed);
    gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
    gcc_assert (preferred_alignment <= stack_alignment_needed);
*************** ix86_compute_frame_layout (struct ix86_f
*** 7543,7548 ****
--- 7579,7593 ----
    /* Register save area */
    offset += frame->nregs * UNITS_PER_WORD;
  
+   /* Align SSE reg save area.  */
+   if (frame->nsseregs)
+     frame->padding0 = ((offset + 16 - 1) & -16) - offset;
+   else
+     frame->padding0 = 0;
+   
+   /* SSE register save area.  */
+   offset += frame->padding0 + frame->nsseregs * 16;
+ 
    /* Va-arg area */
    frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
    offset += frame->va_arg_size;
*************** ix86_compute_frame_layout (struct ix86_f
*** 7612,7619 ****
    frame->stack_pointer_offset -= frame->red_zone_size;
  #if 0
    fprintf (stderr, "\n");
-   fprintf (stderr, "nregs: %ld\n", (long)frame->nregs);
    fprintf (stderr, "size: %ld\n", (long)size);
    fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed);
    fprintf (stderr, "padding1: %ld\n", (long)frame->padding1);
    fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size);
--- 7657,7666 ----
    frame->stack_pointer_offset -= frame->red_zone_size;
  #if 0
    fprintf (stderr, "\n");
    fprintf (stderr, "size: %ld\n", (long)size);
+   fprintf (stderr, "nregs: %ld\n", (long)frame->nregs);
+   fprintf (stderr, "nsseregs: %ld\n", (long)frame->nsseregs);
+   fprintf (stderr, "padding0: %ld\n", (long)frame->padding0);
    fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed);
    fprintf (stderr, "padding1: %ld\n", (long)frame->padding1);
    fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size);
*************** ix86_emit_save_regs (void)
*** 7638,7645 ****
    unsigned int regno;
    rtx insn;
  
!   for (regno = FIRST_PSEUDO_REGISTER; regno-- > 0; )
!     if (ix86_save_reg (regno, true))
        {
  	insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
  	RTX_FRAME_RELATED_P (insn) = 1;
--- 7685,7692 ----
    unsigned int regno;
    rtx insn;
  
!   for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
!     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
        {
  	insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
  	RTX_FRAME_RELATED_P (insn) = 1;
*************** ix86_emit_save_regs_using_mov (rtx point
*** 7655,7661 ****
    rtx insn;
  
    for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
!     if (ix86_save_reg (regno, true))
        {
  	insn = emit_move_insn (adjust_address (gen_rtx_MEM (Pmode, pointer),
  					       Pmode, offset),
--- 7702,7708 ----
    rtx insn;
  
    for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
!     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
        {
  	insn = emit_move_insn (adjust_address (gen_rtx_MEM (Pmode, pointer),
  					       Pmode, offset),
*************** ix86_emit_save_regs_using_mov (rtx point
*** 7665,7670 ****
--- 7712,7737 ----
        }
  }
  
+ /* Emit code to save SSE registers using MOV insns.  First register
+    is stored at POINTER + OFFSET.  */
+ static void
+ ix86_emit_save_sse_regs_using_mov (rtx pointer, HOST_WIDE_INT offset)
+ {
+   unsigned int regno;
+   rtx insn;
+   rtx mem;
+ 
+   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
+       {
+ 	mem = adjust_address (gen_rtx_MEM (TImode, pointer), TImode, offset);
+ 	set_mem_align (mem, 128);
+ 	insn = emit_move_insn (mem, gen_rtx_REG (TImode, regno));
+ 	RTX_FRAME_RELATED_P (insn) = 1;
+ 	offset += 16;
+       }
+ }
+ 
  /* Expand prologue or epilogue stack adjustment.
     The pattern exist to put a dependency on all ebp-based memory accesses.
     STYLE should be negative if instructions should be marked as frame related,
*************** ix86_expand_prologue (void)
*** 7969,7975 ****
        RTX_FRAME_RELATED_P (insn) = 1;
      }
  
!   allocate = frame.to_allocate;
  
    if (!frame.save_regs_using_mov)
      ix86_emit_save_regs ();
--- 8036,8042 ----
        RTX_FRAME_RELATED_P (insn) = 1;
      }
  
!   allocate = frame.to_allocate + frame.nsseregs * 16 + frame.padding0;
  
    if (!frame.save_regs_using_mov)
      ix86_emit_save_regs ();
*************** ix86_expand_prologue (void)
*** 8048,8058 ****
  	  || !frame.to_allocate
  	  || crtl->stack_realign_needed)
          ix86_emit_save_regs_using_mov (stack_pointer_rtx,
! 				       frame.to_allocate);
        else
          ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx,
  				       -frame.nregs * UNITS_PER_WORD);
      }
  
    pic_reg_used = false;
    if (pic_offset_table_rtx
--- 8115,8136 ----
  	  || !frame.to_allocate
  	  || crtl->stack_realign_needed)
          ix86_emit_save_regs_using_mov (stack_pointer_rtx,
! 				       frame.to_allocate
! 				       + frame.nsseregs * 16 + frame.padding0);
        else
          ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx,
  				       -frame.nregs * UNITS_PER_WORD);
      }
+   if (!frame_pointer_needed
+       || !frame.to_allocate
+       || crtl->stack_realign_needed)
+     ix86_emit_save_sse_regs_using_mov (stack_pointer_rtx,
+ 				       frame.to_allocate);
+   else
+     ix86_emit_save_sse_regs_using_mov (hard_frame_pointer_rtx,
+ 				       - frame.nregs * UNITS_PER_WORD
+ 				       - frame.nsseregs * 16
+ 				       - frame.padding0);
  
    pic_reg_used = false;
    if (pic_offset_table_rtx
*************** ix86_emit_restore_regs_using_mov (rtx po
*** 8124,8130 ****
    rtx base_address = gen_rtx_MEM (Pmode, pointer);
  
    for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
!     if (ix86_save_reg (regno, maybe_eh_return))
        {
  	/* Ensure that adjust_address won't be forced to produce pointer
  	   out of range allowed by x86-64 instruction set.  */
--- 8202,8208 ----
    rtx base_address = gen_rtx_MEM (Pmode, pointer);
  
    for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
!     if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
        {
  	/* Ensure that adjust_address won't be forced to produce pointer
  	   out of range allowed by x86-64 instruction set.  */
*************** ix86_emit_restore_regs_using_mov (rtx po
*** 8139,8149 ****
  	    offset = 0;
  	  }
  	emit_move_insn (gen_rtx_REG (Pmode, regno),
! 			adjust_address (base_address, Pmode, offset));
  	offset += UNITS_PER_WORD;
        }
  }
  
  /* Restore function stack, frame, and registers.  */
  
  void
--- 8217,8259 ----
  	    offset = 0;
  	  }
  	emit_move_insn (gen_rtx_REG (Pmode, regno),
! 	                adjust_address (base_address, Pmode, offset));
  	offset += UNITS_PER_WORD;
        }
  }
  
+ /* Emit code to restore saved SSE registers using MOV insns.  First register
+    is restored from POINTER + OFFSET.  */
+ static void
+ ix86_emit_restore_sse_regs_using_mov (rtx pointer, HOST_WIDE_INT offset,
+ 				      int maybe_eh_return)
+ {
+   int regno;
+   rtx base_address = gen_rtx_MEM (TImode, pointer);
+   rtx mem;
+ 
+   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+     if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
+       {
+ 	/* Ensure that adjust_address won't be forced to produce pointer
+ 	   out of range allowed by x86-64 instruction set.  */
+ 	if (TARGET_64BIT && offset != trunc_int_for_mode (offset, SImode))
+ 	  {
+ 	    rtx r11;
+ 
+ 	    r11 = gen_rtx_REG (DImode, R11_REG);
+ 	    emit_move_insn (r11, GEN_INT (offset));
+ 	    emit_insn (gen_adddi3 (r11, r11, pointer));
+ 	    base_address = gen_rtx_MEM (TImode, r11);
+ 	    offset = 0;
+ 	  }
+ 	mem = adjust_address (base_address, TImode, offset);
+ 	set_mem_align (mem, 128);
+ 	emit_move_insn (gen_rtx_REG (TImode, regno), mem);
+ 	offset += 16;
+       }
+ }
+ 
  /* Restore function stack, frame, and registers.  */
  
  void
*************** ix86_expand_epilogue (int style)
*** 8171,8176 ****
--- 8281,8287 ----
    if (crtl->calls_eh_return && style != 2)
      offset -= 2;
    offset *= -UNITS_PER_WORD;
+   offset -= frame.nsseregs * 16 + frame.padding0;
  
    /* If we're only restoring one register and sp is not valid then
       using a move instruction to restore the register since it's
*************** ix86_expand_epilogue (int style)
*** 8204,8214 ****
        if (!frame_pointer_needed
  	  || (sp_valid && !frame.to_allocate) 
  	  || stack_realign_fp)
! 	ix86_emit_restore_regs_using_mov (stack_pointer_rtx,
! 					  frame.to_allocate, style == 2);
        else
! 	ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx,
! 					  offset, style == 2);
  
        /* eh_return epilogues need %ecx added to the stack pointer.  */
        if (style == 2)
--- 8315,8337 ----
        if (!frame_pointer_needed
  	  || (sp_valid && !frame.to_allocate) 
  	  || stack_realign_fp)
! 	{
! 	  ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
! 					        frame.to_allocate, style == 2);
! 	  ix86_emit_restore_regs_using_mov (stack_pointer_rtx,
! 					    frame.to_allocate
! 					    + frame.nsseregs * 16
! 					    + frame.padding0, style == 2);
! 	}
        else
!         {
! 	  ix86_emit_restore_sse_regs_using_mov (hard_frame_pointer_rtx,
! 					        offset, style == 2);
! 	  ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx,
! 					    offset
! 					    + frame.nsseregs * 16
! 					    + frame.padding0, style == 2);
!         }
  
        /* eh_return epilogues need %ecx added to the stack pointer.  */
        if (style == 2)
*************** ix86_expand_epilogue (int style)
*** 8234,8247 ****
  	    {
  	      tmp = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
  	      tmp = plus_constant (tmp, (frame.to_allocate
!                                          + frame.nregs * UNITS_PER_WORD));
  	      emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp));
  	    }
  	}
        else if (!frame_pointer_needed)
  	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
  				   GEN_INT (frame.to_allocate
! 					    + frame.nregs * UNITS_PER_WORD),
  				   style);
        /* If not an i386, mov & pop is faster than "leave".  */
        else if (TARGET_USE_LEAVE || optimize_function_for_size_p (cfun)
--- 8357,8374 ----
  	    {
  	      tmp = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
  	      tmp = plus_constant (tmp, (frame.to_allocate
!                                          + frame.nregs * UNITS_PER_WORD
! 					 + frame.nsseregs * 16
! 					 + frame.padding0));
  	      emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp));
  	    }
  	}
        else if (!frame_pointer_needed)
  	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
  				   GEN_INT (frame.to_allocate
! 					    + frame.nregs * UNITS_PER_WORD
! 					    + frame.nsseregs * 16
! 					    + frame.padding0),
  				   style);
        /* If not an i386, mov & pop is faster than "leave".  */
        else if (TARGET_USE_LEAVE || optimize_function_for_size_p (cfun)
*************** ix86_expand_epilogue (int style)
*** 8272,8284 ****
  	  pro_epilogue_adjust_stack (stack_pointer_rtx,
  				     hard_frame_pointer_rtx,
  				     GEN_INT (offset), style);
  	}
-       else if (frame.to_allocate)
- 	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- 				   GEN_INT (frame.to_allocate), style);
  
        for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
! 	if (ix86_save_reg (regno, false))
  	  emit_insn ((*ix86_gen_pop1) (gen_rtx_REG (Pmode, regno)));
        if (frame_pointer_needed)
  	{
--- 8399,8422 ----
  	  pro_epilogue_adjust_stack (stack_pointer_rtx,
  				     hard_frame_pointer_rtx,
  				     GEN_INT (offset), style);
+           ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
+ 					        frame.to_allocate, style == 2);
+ 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ 				     GEN_INT (frame.nsseregs * 16), style);
+ 	}
+       else if (frame.to_allocate || frame.nsseregs)
+ 	{
+           ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
+ 					        frame.to_allocate,
+ 						style == 2);
+ 	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ 				     GEN_INT (frame.to_allocate
+ 				     	      + frame.nsseregs * 16
+ 					      + frame.padding0), style);
  	}
  
        for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
! 	if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
  	  emit_insn ((*ix86_gen_pop1) (gen_rtx_REG (Pmode, regno)));
        if (frame_pointer_needed)
  	{


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]