This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
SSE regs prologues/epilogues
- From: Jan Hubicka <hubicka at ucw dot cz>
- To: gcc-patches at gcc dot gnu dot org
- Date: Sun, 4 Jan 2009 15:21:37 +0100
- Subject: SSE regs prologues/epilogues
Hi,
this patch adds the SSE saving/restoring code. It is a bit tricky as the SSE
save area needs to be 16-byte aligned and there is no push/pop instruction for
SSE registers, so we need to handle them separately from the general purpose
registers (now the general purpose regs can be saved/restored by push/pop while
SSE regs are saved/restored by moves).
I will commit it tomorrow if there are no complaints and Kai is successful with the mingw rebuild.
I've tested it on x86_64-linux and i686-linux.
Honza
* i386.h (CONDITIONAL_CALL_USAGE): SSE regs are not used for w64 ABI.
* i386.c (struct ix86_frame): Add padding0 and nsseregs.
(ix86_nsaved_regs): Count only general purpose regs.
(ix86_nsaved_sseregs): New.
(ix86_compute_frame_layout): Update nsseregs; set preferred alignment
to 16 for w64; compute padding and size of sse reg save area.
(ix86_emit_save_regs, ix86_emit_save_regs_using_mov): Save only general
purpose regs.
(ix86_emit_save_sse_regs_using_mov): New.
(ix86_expand_prologue): Save SSE regs if needed.
(ix86_emit_restore_regs_using_mov): Use only general purpose regs.
(ix86_emit_restore_sse_regs_using_mov): New.
(ix86_expand_epilogue): Restore SSE regs if needed.
Index: config/i386/i386.h
===================================================================
*** config/i386/i386.h (revision 142998)
--- config/i386/i386.h (working copy)
*************** do { \
*** 968,975 ****
--- 968,979 ----
&& ((cfun && cfun->machine->call_abi == MS_ABI) \
|| (!cfun && DEFAULT_ABI == MS_ABI))) \
{ \
+ int i; \
call_used_regs[4 /*RSI*/] = 0; \
call_used_regs[5 /*RDI*/] = 0; \
+ for (i = 0; i < 8; i++) \
+ call_used_regs[45+i] = 0; \
+ call_used_regs[27] = call_used_regs[28] = 0; \
} \
} while (0)
Index: config/i386/i386.c
===================================================================
*** config/i386/i386.c (revision 142998)
--- config/i386/i386.c (working copy)
*************** struct stack_local_entry GTY(())
*** 1655,1660 ****
--- 1655,1664 ----
<- HARD_FRAME_POINTER
[saved regs]
+ [padding0]
+
+ [saved SSE regs]
+
[padding1] \
)
[va_arg registers] (
*************** struct stack_local_entry GTY(())
*** 1665,1670 ****
--- 1669,1676 ----
*/
struct ix86_frame
{
+ int padding0;
+ int nsseregs;
int nregs;
int padding1;
int va_arg_size;
*************** ix86_save_reg (unsigned int regno, int m
*** 7414,7420 ****
&& (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
}
! /* Return number of registers to be saved on the stack. */
static int
ix86_nsaved_regs (void)
--- 7423,7429 ----
&& (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
}
! /* Return number of saved general purpose registers. */
static int
ix86_nsaved_regs (void)
*************** ix86_nsaved_regs (void)
*** 7422,7430 ****
int nregs = 0;
int regno;
! for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
! if (ix86_save_reg (regno, true))
! nregs++;
return nregs;
}
--- 7431,7455 ----
int nregs = 0;
int regno;
! for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
! if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
! nregs ++;
! return nregs;
! }
!
! /* Return number of saved SSE registers. */
!
! static int
! ix86_nsaved_sseregs (void)
! {
! int nregs = 0;
! int regno;
!
! if (ix86_cfun_abi () != MS_ABI)
! return 0;
! for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
! if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
! nregs ++;
return nregs;
}
*************** ix86_compute_frame_layout (struct ix86_f
*** 7484,7494 ****
--- 7509,7530 ----
HOST_WIDE_INT size = get_frame_size ();
frame->nregs = ix86_nsaved_regs ();
+ frame->nsseregs = ix86_nsaved_sseregs ();
total_size = size;
stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
+ /* MS ABI seems to require stack alignment to be always 16 except for function
+ prologues. */
+ if (ix86_cfun_abi () == MS_ABI && preferred_alignment < 16)
+ {
+ preferred_alignment = 16;
+ stack_alignment_needed = 16;
+ crtl->preferred_stack_boundary = 128;
+ crtl->stack_alignment_needed = 128;
+ }
+
gcc_assert (!size || stack_alignment_needed);
gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
gcc_assert (preferred_alignment <= stack_alignment_needed);
*************** ix86_compute_frame_layout (struct ix86_f
*** 7543,7548 ****
--- 7579,7593 ----
/* Register save area */
offset += frame->nregs * UNITS_PER_WORD;
+ /* Align SSE reg save area. */
+ if (frame->nsseregs)
+ frame->padding0 = ((offset + 16 - 1) & -16) - offset;
+ else
+ frame->padding0 = 0;
+
+ /* SSE register save area. */
+ offset += frame->padding0 + frame->nsseregs * 16;
+
/* Va-arg area */
frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
offset += frame->va_arg_size;
*************** ix86_compute_frame_layout (struct ix86_f
*** 7612,7619 ****
frame->stack_pointer_offset -= frame->red_zone_size;
#if 0
fprintf (stderr, "\n");
- fprintf (stderr, "nregs: %ld\n", (long)frame->nregs);
fprintf (stderr, "size: %ld\n", (long)size);
fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed);
fprintf (stderr, "padding1: %ld\n", (long)frame->padding1);
fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size);
--- 7657,7666 ----
frame->stack_pointer_offset -= frame->red_zone_size;
#if 0
fprintf (stderr, "\n");
fprintf (stderr, "size: %ld\n", (long)size);
+ fprintf (stderr, "nregs: %ld\n", (long)frame->nregs);
+ fprintf (stderr, "nsseregs: %ld\n", (long)frame->nsseregs);
+ fprintf (stderr, "padding0: %ld\n", (long)frame->padding0);
fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed);
fprintf (stderr, "padding1: %ld\n", (long)frame->padding1);
fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size);
*************** ix86_emit_save_regs (void)
*** 7638,7645 ****
unsigned int regno;
rtx insn;
! for (regno = FIRST_PSEUDO_REGISTER; regno-- > 0; )
! if (ix86_save_reg (regno, true))
{
insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
RTX_FRAME_RELATED_P (insn) = 1;
--- 7685,7692 ----
unsigned int regno;
rtx insn;
! for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
! if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
{
insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
RTX_FRAME_RELATED_P (insn) = 1;
*************** ix86_emit_save_regs_using_mov (rtx point
*** 7655,7661 ****
rtx insn;
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
! if (ix86_save_reg (regno, true))
{
insn = emit_move_insn (adjust_address (gen_rtx_MEM (Pmode, pointer),
Pmode, offset),
--- 7702,7708 ----
rtx insn;
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
! if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
{
insn = emit_move_insn (adjust_address (gen_rtx_MEM (Pmode, pointer),
Pmode, offset),
*************** ix86_emit_save_regs_using_mov (rtx point
*** 7665,7670 ****
--- 7712,7737 ----
}
}
+ /* Emit code to save SSE registers using MOV insns. First register
+ is stored at POINTER + OFFSET. */
+ static void
+ ix86_emit_save_sse_regs_using_mov (rtx pointer, HOST_WIDE_INT offset)
+ {
+ unsigned int regno;
+ rtx insn;
+ rtx mem;
+
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
+ {
+ mem = adjust_address (gen_rtx_MEM (TImode, pointer), TImode, offset);
+ set_mem_align (mem, 128);
+ insn = emit_move_insn (mem, gen_rtx_REG (TImode, regno));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ offset += 16;
+ }
+ }
+
/* Expand prologue or epilogue stack adjustment.
The pattern exist to put a dependency on all ebp-based memory accesses.
STYLE should be negative if instructions should be marked as frame related,
*************** ix86_expand_prologue (void)
*** 7969,7975 ****
RTX_FRAME_RELATED_P (insn) = 1;
}
! allocate = frame.to_allocate;
if (!frame.save_regs_using_mov)
ix86_emit_save_regs ();
--- 8036,8042 ----
RTX_FRAME_RELATED_P (insn) = 1;
}
! allocate = frame.to_allocate + frame.nsseregs * 16 + frame.padding0;
if (!frame.save_regs_using_mov)
ix86_emit_save_regs ();
*************** ix86_expand_prologue (void)
*** 8048,8058 ****
|| !frame.to_allocate
|| crtl->stack_realign_needed)
ix86_emit_save_regs_using_mov (stack_pointer_rtx,
! frame.to_allocate);
else
ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx,
-frame.nregs * UNITS_PER_WORD);
}
pic_reg_used = false;
if (pic_offset_table_rtx
--- 8115,8136 ----
|| !frame.to_allocate
|| crtl->stack_realign_needed)
ix86_emit_save_regs_using_mov (stack_pointer_rtx,
! frame.to_allocate
! + frame.nsseregs * 16 + frame.padding0);
else
ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx,
-frame.nregs * UNITS_PER_WORD);
}
+ if (!frame_pointer_needed
+ || !frame.to_allocate
+ || crtl->stack_realign_needed)
+ ix86_emit_save_sse_regs_using_mov (stack_pointer_rtx,
+ frame.to_allocate);
+ else
+ ix86_emit_save_sse_regs_using_mov (hard_frame_pointer_rtx,
+ - frame.nregs * UNITS_PER_WORD
+ - frame.nsseregs * 16
+ - frame.padding0);
pic_reg_used = false;
if (pic_offset_table_rtx
*************** ix86_emit_restore_regs_using_mov (rtx po
*** 8124,8130 ****
rtx base_address = gen_rtx_MEM (Pmode, pointer);
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
! if (ix86_save_reg (regno, maybe_eh_return))
{
/* Ensure that adjust_address won't be forced to produce pointer
out of range allowed by x86-64 instruction set. */
--- 8202,8208 ----
rtx base_address = gen_rtx_MEM (Pmode, pointer);
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
! if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
{
/* Ensure that adjust_address won't be forced to produce pointer
out of range allowed by x86-64 instruction set. */
*************** ix86_emit_restore_regs_using_mov (rtx po
*** 8139,8149 ****
offset = 0;
}
emit_move_insn (gen_rtx_REG (Pmode, regno),
! adjust_address (base_address, Pmode, offset));
offset += UNITS_PER_WORD;
}
}
/* Restore function stack, frame, and registers. */
void
--- 8217,8259 ----
offset = 0;
}
emit_move_insn (gen_rtx_REG (Pmode, regno),
! adjust_address (base_address, Pmode, offset));
offset += UNITS_PER_WORD;
}
}
+ /* Emit code to restore saved SSE registers using MOV insns. First register
+ is restored from POINTER + OFFSET. */
+ static void
+ ix86_emit_restore_sse_regs_using_mov (rtx pointer, HOST_WIDE_INT offset,
+ int maybe_eh_return)
+ {
+ int regno;
+ rtx base_address = gen_rtx_MEM (TImode, pointer);
+ rtx mem;
+
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
+ {
+ /* Ensure that adjust_address won't be forced to produce pointer
+ out of range allowed by x86-64 instruction set. */
+ if (TARGET_64BIT && offset != trunc_int_for_mode (offset, SImode))
+ {
+ rtx r11;
+
+ r11 = gen_rtx_REG (DImode, R11_REG);
+ emit_move_insn (r11, GEN_INT (offset));
+ emit_insn (gen_adddi3 (r11, r11, pointer));
+ base_address = gen_rtx_MEM (TImode, r11);
+ offset = 0;
+ }
+ mem = adjust_address (base_address, TImode, offset);
+ set_mem_align (mem, 128);
+ emit_move_insn (gen_rtx_REG (TImode, regno), mem);
+ offset += 16;
+ }
+ }
+
/* Restore function stack, frame, and registers. */
void
*************** ix86_expand_epilogue (int style)
*** 8171,8176 ****
--- 8281,8287 ----
if (crtl->calls_eh_return && style != 2)
offset -= 2;
offset *= -UNITS_PER_WORD;
+ offset -= frame.nsseregs * 16 + frame.padding0;
/* If we're only restoring one register and sp is not valid then
using a move instruction to restore the register since it's
*************** ix86_expand_epilogue (int style)
*** 8204,8214 ****
if (!frame_pointer_needed
|| (sp_valid && !frame.to_allocate)
|| stack_realign_fp)
! ix86_emit_restore_regs_using_mov (stack_pointer_rtx,
! frame.to_allocate, style == 2);
else
! ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx,
! offset, style == 2);
/* eh_return epilogues need %ecx added to the stack pointer. */
if (style == 2)
--- 8315,8337 ----
if (!frame_pointer_needed
|| (sp_valid && !frame.to_allocate)
|| stack_realign_fp)
! {
! ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
! frame.to_allocate, style == 2);
! ix86_emit_restore_regs_using_mov (stack_pointer_rtx,
! frame.to_allocate
! + frame.nsseregs * 16
! + frame.padding0, style == 2);
! }
else
! {
! ix86_emit_restore_sse_regs_using_mov (hard_frame_pointer_rtx,
! offset, style == 2);
! ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx,
! offset
! + frame.nsseregs * 16
! + frame.padding0, style == 2);
! }
/* eh_return epilogues need %ecx added to the stack pointer. */
if (style == 2)
*************** ix86_expand_epilogue (int style)
*** 8234,8247 ****
{
tmp = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
tmp = plus_constant (tmp, (frame.to_allocate
! + frame.nregs * UNITS_PER_WORD));
emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp));
}
}
else if (!frame_pointer_needed)
pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
GEN_INT (frame.to_allocate
! + frame.nregs * UNITS_PER_WORD),
style);
/* If not an i386, mov & pop is faster than "leave". */
else if (TARGET_USE_LEAVE || optimize_function_for_size_p (cfun)
--- 8357,8374 ----
{
tmp = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
tmp = plus_constant (tmp, (frame.to_allocate
! + frame.nregs * UNITS_PER_WORD
! + frame.nsseregs * 16
! + frame.padding0));
emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp));
}
}
else if (!frame_pointer_needed)
pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
GEN_INT (frame.to_allocate
! + frame.nregs * UNITS_PER_WORD
! + frame.nsseregs * 16
! + frame.padding0),
style);
/* If not an i386, mov & pop is faster than "leave". */
else if (TARGET_USE_LEAVE || optimize_function_for_size_p (cfun)
*************** ix86_expand_epilogue (int style)
*** 8272,8284 ****
pro_epilogue_adjust_stack (stack_pointer_rtx,
hard_frame_pointer_rtx,
GEN_INT (offset), style);
}
- else if (frame.to_allocate)
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (frame.to_allocate), style);
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
! if (ix86_save_reg (regno, false))
emit_insn ((*ix86_gen_pop1) (gen_rtx_REG (Pmode, regno)));
if (frame_pointer_needed)
{
--- 8399,8422 ----
pro_epilogue_adjust_stack (stack_pointer_rtx,
hard_frame_pointer_rtx,
GEN_INT (offset), style);
+ ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
+ frame.to_allocate, style == 2);
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (frame.nsseregs * 16), style);
+ }
+ else if (frame.to_allocate || frame.nsseregs)
+ {
+ ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
+ frame.to_allocate,
+ style == 2);
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (frame.to_allocate
+ + frame.nsseregs * 16
+ + frame.padding0), style);
}
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
! if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
emit_insn ((*ix86_gen_pop1) (gen_rtx_REG (Pmode, regno)));
if (frame_pointer_needed)
{