This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
push/pop tweeks
- To: jh at suse dot cz, gcc-patches at gcc dot gnu dot org, rth at cygnus dot com, patches at x86-64 dot org
- Subject: push/pop tweeks
- From: Jan Hubicka <jh at suse dot cz>
- Date: Fri, 2 Mar 2001 12:13:59 +0100
Hi
Pentium4 seems to be the first architecture that cleanly wins by not using push/pop instructions.
Moves+arithmetics results in fewer uops and thus consume less resources of trace cache.
When I was experimenting with this I found the code to bring benefits on Athlons and Celerons
as well, even though I am not sure whether the code size bloat will outweigh the gains in larger
benchmarks.
This patch adds the necessary switches to enable it, and I will do more tuning later once I
merge in the preferred-stack-alignment propagating patches that are going to change
the costs noticeably.
Fri Mar 2 12:07:29 CET 2001 Jan Hubicka <jh@suse.cz>
* i386.c (x86_accumulate_outgoing_args, x86_prologue_using_move,
x86_epilogue_using_move): New global variables.
(override_options): Enable ACCUMULATE_OUTGOING_ARGS if preferred.
(ix86_emit_save_regs_using_mov): New static function.
(ix86_expand_prologue, ix86_expand_epilogue): Use moves if preferred.
* i386.h (MASK_MMX, MASK_SSE, MASK_SSE2, MASK_128BIT_LONG_DOUBLE,
MASK_MIX_SSE_I387): Renumber.
(MASK_NO_ACCUMULATE_OUTGOING_ARGS): New.
(x86_accumulate_outgoing_args, x86_prologue_using_move,
x86_epilogue_using_move): Declare.
(TARGET_PROLOGUE_USING_MOVE, TARGET_EPILOGUE_USING_MOVE): New.
Index: config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.c,v
retrieving revision 1.228
diff -c -3 -p -r1.228 i386.c
*** i386.c 2001/02/28 18:34:34 1.228
--- i386.c 2001/03/02 11:06:24
*************** const int x86_add_esp_8 = m_ATHLON | m_P
*** 313,318 ****
--- 313,321 ----
const int x86_integer_DFmode_moves = ~(m_ATHLON | m_PENT4);
const int x86_partial_reg_dependency = m_ATHLON | m_PENT4;
const int x86_memory_mismatch_stall = m_ATHLON | m_PENT4;
+ const int x86_accumulate_outgoing_args = m_ATHLON | m_PENT4 | m_PPRO;
+ const int x86_prologue_using_move = m_ATHLON | m_PENT4 | m_PPRO;
+ const int x86_epilogue_using_move = m_ATHLON | m_PENT4 | m_PPRO;
#define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
*************** static int ix86_safe_length_prefix PARAM
*** 537,542 ****
--- 540,546 ----
static int ix86_nsaved_regs PARAMS((void));
static void ix86_emit_save_regs PARAMS((void));
static void ix86_emit_restore_regs_using_mov PARAMS ((rtx, int));
+ static void ix86_emit_save_regs_using_mov PARAMS ((rtx, int));
static void ix86_emit_epilogue_esp_adjustment PARAMS((int));
static void ix86_set_move_mem_attrs_1 PARAMS ((rtx, rtx, rtx, rtx, rtx));
static void ix86_sched_reorder_pentium PARAMS((rtx *, rtx *));
*************** override_options ()
*** 755,760 ****
--- 759,769 ----
on by -msse. */
if (TARGET_SSE)
target_flags |= MASK_MMX;
+
+ if ((x86_accumulate_outgoing_args & CPUMASK)
+ && !(target_flags & MASK_NO_ACCUMULATE_OUTGOING_ARGS)
+ && !optimize_size)
+ target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
}
void
*************** ix86_emit_save_regs ()
*** 2059,2064 ****
--- 2070,2095 ----
}
}
+ /* Emit code to save registers using MOV insns. First register
+ is restored from POINTER + OFFSET. */
+ static void
+ ix86_emit_save_regs_using_mov (pointer, offset)
+ rtx pointer;
+ int offset;
+ {
+ int regno;
+
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (ix86_save_reg (regno))
+ {
+ emit_move_insn (adj_offsettable_operand (gen_rtx_MEM (Pmode,
+ pointer),
+ offset),
+ gen_rtx_REG (Pmode, regno));
+ offset += UNITS_PER_WORD;
+ }
+ }
+
/* Expand the prologue into a bunch of separate insns. */
void
*************** ix86_expand_prologue ()
*** 2068,2073 ****
--- 2099,2106 ----
int pic_reg_used = flag_pic && (current_function_uses_pic_offset_table
|| current_function_uses_const_pool);
struct ix86_frame frame;
+ int use_mov = (TARGET_PROLOGUE_USING_MOVE && !optimize_size);
+ int allocate;
ix86_compute_frame_layout (&frame);
*************** ix86_expand_prologue ()
*** 2083,2101 ****
RTX_FRAME_RELATED_P (insn) = 1;
}
! ix86_emit_save_regs ();
! if (frame.to_allocate == 0)
;
! else if (! TARGET_STACK_PROBE || frame.to_allocate < CHECK_STACK_LIMIT)
{
if (frame_pointer_needed)
insn = emit_insn (gen_pro_epilogue_adjust_stack
(stack_pointer_rtx, stack_pointer_rtx,
! GEN_INT (-frame.to_allocate), hard_frame_pointer_rtx));
else
insn = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
! GEN_INT (-frame.to_allocate)));
RTX_FRAME_RELATED_P (insn) = 1;
}
else
--- 2116,2143 ----
RTX_FRAME_RELATED_P (insn) = 1;
}
! allocate = frame.to_allocate;
! /* In case we are dealing only with single register and empty frame,
! push is equivalent of the mov+add sequence. */
! if (allocate == 0 && frame.nregs <= 1)
! use_mov = 0;
! if (!use_mov)
! ix86_emit_save_regs ();
! else
! allocate += frame.nregs * UNITS_PER_WORD;
!
! if (allocate == 0)
;
! else if (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)
{
if (frame_pointer_needed)
insn = emit_insn (gen_pro_epilogue_adjust_stack
(stack_pointer_rtx, stack_pointer_rtx,
! GEN_INT (-allocate), hard_frame_pointer_rtx));
else
insn = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
! GEN_INT (-allocate)));
RTX_FRAME_RELATED_P (insn) = 1;
}
else
*************** ix86_expand_prologue ()
*** 2105,2111 ****
rtx arg0, sym;
arg0 = gen_rtx_REG (SImode, 0);
! emit_move_insn (arg0, GEN_INT (frame.to_allocate));
sym = gen_rtx_MEM (FUNCTION_MODE,
gen_rtx_SYMBOL_REF (Pmode, "_alloca"));
--- 2147,2153 ----
rtx arg0, sym;
arg0 = gen_rtx_REG (SImode, 0);
! emit_move_insn (arg0, GEN_INT (allocate));
sym = gen_rtx_MEM (FUNCTION_MODE,
gen_rtx_SYMBOL_REF (Pmode, "_alloca"));
*************** ix86_expand_prologue ()
*** 2115,2120 ****
--- 2157,2170 ----
= gen_rtx_EXPR_LIST (VOIDmode, gen_rtx_USE (VOIDmode, arg0),
CALL_INSN_FUNCTION_USAGE (insn));
}
+ if (use_mov)
+ {
+ if (!frame_pointer_needed || !frame.to_allocate)
+ ix86_emit_save_regs_using_mov (stack_pointer_rtx, frame.to_allocate);
+ else
+ ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx,
+ -frame.nregs * UNITS_PER_WORD);
+ }
#ifdef SUBTARGET_PROLOGUE
SUBTARGET_PROLOGUE;
*************** ix86_expand_epilogue (emit_return)
*** 2203,2208 ****
--- 2253,2260 ----
and there is exactly one register to pop. This heruistic may need some
tuning in future. */
if ((!sp_valid && frame.nregs <= 1)
+ || (TARGET_EPILOGUE_USING_MOVE && !optimize_size
+ && (frame.nregs > 1 || frame.to_allocate))
|| (frame_pointer_needed && !frame.nregs && frame.to_allocate)
|| (frame_pointer_needed && TARGET_USE_LEAVE && !optimize_size
&& frame.nregs == 1))
Index: config/i386/i386.h
===================================================================
RCS file: /cvs/gcc/egcs/gcc/config/i386/i386.h,v
retrieving revision 1.159
diff -c -3 -p -r1.159 i386.h
*** i386.h 2001/02/28 18:34:35 1.159
--- i386.h 2001/03/02 11:06:25
*************** extern int target_flags;
*** 114,124 ****
#define MASK_INLINE_ALL_STROPS 0x00002000 /* Inline stringops in all cases */
#define MASK_NO_PUSH_ARGS 0x00004000 /* Use push instructions */
#define MASK_ACCUMULATE_OUTGOING_ARGS 0x00008000/* Accumulate outgoing args */
! #define MASK_MMX 0x00010000 /* Support MMX regs/builtins */
! #define MASK_SSE 0x00020000 /* Support SSE regs/builtins */
! #define MASK_SSE2 0x00040000 /* Support SSE2 regs/builtins */
! #define MASK_128BIT_LONG_DOUBLE 0x00080000 /* long double size is 128bit */
! #define MASK_MIX_SSE_I387 0x00100000 /* Mix SSE and i387 instructions */
/* Temporary codegen switches */
#define MASK_INTEL_SYNTAX 0x00000200
--- 114,125 ----
#define MASK_INLINE_ALL_STROPS 0x00002000 /* Inline stringops in all cases */
#define MASK_NO_PUSH_ARGS 0x00004000 /* Use push instructions */
#define MASK_ACCUMULATE_OUTGOING_ARGS 0x00008000/* Accumulate outgoing args */
! #define MASK_NO_ACCUMULATE_OUTGOING_ARGS 0x00010000
! #define MASK_MMX 0x00020000 /* Support MMX regs/builtins */
! #define MASK_SSE 0x00040000 /* Support SSE regs/builtins */
! #define MASK_SSE2 0x00080000 /* Support SSE2 regs/builtins */
! #define MASK_128BIT_LONG_DOUBLE 0x00100000 /* long double size is 128bit */
! #define MASK_MIX_SSE_I387 0x00200000 /* Mix SSE and i387 instructions */
/* Temporary codegen switches */
#define MASK_INTEL_SYNTAX 0x00000200
*************** extern const int x86_himode_math, x86_qi
*** 199,204 ****
--- 200,207 ----
extern const int x86_promote_hi_regs, x86_integer_DFmode_moves;
extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
+ extern const int x86_accumulate_outgoing_args, x86_prologue_using_move;
+ extern const int x86_epilogue_using_move;
#define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
#define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
*************** extern const int x86_partial_reg_depende
*** 233,238 ****
--- 236,243 ----
#define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & CPUMASK)
#define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & CPUMASK)
#define TARGET_MEMORY_MISMATCH_STALL (x86_memory_mismatch_stall & CPUMASK)
+ #define TARGET_PROLOGUE_USING_MOVE (x86_prologue_using_move & CPUMASK)
+ #define TARGET_EPILOGUE_USING_MOVE (x86_epilogue_using_move & CPUMASK)
#define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)