i386 ESP adjustments opt.
Jan Hubicka
hubicka@atrey.karlin.mff.cuni.cz
Fri Apr 7 02:43:00 GMT 2000
Hi
this patch removes the pop optimization from emit_epilogue_adjustment
and does it using peepholes instead (same is done for pushes now).
This allows combine_stack_adjustments to do its job better.
Also the conversion is done for non-epilogue adjustments too.
The patch saves roughly 4% of size with -Os and brings small, but consistent
speedups on PPro/Athlon.
I've added a couple of configuration options to control this, since the rules
are really irregular.
This is an update of my older patch; now that combine_stack_adjustments
is in, I hope it is more interesting.
Honza
Fri Apr 7 11:41:25 MET DST 2000 Jan Hubicka <jh@suse.cz>
* i386.c (x86_sub_esp_4, x86_sub_esp_8, x86_add_esp_4, x86_add_esp_8):
New global variables.
(ix86_emit_epilogue_adjustment): Do not attempt to use pop for the
adjustment.
* i386.h (x86_sub_esp_4, x86_sub_esp_8, x86_add_esp_4, x86_add_esp_8):
Declare.
(TARGET_SUB_ESP_4, TARGET_SUB_ESP_8, TARGET_ADD_ESP_4,
TARGET_ADD_ESP_8): New macros.
* i386.md: Add peep2s to convert esp adjustments to push and pop
instructions.
diff -Nrc3p config/i386.old/i386.c config/i386/i386.c
*** config/i386.old/i386.c Fri Apr 7 07:29:01 2000
--- config/i386/i386.c Fri Apr 7 07:36:31 2000
*************** const int x86_qimode_math = ~(0);
*** 218,223 ****
--- 218,227 ----
const int x86_promote_qi_regs = 0;
const int x86_himode_math = ~(m_PPRO);
const int x86_promote_hi_regs = m_PPRO;
+ const int x86_sub_esp_4 = m_ATHLON | m_PPRO;
+ const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486;
+ const int x86_add_esp_4 = m_ATHLON | m_K6;
+ const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486;
#define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
*************** static void
*** 1923,1973 ****
ix86_emit_epilogue_esp_adjustment (tsize)
int tsize;
{
! /* Intel's docs say that for 4 or 8 bytes of stack frame one should
! use `pop' and not `add'. */
! int use_pop = tsize == 4;
! rtx edx = 0, ecx;
!
! /* Use two pops only for the Pentium processors. */
! if (tsize == 8 && !TARGET_386 && !TARGET_486)
! {
! rtx retval = current_function_return_rtx;
!
! edx = gen_rtx_REG (SImode, 1);
!
! /* This case is a bit more complex. Since we cannot pop into
! %ecx twice we need a second register. But this is only
! available if the return value is not of DImode in which
! case the %edx register is not available. */
! use_pop = (retval == NULL
! || !reg_overlap_mentioned_p (edx, retval));
! }
!
! if (use_pop)
! {
! ecx = gen_rtx_REG (SImode, 2);
!
! /* We have to prevent the two pops here from being scheduled.
! GCC otherwise would try in some situation to put other
! instructions in between them which has a bad effect. */
! emit_insn (gen_blockage ());
! emit_insn (gen_popsi1 (ecx));
! if (tsize == 8)
! emit_insn (gen_popsi1 (edx));
! }
else
! {
! /* If a frame pointer is present, we must be sure to tie the sp
! to the fp so that we don't mis-schedule. */
! if (frame_pointer_needed)
! emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx,
! stack_pointer_rtx,
! GEN_INT (tsize),
! hard_frame_pointer_rtx));
! else
! emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
! GEN_INT (tsize)));
! }
}
/* Emit code to restore saved registers using MOV insns. First register
--- 1928,1943 ----
ix86_emit_epilogue_esp_adjustment (tsize)
int tsize;
{
! /* If a frame pointer is present, we must be sure to tie the sp
! to the fp so that we don't mis-schedule. */
! if (frame_pointer_needed)
! emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx,
! stack_pointer_rtx,
! GEN_INT (tsize),
! hard_frame_pointer_rtx));
else
! emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
! GEN_INT (tsize)));
}
/* Emit code to restore saved registers using MOV insns. First register
diff -Nrc3p config/i386.old/i386.h config/i386/i386.h
*** config/i386.old/i386.h Fri Apr 7 07:29:01 2000
--- config/i386/i386.h Fri Apr 7 07:39:08 2000
*************** extern const int x86_read_modify, x86_sp
*** 174,179 ****
--- 174,180 ----
extern const int x86_promote_QImode, x86_single_stringop;
extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
extern const int x86_promote_hi_regs;
+ extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
#define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
#define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
*************** extern const int x86_promote_hi_regs;
*** 201,206 ****
--- 202,211 ----
#define TARGET_HIMODE_MATH (x86_himode_math & CPUMASK)
#define TARGET_PROMOTE_QI_REGS (x86_promote_qi_regs & CPUMASK)
#define TARGET_PROMOTE_HI_REGS (x86_promote_hi_regs & CPUMASK)
+ #define TARGET_ADD_ESP_4 (x86_add_esp_4 & CPUMASK)
+ #define TARGET_ADD_ESP_8 (x86_add_esp_8 & CPUMASK)
+ #define TARGET_SUB_ESP_4 (x86_sub_esp_4 & CPUMASK)
+ #define TARGET_SUB_ESP_8 (x86_sub_esp_8 & CPUMASK)
#define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
diff -Nrc3p config/i386.old/i386.md config/i386/i386.md
*** config/i386.old/i386.md Fri Apr 7 07:29:01 2000
--- config/i386/i386.md Fri Apr 7 07:40:21 2000
***************
*** 9411,9416 ****
--- 9411,9547 ----
[(parallel [(set (match_dup 0) (ashift:SI (match_dup 0) (match_dup 2)))
(clobber (reg:CC 17))])]
"operands[2] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
+
+ ;; The ESP adjustments can be done by the push and pop instructions.  The resulting
+ ;; code is shorter, since push is only 1 byte, while add imm, %esp is 3 bytes.  On
+ ;; many CPUs it is also faster, since special hardware to avoid esp
+ ;; dependencies is present.
+
+ ;; While some of these conversions may be done using splitters, we use peepholes
+ ;; in order to allow the combine_stack_adjustments pass to see non-obfuscated RTL.
+
+ ;; Convert prologue esp subtractions to push.
+ ;; We need register to push. In order to keep verify_flow_info happy we have
+ ;; two choices
+ ;; - use scratch and clobber it in order to avoid dependencies
+ ;; - use already live register
+ ;; We can't use the second way right now, since there is no reliable way to
+ ;; verify that a given register is live.  The first choice will also most likely result
+ ;; in fewer dependencies.  At the point of esp adjustments it is very likely that call-clobbered
+ ;; registers are dead. We may want to use base pointer as an alternative when no
+ ;; register is available later.
+
+ (define_peephole2
+ [(match_scratch:SI 0 "r")
+ (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int -4)))
+ (set (reg:SI 6) (reg:SI 6))
+ (clobber (reg:CC 17))])]
+ "optimize_size || !TARGET_SUB_ESP_4"
+ [(clobber (match_dup 0))
+ (parallel [(set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))
+ (set (reg:SI 6) (reg:SI 6))])])
+
+ (define_peephole2
+ [(match_scratch:SI 0 "r")
+ (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int -8)))
+ (set (reg:SI 6) (reg:SI 6))
+ (clobber (reg:CC 17))])]
+ "optimize_size || !TARGET_SUB_ESP_8"
+ [(clobber (match_dup 0))
+ (set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))
+ (parallel [(set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))
+ (set (reg:SI 6) (reg:SI 6))])])
+
+ ;; Convert esp subtractions to push.
+ (define_peephole2
+ [(match_scratch:SI 0 "r")
+ (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int -4)))
+ (clobber (reg:CC 17))])]
+ "optimize_size || !TARGET_SUB_ESP_4"
+ [(clobber (match_dup 0))
+ (set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))])
+
+ (define_peephole2
+ [(match_scratch:SI 0 "r")
+ (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int -8)))
+ (clobber (reg:CC 17))])]
+ "optimize_size || !TARGET_SUB_ESP_8"
+ [(clobber (match_dup 0))
+ (set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))
+ (set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))])
+
+ ;; Convert epilogue deallocator to pop.
+ (define_peephole2
+ [(match_scratch:SI 0 "r")
+ (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))
+ (set (reg:SI 6) (reg:SI 6))
+ (clobber (reg:CC 17))])]
+ "optimize_size || !TARGET_ADD_ESP_4"
+ [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+ "")
+
+ ;; Two pops case is tricky, since pop causes dependency on destination register.
+ ;; We use two registers if available.
+ (define_peephole2
+ [(match_scratch:SI 0 "r")
+ (match_scratch:SI 1 "r")
+ (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 8)))
+ (set (reg:SI 6) (reg:SI 6))
+ (clobber (reg:CC 17))])]
+ "optimize_size || !TARGET_ADD_ESP_8"
+ [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])
+ (parallel [(set (match_dup 1) (mem:SI (reg:SI 7)))
+ (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+ "gen_blockage();")
+
+ (define_peephole2
+ [(match_scratch:SI 0 "r")
+ (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 8)))
+ (set (reg:SI 6) (reg:SI 6))
+ (clobber (reg:CC 17))])]
+ "optimize_size"
+ [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])
+ (parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+ "gen_blockage();")
+
+ ;; Convert esp additions to pop.
+ (define_peephole2
+ [(match_scratch:SI 0 "r")
+ (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))
+ (clobber (reg:CC 17))])]
+ ""
+ [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+ "")
+
+ ;; Two pops case is tricky, since pop causes dependency on destination register.
+ ;; We use two registers if available.
+ (define_peephole2
+ [(match_scratch:SI 0 "r")
+ (match_scratch:SI 1 "r")
+ (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 8)))
+ (clobber (reg:CC 17))])]
+ ""
+ [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])
+ (parallel [(set (match_dup 1) (mem:SI (reg:SI 7)))
+ (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+ "")
+
+ (define_peephole2
+ [(match_scratch:SI 0 "r")
+ (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 8)))
+ (clobber (reg:CC 17))])]
+ "optimize_size"
+ [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])
+ (parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+ "")
;; Call-value patterns last so that the wildcard operand does not
;; disrupt insn-recog's switch tables.
More information about the Gcc-patches
mailing list