i386 ESP adjustments opt.

Jan Hubicka hubicka@atrey.karlin.mff.cuni.cz
Fri Apr 7 02:43:00 GMT 2000


Hi,
this patch removes the pop optimization from ix86_emit_epilogue_esp_adjustment
and does it using peepholes instead (the same is now done for pushes).
This allows combine_stack_adjustments to do its job better.  The
conversion is also done for non-epilogue adjustments.
The patch saves roughly 4% of code size with -Os and brings small but
consistent speedups on PPro/Athlon.
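
To illustrate the transformation (hypothetical output; the exact scratch
registers depend on what the register allocator has free), an 8-byte
epilogue deallocation that used to be emitted as

	addl	$8, %esp	# 3 bytes

can now be rewritten by the peepholes into two 1-byte pops

	popl	%ecx
	popl	%edx

and, symmetrically, an 8-byte prologue allocation (subl $8, %esp) becomes
two 1-byte pushes of a clobbered scratch register.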

I've added a couple of configuration options to control this, since the
rules are really irregular across CPUs.
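
Concretely, there is one tuning mask per case (the values below are copied
from the patch), plus TARGET_* macros testing them against CPUMASK; the
peepholes are then typically gated on `optimize_size || !TARGET_SUB_ESP_4'
and friends, i.e. they fire when optimizing for size or when the selected
CPU is not in the corresponding mask:

	const int x86_sub_esp_4 = m_ATHLON | m_PPRO;
	const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486;
	const int x86_add_esp_4 = m_ATHLON | m_K6;
	const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486;

	#define TARGET_ADD_ESP_4 (x86_add_esp_4 & CPUMASK)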

This is an update of my older patch; now that combine_stack_adjustments
is in, it should be more interesting, I hope.
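
As a sketch of the interaction (assuming the usual pass ordering): because
the adjustments now stay as plain add/sub RTL, combine_stack_adjustments
can first merge neighbouring adjustments, e.g.

	subl	$4, %esp
	subl	$8, %esp

into a single

	subl	$12, %esp

and only the surviving adjustment is then considered for the push/pop
conversion by the peepholes, which run much later (after reload).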

Honza

Fri Apr  7 11:41:25 MET DST 2000  Jan Hubicka  <jh@suse.cz>
	* i386.c (x86_sub_esp_4, x86_sub_esp_8, x86_add_esp_4, x86_add_esp_8):
	New global variables.
	(ix86_emit_epilogue_esp_adjustment): Do not attempt to use pop for the
	adjustment.
	* i386.h (x86_sub_esp_4, x86_sub_esp_8, x86_add_esp_4, x86_add_esp_8):
	Declare.
	(TARGET_SUB_ESP_4, TARGET_SUB_ESP_8, TARGET_ADD_ESP_4,
	 TARGET_ADD_ESP_8): New macros.
	* i386.md: Add peep2s to convert esp adjustments to push and pop
	instructions.

diff -Nrc3p config/i386.old/i386.c config/i386/i386.c
*** config/i386.old/i386.c	Fri Apr  7 07:29:01 2000
--- config/i386/i386.c	Fri Apr  7 07:36:31 2000
*************** const int x86_qimode_math = ~(0);
*** 218,223 ****
--- 218,227 ----
  const int x86_promote_qi_regs = 0;
  const int x86_himode_math = ~(m_PPRO);
  const int x86_promote_hi_regs = m_PPRO;
+ const int x86_sub_esp_4 = m_ATHLON | m_PPRO;
+ const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486;
+ const int x86_add_esp_4 = m_ATHLON | m_K6;
+ const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486;
  
  #define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
  
*************** static void
*** 1923,1973 ****
  ix86_emit_epilogue_esp_adjustment (tsize)
       int tsize;
  {
!   /* Intel's docs say that for 4 or 8 bytes of stack frame one should
!      use `pop' and not `add'.  */
!   int use_pop = tsize == 4;
!   rtx edx = 0, ecx;
! 
!   /* Use two pops only for the Pentium processors.  */
!   if (tsize == 8 && !TARGET_386 && !TARGET_486)
!     {
!       rtx retval = current_function_return_rtx;
! 
!       edx = gen_rtx_REG (SImode, 1);
! 
!       /* This case is a bit more complex.  Since we cannot pop into
!          %ecx twice we need a second register.  But this is only
!          available if the return value is not of DImode in which
!          case the %edx register is not available.  */
!       use_pop = (retval == NULL
! 		 || !reg_overlap_mentioned_p (edx, retval));
!     }
! 
!   if (use_pop)
!     {
!       ecx = gen_rtx_REG (SImode, 2);
! 
!       /* We have to prevent the two pops here from being scheduled.
!          GCC otherwise would try in some situation to put other
!          instructions in between them which has a bad effect.  */
!       emit_insn (gen_blockage ());
!       emit_insn (gen_popsi1 (ecx));
!       if (tsize == 8)
! 	emit_insn (gen_popsi1 (edx));
!     }
    else
!     {
!       /* If a frame pointer is present, we must be sure to tie the sp
! 	 to the fp so that we don't mis-schedule.  */
!       if (frame_pointer_needed)
!         emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx,
! 						  stack_pointer_rtx,
! 						  GEN_INT (tsize),
! 						  hard_frame_pointer_rtx));
!       else
!         emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
! 			       GEN_INT (tsize)));
!     }
  }
  
  /* Emit code to restore saved registers using MOV insns.  First register
--- 1928,1943 ----
  ix86_emit_epilogue_esp_adjustment (tsize)
       int tsize;
  {
!   /* If a frame pointer is present, we must be sure to tie the sp
!      to the fp so that we don't mis-schedule.  */
!   if (frame_pointer_needed)
!     emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx,
! 					      stack_pointer_rtx,
! 					      GEN_INT (tsize),
! 					      hard_frame_pointer_rtx));
    else
!     emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
! 			   GEN_INT (tsize)));
  }
  
  /* Emit code to restore saved registers using MOV insns.  First register
diff -Nrc3p config/i386.old/i386.h config/i386/i386.h
*** config/i386.old/i386.h	Fri Apr  7 07:29:01 2000
--- config/i386/i386.h	Fri Apr  7 07:39:08 2000
*************** extern const int x86_read_modify, x86_sp
*** 174,179 ****
--- 174,180 ----
  extern const int x86_promote_QImode, x86_single_stringop;
  extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
  extern const int x86_promote_hi_regs;
+ extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
  
  #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
  #define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
*************** extern const int x86_promote_hi_regs;
*** 201,206 ****
--- 202,211 ----
  #define TARGET_HIMODE_MATH (x86_himode_math & CPUMASK)
  #define TARGET_PROMOTE_QI_REGS (x86_promote_qi_regs & CPUMASK)
  #define TARGET_PROMOTE_HI_REGS (x86_promote_hi_regs & CPUMASK)
+ #define TARGET_ADD_ESP_4 (x86_add_esp_4 & CPUMASK)
+ #define TARGET_ADD_ESP_8 (x86_add_esp_8 & CPUMASK)
+ #define TARGET_SUB_ESP_4 (x86_sub_esp_4 & CPUMASK)
+ #define TARGET_SUB_ESP_8 (x86_sub_esp_8 & CPUMASK)
  
  #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
  
diff -Nrc3p config/i386.old/i386.md config/i386/i386.md
*** config/i386.old/i386.md	Fri Apr  7 07:29:01 2000
--- config/i386/i386.md	Fri Apr  7 07:40:21 2000
***************
*** 9411,9416 ****
--- 9411,9547 ----
    [(parallel [(set (match_dup 0) (ashift:SI (match_dup 0) (match_dup 2)))
  	      (clobber (reg:CC 17))])]
    "operands[2] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
+ 
+ ;; The ESP adjustments can be done by the push and pop instructions.  The
+ ;; resulting code is shorter, since a push is only 1 byte, while `add $imm, %esp'
+ ;; is 3 bytes.  On many CPUs it is also faster, since special hardware to avoid
+ ;; esp dependencies is present.
+ 
+ ;; While some of these conversions may be done using splitters, we use peepholes
+ ;; in order to allow the combine_stack_adjustments pass to see unobfuscated RTL.
+ 
+ ;; Convert prologue esp subtractions to push.
+ ;; We need a register to push.  In order to keep verify_flow_info happy we
+ ;; have two choices:
+ ;; - use a scratch register and clobber it in order to avoid dependencies
+ ;; - use an already live register
+ ;; We can't use the second way right now, since there is no reliable way to
+ ;; verify that a given register is live.  The first choice will also most
+ ;; likely result in fewer dependencies.  At the point of the esp adjustment it
+ ;; is very likely that call-clobbered registers are dead.  We may want to use
+ ;; the base pointer as an alternative when no register is available later.
+ 
+ (define_peephole2
+   [(match_scratch:SI 0 "r")
+    (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int -4)))
+ 	      (set (reg:SI 6) (reg:SI 6))
+ 	      (clobber (reg:CC 17))])]
+   "optimize_size || !TARGET_SUB_ESP_4"
+   [(clobber (match_dup 0))
+    (parallel [(set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))
+ 	      (set (reg:SI 6) (reg:SI 6))])])
+ 
+ (define_peephole2
+   [(match_scratch:SI 0 "r")
+    (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int -8)))
+ 	      (set (reg:SI 6) (reg:SI 6))
+ 	      (clobber (reg:CC 17))])]
+   "optimize_size || !TARGET_SUB_ESP_8"
+   [(clobber (match_dup 0))
+    (set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))
+    (parallel [(set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))
+ 	      (set (reg:SI 6) (reg:SI 6))])])
+ 
+ ;; Convert esp subtractions to push.
+ (define_peephole2
+   [(match_scratch:SI 0 "r")
+    (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int -4)))
+ 	      (clobber (reg:CC 17))])]
+   "optimize_size || !TARGET_SUB_ESP_4"
+   [(clobber (match_dup 0))
+    (set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))])
+ 
+ (define_peephole2
+   [(match_scratch:SI 0 "r")
+    (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int -8)))
+ 	      (clobber (reg:CC 17))])]
+   "optimize_size || !TARGET_SUB_ESP_8"
+   [(clobber (match_dup 0))
+    (set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))
+    (set (mem:SI (pre_dec:SI (reg:SI 7))) (match_dup 0))])
+ 
+ ;; Convert epilogue stack deallocation to pop.
+ (define_peephole2
+   [(match_scratch:SI 0 "r")
+    (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))
+ 	      (set (reg:SI 6) (reg:SI 6))
+ 	      (clobber (reg:CC 17))])]
+   "optimize_size || !TARGET_ADD_ESP_4"
+   [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ 	      (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+   "")
+ 
+ ;; The two-pops case is tricky, since a pop creates a dependency on the
+ ;; destination register.  We use two registers if they are available.
+ (define_peephole2
+   [(match_scratch:SI 0 "r")
+    (match_scratch:SI 1 "r")
+    (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 8)))
+ 	      (set (reg:SI 6) (reg:SI 6))
+ 	      (clobber (reg:CC 17))])]
+   "optimize_size || !TARGET_ADD_ESP_8"
+   [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ 	      (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])
+    (parallel [(set (match_dup 1) (mem:SI (reg:SI 7)))
+ 	      (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+   "gen_blockage();")
+ 
+ (define_peephole2
+   [(match_scratch:SI 0 "r")
+    (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 8)))
+ 	      (set (reg:SI 6) (reg:SI 6))
+ 	      (clobber (reg:CC 17))])]
+   "optimize_size"
+   [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ 	      (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])
+    (parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ 	      (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+   "gen_blockage();")
+ 
+ ;; Convert esp additions to pop.
+ (define_peephole2
+   [(match_scratch:SI 0 "r")
+    (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))
+ 	      (clobber (reg:CC 17))])]
+   ""
+   [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ 	      (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+   "")
+ 
+ ;; The two-pops case is tricky, since a pop creates a dependency on the
+ ;; destination register.  We use two registers if they are available.
+ (define_peephole2
+   [(match_scratch:SI 0 "r")
+    (match_scratch:SI 1 "r")
+    (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 8)))
+ 	      (clobber (reg:CC 17))])]
+   ""
+   [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ 	      (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])
+    (parallel [(set (match_dup 1) (mem:SI (reg:SI 7)))
+ 	      (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+   "")
+ 
+ (define_peephole2
+   [(match_scratch:SI 0 "r")
+    (parallel [(set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 8)))
+ 	      (clobber (reg:CC 17))])]
+   "optimize_size"
+   [(parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ 	      (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])
+    (parallel [(set (match_dup 0) (mem:SI (reg:SI 7)))
+ 	      (set (reg:SI 7) (plus:SI (reg:SI 7) (const_int 4)))])]
+   "")
  
  ;; Call-value patterns last so that the wildcard operand does not
  ;; disrupt insn-recog's switch tables.

