This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Enable ix86_avoid_jump_misspredicts for P3/P4 too


Hi,
looking at a recent copy of the Intel optimization manual, it has the same
hint as the AMD manual about 4 jumps per cache line.
I did a SPEC run on the P4 and there is no change except for bzip2, which
improves by about 3%; that is quite expected, as the scenario where 5
jumps happen to be in the same window is very rare.

Bootstrapped/regtested i686-pc-linux.
OK?

2004-02-14  Jan Hubicka  <jh@suse.cz>
	* i386.c (x86_four_jump_limit): New variable.
	(k8_avoid_jump_misspredicts): Rename to ...
	(ix86_avoid_jump_misspredicts): ... this one.
	(ix86_pad_returns): Break out from ...
	(ix86_reorg): ... this one; do ix86_avoid_jump_misspredicts when asked
	to.
	* i386.h (TARGET_FOUR_JUMP_LIMIT): New macro.
Index: config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.646
diff -c -3 -p -r1.646 i386.c
*** config/i386/i386.c	8 Feb 2004 23:08:40 -0000	1.646
--- config/i386/i386.c	14 Feb 2004 00:49:53 -0000
*************** const int x86_use_ffreep = m_ATHLON_K8;
*** 524,529 ****
--- 524,532 ----
  const int x86_rep_movl_optimal = m_386 | m_PENT | m_PPRO | m_K6;
  const int x86_inter_unit_moves = ~(m_ATHLON_K8);
  const int x86_ext_80387_constants = m_K6 | m_ATHLON | m_PENT4 | m_PPRO;
+ /* Some CPU cores are not able to predict more than 4 branch instructions in
+    the 16 byte window.  */
+ const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8 | m_PENT4;
  
  /* In case the average insn count for single function invocation is
     lower than this constant, emit fast (but longer) prologue and
*************** static tree ix86_handle_struct_attribute
*** 883,889 ****
  static int extended_reg_mentioned_1 (rtx *, void *);
  static bool ix86_rtx_costs (rtx, int, int, int *);
  static int min_insn_size (rtx);
- static void k8_avoid_jump_misspredicts (void);
  
  #if defined (DO_GLOBAL_CTORS_BODY) && defined (HAS_INIT_SECTION)
  static void ix86_svr3_asm_out_constructor (rtx, int);
--- 886,891 ----
*************** min_insn_size (rtx insn)
*** 15714,15720 ****
     window.  */
  
  static void
! k8_avoid_jump_misspredicts (void)
  {
    rtx insn, start = get_insns ();
    int nbytes = 0, njumps = 0;
--- 15716,15722 ----
     window.  */
  
  static void
! ix86_avoid_jump_misspredicts (void)
  {
    rtx insn, start = get_insns ();
    int nbytes = 0, njumps = 0;
*************** k8_avoid_jump_misspredicts (void)
*** 15774,15791 ****
      }
  }
  
! /* Implement machine specific optimizations.
!    At the moment we implement single transformation: AMD Athlon works faster
     when RET is not destination of conditional jump or directly preceded
     by other jump instruction.  We avoid the penalty by inserting NOP just
     before the RET instructions in such cases.  */
  static void
! ix86_reorg (void)
  {
    edge e;
  
-   if (!TARGET_ATHLON_K8 || !optimize || optimize_size)
-     return;
    for (e = EXIT_BLOCK_PTR->pred; e; e = e->pred_next)
    {
      basic_block bb = e->src;
--- 15776,15790 ----
      }
  }
  
! /* AMD Athlon works faster
     when RET is not destination of conditional jump or directly preceded
     by other jump instruction.  We avoid the penalty by inserting NOP just
     before the RET instructions in such cases.  */
  static void
! ix86_pad_returns (void)
  {
    edge e;
  
    for (e = EXIT_BLOCK_PTR->pred; e; e = e->pred_next)
    {
      basic_block bb = e->src;
*************** ix86_reorg (void)
*** 15825,15831 ****
  	delete_insn (ret);
        }
    }
!   k8_avoid_jump_misspredicts ();
  }
  
  /* Return nonzero when QImode register that must be represented via REX prefix
--- 15824,15840 ----
  	delete_insn (ret);
        }
    }
! }
! 
! /* Implement machine specific optimizations.  We implement padding of returns
!    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
! static void
! ix86_reorg (void)
! {
!   if (TARGET_ATHLON_K8 && optimize && !optimize_size)
!     ix86_pad_returns ();
!   if (TARGET_FOUR_JUMP_LIMIT && optimize && !optimize_size)
!     ix86_avoid_jump_misspredicts ();
  }
  
  /* Return nonzero when QImode register that must be represented via REX prefix
Index: config/i386/i386.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.h,v
retrieving revision 1.375
diff -c -3 -p -r1.375 i386.h
*** config/i386/i386.h	7 Feb 2004 17:06:20 -0000	1.375
--- config/i386/i386.h	14 Feb 2004 00:49:53 -0000
*************** extern int x86_prefetch_sse;
*** 293,298 ****
--- 293,299 ----
  #define TARGET_USE_FFREEP (x86_use_ffreep & TUNEMASK)
  #define TARGET_REP_MOVL_OPTIMAL (x86_rep_movl_optimal & TUNEMASK)
  #define TARGET_INTER_UNIT_MOVES (x86_inter_unit_moves & TUNEMASK)
+ #define TARGET_FOUR_JUMP_LIMIT (x86_four_jump_limit & TUNEMASK)
  
  #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)
  


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]