This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Revisit Core tuning flags


Hi,
this is the updated version of the patch discussed at 
http://gcc.gnu.org/ml/gcc-patches/2012-12/msg00841.html

It makes the CORE tuning follow the optimization guidelines more closely.
In particular it removes some tuning flags for features I implemented years
back specifically for K7/K8 chips that ended up in the Core tuning because
it was based on generic. Incrementally I plan to drop some of these from
generic, too.

Compared to previous version of patch I left out INC_DEC change, even
though Core I7+ should resolve dependencies on partial flags correctly.
Optimization manual still seems to suggest to not use this:

Assembly/Compiler Coding Rule 33. (M impact, H generality)
INC and DEC instructions should be replaced with ADD or SUB instructions,
because ADD and SUB overwrite all flags, whereas INC and DEC do not, therefore
creating false dependencies on earlier instructions that set the flags. 

The other change dropped is use_vector_fp_converts, which seems to improve
Core performance.

I benchmarked the patch on SPEC2k and earlier it was benchmarked on 2k6
and the performance difference seems in noise.  It causes about 0.3% code
size reduction.  Main motivation for the patch is to drop some codegen
oddities that do not make sense on modern chips.

Bootstrapped/regtested x86_64-linux, will commit it shortly.
Honza

	* x86-tune.def (partial_reg_stall): Disable for CoreI7 and newer.
	(sse_typeless_stores): Enable for core.
	(sse_load0_by_pxor): Likewise.
	(four_jump_limit): Disable for core.
	(pad_returns): Likewise.
	(avoid_vector_decode): Likewise.
	(fuse_cmp_and_branch): Enable for cores.
	* i386.c (x86_accumulate_outgoing_args): Disable for cores.
Index: x86-tune.def
===================================================================
*** x86-tune.def	(revision 202812)
--- x86-tune.def	(working copy)
*************** DEF_TUNE (X86_TUNE_MOVX, "movx",
*** 52,58 ****
     and can happen in caller/callee saving sequences.  */
  DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
  DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
!           m_CORE_ALL | m_GENERIC)
  /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   * on 16-bit immediate moves into memory on Core2 and Corei7.  */
  DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
--- 52,58 ----
     and can happen in caller/callee saving sequences.  */
  DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
  DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
!           m_CORE2 | m_GENERIC)
  /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   * on 16-bit immediate moves into memory on Core2 and Corei7.  */
  DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
*************** DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INS
*** 125,132 ****
     maintain just lower part of scalar values in proper format leaving the
     upper part undefined.  */
  DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", m_AMD_MULTIPLE)
! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", m_PPRO | m_P4_NOCONA)
  DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
            m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
  DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move", 
--- 125,134 ----
     maintain just lower part of scalar values in proper format leaving the
     upper part undefined.  */
  DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
! DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
! 	  m_AMD_MULTIPLE | m_CORE_ALL)
! DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
! 	  m_PPRO | m_P4_NOCONA | m_CORE_ALL)
  DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
            m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
  DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move", 
*************** DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSION
*** 144,150 ****
  /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
     than 4 branch instructions in the 16 byte window.  */
  DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
!           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM| m_AMD_MULTIPLE 
            | m_GENERIC)
  DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
            m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE 
--- 146,152 ----
  /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
     than 4 branch instructions in the 16 byte window.  */
  DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
!           m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE 
            | m_GENERIC)
  DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
            m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE 
*************** DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
*** 154,166 ****
  DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
            ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
  DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
!           m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
  DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
  DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
            m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
            | m_ATHLON_K8 | m_GENERIC)
  DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
!           m_CORE_ALL | m_K8 | m_GENERIC)
  /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
     and SImode multiply, but 386 and 486 do HImode multiply faster.  */
  DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
--- 156,168 ----
  DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
            ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
  DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
!           m_AMD_MULTIPLE | m_GENERIC)
  DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
  DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
            m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
            | m_ATHLON_K8 | m_GENERIC)
  DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
!           m_K8 | m_GENERIC)
  /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
     and SImode multiply, but 386 and 486 do HImode multiply faster.  */
  DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
*************** DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS,
*** 193,199 ****
  /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
     with a subsequent conditional jump instruction into a single
     compare-and-branch uop.  */
! DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
  /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
     will impact LEA instruction selection. */
  DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
--- 195,201 ----
  /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
     with a subsequent conditional jump instruction into a single
     compare-and-branch uop.  */
! DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER | m_CORE_ALL)
  /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
     will impact LEA instruction selection. */
  DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
Index: i386.c
===================================================================
*** i386.c	(revision 202812)
--- i386.c	(working copy)
*************** static unsigned int initial_ix86_arch_fe
*** 1899,1905 ****
  };
  
  static const unsigned int x86_accumulate_outgoing_args
!   = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
  
  static const unsigned int x86_arch_always_fancy_math_387
    = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
--- 1899,1905 ----
  };
  
  static const unsigned int x86_accumulate_outgoing_args
!   = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
  
  static const unsigned int x86_arch_always_fancy_math_387
    = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]