Honor ix86_accumulate_outgoing_args again

Jan Hubicka <hubicka@ucw.cz>
Thu Oct 10 18:48:00 GMT 2013


Hi,
this patch makes ACCUMULATE_OUTGOING_ARGS disable itself when the function is
cold.  I did some extra testing and, to my amusement, we now seem to output
more compact unwind info when ACCUMULATE_OUTGOING_ARGS is disabled, so this
seems to be a quite consistent code size win.
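
To make the size tradeoff concrete, consider something like the following
(a sketch for illustration only, not part of the patch; the comments
describe typical 32bit code generation):

/* With accumulation disabled each argument is passed by a compact
   "push"; with it enabled the prologue reserves the outgoing-args area
   once and every argument becomes a longer "mov ..., n(%esp)" store.  */
extern void callee (int, int, int);

void __attribute__ ((cold))
caller (int x)
{
  /* Cold function: with this patch ACCUMULATE_OUTGOING_ARGS turns
     itself off here, so the arguments are pushed, saving size.  */
  callee (x, x + 1, x + 2);
}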

We can actually do better and enable ACCUMULATE_OUTGOING_ARGS only when the
function contains hot calls.  That should also avoid the need for frame
allocation in the prologue/epilogue on the hot path.  I will look into this
incrementally; a sketch of the idea follows.
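
For the record, a hypothetical sketch of such a check; the name
ix86_function_has_hot_call_p is made up, but CALL_P, BLOCK_FOR_INSN and
optimize_bb_for_speed_p are existing predicates:

/* Return true if the current function contains a call in a block
   that is being optimized for speed.  */
static bool
ix86_function_has_hot_call_p (void)
{
  rtx insn;

  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
    if (CALL_P (insn)
	&& BLOCK_FOR_INSN (insn)
	&& optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn)))
      return true;
  return false;
}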

I also noticed that we still have some tuning flags in i386.c rather than
in x86-tune.def, so I moved them there.
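
(For context: x86-tune.def is token-pasted into i386.c to build a table of
per-CPU masks, and each entry then collapses to a boolean for the CPU
selected by -mtune.  Roughly, paraphrasing the machinery rather than
quoting it:)

#define DEF_TUNE(tune, name, selector) selector,
static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
#include "x86-tune.def"
};
#undef DEF_TUNE

/* Later, once ix86_tune is known:  */
for (i = 0; i < X86_TUNE_LAST; ++i)
  ix86_tune_features[i]
    = !!(initial_ix86_tune_features[i] & (1u << ix86_tune));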

Testing on x86_64-linux; I will commit it once testing converges.

Honza
	* config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable accumulation
	for cold functions.
	* config/i386/x86-tune.def (X86_TUNE_USE_LEAVE): Update comment.
	(X86_TUNE_PUSH_MEMORY): Likewise.
	(X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL,
	X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL): New.
	(X86_TUNE_ACCUMULATE_OUTGOING_ARGS, X86_TUNE_ALWAYS_FANCY_MATH_387):
	New.
	* config/i386/i386.c (x86_accumulate_outgoing_args,
	x86_arch_always_fancy_math_387, x86_avx256_split_unaligned_load,
	x86_avx256_split_unaligned_store): Remove.
	(ix86_option_override_internal): Update to use tune features instead
	of variables.
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 203380)
+++ config/i386/i386.h	(working copy)
@@ -1492,13 +1492,26 @@ enum reg_class
    will be computed and placed into the variable `crtl->outgoing_args_size'.
    No space will be pushed onto the stack for each call; instead, the
    function prologue should increase the stack frame size by this amount.  
+
+   In 32bit mode enabling argument accumulation results in about 5% code size
+   growth because move instructions are less compact than push.  In 64bit
+   mode the difference is less drastic but visible.
+
+   FIXME: Unlike earlier implementations, the size of unwind info seems to
+   actually grow with accumulation.  Is that because accumulated args
+   unwind info became unnecessarily bloated?
    
    64-bit MS ABI seem to require 16 byte alignment everywhere except for
-   function prologue and apilogue.  This is not possible without
-   ACCUMULATE_OUTGOING_ARGS.  */
+   function prologue and epilogue.  This is not possible without
+   ACCUMULATE_OUTGOING_ARGS.  
+
+   If stack probes are required, the space used for large function
+   arguments on the stack must also be probed, so enable
+   -maccumulate-outgoing-args so this happens in the prologue.  */
 
 #define ACCUMULATE_OUTGOING_ARGS \
-  (TARGET_ACCUMULATE_OUTGOING_ARGS || TARGET_64BIT_MS_ABI)
+  ((TARGET_ACCUMULATE_OUTGOING_ARGS && optimize_function_for_speed_p (cfun)) \
+   || TARGET_STACK_PROBE || TARGET_64BIT_MS_ABI)
 
 /* If defined, a C expression whose value is nonzero when we want to use PUSH
    instructions to pass outgoing arguments.  */
Index: config/i386/x86-tune.def
===================================================================
--- config/i386/x86-tune.def	(revision 203387)
+++ config/i386/x86-tune.def	(working copy)
@@ -18,15 +18,13 @@ a copy of the GCC Runtime Library Except
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 <http://www.gnu.org/licenses/>.  */
 
-/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
-   negatively, so enabling for Generic64 seems like good code size
-   tradeoff.  We can't enable it for 32bit generic because it does not
-   work well with PPro base chips.  */
+/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
 DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", 
 	  m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
 
 /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
-   Some chips, like 486 and Pentium have problems with these sequences.  */
+   Some chips, like the 486 and Pentium, work faster with separate load
+   and push instructions.  */
 DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", 
           m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE 
           | m_GENERIC)
@@ -210,6 +208,16 @@ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OP
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
           m_COREI7 | m_BDVER | m_SLM | m_GENERIC)
 
+/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
+   split.  */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal", 
+          ~(m_COREI7 | m_GENERIC))
+
+/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
+   split.  */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
+          ~(m_COREI7 | m_BDVER | m_GENERIC))
+
 /* Use packed single precision instructions where posisble.  I.e. movups instead
    of movupd.  */
 DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
@@ -398,3 +406,24 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CM
    fp converts to destination register.  */
 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
           m_SLM)
+
+/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
+   arguments in prologue/epilogue instead of separately for each call
+   by push/pop instructions.
+   This increases code size by about 5% in 32bit mode, less so in 64bit mode
+   because parameters are passed in registers.  It is a considerable
+   win for targets without a stack engine, where the lack of one prevents
+   multiple push operations from happening in parallel.
+
+   FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer,
+   Bobcat and Generic.  This is because disabling it causes a large
+   regression on mgrid due to an IRA limitation leading to unnecessary
+   use of the frame pointer in 32bit mode.  */
+DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", 
+	  m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
+   such as fsqrt, fprem, fsin, fcos, fsincos etc.
+   Should be enabled for all targets that always have a coprocessor.  */
+DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387", 
+          ~(m_386 | m_486))
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 203380)
+++ config/i386/i386.c	(working copy)
@@ -1898,18 +1898,6 @@ static unsigned int initial_ix86_arch_fe
   ~m_386,
 };
 
-static const unsigned int x86_accumulate_outgoing_args
-  = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_arch_always_fancy_math_387
-  = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_load
-  = m_COREI7 | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_store
-  = m_COREI7 | m_BDVER | m_GENERIC;
-
 /* In case the average insn count for single function invocation is
    lower than this constant, emit fast (but longer) prologue and
    epilogue code.  */
@@ -2920,7 +2908,7 @@ static void
 ix86_option_override_internal (bool main_args_p)
 {
   int i;
-  unsigned int ix86_arch_mask, ix86_tune_mask;
+  unsigned int ix86_arch_mask;
   const bool ix86_tune_specified = (ix86_tune_string != NULL);
   const char *prefix;
   const char *suffix;
@@ -3673,7 +3661,7 @@ ix86_option_override_internal (bool main
 
   /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
      since the insns won't need emulation.  */
-  if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
+  if (ix86_tune_features[X86_TUNE_ALWAYS_FANCY_MATH_387])
     target_flags &= ~MASK_NO_FANCY_MATH_387;
 
   /* Likewise, if the target doesn't have a 387, or we've specified
@@ -3805,8 +3793,7 @@ ix86_option_override_internal (bool main
 	gcc_unreachable ();
       }
 
-  ix86_tune_mask = 1u << ix86_tune;
-  if ((x86_accumulate_outgoing_args & ix86_tune_mask)
+  if (ix86_tune_features[X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
       && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
       && !optimize_size)
     target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
@@ -3946,10 +3933,10 @@ ix86_option_override_internal (bool main
       if (flag_expensive_optimizations
 	  && !(target_flags_explicit & MASK_VZEROUPPER))
 	target_flags |= MASK_VZEROUPPER;
-      if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
+      if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
 	  && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
 	target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
-      if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
+      if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
 	  && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
 	target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
       /* Enable 128-bit AVX instruction generation
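
P.S. For readers skimming the i386.c hunks: the conversion is mechanical.
Condensed before/after for the accumulate case (taken from the hunks
above):

/* Before: hand-tested static mask.  */
ix86_tune_mask = 1u << ix86_tune;
if ((x86_accumulate_outgoing_args & ix86_tune_mask)
    && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
    && !optimize_size)
  target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;

/* After: the bit comes from the x86-tune.def-generated table.  */
if (ix86_tune_features[X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
    && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
    && !optimize_size)
  target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;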


