Honor ix86_accumulate_outgoing_args again
Jan Hubicka
hubicka@ucw.cz
Thu Oct 10 18:48:00 GMT 2013
Hi,
this patch makes ACCUMULATE_OUTGOING_ARGS disable itself when the function is
cold. I did some extra testing and to my amusement we now seem to output
more compact unwind info when ACCUMULATE_OUTGOING_ARGS is disabled, so this
seems quite consistent code size win.
We actually can do better and enable ACCUMULATE_OUTGOING_ARGS only when
function contains hot calls. This should also avoid need for frame allocation
in prologue/epilogue on hot path then. I will look into this incrementally.
I also noticed that we still have some tuning flags in i386.c rather than
in x86-tune.c so I moved them there.
Testing x86_64-linux and will commit it once testing converge.
Honza
* config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable accumulation
for cold functions.
* x86-tune.def (X86_TUNE_USE_LEAVE): Update comment.
(X86_TUNE_PUSH_MEMORY): Likewise.
(X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL,
X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL): New.
(X86_TUNE_ACCUMULATE_OUTGOING_ARGS, X86_TUNE_ALWAYS_FANCY_MATH_387): New.
* i386.c (x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387,
x86_avx256_split_unaligned_load, x86_avx256_split_unaligned_store):
Remove.
(ix86_option_override_internal): Update to use tune features instead
of variables.
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h (revision 203380)
+++ config/i386/i386.h (working copy)
@@ -1492,13 +1492,26 @@ enum reg_class
will be computed and placed into the variable `crtl->outgoing_args_size'.
No space will be pushed onto the stack for each call; instead, the
function prologue should increase the stack frame size by this amount.
+
+ In 32bit mode enabling argument accumulation results in about 5% code size
+ growth because move instructions are less compact than push. In 64bit
+ mode the difference is less drastic but visible.
+
+ FIXME: Unlike earlier implementations, the size of unwind info seems to
+ actually grow with accumulation. Is that because accumulated args
+ unwind info became unnecessarily bloated?
64-bit MS ABI seem to require 16 byte alignment everywhere except for
- function prologue and apilogue. This is not possible without
- ACCUMULATE_OUTGOING_ARGS. */
+ function prologue and epilogue. This is not possible without
+ ACCUMULATE_OUTGOING_ARGS.
+
+ If stack probes are required, the space used for large function
+ arguments on the stack must also be probed, so enable
+ -maccumulate-outgoing-args so this happens in the prologue. */
#define ACCUMULATE_OUTGOING_ARGS \
- (TARGET_ACCUMULATE_OUTGOING_ARGS || TARGET_64BIT_MS_ABI)
+ ((TARGET_ACCUMULATE_OUTGOING_ARGS && optimize_function_for_speed_p (cfun)) \
+ || TARGET_STACK_PROBE || TARGET_64BIT_MS_ABI)
/* If defined, a C expression whose value is nonzero when we want to use PUSH
instructions to pass outgoing arguments. */
Index: config/i386/x86-tune.def
===================================================================
--- config/i386/x86-tune.def (revision 203387)
+++ config/i386/x86-tune.def (working copy)
@@ -18,15 +18,13 @@ a copy of the GCC Runtime Library Except
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
-/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
- negatively, so enabling for Generic64 seems like good code size
- tradeoff. We can't enable it for 32bit generic because it does not
- work well with PPro base chips. */
+/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
- Some chips, like 486 and Pentium have problems with these sequences. */
+ Some chips, like 486 and Pentium, work faster with separate load
+ and push instructions. */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
| m_GENERIC)
@@ -210,6 +208,16 @@ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OP
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
m_COREI7 | m_BDVER | m_SLM | m_GENERIC)
+/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
+ split. */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
+ ~(m_COREI7 | m_GENERIC))
+
+/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
+ split. */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
+ ~(m_COREI7 | m_BDVER | m_GENERIC))
+
/* Use packed single precision instructions where posisble. I.e. movups instead
of movupd. */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
@@ -398,3 +406,24 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CM
fp converts to destination register. */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
m_SLM)
+
+/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
+ arguments in prologue/epilogue instead of separately for each call
+ by push/pop instructions.
+ This increases code size by about 5% in 32bit mode, less so in 64bit mode
+ because parameters are passed in registers. It is a considerable
+ win for targets without a stack engine that prevents multiple push operations
+ from happening in parallel.
+
+ FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer,
+ Bobcat and Generic. This is because disabling it causes large
+ regression on mgrid due to IRA limitation leading to unnecessary
+ use of the frame pointer in 32bit mode. */
+DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
+ m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
+ such as fsqrt, fprem, fsin, fcos, fsincos etc.
+ Should be enabled for all targets that always have a coprocessor. */
+DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
+ ~(m_386 | m_486))
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c (revision 203380)
+++ config/i386/i386.c (working copy)
@@ -1898,18 +1898,6 @@ static unsigned int initial_ix86_arch_fe
~m_386,
};
-static const unsigned int x86_accumulate_outgoing_args
- = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_arch_always_fancy_math_387
- = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_load
- = m_COREI7 | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_store
- = m_COREI7 | m_BDVER | m_GENERIC;
-
/* In case the average insn count for single function invocation is
lower than this constant, emit fast (but longer) prologue and
epilogue code. */
@@ -2920,7 +2908,7 @@ static void
ix86_option_override_internal (bool main_args_p)
{
int i;
- unsigned int ix86_arch_mask, ix86_tune_mask;
+ unsigned int ix86_arch_mask;
const bool ix86_tune_specified = (ix86_tune_string != NULL);
const char *prefix;
const char *suffix;
@@ -3673,7 +3661,7 @@ ix86_option_override_internal (bool main
/* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
since the insns won't need emulation. */
- if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
+ if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
target_flags &= ~MASK_NO_FANCY_MATH_387;
/* Likewise, if the target doesn't have a 387, or we've specified
@@ -3805,8 +3793,7 @@ ix86_option_override_internal (bool main
gcc_unreachable ();
}
- ix86_tune_mask = 1u << ix86_tune;
- if ((x86_accumulate_outgoing_args & ix86_tune_mask)
+ if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
&& !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
&& !optimize_size)
target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
@@ -3946,10 +3933,10 @@ ix86_option_override_internal (bool main
if (flag_expensive_optimizations
&& !(target_flags_explicit & MASK_VZEROUPPER))
target_flags |= MASK_VZEROUPPER;
- if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
+ if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
&& !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
- if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
+ if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
&& !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
/* Enable 128-bit AVX instruction generation
More information about the Gcc-patches
mailing list