This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[split] Add CFI ops, fix call/return prediction, other changes
- From: Ian Lance Taylor <iant at google dot com>
- To: gcc-patches at gcc dot gnu dot org
- Cc: rth at redhat dot com, rridge at csclub dot uwaterloo dot ca (Ross Ridge)
- Date: Tue, 29 Sep 2009 23:48:12 -0700
- Subject: [split] Add CFI ops, fix call/return prediction, other changes
This patch for -fsplit-stack does the following:
* Adds CFI pseudo-ops to the assembly __morestack routine, and changes
it to use a frame pointer. This is from Richard Henderson as
modified by me.
- Uses a new TCB address--the last transactional memory word--rather than
stealing the stack protector slot in the TCB.
* Wraps pthread_create so that the TCB entry is updated.
* Implements proper call/return prediction (I hope) along the lines
suggested by Ross Ridge. Sadly this required me to steal back %ecx
which Richard had saved.
* Adds register fusage to the call to __morestack.
This now works in both 32-bit and 64-bit mode.
Backtraces through the __morestack routine are now much nicer, but are
not perfect. They get confused by the extra return instruction. In
32-bit mode __morestack is effectively a stdcall function but we
aren't telling gdb that.
Bootstrapped on x86_64-unknown-linux-gnu, committed to split branch.
Ian
gcc/:
2009-09-29 Ian Lance Taylor <iant@google.com>
* gcc.c (STACK_SPLIT_SPEC): Define.
(LINK_COMMAND_SPEC): Use STACK_SPLIT_SPEC.
* config/i386/i386.c (ix86_supports_split_stack): Remove test of
flag_stack_protect.
(ix86_expand_split_stack_prologue): Set call insn function usage
to include register parameters. Emit return instruction after
call to __morestack.
* config/i386/i386.md (split_stack_check_small): Use unsigned
comparison.
(split_stack_check_large): Likewise.
(split_stack_return): New insn.
* config/i386/linux.h (TARGET_THREAD_SPLIT_STACK_OFFSET): Change
value.
* config/i386/linux64.h (TARGET_THREAD_SPLIT_STACK_OFFSET):
Likewise.
libgcc/:
2009-09-29 Ian Lance Taylor <iant@google.com>
Richard Henderson <rth@redhat.com>
* generic-morestack.c: #include <pthread.h>.
(__generic_morestack): Make hidden.
(__generic_releasestack): Likewise.
(allocate_segment): Only use sync_val_compare_and_swap if it is
available.
(struct pthread_create_args): Define.
(stack_split_initialize_thread): New static function.
(__wrap_pthread_create): New function.
* config/i386/morestack.S (__morestack): Add CFI pseudo-ops. Use
a frame pointer. Change call/return sequence. Remove __PIC__
code. Change TCB offsets.
(__stack_split_initialize): Rename from init. Make global.
Change TCB offsets.
Index: libgcc/generic-morestack.c
===================================================================
--- libgcc/generic-morestack.c (revision 152204)
+++ libgcc/generic-morestack.c (working copy)
@@ -38,6 +38,7 @@ see the files COPYING3 and COPYING.RUNTI
#include <assert.h>
#include <errno.h>
+#include <pthread.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mman.h>
@@ -54,11 +55,11 @@ see the files COPYING3 and COPYING.RUNTI
extern void *
__generic_morestack (size_t *frame_size, void *old_stack, size_t param_size)
- __attribute__ ((no_split_stack, flatten));
+ __attribute__ ((no_split_stack, flatten, visibility ("hidden")));
extern void *
__generic_releasestack (void)
- __attribute__ ((no_split_stack, flatten));
+ __attribute__ ((no_split_stack, flatten, visibility ("hidden")));
/* When we allocate a stack segment we put this header at the
start. */
@@ -154,7 +155,15 @@ allocate_segment (size_t frame_size)
unsigned int p;
pagesize = getpagesize ();
+
+#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
p = __sync_val_compare_and_swap (&static_pagesize, 0, pagesize);
+#else
+ /* Just hope this assignment is atomic. */
+ static_pagesize = pagesize;
+ p = 0;
+#endif
+
/* FIXME: I'm not sure this assert should be in the released
code. */
assert (p == 0 || p == pagesize);
@@ -346,4 +355,62 @@ __generic_releasestack (void)
return current->old_stack;
}
+/* Pass information from the pthread_create wrapper to
+ stack_split_initialize_thread. */
+
+struct pthread_create_args
+{
+ void *(*start_routine) (void *);
+ void *arg;
+};
+
+/* Initialize a thread. This is called via pthread_create. It calls
+ a target dependent function to set up any required stack guard. */
+
+static void* stack_split_initialize_thread (void *)
+ __attribute__ ((no_split_stack));
+
+extern void __stack_split_initialize (void)
+ __attribute__ ((visibility ("hidden")));
+
+static void *
+stack_split_initialize_thread (void *varg)
+{
+ struct pthread_create_args *args = (struct pthread_create_args *) varg;
+ void *(*start_routine) (void *);
+ void *arg;
+
+ __stack_split_initialize ();
+ start_routine = args->start_routine;
+ arg = args->arg;
+ free (args);
+ return (*start_routine) (arg);
+}
+
+/* This function wraps calls to pthread_create to make sure that the
+ stack guard is initialized for new threads. FIXME: This hack will
+ not be necessary if glibc supports -fsplit-stack directly. */
+
+int __wrap_pthread_create (pthread_t *, const pthread_attr_t *,
+ void *(*start_routine) (void *), void *)
+ __attribute__ ((visibility ("hidden")));
+
+extern int __real_pthread_create (pthread_t *, const pthread_attr_t *,
+ void *(*start_routine) (void *), void *)
+ __attribute__ ((weak));
+
+int
+__wrap_pthread_create (pthread_t *tid, const pthread_attr_t *attr,
+ void *(*start_routine) (void *), void *arg)
+{
+ struct pthread_create_args* args;
+
+ args = malloc (sizeof (struct pthread_create_args));
+ if (args == NULL)
+ return EAGAIN;
+ args->start_routine = start_routine;
+ args->arg = arg;
+ return __real_pthread_create (tid, attr, stack_split_initialize_thread, args);
+}
+
#endif /* !defined (inhibit_libc) */
Index: libgcc/config/i386/morestack.S
===================================================================
--- libgcc/config/i386/morestack.S (revision 152198)
+++ libgcc/config/i386/morestack.S (working copy)
@@ -55,6 +55,21 @@
# __generic_releasestack to retrieve the old stack pointer and
# release the newly allocated stack.
+# We do a little dance so that the processor's call/return return
+# address prediction works out. The compiler arranges for the caller
+# to look like this:
+# call __generic_morestack
+# ret
+# L:
+# // carry on with function
+# After we allocate more stack, we call L, which is in our caller.
+# When that returns (to the predicted instruction), we release the
+# stack segment and reset the stack pointer. We then return to the
+# predicted instruction, namely the ret instruction immediately after
+# the call to __generic_morestack. That then returns to the caller of
+# the original caller.
+
+
# void *__generic_releasestack (void);
.global __morestack
@@ -64,9 +79,17 @@
#endif
__morestack:
+ .cfi_startproc
#ifndef __x86_64__
+ # Set up a normal backtrace.
+ pushl %ebp
+ .cfi_def_cfa_offset 8
+ .cfi_offset %ebp, -8
+ movl %esp, %ebp
+ .cfi_def_cfa_register %ebp
+
# In 32-bit mode the parameters are pushed on the stack. The
# argument size is pushed then the new stack frame size is
# pushed.
@@ -79,35 +102,33 @@ __morestack:
pushl %eax
pushl %edx
- pushl 16(%esp) # The size of the parameters.
- leal 28(%esp),%eax # The caller's parameters.
+ pushl 12(%ebp) # The size of the parameters.
+ leal 20(%ebp),%eax # Address of caller's parameters.
pushl %eax
- leal 20(%esp),%eax # The address of the new frame size.
+ leal 8(%ebp),%eax # The address of the new frame size.
pushl %eax
-#ifdef __PIC__
- call __generic_morestack@PLT
-#else
+ # Note that %esp is exactly 32 bytes below the CFA -- perfect for
+ # a 16-byte aligned stack. That said, we still ought to compile
+ # generic-morestack.c with -mpreferred-stack-boundary=2. FIXME.
call __generic_morestack
-#endif
- movl %eax,%ecx # Copy the new stack pointer.
- subl 24(%esp),%ecx # The end of the stack space.
- addl $256,%ecx # Back off 256 bytes.
+ movl %eax,%esp # Switch to the new stack.
+ subl 8(%ebp),%eax # The end of the stack space.
+ addl $256,%eax # Back off 256 bytes.
# FIXME: The offset must match
# TARGET_THREAD_SPLIT_STACK_OFFSET in
# gcc/config/i386/linux.h.
- movl %ecx,%gs:0x14 # Save the new stack boundary.
+ movl %eax,%gs:0x30 # Save the new stack boundary.
- movl 16(%esp),%eax # Restore registers.
- movl 12(%esp),%edx
+ movl -4(%ebp),%eax # Restore registers.
+ movl -8(%ebp),%edx
- # We do a little dance here so that the processor's
- # call/return prediction works out right.
+ movl 4(%ebp),%ecx # Increment the return address
+ inc %ecx # to skip the ret instruction;
+ # see above.
- movl 20(%esp),%ecx # Where we are in the caller.
- movl %eax,%esp # Switch to the new stack.
call *%ecx # Call our caller!
# The caller will return here, as predicted.
@@ -118,26 +139,33 @@ __morestack:
pushl %eax
pushl %edx
-#ifdef __PIC__
- call __generic_releasestack@PLT
-#else
call __generic_releasestack
-#endif
-
- movl %eax,%ecx # Hold onto old stack pointer.
popl %edx # Restore possible return value
popl %eax
- movl %ecx,%esp # Switch back to the old stack.
-
- ret # Return to caller's caller.
+ # Switch back to the old stack via copy back from %ebp.
+ leave
+ .cfi_restore %ebp
+ .cfi_def_cfa %esp, 4
+ ret $8 # Return to caller, which will
+ # immediately return. Pop
+ # arguments as we go.
#else /* defined(__x86_64__) */
+ # Set up a normal backtrace.
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+
# In 64-bit mode the new stack frame size is passed in r10
# and the argument size is passed in r11.
+ pushq %r10 # Save new frame size.
+
# In 64-bit mode the registers %rdi, %rsi, %rdx, %rcx, %r8,
# and %r9 may be used for parameters.
@@ -148,39 +176,39 @@ __morestack:
pushq %r8
pushq %r9
- pushq %r10 # The new frame size.
- leaq 0(%rsp),%rdi # Its address.
- leaq 64(%rsp),%rsi # The caller's parameters.
+ # An extra push to maintain 16-byte stack alignment. After this,
+ # we'll be 80 bytes below the (presumably aligned) CFA.
+ pushq %rdi
+
+ leaq -8(%rbp),%rdi # Address of new frame size.
+ leaq 24(%rbp),%rsi # The caller's parameters.
movq %r11,%rdx # The size of the parameters.
-#ifdef __PIC__
- call __generic_morestack@PLT
-#else
call __generic_morestack
-#endif
- movq %rax,%rcx # Copy the new stack pointer.
- subq 0(%rsp),%rcx # The end of the stack space.
- addq $256,%rcx # Back off 256 bytes.
+ popq %rdi # Restore registers
+ popq %r9
+ popq %r8
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+
+ popq %r10 # Reload modified frame size
+ movq %rax,%rsp # Switch to the new stack.
+ subq %r10,%rax # The end of the stack space.
+ addq $256,%rax # Back off 256 bytes.
# FIXME: The offset must match
# TARGET_THREAD_SPLIT_STACK_OFFSET in
# gcc/config/i386/linux64.h.
- movq %rcx,%fs:0x28 # Save the new stack boundary.
+ movq %rax,%fs:0x70 # Save the new stack boundary.
- movq 48(%rsp),%rdi # Restore registers.
- movq 40(%rsp),%rsi
- movq 32(%rsp),%rdx
- movq 24(%rsp),%rcx
- movq 16(%rsp),%r8
- movq 8(%rsp),%r9
+ movq 8(%rbp),%r10 # Increment the return address
+ incq %r10 # to skip the ret instruction;
+ # see above.
- # We do a little dance here so that the processor's
- # call/return prediction works out right.
-
- movq 56(%rsp),%rcx # Where we are in the caller.
- movq %rax,%rsp # Switch to the new stack.
- call *%rcx # Call our caller!
+ call *%r10 # Call our caller!
# The caller will return here, as predicted.
@@ -190,53 +218,53 @@ __morestack:
pushq %rax
pushq %rdx
-#ifdef __PIC__
- call __generic_releasestack@PLT
-#else
call __generic_releasestack
-#endif
-
- movq %rax,%rcx # Hold onto old stack pointer.
popq %rdx # Restore possible return value
popq %rax
- movq %rcx,%rsp # Switch back to the old stack.
-
- ret # Return to caller's caller.
+ # Switch back to the old stack via copy back from %rbp.
+ leave
+ .cfi_restore %rbp
+ .cfi_def_cfa %rsp, 8
+ ret # Return to caller, which will
+ # immediately return.
#endif /* defined(__x86_64__) */
+ .cfi_endproc
#ifdef __ELF__
.size __morestack, . - __morestack
#endif
-# Initialize the stack test value when the program starts. We don't
-# know how large the main stack is, so we guess conservatively. We
-# might be able to use getrlimit here.
+# Initialize the stack test value when the program starts or when a
+# new thread starts. We don't know how large the main stack is, so we
+# guess conservatively. We might be able to use getrlimit here.
+
+ .global __stack_split_initialize
#ifdef __ELF__
.type init, @function
#endif
-init:
+__stack_split_initialize:
#ifndef __x86_64__
leal -16000(%esp),%eax # We should have at least 16K.
- movl %eax,%gs:0x14
+ movl %eax,%gs:0x30
ret
#else /* defined(__x86_64__) */
leaq -16000(%rsp),%rax # We should have at least 16K.
- movq %rax,%fs:0x28
+ movq %rax,%fs:0x70
ret
#endif /* defined(__x86_64__) */
#ifdef __ELF__
- .size init, . - init
+ .size __stack_split_initialize, . - __stack_split_initialize
#endif
# Make init a high priority constructor. FIXME: This is ELF
@@ -246,10 +274,10 @@ init:
#ifndef __x86_64__
.align 4
- .long init
+ .long __stack_split_initialize
#else
.align 8
- .quad init
+ .quad __stack_split_initialize
#endif
#ifdef __ELF__
Index: gcc/gcc.c
===================================================================
--- gcc/gcc.c (revision 152193)
+++ gcc/gcc.c (working copy)
@@ -664,6 +664,16 @@ proper position among the other output f
#define MFLIB_SPEC "%{fmudflap|fmudflapth: -export-dynamic}"
#endif
+/* When using -fsplit-stack we need to wrap pthread_create, in order
+ to initialize the stack guard. We always use wrapping, rather than
+ shared library ordering, and we keep the wrapper function in
+ libgcc. This is not yet a real spec, though it could become one;
+ it is currently just stuffed into LINK_SPEC. FIXME: This wrapping
+ only works with GNU ld and gold. FIXME: This is incompatible with
+ -fmudflap when linking statically, which wants to do its own
+ wrapping. */
+#define STACK_SPLIT_SPEC " %{fsplit-stack: --wrap=pthread_create}"
+
/* config.h can define LIBGCC_SPEC to override how and when libgcc.a is
included. */
#ifndef LIBGCC_SPEC
@@ -770,7 +780,8 @@ proper position among the other output f
%(linker) %l " LINK_PIE_SPEC "%X %{o*} %{A} %{d} %{e*} %{m} %{N} %{n} %{r}\
%{s} %{t} %{u*} %{x} %{z} %{Z} %{!A:%{!nostdlib:%{!nostartfiles:%S}}}\
%{static:} %{L*} %(mfwrap) %(link_libgcc) %o\
- %{fopenmp|ftree-parallelize-loops=*:%:include(libgomp.spec)%(link_gomp)} %(mflib)\
+ %{fopenmp|ftree-parallelize-loops=*:%:include(libgomp.spec)%(link_gomp)}\
+ %(mflib) " STACK_SPLIT_SPEC "\
%{fprofile-arcs|fprofile-generate*|coverage:-lgcov}\
%{!nostdlib:%{!nodefaultlibs:%(link_ssp) %(link_gcc_c_sequence)}}\
%{!A:%{!nostdlib:%{!nostartfiles:%E}}} %{T*} }}}}}}"
Index: gcc/ChangeLog.split
===================================================================
--- gcc/ChangeLog.split (revision 152199)
+++ gcc/ChangeLog.split (working copy)
@@ -24,7 +24,7 @@
Define.
* config/i386/i386-protos.h (ix86_expand_split_stack_prologue):
Declare.
- (ix86_ewxpand_call): Change return type in declaration.
+ (ix86_expand_call): Change return type in declaration.
* common.opt (fsplit-stack): New option.
* opts.c (decode_options): Set flag_split_stack.
Index: gcc/config/i386/linux.h
===================================================================
--- gcc/config/i386/linux.h (revision 152195)
+++ gcc/config/i386/linux.h (working copy)
@@ -214,6 +214,6 @@ along with GCC; see the file COPYING3.
/* i386 glibc provides __stack_chk_guard in %gs:0x14. */
#define TARGET_THREAD_SSP_OFFSET 0x14
-/* For now -fsplit-stack uses the same field. */
-#define TARGET_THREAD_SPLIT_STACK_OFFSET TARGET_THREAD_SSP_OFFSET
+/* We steal the last transactional memory word. */
+#define TARGET_THREAD_SPLIT_STACK_OFFSET 0x30
#endif
Index: gcc/config/i386/i386.md
===================================================================
--- gcc/config/i386/i386.md (revision 152195)
+++ gcc/config/i386/i386.md (working copy)
@@ -15927,7 +15927,7 @@
emit_insn (gen_split_stack_compare_small_32 (ssp_offset));
ix86_compare_op0 = gen_rtx_REG (CCmode, FLAGS_REG);
ix86_compare_op1 = GEN_INT (0);
- ix86_expand_branch (GT, operands[0]);
+ ix86_expand_branch (GTU, operands[0]);
JUMP_LABEL (get_last_insn ()) = operands[0];
DONE;
})
@@ -15989,7 +15989,7 @@
emit_insn (gen_split_stack_compare_large_32 (operands[0], ssp_offset));
ix86_compare_op0 = gen_rtx_REG (CCmode, FLAGS_REG);
ix86_compare_op1 = GEN_INT (0);
- ix86_expand_branch (GT, operands[1]);
+ ix86_expand_branch (GTU, operands[1]);
JUMP_LABEL (get_last_insn ()) = operands[1];
DONE;
})
@@ -16023,6 +16023,17 @@
(set_attr "length_immediate" "4")
(set_attr "memory" "none")
(set_attr "imm_disp" "false")])
+
+;; In order to support the call/return predictor, we use a return
+;; instruction which the middle-end doesn't see.
+(define_insn "split_stack_return"
+ [(unspec_volatile [(const_int 0)] UNSPEC_STACK_CHECK)]
+ ""
+ "ret"
+ [(set_attr "length" "1")
+ (set_attr "atom_unit" "jeu")
+ (set_attr "length_immediate" "0")
+ (set_attr "modrm" "0")])
(define_expand "ffs_cmove"
[(set (match_dup 2) (const_int -1))
Index: gcc/config/i386/linux64.h
===================================================================
--- gcc/config/i386/linux64.h (revision 152195)
+++ gcc/config/i386/linux64.h (working copy)
@@ -118,6 +118,6 @@ see the files COPYING3 and COPYING.RUNTI
x86_64 glibc provides it in %fs:0x28. */
#define TARGET_THREAD_SSP_OFFSET (TARGET_64BIT ? 0x28 : 0x14)
-/* For now -fsplit-stack uses the same field. */
-#define TARGET_THREAD_SPLIT_STACK_OFFSET TARGET_THREAD_SSP_OFFSET
+/* We steal the last transactional memory word. */
+#define TARGET_THREAD_SPLIT_STACK_OFFSET (TARGET_64BIT ? 0x70 : 0x30)
#endif
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c (revision 152195)
+++ gcc/config/i386/i386.c (working copy)
@@ -7799,12 +7799,6 @@ ix86_supports_split_stack (void)
#ifndef TARGET_THREAD_SPLIT_STACK_OFFSET
error ("%<-fsplit-stack%> currently only supported on GNU/Linux");
ret = false;
-#else
- if (flag_stack_protect)
- {
- error ("%<-fstack-protector%> is not compatible with %<-fsplit-stack%>");
- ret = false;
- }
#endif
return ret;
@@ -9139,7 +9133,7 @@ ix86_expand_split_stack_prologue (void)
tree decl;
bool is_fastcall;
int regparm, args_size;
- rtx label, jump_insn, allocate_rtx, call_insn;
+ rtx label, jump_insn, allocate_rtx, call_insn, call_fusage;
gcc_assert (flag_split_stack && reload_completed);
@@ -9222,6 +9216,7 @@ ix86_expand_split_stack_prologue (void)
r11. */
allocate_rtx = GEN_INT (allocate);
args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
+ call_fusage = NULL_RTX;
if (!TARGET_64BIT)
{
/* In order to give __morestack a scratch register, we save %ecx
@@ -9236,14 +9231,29 @@ ix86_expand_split_stack_prologue (void)
}
else
{
- emit_move_insn (gen_rtx_REG (Pmode, R10_REG), allocate_rtx);
- emit_move_insn (gen_rtx_REG (Pmode, R11_REG), GEN_INT (args_size));
+ rtx reg;
+
+ reg = gen_rtx_REG (Pmode, R10_REG);
+ emit_move_insn (reg, allocate_rtx);
+ use_reg (&call_fusage, reg);
+ reg = gen_rtx_REG (Pmode, R11_REG);
+ emit_move_insn (reg, GEN_INT (args_size));
+ use_reg (&call_fusage, reg);
}
if (split_stack_fn == NULL_RTX)
split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, split_stack_fn),
GEN_INT (UNITS_PER_WORD), constm1_rtx,
NULL_RTX, 0);
+ add_function_usage_to (call_insn, call_fusage);
+
+ /* In order to make call/return prediction work right, we now need
+ to execute a return instruction. See
+ libgcc/config/i386/morestack.S for the details on how this works.
+ However, for flow purposes gcc must not see this as a return
+ instruction--we need control flow to continue at the subsequent
+ label. Therefore, we use an unspec. */
+ emit_insn (gen_split_stack_return ());
if (!TARGET_64BIT && (is_fastcall || regparm > 2))
{