This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH][RFA/RFC] Stack clash mitigation patch 06/08 - V3
- From: Jeff Law <law at redhat dot com>
- To: gcc-patches <gcc-patches at gcc dot gnu dot org>
- Date: Sun, 30 Jul 2017 23:45:16 -0600
- Subject: [PATCH][RFA/RFC] Stack clash mitigation patch 06/08 - V3
- Authentication-results: sourceware.org; auth=none
- Authentication-results: ext-mx04.extmail.prod.ext.phx2.redhat.com; dmarc=none (p=none dis=none) header.from=redhat.com
- Authentication-results: ext-mx04.extmail.prod.ext.phx2.redhat.com; spf=fail smtp.mailfrom=law at redhat dot com
- Dmarc-filter: OpenDMARC Filter v1.3.2 mx1.redhat.com C937BDE465
This contains the PPC bits for stack clash protection.
Changes since V2:
Exploits inlined/unrolled probes and rotated loops for the dynamic area.
Some trivial simplifications. It also uses the new params to control
if probes are needed and how often to probe.
Jeff
* config/rs6000/rs6000-protos.h (output_probe_stack_range): Update
prototype for new argument.
* config/rs6000/rs6000.c (wrap_frame_mem): New function extracted
from rs6000_emit_allocate_stack.
(handle_residual): New function.
(rs6000_emit_probe_stack_range_stack_clash): New function.
(rs6000_emit_allocate_stack): Use wrap_frame_mem.
Call rs6000_emit_probe_stack_range_stack_clash as needed.
(rs6000_emit_probe_stack_range): Add additional argument
to call to gen_probe_stack_range{si,di}.
(output_probe_stack_range): New.
(output_probe_stack_range_1): Renamed from output_probe_stack_range.
(output_probe_stack_range_stack_clash): New.
(rs6000_emit_prologue): Emit notes into dump file as requested.
* rs6000.md (allocate_stack): Handle -fstack-clash-protection
(probe_stack_range<P:mode>): Operand 0 is now early-clobbered.
Add additional operand and pass it to output_probe_stack_range.
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index aeec9b2..451c442 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -134,7 +134,7 @@ extern void rs6000_emit_sISEL (machine_mode, rtx[]);
extern void rs6000_emit_sCOND (machine_mode, rtx[]);
extern void rs6000_emit_cbranch (machine_mode, rtx[]);
extern char * output_cbranch (rtx, const char *, int, rtx_insn *);
-extern const char * output_probe_stack_range (rtx, rtx);
+extern const char * output_probe_stack_range (rtx, rtx, rtx);
extern void rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg);
extern bool rs6000_emit_set_const (rtx, rtx);
extern int rs6000_emit_cmove (rtx, rtx, rtx, rtx);
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index aa70e30..7936451 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -25618,6 +25618,211 @@ rs6000_emit_stack_tie (rtx fp, bool hard_frame_needed)
emit_insn (gen_stack_tie (gen_rtx_PARALLEL (VOIDmode, p)));
}
+/* INSN allocates SIZE bytes on the stack (STACK_REG) using a store
+ with update style insn.
+
+ Set INSN's alias set/attributes and add suitable flags and notes
+ for the dwarf CFI machinery. */
+static void
+wrap_frame_mem (rtx insn, rtx stack_reg, HOST_WIDE_INT size)
+{
+ rtx par = PATTERN (insn);
+ gcc_assert (GET_CODE (par) == PARALLEL);
+ rtx set = XVECEXP (par, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ rtx mem = SET_DEST (set);
+ gcc_assert (MEM_P (mem));
+ MEM_NOTRAP_P (mem) = 1;
+ set_mem_alias_set (mem, get_frame_alias_set ());
+
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+ gen_rtx_SET (stack_reg, gen_rtx_PLUS (Pmode, stack_reg,
+ GEN_INT (-size))));
+}
+
+/* Allocate ROUNDED_SIZE - ORIG_SIZE bytes on the stack, storing
+ ORIG_SP into *sp after the allocation.
+
+ ROUNDED_SIZE will be a multiple of
+ STACK_CLASH_PROTECTION_PROBE_INTERVAL and ORIG_SIZE - ROUNDED_SIZE
+ will be less than STACK_CLASH_PROTECTION_PROBE_INTERVAL.
+
+ Return the insn that allocates the residual space. */
+static rtx_insn *
+handle_residual (HOST_WIDE_INT orig_size,
+ HOST_WIDE_INT rounded_size,
+ rtx orig_sp)
+{
+ /* Allocate (and implicitly probe) any residual space. */
+ HOST_WIDE_INT residual = orig_size - rounded_size;
+ rtx_insn *insn;
+
+ if (Pmode == SImode)
+ insn = emit_insn (gen_movsi_update_stack (stack_pointer_rtx,
+ stack_pointer_rtx,
+ GEN_INT (-residual),
+ orig_sp));
+ else
+ insn = emit_insn (gen_movdi_di_update_stack (stack_pointer_rtx,
+ stack_pointer_rtx,
+ GEN_INT (-residual),
+ orig_sp));
+ wrap_frame_mem (insn, stack_pointer_rtx, residual);
+ return insn;
+}
+
+/* Allocate ORIG_SIZE bytes on the stack and probe the newly
+ allocated space every STACK_CLASH_PROTECTION_PROBE_INTERVAL bytes.
+
+ COPY_REG, if non-null, should contain a copy of the original
+ stack pointer modified by COPY_OFF at exit from this function.
+
+ This is subtly different than the Ada probing in that it tries hard to
+ prevent attacks that jump the stack guard. Thus it is never allowed to
+ allocate more than STACK_CLASH_PROTECTION_PROBE_INTERVAL bytes of stack
+ space without a suitable probe. */
+static rtx_insn *
+rs6000_emit_probe_stack_range_stack_clash (HOST_WIDE_INT orig_size,
+ rtx copy_reg, int copy_off)
+{
+ rtx orig_sp = copy_reg;
+
+ HOST_WIDE_INT probe_interval
+ = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
+
+ /* Round the size down to a multiple of PROBE_INTERVAL. */
+ HOST_WIDE_INT rounded_size = orig_size & -probe_interval;
+
+ /* If explicitly requested,
+ or the rounded size is not the same as the original size
+ or the the rounded size is greater than a page,
+ then we will need a copy of the original stack pointer. */
+ if (rounded_size != orig_size
+ || rounded_size > probe_interval
+ || copy_reg)
+ {
+ /* If the caller requested a copy of the incoming stack pointer,
+ then ORIG_SP == COPY_REG and will not be NULL.
+
+ If no copy was requested, then we use r0 to hold the copy. */
+ if (orig_sp == NULL_RTX)
+ orig_sp = gen_rtx_REG (Pmode, 0);
+ emit_move_insn (orig_sp, stack_pointer_rtx);
+ }
+
+ /* There's three cases here.
+
+ One is a single probe which is the most common and most efficiently
+ implemented as it does not have to have a copy of the original
+ stack pointer if there are no residuals.
+
+ Second is unrolled allocation/probes which we use if there's just
+ a few of them. It needs to save the original stack pointer into a
+ temporary for use as a source register in the allocation/probe.
+
+ Last is a loop. This is the most uncommon case and least efficient. */
+ rtx_insn *insn;
+ rtx_insn *retval = NULL;
+ if (rounded_size == probe_interval)
+ {
+ if (Pmode == SImode)
+ insn = emit_insn (gen_movsi_update_stack (stack_pointer_rtx,
+ stack_pointer_rtx,
+ GEN_INT (-probe_interval),
+ stack_pointer_rtx));
+ else
+ insn = emit_insn (gen_movdi_di_update_stack (stack_pointer_rtx,
+ stack_pointer_rtx,
+ GEN_INT (-probe_interval),
+ stack_pointer_rtx));
+ wrap_frame_mem (insn, stack_pointer_rtx, probe_interval);
+
+ /* If this was the only allocation, then we can return the allocating
+ insn. */
+ if (rounded_size == orig_size)
+ retval = insn;
+
+ dump_stack_clash_frame_info (PROBE_INLINE, rounded_size != orig_size);
+ }
+ else if (rounded_size <= 8 * probe_interval)
+ {
+ /* The ABI requires using the store with update insns to allocate
+ space and store the outer frame into the stack.
+
+ So we save the current stack pointer into a temporary, then
+ emit the store-with-update insns to store the saved stack pointer
+ into the right location in each new page. */
+ rtx probe_int = GEN_INT (-probe_interval);
+ for (int i = 0; i < rounded_size; i += probe_interval)
+ {
+ if (Pmode == SImode)
+ insn = emit_insn (gen_movsi_update_stack (stack_pointer_rtx,
+ stack_pointer_rtx,
+ probe_int, orig_sp));
+ else
+ insn = emit_insn (gen_movdi_di_update_stack (stack_pointer_rtx,
+ stack_pointer_rtx,
+ probe_int, orig_sp));
+ wrap_frame_mem (insn, stack_pointer_rtx, probe_interval);
+ }
+ retval = NULL;
+ dump_stack_clash_frame_info (PROBE_INLINE, rounded_size != orig_size);
+ }
+ else
+ {
+ /* Compute the ending address. */
+ rtx end_addr
+ = copy_reg ? gen_rtx_REG (Pmode, 0) : gen_rtx_REG (Pmode, 12);
+ rtx rs = GEN_INT (-rounded_size);
+ rtx insn;
+ if (add_operand (rs, Pmode))
+ insn = emit_insn (gen_add3_insn (end_addr, stack_pointer_rtx, rs));
+ else
+ {
+ emit_move_insn (end_addr, GEN_INT (-rounded_size));
+ insn = emit_insn (gen_add3_insn (end_addr, end_addr,
+ stack_pointer_rtx));
+ add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+ gen_rtx_SET (end_addr,
+ gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+ rs)));
+ }
+ RTX_FRAME_RELATED_P (insn) = 1;
+
+ /* Emit the loop. */
+ if (TARGET_64BIT)
+ insn = emit_insn (gen_probe_stack_rangedi (stack_pointer_rtx,
+ stack_pointer_rtx, orig_sp,
+ end_addr));
+ else
+ insn = emit_insn (gen_probe_stack_rangesi (stack_pointer_rtx,
+ stack_pointer_rtx, orig_sp,
+ end_addr));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+ gen_rtx_SET (stack_pointer_rtx, end_addr));
+ retval = NULL;
+ dump_stack_clash_frame_info (PROBE_LOOP, rounded_size != orig_size);
+ }
+
+ if (orig_size != rounded_size)
+ {
+ insn = handle_residual (orig_size, rounded_size, orig_sp);
+
+ /* If the residual was the only allocation, then we can return the
+ allocating insn. */
+ if (rounded_size == 0)
+ retval = insn;
+ }
+
+ /* If we asked for a copy with an offset, then we still need add in tnhe
+ offset. */
+ if (copy_reg && copy_off)
+ emit_insn (gen_add3_insn (copy_reg, copy_reg, GEN_INT (copy_off)));
+ return retval;
+}
+
/* Emit the correct code for allocating stack space, as insns.
If COPY_REG, make sure a copy of the old frame is left there.
The generated code may use hard register 0 as a temporary. */
@@ -25629,7 +25834,6 @@ rs6000_emit_allocate_stack (HOST_WIDE_INT size, rtx copy_reg, int copy_off)
rtx stack_reg = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
rtx tmp_reg = gen_rtx_REG (Pmode, 0);
rtx todec = gen_int_mode (-size, Pmode);
- rtx par, set, mem;
if (INTVAL (todec) != -size)
{
@@ -25669,6 +25873,16 @@ rs6000_emit_allocate_stack (HOST_WIDE_INT size, rtx copy_reg, int copy_off)
warning (0, "stack limit expression is not supported");
}
+ if (flag_stack_clash_protection)
+ {
+ if (size < PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE))
+ dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+ else
+ return rs6000_emit_probe_stack_range_stack_clash (size,
+ copy_reg,
+ copy_off);
+ }
+
if (copy_reg)
{
if (copy_off != 0)
@@ -25692,23 +25906,12 @@ rs6000_emit_allocate_stack (HOST_WIDE_INT size, rtx copy_reg, int copy_off)
todec, stack_reg)
: gen_movdi_di_update_stack (stack_reg, stack_reg,
todec, stack_reg));
+
/* Since we didn't use gen_frame_mem to generate the MEM, grab
it now and set the alias set/attributes. The above gen_*_update
calls will generate a PARALLEL with the MEM set being the first
operation. */
- par = PATTERN (insn);
- gcc_assert (GET_CODE (par) == PARALLEL);
- set = XVECEXP (par, 0, 0);
- gcc_assert (GET_CODE (set) == SET);
- mem = SET_DEST (set);
- gcc_assert (MEM_P (mem));
- MEM_NOTRAP_P (mem) = 1;
- set_mem_alias_set (mem, get_frame_alias_set ());
-
- RTX_FRAME_RELATED_P (insn) = 1;
- add_reg_note (insn, REG_FRAME_RELATED_EXPR,
- gen_rtx_SET (stack_reg, gen_rtx_PLUS (Pmode, stack_reg,
- GEN_INT (-size))));
+ wrap_frame_mem (insn, stack_reg, size);
return insn;
}
@@ -25790,9 +25993,9 @@ rs6000_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
until it is equal to ROUNDED_SIZE. */
if (TARGET_64BIT)
- emit_insn (gen_probe_stack_rangedi (r12, r12, r0));
+ emit_insn (gen_probe_stack_rangedi (r12, r12, stack_pointer_rtx, r0));
else
- emit_insn (gen_probe_stack_rangesi (r12, r12, r0));
+ emit_insn (gen_probe_stack_rangesi (r12, r12, stack_pointer_rtx, r0));
/* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
@@ -25806,8 +26009,8 @@ rs6000_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
/* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
absolute addresses. */
-const char *
-output_probe_stack_range (rtx reg1, rtx reg2)
+static const char *
+output_probe_stack_range_1 (rtx reg1, rtx reg2)
{
static int labelno = 0;
char loop_lab[32];
@@ -25842,6 +26045,65 @@ output_probe_stack_range (rtx reg1, rtx reg2)
return "";
}
+
+/* Probe a range of stack addresses from REG1 to REG3 inclusive. These are
+ absolute addresses. REG2 contains the outer stack pointer that must be
+ stored into *sp at each allocation.
+
+ This is subtly different than the Ada probing above in that it tries hard
+ to prevent attacks that jump the stack guard. Thus, it is never allowed
+ to allocate more than PROBE_INTERVAL bytes of stack space without a
+ suitable probe. */
+
+static const char *
+output_probe_stack_range_stack_clash (rtx reg1, rtx reg2, rtx reg3)
+{
+ static int labelno = 0;
+ char loop_lab[32];
+ rtx xops[3];
+
+ HOST_WIDE_INT probe_interval
+ = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
+
+ ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
+
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+ /* This allocates and probes. */
+ xops[0] = reg1;
+ xops[1] = reg2;
+ xops[2] = GEN_INT (-probe_interval);
+ if (TARGET_64BIT)
+ output_asm_insn ("stdu %1,%2(%0)", xops);
+ else
+ output_asm_insn ("stwu %1,%2(%0)", xops);
+
+ /* Jump to LOOP_LAB if TEST_ADDR != LAST_ADDR. */
+ xops[0] = reg1;
+ xops[1] = reg3;
+ if (TARGET_64BIT)
+ output_asm_insn ("cmpd 0,%0,%1", xops);
+ else
+ output_asm_insn ("cmpw 0,%0,%1", xops);
+
+ fputs ("\tbne 0,", asm_out_file);
+ assemble_name_raw (asm_out_file, loop_lab);
+ fputc ('\n', asm_out_file);
+
+ return "";
+}
+
+/* Wrapper around the output_probe_stack_range routines. */
+const char *
+output_probe_stack_range (rtx reg1, rtx reg2, rtx reg3)
+{
+ if (flag_stack_clash_protection)
+ return output_probe_stack_range_stack_clash (reg1, reg2, reg3);
+ else
+ return output_probe_stack_range_1 (reg1, reg3);
+}
+
+
/* Add to 'insn' a note which is PATTERN (INSN) but with REG replaced
with (plus:P (reg 1) VAL), and with REG2 replaced with REPL2 if REG2
is not NULL. It would be nice if dwarf2out_frame_debug_expr could
@@ -27457,6 +27719,13 @@ rs6000_emit_prologue (void)
}
}
+ /* If we are emitting stack probes, but allocate no stack, then
+ just note that in the dump file. */
+ if (flag_stack_clash_protection
+ && dump_file
+ && !info->push_p)
+ dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
+
/* Update stack and set back pointer unless this is V.4,
for which it was done previously. */
if (!WORLD_SAVE_P (info) && info->push_p
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index f78dbf9..9262c82 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -10262,7 +10262,7 @@
(define_expand "allocate_stack"
[(set (match_operand 0 "gpc_reg_operand" "")
- (minus (reg 1) (match_operand 1 "reg_or_short_operand" "")))
+ (minus (reg 1) (match_operand 1 "reg_or_cint_operand" "")))
(set (reg 1)
(minus (reg 1) (match_dup 1)))]
""
@@ -10272,6 +10272,15 @@
rtx neg_op0;
rtx insn, par, set, mem;
+ /* By allowing reg_or_cint_operand as the predicate we can get
+ better code for stack-clash-protection because we do not lose
+ size information. But the rest of the code expects the operand
+ to be reg_or_short_operand. If it isn't, then force it into
+ a register. */
+ rtx orig_op1 = operands[1];
+ if (!reg_or_short_operand (operands[1], Pmode))
+ operands[1] = force_reg (Pmode, operands[1]);
+
emit_move_insn (chain, stack_bot);
/* Check stack bounds if necessary. */
@@ -10284,6 +10293,55 @@
emit_insn (gen_cond_trap (LTU, available, operands[1], const0_rtx));
}
+ /* Allocate and probe if requested.
+ This may look similar to the loop we use for prologue allocations,
+ but it is critically different. For the former we know the loop
+ will iterate, but do not know that generally here. The former
+ uses that knowledge to rotate the loop. Combining them would be
+ possible with some performance cost. */
+ if (flag_stack_clash_protection)
+ {
+ rtx rounded_size, last_addr, residual;
+ HOST_WIDE_INT probe_interval;
+ compute_stack_clash_protection_loop_data (&rounded_size, &last_addr,
+ &residual, &probe_interval,
+ orig_op1);
+
+ /* We do occasionally get in here with constant sizes, we might
+ as well do a reasonable job when we obviously can. */
+ if (rounded_size != CONST0_RTX (Pmode))
+ {
+ rtx loop_lab, end_loop;
+ bool rotated = GET_CODE (rounded_size) == CONST_INT;
+
+ emit_stack_clash_protection_probe_loop_start (&loop_lab, &end_loop,
+ last_addr, rotated);
+
+ if (Pmode == SImode)
+ emit_insn (gen_movsi_update_stack (stack_pointer_rtx,
+ stack_pointer_rtx,
+ GEN_INT (-probe_interval),
+ chain));
+ else
+ emit_insn (gen_movdi_di_update_stack (stack_pointer_rtx,
+ stack_pointer_rtx,
+ GEN_INT (-probe_interval),
+ chain));
+ emit_stack_clash_protection_probe_loop_end (loop_lab, end_loop,
+ last_addr, rotated);
+ }
+
+ /* Now handle residuals. We just have to set operands[1] correctly
+ and let the rest of the expander run. */
+ if (REG_P (residual) || GET_CODE (residual) == CONST_INT)
+ operands[1] = residual;
+ else
+ {
+ operands[1] = gen_reg_rtx (Pmode);
+ force_operand (residual, operands[1]);
+ }
+ }
+
if (GET_CODE (operands[1]) != CONST_INT
|| INTVAL (operands[1]) < -32767
|| INTVAL (operands[1]) > 32768)
@@ -11422,12 +11480,13 @@
(set_attr "length" "4")])
(define_insn "probe_stack_range<P:mode>"
- [(set (match_operand:P 0 "register_operand" "=r")
+ [(set (match_operand:P 0 "register_operand" "=&r")
(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
- (match_operand:P 2 "register_operand" "r")]
+ (match_operand:P 2 "register_operand" "r")
+ (match_operand:P 3 "register_operand" "r")]
UNSPECV_PROBE_STACK_RANGE))]
""
- "* return output_probe_stack_range (operands[0], operands[2]);"
+ "* return output_probe_stack_range (operands[0], operands[2], operands[3]);"
[(set_attr "type" "three")])
;; Compare insns are next. Note that the RS/6000 has two types of compares,