[RFA/RFC] Stack clash mitigation patch 07/08 - V3
- From: Jeff Law <law at redhat dot com>
- To: gcc-patches <gcc-patches at gcc dot gnu dot org>
- Date: Sun, 30 Jul 2017 23:45:27 -0600
- Subject: [RFA/RFC] Stack clash mitigation patch 07/08 - V3
This patch adds aarch64 support for stack clash mitigation.
Changes since V2:

- Uses the new params to control the guard size and probing interval.
  Uses a 64k guard by default and assumes the caller uses at most 1k
  of the guard for its outgoing args.
- Simplified the code slightly by taking advantage of "can't happen"
  cases.
- Probes the last allocated word in the outgoing argument space if more
  than 1k of outgoing argument space was allocated.
- Defines STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE to force the
  generic code to probe the last word of the dynamic area when
  necessary.
- The epilogue no longer assumes IP0/IP1 contain usable values when
  -fstack-clash-protection is active.
- Adds some aarch64-specific tests for problems found by Wilco and
  myself.
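For anyone who wants to experiment, something like the following
should exercise the new code (a sketch: the guard-size param spelling
is what the new tests use; the probe-interval spelling is inferred
from PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL, and both take byte
values in this version):

  gcc -O2 -S -fstack-clash-protection \
      --param stack-clash-protection-guard-size=4096 \
      --param stack-clash-protection-probe-interval=4096 test.c

Without the params, aarch64 uses the 64k guard set in
aarch64_override_options_internal and the default probing interval.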
Jeff
* config/aarch64/aarch64.c (aarch64_output_probe_stack_range): Handle
-fstack-clash-protection probing too.
(aarch64_allocate_and_probe_stack_space): New function.
(aarch64_expand_prologue): Assert we never have both an initial
adjustment and callee save adjustment. Dump as needed when
-fstack-clash-protection is enabled. For an initial or final
adjustment that is large enough, call
aarch64_allocate_and_probe_stack_space. For final adjustment
that is large enough, also emit a trailing probe.
(aarch64_expand_epilogue): Do not assume the prologue has
set IP1_REGNUM or IP0_REGNUM when -fstack-clash-protection is
active.
* config/aarch64/aarch64.h
(STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE): Define.
* testsuite/gcc.target/aarch64/stack-check-12.c: New test.
* testsuite/gcc.target/aarch64/stack-check-13.c: New test.
* testsuite/gcc.target/aarch64/stack-check-14.c: New test.
* testsuite/gcc.target/aarch64/stack-check-15.c: New test.
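To make the expected output concrete: with a 4k guard and probing
interval (the values the new tests use), the inline allocation/probe
sequence should look roughly like the following, probing the last
word of each chunk at PROBE_INTERVAL - 8:

  sub     sp, sp, #4096
  str     xzr, [sp, 4088]
  sub     sp, sp, #4096
  str     xzr, [sp, 4088]
  ...

This is the pattern stack-check-12.c scans for below.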
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0a8b40a..7d57299 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2830,6 +2830,9 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
char loop_lab[32];
rtx xops[2];
+ if (flag_stack_clash_protection)
+ reg1 = stack_pointer_rtx;
+
ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
/* Loop. */
@@ -2841,7 +2844,14 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
output_asm_insn ("sub\t%0, %0, %1", xops);
/* Probe at TEST_ADDR. */
- output_asm_insn ("str\txzr, [%0]", xops);
+ if (flag_stack_clash_protection)
+ {
+ gcc_assert (xops[0] == stack_pointer_rtx);
+ xops[1] = GEN_INT (PROBE_INTERVAL - 8);
+ output_asm_insn ("str\txzr, [%0, %1]", xops);
+ }
+ else
+ output_asm_insn ("str\txzr, [%0]", xops);
/* Test if TEST_ADDR == LAST_ADDR. */
xops[1] = reg2;
@@ -3605,6 +3615,96 @@ aarch64_set_handled_components (sbitmap components)
cfun->machine->reg_is_wrapped_separately[regno] = true;
}
+/* Allocate and probe SIZE bytes of stack space, using the register
+   numbered SCRATCHREG as a scratch register.  */
+
+static void
+aarch64_allocate_and_probe_stack_space (int scratchreg, HOST_WIDE_INT size)
+{
+ HOST_WIDE_INT probe_interval
+ = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
+ HOST_WIDE_INT guard_size
+ = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+ HOST_WIDE_INT guard_used_by_caller = 1024;
+
+ /* SIZE should be large enough to require probing here; i.e., it
+ must be at least GUARD_SIZE - GUARD_USED_BY_CALLER.
+
+ We can allocate GUARD_SIZE - GUARD_USED_BY_CALLER as a single chunk
+ without any probing. */
+ gcc_assert (size >= guard_size - guard_used_by_caller);
+ aarch64_sub_sp (scratchreg, guard_size - guard_used_by_caller, true);
+ size -= (guard_size - guard_used_by_caller);
+
+ HOST_WIDE_INT rounded_size = size & -probe_interval;
+ HOST_WIDE_INT residual = size - rounded_size;
+
+ /* We can handle a small number of allocations/probes inline. Otherwise
+ punt to a loop. */
+ if (rounded_size && rounded_size <= 4 * probe_interval)
+ {
+ /* We don't use aarch64_sub_sp here because we don't want to
+ repeatedly load SCRATCHREG. */
+ rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
+ if (probe_interval > ARITH_FACTOR)
+ emit_move_insn (scratch_rtx, GEN_INT (-probe_interval));
+ else
+ scratch_rtx = GEN_INT (-probe_interval);
+
+ for (HOST_WIDE_INT i = 0; i < rounded_size; i += probe_interval)
+ {
+ rtx_insn *insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
+ scratch_rtx));
+ add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
+
+ if (probe_interval > ARITH_FACTOR)
+ {
+ RTX_FRAME_RELATED_P (insn) = 1;
+ rtx adj = plus_constant (Pmode, stack_pointer_rtx, rounded_size);
+ add_reg_note (insn, REG_CFA_ADJUST_CFA,
+ gen_rtx_SET (stack_pointer_rtx, adj));
+ }
+
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ (probe_interval
+ - GET_MODE_SIZE (word_mode))));
+ emit_insn (gen_blockage ());
+ }
+ dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
+ }
+ else if (rounded_size)
+ {
+ /* Compute the ending address. */
+ rtx temp = gen_rtx_REG (word_mode, scratchreg);
+ emit_move_insn (temp, GEN_INT (-rounded_size));
+ emit_insn (gen_add3_insn (temp, stack_pointer_rtx, temp));
+
+ /* This allocates and probes the stack.
+
+ It almost certainly does not update CFIs correctly.
+
+ It also probes at a 4k interval regardless of the value of
+ PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL. */
+ emit_insn (gen_probe_stack_range (temp, temp, temp));
+ emit_insn (gen_blockage ());
+ dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
+ }
+ else
+ dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
+
+
+ /* Handle any residuals.
+ Note that any residual must be probed. */
+ if (residual)
+ {
+ aarch64_sub_sp (scratchreg, residual, true);
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ (residual - GET_MODE_SIZE (word_mode))));
+ emit_insn (gen_blockage ());
+ }
+ return;
+}
+
/* AArch64 stack frames generated by this compiler look like:
+-------------------------------+
@@ -3686,7 +3786,65 @@ aarch64_expand_prologue (void)
aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
}
- aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
+ /* We do not fully protect aarch64 against stack clash style attacks
+ as doing so would be prohibitively expensive, with diminishing utility
+ over time as newer compilers are deployed.
+
+ We assume the guard is at least 64k. Furthermore, we assume that
+ the caller has not pushed the stack pointer more than 1k into
+ the guard. A caller that pushes the stack pointer more than 1k
+ into the guard is considered invalid.
+
+ Note that the caller's ability to push the stack pointer into the
+ guard is a function of the number and size of outgoing arguments and/or
+ dynamic stack allocations due to the mandatory save of the link register
+ in the caller's frame.
+
+ With those assumptions the callee can allocate up to 63k of stack
+ space without probing.
+
+ When probing is needed, we emit a probe at the start of the prologue
+ and every PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL bytes thereafter.
+
+ We have to track how much space has been allocated, but we do not
+ track stores into the stack as implicit probes except for the
+ fp/lr store. */
+ HOST_WIDE_INT guard_size
+ = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+ HOST_WIDE_INT guard_used_by_caller = 1024;
+ if (flag_stack_clash_protection)
+ {
+ if (frame_size == 0)
+ dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
+ else if (initial_adjust < guard_size - guard_used_by_caller
+ && final_adjust < guard_size - guard_used_by_caller)
+ dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+ else
+ {
+ /* This probes into the red zone, which is sub-optimal, but we
+ allow it to avoid the async signal race. Valgrind may complain
+ about this probe. */
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ - GET_MODE_SIZE (word_mode)));
+ }
+ }
+
+ /* In theory we should never have both an initial adjustment
+ and a callee save adjustment. Verify that is the case since the
+ code below does not handle it for -fstack-clash-protection. */
+ gcc_assert (initial_adjust == 0 || callee_adjust == 0);
+
+ /* We have to track the offset to the last probe in the stack so that
+ we know when to emit probes for stack clash protection.
+
+ If this function needs probes (most do not), then the code above
+ already emitted one. Thus we can consider that the last probe into
+ the stack was at offset zero.
+ if (flag_stack_clash_protection
+ && initial_adjust >= guard_size - guard_used_by_caller)
+ aarch64_allocate_and_probe_stack_space (IP0_REGNUM, initial_adjust);
+ else
+ aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
if (callee_adjust != 0)
aarch64_push_regs (reg1, reg2, callee_adjust);
@@ -3707,7 +3865,26 @@ aarch64_expand_prologue (void)
callee_adjust != 0 || frame_pointer_needed);
aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
callee_adjust != 0 || frame_pointer_needed);
- aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+
+ if (flag_stack_clash_protection && final_adjust != 0)
+ {
+ if (final_adjust >= guard_size - guard_used_by_caller)
+ aarch64_allocate_and_probe_stack_space (IP1_REGNUM, final_adjust);
+ else
+ aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+
+ /* We must probe if the final adjustment is larger than the guard
+ that is assumed used by the caller. */
+ if (final_adjust >= guard_used_by_caller)
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "Stack clash aarch64 large outgoing arg, probing\n");
+ emit_stack_probe (stack_pointer_rtx);
+ }
+ }
+ else
+ aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
}
/* Return TRUE if we can use a simple_return insn.
@@ -3773,7 +3950,11 @@ aarch64_expand_epilogue (bool for_sibcall)
RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
}
else
- aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
+ aarch64_add_sp (IP1_REGNUM, final_adjust,
+ /* A stack clash protection prologue may not have
+ left IP1_REGNUM in a usable state. */
+ (flag_stack_clash_protection
+ || df_regs_ever_live_p (IP1_REGNUM)));
aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
callee_adjust != 0, &cfi_ops);
@@ -3796,7 +3977,11 @@ aarch64_expand_epilogue (bool for_sibcall)
cfi_ops = NULL;
}
- aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
+ /* A stack clash protection prologue may not have left IP0_REGNUM
+ in a usable state. */
+ aarch64_add_sp (IP0_REGNUM, initial_adjust,
+ (flag_stack_clash_protection
+ || df_regs_ever_live_p (IP0_REGNUM)));
if (cfi_ops)
{
@@ -8939,6 +9124,12 @@ aarch64_override_options_internal (struct gcc_options *opts)
&& opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
opts->x_flag_prefetch_loop_arrays = 1;
+ /* We assume the stack guard region is 64k. */
+ maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
+ 65536,
+ opts->x_param_values,
+ global_options_set.x_param_values);
+
aarch64_override_options_after_change_1 (opts);
}
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 106cf3a..d2d84dd 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -969,4 +969,12 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
extern tree aarch64_fp16_type_node;
extern tree aarch64_fp16_ptr_type_node;
+/* The prologue assumes that the most recent probe is within 1k of the
+ incoming stack pointer. We may need an additional probe for
+ dynamically allocated stack space. */
+#define STACK_CLASH_PROTECTION_NEEDS_FINAL_DYNAMIC_PROBE(RESIDUAL) \
+ ((RESIDUAL) == CONST0_RTX (Pmode) \
+ || GET_CODE (RESIDUAL) != CONST_INT \
+ || INTVAL (RESIDUAL) >= 1024)
+
#endif /* GCC_AARCH64_H */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-12.c b/gcc/testsuite/gcc.target/aarch64/stack-check-12.c
new file mode 100644
index 0000000..9ad4133
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-12.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=4096" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+extern void arf (unsigned long int *, unsigned long int *);
+void
+frob ()
+{
+ unsigned long int num[1000];
+ unsigned long int den[1000];
+ arf (den, num);
+}
+
+/* This verifies that the scheduler did not break the dependencies
+ by adjusting the offsets within the probe and that the scheduler
+ did not reorder around the stack probes. */
+/* { dg-final { scan-assembler-times "sub\\tsp, sp, #4096\\n\\tstr\\txzr, .sp, 4088." 3 } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-13.c b/gcc/testsuite/gcc.target/aarch64/stack-check-13.c
new file mode 100644
index 0000000..2404777
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-13.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=4096" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#define ARG32(X) X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
+#define ARG192(X) ARG32(X),ARG32(X),ARG32(X),ARG32(X),ARG32(X),ARG32(X)
+void out1(ARG192(__int128));
+int t1(int);
+
+int t3(int x)
+{
+ if (x < 1000)
+ return t1 (x) + 1;
+
+ out1 (ARG192(1));
+ return 0;
+}
+
+
+
+/* This test creates a large (> 1k) outgoing argument area that needs
+ to be probed. We don't test the exact size of the space or the
+ exact offset to make the test a little less sensitive to trivial
+ output changes. */
+/* { dg-final { scan-assembler-times "sub\\tsp, sp, #....\\n\\tstr\\txzr, \\\[sp" 1 } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-14.c b/gcc/testsuite/gcc.target/aarch64/stack-check-14.c
new file mode 100644
index 0000000..38ff5b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-14.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=4096" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+int t1(int);
+
+int t2(int x)
+{
+ char *p = __builtin_alloca (4050);
+ x = t1 (x);
+ return p[x];
+}
+
+
+/* This test has a constant sized alloca that is smaller than the
+ probe interval. But it actually requires two probes instead
+ of one because of the optimistic assumptions we made in the
+ aarch64 prologue code with respect to probing state.
+
+ The form can change quite a bit so we just check for two
+ probes without looking at the actual address. */
+/* { dg-final { scan-assembler-times "str\\txzr," 2 } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-15.c b/gcc/testsuite/gcc.target/aarch64/stack-check-15.c
new file mode 100644
index 0000000..302bbe8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-15.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=4096" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+int t1(int);
+
+int t2(int x)
+{
+ char *p = __builtin_alloca (x);
+ x = t1 (x);
+ return p[x];
+}
+
+
+/* This test has a variable sized alloca. It requires 3 probes:
+ one in the loop, one for the residual, and one at the end of the
+ alloca area.
+
+ The form can change quite a bit so we just check for three
+ probes without looking at the actual address. */
+/* { dg-final { scan-assembler-times "str\\txzr," 3 } } */
+
+
+
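To make the sizing logic in aarch64_allocate_and_probe_stack_space
concrete, consider a hypothetical function (not part of the testsuite)
with roughly 128k of locals under the default 64k guard and 1k caller
allowance; ignoring the callee-save portion of the frame, the
allocation breaks up as annotated:

/* Hypothetical example.  With size == 131072: the first 64512 bytes
   (guard size minus the caller's 1k) are allocated with no probe; the
   remaining 66560 bytes round down to a single probed 65536-byte
   chunk; the 1024-byte residual is then allocated and probed.  */
extern void sink (char *);
void
big_frame (void)
{
  char buf[131072];
  sink (buf);
}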