


[PATCH][RFA/RFC] Stack clash mitigation patch 07/08 V2


So this patch has changed considerably since V1 as well.

First, we no longer track the bulk of the register stores in the
prologue.  Those may be separately shrink-wrapped and thus not executed
on all paths, so they are not candidates for implicit probes.

Second, per the discussions we've had on-list, we're less aggressive at
probing.  We assume the caller has not pushed the stack pointer more
than 1kbyte into the stack guard, so a callee that allocates less than
3kbytes needs no probes.
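
For concreteness (assuming the minimum 4k guard page, which the patch
does not restate), the arithmetic is:

    4096 bytes of guard
  - 1024 bytes the caller may already be into it
  ----------------------------------------------
    3072 bytes (~3k) the callee can allocate unprobed

so a sub-3k frame is still guaranteed to fault inside the guard on its
first access rather than skip past it.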

Third, the implicit probe tracking is simplified.  I'm exceedingly happy
to find out that we can never have both a nonzero initial_adjust and a
nonzero callee_adjust at the same time.  That's a significant help.

We still use the save of lr/fp as an implicit probe.
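
That works because the push is a pre-index store, e.g. (size
illustrative):

        stp     x29, x30, [sp, #-16]!

which writes at the new, lowest address of the just-allocated space, so
it touches the stack exactly where an explicit probe would.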

This ought to be much more efficient than the prior version.
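
For reference, a frame large enough to need the probe loop should now
produce a prologue roughly like this (register numbers, sizes and the
label are illustrative, not actual compiler output):

        str     xzr, [sp, #-8]          // probe at entry, before any allocation
        stp     x29, x30, [sp, #-16]!   // callee saves; implicit probe at new sp
        mov     x29, sp
        mov     x17, #-65536            // scratch = -rounded_size (64k assumed)
        add     x17, sp, x17            // final value of sp
.LPSRL0:
        sub     sp, sp, #4096           // allocate one PROBE_INTERVAL
        str     xzr, [sp, #4088]        // probe near the bottom of it
        cmp     sp, x17
        b.ne    .LPSRL0

Any residual below PROBE_INTERVAL is then allocated and, if needed,
probed separately.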


Hopefully this is closer to something the aarch64 maintainers are
comfortable with.

--
	* config/aarch64/aarch64.c (aarch64_output_probe_stack_range): Handle
	-fstack-clash-protection probing too.
	(aarch64_allocate_and_probe_stack_space): New function.
	(aarch64_expand_prologue): Assert we never have both an initial
	adjustment and callee save adjustment.  Track distance between SP and
	most recent probe.  Use aarch64_allocate_and_probe_stack_space
	when -fstack-clash-protection is enabled rather than just adjusting sp.
	Dump actions via dump_stack_clash_frame_info.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0a8b40a..8764d62 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2830,6 +2830,9 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   char loop_lab[32];
   rtx xops[2];
 
+  if (flag_stack_clash_protection)
+    reg1 = stack_pointer_rtx;
+
   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
 
   /* Loop.  */
@@ -2841,7 +2844,14 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   output_asm_insn ("sub\t%0, %0, %1", xops);
 
   /* Probe at TEST_ADDR.  */
-  output_asm_insn ("str\txzr, [%0]", xops);
+  if (flag_stack_clash_protection)
+    {
+      gcc_assert (xops[0] == stack_pointer_rtx);
+      xops[1] = GEN_INT (PROBE_INTERVAL - 8);
+      output_asm_insn ("str\txzr, [%0, %1]", xops);
+    }
+  else
+    output_asm_insn ("str\txzr, [%0]", xops);
 
   /* Test if TEST_ADDR == LAST_ADDR.  */
   xops[1] = reg2;
@@ -3605,6 +3617,68 @@ aarch64_set_handled_components (sbitmap components)
       cfun->machine->reg_is_wrapped_separately[regno] = true;
 }
 
+/* Allocate SIZE bytes of stack space using SCRATCH_REG as a scratch
+   register.
+
+   LAST_PROBE_OFFSET contains the offset between the stack pointer and
+   the last known probe.  As LAST_PROBE_OFFSET crosses PROBE_INTERVAL
+   emit a probe and adjust LAST_PROBE_OFFSET.  */
+static void
+aarch64_allocate_and_probe_stack_space (int scratchreg, HOST_WIDE_INT size,
+					HOST_WIDE_INT *last_probe_offset)
+{
+  rtx temp = gen_rtx_REG (word_mode, scratchreg);
+
+  HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
+  HOST_WIDE_INT residual = size - rounded_size;
+
+  /* We can handle a small number of allocations/probes inline.  Otherwise
+     punt to a loop.  */
+  if (rounded_size && rounded_size <= 4 * PROBE_INTERVAL)
+    {
+      for (HOST_WIDE_INT i = 0; i < rounded_size; i += PROBE_INTERVAL)
+	{
+	  /* We should never need a scratch register for this adjustment.  */
+	  aarch64_sub_sp (-1, PROBE_INTERVAL, true);
+
+	  /* We just allocated PROBE_INTERVAL bytes.  Thus, a probe is
+	     mandatory.  Note that LAST_PROBE_OFFSET does not change here.  */
+	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+					   (PROBE_INTERVAL
+					    - GET_MODE_SIZE (word_mode))));
+	}
+      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
+    }
+  else if (rounded_size)
+    {
+      /* Compute the ending address.  */
+      emit_move_insn (temp, GEN_INT (-rounded_size));
+      emit_insn (gen_add3_insn (temp, stack_pointer_rtx, temp));
+
+      /* This allocates and probes the stack.  Like the inline version above
+	 it does not need to change LAST_PROBE_OFFSET.
+
+	 It almost certainly does not update CFIs correctly.  */
+      emit_insn (gen_probe_stack_range (temp, temp, temp));
+      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
+    }
+
+  /* Handle any residuals.  */
+  if (residual)
+    {
+      aarch64_sub_sp (-1, residual, true);
+      *last_probe_offset += residual;
+      if (*last_probe_offset >= PROBE_INTERVAL)
+	{
+	  *last_probe_offset -= PROBE_INTERVAL;
+	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+					   (residual
+					    - GET_MODE_SIZE (word_mode))));
+	}
+    }
+  return;
+}
+
 /* AArch64 stack frames generated by this compiler look like:
 
 	+-------------------------------+
@@ -3686,10 +3760,59 @@ aarch64_expand_prologue (void)
 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
     }
 
-  aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
+  /* We do not fully protect aarch64 against stack clash style attacks
+     as doing so would be prohibitively expensive.
+
+     We assume that a caller can not push the stack pointer more than 1k
+     into the guard, which allows the current function to allocate up to
+     3k of total space without any probing.
+
+     In the relatively rare case where we are going to emit probes to
+     protect against stack-clash, we start the function with a probe
+     and probe every PROBE_INTERVAL bytes after that.
+
+     We have to track how much space has been allocated, but we do not
+     track stores into the stack as implicit probes.  */
+  if (flag_stack_clash_protection)
+    {
+      if (frame_size == 0)
+	dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
+      else if (frame_size < 3 * 1024)
+	dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+      else
+	{
+	  /* This probes into the red zone, which is sub-optimal, but we
+	     allow it to avoid the async signal race.  */
+	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+					   - GET_MODE_SIZE (word_mode)));
+	}
+    }
+
+  /* In theory we should never have both an initial adjustment
+     and a callee save adjustment.  Verify that is the case since the
+     code below does not handle it for -fstack-clash-protection.  */
+  gcc_assert (initial_adjust == 0 || callee_adjust == 0);
+
+  /* We have to track the offset to the last probe in the stack so that
+     we know when to emit probes for stack clash protection.
+
+     If this function needs probes (most do not), then the code above
+     already emitted one.  Thus we can consider the last probe into the
+     stack was at offset zero.  */
+  HOST_WIDE_INT last_probe_offset = 0;
+  if (flag_stack_clash_protection && initial_adjust != 0)
+    aarch64_allocate_and_probe_stack_space (IP0_REGNUM, initial_adjust,
+					    &last_probe_offset);
+  else
+    aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
 
   if (callee_adjust != 0)
-    aarch64_push_regs (reg1, reg2, callee_adjust);
+    {
+      aarch64_push_regs (reg1, reg2, callee_adjust);
+
+      /* We just wrote *sp, so we can trivially adjust LAST_PROBE_OFFSET.  */
+      last_probe_offset = 0;
+    }
 
   if (frame_pointer_needed)
     {
@@ -3707,7 +3830,12 @@ aarch64_expand_prologue (void)
 			     callee_adjust != 0 || frame_pointer_needed);
   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
 			     callee_adjust != 0 || frame_pointer_needed);
-  aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+
+  if (flag_stack_clash_protection && final_adjust != 0)
+    aarch64_allocate_and_probe_stack_space (IP1_REGNUM, final_adjust,
+					    &last_probe_offset);
+  else
+    aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
 }
 
 /* Return TRUE if we can use a simple_return insn.
