
[PATCH][RFA/RFC] Stack clash mitigation patch 06/08 - V3


This contains the PPC bits for stack clash protection.

Changes since V2:

It exploits inlined/unrolled probes and rotated loops for the dynamic area,
along with some trivial simplifications.  It also uses the new params to
control whether probes are needed and how often to probe.
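
As a rough illustration (not part of the patch or its testsuite; the helper
sink() and the sizes are made up, and which strategy fires depends on the
guard size and probe interval params), these are the kinds of frames that
exercise the different strategies under -fstack-clash-protection:

  extern void sink (void *);

  void
  small_frame (void)
  {
    /* Below the guard size: no probes are emitted.  */
    char buf[64];
    sink (buf);
  }

  void
  large_frame (void)
  {
    /* A static frame spanning a few probe intervals: allocated with
       inlined/unrolled store-with-update probes.  */
    char buf[16384];
    sink (buf);
  }

  void
  dynamic_frame (int n)
  {
    /* A dynamic allocation: goes through the allocate_stack expander
       and its probing loop.  */
    sink (__builtin_alloca (n));
  }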

Jeff
	* config/rs6000/rs6000-protos.h (output_probe_stack_range): Update
	prototype for new argument.
	* config/rs6000/rs6000.c (wrap_frame_mem): New function extracted
	from rs6000_emit_allocate_stack.
	(handle_residual): New function. 
	(rs6000_emit_probe_stack_range_stack_clash): New function.
	(rs6000_emit_allocate_stack): Use wrap_frame_mem.
	Call rs6000_emit_probe_stack_range_stack_clash as needed.
	(rs6000_emit_probe_stack_range): Add additional argument
	to call to gen_probe_stack_range{si,di}.
	(output_probe_stack_range): New.
	(output_probe_stack_range_1):  Renamed from output_probe_stack_range.
	(output_probe_stack_range_stack_clash): New.
	(rs6000_emit_prologue): Emit notes into dump file as requested.
	* config/rs6000/rs6000.md (allocate_stack): Handle
	-fstack-clash-protection.
	(probe_stack_range<P:mode>): Operand 0 is now early-clobbered.
	Add additional operand and pass it to output_probe_stack_range.
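
For reference, on a 64-bit target the runtime loop emitted by the new
output_probe_stack_range_stack_clash boils down to something like this
(the 4k probe interval and register assignments are just illustrative,
taken from the common no-copy case where r0 holds the saved outer stack
pointer and r12 the end address; the 32-bit variant uses stwu/cmpw):

.LPSRL0:
	stdu 0,-4096(1)		# allocate one probe interval and store the
				# outer sp at the new stack bottom (the probe)
	cmpd 0,1,12		# reached the end address?
	bne 0,.LPSRL0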


diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index aeec9b2..451c442 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -134,7 +134,7 @@ extern void rs6000_emit_sISEL (machine_mode, rtx[]);
 extern void rs6000_emit_sCOND (machine_mode, rtx[]);
 extern void rs6000_emit_cbranch (machine_mode, rtx[]);
 extern char * output_cbranch (rtx, const char *, int, rtx_insn *);
-extern const char * output_probe_stack_range (rtx, rtx);
+extern const char * output_probe_stack_range (rtx, rtx, rtx);
 extern void rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg);
 extern bool rs6000_emit_set_const (rtx, rtx);
 extern int rs6000_emit_cmove (rtx, rtx, rtx, rtx);
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index aa70e30..7936451 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -25618,6 +25618,211 @@ rs6000_emit_stack_tie (rtx fp, bool hard_frame_needed)
   emit_insn (gen_stack_tie (gen_rtx_PARALLEL (VOIDmode, p)));
 }
 
+/* INSN allocates SIZE bytes on the stack (STACK_REG) using a store
+   with update style insn.
+
+   Set INSN's alias set/attributes and add suitable flags and notes
+   for the dwarf CFI machinery.  */
+static void
+wrap_frame_mem (rtx insn, rtx stack_reg, HOST_WIDE_INT size)
+{
+  rtx par = PATTERN (insn);
+  gcc_assert (GET_CODE (par) == PARALLEL);
+  rtx set = XVECEXP (par, 0, 0);
+  gcc_assert (GET_CODE (set) == SET);
+  rtx mem = SET_DEST (set);
+  gcc_assert (MEM_P (mem));
+  MEM_NOTRAP_P (mem) = 1;
+  set_mem_alias_set (mem, get_frame_alias_set ());
+
+  RTX_FRAME_RELATED_P (insn) = 1;
+  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+		gen_rtx_SET (stack_reg, gen_rtx_PLUS (Pmode, stack_reg,
+						      GEN_INT (-size))));
+}
+
+/* Allocate ORIG_SIZE - ROUNDED_SIZE bytes on the stack, storing
+   ORIG_SP into *sp after the allocation.
+
+   ROUNDED_SIZE will be a multiple of
+   STACK_CLASH_PROTECTION_PROBE_INTERVAL and ORIG_SIZE - ROUNDED_SIZE
+   will be less than STACK_CLASH_PROTECTION_PROBE_INTERVAL.
+
+   Return the insn that allocates the residual space.  */
+static rtx_insn *
+handle_residual (HOST_WIDE_INT orig_size,
+		 HOST_WIDE_INT rounded_size,
+		 rtx orig_sp)
+{
+  /* Allocate (and implicitly probe) any residual space.   */
+  HOST_WIDE_INT residual = orig_size - rounded_size;
+  rtx_insn *insn;
+
+  if (Pmode == SImode)
+    insn = emit_insn (gen_movsi_update_stack (stack_pointer_rtx,
+					      stack_pointer_rtx,
+					      GEN_INT (-residual),
+					      orig_sp));
+  else
+    insn = emit_insn (gen_movdi_di_update_stack (stack_pointer_rtx,
+						 stack_pointer_rtx,
+						 GEN_INT (-residual),
+						 orig_sp));
+  wrap_frame_mem (insn, stack_pointer_rtx, residual);
+  return insn;
+}
+
+/* Allocate ORIG_SIZE bytes on the stack and probe the newly
+   allocated space every STACK_CLASH_PROTECTION_PROBE_INTERVAL bytes.
+
+   COPY_REG, if non-null, should contain a copy of the original
+   stack pointer modified by COPY_OFF at exit from this function.
+
+   This is subtly different than the Ada probing in that it tries hard to
+   prevent attacks that jump the stack guard.  Thus it is never allowed to
+   allocate more than STACK_CLASH_PROTECTION_PROBE_INTERVAL bytes of stack
+   space without a suitable probe.  */
+static rtx_insn *
+rs6000_emit_probe_stack_range_stack_clash (HOST_WIDE_INT orig_size,
+					   rtx copy_reg, int copy_off)
+{
+  rtx orig_sp = copy_reg;
+
+  HOST_WIDE_INT probe_interval
+    = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
+
+  /* Round the size down to a multiple of PROBE_INTERVAL.  */
+  HOST_WIDE_INT rounded_size = orig_size & -probe_interval;
+
+  /* If explicitly requested,
+       or the rounded size is not the same as the original size,
+       or the rounded size is greater than the probe interval,
+     then we will need a copy of the original stack pointer.  */
+  if (rounded_size != orig_size
+      || rounded_size > probe_interval
+      || copy_reg)
+    {
+      /* If the caller requested a copy of the incoming stack pointer,
+	 then ORIG_SP == COPY_REG and will not be NULL.
+
+	 If no copy was requested, then we use r0 to hold the copy.  */
+      if (orig_sp == NULL_RTX)
+	orig_sp = gen_rtx_REG (Pmode, 0);
+      emit_move_insn (orig_sp, stack_pointer_rtx);
+    }
+
+  /* There are three cases here.
+
+     One is a single probe which is the most common and most efficiently
+     implemented as it does not have to have a copy of the original
+     stack pointer if there are no residuals.
+
+     Second is unrolled allocation/probes which we use if there's just
+     a few of them.  It needs to save the original stack pointer into a
+     temporary for use as a source register in the allocation/probe.
+
+     Last is a loop.  This is the most uncommon case and least efficient.  */
+  rtx_insn *insn;
+  rtx_insn *retval = NULL;
+  if (rounded_size == probe_interval)
+    {
+      if (Pmode == SImode)
+	insn = emit_insn (gen_movsi_update_stack (stack_pointer_rtx,
+						  stack_pointer_rtx,
+						  GEN_INT (-probe_interval),
+						  stack_pointer_rtx));
+      else
+	insn = emit_insn (gen_movdi_di_update_stack (stack_pointer_rtx,
+						     stack_pointer_rtx,
+						     GEN_INT (-probe_interval),
+						     stack_pointer_rtx));
+      wrap_frame_mem (insn, stack_pointer_rtx, probe_interval);
+
+      /* If this was the only allocation, then we can return the allocating
+	 insn.  */
+      if (rounded_size == orig_size)
+	retval = insn;
+
+      dump_stack_clash_frame_info (PROBE_INLINE, rounded_size != orig_size);
+    }
+  else if (rounded_size <= 8 * probe_interval)
+    {
+      /* The ABI requires using the store with update insns to allocate
+	 space and store the outer frame into the stack.
+
+	 So we save the current stack pointer into a temporary, then
+	 emit the store-with-update insns to store the saved stack pointer
+	 into the right location in each new page.  */
+      rtx probe_int = GEN_INT (-probe_interval);
+      for (int i = 0; i < rounded_size; i += probe_interval)
+	{
+	  if (Pmode == SImode)
+	    insn = emit_insn (gen_movsi_update_stack (stack_pointer_rtx,
+						      stack_pointer_rtx,
+						      probe_int, orig_sp));
+	  else
+	    insn = emit_insn (gen_movdi_di_update_stack (stack_pointer_rtx,
+							 stack_pointer_rtx,
+							 probe_int, orig_sp));
+	  wrap_frame_mem (insn, stack_pointer_rtx, probe_interval);
+	}
+      retval = NULL;
+      dump_stack_clash_frame_info (PROBE_INLINE, rounded_size != orig_size);
+    }
+  else
+    {
+      /* Compute the ending address.  */
+      rtx end_addr
+	= copy_reg ? gen_rtx_REG (Pmode, 0) : gen_rtx_REG (Pmode, 12);
+      rtx rs = GEN_INT (-rounded_size);
+      rtx insn;
+      if (add_operand (rs, Pmode))
+	insn = emit_insn (gen_add3_insn (end_addr, stack_pointer_rtx, rs));
+      else
+	{
+	  emit_move_insn (end_addr, GEN_INT (-rounded_size));
+	  insn = emit_insn (gen_add3_insn (end_addr, end_addr,
+					   stack_pointer_rtx));
+	  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+			gen_rtx_SET (end_addr,
+				     gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+						   rs)));
+	}
+      RTX_FRAME_RELATED_P (insn) = 1;
+
+      /* Emit the loop.  */
+      if (TARGET_64BIT)
+	insn = emit_insn (gen_probe_stack_rangedi (stack_pointer_rtx,
+						   stack_pointer_rtx, orig_sp,
+						   end_addr));
+      else
+	insn = emit_insn (gen_probe_stack_rangesi (stack_pointer_rtx,
+						   stack_pointer_rtx, orig_sp,
+						   end_addr));
+      RTX_FRAME_RELATED_P (insn) = 1;
+      add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+		    gen_rtx_SET (stack_pointer_rtx, end_addr));
+      retval = NULL;
+      dump_stack_clash_frame_info (PROBE_LOOP, rounded_size != orig_size);
+    }
+
+  if (orig_size != rounded_size)
+    {
+      insn = handle_residual (orig_size, rounded_size, orig_sp);
+
+      /* If the residual was the only allocation, then we can return the
+	 allocating insn.  */
+      if (rounded_size == 0)
+	retval = insn;
+    }
+
+  /* If we asked for a copy with an offset, then we still need to add in
+     the offset.  */
+  if (copy_reg && copy_off)
+    emit_insn (gen_add3_insn (copy_reg, copy_reg, GEN_INT (copy_off)));
+  return retval;
+}
+
 /* Emit the correct code for allocating stack space, as insns.
    If COPY_REG, make sure a copy of the old frame is left there.
    The generated code may use hard register 0 as a temporary.  */
@@ -25629,7 +25834,6 @@ rs6000_emit_allocate_stack (HOST_WIDE_INT size, rtx copy_reg, int copy_off)
   rtx stack_reg = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
   rtx tmp_reg = gen_rtx_REG (Pmode, 0);
   rtx todec = gen_int_mode (-size, Pmode);
-  rtx par, set, mem;
 
   if (INTVAL (todec) != -size)
     {
@@ -25669,6 +25873,16 @@ rs6000_emit_allocate_stack (HOST_WIDE_INT size, rtx copy_reg, int copy_off)
 	warning (0, "stack limit expression is not supported");
     }
 
+  if (flag_stack_clash_protection)
+    {
+      if (size < PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE))
+	dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+      else
+	return rs6000_emit_probe_stack_range_stack_clash (size,
+							  copy_reg,
+							  copy_off);
+    }
+
   if (copy_reg)
     {
       if (copy_off != 0)
@@ -25692,23 +25906,12 @@ rs6000_emit_allocate_stack (HOST_WIDE_INT size, rtx copy_reg, int copy_off)
 					      todec, stack_reg)
 		    : gen_movdi_di_update_stack (stack_reg, stack_reg,
 						 todec, stack_reg));
+
   /* Since we didn't use gen_frame_mem to generate the MEM, grab
      it now and set the alias set/attributes. The above gen_*_update
      calls will generate a PARALLEL with the MEM set being the first
      operation. */
-  par = PATTERN (insn);
-  gcc_assert (GET_CODE (par) == PARALLEL);
-  set = XVECEXP (par, 0, 0);
-  gcc_assert (GET_CODE (set) == SET);
-  mem = SET_DEST (set);
-  gcc_assert (MEM_P (mem));
-  MEM_NOTRAP_P (mem) = 1;
-  set_mem_alias_set (mem, get_frame_alias_set ());
-
-  RTX_FRAME_RELATED_P (insn) = 1;
-  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
-		gen_rtx_SET (stack_reg, gen_rtx_PLUS (Pmode, stack_reg,
-						      GEN_INT (-size))));
+  wrap_frame_mem (insn, stack_reg, size);
   return insn;
 }
 
@@ -25790,9 +25993,9 @@ rs6000_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
 	 until it is equal to ROUNDED_SIZE.  */
 
       if (TARGET_64BIT)
-	emit_insn (gen_probe_stack_rangedi (r12, r12, r0));
+	emit_insn (gen_probe_stack_rangedi (r12, r12, stack_pointer_rtx, r0));
       else
-	emit_insn (gen_probe_stack_rangesi (r12, r12, r0));
+	emit_insn (gen_probe_stack_rangesi (r12, r12, stack_pointer_rtx, r0));
 
 
       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
@@ -25806,8 +26009,8 @@ rs6000_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
    absolute addresses.  */
 
-const char *
-output_probe_stack_range (rtx reg1, rtx reg2)
+static const char *
+output_probe_stack_range_1 (rtx reg1, rtx reg2)
 {
   static int labelno = 0;
   char loop_lab[32];
@@ -25842,6 +26045,65 @@ output_probe_stack_range (rtx reg1, rtx reg2)
   return "";
 }
 
+
+/* Probe a range of stack addresses from REG1 to REG3 inclusive.  These are
+   absolute addresses.  REG2 contains the outer stack pointer that must be
+   stored into *sp at each allocation.
+
+   This is subtly different than the Ada probing above in that it tries hard
+   to prevent attacks that jump the stack guard.  Thus, it is never allowed
+   to allocate more than PROBE_INTERVAL bytes of stack space without a
+   suitable probe.  */
+
+static const char *
+output_probe_stack_range_stack_clash (rtx reg1, rtx reg2, rtx reg3)
+{
+  static int labelno = 0;
+  char loop_lab[32];
+  rtx xops[3];
+
+  HOST_WIDE_INT probe_interval
+    = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
+
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+  /* This allocates and probes.  */
+  xops[0] = reg1;
+  xops[1] = reg2;
+  xops[2] = GEN_INT (-probe_interval);
+  if (TARGET_64BIT)
+    output_asm_insn ("stdu %1,%2(%0)", xops);
+  else
+    output_asm_insn ("stwu %1,%2(%0)", xops);
+
+  /* Jump to LOOP_LAB if TEST_ADDR != LAST_ADDR.  */
+  xops[0] = reg1;
+  xops[1] = reg3;
+  if (TARGET_64BIT)
+    output_asm_insn ("cmpd 0,%0,%1", xops);
+  else
+    output_asm_insn ("cmpw 0,%0,%1", xops);
+
+  fputs ("\tbne 0,", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_lab);
+  fputc ('\n', asm_out_file);
+
+  return "";
+}
+
+/* Wrapper around the output_probe_stack_range routines.  */
+const char *
+output_probe_stack_range (rtx reg1, rtx reg2, rtx reg3)
+{
+  if (flag_stack_clash_protection)
+    return output_probe_stack_range_stack_clash (reg1, reg2, reg3);
+  else
+    return output_probe_stack_range_1 (reg1, reg3);
+}
+
+
 /* Add to 'insn' a note which is PATTERN (INSN) but with REG replaced
    with (plus:P (reg 1) VAL), and with REG2 replaced with REPL2 if REG2
    is not NULL.  It would be nice if dwarf2out_frame_debug_expr could
@@ -27457,6 +27719,13 @@ rs6000_emit_prologue (void)
 	  }
     }
 
+  /* If we are emitting stack probes, but allocate no stack, then
+     just note that in the dump file.  */
+  if (flag_stack_clash_protection
+      && dump_file
+      && !info->push_p)
+    dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
+
   /* Update stack and set back pointer unless this is V.4,
      for which it was done previously.  */
   if (!WORLD_SAVE_P (info) && info->push_p
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index f78dbf9..9262c82 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -10262,7 +10262,7 @@
 
 (define_expand "allocate_stack"
   [(set (match_operand 0 "gpc_reg_operand" "")
-	(minus (reg 1) (match_operand 1 "reg_or_short_operand" "")))
+	(minus (reg 1) (match_operand 1 "reg_or_cint_operand" "")))
    (set (reg 1)
 	(minus (reg 1) (match_dup 1)))]
   ""
@@ -10272,6 +10272,15 @@
   rtx neg_op0;
   rtx insn, par, set, mem;
 
+  /* By allowing reg_or_cint_operand as the predicate we can get
+     better code for stack-clash-protection because we do not lose
+     size information.  But the rest of the code expects the operand
+     to be reg_or_short_operand.  If it isn't, then force it into
+     a register.  */
+  rtx orig_op1 = operands[1];
+  if (!reg_or_short_operand (operands[1], Pmode))
+    operands[1] = force_reg (Pmode, operands[1]);
+
   emit_move_insn (chain, stack_bot);
 
   /* Check stack bounds if necessary.  */
@@ -10284,6 +10293,55 @@
       emit_insn (gen_cond_trap (LTU, available, operands[1], const0_rtx));
     }
 
+  /* Allocate and probe if requested.
+     This may look similar to the loop we use for prologue allocations,
+     but it is critically different.  For the former we know the loop
+     will iterate, but do not know that generally here.  The former
+     uses that knowledge to rotate the loop.  Combining them would be
+     possible with some performance cost.  */
+  if (flag_stack_clash_protection)
+    {
+      rtx rounded_size, last_addr, residual;
+      HOST_WIDE_INT probe_interval;
+      compute_stack_clash_protection_loop_data (&rounded_size, &last_addr,
+						&residual, &probe_interval,
+						orig_op1);
+      
+      /* We do occasionally get in here with constant sizes, so we might
+	 as well do a reasonable job when we obviously can.  */
+      if (rounded_size != CONST0_RTX (Pmode))
+	{
+	  rtx loop_lab, end_loop;
+	  bool rotated = GET_CODE (rounded_size) == CONST_INT;
+
+	  emit_stack_clash_protection_probe_loop_start (&loop_lab, &end_loop,
+							last_addr, rotated);
+
+	  if (Pmode == SImode)
+	    emit_insn (gen_movsi_update_stack (stack_pointer_rtx,
+					       stack_pointer_rtx,
+					       GEN_INT (-probe_interval),
+					       chain));
+	  else
+	    emit_insn (gen_movdi_di_update_stack (stack_pointer_rtx,
+					          stack_pointer_rtx,
+					          GEN_INT (-probe_interval),
+					          chain));
+	  emit_stack_clash_protection_probe_loop_end (loop_lab, end_loop,
+						      last_addr, rotated);
+	}
+
+      /* Now handle residuals.  We just have to set operands[1] correctly
+	 and let the rest of the expander run.  */
+      if (REG_P (residual) || GET_CODE (residual) == CONST_INT)
+	operands[1] = residual;
+      else
+	{
+	  operands[1] = gen_reg_rtx (Pmode);
+	  force_operand (residual, operands[1]);
+	}
+    }
+
   if (GET_CODE (operands[1]) != CONST_INT
       || INTVAL (operands[1]) < -32767
       || INTVAL (operands[1]) > 32768)
@@ -11422,12 +11480,13 @@
    (set_attr "length" "4")])
 
 (define_insn "probe_stack_range<P:mode>"
-  [(set (match_operand:P 0 "register_operand" "=r")
+  [(set (match_operand:P 0 "register_operand" "=&r")
 	(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
-			    (match_operand:P 2 "register_operand" "r")]
+			    (match_operand:P 2 "register_operand" "r")
+			    (match_operand:P 3 "register_operand" "r")]
 			   UNSPECV_PROBE_STACK_RANGE))]
   ""
-  "* return output_probe_stack_range (operands[0], operands[2]);"
+  "* return output_probe_stack_range (operands[0], operands[2], operands[3]);"
   [(set_attr "type" "three")])
 
 ;; Compare insns are next.  Note that the RS/6000 has two types of compares,
